├── acl
    ├── __init__.py
    ├── preprocessing
    │   ├── __init__.py
    │   ├── scraper.py
    │   ├── citation_mapping.py
    │   ├── negative_sampling.py
    │   └── parsecit.py
    ├── utils.py
    ├── dataset.py
    ├── __data_prep.py
    └── trainer_utils.py
├── cord19
    ├── __init__.py
    ├── preprocessing
    │   ├── __init__.py
    │   ├── negative_sampling.py
    │   └── cord19_reader.py
    ├── utils.py
    └── dataset.py
├── tests
    ├── __init__.py
    ├── test_data_helper.py
    ├── test_acl.py
    ├── test_experiment.py
    ├── test_rnn.py
    ├── test_auto_modeling.py
    └── test_trainer.py
├── datasets
    ├── __init__.py
    ├── acl_docrel
    │   ├── __init__.py
    │   └── acl_docrel.py
    └── cord19_docrel
    │   ├── __init__.py
    │   └── cord19_docrel.py
├── output
    └── README.md
├── demo.gif
├── docrel.png
├── cli.py
├── requirements.txt
├── sbin
    ├── cord19
    │   ├── config.sh
    │   ├── predict_only.sh
    │   ├── xlnet.sh
    │   ├── roberta-base.sh
    │   ├── scibert.sh
    │   ├── bert-base.sh
    │   ├── covid-bert-base.sh
    │   ├── scincl.sh
    │   ├── electra-base-discriminator.sh
    │   └── baseline-lstm-gpu.sh
    ├── acl
    │   ├── config.sh
    │   ├── xlnet.sh
    │   ├── bert-base.sh
    │   ├── roberta-base.sh
    │   ├── scibert.sh
    │   ├── scincl.sh
    │   ├── baseline-lstm-colab.sh
    │   ├── electra-base-discriminator.sh
    │   ├── baseline-lstm.sh
    │   ├── baseline-lstm-gpu.sh
    │   ├── 1.sh
    │   └── gpu1.sh
    └── compress_data_and_upload.sh
├── experiments
    ├── data_loaders.py
    ├── environment.py
    ├── utils.py
    └── data_helpers.py
├── environments
    └── default.yml
├── LICENSE.txt
├── models
    ├── __init__.py
    ├── roberta.py
    ├── electra.py
    ├── bart.py
    ├── bert.py
    ├── xlnet.py
    ├── longformer.py
    ├── utils.py
    ├── rnn.py
    └── auto_modeling.py
├── demo_utils.py
├── .gitignore
├── demo.ipynb
├── README.md
└── word_vectors.ipynb
/acl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cord19/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/acl/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cord19/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/acl_docrel/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/cord19_docrel/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/output/README.md:
--------------------------------------------------------------------------------
1 | Keep this directory for all output files.
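Training runs write their results below this directory. A rough sketch of the expected layout (inferred from OUTPUT_DIR in sbin/acl/config.sh and sbin/cord19/config.sh and from the paths referenced in sbin/compress_data_and_upload.sh; the exact sub-directories depend on dataset, fold, and model name):

    output/acl_docrel/folds/<fold>/<model_name>/
    output/cord19_docrel/folds/<fold>/<model_name>/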
-------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/aspect-document-similarity/HEAD/demo.gif -------------------------------------------------------------------------------- /docrel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/aspect-document-similarity/HEAD/docrel.png -------------------------------------------------------------------------------- /cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import fire 3 | 4 | from commands import word_vectors, compute_doc_vecs 5 | 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | if __name__ == '__main__': 11 | fire.Fire({ 12 | 'compute_doc_vecs': compute_doc_vecs.compute_doc_vecs, 13 | 'extract_text': word_vectors.extract_text, 14 | 'train_fasttext': word_vectors.train_fasttext, 15 | }) 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # general 2 | pandas 3 | jupyter 4 | numpy 5 | tqdm 6 | matplotlib 7 | pyyaml 8 | nltk 9 | scikit-learn==0.23.1 10 | 11 | # data acquisition & preprocessing 12 | fuzzywuzzy 13 | lxml 14 | cssselect 15 | # bibtexparser 16 | requests 17 | smart-open 18 | 19 | # wiki related (+ nlp) 20 | # wikipedia2vec 21 | gensim 22 | spacy 23 | 24 | # model & evaluation 25 | torch==1.6.0 26 | transformers==2.10.0 27 | tokenizers==0.7.0 28 | nlp==0.1.0 29 | 30 | # experiments 31 | tensorboard 32 | wandb==0.8.36 33 | -------------------------------------------------------------------------------- /sbin/cord19/config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export LR=2e-5 # 5e-05 # LEARNING_RATE = 2e-5 # 2e-6 does not work (?) 4 | export EPOCHS=4 # or 4? 5 | export SEED=0 6 | export NLP_CACHE_DIR=./data/nlp_cache 7 | export CACHE_DIR=./data/trainer_cache 8 | 9 | export OUTPUT_DIR=./output/cord19_docrel/folds 10 | export DOC_ID_COL=doi 11 | export DOC_A_COL=from_doi 12 | export DOC_B_COL=to_doi 13 | export NLP_DATASET=./datasets/cord19_docrel/cord19_docrel.py 14 | 15 | # wandb 16 | export WANDB_API_KEY= 17 | export WANDB_PROJECT= 18 | -------------------------------------------------------------------------------- /sbin/acl/config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # export CUDA_VISIBLE_DEVICES=1 4 | 5 | export LR=2e-5 # 5e-05 # LEARNING_RATE = 2e-5 # 2e-6 does not work (?) 6 | export EPOCHS=4 # or 4?
7 | export SEED=0 8 | export NLP_CACHE_DIR=./data/nlp_cache 9 | export CACHE_DIR=./data/trainer_cache 10 | 11 | export OUTPUT_DIR=./output/acl_docrel/folds 12 | export DOC_ID_COL=s2_id 13 | export DOC_A_COL=from_s2_id 14 | export DOC_B_COL=to_s2_id 15 | export NLP_DATASET=./datasets/acl_docrel/acl_docrel.py 16 | 17 | # wandb 18 | export WANDB_API_KEY= 19 | export WANDB_PROJECT= 20 | -------------------------------------------------------------------------------- /experiments/data_loaders.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class XYDataLoader(DataLoader): 9 | """ 10 | 11 | Batch consists only of X (item data) and Y (label) 12 | 13 | """ 14 | def get_x_from_batch(self, batch): 15 | raise NotImplementedError() 16 | 17 | def get_y_from_batch(self, batch): 18 | raise NotImplementedError() 19 | 20 | 21 | class DefaultXYDataLoader(XYDataLoader): 22 | """ 23 | 24 | Last item of batch is Y, everything else is X. 25 | 26 | """ 27 | def get_x_from_batch(self, batch): 28 | return batch[:-1] 29 | 30 | def get_y_from_batch(self, batch): 31 | return batch[-1] -------------------------------------------------------------------------------- /environments/default.yml: -------------------------------------------------------------------------------- 1 | # Settings for your local setup 2 | local_mac: 3 | must_exists: /Volumes/data/repo/ 4 | bert_dir: /Volumes/data/repo/data/bert 5 | datasets_dir: /Volumes/data/repo/data 6 | workers: 4 7 | 8 | # Settings for your GPU server 9 | gpu_server: 10 | #must_exists: /usr/bin/nvidia-smi 11 | must_exists: ~/gpu1 12 | bert_dir: /mnt/hdd/datasets/BERT_pre_trained_models/pytorch 13 | datasets_dir: /mnt/hdd/datasets 14 | workers: 12 15 | 16 | gpu_server2: 17 | must_exists: /home/mostendorff/gpu2 18 | datasets_dir: /data/datasets/ 19 | bert_dir: /data/datasets/huggingface_transformers/pytorch 20 | workers: 36 21 | 22 | google_colab: 23 | must_exists: /content 24 | datasets_dir: /dev/null 25 | bert_dir: /dev/null 26 | workers: 24 27 | -------------------------------------------------------------------------------- /sbin/cord19/predict_only.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CV_FOLD=1 4 | export TRAIN_BATCH_SIZE=8 5 | export EVAL_BATCH_SIZE=32 6 | 7 | python trainer_cli.py --cv_fold $CV_FOLD \ 8 | --output_dir $OUTPUT_DIR \ 9 | --model_name_or_path $MODEL_NAME \ 10 | --doc_id_col $DOC_ID_COL \ 11 | --doc_a_col $DOC_A_COL \ 12 | --doc_b_col $DOC_B_COL \ 13 | --nlp_dataset $NLP_DATASET \ 14 | --nlp_cache_dir $NLP_CACHE_DIR \ 15 | --cache_dir $CACHE_DIR \ 16 | --num_train_epochs $EPOCHS \ 17 | --seed $SEED \ 18 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 19 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 20 | --learning_rate $LR \ 21 | --logging_steps 100 \ 22 | --save_steps 0 \ 23 | --save_total_limit 3 \ 24 | --save_predictions 25 | -------------------------------------------------------------------------------- /experiments/environment.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import yaml 4 | 5 | 6 | def get_env(): 7 | env = None 8 | 9 | base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 10 | env_dir = os.path.join(base_dir, 'environments') 11 | 12 | print(env_dir) 13 | 14 | for fn in os.listdir(env_dir): 15 | if fn.endswith('.yml'): 
16 | with open(os.path.join(env_dir, fn), 'r') as f: 17 | envs = yaml.load(f, Loader=yaml.SafeLoader) 18 | 19 | for env_name, _env in envs.items(): 20 | if os.path.exists(_env['must_exists']): 21 | print(f'Environment detected: {env_name} (in {fn})') 22 | env = _env 23 | break 24 | if env is not None: 25 | break 26 | 27 | if env: 28 | return env 29 | else: 30 | raise ValueError('Could not determine env!') 31 | -------------------------------------------------------------------------------- /sbin/acl/xlnet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=xlnet-base 10 | 11 | # xlnet-base 12 | export EVAL_BATCH_SIZE=32 13 | export TRAIN_BATCH_SIZE=6 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/cord19/xlnet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=xlnet-base-cased 10 | 11 | # xlnet-base 12 | export EVAL_BATCH_SIZE=24 13 | export TRAIN_BATCH_SIZE=6 14 | 15 | for CV_FOLD in 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/acl/bert-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=bert-base-cased 10 | 11 | # bert-base 12 | export EVAL_BATCH_SIZE=16 13 | export TRAIN_BATCH_SIZE=8 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/acl/roberta-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=roberta-base 10 | 11 | # roberta-base 12 | export EVAL_BATCH_SIZE=32 13 | export TRAIN_BATCH_SIZE=6 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/acl/scibert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=scibert-scivocab-uncased 10 | 11 | # bert-base 12 | export EVAL_BATCH_SIZE=16 13 | export TRAIN_BATCH_SIZE=8 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/cord19/roberta-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=roberta-base 10 | 11 | # roberta-base 12 | export EVAL_BATCH_SIZE=24 13 | export TRAIN_BATCH_SIZE=8 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/cord19/scibert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=scibert-scivocab-uncased 10 | 11 | # bert-base 12 | export EVAL_BATCH_SIZE=16 13 | export TRAIN_BATCH_SIZE=8 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/cord19/bert-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export CV_FOLD=1 10 | export MODEL_NAME=bert-base-cased 11 | 12 | # bert-base 13 | export EVAL_BATCH_SIZE=32 14 | export TRAIN_BATCH_SIZE=8 15 | 16 | for CV_FOLD in 1 2 3 4 17 | do 18 | python trainer_cli.py --cv_fold $CV_FOLD \ 19 | --output_dir $OUTPUT_DIR \ 20 | --model_name_or_path $MODEL_NAME \ 21 | --doc_id_col $DOC_ID_COL \ 22 | --doc_a_col $DOC_A_COL \ 23 | --doc_b_col $DOC_B_COL \ 24 | --nlp_dataset $NLP_DATASET \ 25 | --nlp_cache_dir $NLP_CACHE_DIR \ 26 | --cache_dir $CACHE_DIR \ 27 | --num_train_epochs $EPOCHS \ 28 | --seed $SEED \ 29 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 30 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 31 | --learning_rate $LR \ 32 | --logging_steps 100 \ 33 | --save_steps 0 \ 34 | --save_total_limit 3 \ 35 | --do_train \ 36 | --save_predictions 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /sbin/acl/scincl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Train on 12 GB GPU 4 | 5 | export PYTHONUNBUFFERED=1 6 | 7 | export APP_ROOT=$(dirname "$0") 8 | 9 | . 
$APP_ROOT/config.sh 10 | 11 | export MODEL_NAME=scincl 12 | 13 | # bert-base 14 | export EVAL_BATCH_SIZE=16 15 | export TRAIN_BATCH_SIZE=8 16 | 17 | # 1 2 3 4 18 | for CV_FOLD in 2 3 4 19 | do 20 | python trainer_cli.py --cv_fold $CV_FOLD \ 21 | --output_dir $OUTPUT_DIR \ 22 | --model_name_or_path $MODEL_NAME \ 23 | --doc_id_col $DOC_ID_COL \ 24 | --doc_a_col $DOC_A_COL \ 25 | --doc_b_col $DOC_B_COL \ 26 | --nlp_dataset $NLP_DATASET \ 27 | --nlp_cache_dir $NLP_CACHE_DIR \ 28 | --cache_dir $CACHE_DIR \ 29 | --num_train_epochs $EPOCHS \ 30 | --seed $SEED \ 31 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 32 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 33 | --learning_rate $LR \ 34 | --logging_steps 100 \ 35 | --save_steps 0 \ 36 | --save_total_limit 3 \ 37 | --do_train \ 38 | --save_predictions 39 | done 40 | 41 | export PYTHONUNBUFFERED="" 42 | -------------------------------------------------------------------------------- /sbin/cord19/covid-bert-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export CV_FOLD=1 10 | export MODEL_NAME=deepset/covid_bert_base 11 | 12 | # bert-base 13 | export EVAL_BATCH_SIZE=32 14 | export TRAIN_BATCH_SIZE=8 15 | 16 | for CV_FOLD in 1 2 3 4 17 | do 18 | python trainer_cli.py --cv_fold $CV_FOLD \ 19 | --output_dir $OUTPUT_DIR \ 20 | --model_name_or_path $MODEL_NAME \ 21 | --doc_id_col $DOC_ID_COL \ 22 | --doc_a_col $DOC_A_COL \ 23 | --doc_b_col $DOC_B_COL \ 24 | --nlp_dataset $NLP_DATASET \ 25 | --nlp_cache_dir $NLP_CACHE_DIR \ 26 | --cache_dir $CACHE_DIR \ 27 | --num_train_epochs $EPOCHS \ 28 | --seed $SEED \ 29 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 30 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 31 | --learning_rate $LR \ 32 | --logging_steps 100 \ 33 | --save_steps 0 \ 34 | --save_total_limit 3 \ 35 | --do_train \ 36 | --save_predictions 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /sbin/cord19/scincl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Train on 12 GB GPU 4 | 5 | export PYTHONUNBUFFERED=1 6 | 7 | export APP_ROOT=$(dirname "$0") 8 | 9 | . 
$APP_ROOT/config.sh 10 | 11 | export MODEL_NAME=scincl 12 | 13 | # bert-base 14 | export EVAL_BATCH_SIZE=16 15 | export TRAIN_BATCH_SIZE=8 16 | 17 | # 1 2 3 4 18 | for CV_FOLD in 1 2 3 4 19 | do 20 | python trainer_cli.py --cv_fold $CV_FOLD \ 21 | --output_dir $OUTPUT_DIR \ 22 | --model_name_or_path $MODEL_NAME \ 23 | --doc_id_col $DOC_ID_COL \ 24 | --doc_a_col $DOC_A_COL \ 25 | --doc_b_col $DOC_B_COL \ 26 | --nlp_dataset $NLP_DATASET \ 27 | --nlp_cache_dir $NLP_CACHE_DIR \ 28 | --cache_dir $CACHE_DIR \ 29 | --num_train_epochs $EPOCHS \ 30 | --seed $SEED \ 31 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 32 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 33 | --learning_rate $LR \ 34 | --logging_steps 100 \ 35 | --save_steps 0 \ 36 | --save_total_limit 3 \ 37 | --do_train \ 38 | --save_predictions 39 | done 40 | 41 | export PYTHONUNBUFFERED="" 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Malte Ostendorff 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /sbin/acl/baseline-lstm-colab.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=baseline-rnn 10 | export EPOCHS=10 11 | export SPACY_MODEL=./en_glove_6b_300d 12 | 13 | for CV_FOLD in 1 14 | do 15 | python trainer_cli.py --cv_fold $CV_FOLD \ 16 | --output_dir $OUTPUT_DIR \ 17 | --model_name_or_path $MODEL_NAME \ 18 | --doc_id_col $DOC_ID_COL \ 19 | --doc_a_col $DOC_A_COL \ 20 | --doc_b_col $DOC_B_COL \ 21 | --nlp_dataset $NLP_DATASET \ 22 | --nlp_cache_dir $NLP_CACHE_DIR \ 23 | --cache_dir $CACHE_DIR \ 24 | --num_train_epochs $EPOCHS \ 25 | --seed $SEED \ 26 | --learning_rate $LR \ 27 | --logging_steps 500 \ 28 | --save_steps 0 \ 29 | --save_total_limit 3 \ 30 | --do_train \ 31 | --save_predictions \ 32 | --spacy_model $SPACY_MODEL \ 33 | --rnn_type lstm \ 34 | --evaluate_during_training \ 35 | --per_gpu_eval_batch_size 24 \ 36 | --per_gpu_train_batch_size 12 37 | done 38 | 39 | export PYTHONUNBUFFERED="" -------------------------------------------------------------------------------- /sbin/acl/electra-base-discriminator.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export CV_FOLD=1 10 | export MODEL_NAME=google/electra-base-discriminator 11 | 12 | # roberta-base 13 | export EVAL_BATCH_SIZE=12 14 | export TRAIN_BATCH_SIZE=8 15 | 16 | for CV_FOLD in 1 2 3 4 17 | do 18 | python trainer_cli.py --cv_fold $CV_FOLD \ 19 | --output_dir $OUTPUT_DIR \ 20 | --model_name_or_path $MODEL_NAME \ 21 | --doc_id_col $DOC_ID_COL \ 22 | --doc_a_col $DOC_A_COL \ 23 | --doc_b_col $DOC_B_COL \ 24 | --nlp_dataset $NLP_DATASET \ 25 | --nlp_cache_dir $NLP_CACHE_DIR \ 26 | --cache_dir $CACHE_DIR \ 27 | --num_train_epochs $EPOCHS \ 28 | --seed $SEED \ 29 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 30 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 31 | --learning_rate $LR \ 32 | --logging_steps 100 \ 33 | --save_steps 0 \ 34 | --save_total_limit 3 \ 35 | --do_train \ 36 | --save_predictions 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /sbin/cord19/electra-base-discriminator.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | export CV_FOLD=1 10 | export MODEL_NAME=google/electra-base-discriminator 11 | 12 | # roberta-base 13 | export EVAL_BATCH_SIZE=32 14 | export TRAIN_BATCH_SIZE=8 15 | 16 | for CV_FOLD in 1 2 3 4 17 | do 18 | python trainer_cli.py --cv_fold $CV_FOLD \ 19 | --output_dir $OUTPUT_DIR \ 20 | --model_name_or_path $MODEL_NAME \ 21 | --doc_id_col $DOC_ID_COL \ 22 | --doc_a_col $DOC_A_COL \ 23 | --doc_b_col $DOC_B_COL \ 24 | --nlp_dataset $NLP_DATASET \ 25 | --nlp_cache_dir $NLP_CACHE_DIR \ 26 | --cache_dir $CACHE_DIR \ 27 | --num_train_epochs $EPOCHS \ 28 | --seed $SEED \ 29 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 30 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 31 | --learning_rate $LR \ 32 | --logging_steps 100 \ 33 | --save_steps 0 \ 34 | --save_total_limit 3 \ 35 | --do_train \ 36 | --save_predictions 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /sbin/acl/baseline-lstm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=baseline-rnn 10 | export EPOCHS=10 11 | export SPACY_MODEL=~/datasets/spacy/en_glove_6b_300d 12 | export SPACY_MODEL=/Volumes/data/repo/data/spacy/en_glove_6b_300d 13 | 14 | for CV_FOLD in 1 15 | do 16 | python trainer_cli.py --cv_fold $CV_FOLD \ 17 | --output_dir $OUTPUT_DIR \ 18 | --model_name_or_path $MODEL_NAME \ 19 | --doc_id_col $DOC_ID_COL \ 20 | --doc_a_col $DOC_A_COL \ 21 | --doc_b_col $DOC_B_COL \ 22 | --nlp_dataset $NLP_DATASET \ 23 | --nlp_cache_dir $NLP_CACHE_DIR \ 24 | --cache_dir $CACHE_DIR \ 25 | --num_train_epochs $EPOCHS \ 26 | --seed $SEED \ 27 | --learning_rate $LR \ 28 | --logging_steps 500 \ 29 | --save_steps 0 \ 30 | --save_total_limit 3 \ 31 | --do_train \ 32 | --save_predictions \ 33 | --spacy_model $SPACY_MODEL \ 34 | --rnn_type lstm \ 35 | --evaluate_during_training \ 36 | --no_cuda 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch.nn as nn 4 | 5 | from models.utils import get_concat, get_mlp 6 | 7 | __all__ = [ 8 | 'ExperimentModel', 9 | ] 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class ExperimentModel(nn.Module): 15 | labels_count = 1 16 | 17 | def forward(self, *input): 18 | raise NotImplementedError() 19 | 20 | def get_single_device(self): 21 | """ 22 | If all parameters are on a single device, use this method to get current device. 
23 | See: https://github.com/pytorch/pytorch/issues/7460 24 | """ 25 | return next(self.parameters()).device 26 | 27 | def get_classification_probability_layer(self, mode='auto'): 28 | logger.debug(f'Classification probability layer with {mode}') 29 | if mode == 'auto': 30 | logger.debug(f'Auto-mode; labels count = {self.labels_count}') 31 | if self.labels_count == 1: 32 | return self.get_classification_probability_layer('sigmoid') 33 | else: 34 | return self.get_classification_probability_layer('softmax') 35 | elif mode == 'sigmoid': 36 | return nn.Sigmoid() 37 | elif mode == 'softmax': 38 | return nn.Softmax(dim=0) 39 | elif mode == 'none': 40 | return None 41 | else: 42 | raise ValueError('Unsupported mode') 43 | 44 | -------------------------------------------------------------------------------- /models/roberta.py: -------------------------------------------------------------------------------- 1 | from torch.nn import BCEWithLogitsLoss 2 | from transformers import RobertaForSequenceClassification 3 | 4 | 5 | class RobertaForMultiLabelSequenceClassification(RobertaForSequenceClassification): 6 | def forward( 7 | self, 8 | input_ids=None, 9 | attention_mask=None, 10 | token_type_ids=None, 11 | position_ids=None, 12 | head_mask=None, 13 | inputs_embeds=None, 14 | labels=None, 15 | ): 16 | outputs = self.roberta( 17 | input_ids, 18 | attention_mask=attention_mask, 19 | token_type_ids=token_type_ids, 20 | position_ids=position_ids, 21 | head_mask=head_mask, 22 | inputs_embeds=inputs_embeds, 23 | ) 24 | sequence_output = outputs[0] 25 | logits = self.classifier(sequence_output) 26 | 27 | outputs = (logits,) + outputs[2:] 28 | if labels is not None: 29 | # Single-label classification (as in RobertaForSequenceClassification) 30 | # loss_fct = CrossEntropyLoss() 31 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 32 | loss_fct = BCEWithLogitsLoss() 33 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 34 | 35 | outputs = (loss,) + outputs 36 | 37 | return outputs # (loss), logits, (hidden_states), (attentions) -------------------------------------------------------------------------------- /tests/test_data_helper.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | 4 | # TODO implement these tests 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from torch.utils.data import WeightedRandomSampler, TensorDataset, DataLoader 9 | 10 | from sci.data_helpers import BaseSciDataHelper 11 | 12 | 13 | class DataHelperTests(TestCase): 14 | def test_negative_sampling(self): 15 | raise NotImplementedError() 16 | 17 | def test_weighted_sampler(self): 18 | 19 | items_a = ['a'] * 10 20 | items_b = ['b'] * 3 21 | items_c = ['c'] * 5 22 | items = items_a + items_b + items_c 23 | 24 | dh = BaseSciDataHelper(label_col='label', labels=['a', 'b', 'c'], none_label=None) 25 | 26 | df = pd.DataFrame({'label': items}) 27 | dh.set_label_encoder(df) 28 | 29 | label_weights, weights = dh.get_sampler_weights(df) 30 | 31 | ys = torch.tensor(dh.label_encoder.transform(items)) 32 | 33 | sampler = WeightedRandomSampler(weights, num_samples=int(weights.sum()), replacement=True) 34 | 35 | dl = DataLoader(TensorDataset(ys), sampler=sampler, batch_size=4) 36 | 37 | out = [] 38 | 39 | for batch in dl: 40 | yss = batch[0].numpy() 41 | out += dh.label_encoder.inverse_transform(yss).tolist() 42 | 43 | odf = pd.DataFrame({'label': out}) 44 | 45 | print(df['label'].value_counts()) 46 | 
print(odf['label'].value_counts()) 47 | -------------------------------------------------------------------------------- /tests/test_acl.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from unittest import TestCase 4 | 5 | from acl.__data_prep import load_parscit_file, get_citation_context 6 | from experiments.environment import get_env 7 | 8 | 9 | class ACLTest(TestCase): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | self.env = get_env() 13 | self.data_dir = data_dir = os.path.join(self.env['datasets_dir'], 'acl-anthology') 14 | 15 | def test_get_cits(self): 16 | title2acl_ids = {} 17 | author_last2titles = {} 18 | year2titles = defaultdict(list) 19 | 20 | fp = self.data_dir + '/parscit/D/D15/D15-1312-parscit.130908.xml' 21 | fn = 'D15-1312-parscit.130908.xml' 22 | 23 | error_files = [] 24 | out = [] 25 | acl_id2sects = {} 26 | acl_id2markers = {} 27 | 28 | sects, cits, markers = load_parscit_file(fp) 29 | 30 | from_id = '-'.join(fn.split('-', 2)[:2]) # ACL ID 31 | 32 | acl_id2sects[from_id] = sects 33 | acl_id2markers[from_id] = markers 34 | 35 | print('----') 36 | 37 | print(sects) 38 | 39 | print('----') 40 | 41 | print(cits) 42 | 43 | print('---') 44 | 45 | cits_with_context = get_citation_context(cits, sects, title2acl_ids, year2titles, author_last2titles) 46 | # 47 | # out += [(from_id, to_id, context[0], context[1], context[2]) for to_id, context in cits_with_context] 48 | 49 | -------------------------------------------------------------------------------- /cord19/utils.py: -------------------------------------------------------------------------------- 1 | # normalize section title 2 | def normalize_section(title): 3 | return title.strip().lower()\ 4 | .replace('conclusions', 'conclusion')\ 5 | .replace('concluding remarks', 'conclusion')\ 6 | .replace('future perspectives', 'future work')\ 7 | .replace('future directions', 'future work')\ 8 | .replace('viruses.', 'virus')\ 9 | .replace('viruses', 'virus') 10 | #.replace('conclusion and future perspectives', 'conclusion')\ 11 | #.replace('materials and methods', 'methods') 12 | 13 | 14 | def resolve_and_sect_titles(cits): 15 | for from_doi, to_doi, sect_title in cits: 16 | for t in normalize_section(sect_title).split(' and '): 17 | yield (from_doi, to_doi, t) 18 | 19 | 20 | def get_text_from_doi(doi, doi2paper, raise_not_found_error=True): 21 | text = '' 22 | sep = '\n' 23 | 24 | # if doi in doi2s2paper: 25 | # # from s2 scraper 26 | # # text += doi2s2paper[doi]['title'] 27 | # 28 | # if doi2s2paper[doi]['abstract']: 29 | # # text += '\n' + doi2s2paper[doi]['abstract'] 30 | # text = doi2s2paper[doi]['title'] + sep + doi2s2paper[doi]['abstract'] 31 | 32 | if doi in doi2paper: 33 | # text += doi2paper[doi]['metadata']['title'] 34 | 35 | if doi2paper[doi]['abstract'] and len(doi2paper[doi]['abstract']) > 10: 36 | # text += doi2paper[doi]['metadata']['title'] + '\n' + doi2paper[doi]['abstract'][0]['text'] 37 | text = doi2paper[doi]['title'] + sep + doi2paper[doi]['abstract'] 38 | 39 | elif raise_not_found_error: 40 | raise ValueError('DOI not found') 41 | 42 | return text 43 | -------------------------------------------------------------------------------- /models/electra.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import BCEWithLogitsLoss 4 | from transformers import BertModel, BertPreTrainedModel, 
BertForSequenceClassification, ElectraForSequenceClassification 5 | 6 | 7 | class ElectraForMultiLabelSequenceClassification(ElectraForSequenceClassification): 8 | """Electra model for classification. 9 | This module is composed of Electra BERT model with a linear layer on top of 10 | the pooled output. 11 | """ 12 | 13 | def forward( 14 | self, 15 | input_ids=None, 16 | attention_mask=None, 17 | token_type_ids=None, 18 | position_ids=None, 19 | head_mask=None, 20 | inputs_embeds=None, 21 | labels=None, 22 | ): 23 | discriminator_hidden_states = self.electra( 24 | input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds 25 | ) 26 | sequence_output = discriminator_hidden_states[0] 27 | logits = self.classifier(sequence_output) 28 | 29 | outputs = (logits,) + discriminator_hidden_states[2:] # add hidden states and attention if they are here 30 | 31 | if labels is not None: 32 | # Single-label classification (as in BertForSequenceClassification) 33 | # loss_fct = CrossEntropyLoss() 34 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 35 | 36 | loss_fct = BCEWithLogitsLoss() 37 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 38 | 39 | outputs = (loss,) + outputs 40 | 41 | return outputs # (loss), logits, (hidden_states), (attentions) 42 | -------------------------------------------------------------------------------- /models/bart.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import BCEWithLogitsLoss 3 | from transformers import BartForSequenceClassification 4 | 5 | 6 | class BartForMultiLabelSequenceClassification(BartForSequenceClassification): 7 | def forward( 8 | self, 9 | input_ids, 10 | attention_mask=None, 11 | encoder_outputs=None, 12 | decoder_input_ids=None, 13 | decoder_attention_mask=None, 14 | labels=None, 15 | ): 16 | outputs = self.model( 17 | input_ids, 18 | attention_mask=attention_mask, 19 | decoder_input_ids=decoder_input_ids, 20 | decoder_attention_mask=decoder_attention_mask, 21 | encoder_outputs=encoder_outputs, 22 | ) 23 | x = outputs[0] # last hidden state 24 | eos_mask = input_ids.eq(self.config.eos_token_id) 25 | if len(torch.unique(eos_mask.sum(1))) > 1: 26 | raise ValueError("All examples must have the same number of tokens.") 27 | sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] 28 | logits = self.classification_head(sentence_representation) 29 | # Prepend logits 30 | outputs = (logits,) + outputs[1:] # Add hidden states and attention if they are here 31 | if labels is not None: # prepend loss to output, 32 | # Single label 33 | # loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) 34 | loss_fct = BCEWithLogitsLoss() 35 | loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1, self.config.num_labels)) 36 | 37 | outputs = (loss,) + outputs 38 | 39 | return outputs 40 | 41 | -------------------------------------------------------------------------------- /demo_utils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from transformers import AutoTokenizer 3 | 4 | from models.bert import BertForMultiLabelSequenceClassification 5 | 6 | 7 | def get_paper(doc_id): 8 | res = requests.get(f'https://api.semanticscholar.org/v1/paper/{doc_id}') 9 | 10 | if res.status_code == 200: 11 | return res.json() 12 | else: 13 | raise ValueError(f'Cannot load paper from S2 API: {doc_id}') 14 | 15 
| 16 | def get_prediction(model_name_or_path: str, from_id, to_id): 17 | from_doc = get_paper(from_id) 18 | to_doc = get_paper(to_id) 19 | 20 | if 'acl' in model_name_or_path: 21 | labels = ['introduction', 'related work', 'experiment', 'conclusion', 'results', 'background', 'discussion', 22 | 'evaluation', 'method', 'other', 'none'] 23 | else: 24 | labels = ['discussion', 'introduction', 'conclusion', 'results', 'methods', 'background', 'materials', 'virus', 25 | 'future work', 'other', 'none'] 26 | 27 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 28 | model = BertForMultiLabelSequenceClassification.from_pretrained(model_name_or_path) 29 | 30 | model_input = tokenizer.batch_encode_plus( 31 | [(from_doc['title'] + '\n' + from_doc['abstract'], to_doc['title'] + '\n' + to_doc['abstract'])], 32 | pad_to_max_length=True, truncation_strategy='longest_first', return_token_type_ids=True, 33 | return_attention_masks=True, return_tensors='pt', max_length=512 34 | ) 35 | 36 | model_out = model(**model_input) 37 | 38 | pred_scores = model_out[0].detach().numpy()[0] 39 | pred_labels = [label for idx, label in enumerate(labels) if pred_scores[idx] > 0.] 40 | 41 | return pred_scores, pred_labels, from_doc, to_doc 42 | -------------------------------------------------------------------------------- /models/bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import BCEWithLogitsLoss 4 | from transformers import BertModel, BertPreTrainedModel, BertForSequenceClassification 5 | 6 | 7 | class BertForMultiLabelSequenceClassification(BertForSequenceClassification): 8 | """BERT model for classification. 9 | This module is composed of the BERT model with a linear layer on top of 10 | the pooled output. 
11 | """ 12 | 13 | def forward( 14 | self, 15 | input_ids=None, 16 | attention_mask=None, 17 | token_type_ids=None, 18 | position_ids=None, 19 | head_mask=None, 20 | inputs_embeds=None, 21 | labels=None, 22 | ): 23 | outputs = self.bert( 24 | input_ids, 25 | attention_mask=attention_mask, 26 | token_type_ids=token_type_ids, 27 | position_ids=position_ids, 28 | head_mask=head_mask, 29 | inputs_embeds=inputs_embeds, 30 | ) 31 | pooled_output = outputs[1] 32 | 33 | pooled_output = self.dropout(pooled_output) 34 | logits = self.classifier(pooled_output) 35 | 36 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 37 | 38 | if labels is not None: 39 | # Single-label classification (as in BertForSequenceClassification) 40 | # loss_fct = CrossEntropyLoss() 41 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 42 | 43 | loss_fct = BCEWithLogitsLoss() 44 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 45 | 46 | outputs = (loss,) + outputs 47 | 48 | return outputs # (loss), logits, (hidden_states), (attentions) 49 | -------------------------------------------------------------------------------- /models/xlnet.py: -------------------------------------------------------------------------------- 1 | from torch.nn import BCEWithLogitsLoss 2 | from transformers import XLNetForSequenceClassification 3 | 4 | 5 | class XLNetForMultiLabelSequenceClassification(XLNetForSequenceClassification): 6 | def forward( 7 | self, 8 | input_ids=None, 9 | attention_mask=None, 10 | mems=None, 11 | perm_mask=None, 12 | target_mapping=None, 13 | token_type_ids=None, 14 | input_mask=None, 15 | head_mask=None, 16 | inputs_embeds=None, 17 | use_cache=True, 18 | labels=None, 19 | ): 20 | 21 | transformer_outputs = self.transformer( 22 | input_ids, 23 | attention_mask=attention_mask, 24 | mems=mems, 25 | perm_mask=perm_mask, 26 | target_mapping=target_mapping, 27 | token_type_ids=token_type_ids, 28 | input_mask=input_mask, 29 | head_mask=head_mask, 30 | inputs_embeds=inputs_embeds, 31 | use_cache=use_cache, 32 | ) 33 | output = transformer_outputs[0] 34 | 35 | output = self.sequence_summary(output) 36 | logits = self.logits_proj(output) 37 | 38 | outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it 39 | 40 | if labels is not None: 41 | # Single-label classification (as in XLNetForSequenceClassification 42 | # loss_fct = CrossEntropyLoss() 43 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 44 | 45 | loss_fct = BCEWithLogitsLoss() 46 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 47 | 48 | outputs = (loss,) + outputs 49 | 50 | return outputs # return (loss), logits, (mems), (hidden states), (attentions) -------------------------------------------------------------------------------- /sbin/compress_data_and_upload.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Compress data for upload 4 | 5 | # ACL: S2 papers 6 | tar -cvf acl_s2.tar title2dblp_hits.json.gz acl_id2s2.json.gz arxiv2s2.json.gz doi2s2.json.gz 7 | 8 | # CORD-19 9 | tar -cvf cord19_s2.tar metadata.csv doi2s2paper.json.gz 10 | bzip2 cord19_s2.tar 11 | 12 | # Models (SciBERT) 13 | tar -cvzf ./cord19_fold-1_scibert-scivocab-uncased.tar.gz --directory=../output/cord19_docrel/folds/1/ scibert-scivocab-uncased 14 | tar -cvzf ./acl_fold-1_scibert-scivocab-uncased.tar.gz 
--directory=../output/acl_docrel/folds/1/ scibert-scivocab-uncased 15 | 16 | # Results 17 | tar -cvzf acl_output.tar.gz --exclude='*.bin' --exclude='__*' ../output/acl_docrel/* 18 | tar -cvzf cord19_output.tar.gz --exclude='*.bin' --exclude='__*' ../output/cord19_docrel/* 19 | 20 | 21 | ### Upload to GitHub release (with https://github.com/github-release/github-release) 22 | export GITHUB_TOKEN= 23 | export GITHUB_USER= 24 | export GITHUB_REPO= 25 | 26 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name acl_s2.tar --file acl_s2.tar 27 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name cord19_s2.tar.bz2 --file cord19_s2.tar.bz2 28 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name acl_fold-1_scibert-scivocab-uncased.tar.gz --file acl_fold-1_scibert-scivocab-uncased.tar.gz 29 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name cord19_fold-1_scibert-scivocab-uncased.tar.gz --file cord19_fold-1_scibert-scivocab-uncased.tar.gz 30 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name acl_output.tar.gz --file acl_output.tar.gz 31 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name cord19_output.tar.gz --file cord19_output.tar.gz 32 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name scibert-vocab.txt --file ~/datasets/BERT_pre_trained_models/pytorch/scibert-scivocab-uncased/vocab.txt 33 | 34 | -------------------------------------------------------------------------------- /models/longformer.py: -------------------------------------------------------------------------------- 1 | from torch.nn import BCEWithLogitsLoss 2 | from transformers import RobertaForSequenceClassification, BertPreTrainedModel, LongformerConfig, LongformerModel, \ 3 | LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP 4 | from transformers.modeling_roberta import RobertaClassificationHead 5 | 6 | 7 | class LongformerForMultiLabelSequenceClassification(BertPreTrainedModel): 8 | config_class = LongformerConfig 9 | pretrained_model_archive_map = LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP 10 | base_model_prefix = "longformer" 11 | 12 | def __init__(self, config): 13 | super().__init__(config) 14 | self.num_labels = config.num_labels 15 | 16 | self.roberta = LongformerModel(config) 17 | self.classifier = RobertaClassificationHead(config) 18 | 19 | def forward( 20 | self, 21 | input_ids=None, 22 | attention_mask=None, 23 | token_type_ids=None, 24 | position_ids=None, 25 | head_mask=None, 26 | inputs_embeds=None, 27 | labels=None, 28 | ): 29 | outputs = self.roberta( 30 | input_ids, 31 | attention_mask=attention_mask, 32 | token_type_ids=token_type_ids, 33 | position_ids=position_ids, 34 | head_mask=head_mask, 35 | inputs_embeds=inputs_embeds, 36 | ) 37 | sequence_output = outputs[0] 38 | logits = self.classifier(sequence_output) 39 | 40 | outputs = (logits,) + outputs[2:] 41 | if labels is not None: 42 | # Single-label classification (as in RobertaForSequenceClassification) 43 | # loss_fct = CrossEntropyLoss() 44 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 45 | loss_fct = BCEWithLogitsLoss() 46 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 47 | 48 | outputs = (loss,) + outputs 49 | 50 | return outputs # (loss), logits, (hidden_states), (attentions) -------------------------------------------------------------------------------- /tests/test_experiment.py: 
-------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from torch.nn import BCELoss 4 | from tqdm import tqdm 5 | 6 | from experiments import Experiment 7 | from experiments.utils import flatten 8 | from models.transformers import JointBERT 9 | from wiki.data_helpers import JointBERTWikiDataHelper 10 | 11 | 12 | class ExperimentTest(TestCase): 13 | def test_cls_init(self): 14 | 15 | exp = Experiment( 16 | # random_seed=0, 17 | epochs=1, 18 | model_cls='models.JointBERT', 19 | model_params={ 20 | 'bert_model_path': '/Volumes/data/repo/data/bert/bert-base-cased', 21 | 'labels_count': 3, 22 | }, 23 | loss_func_cls='torch.nn.BCELoss', # loss, 24 | model_output_to_loss_input=lambda ys: ys.double(), 25 | data_helper_cls='wiki.data_helpers.JointBERTDataHelper', 26 | data_helper_params={ 27 | 'wiki_relations_path': '../wiki/relations.csv', 28 | 'wiki_articles_path': '../wiki/docs.pickle', 29 | 'labels': ['employer', 'country_of_citizenship'], 30 | # 'employer' # 'capital' # 'country_of_citizenship' #'educated_at' # 'opposite_of' 31 | 'label_col': 'relation_name', 32 | 'negative_sampling_ratio': 1., 33 | 'train_test_split': 0.7, 34 | 'max_seq_length': 512, 35 | 'train_batch_size': 4, 36 | 'test_batch_size': 4, 37 | 'bert_model_path': '/Volumes/data/repo/data/bert/bert-base-cased', 38 | # 'bert_tokenizer_cls': '', 39 | 'bert_tokenizer_params': { 40 | 'do_lower_case': False, 41 | }, 42 | 'df_limit': 3, 43 | }, 44 | tqdm_cls='tqdm.tqdm', 45 | output_dir='../output', 46 | ) 47 | 48 | assert isinstance(exp.model, JointBERT) 49 | assert isinstance(exp.data_helper, JointBERTWikiDataHelper) 50 | assert isinstance(exp.loss_func, BCELoss) 51 | assert tqdm == exp.tqdm_cls 52 | 53 | print(flatten(exp.to_dict())) 54 | 55 | exp.run() -------------------------------------------------------------------------------- /tests/test_rnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from unittest import TestCase 4 | 5 | import spacy 6 | import torch 7 | from transformers import AutoTokenizer, AutoConfig, RobertaTokenizer, RobertaForSequenceClassification 8 | 9 | from acl.__data_prep import load_parscit_file, get_citation_context 10 | from acl.trainer_utils import get_vectors_from_spacy_model 11 | from experiments.environment import get_env 12 | from models.auto_modeling import AutoModelForMultiLabelSequenceClassification 13 | from models.rnn import RNNForMultiLabelSequenceClassification 14 | from trainer_cli import ExperimentArguments 15 | 16 | 17 | class AutoModelingTest(TestCase): 18 | env = None 19 | 20 | def setUp(self) -> None: 21 | os.environ["WANDB_DISABLED"] = "true" 22 | os.environ["WANDB_WATCH"] = "false" 23 | self.env = get_env() 24 | 25 | self.cache_dir = '../data/transformers_cache' 26 | self.sample_text = ' '.join(['Hello world! 
'] * 10) 27 | self.num_labels = 5 28 | 29 | def test_rnn_model(self): 30 | 31 | # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name if tokenizer_name else model_name_or_path, cache_dir=self.cache_dir) 32 | # model_config = AutoConfig.from_pretrained(model_name_or_path, num_labels=self.num_labels, cache_dir=self.cache_dir) 33 | 34 | # model = AutoModelForMultiLabelSequenceClassification.from_pretrained(model_name_or_path, config=model_config, cache_dir=self.cache_dir) 35 | 36 | experiment_args = ExperimentArguments('s2_id', 'from_s2_id', 'to_s2_id', 1, 'acl_docrel') 37 | # label_classes 38 | spacy_nlp = spacy.load(experiment_args.spacy_model, disable=["tagger", "ner", "textcat"]) 39 | 40 | model = RNNForMultiLabelSequenceClassification( 41 | word_vectors=get_vectors_from_spacy_model(spacy_nlp), 42 | hidden_size=experiment_args.rnn_hidden_size, 43 | rnn=experiment_args.rnn_type, 44 | num_labels=self.num_labels, 45 | num_layers=experiment_args.rnn_num_layers, 46 | dropout=experiment_args.rnn_dropout, 47 | ) 48 | # 49 | # model.eval() 50 | # 51 | # encodings = tokenizer.batch_encode_plus([text], return_tensors='pt') 52 | # 53 | # return model(encodings['input_ids']), model, tokenizer 54 | -------------------------------------------------------------------------------- /sbin/cord19/baseline-lstm-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=baseline-rnn 10 | export EPOCHS=10 11 | export SPACY_MODEL=~/datasets/spacy/en_glove_6b_300d 12 | 13 | export MODEL_NAME=baseline-rnn__fasttext 14 | export SPACY_MODEL=~/datasets/spacy/en_fasttext_wiki-news-300d-1m 15 | 16 | export MODEL_NAME=baseline-rnn__fasttext__custom 17 | export SPACY_MODEL=./output/cord19_docrel/spacy/en_cord19_fasttext_300d 18 | 19 | export EPOCHS=10 20 | export CV_FOLD=1 21 | export LR=1e-5 22 | export RNN_NUM_LAYERS=2 23 | export RNN_HIDDEN_SIZE=100 24 | export RNN_DROPOUT=0.1 25 | 26 | # [1] Reimers, N. and Gurevych, I. 2016. Optimal Hyperparameters for Deep LSTM-Networks for Sequence Labeling Tasks. (2016). 27 | # - 28 | # A value of about 100 for each LSTM-network appears to be a good rule of thumb for the tested tasks 29 | # - 30 | # For tasks with small training sets appears a mini-batch size of 8 a robust selection. 31 | # For tasks with larger training sets appears a mini-batch size of 32 a robust selection. 32 | # - 33 | # Except for the reduced POS tagging task, two BiLSTM-layers produced the best re- sults. 34 | # - 35 | # Variational dropout was on all tasks superior to no-dropout or naive dropout. 36 | # Applying dropout along the vertical as well as the recurrent dimension achieved on all benchmark tasks the best result. 
37 | # 0.1 => same as in transformers 38 | 39 | for CV_FOLD in 1 2 3 4 40 | do 41 | python trainer_cli.py --cv_fold $CV_FOLD \ 42 | --output_dir $OUTPUT_DIR \ 43 | --model_name_or_path $MODEL_NAME \ 44 | --doc_id_col $DOC_ID_COL \ 45 | --doc_a_col $DOC_A_COL \ 46 | --doc_b_col $DOC_B_COL \ 47 | --nlp_dataset $NLP_DATASET \ 48 | --nlp_cache_dir $NLP_CACHE_DIR \ 49 | --cache_dir $CACHE_DIR \ 50 | --num_train_epochs $EPOCHS \ 51 | --seed $SEED \ 52 | --learning_rate $LR \ 53 | --logging_steps 500 \ 54 | --save_steps 0 \ 55 | --save_total_limit 3 \ 56 | --do_train \ 57 | --save_predictions \ 58 | --spacy_model $SPACY_MODEL \ 59 | --rnn_type lstm \ 60 | --rnn_num_layers $RNN_NUM_LAYERS \ 61 | --rnn_hidden_size $RNN_HIDDEN_SIZE \ 62 | --rnn_dropout $RNN_DROPOUT \ 63 | --per_gpu_eval_batch_size 32 \ 64 | --per_gpu_train_batch_size 8 \ 65 | --evaluate_during_training 66 | done 67 | 68 | export PYTHONUNBUFFERED="" 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python 3 | # Edit at https://www.gitignore.io/?templates=python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | # End of https://www.gitignore.io/api/python 131 | .idea/ 132 | -------------------------------------------------------------------------------- /sbin/acl/baseline-lstm-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=baseline-rnn 10 | export EPOCHS=10 11 | export SPACY_MODEL=~/datasets/spacy/en_glove_6b_300d 12 | 13 | export MODEL_NAME=baseline-rnn__fasttext 14 | export SPACY_MODEL=~/datasets/spacy/en_fasttext_wiki-news-300d-1m 15 | 16 | export MODEL_NAME=baseline-rnn__fasttext__custom 17 | export SPACY_MODEL=./output/acl_docrel/spacy/en_acl_fasttext_300d 18 | 19 | export EVAL_BATCH_SIZE=12 20 | export TRAIN_BATCH_SIZE=8 21 | 22 | export EPOCHS=10 23 | export CV_FOLD=1 24 | export LR=1e-5 25 | export RNN_NUM_LAYERS=2 26 | export RNN_HIDDEN_SIZE=100 27 | export RNN_DROPOUT=0.1 28 | 29 | # [1] Reimers, N. and Gurevych, I. 2016. Optimal Hyperparameters for Deep LSTM-Networks for Sequence Labeling Tasks. (2016). 30 | # - 31 | # A value of about 100 for each LSTM-network appears to be a good rule of thumb for the tested tasks 32 | # - 33 | # For tasks with small training sets appears a mini-batch size of 8 a robust selection. 34 | # For tasks with larger training sets appears a mini-batch size of 32 a robust selection. 35 | # - 36 | # Except for the reduced POS tagging task, two BiLSTM-layers produced the best re- sults. 37 | # - 38 | # Variational dropout was on all tasks superior to no-dropout or naive dropout. 39 | # Applying dropout along the vertical as well as the recurrent dimension achieved on all benchmark tasks the best result. 
40 | # 0.1 => same as in transformers 41 | 42 | for CV_FOLD in 1 2 3 4 43 | do 44 | python trainer_cli.py --cv_fold $CV_FOLD \ 45 | --output_dir $OUTPUT_DIR \ 46 | --model_name_or_path $MODEL_NAME \ 47 | --doc_id_col $DOC_ID_COL \ 48 | --doc_a_col $DOC_A_COL \ 49 | --doc_b_col $DOC_B_COL \ 50 | --nlp_dataset $NLP_DATASET \ 51 | --nlp_cache_dir $NLP_CACHE_DIR \ 52 | --cache_dir $CACHE_DIR \ 53 | --num_train_epochs $EPOCHS \ 54 | --seed $SEED \ 55 | --learning_rate $LR \ 56 | --logging_steps 500 \ 57 | --save_steps 0 \ 58 | --save_total_limit 3 \ 59 | --do_train \ 60 | --save_predictions \ 61 | --spacy_model $SPACY_MODEL \ 62 | --rnn_type lstm \ 63 | --rnn_num_layers $RNN_NUM_LAYERS \ 64 | --rnn_hidden_size $RNN_HIDDEN_SIZE \ 65 | --rnn_dropout $RNN_DROPOUT \ 66 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 67 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 68 | --evaluate_during_training 69 | done 70 | 71 | export PYTHONUNBUFFERED="" 72 | -------------------------------------------------------------------------------- /acl/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | def get_sorted_pair(a, b): 8 | # ensure citation pair is always in same order 9 | if a > b: 10 | return (a, b) 11 | else: 12 | return (b, a) 13 | 14 | 15 | def to_label(t, labels): 16 | if t in labels: 17 | return t 18 | else: 19 | return 'other' 20 | 21 | 22 | def normalize_title(t): 23 | if t: 24 | t = t.replace('.', ' ').replace('-', ' ').strip().lower() 25 | #t = re.sub(r'\W+', '', t) 26 | return t 27 | 28 | 29 | def normalize_section(title): 30 | if title: 31 | return re.sub(r'[\.0-9]', '', 32 | title. 33 | strip() \ 34 | .lower() \ 35 | .replace('conclusions', 'conclusion') \ 36 | .replace('methodology', 'method') \ 37 | .replace('methods', 'method') \ 38 | .replace('related works', 'related work') \ 39 | .replace('models', 'model') \ 40 | .replace('datasets', 'dataset') \ 41 | .replace('our ', '') \ 42 | .replace('evaluations', 'evaluation') \ 43 | .replace('experiments', 'experiment') 44 | ).strip() 45 | # .replace('conclusion and future perspectives', 'conclusion')\ 46 | # .replace('materials and methods', 'methods') 47 | 48 | 49 | def get_text_from_doc(doc) -> str: 50 | """ 51 | Build document text from title + abstract 52 | 53 | :param doc: S2 paper 54 | :return: Document text 55 | """ 56 | 57 | text = '' 58 | 59 | if 'title' in doc: 60 | text += doc['title'] 61 | 62 | if doc['abstract']: 63 | text += '\n' + doc['abstract'] 64 | 65 | return text 66 | 67 | 68 | def get_text_from_doc_id(doc_id: str, doc_index) -> str: 69 | """ 70 | 71 | Build document text from title + abstract 72 | 73 | :param doc_id: S2-id 74 | :param doc_index: S2-id to S2-paper data 75 | :return: Document text 76 | """ 77 | 78 | if doc_id in doc_index: 79 | return get_text_from_doc(doc_index[doc_id]) 80 | else: 81 | raise ValueError(f'Document not found in index: {doc_id}') 82 | 83 | 84 | # resolve 'and' titles and filter for out-of-index docs 85 | def resolve_and_sect_titles(items, doc_index=None): 86 | for from_s2_id, to_s2_id, sect_generic, sect_title, sect_marker in items: 87 | if doc_index and (from_s2_id not in doc_index or to_s2_id not in doc_index): 88 | # One of the IDs does not exist in document index 89 | continue 90 | 91 | sect_title = normalize_section(sect_title) 92 | 93 | if sect_title: 94 | # Resolve combined sections 95 | for t in sect_title.split(' and '): 96 | if t: 97 | yield (from_s2_id, 
to_s2_id, t, sect_marker) 98 | -------------------------------------------------------------------------------- /acl/preprocessing/scraper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import tqdm 3 | 4 | 5 | def scrape_s2(job_name, needed_ids, id2s2, id2s2_errors, id_prefix='', sleep=2.5, save_every_n=1000, offset=0): 6 | api_url = 'http://api.semanticscholar.org/v1/paper/' 7 | 8 | try: 9 | for i, needed_id in enumerate(tqdm(needed_ids, total=len(needed_ids))): 10 | if i < offset: # skip 11 | continue 12 | 13 | if needed_id in id2s2 or needed_id in id2s2_errors: 14 | continue 15 | 16 | res = requests.get(api_url + id_prefix + needed_id) 17 | 18 | if res.status_code == 200: 19 | try: 20 | id2s2[needed_id] = res.json() 21 | except ValueError as e: 22 | print(f'Error cannot parse JSON: {needed_id}') 23 | id2s2_errors[needed_id] = str(e) 24 | elif res.status_code == 429: 25 | print(f'Stop! Rate limit reached at: {i}') 26 | break 27 | elif res.status_code == 403: 28 | print(f'Stop! Forbidden / rate limit reached at: {i}') 29 | break 30 | elif res.status_code == 404: 31 | id2s2_errors[needed_id] = 404 32 | else: 33 | print(f'Error status: {res.status_code} - {needed_id}') 34 | id2s2_errors[needed_id] = res.text 35 | 36 | if save_every_n > 0 and (i % save_every_n) == 0 and i > 0: 37 | json.dump(id2s2, open(output_dir / f'{job_name}.json', 'w')) 38 | json.dump(id2s2_errors, open(output_dir / f'{job_name}_errors.json', 'w')) 39 | 40 | time.sleep(sleep) 41 | except KeyboardInterrupt: 42 | print('Aborting...') 43 | pass 44 | 45 | return id2s2, id2s2_errors 46 | 47 | 48 | 49 | def scrape_dblp(): 50 | missing_titles = set(filtered_cits.keys()).difference(set(title2dblp_hits.keys())) 51 | print(f'Missing titles: {len(missing_titles):,}') 52 | 53 | title2dblp_hits = {} 54 | dblp_errors = {} 55 | 56 | url = 'https://dblp.org/search/publ/api' 57 | 58 | for i, (title, idxs) in tqdm(enumerate(filtered_cits.items()), total=len(filtered_cits)): 59 | if title in title2dblp_hits or title in dblp_errors: 60 | continue 61 | 62 | q = title 63 | res = requests.get(url, params={'query': q, 'format': 'json'}) 64 | 65 | if res.status_code == 200: 66 | title2dblp_hits[title] = res.json()['result']['hits'] 67 | elif res.status_code == 422: 68 | dblp_errors[title] = res.status_code 69 | print(f'422: unprocesseble entity: {title}') 70 | else: 71 | # dblp_errors[title] = res.status_code 72 | print(f'Error: {res.text}') 73 | break 74 | 75 | time.sleep(0.5) 76 | 77 | # if i > 3: 78 | # break 79 | 80 | print(f'Scraped data for {len(title2dblp_hits)} papers from DBPL (errors: {len(dblp_errors)})') 81 | -------------------------------------------------------------------------------- /sbin/acl/1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | # models: albert-base-v1 bert-base-german-cased biobert-v1-1 longformer-base-4096.tar.gz pytorch 10 | # scibert-scivocab-uncased 11 | #albert-base-v2 bert-base-multilingual-cased distilbert-base-uncased 12 | # longformer-large-4096 roberta-base xlnet-base-cased 13 | #bert-base-cased bert-large-cased 14 | # longformer-base-4096 longformer-large-4096.tar.gz 15 | # roberta-large 16 | 17 | export MODEL_NAME=bert-base-cased 18 | export MODEL_NAME=bert-large-cased 19 | 20 | export MODEL_NAME=roberta-base 21 | export MODEL_NAME=longformer-base-4096 22 | export MODEL_NAME=xlnet-base 23 | 24 | export CV_FOLD=1 25 | 26 | # longformer 27 | export EVAL_BATCH_SIZE=4 28 | export TRAIN_BATCH_SIZE=4 29 | 30 | # large 31 | export EVAL_BATCH_SIZE=4 32 | export TRAIN_BATCH_SIZE=2 33 | 34 | # bert-base 35 | export EVAL_BATCH_SIZE=16 36 | export TRAIN_BATCH_SIZE=8 37 | 38 | # xlnet-base 39 | export EVAL_BATCH_SIZE=12 40 | export TRAIN_BATCH_SIZE=6 41 | 42 | 43 | 44 | for CV_FOLD in 1 2 3 4 45 | do 46 | python trainer_cli.py --cv_fold $CV_FOLD \ 47 | --output_dir $OUTPUT_DIR \ 48 | --model_name_or_path $MODEL_NAME \ 49 | --doc_id_col $DOC_ID_COL \ 50 | --doc_a_col $DOC_A_COL \ 51 | --doc_b_col $DOC_B_COL \ 52 | --nlp_dataset $NLP_DATASET \ 53 | --nlp_cache_dir $NLP_CACHE_DIR \ 54 | --cache_dir $CACHE_DIR \ 55 | --num_train_epochs $EPOCHS \ 56 | --seed $SEED \ 57 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 58 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 59 | --learning_rate $LR \ 60 | --logging_steps 100 \ 61 | --save_steps 0 \ 62 | --save_total_limit 3 \ 63 | --do_train \ 64 | --save_predictions 65 | done 66 | 67 | ###### 68 | 69 | export EVAL_BATCH_SIZE=16 70 | export TRAIN_BATCH_SIZE=8 71 | 72 | for MODEL_NAME in "bert-base-cased" "scibert-scivocab-uncased" "roberta-base" "xlnet-base-cased" "google/electra-base-discriminator" "deepset/covid_bert_base" 73 | do 74 | echo $MODEL_NAME 75 | export CV_FOLD=1 76 | python trainer_cli.py --cv_fold $CV_FOLD \ 77 | --output_dir $OUTPUT_DIR \ 78 | --model_name_or_path $MODEL_NAME \ 79 | --doc_id_col $DOC_ID_COL \ 80 | --doc_a_col $DOC_A_COL \ 81 | --doc_b_col $DOC_B_COL \ 82 | --nlp_dataset $NLP_DATASET \ 83 | --nlp_cache_dir $NLP_CACHE_DIR \ 84 | --cache_dir $CACHE_DIR \ 85 | --num_train_epochs $EPOCHS \ 86 | --seed $SEED \ 87 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 88 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 89 | --learning_rate $LR \ 90 | --logging_steps 100 \ 91 | --save_steps 0 \ 92 | --save_total_limit 3 \ 93 | --do_train \ 94 | --save_predictions 95 | done 96 | 97 | 98 | export PYTHONUNBUFFERED="" 99 | 100 | -------------------------------------------------------------------------------- /acl/preprocessing/citation_mapping.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import List 4 | 5 | 6 | def get_title2s2_id(id2s2__title_list: List): 7 | title2s2_id = {} 8 | 9 | for id2s2, id2title in id2s2__title_list: 10 | title2s2_id.update({id2title[_id]: s2['paperId'] for _id, s2 in id2s2.items() if _id in id2title}) 11 | 12 | return title2s2_id 13 | 14 | 15 | def get_dblp_titles(fp): 16 | """ 17 | 18 | 19 | :param fp: Path to DBLP scraper results (JSON) 20 | :return: acl_id2title, doi2title, arxiv2title 21 | """ 22 | title2dblp_hits = json.load(open(fp, 'r')) 23 | 24 | title2doi = {} 25 | doi2title = {} 26 | 27 | title2arxiv = {} 28 | arxiv2title = {} 29 | 30 | title2acl_id = {} 31 | acl_id2title = {} 32 | 33 | for i, (title, hits) in 
enumerate(title2dblp_hits.items()): 34 | if hits['@total'] == '1': # igore multi matches 35 | hit = hits['hit'][0] 36 | 37 | if 'doi' in hit['info']: 38 | doi = hit['info']['doi'].replace('https://doi.org/', '') 39 | 40 | doi2title[doi] = title 41 | title2doi[title] = doi 42 | continue 43 | 44 | if 'ee' in hit['info']: 45 | ee = hit['info']['ee'] 46 | if 'aclweb.org/anthology/' in ee: 47 | match = re.search(r'anthology/([-a-zA-Z0-9]+)', ee) 48 | if match: 49 | acl_id = match.group(1) 50 | title2acl_id[title] = acl_id 51 | acl_id2title[acl_id] = title 52 | continue 53 | 54 | # print(acl_id) 55 | 56 | if 'arxiv.org' in ee: 57 | match = re.search(r'arxiv.org\/abs\/(.+)', ee) 58 | if match: 59 | arxiv_id = match.group(1) 60 | title2arxiv[title] = arxiv_id 61 | arxiv2title[arxiv_id] = title 62 | continue 63 | 64 | # print(arxiv_id) 65 | # other 66 | # print(hit['info']['ee']) 67 | 68 | # print(hits) 69 | # print('----') 70 | # if i > 100: 71 | # break 72 | 73 | found = len(doi2title) + len(arxiv2title) + len(acl_id2title) 74 | 75 | print(f'Found DOIs: {len(doi2title)} ({len(title2doi)})') 76 | print(f'Found arXiv: {len(arxiv2title)}') 77 | print(f'Found ACL: {len(acl_id2title)}') 78 | 79 | print(f'-- Found all: {found:,} / {len(title2dblp_hits):,}') 80 | 81 | return acl_id2title, doi2title, arxiv2title 82 | 83 | 84 | def get_s2_pairs_from_cits(cit_pairs, acl_id2s2): 85 | s2_pairs = [] 86 | not_found = [] 87 | 88 | for from_s2_id, from_acl_id, to_s2_id, sect_generic, sect_title, sect_marker in cit_pairs: 89 | if from_s2_id == None: 90 | if from_acl_id in acl_id2s2: 91 | from_s2_id = acl_id2s2[from_acl_id]['paperId'] 92 | else: 93 | not_found.append((from_acl_id, to_s2_id)) 94 | continue 95 | 96 | s2_pairs.append(( 97 | from_s2_id, 98 | to_s2_id, 99 | sect_generic, 100 | sect_title, 101 | sect_marker, 102 | ), ) 103 | 104 | return s2_pairs, not_found 105 | -------------------------------------------------------------------------------- /models/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import logging 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def get_concat(concat: str, embedding_dim: int): 11 | """ 12 | 13 | :param concat: Concatenation style 14 | :param embedding_dim: Size of inputs that are subject to concatenation 15 | :return: Function that performs concatenation, Size of concatenation output 16 | """ 17 | concat_func = None 18 | concat_dim = None 19 | 20 | if concat == 'simple': 21 | concat_func = lambda a, b: torch.cat((a, b), dim=1) 22 | concat_dim = 2 * embedding_dim 23 | elif concat == 'dif': 24 | # x = np.abs(a-b) 25 | concat_func = lambda a, b: (a - b).abs() 26 | concat_dim = 1 * embedding_dim 27 | elif concat == 'prod': 28 | # x = a * b 29 | concat_func = lambda a, b: a * b 30 | concat_dim = 1 * embedding_dim 31 | elif concat == 'dif-prod': 32 | # x = np.hstack((np.abs(a-b), a * b)) 33 | concat_func = lambda a, b: torch.cat(((a - b).abs(), a * b), dim=1) 34 | concat_dim = 2 * embedding_dim 35 | 36 | elif concat == '3d-prod': 37 | # x = np.hstack((a, b, a*b)) 38 | concat_func = lambda a, b: torch.cat((a, b, a * b), dim=1) 39 | concat_dim = 3 * embedding_dim 40 | 41 | elif concat == '3d-dif': 42 | # x = np.hstack((a, b, np.abs(a-b))) 43 | concat_func = lambda a, b: torch.cat((a, b, (a - b).abs()), dim=1) 44 | concat_dim = 3 * embedding_dim 45 | elif concat == '4d-prod-dif': 46 | # x = np.hstack((a, b, a*b, np.abs(a-b))) 47 | concat_func = lambda a, b: torch.cat((a, b, a 
* b, (a - b).abs()), dim=1) 48 | concat_dim = 4 * embedding_dim 49 | 50 | else: 51 | raise ValueError('Unsupported concat mode') 52 | 53 | logger.debug(f'concat_dim = {concat_dim} ({concat})') 54 | 55 | return concat_func, concat_dim 56 | 57 | 58 | def get_mlp(input_dim, output_dim, hidden_dim, hidden_layers_count=1, dropout_p=0., activation_cls=nn.ReLU): 59 | """ 60 | Generate a fully-connected layer (MLP) with dynamic input, output and hidden dimension, and hidden layer count. 61 | 62 | - when dropout_p > 0, then dropout is applied with given probability after the activation function. 63 | 64 | :param input_dim: 65 | :return: Sequential layer 66 | """ 67 | layers = [ 68 | # first layer 69 | nn.Linear(input_dim, hidden_dim), 70 | activation_cls(), 71 | ] 72 | 73 | if dropout_p > 0: 74 | layers.append(nn.Dropout(dropout_p)) 75 | 76 | for layer_idx in range(1, hidden_layers_count): 77 | layers.append(nn.Linear(hidden_dim, hidden_dim)), 78 | layers.append(activation_cls()), 79 | 80 | if dropout_p > 0: 81 | layers.append(nn.Dropout(dropout_p)) 82 | 83 | # last layer 84 | layers.append(nn.Linear(hidden_dim, output_dim)) 85 | 86 | # TODO fill linear layers 87 | # nn.init.xavier_normal_(self.classifier.weight) 88 | # Fills the input Tensor with values according to the method described in “Understanding the difficulty of training deep feedforward neural networks” - Glorot, X. & Bengio, Y. (2010), using a normal distribution. 89 | # kaiming_normal_ 90 | # Fills the input Tensor with values according to the method described in “Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification” - He, K. et al. (2015), using a normal distribution. 91 | 92 | return nn.Sequential(*layers) -------------------------------------------------------------------------------- /tests/test_auto_modeling.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from unittest import TestCase 4 | 5 | import torch 6 | from transformers import AutoTokenizer, AutoConfig, RobertaTokenizer, RobertaForSequenceClassification 7 | 8 | from acl.__data_prep import load_parscit_file, get_citation_context 9 | from experiments.environment import get_env 10 | from models.auto_modeling import AutoModelForMultiLabelSequenceClassification 11 | 12 | 13 | class AutoModelingTest(TestCase): 14 | env = None 15 | 16 | def setUp(self) -> None: 17 | os.environ["WANDB_DISABLED"] = "true" 18 | os.environ["WANDB_WATCH"] = "false" 19 | self.env = get_env() 20 | 21 | self.cache_dir = '../data/transformers_cache' 22 | self.sample_text = ' '.join(['Hello world! 
'] * 10) 23 | self.num_labels = 5 24 | 25 | def forward_model(self, model_name_or_path, text, tokenizer_name=None): 26 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name if tokenizer_name else model_name_or_path, cache_dir=self.cache_dir) 27 | model_config = AutoConfig.from_pretrained(model_name_or_path, num_labels=self.num_labels, cache_dir=self.cache_dir) 28 | 29 | model = AutoModelForMultiLabelSequenceClassification.from_pretrained(model_name_or_path, config=model_config, cache_dir=self.cache_dir) 30 | 31 | model.eval() 32 | 33 | encodings = tokenizer.batch_encode_plus([text], return_tensors='pt') 34 | 35 | return model(encodings['input_ids']), model, tokenizer 36 | 37 | def test_bert_auto(self): 38 | model_name_or_path = self.env['bert_dir'] + '/bert-base-cased' 39 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text) 40 | 41 | print(out) 42 | print(type(model)) 43 | 44 | print(model.config.max_position_embeddings) 45 | 46 | def test_distilbert_auto(self): 47 | model_name_or_path = self.env['bert_dir'] + '/distilbert-base-uncased' 48 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text) 49 | 50 | print(out) 51 | print(type(model)) 52 | 53 | 54 | print(model.config.max_position_embeddings) 55 | 56 | 57 | def test_xlnet_auto(self): 58 | model_name_or_path = 'xlnet-base-cased' 59 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text) 60 | 61 | print(out) 62 | print(type(model)) 63 | 64 | self.assertEqual(self.num_labels, out[0].shape[1]) 65 | 66 | print(model.config.max_position_embeddings) 67 | print(tokenizer.model_max_length) 68 | 69 | def test_roberta_auto(self): 70 | model_name_or_path = 'roberta-base' 71 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text) 72 | 73 | print(out) 74 | print(type(model)) 75 | 76 | self.assertEqual(self.num_labels, out[0].shape[1]) 77 | 78 | print(model.roberta) 79 | print(model.config.max_position_embeddings) 80 | # model.save_pretrained(self.cache_dir) 81 | 82 | def test_roberta_manual(self): 83 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base', cache_dir=self.cache_dir) 84 | model = RobertaForSequenceClassification.from_pretrained('roberta-base', cache_dir=self.cache_dir) 85 | 86 | encodings = tokenizer.batch_encode_plus(['foo bar'], return_tensors='pt') 87 | 88 | print(model(encodings['input_ids'])) 89 | 90 | def test_longformer_auto(self): 91 | model_name_or_path = 'longformer-base-4096' 92 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text, 'roberta-base') 93 | 94 | print(out) 95 | print(type(model)) 96 | 97 | self.assertEqual(self.num_labels, out[0].shape[1]) 98 | 99 | print(model.roberta) 100 | print(model.config.max_position_embeddings) 101 | # model.save_pretrained(self.cache_dir) -------------------------------------------------------------------------------- /cord19/preprocessing/negative_sampling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import random 4 | from typing import List 5 | 6 | from fuzzywuzzy import fuzz 7 | 8 | from acl.utils import get_sorted_pair 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def get_authors(doi, doi2paper): 14 | if doi in doi2paper: 15 | paper = doi2paper[doi] 16 | 17 | if 'authors' in paper: 18 | last_names = [a.split()[-1].lower() for a in paper['authors']] 19 | return last_names 20 | else: 21 | return [] 22 | 23 | # elif doi in doi2paper: 24 | # 
paper = doi2paper[doi] 25 | # last_names = [a['last'].lower() for a in paper['metadata']['authors']] 26 | # return last_names 27 | else: 28 | raise ValueError(f'DOI not found: {doi}') 29 | 30 | 31 | def have_no_shared_authors(a_doi, b_doi, doi2paper): 32 | try: 33 | a_authors = set(get_authors(a_doi, doi2paper)) 34 | b_authors = set(get_authors(b_doi, doi2paper)) 35 | 36 | overlap = a_authors & b_authors 37 | 38 | if len(overlap) == 0: 39 | return True 40 | else: 41 | return False 42 | 43 | except ValueError: 44 | return False 45 | 46 | 47 | # has same venue 48 | def get_venue(doi, doi2paper): 49 | if doi in doi2paper: 50 | paper = doi2paper[doi] 51 | return str(paper['venue']).lower().strip() if 'venue' in paper else None 52 | else: 53 | raise ValueError(f'DOI not found: {doi}') 54 | 55 | 56 | def have_not_same_venue(a_doi, b_doi, doi2paper): 57 | a_venue = get_venue(a_doi, doi2paper) 58 | b_venue = get_venue(b_doi, doi2paper) 59 | 60 | if a_venue is None or b_venue is None or a_venue == "" or b_venue == "": 61 | # cant answer if venue is not set 62 | return False 63 | 64 | if fuzz.ratio(a_venue, b_venue) < 0.75: 65 | # fuzzy string matching score must be low! 66 | return True 67 | else: 68 | return False 69 | 70 | 71 | def get_negative_pairs(doi2paper, candidate_doc_ids: List[str], positive_pairs, cits_set, cocits_set, negative_ratio=0.5, negative_count=0): 72 | # negative_label = 'none' 73 | # negative_needed = 10000 #105492 # len(df) 74 | 75 | if negative_count > 0: 76 | negative_needed = negative_count 77 | else: 78 | negative_needed = math.ceil(len(positive_pairs) * negative_ratio) 79 | 80 | negative_rows = [] 81 | negative_pairs = set() 82 | tries = 0 83 | 84 | print(f'Negatives needed: {negative_needed:,} (ratio: {negative_ratio})') 85 | 86 | while len(negative_pairs) < negative_needed: 87 | a = random.choice(candidate_doc_ids) 88 | b = random.choice(candidate_doc_ids) 89 | 90 | if a == b: 91 | tries += 1 92 | continue 93 | 94 | pair = tuple((a, b)) 95 | 96 | if pair in negative_pairs: 97 | continue 98 | 99 | cit_pair = get_sorted_pair(a, b) 100 | 101 | if cit_pair in cits_set: 102 | tries += 1 103 | continue 104 | 105 | if cit_pair in cocits_set: 106 | tries += 1 107 | continue 108 | 109 | if not have_no_shared_authors(a, b, doi2paper): 110 | tries += 1 111 | continue 112 | 113 | if not have_not_same_venue(a, b, doi2paper): 114 | tries += 1 115 | continue 116 | 117 | # text = get_text_from_doi(a) 118 | # text_b = get_text_from_doi(b) 119 | # if text == '' or text_b == '': 120 | # continue 121 | 122 | # None of the criteria above matches... 
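        # i.e. the two DOIs are distinct, the pair has not been drawn before, it is neither
        # a citation nor a co-citation, the papers share no authors, and both venues are set
        # but do not match => keep the pair as a negative ('none') example.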
123 | negative_pairs.add(pair) 124 | # negative_rows.append(( 125 | # a, 126 | # b, 127 | # text, 128 | # text_b, 129 | # negative_label, 130 | # )) 131 | 132 | logger.info(f'Found {len(negative_pairs):,} negative rows (tried {tries:,} random samples)') 133 | 134 | return negative_pairs 135 | -------------------------------------------------------------------------------- /tests/test_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import pandas as pd 5 | from nlp import load_dataset 6 | 7 | from acl.trainer_utils import get_label_classes_from_nlp_dataset 8 | from datasets.acl_docrel.acl_docrel import get_train_split, get_test_split 9 | from experiments.environment import get_env 10 | 11 | 12 | class TrainerTest(TestCase): 13 | def __init__(self, *args, **kwargs): 14 | os.environ["WANDB_DISABLED"] = "true" 15 | os.environ["WANDB_WATCH"] = "false" 16 | 17 | super().__init__(*args, **kwargs) 18 | self.env = get_env() 19 | 20 | def test_label_classes(self): 21 | ds = "./datasets/acl_docrel/acl_docrel.py" 22 | 23 | ls = get_label_classes_from_nlp_dataset(ds) 24 | 25 | self.assertEqual(['introduction', 26 | 'related work', 27 | 'experiment', 28 | 'conclusion', 29 | 'results', 30 | 'background', 31 | 'discussion', 32 | 'evaluation', 33 | 'method', 34 | #'previous work', 35 | 'other', 36 | 'none'], ls) 37 | 38 | def test_load_dataset(self): 39 | pass 40 | 41 | def test_load_datasets_and_compare_label_class_distribution(self): 42 | cache_dir = '../data/nlp_cache' 43 | acl_ds = "../datasets/acl_docrel/acl_docrel.py" 44 | cv_fold = 1 45 | 46 | train_ds = load_dataset(acl_ds, 47 | name='relations', 48 | cache_dir=cache_dir, 49 | split=get_train_split(cv_fold)) 50 | test_ds = load_dataset(acl_ds, 51 | name='relations', 52 | cache_dir=cache_dir, 53 | split=get_test_split(cv_fold)) 54 | 55 | labels = [l for r in test_ds for l in r['label']] + [l for r in train_ds for l in r['label']] 56 | df = pd.DataFrame(labels, columns=['label']) 57 | 58 | print('ACL') 59 | print(df['label'].value_counts()) 60 | 61 | print('Pairs: %s '(len(train_ds) + len(test_ds))) 62 | 63 | 64 | ###### 65 | 66 | cord19_ds = "../datasets/cord19_docrel/cord19_docrel.py" 67 | train_ds = load_dataset(cord19_ds, 68 | name='relations', 69 | cache_dir=cache_dir, 70 | split=get_train_split(cv_fold)) 71 | test_ds = load_dataset(cord19_ds, 72 | name='relations', 73 | cache_dir=cache_dir, 74 | split=get_test_split(cv_fold)) 75 | 76 | labels = [l for r in test_ds for l in r['label']] + [l for r in train_ds for l in r['label']] 77 | df = pd.DataFrame(labels, columns=['label']) 78 | 79 | print('CORD19') 80 | print(df['label'].value_counts()) 81 | 82 | print('Pairs: %s ' (len(train_ds) + len(test_ds))) 83 | 84 | 85 | def test_dataset_splits(self): 86 | cache_dir = '../data/nlp_cache' 87 | 88 | for ds in ["../datasets/acl_docrel/acl_docrel.py", "../datasets/cord19_docrel/cord19_docrel.py"]: 89 | print(ds) 90 | 91 | train_count = 0 92 | test_count = 0 93 | 94 | for cv_fold in [1,2,3,4]: 95 | train_ds = load_dataset(ds, 96 | name='relations', 97 | cache_dir=cache_dir, 98 | split=get_train_split(cv_fold)) 99 | 100 | train_count += len(train_ds) 101 | 102 | test_ds = load_dataset(ds, 103 | name='relations', 104 | cache_dir=cache_dir, 105 | split=get_test_split(cv_fold)) 106 | test_count += len(test_ds) 107 | 108 | print('Train: %s' % (train_count / 4)) 109 | print('Test: %s' % (test_count / 4)) 110 | print() 
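
# Not part of the test suite: a minimal sketch of how the label lists stored in the
# 'relations' records (e.g. ['introduction', 'related work']) can be turned into
# multi-hot target vectors with the helper from experiments/utils.py. It assumes the
# ACL label set from datasets/acl_docrel/acl_docrel.py; for CORD-19 use its LABEL_CLASSES.
if __name__ == '__main__':
    from datasets.acl_docrel.acl_docrel import LABEL_CLASSES
    from experiments.utils import get_categorical_one_hot_encoding_from_str

    labels = ['introduction', 'related work']  # label list of a single document pair
    one_hot = get_categorical_one_hot_encoding_from_str(','.join(labels), LABEL_CLASSES)

    # -> numpy array of length len(LABEL_CLASSES) with 1.0 at the two label positions
    print(one_hot)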
-------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": {} 7 | }, 8 | "source": "# Demo for Aspect-oriented Similarity between Research Papers\n\n### Download models \u0026 install dependencies" 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "pycharm": {} 15 | }, 16 | "outputs": [], 17 | "source": "!git clone https://github.com/malteos/aspect-document-similarity.git repo\n%cd repo" 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "pycharm": {} 24 | }, 25 | "outputs": [], 26 | "source": "# Download models (scibert-scivocab-uncased, trained on first CV fold)\n!mkdir -p models/acl models/cord19\n!wget https://github.com/malteos/aspect-document-similarity/releases/download/1.0/acl_fold-1_scibert-scivocab-uncased.tar.gz \n!tar -xzvf acl_fold-1_scibert-scivocab-uncased.tar.gz\n!mv scibert-scivocab-uncased models/acl\n\n!wget https://github.com/malteos/aspect-document-similarity/releases/download/1.0/cord19_fold-1_scibert-scivocab-uncased.tar.gz\n!tar -xzvf cord19_fold-1_scibert-scivocab-uncased.tar.gz\n!mv scibert-scivocab-uncased models/cord19\n\n!wget -O models/cord19/scibert-scivocab-uncased/vocab.txt https://github.com/malteos/aspect-document-similarity/releases/download/1.0/scibert-vocab.txt\n!cp models/cord19/scibert-scivocab-uncased/vocab.txt models/acl/scibert-scivocab-uncased/vocab.txt" 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "pycharm": {} 33 | }, 34 | "outputs": [], 35 | "source": "# Install dependencies (for colab)\n!pip install requests transformers\u003d\u003d2.10.0\n\n# Install all dependencies\n#!pip install -r requirements.txt " 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 31, 40 | "metadata": { 41 | "pycharm": {} 42 | }, 43 | "outputs": [], 44 | "source": "from IPython.core.display import display, HTML\nfrom demo_utils import get_prediction" 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "pycharm": {} 50 | }, 51 | "source": "### Try your own papers\n\n1. Select model (either trained on ACL Anthology or CORD-19)\n2. Select input documents by paper IDs. 
\n\nAll IDs from [Semantic Scholar API](https://api.semanticscholar.org/) are supported (DOI, ArXiv ID, PubMed ID, ACL ID)\n" 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": { 57 | "pycharm": {} 58 | }, 59 | "outputs": [], 60 | "source": "selected_model \u003d \u0027acl\u0027 #@param [\"acl\", \"cord19\"]\nfrom_id \u003d \u002710.3115/1667583.1667640\u0027 #@param {type:\"string\"}\nto_id \u003d \u002710.1145/1367497.1367545\u0027 #@param {type:\"string\"}" 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 15, 65 | "metadata": { 66 | "pycharm": {} 67 | }, 68 | "outputs": [], 69 | "source": "# Perform predictions\npred_scores, pred_labels, from_doc, to_doc \u003d get_prediction(f\u0027./models/{selected_model}/scibert-scivocab-uncased\u0027, from_id, to_id)\n\ndisplay(HTML(f\u0027\u003ch3\u003eSeed: \u003ca href\u003d\"{from_doc[\"url\"]}\"\u003e{from_doc[\"title\"]}\u003c/a\u003e\u003c/h3\u003e\u0027))\ndisplay(HTML(f\u0027\u003ch3\u003eTarget: \u003ca href\u003d\"{to_doc[\"url\"]}\"\u003e{to_doc[\"title\"]}\u003c/a\u003e\u003c/h3\u003e\u0027))\ndisplay(HTML(f\u0027\u003ch4\u003ePredicted labels:\u003c/h4\u003e\u003cpre\u003e{\", \".join(pred_labels)}\u003c/pre\u003e\u0027))" 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.7.4" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } -------------------------------------------------------------------------------- /acl/preprocessing/negative_sampling.py: -------------------------------------------------------------------------------- 1 | # shared author 2 | import logging 3 | import math 4 | import random 5 | from collections import defaultdict 6 | from typing import List, Tuple, Set 7 | 8 | from fuzzywuzzy import fuzz 9 | 10 | from acl.utils import get_sorted_pair 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def get_cocitations(cits: List[Tuple[str, str]]) -> Set[Tuple[str, str]]: 16 | from_to_cits = defaultdict(set) 17 | 18 | for from_id, to_id in cits: 19 | from_to_cits[from_id].add(to_id) 20 | 21 | cocits_set = set() 22 | 23 | for from_cit, to_cits in from_to_cits.items(): 24 | for a in to_cits: 25 | for b in to_cits: 26 | cocits_set.add(get_sorted_pair(a, b)) 27 | 28 | logger.info(f'total co-citation count: {len(cocits_set):,}') 29 | 30 | return cocits_set 31 | 32 | 33 | def get_authors(doc_id, doc_index): 34 | if doc_id in doc_index: 35 | s2paper = doc_index[doc_id] 36 | last_names = [a['name'].split()[-1].lower() for a in s2paper['authors']] 37 | return last_names 38 | else: 39 | raise ValueError(f'Doc ID not found: {doc_id}') 40 | 41 | 42 | def have_no_shared_authors(a_id, b_id, doc_index): 43 | try: 44 | a_authors = set(get_authors(a_id, doc_index)) 45 | b_authors = set(get_authors(b_id, doc_index)) 46 | 47 | overlap = a_authors & b_authors 48 | 49 | if len(overlap) == 0: 50 | return True 51 | else: 52 | return False 53 | 54 | except ValueError: 55 | return False 56 | 57 | 58 | # has same venue 59 | def get_venue(doc_id, doc_index): 60 | if doc_id in doc_index: 61 | s2paper = doc_index[doc_id] 62 | return s2paper['venue'].lower().strip() 63 | else: 64 | raise ValueError(f'Doc ID not 
found: {doc_id}') 65 | 66 | 67 | def have_not_same_venue(a_id, b_id, doc_index): 68 | a_venue = get_venue(a_id, doc_index) 69 | b_venue = get_venue(b_id, doc_index) 70 | 71 | if a_venue == "" or b_venue == "": 72 | # cant answer if venue is not set 73 | return False 74 | 75 | if fuzz.ratio(a_venue, b_venue) < 0.75: 76 | # fuzzy string matching score must be low! 77 | return True 78 | else: 79 | return False 80 | 81 | 82 | def get_negative_pairs(s2_id2s2_paper, positive_pairs, cits_set, cocits_set, negative_ratio=0.5, negative_count=0): 83 | # negative_label = 'none' 84 | # negative_needed = 10000 #105492 # len(df) 85 | 86 | if negative_count > 0: 87 | negative_needed = negative_count 88 | else: 89 | negative_needed = math.ceil(len(positive_pairs) * negative_ratio) 90 | 91 | # negative_rows = [] 92 | negative_pairs = set() 93 | tries = 0 94 | all_doc_ids = list(s2_id2s2_paper.keys()) 95 | 96 | logger.info(f'Negatives needed: {negative_needed:,} (ratio: {negative_ratio}, fixed: {negative_count})') 97 | 98 | while len(negative_pairs) < negative_needed: 99 | a = random.choice(all_doc_ids) 100 | b = random.choice(all_doc_ids) 101 | 102 | if a == b: 103 | tries += 1 104 | continue 105 | 106 | if not have_no_shared_authors(a, b, s2_id2s2_paper): 107 | tries += 1 108 | continue 109 | 110 | if not have_not_same_venue(a, b, s2_id2s2_paper): 111 | tries += 1 112 | continue 113 | 114 | cit_pair = get_sorted_pair(a, b) 115 | if cit_pair in cits_set: 116 | tries += 1 117 | continue 118 | 119 | if cit_pair in cocits_set: 120 | tries += 1 121 | continue 122 | 123 | # text = get_text_from_doc_id(a, s2_id2s2_paper) 124 | # text_b = get_text_from_doc_id(b, s2_id2s2_paper) 125 | # 126 | # if text == '' or text_b == '': 127 | # continue 128 | 129 | pair = tuple((a, b)) 130 | 131 | if pair in negative_pairs: 132 | continue 133 | 134 | negative_pairs.add(pair) 135 | 136 | # negative_rows.append(( 137 | # text, 138 | # text_b, 139 | # negative_label, 140 | # )) 141 | 142 | logger.info(f'Found {len(negative_pairs):,} negative rows (tried {tries:,} random samples)') 143 | 144 | return negative_pairs -------------------------------------------------------------------------------- /cord19/preprocessing/cord19_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from typing import Dict 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def get_dict_value(d, key, default=None): 10 | if key in d: 11 | return d[key] 12 | else: 13 | return default 14 | 15 | 16 | def get_papers_and_citations_from_cord19(input_dir, id2meta): 17 | 18 | subsets = ['biorxiv_medrxiv', 'comm_use_subset', 'custom_license', 'noncomm_use_subset'] 19 | id2paper = {} 20 | 21 | has_doi = 0 22 | bib_count = 0 23 | cits = [] # from_doi, to_doi,
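    #  and the title of the section in which the citation marker occurs
    #  (see the cits.append(...) call below)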
24 | 25 | for ss in subsets: 26 | ss_dir = os.path.join(input_dir, ss) 27 | 28 | # iterate over files 29 | for fn in os.listdir(ss_dir): 30 | if not fn.endswith('.json'): 31 | continue 32 | 33 | fp = os.path.join(ss_dir, fn) 34 | with open(fp, 'r') as f: 35 | paper = json.load(f) 36 | 37 | if paper['paper_id'] not in id2meta: 38 | continue 39 | 40 | meta = id2meta[paper['paper_id']] 41 | 42 | paper['_meta'] = dict(meta) 43 | 44 | id2paper[paper['paper_id']] = paper 45 | 46 | # has valid DOI 47 | if isinstance(meta['doi'], str) and len(meta['doi']) > 10: 48 | # iterate over body text 49 | for paragraph in paper['body_text']: 50 | # iterate over each citation marker 51 | for cit in paragraph['cite_spans']: 52 | # find corresponding bib entry 53 | if cit['ref_id'] in paper['bib_entries']: 54 | bib = paper['bib_entries'][cit['ref_id']] 55 | bib_count += 1 56 | 57 | # only use bib entries with DOI 58 | if 'DOI' in bib['other_ids']: 59 | has_doi += 1 60 | 61 | for out_doi in bib['other_ids']['DOI']: 62 | cits.append(( 63 | meta['doi'], 64 | out_doi, 65 | paragraph['section'] 66 | )) 67 | # break 68 | # break 69 | 70 | logger.info(f'Paper count: {len(id2paper)}') 71 | logger.info(f'DOI exists: {has_doi / bib_count} (total: {bib_count}; doi: {has_doi})') 72 | logger.info(f'Citation pairs: {len(cits)}') 73 | 74 | return id2paper, cits 75 | 76 | 77 | def merge_cord19_and_s2_papers(id2paper, id2meta, doi2s2paper: Dict[str, Dict]) -> Dict[str, Dict]: 78 | """ 79 | 80 | Merge CORD-19 + S2 81 | 82 | :param id2meta: 83 | :param id2paper: 84 | :param doi2s2paper: 85 | :return: DOI => Paper 86 | """ 87 | doi2paper = {} 88 | 89 | for pid, cord_paper in id2paper.items(): 90 | if pid in id2meta: 91 | doi = id2meta[pid]['doi'] 92 | 93 | paper = { 94 | 'cord19_id': cord_paper['paper_id'], 95 | 's2_id': None, 96 | 'title': cord_paper['metadata']['title'], 97 | 'abstract': cord_paper['abstract'][0]['text'] if len(cord_paper['abstract']) == 1 else None, 98 | 'arxivId': None, 99 | 'doi': doi, 100 | 'venue': cord_paper['_meta']['journal'], 101 | 'year': int(cord_paper['_meta']['publish_time'].split('-')[0]), 102 | 'citations_count': None, 103 | 'references_count': len(cord_paper['bib_entries']), 104 | 'authors': [author['first'] + ' ' + author['last'] for author in cord_paper['metadata']['authors']], 105 | } 106 | doi2paper[doi] = paper 107 | 108 | for doi, s2 in doi2s2paper.items(): 109 | 110 | paper = { 111 | 'cord19_id': None, 112 | 's2_id': get_dict_value(s2, 'paperId'), 113 | 'title': get_dict_value(s2, 'title'), 114 | 'abstract': get_dict_value(s2, 'abstract'), 115 | 'doi': doi, 116 | 'arxivId': get_dict_value(s2, 'arxivId'), 117 | 'venue': get_dict_value(s2, 'venue'), 118 | 'year': get_dict_value(s2, 'year', 0), 119 | 'citations_count': len(get_dict_value(s2, 'citations', [])), 120 | 'references_count': len(get_dict_value(s2, 'references', [])), 121 | 'authors': [a['name'] for a in get_dict_value(s2, 'authors', []) if 'name' in a], 122 | } 123 | 124 | if doi in doi2paper: 125 | logger.warning(f'Overriding CORD19 with S2 paper data: {doi}') 126 | 127 | paper['cord19_id'] = doi2paper[doi]['cord19_id'] 128 | 129 | doi2paper[doi] = paper 130 | 131 | return doi2paper 132 | -------------------------------------------------------------------------------- /experiments/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from typing import List 3 | 4 | import numpy as np 5 | 6 | from torch.optim.lr_scheduler import LambdaLR 7 | 8 | 9 | def chunks(lst, 
n): 10 | """Yield successive n-sized chunks from lst.""" 11 | for i in range(0, len(lst), n): 12 | yield lst[i:i + n] 13 | 14 | 15 | def chunk(seq, num): 16 | avg = len(seq) / float(num) 17 | out = [] 18 | last = 0.0 19 | 20 | while last < len(seq): 21 | out.append(seq[int(last):int(last + avg)]) 22 | last += avg 23 | 24 | return out 25 | 26 | 27 | def flatten(d, parent_key='', sep='__'): 28 | items = [] 29 | for k, v in d.items(): 30 | new_key = parent_key + sep + k if parent_key else k 31 | if isinstance(v, collections.MutableMapping): 32 | items.extend(flatten(v, new_key, sep=sep).items()) 33 | else: 34 | items.append((new_key, v)) 35 | return dict(items) 36 | 37 | 38 | def unflatten(dictionary, sep='__'): 39 | out_dict = dict() 40 | for key, value in dictionary.items(): 41 | parts = key.split(sep) 42 | d = out_dict 43 | for part in parts[:-1]: 44 | if part not in d: 45 | d[part] = dict() 46 | d = d[part] 47 | d[parts[-1]] = value 48 | return out_dict 49 | 50 | 51 | 52 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 53 | """ Create a schedule with a learning rate that decreases linearly after 54 | linearly increasing during a warmup period. 55 | """ 56 | def lr_lambda(current_step): 57 | if current_step < num_warmup_steps: 58 | return float(current_step) / float(max(1, num_warmup_steps)) 59 | return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) 60 | 61 | return LambdaLR(optimizer, lr_lambda, last_epoch) 62 | 63 | 64 | def get_categorical_one_hot_encoding_from_str(label_str, label_classes: List[str], label_sep=',', return_list=False): 65 | """ 66 | Converts a single or list categorical labels into a one-hot-encoded vectors. 67 | (multi-label multi-class classification) 68 | 69 | good,bad => [1.0, 1.0] 70 | good => [1.0, 0.0] 71 | 72 | [good,bad], [good] => [ [1.0, 1.0], [1.0, 0.0] ] 73 | 74 | :param return_list: 75 | :param label_str: 76 | :param label_classes: Label classes 77 | :param label_sep: Label separator (default: ,) 78 | :return: np.array or List 79 | """ 80 | if isinstance(label_str, List): 81 | # If input is a list of strings 82 | ls = [get_categorical_one_hot_encoding_from_str(ls, label_classes, label_sep, return_list) for ls in label_str] 83 | 84 | if return_list: 85 | return ls 86 | else: 87 | return np.array(ls) 88 | 89 | numerical_labels = [label_classes.index(l) for l in label_str.split(label_sep)] 90 | one_hot = np.zeros(len(label_classes)) 91 | 92 | one_hot[numerical_labels] = 1. 93 | 94 | if return_list: 95 | return one_hot.tolist() 96 | else: 97 | return one_hot 98 | 99 | 100 | def get_categorical_one_hot_encoding_from_str(label_str, label_classes: List[str], label_sep=',', return_list=False): 101 | """ 102 | Converts a single or list categorical labels into a one-hot-encoded vectors. 
103 | (multi-label multi-class classification) 104 | 105 | good,bad => [1.0, 1.0] 106 | good => [1.0, 0.0] 107 | 108 | [good,bad], [good] => [ [1.0, 1.0], [1.0, 0.0] ] 109 | 110 | :param return_list: 111 | :param label_str: 112 | :param label_classes: Label classes 113 | :param label_sep: Label separator (default: ,) 114 | :return: np.array or List 115 | """ 116 | if isinstance(label_str, List): 117 | # If input is a list of strings 118 | ls = [get_categorical_one_hot_encoding(ls, label_classes, label_sep, return_list) for ls in label_str] 119 | 120 | if return_list: 121 | return ls 122 | else: 123 | return np.array(ls) 124 | 125 | numerical_labels = [label_classes.index(l) for l in label_str.split(label_sep)] 126 | one_hot = np.zeros(len(label_classes)) 127 | 128 | one_hot[numerical_labels] = 1. 129 | 130 | if return_list: 131 | return one_hot.tolist() 132 | else: 133 | return one_hot 134 | 135 | 136 | def highlight_max(data, color='green'): 137 | ''' 138 | highlight the maximum in a Series or DataFrame 139 | ''' 140 | attr = 'background-color: {}'.format(color) 141 | #remove % and cast to float 142 | data = data.replace('%','', regex=True).astype(float) 143 | if data.ndim == 1: # Series from .apply(axis=0) or axis=1 144 | is_max = data == data.max() 145 | return [attr if v else '' for v in is_max] 146 | else: # from .apply(axis=None) 147 | is_max = data == data.max().max() 148 | return pd.DataFrame(np.where(is_max, attr, ''), 149 | index=data.index, columns=data.columns) -------------------------------------------------------------------------------- /models/rnn.py: -------------------------------------------------------------------------------- 1 | from torch.nn import BCEWithLogitsLoss 2 | from transformers import PreTrainedModel 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence 8 | 9 | 10 | class RNNForMultiLabelSequenceClassification(nn.Module): 11 | """ 12 | 13 | LSTM/GRU with GloVe/FastText word embeddings 14 | 15 | forward() compatible with Tranformers Trainer 16 | 17 | """ 18 | 19 | def __init__(self, word_vectors, hidden_size=50, num_labels=2, num_layers=1, dropout=0., rnn='lstm'): 20 | super(RNNForMultiLabelSequenceClassification, self).__init__() 21 | 22 | self.num_labels = num_labels 23 | self.word_hidden_state = torch.zeros(2, 1, hidden_size) 24 | 25 | self.word_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 2 * hidden_size)) 26 | self.word_bias = nn.Parameter(torch.Tensor(1, 2 * hidden_size)) 27 | self.context_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 1)) 28 | 29 | self.lookup = nn.Embedding(num_embeddings=word_vectors.shape[0], embedding_dim=word_vectors.shape[1]) 30 | 31 | if rnn == 'gru': 32 | self.rnn = nn.GRU( 33 | input_size=word_vectors.shape[1], 34 | hidden_size=hidden_size, 35 | num_layers=num_layers, 36 | bidirectional=True, 37 | batch_first=True, 38 | dropout=dropout, 39 | ) 40 | elif rnn == 'lstm': 41 | self.rnn = nn.LSTM( 42 | input_size=word_vectors.shape[1], 43 | hidden_size=hidden_size, 44 | num_layers=num_layers, 45 | bidirectional=True, 46 | batch_first=True, 47 | dropout=dropout, 48 | ) 49 | else: 50 | raise ValueError('Unknown RNN type') 51 | 52 | self._create_weights(mean=0.0, std=0.05) 53 | 54 | self.word_attention = nn.Linear(2 * hidden_size, 50) 55 | 56 | # Word context vector to take dot-product with 57 | self.word_context_vector = nn.Linear(50, 1, bias=False) 58 | 59 | self.classifier = nn.Linear(2 * 
hidden_size, self.num_labels) 60 | 61 | # torch.set_printoptions(threshold=10000) 62 | 63 | def _create_weights(self, mean=0.0, std=0.05): 64 | self.word_weight.data.normal_(mean, std) 65 | self.context_weight.data.normal_(mean, std) 66 | 67 | def forward( 68 | self, 69 | input_ids=None, 70 | attention_mask=None, 71 | token_type_ids=None, 72 | position_ids=None, 73 | head_mask=None, 74 | inputs_embeds=None, 75 | labels=None, 76 | ): 77 | 78 | word_ids_lengths = attention_mask.sum(axis=1) 79 | word_embeddings = self.lookup(input_ids) 80 | 81 | packed_word_embeddings = pack_padded_sequence(word_embeddings, 82 | lengths=word_ids_lengths, 83 | batch_first=True, 84 | enforce_sorted=False) 85 | 86 | words_representation, _ = self.rnn(packed_word_embeddings) 87 | # This implementation uses the feature sentence_embeddings. Paper uses hidden state 88 | word_attention = self.word_attention(words_representation.data) 89 | word_attention = torch.tanh(word_attention) 90 | 91 | # Take the dot-product of the attention vectors with the context vector (i.e. parameter of linear layer) 92 | word_attention = self.word_context_vector(word_attention).squeeze(1) # (n_words) 93 | 94 | # Compute softmax over the dot-product manually 95 | # Manually because they have to be computed only over words in the same sentence 96 | 97 | # First, take the exponent 98 | max_value = word_attention.max() # scalar, for numerical stability during exponent calculation 99 | word_attention = torch.exp(word_attention - max_value) # (n_words) 100 | 101 | # Re-arrange as sentences by re-padding with 0s (WORDS -> SENTENCES) 102 | word_attention, _ = pad_packed_sequence(PackedSequence(data=word_attention, 103 | batch_sizes=words_representation.batch_sizes, 104 | sorted_indices=words_representation.sorted_indices, 105 | unsorted_indices=words_representation.unsorted_indices), 106 | batch_first=True) # (n_sentences, max(words_per_sentence)) 107 | 108 | # Calculate softmax values as now words are arranged in their respective sentences 109 | word_alphas = word_attention / torch.sum(word_attention, dim=1, 110 | keepdim=True) # (n_sentences, max(words_per_sentence)) 111 | 112 | # Similarly re-arrange word-level RNN outputs as sentences by re-padding with 0s (WORDS -> SENTENCES) 113 | sentences, _ = pad_packed_sequence(words_representation, 114 | batch_first=True) # (n_sentences, max(words_per_sentence), 2 * word_rnn_size) 115 | 116 | # Find sentence embeddings 117 | sentences = sentences * word_alphas.unsqueeze(2) # (n_sentences, max(words_per_sentence), 2 * word_rnn_size) 118 | 119 | # gets the representation for the sentence 120 | sentences = sentences.sum(dim=1) # (n_sentences) 121 | 122 | logits = self.classifier(sentences) 123 | 124 | outputs = (logits, sentences) 125 | 126 | if labels is not None: 127 | loss_fct = nn.BCEWithLogitsLoss() 128 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 129 | 130 | outputs = (loss,) + outputs 131 | 132 | return outputs 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Aspect-based Document Similarity for Research Papers 2 | 3 | Implementation, trained models and result data for the paper **Aspect-based Document Similarity for Research Papers** [(PDF on Arxiv)](https://arxiv.org/abs/2010.06395). 
4 | The supplemental material is available for download under [GitHub Releases](https://github.com/malteos/aspect-document-similarity/releases) or [Zenodo](http://doi.org/10.5281/zenodo.4087898). 5 | 6 | - Datasets are compatible with 🤗 [Huggingface NLP library](https://github.com/huggingface/nlp) (now known as [datasets](https://github.com/huggingface/datasets)). 7 | - Models are available on 🤗 [Huggingface Transformers models](https://huggingface.co/malteos). 8 | 9 | 10 | 11 | ## Demo 12 | 13 | 14 | 15 | You can try our trained models directly on Google Colab on all papers available on Semantic Scholar (via DOI, ArXiv ID, ACL ID, PubMed ID): 16 | 17 | Open [`demo.ipynb`](demo.ipynb) from this repository in Google Colab to run the demo. 18 | 19 | ## Requirements 20 | 21 | - Python 3.7 22 | - CUDA GPU (for Transformers) 23 | 24 | Datasets: 25 | - [ACL Anthology Reference Corpus (ACL ARC)](http://acl-arc.comp.nus.edu.sg/) 26 | - [COVID-19 Open Research Dataset (CORD 19)](https://www.semanticscholar.org/cord19) 27 | 28 | ## Installation 29 | 30 | Create a new virtual environment for Python 3.7 with Conda: 31 | 32 | ```bash 33 | conda create -n paper python=3.7 34 | conda activate paper 35 | ``` 36 | 37 | Clone repository and install dependencies: 38 | ```bash 39 | git clone https://github.com/malteos/aspect-document-similarity.git repo 40 | cd repo 41 | pip install -r requirements.txt 42 | ``` 43 | 44 | ## Experiments 45 | 46 | To reproduce our experiments, follow these steps (if you just want to train and test the models, skip the first two steps): 47 | 48 | ### Prepare 49 | 50 | ```bash 51 | export DIR=./output 52 | 53 | # ACL Anthology 54 | # Get parscit files from: https://acl-arc.comp.nus.edu.sg/archives/acl-arc-160301-parscit/ 55 | sh ./sbin/download_parsecit.sh 56 | 57 | # CORD-19 58 | wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-03-13.tar.gz 59 | 60 | # Get additional data (collected from Semantic Scholar API) 61 | wget https://github.com/malteos/aspect-document-similarity/releases/download/1.0/acl_s2.tar 62 | wget https://github.com/malteos/aspect-document-similarity/releases/download/1.0/cord19_s2.tar 63 | ``` 64 | 65 | ### Build datasets 66 | 67 | ```bash 68 | # ACL 69 | python -m acl.dataset save_dataset 70 | 71 | # CORD-19 72 | python -m cord19.dataset save_dataset 73 | 74 | ``` 75 | 76 | ### Use dataset 77 | 78 | The datasets are built on the Huggingface NLP library (soon available on the official repository): 79 | 80 | ```python 81 | from nlp import load_dataset 82 | 83 | # Training data for first CV split 84 | train_dataset = load_dataset( 85 | './datasets/cord19_docrel/cord19_docrel.py', 86 | name='relations', 87 | split='fold_1_train' 88 | ) 89 | ``` 90 | 91 | ### Use models 92 | 93 | ```python 94 | from models.auto_modeling import AutoModelForMultiLabelSequenceClassification 95 | 96 | # Load models with pretrained weights from Huggingface model hub 97 | acl_model = AutoModelForMultiLabelSequenceClassification.from_pretrained('malteos/aspect-acl-scibert-scivocab-uncased') 98 | cord19_model = AutoModelForMultiLabelSequenceClassification.from_pretrained('malteos/aspect-cord19-scibert-scivocab-uncased') 99 | 100 | # Use the models in standard Huggingface fashion ... 101 | # acl_model(input_ids, token_type_ids, ...) 102 | # cord19_model(input_ids, token_type_ids, ...) 
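
# A minimal end-to-end sketch (not the paper's evaluation code). Assumptions: the model
# head has one logit per entry in LABEL_CLASSES, the SciBERT tokenizer is used (the
# released models are SciBERT-based), and 0.5 is taken as the decision threshold.
# The forward call with only input_ids follows tests/test_auto_modeling.py.
import torch
from transformers import AutoTokenizer
from datasets.acl_docrel.acl_docrel import LABEL_CLASSES

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

doc_a = 'Title of the seed paper. Abstract of the seed paper ...'
doc_b = 'Title of the target paper. Abstract of the target paper ...'

# Encode the document pair (title + abstract of each paper) as a single sequence
enc = tokenizer.encode_plus(doc_a, doc_b, max_length=512, return_tensors='pt')

acl_model.eval()
logits = acl_model(enc['input_ids'])[0]        # shape: (1, num_labels)
probs = torch.sigmoid(logits)[0]               # independent per-label probabilities (BCE training)

predicted = [label for label, p in zip(LABEL_CLASSES, probs.tolist()) if p > 0.5]
print(predicted)                               # e.g. ['related work'] or ['none']

# For a full pipeline that fetches both papers by ID from the Semantic Scholar API,
# see get_prediction() in demo_utils.py (used in demo.ipynb).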
103 | 104 | ``` 105 | 106 | ### Train models 107 | 108 | All models are trained with the `trainer_cli.py` script: 109 | 110 | ```bash 111 | python trainer_cli.py --cv_fold $CV_FOLD \ 112 | --output_dir $OUTPUT_DIR \ 113 | --model_name_or_path $MODEL_NAME \ 114 | --doc_id_col $DOC_ID_COL \ 115 | --doc_a_col $DOC_A_COL \ 116 | --doc_b_col $DOC_B_COL \ 117 | --nlp_dataset $NLP_DATASET \ 118 | --nlp_cache_dir $NLP_CACHE_DIR \ 119 | --cache_dir $CACHE_DIR \ 120 | --num_train_epochs $EPOCHS \ 121 | --seed $SEED \ 122 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 123 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 124 | --learning_rate $LR \ 125 | --do_train \ 126 | --save_predictions 127 | ``` 128 | 129 | The exact parameters are available in `sbin/acl` and `sbin/cord19`. 130 | 131 | 132 | 133 | ### Evaluation 134 | 135 | The results can be computed and viewed with a Jupyter notebook. 136 | Figures and tables from the paper are part of the notebook. 137 | 138 | ```bash 139 | jupyter notebook evaluation.ipynb 140 | ``` 141 | 142 | Due to the space constraints some results could not be included in the paper. 143 | The full results for all methods and all test samples are available as 144 | CSV files under `Releases` 145 | (or via the Jupyter notebook). 146 | 147 | ## How to cite 148 | 149 | If you are using our code, please cite [our paper](https://arxiv.org/abs/2010.06395): 150 | 151 | ```bibtex 152 | @InProceedings{Ostendorff2020c, 153 | title = {Aspect-based Document Similarity for Research Papers}, 154 | booktitle = {Proceedings of the 28th International Conference on Computational Linguistics (COLING 2020)}, 155 | author = {Ostendorff, Malte and Ruas, Terry and Blume, Till and Gipp, Bela and Rehm, Georg}, 156 | year = {2020}, 157 | month = {Dec.}, 158 | } 159 | ``` 160 | 161 | ## License 162 | 163 | MIT 164 | 165 | 166 | -------------------------------------------------------------------------------- /datasets/acl_docrel/acl_docrel.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import json 4 | import os 5 | 6 | import nlp 7 | from pyarrow import csv 8 | 9 | _DESCRIPTION = """Aspect-oriented Document Similarity from the ACL-Anthology dataset""" 10 | 11 | _HOMEPAGE = "https://github.com/malteos/aspect-document-similarity" 12 | 13 | _CITATION = """ 14 | @InProceedings{Ostendorff2020b, 15 | title = {Aspect-based Document Similarity for Research Papers}, 16 | booktitle = {Proceedings of the 28th International Conference on Computational Linguistics, COLING 2020}, 17 | author = {Ostendorff, Malte and Ruas, Terry and Blume, Till and Gipp, Bela and Rehm, Georg}, 18 | year = {2020}, 19 | month = {Dec.}, 20 | } 21 | """ 22 | 23 | LABEL_CLASSES = ['introduction', 24 | 'related work', 25 | 'experiment', 26 | 'conclusion', 27 | 'results', 28 | 'background', 29 | 'discussion', 30 | 'evaluation', 31 | 'method', 32 | #'previous work', 33 | 'other', 34 | 'none'] 35 | 36 | DATA_URL = "http://datasets.fiq.de/acl_docrel.tar.gz" 37 | 38 | def get_train_split(k): 39 | return nlp.Split(f'fold_{k}_train') 40 | 41 | 42 | def get_test_split(k): 43 | return nlp.Split(f'fold_{k}_test') 44 | 45 | 46 | class AclDocrelConfig(nlp.BuilderConfig): 47 | def __init__(self, features, data_url, **kwargs): 48 | super(AclDocrelConfig, self).__init__(version=nlp.Version("0.1.0"), **kwargs) 49 | self.features = features 50 | self.data_url = data_url 51 | 52 | 53 | class AclDocrel(nlp.GeneratorBasedBuilder): 54 | """ACL 
anthology document relation dataset.""" 55 | 56 | BUILDER_CONFIGS = [ 57 | AclDocrelConfig( 58 | name="docs", 59 | description="document text and meta data", 60 | features={ 61 | "s2_id": nlp.Value("string"), 62 | "title": nlp.Value("string"), 63 | "abstract": nlp.Value("string"), 64 | "arxivId": nlp.Value("string"), 65 | "doi": nlp.Value("string"), 66 | "venue": nlp.Value("string"), 67 | "year": nlp.Value("int16"), 68 | "citations_count": nlp.Value("int32"), 69 | "references_count": nlp.Value("int32"), 70 | "authors": nlp.Sequence(nlp.Value('string', id='author_name')), 71 | }, 72 | data_url=DATA_URL, 73 | ), 74 | AclDocrelConfig( 75 | name="relations", 76 | description=" relation data", 77 | features={ 78 | "from_s2_id": nlp.Value("string"), 79 | "to_s2_id": nlp.Value("string"), 80 | "label": nlp.Sequence(nlp.Value('string', id='label')) 81 | }, 82 | data_url=DATA_URL, 83 | ), 84 | ] 85 | 86 | def _info(self): 87 | return nlp.DatasetInfo( 88 | description=_DESCRIPTION + self.config.description, 89 | features=nlp.Features(self.config.features), 90 | homepage=_HOMEPAGE, 91 | citation=_CITATION, 92 | ) 93 | 94 | def _split_generators(self, dl_manager): 95 | arch_path = dl_manager.download_and_extract(self.config.data_url) 96 | 97 | if self.config.name == "relations": 98 | train_file = "train.csv" 99 | test_file = "test.csv" 100 | 101 | generators = [] 102 | 103 | for k in [1, 2, 3, 4]: 104 | folds_path = os.path.join(arch_path, 'folds', str(k)) 105 | generators += [ 106 | nlp.SplitGenerator( 107 | name=get_train_split(k), 108 | gen_kwargs={'filepath': os.path.join(folds_path, train_file)} 109 | ), 110 | nlp.SplitGenerator( 111 | name=get_test_split(k), 112 | gen_kwargs={'filepath': os.path.join(folds_path, test_file)} 113 | ) 114 | ] 115 | return generators 116 | 117 | elif self.config.name == "docs": 118 | # docs 119 | docs_file = os.path.join(arch_path, "docs.jsonl") 120 | 121 | return [ 122 | nlp.SplitGenerator(name=nlp.Split('docs'), gen_kwargs={"filepath": docs_file}), 123 | ] 124 | else: 125 | raise ValueError() 126 | 127 | @staticmethod 128 | def get_s2_value(s2, key, default=None): 129 | if key in s2: 130 | return s2[key] 131 | else: 132 | return default 133 | 134 | def _generate_examples(self, filepath): 135 | """Generate docs + rel examples.""" 136 | 137 | if self.config.name == "relations": 138 | df = csv.read_csv(filepath).to_pandas() 139 | 140 | for idx, row in df.iterrows(): 141 | yield idx, dict(from_s2_id=row['from_s2_id'], to_s2_id=row['to_s2_id'], label=row['label'].split(',')) 142 | 143 | elif self.config.name == "docs": 144 | 145 | with open(filepath, 'r') as f: 146 | for i, line in enumerate(f): 147 | s2 = json.loads(line) 148 | 149 | yield i, { 150 | 's2_id': self.get_s2_value(s2, 'paperId'), 151 | 'title': self.get_s2_value(s2, 'title'), 152 | 'abstract': self.get_s2_value(s2, 'abstract'), 153 | 'doi': self.get_s2_value(s2, 'doi'), 154 | 'arxivId': self.get_s2_value(s2, 'arxivId'), 155 | 'venue': self.get_s2_value(s2, 'venue'), 156 | 'year': self.get_s2_value(s2, 'year', 0), 157 | 'citations_count': len(self.get_s2_value(s2, 'citations', [])), 158 | 'references_count': len(self.get_s2_value(s2, 'references', [])), 159 | 'authors': [a['name'] for a in self.get_s2_value(s2, 'authors', []) if 'name' in a], 160 | } 161 | -------------------------------------------------------------------------------- /datasets/cord19_docrel/cord19_docrel.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, 
division, print_function 2 | 3 | import json 4 | import os 5 | 6 | import nlp 7 | from pyarrow import csv 8 | 9 | _DESCRIPTION = """Aspect-oriented Document Similarity from the CORD-19 dataset""" 10 | 11 | _HOMEPAGE = "https://github.com/malteos/aspect-document-similarity" 12 | 13 | _CITATION = """ 14 | @InProceedings{Ostendorff2020b, 15 | title = {Aspect-based Document Similarity for Research Papers}, 16 | booktitle = {Proceedings of the 28th International Conference on Computational Linguistics, COLING 2020}, 17 | author = {Ostendorff, Malte and Ruas, Terry and Blume, Till and Gipp, Bela and Rehm, Georg}, 18 | year = {2020}, 19 | month = {Dec.}, 20 | } 21 | """ 22 | 23 | LABEL_CLASSES = ['discussion', 24 | 'introduction', 25 | 'conclusion', 26 | 'results', 27 | 'methods', 28 | 'background', 29 | 'materials', 30 | 'virus', 31 | 'future work', 32 | 'other', 33 | 'none'] 34 | 35 | DATA_URL = "http://datasets.fiq.de/cord19_docrel.tar.gz" 36 | 37 | DOC_A_COL = "from_doi" 38 | 39 | DOC_B_COL = "to_doi" 40 | 41 | LABEL_COL = "label" 42 | 43 | 44 | def get_train_split(k): 45 | return nlp.Split(f'fold_{k}_train') 46 | 47 | 48 | def get_test_split(k): 49 | return nlp.Split(f'fold_{k}_test') 50 | 51 | 52 | class Cord19DocrelConfig(nlp.BuilderConfig): 53 | def __init__(self, features, data_url, **kwargs): 54 | super(Cord19DocrelConfig, self).__init__(version=nlp.Version("0.1.0"), **kwargs) 55 | self.features = features 56 | self.data_url = data_url 57 | 58 | 59 | class Cord19Docrel(nlp.GeneratorBasedBuilder): 60 | """CORD-19 document relation dataset.""" 61 | 62 | BUILDER_CONFIGS = [ 63 | Cord19DocrelConfig( 64 | name="docs", 65 | description="document text and meta data", 66 | features={ 67 | "doi": nlp.Value("string"), 68 | "cord19_id": nlp.Value("string"), 69 | "s2_id": nlp.Value("string"), 70 | "title": nlp.Value("string"), 71 | "abstract": nlp.Value("string"), 72 | "arxivId": nlp.Value("string"), 73 | "venue": nlp.Value("string"), 74 | "year": nlp.Value("int16"), 75 | "citations_count": nlp.Value("int32"), 76 | "references_count": nlp.Value("int32"), 77 | "authors": nlp.Sequence(nlp.Value('string', id='author_name')), 78 | }, 79 | data_url=DATA_URL, 80 | ), 81 | Cord19DocrelConfig( 82 | name="relations", 83 | description=" relation data", 84 | features={ 85 | DOC_A_COL: nlp.Value("string"), 86 | DOC_B_COL: nlp.Value("string"), 87 | LABEL_COL: nlp.Sequence(nlp.Value('string', id='label')) 88 | }, 89 | data_url=DATA_URL, 90 | ), 91 | ] 92 | 93 | def _info(self): 94 | return nlp.DatasetInfo( 95 | description=_DESCRIPTION + self.config.description, 96 | features=nlp.Features(self.config.features), 97 | homepage=_HOMEPAGE, 98 | citation=_CITATION, 99 | ) 100 | 101 | def _split_generators(self, dl_manager): 102 | arch_path = dl_manager.download_and_extract(self.config.data_url) 103 | 104 | if "relations" in self.config.name: 105 | train_file = "train.csv" 106 | test_file = "test.csv" 107 | 108 | generators = [] 109 | 110 | for k in [1, 2, 3, 4]: 111 | folds_path = os.path.join(arch_path, 'folds', str(k)) 112 | generators += [ 113 | nlp.SplitGenerator( 114 | name=get_train_split(k), 115 | gen_kwargs={'filepath': os.path.join(folds_path, train_file)} 116 | ), 117 | nlp.SplitGenerator( 118 | name=get_test_split(k), 119 | gen_kwargs={'filepath': os.path.join(folds_path, test_file)} 120 | ) 121 | ] 122 | return generators 123 | 124 | elif "docs" in self.config.name: 125 | # docs 126 | docs_file = os.path.join(arch_path, "docs.jsonl") 127 | 128 | return [ 129 | nlp.SplitGenerator(name=nlp.Split('docs'), 
gen_kwargs={"filepath": docs_file}), 130 | ] 131 | else: 132 | raise ValueError() 133 | 134 | @staticmethod 135 | def get_dict_value(d, key, default=None): 136 | if key in d: 137 | return d[key] 138 | else: 139 | return default 140 | 141 | def _generate_examples(self, filepath): 142 | """Generate docs + rel examples.""" 143 | 144 | if "relations" in self.config.name: 145 | df = csv.read_csv(filepath).to_pandas() 146 | 147 | for idx, row in df.iterrows(): 148 | yield idx, { 149 | DOC_A_COL: row[DOC_A_COL], 150 | DOC_B_COL: row[DOC_B_COL], 151 | LABEL_COL: row[LABEL_COL].split(','), 152 | } 153 | 154 | elif self.config.name == "docs": 155 | 156 | with open(filepath, 'r') as f: 157 | for i, line in enumerate(f): 158 | doc = json.loads(line) 159 | 160 | yield i, { 161 | 'doi': str(self.get_dict_value(doc, 'doi')), # cast to str otherwise float 162 | 'cord19_id': self.get_dict_value(doc, 'cord19_id'), 163 | 's2_id': self.get_dict_value(doc, 's2_id'), 164 | 'title': self.get_dict_value(doc, 'title'), 165 | 'abstract': self.get_dict_value(doc, 'abstract'), 166 | 'arxivId': self.get_dict_value(doc, 'arxivId'), 167 | 'venue': str(self.get_dict_value(doc, 'venue') or ''), 168 | 'year': int(self.get_dict_value(doc, 'year', 0) or 0), 169 | 'citations_count': int(self.get_dict_value(doc, 'citations_count', 0) or 0), 170 | 'references_count': int(self.get_dict_value(doc, 'references_count', 0) or 0), 171 | 'authors': self.get_dict_value(doc, 'authors', []), 172 | } 173 | 174 | -------------------------------------------------------------------------------- /acl/preprocessing/parsecit.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | import lxml 3 | import re 4 | import logging 5 | import os 6 | 7 | from lxml.etree import LxmlError 8 | from tqdm import tqdm 9 | 10 | from acl.utils import normalize_title 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def get_parsecit_files(parscit_dir): 16 | parscit_files = [] 17 | 18 | for d in os.listdir(parscit_dir): 19 | if os.path.isdir(os.path.join(parscit_dir, d)): # subdir 20 | for dd in os.listdir(os.path.join(parscit_dir, d)): # subdir 2 21 | if os.path.isdir(os.path.join(parscit_dir, d, dd)): 22 | for fn in os.listdir(os.path.join(parscit_dir, d, dd)): # files 23 | fp = os.path.join(parscit_dir, d, dd, fn) 24 | 25 | parscit_files.append((fn, fp)) 26 | # Total files: 14,714 (server 21,520) 27 | logger.info(f'Total files: {len(parscit_files):,}') 28 | 29 | return parscit_files 30 | 31 | 32 | def load_parscit_file(fp, include_contexts=False): 33 | # read from file path 34 | tree = etree.parse(fp) 35 | 36 | # sections 37 | algo_sect = tree.getroot().cssselect('algorithm[name="SectLabel"] > variant')[0] 38 | sects = [] 39 | sect = None 40 | 41 | for child in algo_sect.getchildren(): 42 | if child.tag == 'sectionHeader': 43 | sects.append({ 44 | 'title': child.text.strip(), 45 | 'generic': child.get('genericHeader'), 46 | 'text': '', 47 | }) 48 | 49 | elif child.tag == 'bodyText': 50 | # Create untitled section if none exist 51 | if len(sects) == 0: 52 | sects.append({ 53 | 'title': None, 54 | 'generic': None, 55 | 'text': '', 56 | }) 57 | 58 | # Append to last section 59 | sects[-1]['text'] += child.text.strip() 60 | 61 | # replace line breaks within sentence (could be improved) 62 | for i, sect in enumerate(sects): 63 | sects[i]['text'] = re.sub(r'([A-Za-z],;)([\r\n]+)([A-Za-z])', r'\1 \3', sect['text']) 64 | 65 | # Iterate over all valid citations 66 | cits = [] 67 | 68 | def 
get_text_with_cssselect(ele, selector, default=None, ith=0): 69 | s = ele.cssselect(selector) 70 | 71 | if len(s) > ith: 72 | return s[ith].text 73 | else: 74 | return default 75 | 76 | for cit_ele in tree.getroot().cssselect('algorithm[name="ParsCit"] > citationList > citation[valid="true"]'): 77 | try: 78 | 79 | title = get_text_with_cssselect(cit_ele, 'title') 80 | marker = get_text_with_cssselect(cit_ele, 'marker') 81 | date = get_text_with_cssselect(cit_ele, 'date') # str 82 | book_title = get_text_with_cssselect(cit_ele, 'booktitle') 83 | 84 | authors = [e.text for e in cit_ele.cssselect('authors > author')] 85 | 86 | if date and len(date) != 4: 87 | raise ValueError(f'Invalid date: {date}') 88 | cit = dict(title=title, authors=authors, marker=marker, date=date, book_title=book_title) 89 | 90 | if include_contexts: 91 | cit['contexts'] = cit_ele.cssselect('contexts > context') 92 | 93 | cits.append(cit) 94 | except IndexError as e: 95 | print(f'Cannot parse citation: {e}; {etree.tostring(cit_ele)[:100]}') 96 | 97 | # Extract all citation markers (for later cleaning from section text) 98 | markers = [] 99 | for cit_context in tree.getroot().cssselect( 100 | 'algorithm[name="ParsCit"] > citationList > citation > contexts > context'): 101 | if 'citStr' in cit_context.attrib: 102 | markers.append(cit_context.get('citStr')) 103 | 104 | return sects, cits, markers 105 | 106 | 107 | def get_citation_pairs_from_parscit(parscit_files, acl_id2s2, title2s2_id): 108 | # Load citations with s2 109 | error_files = [] 110 | acl_id2sects = {} 111 | acl_id2markers = {} 112 | cit_pairs = [] 113 | 114 | # Iterate over papers 115 | for i, (fn, fp) in enumerate(tqdm(parscit_files, total=len(parscit_files), desc='Reading Parscit files')): 116 | try: 117 | sects, cits, markers = load_parscit_file(fp, include_contexts=True) 118 | 119 | from_acl_id = '-'.join(fn.split('-', 2)[:2]) # ACL ID 120 | acl_id2sects[from_acl_id] = sects 121 | acl_id2markers[from_acl_id] = markers 122 | 123 | from_s2_id = acl_id2s2[from_acl_id]['paperId'] if from_acl_id in acl_id2s2 else None 124 | 125 | # if from_s2_id not in s2_id2s2_paper: 126 | # logger.warning(f'From paper not in index') 127 | # continue 128 | 129 | # Citations in paper 130 | for cit in cits: 131 | if cit['title'] is None or cit['book_title'] is None or cit['date'] is None: 132 | continue 133 | 134 | # Find citing section context 135 | sect_contexts = [] 136 | for context in cit['contexts']: 137 | for i, sect in enumerate(sects): # Try to find citation string in all sections 138 | if context.get('citStr') in sect['text']: 139 | # found! 
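# The matched context is recorded as a (generic header, section title, citation string)
# triple; in acl/dataset.py these citing-section values are later normalized via
# resolve_and_sect_titles() and mapped onto the fixed label classes
# (introduction, related work, experiment, ...).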
140 | sect_contexts.append((sect['generic'], sect['title'], context.get('citStr'))) 141 | 142 | # Skip citation if context is not available 143 | if len(sect_contexts) == 0: 144 | continue 145 | 146 | # Find to_s2_id 147 | cit_title = normalize_title(cit['title']) 148 | if cit_title in title2s2_id: 149 | to_s2_id = title2s2_id[cit_title] 150 | 151 | for context in sect_contexts: 152 | cit_pairs.append( 153 | # from_s2_id, (from_acl_id,) to_s2_id, sect_generic, sect_title, sect_marker 154 | ( 155 | from_s2_id, 156 | # from_acl_id, 157 | to_s2_id, 158 | ) + context 159 | ) 160 | else: 161 | # print('Not found:' + cit_title) 162 | pass 163 | 164 | except LxmlError as e: 165 | error_files.append((fn, fp)) 166 | # if i > 10: 167 | # break 168 | 169 | return cit_pairs, error_files 170 | -------------------------------------------------------------------------------- /acl/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | 7 | import fire 8 | import pandas as pd 9 | from sklearn.model_selection import StratifiedKFold 10 | from smart_open import open 11 | from tqdm import tqdm 12 | 13 | from acl.preprocessing.negative_sampling import get_cocitations, get_negative_pairs 14 | from acl.preprocessing.parsecit import get_parsecit_files, get_citation_pairs_from_parscit 15 | from acl.utils import resolve_and_sect_titles, to_label, get_sorted_pair, get_text_from_doc, \ 16 | normalize_title 17 | 18 | logging.basicConfig(level=logging.DEBUG) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def save_dataset(input_dir, parscit_dir, output_dir, cv_folds=4): 23 | """ 24 | 25 | Run with: $ python -m acl.dataset save_dataset 26 | 27 | Required parscit directory (from ACL-anthology): 28 | - Download and extract from: https://acl-arc.comp.nus.edu.sg/archives/acl-arc-160301-parscit/ 29 | - parscit/A/A00/A00-1000-parscit.130908.xml 30 | - ... 31 | 32 | Required input files (.json or .json.gz): 33 | - title2dblp_hits.json 34 | - acl_id2s2.json 35 | - arxiv2s2.json 36 | - doi2s2.json.gz 37 | 38 | Output structure 39 | - docs.jsonl: each line is a S2-paper 40 | - folds/1/ 41 | - folds/2/ 42 | - ... 
43 | - folds/k/train.csv: actual training samples 44 | - folds/k/test.csv 45 | 46 | Samples are provided as CSV files with the following columns: 47 | - doc_a: S2-id 48 | - doc_b: S2-id 49 | - label: List of labels (comma separated) 50 | 51 | After dataset creation use the following commands to compress and upload all files: 52 | 53 | cd 54 | tar -cvzf acl_docrel.tar.gz docs.jsonl folds/ 55 | curl --upload-file acl_docrel.tar.gz ftp://$FTP_LOGIN:$FTP_PASSWORD@$FTP_HOST/$FTP_DIR 56 | 57 | :param input_dir: S2 paper files 58 | :param output_dir: Dataset files written to this directory 59 | :param parscit_dir: 60 | :param cv_folds: 61 | :return: 62 | """ 63 | negative_label = 'none' 64 | min_text_length = 50 65 | negative_sampling_ratio = 0.5 66 | 67 | # Fixed labels 68 | """ 69 | introduction 20515 70 | related work 14883 71 | experiment 5749 72 | conclusion 1914 73 | results 1828 74 | background 1748 75 | discussion 1627 76 | evaluation 1386 77 | method 927 78 | (previous work 902) 79 | """ 80 | labels = [ 81 | 'introduction', 82 | 'related work', 83 | 'experiment', 84 | 'conclusion', 85 | 'results', 86 | 'background', 87 | 'discussion', 88 | 'evaluation', 89 | 'method', 90 | # Only top-9 label classes for v1.1 (equal to CORD-19) 91 | # 'previous work' 92 | ] 93 | 94 | doc_a_col = 'from_s2_id' 95 | doc_b_col = 'to_s2_id' 96 | label_col = 'label' 97 | 98 | # Convert dirs to Path if is string 99 | if isinstance(output_dir, str): 100 | output_dir = Path(output_dir) 101 | 102 | if isinstance(input_dir, str): 103 | input_dir = Path(input_dir) 104 | 105 | # Load paper data from various sources 106 | # acl_id2title, doi2title, arxiv2title = get_dblp_titles(input_dir / 'title2dblp_hits.json.gz') # TODO 107 | acl_id2s2 = json.load(open(input_dir / 'acl_id2s2.json.gz', 'r')) 108 | arxiv2s2 = json.load(open(input_dir / 'arxiv2s2.json.gz', 'r')) 109 | doi2s2 = json.load(open(input_dir / 'doi2s2.json.gz', 'r')) 110 | 111 | # Merge S2 data 112 | s2_id2s2_paper = {} 113 | s2_id2s2_paper.update({s2['paperId']: s2 for _id, s2 in acl_id2s2.items()}) 114 | s2_id2s2_paper.update({s2['paperId']: s2 for _id, s2 in arxiv2s2.items()}) 115 | s2_id2s2_paper.update({s2['paperId']: s2 for _id, s2 in doi2s2.items()}) 116 | 117 | # Filter by empty text 118 | s2_id2s2_paper = {s2_id: p for s2_id, p in s2_id2s2_paper.items() if len(get_text_from_doc(p)) >= min_text_length} 119 | 120 | # Title mapping from document index 121 | title2s2_id = {normalize_title(p['title']): s2_id for s2_id, p in s2_id2s2_paper.items()} 122 | 123 | parscit_files = get_parsecit_files(parscit_dir) 124 | cit_pairs, error_files = get_citation_pairs_from_parscit(parscit_files, acl_id2s2, title2s2_id) 125 | 126 | # s2_pairs, s2_pairs_not_found = get_s2_pairs_from_cits(cit_pairs, acl_id2s2) 127 | normalized_s2_pairs = resolve_and_sect_titles(cit_pairs, doc_index=s2_id2s2_paper) 128 | 129 | # Convert to dataframe 130 | df = pd.DataFrame(normalized_s2_pairs, columns=['from_s2_id', 'to_s2_id', 'citing_section', 'marker']) 131 | 132 | # Auto-determine top labels 133 | pre_label_col = 'citing_section' 134 | # top_sections = 10 135 | # labels = list(filter(lambda t: t, df[pre_label_col].value_counts()[:top_sections].keys())) 136 | 137 | # Remove duplicates 138 | logger.info(f'Before drop duplications: {len(df)}') 139 | 140 | df[label_col] = [to_label(t, labels) for t in df[pre_label_col]] 141 | df.drop_duplicates([doc_a_col, doc_b_col, label_col], keep='first', inplace=True) 142 | 143 | logger.info(f'After drop duplications: {len(df)}') 144 | 145 | # join 
multi-labels 146 | # df = df.groupby([doc_a_col, doc_b_col]).label.agg( 147 | # [('label_count', 'count'), (label_col, ','.join)]).reset_index() 148 | df = df.groupby([doc_a_col, doc_b_col]).label.agg( 149 | [(label_col, ','.join)]).reset_index() 150 | 151 | # Positive samples 152 | # pos_rows = [] 153 | # 154 | # for idx, r in df.iterrows(): 155 | # text = get_text_from_doc_id(r[doc_a_col], s2_id2s2_paper) 156 | # text_b = get_text_from_doc_id(r[doc_b_col], s2_id2s2_paper) 157 | # 158 | # # Filter out empty texts 159 | # if text != '' and text_b != '': 160 | # pos_rows.append((text, text_b, r[label_col])) 161 | cits_list = df[[doc_a_col, doc_b_col]].values.tolist() 162 | cits_set = {get_sorted_pair(from_id, to_id) for from_id, to_id in cits_list} 163 | 164 | logger.info(f'Total citation count: {len(cits_set):,}') 165 | 166 | # co cits 167 | cocits_set = get_cocitations(df[[doc_a_col, doc_b_col]].values.tolist()) 168 | 169 | # Negative sampling 170 | negative_pairs = get_negative_pairs(s2_id2s2_paper, cits_list, cits_set, cocits_set, 171 | negative_ratio=negative_sampling_ratio) 172 | 173 | # construct dataset frame 174 | logger.info('Constructing dataset data frame...') 175 | dataset = df[[doc_a_col, doc_b_col, label_col]].values.tolist()\ 176 | + list(map(lambda p: (p[0], p[1], negative_label), negative_pairs)) # positive + negative pairs 177 | 178 | dataset_df = pd.DataFrame(dataset, columns=[doc_a_col, doc_b_col, label_col]) 179 | 180 | # Verify 181 | missing_doc_ids = [doc_id for doc_id in dataset_df[doc_a_col].values if doc_id not in s2_id2s2_paper] 182 | missing_doc_ids += [doc_id for doc_id in dataset_df[doc_b_col].values if doc_id not in s2_id2s2_paper] 183 | 184 | if len(missing_doc_ids) > 0: 185 | raise ValueError(f'Document IDs are missing in index: {missing_doc_ids}') 186 | 187 | # Full training and test set 188 | logger.info(f'Creating {cv_folds}-Folds ') 189 | kf = StratifiedKFold(n_splits=cv_folds, random_state=0, shuffle=True) 190 | 191 | # Stratified K-Folds cross-validator 192 | for k, (train_index, test_index) in enumerate( 193 | kf.split(dataset_df.index.tolist(), dataset_df[label_col].values.tolist()), 1): 194 | fold_dir = os.path.join(output_dir, 'folds', str(k)) 195 | 196 | if not os.path.exists(fold_dir): 197 | logger.info(f'Create new fold dir: {fold_dir}') 198 | os.makedirs(fold_dir) 199 | 200 | split_train_df = dataset_df.iloc[train_index] 201 | split_test_df = dataset_df.iloc[test_index] 202 | 203 | logger.info(f'Total: {len(dataset_df):,}; Train: {len(split_train_df):,}; Test: {len(split_test_df):,}') 204 | 205 | split_train_df.to_csv(os.path.join(fold_dir, 'train.csv'), index=False) 206 | split_test_df.to_csv(os.path.join(fold_dir, 'test.csv'), index=False) 207 | 208 | # Write doc output 209 | with open(str(output_dir / 'docs.jsonl'), 'w') as f: 210 | for paper in tqdm(s2_id2s2_paper.values(), desc='Writing document data', total=len(s2_id2s2_paper)): 211 | f.write(json.dumps(paper) + '\n') 212 | 213 | logger.info('Done') 214 | 215 | 216 | if __name__ == '__main__': 217 | fire.Fire() 218 | sys.exit(0) 219 | -------------------------------------------------------------------------------- /cord19/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | from typing import Union 7 | 8 | import fire 9 | import pandas as pd 10 | from sklearn.model_selection import StratifiedKFold 11 | from smart_open import open 12 | from tqdm import 
tqdm 13 | 14 | from acl.preprocessing.negative_sampling import get_cocitations 15 | from acl.utils import get_sorted_pair, to_label 16 | from cord19.preprocessing.cord19_reader import get_papers_and_citations_from_cord19, merge_cord19_and_s2_papers 17 | from cord19.preprocessing.negative_sampling import get_negative_pairs 18 | from cord19.utils import normalize_section, resolve_and_sect_titles, get_text_from_doi 19 | 20 | logging.basicConfig(level=logging.DEBUG) 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def save_dataset(input_dir: Union[str, Path], output_dir: Union[str, Path], cv_folds: int = 4): 25 | """ 26 | 27 | Run with: $ python -m cord19.dataset save_dataset 28 | 29 | input_dir = '/home/mostendorff/datasets/cord-19/' 30 | output_dir = '/home/mostendorff/datasets/cord-19/dataset/' 31 | cv_folds = 4 32 | 33 | input_dir/metadata.csv 34 | input_dir/doi2paper.json.gz 35 | input_dir/ = ['biorxiv_medrxiv', 'comm_use_subset', 'custom_license', 'noncomm_use_subset'] 36 | 37 | output_dir/docs.jsonl 38 | output_dir/folds/1/train.csv 39 | output_dir/folds/1/test.csv 40 | 41 | tar -cvzf cord19_docrel.tar.gz docs.jsonl folds/ 42 | curl --upload-file cord19_docrel.tar.gz ftp://$FTP_LOGIN:$FTP_PASSWORD@ostendorff.org/cloud.ostendorff.org/static/ 43 | 44 | :param input_dir: Path to directory with input files 45 | :param output_dir: Output files are written to this dir 46 | :param cv_folds: Number of folds in k-fold cross validation 47 | """ 48 | label_col = 'label' 49 | negative_label = 'none' 50 | min_text_length = 50 51 | negative_sampling_ratio = 0.5 52 | 53 | doc_a_col = 'from_doi' 54 | doc_b_col = 'to_doi' 55 | 56 | labels = [ 57 | 'discussion', 58 | 'introduction', 59 | 'conclusion', 60 | 'results', 61 | 'methods', 62 | 'background', 63 | 'materials', 64 | 'virus', 65 | 'future work' 66 | ] 67 | 68 | # input_dir = os.path.join(env['datasets_dir'], 'cord-19') 69 | 70 | # Convert dirs to Path if is string 71 | if isinstance(output_dir, str): 72 | output_dir = Path(output_dir) 73 | 74 | if isinstance(input_dir, str): 75 | input_dir = Path(input_dir) 76 | 77 | # Read meta data 78 | meta_df = pd.read_csv(input_dir / 'metadata.csv', index_col=0, dtype={'doi': str, 'journal': str}) 79 | id2meta = {row['sha']: row for idx, row in meta_df.iterrows() if row['sha']} 80 | 81 | logger.info('Unique DOIs in meta data: %s' % (len(meta_df['doi'].unique()) / len(meta_df))) 82 | 83 | # Load paper data and citations from CORD-19 84 | id2paper, cits = get_papers_and_citations_from_cord19(input_dir, id2meta) 85 | 86 | # Load paper data from disk (scraped from S2) 87 | if os.path.exists(input_dir / 'doi2s2paper.json.gz'): 88 | with open(str(input_dir / 'doi2s2paper.json.gz'), 'r') as f: 89 | doi2s2paper = json.load(f) 90 | 91 | logger.info(f'Loaded {len(doi2s2paper):,} scraped papers from disk') 92 | else: 93 | logger.error('Cannot load S2 papers from: %s' % (input_dir / 'doi2paper.json.gz')) 94 | doi2s2paper = {} 95 | 96 | # Merge CORD-19 papers and S2 papers 97 | doi2paper = merge_cord19_and_s2_papers(id2paper, id2meta, doi2s2paper) 98 | 99 | logger.info(f'Loaded {len(doi2paper)} from CORD-19') 100 | 101 | all_dois = list(doi2paper.keys()) 102 | 103 | # DOIs with text 104 | doi2text = {} 105 | for doi in all_dois: 106 | text = get_text_from_doi(doi, doi2paper, raise_not_found_error=False) 107 | if len(text) > min_text_length: 108 | doi2text[doi] = text 109 | 110 | logger.info(f'Total DOIs: {len(all_dois):,}') 111 | logger.info(f'With text DOIs: {len(doi2text):,}') 112 | 113 | # Filter citations with 
existing DOI 114 | cits_with_doi = [c for c in cits if c[0] in doi2paper and c[1] in doi2paper] 115 | 116 | # CORD-19 only: Citations with DOI: 30655 (0.09342419246206499) 117 | # + S2: Citations with DOI: 170454 (0.5194756908148369) 118 | 119 | logger.info(f'Citations with DOI: {len(cits_with_doi)} ({len(cits_with_doi) / len(cits)})') 120 | 121 | missing_papers = [c[0] for c in cits if c[0] not in doi2paper] 122 | missing_papers += [c[1] for c in cits if c[1] not in doi2paper] 123 | 124 | logger.info(f'Missing paper data, but DOI: {len(missing_papers)}') 125 | 126 | unique_missing_papers = set(missing_papers) 127 | 128 | logger.info(f'Unique DOIs of missing papers: {len(unique_missing_papers)}') 129 | 130 | # resolve 'and' titles 131 | normalized_cits_with_doi = resolve_and_sect_titles(cits_with_doi) 132 | 133 | cits_df = pd.DataFrame(normalized_cits_with_doi, columns=[doc_a_col, doc_b_col, 'citing_section']) 134 | # cits_df 135 | 136 | logger.info(f'After normalization: {len(cits_df):,} (before: {len(cits_with_doi):,})') 137 | 138 | # top_sections = 10 139 | # labels = list(filter(lambda t: t, cits_df['citing_section'].value_counts()[:top_sections].keys())) 140 | 141 | # Remove duplicates 142 | cits_df[label_col] = [to_label(normalize_section(t), labels) for t in cits_df['citing_section']] 143 | cits_df.drop_duplicates([doc_a_col, doc_b_col, 'label'], keep='first', inplace=True) 144 | 145 | # Both documents must have text 146 | cits_df = cits_df[(cits_df[doc_a_col].isin(doi2text.keys())) & (cits_df[doc_b_col].isin(doi2text.keys()))] 147 | 148 | # Merge multi-labels 149 | df = cits_df.groupby([doc_a_col, doc_b_col]).label.agg([(label_col, ','.join)]).reset_index() 150 | 151 | # # Positive samples 152 | # pos_rows = [] 153 | # 154 | # for idx, r in df.iterrows(): 155 | # text = get_text_from_doi(r[doc_a_col], doi2s2paper, doi2paper) 156 | # text_b = get_text_from_doi(r[doc_b_col], doi2s2paper, doi2paper) 157 | # 158 | # # Filter out empty texts 159 | # if text != '' and text_b != '': 160 | # pos_rows.append((r[doc_a_col], r[doc_b_col], text, text_b, r[label_col])) 161 | 162 | cits_set = set([get_sorted_pair(from_doi, to_doi) for from_doi, to_doi, label in cits_with_doi]) 163 | 164 | logger.info(f'Total citation count: {len(cits_set):,}') 165 | 166 | cocits_set = get_cocitations([(from_doi, to_doi) for from_doi, to_doi, label in cits_with_doi]) 167 | 168 | # Negatives needed: 52,746 (ratio: 0.5) 169 | negative_pairs = get_negative_pairs( 170 | doi2paper, 171 | candidate_doc_ids=list(doi2text.keys()), 172 | positive_pairs=df[[doc_a_col, doc_b_col]].values.tolist(), 173 | cits_set=cits_set, 174 | cocits_set=cocits_set, 175 | negative_ratio=negative_sampling_ratio 176 | ) 177 | 178 | ### 179 | 180 | # construct dataset frame 181 | logger.info('Constructing dataset data frame...') 182 | dataset = df[[doc_a_col, doc_b_col, label_col]].values.tolist()\ 183 | + list(map(lambda p: (p[0], p[1], negative_label), negative_pairs)) # positive + negative pairs 184 | 185 | dataset_df = pd.DataFrame(dataset, columns=[doc_a_col, doc_b_col, label_col]) 186 | 187 | # TODO debug sample set? 
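# At this point every row of dataset_df is one document pair with its comma-joined
# (multi-)label, for example (illustrative values only):
#   from_doi, to_doi, label
#   10.1000/aaa, 10.1000/bbb, "discussion,results"
#   10.1000/aaa, 10.1000/ccc, "none"
# The StratifiedKFold split below uses these joined label strings as the
# stratification target, so each label combination keeps roughly the same
# proportion in every fold.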
188 | 189 | # Full training and test set 190 | logger.info(f'Creating {cv_folds}-Folds ') 191 | kf = StratifiedKFold(n_splits=cv_folds, random_state=0, shuffle=True) 192 | 193 | # Stratified K-Folds cross-validator 194 | for k, (train_index, test_index) in enumerate( 195 | kf.split(dataset_df.index.tolist(), dataset_df[label_col].values.tolist()), 1): 196 | fold_dir = os.path.join(output_dir, 'folds', str(k)) 197 | 198 | if not os.path.exists(fold_dir): 199 | logger.info(f'Create new fold dir: {fold_dir}') 200 | os.makedirs(fold_dir) 201 | 202 | split_train_df = dataset_df.iloc[train_index] 203 | split_test_df = dataset_df.iloc[test_index] 204 | 205 | logger.info(f'Total: {len(dataset_df):,}; Train: {len(split_train_df):,}; Test: {len(split_test_df):,}') 206 | 207 | split_train_df.to_csv(os.path.join(fold_dir, 'train.csv'), index=False) 208 | split_test_df.to_csv(os.path.join(fold_dir, 'test.csv'), index=False) 209 | 210 | # Write doc output 211 | with open(str(output_dir / 'docs.jsonl'), 'w') as f: 212 | for paper in tqdm(doi2paper.values(), desc='Writing document data', total=len(doi2paper)): 213 | f.write(json.dumps(paper) + '\n') 214 | 215 | logger.info('Done') 216 | 217 | 218 | if __name__ == '__main__': 219 | fire.Fire() 220 | sys.exit(0) 221 | -------------------------------------------------------------------------------- /sbin/acl/gpu1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | # models: albert-base-v1 bert-base-german-cased biobert-v1-1 longformer-base-4096.tar.gz pytorch 10 | # scibert-scivocab-uncased 11 | #albert-base-v2 bert-base-multilingual-cased distilbert-base-uncased 12 | # longformer-large-4096 roberta-base xlnet-base-cased 13 | #bert-base-cased bert-large-cased 14 | # longformer-base-4096 longformer-large-4096.tar.gz 15 | # roberta-large 16 | 17 | export EVAL_BATCH_SIZE=16 18 | export TRAIN_BATCH_SIZE=8 19 | 20 | export EVAL_BATCH_SIZE=16 21 | export TRAIN_BATCH_SIZE=8 22 | 23 | # serv 9212; gpu 0 24 | export CUDA_VISIBLE_DEVICES=0,1 25 | export MODEL_NAME="bert-base-cased" 26 | 27 | echo $MODEL_NAME 28 | for CV_FOLD in 1 2 3 4 29 | do 30 | echo $CV_FOLD 31 | python trainer_cli.py --cv_fold $CV_FOLD \ 32 | --output_dir $OUTPUT_DIR \ 33 | --model_name_or_path $MODEL_NAME \ 34 | --doc_id_col $DOC_ID_COL \ 35 | --doc_a_col $DOC_A_COL \ 36 | --doc_b_col $DOC_B_COL \ 37 | --nlp_dataset $NLP_DATASET \ 38 | --nlp_cache_dir $NLP_CACHE_DIR \ 39 | --cache_dir $CACHE_DIR \ 40 | --num_train_epochs $EPOCHS \ 41 | --seed $SEED \ 42 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 43 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 44 | --learning_rate $LR \ 45 | --logging_steps 100 \ 46 | --save_steps 0 \ 47 | --save_total_limit 3 \ 48 | --do_train \ 49 | --save_predictions 50 | done 51 | 52 | # serv 9212; gpu 1 53 | export CUDA_VISIBLE_DEVICES=1 54 | export MODEL_NAME="scibert-scivocab-uncased" 55 | echo $MODEL_NAME 56 | for CV_FOLD in 1 2 3 4 57 | do 58 | echo $CV_FOLD 59 | python trainer_cli.py --cv_fold $CV_FOLD \ 60 | --output_dir $OUTPUT_DIR \ 61 | --model_name_or_path $MODEL_NAME \ 62 | --doc_id_col $DOC_ID_COL \ 63 | --doc_a_col $DOC_A_COL \ 64 | --doc_b_col $DOC_B_COL \ 65 | --nlp_dataset $NLP_DATASET \ 66 | --nlp_cache_dir $NLP_CACHE_DIR \ 67 | --cache_dir $CACHE_DIR \ 68 | --num_train_epochs $EPOCHS \ 69 | --seed $SEED \ 70 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 71 | 
--per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 72 | --learning_rate $LR \ 73 | --logging_steps 100 \ 74 | --save_steps 0 \ 75 | --save_total_limit 3 \ 76 | --do_train \ 77 | --save_predictions 78 | done 79 | 80 | # serv 9212; gpu 1 81 | export CUDA_VISIBLE_DEVICES=1 82 | 83 | export EVAL_BATCH_SIZE=16 84 | export TRAIN_BATCH_SIZE=12 85 | export EPOCHS=8 86 | export CV_FOLD=1 87 | export LR=1e-5 88 | export RNN_NUM_LAYERS=2 89 | export RNN_HIDDEN_SIZE=100 90 | export RNN_DROPOUT=0.1 91 | export SPACY_MODEL=./output/acl_docrel/spacy/en_acl_fasttext_300d 92 | export MODEL_NAME=baseline-rnn__fasttext__custom 93 | 94 | for CV_FOLD in 1 2 3 4 95 | do 96 | echo $CV_FOLD 97 | python trainer_cli.py --cv_fold $CV_FOLD \ 98 | --output_dir $OUTPUT_DIR \ 99 | --model_name_or_path $MODEL_NAME \ 100 | --doc_id_col $DOC_ID_COL \ 101 | --doc_a_col $DOC_A_COL \ 102 | --doc_b_col $DOC_B_COL \ 103 | --nlp_dataset $NLP_DATASET \ 104 | --nlp_cache_dir $NLP_CACHE_DIR \ 105 | --cache_dir $CACHE_DIR \ 106 | --num_train_epochs $EPOCHS \ 107 | --seed $SEED \ 108 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 109 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 110 | --learning_rate $LR \ 111 | --logging_steps 100 \ 112 | --save_steps 0 \ 113 | --save_total_limit 3 \ 114 | --spacy_model $SPACY_MODEL \ 115 | --rnn_type lstm \ 116 | --rnn_num_layers $RNN_NUM_LAYERS \ 117 | --rnn_hidden_size $RNN_HIDDEN_SIZE \ 118 | --rnn_dropout $RNN_DROPOUT \ 119 | --do_train \ 120 | --save_predictions 121 | done 122 | 123 | 124 | 125 | ###### 126 | ###### 127 | ###### 128 | ###### 129 | 130 | 131 | # serv 9200; gpu 2 132 | export CUDA_VISIBLE_DEVICES=2 133 | export EVAL_BATCH_SIZE=16 134 | export TRAIN_BATCH_SIZE=8 135 | export MODEL_NAME="roberta-base" 136 | for CV_FOLD in 1 2 3 4 137 | do 138 | echo $CV_FOLD 139 | python trainer_cli.py --cv_fold $CV_FOLD \ 140 | --output_dir $OUTPUT_DIR \ 141 | --model_name_or_path $MODEL_NAME \ 142 | --doc_id_col $DOC_ID_COL \ 143 | --doc_a_col $DOC_A_COL \ 144 | --doc_b_col $DOC_B_COL \ 145 | --nlp_dataset $NLP_DATASET \ 146 | --nlp_cache_dir $NLP_CACHE_DIR \ 147 | --cache_dir $CACHE_DIR \ 148 | --num_train_epochs $EPOCHS \ 149 | --seed $SEED \ 150 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 151 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 152 | --learning_rate $LR \ 153 | --logging_steps 100 \ 154 | --save_steps 0 \ 155 | --save_total_limit 3 \ 156 | --do_train \ 157 | --save_predictions 158 | done 159 | 160 | # serv 9200; gpu 3 161 | export CUDA_VISIBLE_DEVICES=3 162 | export EVAL_BATCH_SIZE=12 163 | export TRAIN_BATCH_SIZE=6 164 | export MODEL_NAME="xlnet-base-cased" 165 | for CV_FOLD in 1 2 3 4 166 | do 167 | echo $CV_FOLD 168 | python trainer_cli.py --cv_fold $CV_FOLD \ 169 | --output_dir $OUTPUT_DIR \ 170 | --model_name_or_path $MODEL_NAME \ 171 | --doc_id_col $DOC_ID_COL \ 172 | --doc_a_col $DOC_A_COL \ 173 | --doc_b_col $DOC_B_COL \ 174 | --nlp_dataset $NLP_DATASET \ 175 | --nlp_cache_dir $NLP_CACHE_DIR \ 176 | --cache_dir $CACHE_DIR \ 177 | --num_train_epochs $EPOCHS \ 178 | --seed $SEED \ 179 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 180 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 181 | --learning_rate $LR \ 182 | --logging_steps 100 \ 183 | --save_steps 0 \ 184 | --save_total_limit 3 \ 185 | --do_train \ 186 | --save_predictions 187 | done 188 | 189 | 190 | # serv 9200; gpu 4 191 | export CUDA_VISIBLE_DEVICES=4 192 | export EVAL_BATCH_SIZE=12 193 | export TRAIN_BATCH_SIZE=8 194 | export MODEL_NAME="google/electra-base-discriminator" 195 | for CV_FOLD in 1 2 3 4 196 | 
do 197 | echo $MODEL_NAME 198 | echo $CV_FOLD 199 | python trainer_cli.py --cv_fold $CV_FOLD \ 200 | --output_dir $OUTPUT_DIR \ 201 | --model_name_or_path $MODEL_NAME \ 202 | --doc_id_col $DOC_ID_COL \ 203 | --doc_a_col $DOC_A_COL \ 204 | --doc_b_col $DOC_B_COL \ 205 | --nlp_dataset $NLP_DATASET \ 206 | --nlp_cache_dir $NLP_CACHE_DIR \ 207 | --cache_dir $CACHE_DIR \ 208 | --num_train_epochs $EPOCHS \ 209 | --seed $SEED \ 210 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 211 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 212 | --learning_rate $LR \ 213 | --logging_steps 100 \ 214 | --save_steps 0 \ 215 | --save_total_limit 3 \ 216 | --do_train \ 217 | --save_predictions 218 | done 219 | 220 | 221 | # serv 9200; gpu 5 222 | export CUDA_VISIBLE_DEVICES=5 223 | export EVAL_BATCH_SIZE=16 224 | export TRAIN_BATCH_SIZE=8 225 | export MODEL_NAME="deepset/covid_bert_base" 226 | for CV_FOLD in 1 2 3 4 227 | do 228 | echo $MODEL_NAME 229 | echo $CV_FOLD 230 | python trainer_cli.py --cv_fold $CV_FOLD \ 231 | --output_dir $OUTPUT_DIR \ 232 | --model_name_or_path $MODEL_NAME \ 233 | --doc_id_col $DOC_ID_COL \ 234 | --doc_a_col $DOC_A_COL \ 235 | --doc_b_col $DOC_B_COL \ 236 | --nlp_dataset $NLP_DATASET \ 237 | --nlp_cache_dir $NLP_CACHE_DIR \ 238 | --cache_dir $CACHE_DIR \ 239 | --num_train_epochs $EPOCHS \ 240 | --seed $SEED \ 241 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 242 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 243 | --learning_rate $LR \ 244 | --logging_steps 100 \ 245 | --save_steps 0 \ 246 | --save_total_limit 3 \ 247 | --do_train \ 248 | --save_predictions 249 | done 250 | 251 | 252 | ##### 253 | 254 | # serv 9200; gpu 3 255 | export CUDA_VISIBLE_DEVICES=2,4,5 256 | export EVAL_BATCH_SIZE=12 257 | export TRAIN_BATCH_SIZE=6 258 | export MODEL_NAME="xlnet-base-cased" 259 | for CV_FOLD in 4 260 | do 261 | echo $CV_FOLD 262 | python trainer_cli.py --cv_fold $CV_FOLD \ 263 | --output_dir $OUTPUT_DIR \ 264 | --model_name_or_path $MODEL_NAME \ 265 | --doc_id_col $DOC_ID_COL \ 266 | --doc_a_col $DOC_A_COL \ 267 | --doc_b_col $DOC_B_COL \ 268 | --nlp_dataset $NLP_DATASET \ 269 | --nlp_cache_dir $NLP_CACHE_DIR \ 270 | --cache_dir $CACHE_DIR \ 271 | --num_train_epochs $EPOCHS \ 272 | --seed $SEED \ 273 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 274 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 275 | --learning_rate $LR \ 276 | --logging_steps 100 \ 277 | --save_steps 0 \ 278 | --save_total_limit 3 \ 279 | --do_train \ 280 | --save_predictions 281 | done 282 | 283 | export PYTHONUNBUFFERED="" 284 | 285 | -------------------------------------------------------------------------------- /acl/__data_prep.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import bibtexparser 4 | from lxml import etree 5 | import lxml 6 | from fuzzywuzzy import fuzz 7 | from fuzzywuzzy import process 8 | import pandas as pd 9 | import os 10 | import pickle 11 | import time 12 | import json 13 | import re 14 | import numpy as np 15 | from tqdm import tqdm_notebook as tqdm 16 | from collections import defaultdict 17 | import requests 18 | from lxml.etree import LxmlError 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def load_acl_corpus(data_dir): 24 | title2acl_ids = defaultdict(list) 25 | acl_id2meta = {} 26 | 27 | year2titles = defaultdict(list) 28 | author_last2titles = defaultdict(list) 29 | 30 | parser = etree.XMLParser(recover=True) 31 | 32 | for d in os.listdir(os.path.join(data_dir, 'aclxml')): 33 | if 
os.path.isdir(os.path.join(data_dir, 'aclxml', d)): 34 | for vol in os.listdir(os.path.join(data_dir, 'aclxml', d)): 35 | if os.path.isdir(os.path.join(data_dir, 'aclxml', d, vol)): 36 | xml_fp = os.path.join(data_dir, 'aclxml', d, vol, vol + '.xml') 37 | # print(vol) 38 | 39 | tree = etree.parse(xml_fp, parser=parser) 40 | 41 | # Parse volume 42 | papers = tree.getroot().cssselect('paper') 43 | 44 | for paper in papers: 45 | title = next(iter(paper.xpath('./title/text()')), None) 46 | year = next(iter(paper.xpath('./year/text()')), None) 47 | authors_first = paper.xpath('./author/first/text()') 48 | authors_last = paper.xpath('./author/last/text()') 49 | 50 | if title is None or year is None: 51 | continue 52 | 53 | acl_id = vol + '-' + paper.get('id') 54 | 55 | acl_id2meta[acl_id] = dict( 56 | title=title, 57 | year=year, 58 | book_title=next(iter(paper.xpath('./booktitle/text()')), None), 59 | bibkey=next(iter(paper.xpath('./bibkey/text()')), None), 60 | authors_first=authors_first, 61 | authors_last=authors_last, 62 | 63 | ) 64 | title2acl_ids[title].append(acl_id) 65 | year2titles[year].append(title) 66 | 67 | for last in authors_last: 68 | author_last2titles[last].append(title) 69 | 70 | # Extracted titles: 14,760 71 | print(f'Extracted titles: {len(title2acl_ids):,}') 72 | 73 | return title2acl_ids, acl_id2meta, year2titles, author_last2titles 74 | 75 | 76 | def get_text_with_cssselect(ele, selector, default=None, ith=0): 77 | s = ele.cssselect(selector) 78 | 79 | if len(s) > ith: 80 | return s[ith].text 81 | else: 82 | return default 83 | 84 | 85 | def load_parscit_file(fp, include_contexts=False): 86 | # read from file path 87 | tree = etree.parse(fp) 88 | 89 | # sections 90 | algo_sect = tree.getroot().cssselect('algorithm[name="SectLabel"] > variant')[0] 91 | sects = [] 92 | sect = None 93 | 94 | for child in algo_sect.getchildren(): 95 | if child.tag == 'sectionHeader': 96 | sects.append({ 97 | 'title': child.text.strip(), 98 | 'generic': child.get('genericHeader'), 99 | 'text': '', 100 | }) 101 | 102 | elif child.tag == 'bodyText': 103 | # Create untitled section if none exist 104 | if len(sects) == 0: 105 | sects.append({ 106 | 'title': None, 107 | 'generic': None, 108 | 'text': '', 109 | }) 110 | 111 | # Append to last section 112 | sects[-1]['text'] += child.text.strip() 113 | 114 | # replace line breaks within sentence (could be improved) 115 | for i, sect in enumerate(sects): 116 | sects[i]['text'] = re.sub(r'([A-Za-z],;)([\r\n]+)([A-Za-z])', r'\1 \3', sect['text']) 117 | 118 | # Iterate over all valid citations 119 | cits = [] 120 | 121 | def get_text_with_cssselect(ele, selector, default=None, ith=0): 122 | s = ele.cssselect(selector) 123 | 124 | if len(s) > ith: 125 | return s[ith].text 126 | else: 127 | return default 128 | 129 | for cit_ele in tree.getroot().cssselect('algorithm[name="ParsCit"] > citationList > citation[valid="true"]'): 130 | try: 131 | 132 | title = get_text_with_cssselect(cit_ele, 'title') 133 | marker = get_text_with_cssselect(cit_ele, 'marker') 134 | date = get_text_with_cssselect(cit_ele, 'date') # str 135 | book_title = get_text_with_cssselect(cit_ele, 'booktitle') 136 | 137 | authors = [e.text for e in cit_ele.cssselect('authors > author')] 138 | 139 | if date and len(date) != 4: 140 | raise ValueError(f'Invalid date: {date}') 141 | cit = dict(title=title, authors=authors, marker=marker, date=date, book_title=book_title) 142 | 143 | if include_contexts: 144 | cit['contexts'] = cit_ele.cssselect('contexts > context') 145 | 146 | 
cits.append(cit) 147 | except IndexError as e: 148 | print(f'Cannot parse citation: {e}; {etree.tostring(cit_ele)[:100]}') 149 | 150 | # Extract all citation markers (for later cleaning from section text) 151 | markers = [] 152 | for cit_context in tree.getroot().cssselect( 153 | 'algorithm[name="ParsCit"] > citationList > citation > contexts > context'): 154 | if 'citStr' in cit_context.attrib: 155 | markers.append(cit_context.get('citStr')) 156 | 157 | return sects, cits, markers 158 | 159 | 160 | # Extract citation context 161 | # - find section in which the citation markers can be found 162 | # - find the corresponding ACL paper 163 | # - fuzzy title search is expensive, therefore, we check on year + authors first to decrease search space. 164 | def get_citation_context(cits, sects, title2acl_ids, year2titles, author_last2titles): 165 | cits_with_context = [] # (bib_idx, sect_context) 166 | 167 | for cit in cits: 168 | if cit['title'] is None or cit['book_title'] is None or cit['date'] is None: 169 | continue 170 | 171 | # Find section context 172 | sect_contexts = [] 173 | for context in cit['contexts']: 174 | for i, sect in enumerate(sects): # Try to find citation string in all sections 175 | if context.get('citStr') in sect['text']: 176 | # found! 177 | # print(sect['title']) 178 | # print(sect['generic']) 179 | sect_contexts.append((sect['generic'], sect['title'], context.get('citStr'))) 180 | 181 | # print(context.get('citStr')) 182 | # print(context.get('position')) 183 | # print(context.get('startWordPos')) 184 | 185 | if len(sect_contexts) == 0: 186 | continue 187 | 188 | # Filter for ACL proceedings 189 | # TODO could be improved 190 | if 'ACL' in cit['book_title'] or 'Linguistics' in cit['book_title']: 191 | year_candidates = set(year2titles[cit['date']]) # papers from the same year 192 | 193 | if len(year_candidates) > 0: 194 | # papers from authors with same name 195 | # note: all name parts are used, bc we do not know what the first or last name is. 196 | author_names = [name for author in cit['authors'] for name in author.split()] 197 | author_candidates = [] 198 | for name in author_names: 199 | if name in author_last2titles: 200 | author_candidates += author_last2titles[name] 201 | author_candidates = set(author_candidates) 202 | 203 | if len(author_candidates) > 0: 204 | # candidate must be in both sets 205 | candidates = year_candidates & author_candidates 206 | 207 | if len(candidates) > 0: 208 | match_title, score = process.extractOne(cit['title'], candidates) 209 | 210 | # Candidate must be above threshold 211 | if score > .95 and match_title in title2acl_ids: 212 | for acl_id in title2acl_ids[match_title]: 213 | # Citation found in bib 214 | for sc in sect_contexts: 215 | cits_with_context.append((acl_id, sc)) 216 | 217 | # bib_candidates = process.extract(cit['title'], candidate_titles, limit=1) 218 | # for c_title, score in bib_candidates: 219 | # for acl_id in title2acl_ids[c_title]: 220 | # # Citation found in bib 221 | # for sc in sect_contexts: 222 | # cits_with_context.append((acl_id, sc)) 223 | 224 | # TODO multi title matches? 
-> check for year 225 | 226 | # print(c_idx) 227 | # print(bib_database.entries[c_idx]['title']) 228 | # print(marker) 229 | # break 230 | return cits_with_context 231 | 232 | 233 | -------------------------------------------------------------------------------- /experiments/data_helpers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC 3 | 4 | import torch 5 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 6 | from torch.nn.utils.rnn import pad_sequence 7 | from torch.utils.data import DataLoader, TensorDataset 8 | from transformers import BertTokenizer 9 | 10 | from experiments.data_loaders import DefaultXYDataLoader 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class DataHelper(object): 16 | """ 17 | 18 | Helps to load experimental data as PyTorch data loaders 19 | 20 | """ 21 | train_test_split = 0.8 22 | train_batch_size = None 23 | test_batch_size = None 24 | random_seed = None 25 | tqdm_cls = None 26 | 27 | def __init__(self, **kwargs): 28 | for k, v in kwargs.items(): 29 | if hasattr(self, k): 30 | setattr(self, k, v) 31 | else: 32 | raise ValueError(f'Unknown attribute: {k}') 33 | 34 | def get_data_loaders(self): 35 | raise NotImplementedError() 36 | 37 | @staticmethod 38 | def get_item_lengths(data_loader: DataLoader, masks_idx): 39 | """ 40 | Extract the length of data items in data loader (with masks) 41 | 42 | Inspect output with Pandas like this: `pd.Series(lengths).describe()` 43 | 44 | :param data_loader: 45 | :param masks_idx: Index of mask data in batch 46 | :return: List of length 47 | """ 48 | lengths = [] 49 | 50 | for batch in data_loader: 51 | for mask in batch[masks_idx]: 52 | lengths.append(int(mask.sum())) 53 | 54 | return lengths 55 | 56 | def get_train_test_split(self, df): 57 | split_at = int(len(df) * self.train_test_split) 58 | 59 | split_df = df.sample(frac=1., random_state=self.random_seed).reset_index(drop=True) 60 | 61 | train_df = split_df[:split_at] 62 | test_df = split_df[split_at:] 63 | 64 | logger.info(f'Train: {len(train_df)}; Test: {len(test_df)} (ratio: {self.train_test_split})') 65 | 66 | return train_df, test_df 67 | 68 | def get_data_sampler(self, sampler=None, dataset=None, sampler_cls=None): 69 | """ 70 | 71 | Handle different ways to sample data from data loader (Random, sequential, weighted, ..) 72 | 73 | :param sampler: 74 | :param dataset: 75 | :param sampler_cls: 76 | :return: 77 | """ 78 | if sampler is not None: 79 | return sampler # WeightedRandomSampler 80 | elif sampler_cls is not None: 81 | return sampler_cls(dataset) # Sequential or RandomSampler 82 | else: 83 | raise ValueError('Either `sampler` or `sampler_cls` must be set!') 84 | 85 | 86 | class BERTDataHelper(DataHelper): 87 | """ 88 | For BERT/Transformer specific input (tokenizer, ...) 89 | """ 90 | doc_a_col = None # type: str 91 | doc_b_col = None # type: str 92 | 93 | tokenizer = None 94 | bert_model_path = None 95 | bert_tokenizer_cls = BertTokenizer 96 | bert_tokenizer_params = { 97 | 'do_lower_case': True, 98 | } 99 | 100 | negative_sampling_ratio = 1. 101 | max_seq_length = 512 102 | 103 | def get_tokenizer(self): 104 | if self.tokenizer is None: 105 | self.tokenizer = self.bert_tokenizer_cls.from_pretrained(self.bert_model_path, **self.bert_tokenizer_params) 106 | 107 | return self.tokenizer 108 | 109 | def get_joint_token_ids_and_types(self, pairs, token_ids_map): 110 | """ 111 | Converts document pairs into a joint set of tokens for JointBERT models. 
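Both documents share the max_seq_length budget: if both sides are too long, each is cut to (max_seq_length - 3) / 2 tokens; if only one side is too long, it may additionally use the budget left unused by the shorter side (see the length balancing below).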
112 | 113 | Token format: [CLS] doc_a [SEP] doc_b [SEP] 114 | Token type ids: 0 0 0 1 1 115 | 116 | :param pairs: list of tuples with A + B (title or ids depending on keys of token_ids_map) 117 | :param token_ids_map: 118 | :return: joint_token_ids (tensor), masks (tensor), token_type_ids (tensor) 119 | """ 120 | 121 | reserved_tokens_count = 3 122 | max_side_length = int((self.max_seq_length - reserved_tokens_count) / 2) 123 | 124 | joint_ids = [] 125 | token_types = [] 126 | 127 | logger.info(f'Joining token pairs with max_side_length={max_side_length}') 128 | 129 | if self.tqdm_cls: 130 | pairs = self.tqdm_cls(pairs, total=len(pairs), desc='Joining documents') 131 | 132 | for a, b in pairs: 133 | token_ids_a = token_ids_map[a] 134 | token_ids_b = token_ids_map[b] 135 | 136 | len_a = len(token_ids_a) 137 | len_b = len(token_ids_b) 138 | 139 | if len_a > max_side_length and len_b > max_side_length: # both a too long 140 | token_ids_a = token_ids_a[:max_side_length] 141 | token_ids_b = token_ids_b[:max_side_length] 142 | elif len_a > max_side_length and len_b <= max_side_length: # a is long, b is short 143 | token_ids_a = token_ids_a[:max_side_length + max_side_length - len_b] 144 | token_ids_b = token_ids_b 145 | elif len_a <= max_side_length and len_b > max_side_length: # a is short, b is long 146 | token_ids_a = token_ids_a 147 | token_ids_b = token_ids_b[:max_side_length + max_side_length - len_a] 148 | else: 149 | token_ids_a = token_ids_a 150 | token_ids_b = token_ids_b 151 | 152 | # joint = [self.get_tokenizer().cls_token_id] + token_ids_a + \ 153 | # [self.get_tokenizer().sep_token_id] + token_ids_b + [self.get_tokenizer().sep_token_id] 154 | joint = self.get_tokenizer().build_inputs_with_special_tokens(token_ids_a, token_ids_b) 155 | 156 | joint_ids.append(torch.tensor(joint)) 157 | 158 | # [CLS] ids, .. [SEP] ... 
[SEP] 159 | # token_types.append(torch.tensor([0] * (2 + len(token_ids_a)) + [1] * (1 + len(token_ids_b)))) 160 | token_types.append(torch.tensor(self.get_tokenizer().create_token_type_ids_from_sequences(token_ids_a, token_ids_b))) 161 | 162 | joint_ids = pad_sequence(joint_ids, batch_first=True, padding_value=self.get_tokenizer().pad_token_id) 163 | #joint_ids.size() 164 | 165 | masks = torch.tensor([[float(i > 0) for i in ii] for ii in joint_ids]) 166 | 167 | token_types = pad_sequence(token_types, batch_first=True, padding_value=1) 168 | 169 | return joint_ids, masks, token_types 170 | 171 | 172 | def to_siamese_data_loader(self, df, token_ids_map, batch_size, sampler_cls=None, sampler=None): 173 | ys = self.get_ys_as_tensor(df) 174 | 175 | doc_ids = df[[self.doc_a_col, self.doc_b_col]].values 176 | 177 | if self.tqdm_cls: 178 | doc_ids = self.tqdm_cls(doc_ids, total=len(doc_ids), desc='Building tensor data set') 179 | 180 | #self.get_tokenizer() 181 | token_ids_a = [torch.tensor([self.get_tokenizer().cls_token_id] + token_ids_map[a][:self.max_seq_length - 2] + [ 182 | self.get_tokenizer().sep_token_id]) for a, b in doc_ids] 183 | token_ids_b = [torch.tensor([self.get_tokenizer().cls_token_id] + token_ids_map[b][:self.max_seq_length - 2] + [ 184 | self.get_tokenizer().sep_token_id]) for a, b in doc_ids] 185 | 186 | # token_ids_a = [torch.tensor([self.get_tokenizer().cls_token_id] + token_ids_map[a][:self.max_seq_length - 2] + [self.get_tokenizer().sep_token_id]) for a, b in doc_ids] 187 | # token_ids_b = [torch.tensor([self.get_tokenizer().cls_token_id] + token_ids_map[b][:self.max_seq_length - 2] + [self.get_tokenizer().sep_token_id]) for a, b in doc_ids] 188 | 189 | token_ids_a = pad_sequence(token_ids_a, batch_first=True, padding_value=self.get_tokenizer().pad_token_id) 190 | token_ids_b = pad_sequence(token_ids_b, batch_first=True, padding_value=self.get_tokenizer().pad_token_id) 191 | 192 | masks_a = torch.tensor([[float(i > 0) for i in ii] for ii in token_ids_a]) 193 | masks_b = torch.tensor([[float(i > 0) for i in ii] for ii in token_ids_b]) 194 | 195 | # build dataset 196 | dataset = TensorDataset( 197 | token_ids_a, 198 | masks_a, 199 | token_ids_b, 200 | masks_b, 201 | ys) 202 | 203 | return DefaultXYDataLoader(dataset, sampler=self.get_data_sampler(sampler, dataset, sampler_cls), batch_size=batch_size) 204 | 205 | def to_joint_data_loader(self, df, token_ids_map, batch_size, sampler_cls=None, sampler=None): 206 | ys = self.get_ys_as_tensor(df) 207 | 208 | doc_ids = df[[self.doc_a_col, self.doc_b_col]].values 209 | joint_ids, masks, token_types = self.get_joint_token_ids_and_types(doc_ids, token_ids_map) 210 | 211 | # build dataset 212 | dataset = TensorDataset( 213 | joint_ids, 214 | masks, 215 | token_types, 216 | ys) 217 | 218 | return DefaultXYDataLoader(dataset, sampler=self.get_data_sampler(sampler, dataset, sampler_cls), batch_size=batch_size) 219 | 220 | 221 | class DocRelDataHelper(object): 222 | labels = ['employer'] # 'employer' # 'capital' # 'country_of_citizenship' #'educated_at' # 'opposite_of' 223 | label_col = None 224 | none_label = 'none' 225 | label_encoder = None 226 | labels_integer_encoded = None 227 | onehot_encoder = None 228 | labels_onehot_encoded = None 229 | 230 | def get_labels_count(self): 231 | """ 232 | If "none label" is set, count is increased by one. 
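For example, with labels = ['introduction', 'related work'] and none_label = 'none', this returns 3.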
233 | 234 | :return: 235 | """ 236 | if self.none_label: 237 | return len(self.labels) + 1 238 | else: 239 | return len(self.labels) 240 | 241 | def set_label_encoder(self, df): 242 | self.label_encoder = LabelEncoder() 243 | # self.labels_integer_encoded = self.label_encoder.fit_transform(list(df[self.label_col].values)) 244 | label_values = list(df[self.label_col].values) 245 | 246 | if self.none_label: 247 | label_values.append(self.none_label) 248 | 249 | self.labels_integer_encoded = self.label_encoder.fit_transform(label_values) 250 | 251 | self.onehot_encoder = OneHotEncoder(sparse=False, categories='auto') 252 | self.labels_onehot_encoded = self.onehot_encoder.fit_transform( 253 | self.labels_integer_encoded.reshape(len(self.labels_integer_encoded), 1)) 254 | 255 | def is_binary_classification(self): 256 | return len(self.labels) == 1 257 | 258 | def get_ys_as_tensor(self, df): 259 | # convert categorical labels into numbers (one hot vectors) 260 | if self.is_binary_classification(): 261 | return torch.tensor(self.label_encoder.transform(df[self.label_col].values).reshape(len(df), 1)).double() 262 | else: 263 | onehot_encoded = self.onehot_encoder.transform( 264 | self.label_encoder.transform(df[self.label_col].values).reshape(len(df), 1) 265 | ) 266 | return torch.tensor(onehot_encoded) -------------------------------------------------------------------------------- /acl/trainer_utils.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import os 3 | from dataclasses import dataclass 4 | from typing import List, Dict, Callable, Optional, Any 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import spacy 9 | import torch 10 | from sklearn.metrics import classification_report 11 | from transformers import DataCollator, PreTrainedTokenizer 12 | from transformers import EvalPrediction 13 | 14 | from experiments.utils import flatten 15 | 16 | 17 | def get_label_classes_from_nlp_dataset(cls_path: str, attr_name='LABEL_CLASSES') -> List[str]: 18 | if not cls_path.endswith('.py'): 19 | raise ValueError('data path must point to .py-file') 20 | 21 | if not cls_path.startswith('./'): 22 | raise ValueError('Must be relative path') 23 | 24 | # Make absolute path from app root 25 | cls_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), cls_path[2:]) 26 | 27 | # Get file name, remove .py 28 | cls_name = cls_path[:-3].split('/')[-1] 29 | 30 | spec = importlib.util.spec_from_file_location(cls_name, cls_path) 31 | dataset_module = importlib.util.module_from_spec(spec) 32 | 33 | spec.loader.exec_module(dataset_module) 34 | 35 | if hasattr(dataset_module, attr_name): 36 | return getattr(dataset_module, attr_name) 37 | else: 38 | raise ValueError(f'dataset module does not have attribute: {attr_name}') 39 | 40 | 41 | def get_vectors_from_spacy_model(spacy_nlp): 42 | unk_token_vector = np.zeros((1, spacy_nlp.vocab.vectors.shape[1])) 43 | sep_token_vector = np.ones((1, spacy_nlp.vocab.vectors.shape[1])) 44 | return np.concatenate((spacy_nlp.vocab.vectors.data, 45 | unk_token_vector, 46 | sep_token_vector), axis=0) 47 | 48 | 49 | class DocRelTrainerHelper(object): 50 | def __init__(self, 51 | id2doc: Dict[str, Dict], 52 | label_classes: List[str], 53 | doc_a_col: str, 54 | doc_b_col: str, 55 | label_col: str, 56 | text_from_doc_func: Callable, 57 | max_length=512, 58 | spacy_nlp: Optional[Any] = None, 59 | transformers_tokenizer: Optional[PreTrainedTokenizer] = None, 60 | classification_threshold: float = 0.): 61 | self.id2doc 
= id2doc 62 | self.transformers_tokenizer = transformers_tokenizer 63 | self.spacy_nlp = spacy_nlp 64 | self.label_classes = label_classes 65 | self.doc_a_col = doc_a_col 66 | self.doc_b_col = doc_b_col 67 | self.label_col = label_col 68 | self.max_length = max_length 69 | self.classification_threshold = classification_threshold 70 | self.text_from_doc_func = text_from_doc_func 71 | 72 | if self.transformers_tokenizer and (self.transformers_tokenizer.max_len is None or self.transformers_tokenizer.max_len < 1): 73 | raise ValueError('Tokenizer max_length is not set!') 74 | 75 | if self.spacy_nlp: 76 | # Extend vocabulary with UNK + SEP token 77 | self.spacy_unk_token_id = len(self.spacy_nlp.vocab.vectors) + 0 78 | self.spacy_sep_token_id = len(self.spacy_nlp.vocab.vectors) + 1 79 | else: 80 | self.spacy_unk_token_id = self.spacy_sep_token_id = None 81 | 82 | def convert_to_features(self, batch): 83 | if self.transformers_tokenizer: 84 | return self.convert_to_features_transformers(batch) 85 | elif self.spacy_nlp: 86 | return self.convert_to_features_spacy(batch) 87 | else: 88 | raise ValueError('Neither Transformers tokenizer nor Spacy is set!') 89 | 90 | def convert_to_features_spacy(self, batch): 91 | snlp = self.spacy_nlp 92 | label_encodings = [] 93 | input_ids = [] 94 | attention_masks = [] 95 | 96 | for from_id, to_id, label in zip(batch[self.doc_a_col], batch[self.doc_b_col], batch[self.label_col]): 97 | if from_id not in self.id2doc: 98 | raise ValueError(f'Document not found. from_id={from_id}; label={label}') 99 | elif to_id not in self.id2doc: 100 | raise ValueError(f'Document not found. to_id={to_id}; label={label}') 101 | 102 | from_doc = self.id2doc[from_id] 103 | from_tokens = snlp(self.text_from_doc_func(from_doc))[:np.floor(self.max_length / 2)] 104 | from_token_ids = [snlp.vocab.vectors.key2row[t.norm] if t.has_vector and t.norm in snlp.vocab.vectors.key2row else self.spacy_unk_token_id for t in from_tokens] 105 | 106 | to_doc = self.id2doc[to_id] 107 | to_tokens = snlp(self.text_from_doc_func(to_doc))[:np.floor(self.max_length / 2)] 108 | to_token_ids = [snlp.vocab.vectors.key2row[t.norm] if t.has_vector and t.norm in snlp.vocab.vectors.key2row else self.spacy_unk_token_id for t in 109 | to_tokens] 110 | 111 | # Join with SEP token 112 | token_ids = from_token_ids + [self.spacy_sep_token_id] + to_token_ids 113 | token_ids = token_ids[:self.max_length] 114 | 115 | attention_mask = np.zeros(self.max_length) 116 | attention_mask[list(range(len(token_ids)))] = 1. 117 | 118 | # Zero-padding 119 | if len(token_ids) < self.max_length: 120 | token_ids += [0] * (self.max_length - len(token_ids)) 121 | 122 | one_hot_encoded_label = np.zeros(len(self.label_classes)) 123 | one_hot_encoded_label[[self.label_classes.index(l) for l in label]] = 1. 124 | 125 | # To list 126 | attention_masks.append(attention_mask.tolist()) 127 | input_ids.append(token_ids) 128 | label_encodings.append(one_hot_encoded_label) 129 | 130 | encodings = { 131 | 'input_ids': input_ids, 132 | 'attention_mask': attention_masks, 133 | 'token_type_ids': [[0] * self.max_length] * len(input_ids), 134 | 'labels': label_encodings, 135 | } 136 | 137 | return encodings 138 | 139 | def convert_to_features_transformers(self, batch): 140 | text_pairs = [] 141 | label_encodings = [] 142 | 143 | for from_id, to_id, label in zip(batch[self.doc_a_col], batch[self.doc_b_col], batch[self.label_col]): 144 | if from_id not in self.id2doc: 145 | raise ValueError(f'Document not found. 
from_id={from_id}; label={label}') 146 | elif to_id not in self.id2doc: 147 | raise ValueError(f'Document not found. to_id={to_id}; label={label}') 148 | else: 149 | from_doc = self.id2doc[from_id] 150 | to_doc = self.id2doc[to_id] 151 | 152 | text_pairs.append(( 153 | self.text_from_doc_func(from_doc), self.text_from_doc_func(to_doc) 154 | )) 155 | 156 | one_hot_encoded_label = np.zeros(len(self.label_classes)) 157 | one_hot_encoded_label[[self.label_classes.index(l) for l in label]] = 1. 158 | 159 | label_encodings.append(one_hot_encoded_label) 160 | 161 | input_encodings = self.transformers_tokenizer.batch_encode_plus( 162 | text_pairs, 163 | pad_to_max_length=True, 164 | truncation_strategy='longest_first', 165 | return_token_type_ids=True, 166 | return_attention_masks=True, 167 | max_length=self.max_length 168 | ) 169 | 170 | # RoBERTa does not make use of token type ids, therefore a list of zeros is returned. 171 | encodings = { 172 | 'input_ids': input_encodings['input_ids'], 173 | 'attention_mask': input_encodings['attention_mask'], 174 | 'token_type_ids': input_encodings['token_type_ids'], 175 | 'labels': label_encodings, 176 | } 177 | 178 | # if 'token_type_ids' in input_encodings: 179 | # input_encodings['token_type_ids'] = input_encodings['token_type_ids'] 180 | 181 | return encodings 182 | 183 | def compute_metrics(self, p: EvalPrediction) -> Dict: 184 | predicted_labels = np.where(p.predictions > self.classification_threshold, 1., 0.) 185 | 186 | return flatten(classification_report( 187 | y_true=p.label_ids, 188 | y_pred=predicted_labels, 189 | target_names=self.label_classes, 190 | zero_division=0, 191 | output_dict=True)) 192 | 193 | def get_df_from_predictions(self, relations_dataset, docs_dataset, predictions, exclude_columns: List=None): 194 | if exclude_columns is None: 195 | exclude_columns = [] 196 | 197 | # To dataframe with IDs ... 198 | true_dict = {'true_' + label: predictions.label_ids[:, idx] for idx, label in enumerate(self.label_classes)} 199 | predictions_dict = {'predicted_' + label: predictions.predictions[:, idx] for idx, label in 200 | enumerate(self.label_classes)} 201 | predictions_label_lists = [ 202 | [label for idx, label in enumerate(self.label_classes) if item[idx] > self.classification_threshold] for item in 203 | predictions.predictions] 204 | 205 | # Document meta data 206 | from_dict = { 207 | 'from_' + col: [self.id2doc[s2_id][col] if s2_id in self.id2doc else None for s2_id in relations_dataset[self.doc_a_col]] 208 | for col in docs_dataset.column_names if col not in exclude_columns} 209 | to_dict = {'to_' + col: [self.id2doc[s2_id][col] if s2_id in self.id2doc else None for s2_id in relations_dataset[self.doc_b_col]] 210 | for col in docs_dataset.column_names if col not in exclude_columns} 211 | 212 | df_dict = {} 213 | df_dict.update(from_dict) 214 | df_dict.update(to_dict) 215 | 216 | df_dict.update({ 217 | # Labels 218 | 'true': [','.join(label_list) for label_list in relations_dataset[self.label_col]], 219 | 'predicted': [','.join(label_list) for label_list in predictions_label_lists], 220 | }) 221 | df_dict.update(true_dict) 222 | df_dict.update(predictions_dict) 223 | 224 | return pd.DataFrame.from_dict(df_dict) 225 | 226 | 227 | @dataclass 228 | class DocRelDataCollator(DataCollator): 229 | def collate_batch(self, batch: List) -> Dict[str, torch.Tensor]: 230 | """ 231 | Take a list of samples from a Dataset and collate them into a batch. 
232 | Returns: 233 | A dictionary of tensors 234 | """ 235 | 236 | input_ids = torch.stack([example['input_ids'] for example in batch]) 237 | token_type_ids = torch.stack([example['token_type_ids'] for example in batch]) 238 | attention_mask = torch.stack([example['attention_mask'] for example in batch]) 239 | labels = torch.stack([example['labels'].squeeze() for example in batch]) 240 | 241 | model_kwargs = { 242 | 'input_ids': input_ids, 243 | 'attention_mask': attention_mask, 244 | 'token_type_ids': token_type_ids, 245 | 'labels': labels, 246 | } 247 | 248 | return model_kwargs 249 | 250 | 251 | def get_non_empty_text_from_doc(doc) -> str: 252 | """ 253 | Build document text from title + abstract 254 | 255 | :param doc: S2 paper 256 | :return: Document text 257 | """ 258 | 259 | text = '' 260 | 261 | if 'title' in doc: 262 | text += doc['title'] 263 | 264 | if doc['abstract']: 265 | text += '\n' + doc['abstract'] 266 | 267 | if len(text) == 0: 268 | # Ensure text is at least one char to make tokenizers work. 269 | text = ' ' 270 | 271 | return text 272 | -------------------------------------------------------------------------------- /models/auto_modeling.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | #from longformer.longformer import LongformerConfig 4 | from transformers import PretrainedConfig, AutoConfig, \ 5 | RobertaConfig, BertConfig, XLNetConfig, BartConfig, ElectraConfig 6 | 7 | from models.bart import BartForMultiLabelSequenceClassification 8 | from models.bert import BertForMultiLabelSequenceClassification 9 | from models.electra import ElectraForMultiLabelSequenceClassification 10 | from models.longformer import LongformerForMultiLabelSequenceClassification 11 | from models.roberta import RobertaForMultiLabelSequenceClassification 12 | from models.xlnet import XLNetForMultiLabelSequenceClassification 13 | 14 | MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( 15 | [ 16 | # (DistilBertConfig, DistilBertForSequenceClassification), 17 | # (AlbertConfig, AlbertForSequenceClassification), 18 | # (CamembertConfig, CamembertForSequenceClassification), 19 | # (XLMRobertaConfig, XLMRobertaForSequenceClassification), 20 | # (BartConfig, BartForSequenceClassification), 21 | (RobertaConfig, RobertaForMultiLabelSequenceClassification), 22 | (BertConfig, BertForMultiLabelSequenceClassification), 23 | (XLNetConfig, XLNetForMultiLabelSequenceClassification), 24 | #(LongformerConfig, LongformerForMultiLabelSequenceClassification), 25 | (BartConfig, BartForMultiLabelSequenceClassification), 26 | # (FlaubertConfig, FlaubertForSequenceClassification), 27 | # (XLMConfig, XLMForSequenceClassification), 28 | (ElectraConfig, ElectraForMultiLabelSequenceClassification), 29 | ] 30 | ) 31 | 32 | 33 | class AutoModelForMultiLabelSequenceClassification: 34 | r""" 35 | :class:`~transformers.AutoModelForSequenceClassification` is a generic model class 36 | that will be instantiated as one of the sequence classification model classes of the library 37 | when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` 38 | class method. 39 | 40 | This class cannot be instantiated using `__init__()` (throws an error). 
41 | """ 42 | 43 | def __init__(self): 44 | raise EnvironmentError( 45 | "AutoModelForMultiLabelSequenceClassification is designed to be instantiated " 46 | "using the `AutoModelForMultiLabelSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " 47 | "`AutoModelForMultiLabelSequenceClassification.from_config(config)` methods." 48 | ) 49 | 50 | @classmethod 51 | def from_config(cls, config): 52 | r""" Instantiates one of the base model classes of the library 53 | from a configuration. 54 | 55 | Args: 56 | config (:class:`~transformers.PretrainedConfig`): 57 | The model class to instantiate is selected based on the configuration class: 58 | 59 | - isInstance of `distilbert` configuration class: :class:`~transformers.DistilBertForSequenceClassification` (DistilBERT model) 60 | - isInstance of `albert` configuration class: :class:`~transformers.AlbertForSequenceClassification` (ALBERT model) 61 | - isInstance of `camembert` configuration class: :class:`~transformers.CamembertForSequenceClassification` (CamemBERT model) 62 | - isInstance of `xlm roberta` configuration class: :class:`~transformers.XLMRobertaForSequenceClassification` (XLM-RoBERTa model) 63 | - isInstance of `roberta` configuration class: :class:`~transformers.RobertaForSequenceClassification` (RoBERTa model) 64 | - isInstance of `bert` configuration class: :class:`~transformers.BertForSequenceClassification` (Bert model) 65 | - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetForSequenceClassification` (XLNet model) 66 | - isInstance of `xlm` configuration class: :class:`~transformers.XLMForSequenceClassification` (XLM model) 67 | - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model) 68 | 69 | 70 | Examples:: 71 | 72 | config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. 73 | model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` 74 | """ 75 | for config_class, model_class in MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING.items(): 76 | if isinstance(config, config_class): 77 | return model_class(config) 78 | raise ValueError( 79 | "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" 80 | "Model type should be one of {}.".format( 81 | config.__class__, 82 | cls.__name__, 83 | ", ".join(c.__name__ for c in MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING.keys()), 84 | ) 85 | ) 86 | 87 | @classmethod 88 | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): 89 | r""" Instantiates one of the sequence classification model classes of the library 90 | from a pre-trained model configuration. 91 | 92 | The `from_pretrained()` method takes care of returning the correct model class instance 93 | based on the `model_type` property of the config object, or when it's missing, 94 | falling back to using pattern matching on the `pretrained_model_name_or_path` string. 
95 | 96 | The model class to instantiate is selected as the first pattern matching 97 | in the `pretrained_model_name_or_path` string (in the following order): 98 | - contains `distilbert`: :class:`~transformers.DistilBertForSequenceClassification` (DistilBERT model) 99 | - contains `albert`: :class:`~transformers.AlbertForSequenceClassification` (ALBERT model) 100 | - contains `camembert`: :class:`~transformers.CamembertForSequenceClassification` (CamemBERT model) 101 | - contains `xlm-roberta`: :class:`~transformers.XLMRobertaForSequenceClassification` (XLM-RoBERTa model) 102 | - contains `roberta`: :class:`~transformers.RobertaForSequenceClassification` (RoBERTa model) 103 | - contains `bert`: :class:`~transformers.BertForSequenceClassification` (Bert model) 104 | - contains `xlnet`: :class:`~transformers.XLNetForSequenceClassification` (XLNet model) 105 | - contains `flaubert`: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model) 106 | 107 | The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). 108 | To train the model, you should first set it back in training mode with `model.train()` 109 | 110 | Args: 111 | pretrained_model_name_or_path: either: 112 | 113 | - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 114 | - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. 115 | - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. 116 | - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint into a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. 117 | 118 | model_args: (`optional`) Sequence of positional arguments: 119 | All remaining positional arguments will be passed to the underlying model's ``__init__`` method 120 | 121 | config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: 122 | Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: 123 | 124 | - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or 125 | - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory. 126 | - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. 127 | 128 | state_dict: (`optional`) dict: 129 | an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. 130 | This option can be used if you want to create a model from a pretrained configuration but load your own weights. 131 | In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
132 | 133 | cache_dir: (`optional`) string: 134 | Path to a directory in which a downloaded pre-trained model 135 | configuration should be cached if the standard cache should not be used. 136 | 137 | force_download: (`optional`) boolean, default False: 138 | Force to (re-)download the model weights and configuration files and override the cached versions if they exist. 139 | 140 | resume_download: (`optional`) boolean, default False: 141 | Do not delete an incompletely received file. Attempt to resume the download if such a file exists. 142 | 143 | proxies: (`optional`) dict, default None: 144 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 145 | The proxies are used on each request. 146 | 147 | output_loading_info: (`optional`) boolean: 148 | Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. 149 | 150 | kwargs: (`optional`) Remaining dictionary of keyword arguments: 151 | These arguments will be passed to the configuration and the model. 152 | 153 | Examples:: 154 | 155 | model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. 156 | model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` 157 | assert model.config.output_attentions == True 158 | # Loading from a TF checkpoint file instead of a PyTorch model (slower) 159 | config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') 160 | model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) 161 | 162 | """ 163 | config = kwargs.pop("config", None) 164 | if not isinstance(config, PretrainedConfig): 165 | config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 166 | 167 | for config_class, model_class in MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING.items(): 168 | if isinstance(config, config_class): 169 | return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) 170 | raise ValueError( 171 | "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" 172 | "Model type should be one of {}.".format( 173 | config.__class__, 174 | cls.__name__, 175 | ", ".join(c.__name__ for c in MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING.keys()), 176 | ) 177 | ) 178 | -------------------------------------------------------------------------------- /word_vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": {} 7 | }, 8 | "source": "# Word vectors (FastText) for Baseline\n\n#### Create Spacy model from word vectors\n\n```bash\npython -m spacy init-model en output/cord19_docrel/spacy/en_cord19_fasttext_300d --vectors-loc output/cord19_docrel/cord19.fasttext.w2v.txt\npython -m spacy init-model en output/acl_docrel/spacy/en_acl_fasttext_300d --vectors-loc output/acl_docrel/acl.fasttext.w2v.txt\n```\n" 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "pycharm": {} 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "wandb: WARNING W\u0026B installed but not logged in.
Run `wandb login` or set the WANDB_API_KEY env variable.\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import gensim\n", 27 | "import json\n", 28 | "import os\n", 29 | "import requests\n", 30 | "import pickle\n", 31 | "import pandas as pd\n", 32 | "import logging\n", 33 | "from pathlib import Path\n", 34 | "from tqdm import tqdm_notebook as tqdm\n", 35 | "from smart_open import open\n", 36 | "from nlp import load_dataset\n", 37 | "import nlp\n", 38 | "import acl.utils\n", 39 | "from trainer_cli import ExperimentArguments" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "pycharm": {} 46 | }, 47 | "source": [ 48 | "## CORD19" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 22, 54 | "metadata": { 55 | "pycharm": {} 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "data_dir \u003d Path(\u0027./output/cord19_docrel\u0027)\n", 60 | "\n", 61 | "experiment_args \u003d ExperimentArguments(\n", 62 | " nlp_dataset\u003d\u0027./datasets/cord19_docrel/cord19_docrel.py\u0027,\n", 63 | " nlp_cache_dir\u003d\u0027./data/nlp_cache\u0027,\n", 64 | " doc_id_col\u003d\u0027doi\u0027,\n", 65 | " doc_a_col\u003d\u0027from_doi\u0027,\n", 66 | " doc_b_col\u003d\u0027to_doi\u0027,\n", 67 | " cv_fold\u003d1,\n", 68 | ")\n", 69 | "\n", 70 | "docs_ds \u003d load_dataset(experiment_args.nlp_dataset,\n", 71 | " name\u003d\u0027docs\u0027,\n", 72 | " cache_dir\u003dexperiment_args.nlp_cache_dir,\n", 73 | " split\u003dnlp.Split(\u0027docs\u0027))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 23, 79 | "metadata": { 80 | "pycharm": {} 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Total tokens: 16,181,414\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# Extract tokens from each document and create token file.\n", 93 | "tokens_count \u003d 0\n", 94 | "with open(data_dir / \u0027tokens.txt\u0027, \u0027w\u0027) as f:\n", 95 | " for idx, doc in docs_ds.data.to_pandas().iterrows():\n", 96 | " text \u003d acl.utils.get_text_from_doc(doc) \n", 97 | " for token in gensim.utils.simple_preprocess(text, min_len\u003d2, max_len\u003d15):\n", 98 | " f.write(token + \u0027 \u0027)\n", 99 | " tokens_count +\u003d 1\n", 100 | " f.write(\u0027\\n\u0027)\n", 101 | "print(f\u0027Total tokens: {tokens_count:,}\u0027)\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 26, 107 | "metadata": { 108 | "pycharm": {} 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "import fasttext\n", 113 | "\n", 114 | "model \u003d fasttext.train_unsupervised(str(data_dir / \u0027tokens.txt\u0027), \n", 115 | " model\u003d\u0027skipgram\u0027, \n", 116 | " lr\u003d0.05, # learning rate [0.05]\n", 117 | " dim\u003d300, # size of word vectors [100]\n", 118 | " ws\u003d5, # size of the context window [5]\n", 119 | " epoch\u003d5, # number of epochs [5]\n", 120 | " thread\u003d4, # number of threads [number of cpus]\n", 121 | " )" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 27, 127 | "metadata": { 128 | "pycharm": {} 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "model.save_model(str(data_dir / \u0027cord19.fasttext.bin\u0027))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 28, 138 | "metadata": { 139 | "pycharm": {} 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from gensim.models.wrappers import FastText\n", 144 | "\n", 145 | "ft_model \u003d FastText.load_fasttext_format(str(data_dir / 
\u0027cord19.fasttext.bin\u0027))\n", 146 | "ft_model.wv.save_word2vec_format(data_dir / \u0027cord19.fasttext.w2v.txt\u0027)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "pycharm": {} 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "# Unset\n", 158 | "del ft_model\n", 159 | "del model\n", 160 | "del docs_ds\n", 161 | "del experiment_args\n", 162 | "del data_dir" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "pycharm": {} 169 | }, 170 | "source": [ 171 | "## ACL" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 2, 177 | "metadata": { 178 | "pycharm": {} 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "Downloading and preparing dataset acl_docrel/docs (download: Unknown size, generated: Unknown size, total: Unknown size) to ./data/nlp_cache/acl_docrel/docs/0.1.0...\n" 186 | ] 187 | }, 188 | { 189 | "data": { 190 | "application/vnd.jupyter.widget-view+json": { 191 | "model_id": "5212702e85614bef8a3c2add3e36093e", 192 | "version_major": 2, 193 | "version_minor": 0 194 | }, 195 | "text/plain": [ 196 | "HBox(children\u003d(IntProgress(value\u003d0, description\u003d\u0027Downloading\u0027, max\u003d312525939, style\u003dProgressStyle(description_…" 197 | ] 198 | }, 199 | "metadata": {}, 200 | "output_type": "display_data" 201 | }, 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "\n" 207 | ] 208 | }, 209 | { 210 | "data": { 211 | "application/vnd.jupyter.widget-view+json": { 212 | "model_id": "", 213 | "version_major": 2, 214 | "version_minor": 0 215 | }, 216 | "text/plain": [ 217 | "HBox(children\u003d(IntProgress(value\u003d1, bar_style\u003d\u0027info\u0027, max\u003d1), HTML(value\u003d\u0027\u0027)))" 218 | ] 219 | }, 220 | "metadata": {}, 221 | "output_type": "display_data" 222 | }, 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "\r", 228 | "Dataset acl_docrel downloaded and prepared to ./data/nlp_cache/acl_docrel/docs/0.1.0. 
Subsequent calls will reuse this data.\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "data_dir \u003d Path(\u0027./output/acl_docrel\u0027)\n", 234 | "\n", 235 | "experiment_args \u003d ExperimentArguments(\n", 236 | " nlp_dataset\u003d\u0027./datasets/acl_docrel/acl_docrel.py\u0027,\n", 237 | " nlp_cache_dir\u003d\u0027./data/nlp_cache\u0027,\n", 238 | " doc_id_col\u003d\u0027s2_id\u0027,\n", 239 | " doc_a_col\u003d\u0027from_s2_id\u0027,\n", 240 | " doc_b_col\u003d\u0027to_s2_id\u0027,\n", 241 | " cv_fold\u003d1,\n", 242 | ")\n", 243 | "\n", 244 | "docs_ds \u003d load_dataset(experiment_args.nlp_dataset,\n", 245 | " name\u003d\u0027docs\u0027,\n", 246 | " cache_dir\u003dexperiment_args.nlp_cache_dir,\n", 247 | " split\u003dnlp.Split(\u0027docs\u0027))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 3, 253 | "metadata": { 254 | "pycharm": {} 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "Total tokens: 2,194,010\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "# Extract tokens from each document and create token file.\n", 267 | "tokens_count \u003d 0\n", 268 | "with open(data_dir / \u0027tokens.txt\u0027, \u0027w\u0027) as f:\n", 269 | " for idx, doc in docs_ds.data.to_pandas().iterrows():\n", 270 | " text \u003d acl.utils.get_text_from_doc(doc) \n", 271 | " for token in gensim.utils.simple_preprocess(text, min_len\u003d2, max_len\u003d15):\n", 272 | " f.write(token + \u0027 \u0027)\n", 273 | " tokens_count +\u003d 1\n", 274 | " f.write(\u0027\\n\u0027)\n", 275 | " \n", 276 | "# Total tokens: 2,194,010\n", 277 | "print(f\u0027Total tokens: {tokens_count:,}\u0027)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 4, 283 | "metadata": { 284 | "pycharm": {} 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "import fasttext\n", 289 | "\n", 290 | "model \u003d fasttext.train_unsupervised(str(data_dir / \u0027tokens.txt\u0027), \n", 291 | " model\u003d\u0027skipgram\u0027, \n", 292 | " lr\u003d0.05, # learning rate [0.05]\n", 293 | " dim\u003d300, # size of word vectors [100]\n", 294 | " ws\u003d5, # size of the context window [5]\n", 295 | " epoch\u003d5, # number of epochs [5]\n", 296 | " thread\u003d4, # number of threads [number of cpus]\n", 297 | " )" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 5, 303 | "metadata": { 304 | "pycharm": {} 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "model.save_model(str(data_dir / \u0027acl.fasttext.bin\u0027))" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 6, 314 | "metadata": { 315 | "pycharm": {} 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "from gensim.models.wrappers import FastText\n", 320 | "\n", 321 | "ft_model \u003d FastText.load_fasttext_format(str(data_dir / \u0027acl.fasttext.bin\u0027))\n", 322 | "ft_model.wv.save_word2vec_format(data_dir / \u0027acl.fasttext.w2v.txt\u0027)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "pycharm": {} 330 | }, 331 | "outputs": [], 332 | "source": [] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python [conda env:acl-anthology] *", 338 | "language": "python", 339 | "name": "conda-env-acl-anthology-py" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 3 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": 
"python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython3", 351 | "version": "3.7.4" 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 2 356 | } --------------------------------------------------------------------------------