├── acl
    ├── __init__.py
    ├── preprocessing
    │   ├── __init__.py
    │   ├── scraper.py
    │   ├── citation_mapping.py
    │   ├── negative_sampling.py
    │   └── parsecit.py
    ├── utils.py
    ├── dataset.py
    ├── __data_prep.py
    └── trainer_utils.py
├── cord19
    ├── __init__.py
    ├── preprocessing
    │   ├── __init__.py
    │   ├── negative_sampling.py
    │   └── cord19_reader.py
    ├── utils.py
    └── dataset.py
├── tests
    ├── __init__.py
    ├── test_data_helper.py
    ├── test_acl.py
    ├── test_experiment.py
    ├── test_rnn.py
    ├── test_auto_modeling.py
    └── test_trainer.py
├── datasets
    ├── __init__.py
    ├── acl_docrel
    │   ├── __init__.py
    │   └── acl_docrel.py
    └── cord19_docrel
    │   ├── __init__.py
    │   └── cord19_docrel.py
├── output
    └── README.md
├── demo.gif
├── docrel.png
├── cli.py
├── requirements.txt
├── sbin
    ├── cord19
    │   ├── config.sh
    │   ├── predict_only.sh
    │   ├── xlnet.sh
    │   ├── roberta-base.sh
    │   ├── scibert.sh
    │   ├── bert-base.sh
    │   ├── covid-bert-base.sh
    │   ├── scincl.sh
    │   ├── electra-base-discriminator.sh
    │   └── baseline-lstm-gpu.sh
    ├── acl
    │   ├── config.sh
    │   ├── xlnet.sh
    │   ├── bert-base.sh
    │   ├── roberta-base.sh
    │   ├── scibert.sh
    │   ├── scincl.sh
    │   ├── baseline-lstm-colab.sh
    │   ├── electra-base-discriminator.sh
    │   ├── baseline-lstm.sh
    │   ├── baseline-lstm-gpu.sh
    │   ├── 1.sh
    │   └── gpu1.sh
    └── compress_data_and_upload.sh
├── experiments
    ├── data_loaders.py
    ├── environment.py
    ├── utils.py
    └── data_helpers.py
├── environments
    └── default.yml
├── LICENSE.txt
├── models
    ├── __init__.py
    ├── roberta.py
    ├── electra.py
    ├── bart.py
    ├── bert.py
    ├── xlnet.py
    ├── longformer.py
    ├── utils.py
    ├── rnn.py
    └── auto_modeling.py
├── demo_utils.py
├── .gitignore
├── demo.ipynb
├── README.md
└── word_vectors.ipynb
/acl/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cord19/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/acl/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/cord19/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/acl_docrel/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/datasets/cord19_docrel/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/output/README.md:
--------------------------------------------------------------------------------
1 | Keep this directory for all output files.
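Training runs write their results below this directory. A rough sketch of the expected layout (inferred from OUTPUT_DIR in sbin/acl/config.sh and sbin/cord19/config.sh and from the paths referenced in sbin/compress_data_and_upload.sh; the exact sub-directories depend on dataset, fold, and model name):

    output/acl_docrel/folds/<fold>/<model_name>/
    output/cord19_docrel/folds/<fold>/<model_name>/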
-------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/aspect-document-similarity/HEAD/demo.gif -------------------------------------------------------------------------------- /docrel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/malteos/aspect-document-similarity/HEAD/docrel.png -------------------------------------------------------------------------------- /cli.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import fire 3 | 4 | from commands import word_vectors, compute_doc_vecs 5 | 6 | logging.basicConfig(level=logging.INFO) 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | if __name__ == '__main__': 11 | fire.Fire({ 12 | 'compute_doc_vecs': compute_doc_vecs.compute_doc_vecs, 13 | 'extract_text': word_vectors.extract_text, 14 | 'train_fasttext': word_vectors.train_fasttext, 15 | }) 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # general 2 | pandas 3 | jupyter 4 | numpy 5 | tqdm 6 | matplotlib 7 | pyyaml 8 | nltk 9 | scikit-learn==0.23.1 10 | 11 | # data acquisition & preprocessing 12 | fuzzywuzzy 13 | lxml 14 | cssselect 15 | # bibtexparser 16 | requests 17 | smart-open 18 | 19 | # wiki related (+ nlp) 20 | # wikipedia2vec 21 | gensim 22 | spacy 23 | 24 | # model & evaluation 25 | torch==1.6.0 26 | transformers==2.10.0 27 | tokenizers==0.7.0 28 | nlp==0.1.0 29 | 30 | # experiments 31 | tensorboard 32 | wandb==0.8.36 33 | -------------------------------------------------------------------------------- /sbin/cord19/config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export LR=2e-5 # 5e-05 # LEARNING_RATE = 2e-5 # 2e-6 does not work (?) 4 | export EPOCHS=4 # or 4? 5 | export SEED=0 6 | export NLP_CACHE_DIR=./data/nlp_cache 7 | export CACHE_DIR=./data/trainer_cache 8 | 9 | export OUTPUT_DIR=./output/cord19_docrel/folds 10 | export DOC_ID_COL=doi 11 | export DOC_A_COL=from_doi 12 | export DOC_B_COL=to_doi 13 | export NLP_DATASET=./datasets/cord19_docrel/cord19_docrel.py 14 | 15 | # wandb 16 | export WANDB_API_KEY= 17 | export WANDB_PROJECT= 18 | -------------------------------------------------------------------------------- /sbin/acl/config.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # export CUDA_VISIBLE_DEVICES=1 4 | 5 | export LR=2e-5 # 5e-05 # LEARNING_RATE = 2e-5 # 2e-6 does not work (?) 6 | export EPOCHS=4 # or 4?
7 | export SEED=0 8 | export NLP_CACHE_DIR=./data/nlp_cache 9 | export CACHE_DIR=./data/trainer_cache 10 | 11 | export OUTPUT_DIR=./output/acl_docrel/folds 12 | export DOC_ID_COL=s2_id 13 | export DOC_A_COL=from_s2_id 14 | export DOC_B_COL=to_s2_id 15 | export NLP_DATASET=./datasets/acl_docrel/acl_docrel.py 16 | 17 | # wandb 18 | export WANDB_API_KEY= 19 | export WANDB_PROJECT= 20 | -------------------------------------------------------------------------------- /experiments/data_loaders.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from torch.utils.data import DataLoader 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class XYDataLoader(DataLoader): 9 | """ 10 | 11 | Batch consists only of X (item data) and Y (label) 12 | 13 | """ 14 | def get_x_from_batch(self, batch): 15 | raise NotImplementedError() 16 | 17 | def get_y_from_batch(self, batch): 18 | raise NotImplementedError() 19 | 20 | 21 | class DefaultXYDataLoader(XYDataLoader): 22 | """ 23 | 24 | Last item of batch is Y, everything else is X. 25 | 26 | """ 27 | def get_x_from_batch(self, batch): 28 | return batch[:-1] 29 | 30 | def get_y_from_batch(self, batch): 31 | return batch[-1] -------------------------------------------------------------------------------- /environments/default.yml: -------------------------------------------------------------------------------- 1 | # Settings for your local setup 2 | local_mac: 3 | must_exists: /Volumes/data/repo/ 4 | bert_dir: /Volumes/data/repo/data/bert 5 | datasets_dir: /Volumes/data/repo/data 6 | workers: 4 7 | 8 | # Settings for your GPU server 9 | gpu_server: 10 | #must_exists: /usr/bin/nvidia-smi 11 | must_exists: ~/gpu1 12 | bert_dir: /mnt/hdd/datasets/BERT_pre_trained_models/pytorch 13 | datasets_dir: /mnt/hdd/datasets 14 | workers: 12 15 | 16 | gpu_server2: 17 | must_exists: /home/mostendorff/gpu2 18 | datasets_dir: /data/datasets/ 19 | bert_dir: /data/datasets/huggingface_transformers/pytorch 20 | workers: 36 21 | 22 | google_colab: 23 | must_exists: /content 24 | datasets_dir: /dev/null 25 | bert_dir: /dev/null 26 | workers: 24 27 | -------------------------------------------------------------------------------- /sbin/cord19/predict_only.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export CV_FOLD=1 4 | export TRAIN_BATCH_SIZE=8 5 | export EVAL_BATCH_SIZE=32 6 | 7 | python trainer_cli.py --cv_fold $CV_FOLD \ 8 | --output_dir $OUTPUT_DIR \ 9 | --model_name_or_path $MODEL_NAME \ 10 | --doc_id_col $DOC_ID_COL \ 11 | --doc_a_col $DOC_A_COL \ 12 | --doc_b_col $DOC_B_COL \ 13 | --nlp_dataset $NLP_DATASET \ 14 | --nlp_cache_dir $NLP_CACHE_DIR \ 15 | --cache_dir $CACHE_DIR \ 16 | --num_train_epochs $EPOCHS \ 17 | --seed $SEED \ 18 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 19 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 20 | --learning_rate $LR \ 21 | --logging_steps 100 \ 22 | --save_steps 0 \ 23 | --save_total_limit 3 \ 24 | --save_predictions 25 | -------------------------------------------------------------------------------- /experiments/environment.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import yaml 4 | 5 | 6 | def get_env(): 7 | env = None 8 | 9 | base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 10 | env_dir = os.path.join(base_dir, 'environments') 11 | 12 | print(env_dir) 13 | 14 | for fn in os.listdir(env_dir): 15 | if fn.endswith('.yml'): 
16 | with open(os.path.join(env_dir, fn), 'r') as f: 17 | envs = yaml.load(f, Loader=yaml.SafeLoader) 18 | 19 | for env_name, _env in envs.items(): 20 | if os.path.exists(_env['must_exists']): 21 | print(f'Environment detected: {env_name} (in {fn})') 22 | env = _env 23 | break 24 | if env is not None: 25 | break 26 | 27 | if env: 28 | return env 29 | else: 30 | raise ValueError('Could not determine env!') 31 | -------------------------------------------------------------------------------- /sbin/acl/xlnet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=xlnet-base 10 | 11 | # xlnet-base 12 | export EVAL_BATCH_SIZE=32 13 | export TRAIN_BATCH_SIZE=6 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/cord19/xlnet.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=xlnet-base-cased 10 | 11 | # xlnet-base 12 | export EVAL_BATCH_SIZE=24 13 | export TRAIN_BATCH_SIZE=6 14 | 15 | for CV_FOLD in 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/acl/bert-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=bert-base-cased 10 | 11 | # bert-base 12 | export EVAL_BATCH_SIZE=16 13 | export TRAIN_BATCH_SIZE=8 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/acl/roberta-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=roberta-base 10 | 11 | # roberta-base 12 | export EVAL_BATCH_SIZE=32 13 | export TRAIN_BATCH_SIZE=6 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/acl/scibert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=scibert-scivocab-uncased 10 | 11 | # bert-base 12 | export EVAL_BATCH_SIZE=16 13 | export TRAIN_BATCH_SIZE=8 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/cord19/roberta-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=roberta-base 10 | 11 | # roberta-base 12 | export EVAL_BATCH_SIZE=24 13 | export TRAIN_BATCH_SIZE=8 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/cord19/scibert.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=scibert-scivocab-uncased 10 | 11 | # bert-base 12 | export EVAL_BATCH_SIZE=16 13 | export TRAIN_BATCH_SIZE=8 14 | 15 | for CV_FOLD in 1 2 3 4 16 | do 17 | python trainer_cli.py --cv_fold $CV_FOLD \ 18 | --output_dir $OUTPUT_DIR \ 19 | --model_name_or_path $MODEL_NAME \ 20 | --doc_id_col $DOC_ID_COL \ 21 | --doc_a_col $DOC_A_COL \ 22 | --doc_b_col $DOC_B_COL \ 23 | --nlp_dataset $NLP_DATASET \ 24 | --nlp_cache_dir $NLP_CACHE_DIR \ 25 | --cache_dir $CACHE_DIR \ 26 | --num_train_epochs $EPOCHS \ 27 | --seed $SEED \ 28 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 29 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 30 | --learning_rate $LR \ 31 | --logging_steps 100 \ 32 | --save_steps 0 \ 33 | --save_total_limit 3 \ 34 | --do_train \ 35 | --save_predictions 36 | done 37 | 38 | export PYTHONUNBUFFERED="" 39 | -------------------------------------------------------------------------------- /sbin/cord19/bert-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export CV_FOLD=1 10 | export MODEL_NAME=bert-base-cased 11 | 12 | # bert-base 13 | export EVAL_BATCH_SIZE=32 14 | export TRAIN_BATCH_SIZE=8 15 | 16 | for CV_FOLD in 1 2 3 4 17 | do 18 | python trainer_cli.py --cv_fold $CV_FOLD \ 19 | --output_dir $OUTPUT_DIR \ 20 | --model_name_or_path $MODEL_NAME \ 21 | --doc_id_col $DOC_ID_COL \ 22 | --doc_a_col $DOC_A_COL \ 23 | --doc_b_col $DOC_B_COL \ 24 | --nlp_dataset $NLP_DATASET \ 25 | --nlp_cache_dir $NLP_CACHE_DIR \ 26 | --cache_dir $CACHE_DIR \ 27 | --num_train_epochs $EPOCHS \ 28 | --seed $SEED \ 29 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 30 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 31 | --learning_rate $LR \ 32 | --logging_steps 100 \ 33 | --save_steps 0 \ 34 | --save_total_limit 3 \ 35 | --do_train \ 36 | --save_predictions 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /sbin/acl/scincl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Train on 12 GB GPU 4 | 5 | export PYTHONUNBUFFERED=1 6 | 7 | export APP_ROOT=$(dirname "$0") 8 | 9 | . 
$APP_ROOT/config.sh 10 | 11 | export MODEL_NAME=scincl 12 | 13 | # bert-base 14 | export EVAL_BATCH_SIZE=16 15 | export TRAIN_BATCH_SIZE=8 16 | 17 | # 1 2 3 4 18 | for CV_FOLD in 2 3 4 19 | do 20 | python trainer_cli.py --cv_fold $CV_FOLD \ 21 | --output_dir $OUTPUT_DIR \ 22 | --model_name_or_path $MODEL_NAME \ 23 | --doc_id_col $DOC_ID_COL \ 24 | --doc_a_col $DOC_A_COL \ 25 | --doc_b_col $DOC_B_COL \ 26 | --nlp_dataset $NLP_DATASET \ 27 | --nlp_cache_dir $NLP_CACHE_DIR \ 28 | --cache_dir $CACHE_DIR \ 29 | --num_train_epochs $EPOCHS \ 30 | --seed $SEED \ 31 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 32 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 33 | --learning_rate $LR \ 34 | --logging_steps 100 \ 35 | --save_steps 0 \ 36 | --save_total_limit 3 \ 37 | --do_train \ 38 | --save_predictions 39 | done 40 | 41 | export PYTHONUNBUFFERED="" 42 | -------------------------------------------------------------------------------- /sbin/cord19/covid-bert-base.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export CV_FOLD=1 10 | export MODEL_NAME=deepset/covid_bert_base 11 | 12 | # bert-base 13 | export EVAL_BATCH_SIZE=32 14 | export TRAIN_BATCH_SIZE=8 15 | 16 | for CV_FOLD in 1 2 3 4 17 | do 18 | python trainer_cli.py --cv_fold $CV_FOLD \ 19 | --output_dir $OUTPUT_DIR \ 20 | --model_name_or_path $MODEL_NAME \ 21 | --doc_id_col $DOC_ID_COL \ 22 | --doc_a_col $DOC_A_COL \ 23 | --doc_b_col $DOC_B_COL \ 24 | --nlp_dataset $NLP_DATASET \ 25 | --nlp_cache_dir $NLP_CACHE_DIR \ 26 | --cache_dir $CACHE_DIR \ 27 | --num_train_epochs $EPOCHS \ 28 | --seed $SEED \ 29 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 30 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 31 | --learning_rate $LR \ 32 | --logging_steps 100 \ 33 | --save_steps 0 \ 34 | --save_total_limit 3 \ 35 | --do_train \ 36 | --save_predictions 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /sbin/cord19/scincl.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Train on 12 GB GPU 4 | 5 | export PYTHONUNBUFFERED=1 6 | 7 | export APP_ROOT=$(dirname "$0") 8 | 9 | . 
$APP_ROOT/config.sh 10 | 11 | export MODEL_NAME=scincl 12 | 13 | # bert-base 14 | export EVAL_BATCH_SIZE=16 15 | export TRAIN_BATCH_SIZE=8 16 | 17 | # 1 2 3 4 18 | for CV_FOLD in 1 2 3 4 19 | do 20 | python trainer_cli.py --cv_fold $CV_FOLD \ 21 | --output_dir $OUTPUT_DIR \ 22 | --model_name_or_path $MODEL_NAME \ 23 | --doc_id_col $DOC_ID_COL \ 24 | --doc_a_col $DOC_A_COL \ 25 | --doc_b_col $DOC_B_COL \ 26 | --nlp_dataset $NLP_DATASET \ 27 | --nlp_cache_dir $NLP_CACHE_DIR \ 28 | --cache_dir $CACHE_DIR \ 29 | --num_train_epochs $EPOCHS \ 30 | --seed $SEED \ 31 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 32 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 33 | --learning_rate $LR \ 34 | --logging_steps 100 \ 35 | --save_steps 0 \ 36 | --save_total_limit 3 \ 37 | --do_train \ 38 | --save_predictions 39 | done 40 | 41 | export PYTHONUNBUFFERED="" 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Malte Ostendorff 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /sbin/acl/baseline-lstm-colab.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=baseline-rnn 10 | export EPOCHS=10 11 | export SPACY_MODEL=./en_glove_6b_300d 12 | 13 | for CV_FOLD in 1 14 | do 15 | python trainer_cli.py --cv_fold $CV_FOLD \ 16 | --output_dir $OUTPUT_DIR \ 17 | --model_name_or_path $MODEL_NAME \ 18 | --doc_id_col $DOC_ID_COL \ 19 | --doc_a_col $DOC_A_COL \ 20 | --doc_b_col $DOC_B_COL \ 21 | --nlp_dataset $NLP_DATASET \ 22 | --nlp_cache_dir $NLP_CACHE_DIR \ 23 | --cache_dir $CACHE_DIR \ 24 | --num_train_epochs $EPOCHS \ 25 | --seed $SEED \ 26 | --learning_rate $LR \ 27 | --logging_steps 500 \ 28 | --save_steps 0 \ 29 | --save_total_limit 3 \ 30 | --do_train \ 31 | --save_predictions \ 32 | --spacy_model $SPACY_MODEL \ 33 | --rnn_type lstm \ 34 | --evaluate_during_training \ 35 | --per_gpu_eval_batch_size 24 \ 36 | --per_gpu_train_batch_size 12 37 | done 38 | 39 | export PYTHONUNBUFFERED="" -------------------------------------------------------------------------------- /sbin/acl/electra-base-discriminator.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export CV_FOLD=1 10 | export MODEL_NAME=google/electra-base-discriminator 11 | 12 | # roberta-base 13 | export EVAL_BATCH_SIZE=12 14 | export TRAIN_BATCH_SIZE=8 15 | 16 | for CV_FOLD in 1 2 3 4 17 | do 18 | python trainer_cli.py --cv_fold $CV_FOLD \ 19 | --output_dir $OUTPUT_DIR \ 20 | --model_name_or_path $MODEL_NAME \ 21 | --doc_id_col $DOC_ID_COL \ 22 | --doc_a_col $DOC_A_COL \ 23 | --doc_b_col $DOC_B_COL \ 24 | --nlp_dataset $NLP_DATASET \ 25 | --nlp_cache_dir $NLP_CACHE_DIR \ 26 | --cache_dir $CACHE_DIR \ 27 | --num_train_epochs $EPOCHS \ 28 | --seed $SEED \ 29 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 30 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 31 | --learning_rate $LR \ 32 | --logging_steps 100 \ 33 | --save_steps 0 \ 34 | --save_total_limit 3 \ 35 | --do_train \ 36 | --save_predictions 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /sbin/cord19/electra-base-discriminator.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | export CV_FOLD=1 10 | export MODEL_NAME=google/electra-base-discriminator 11 | 12 | # roberta-base 13 | export EVAL_BATCH_SIZE=32 14 | export TRAIN_BATCH_SIZE=8 15 | 16 | for CV_FOLD in 1 2 3 4 17 | do 18 | python trainer_cli.py --cv_fold $CV_FOLD \ 19 | --output_dir $OUTPUT_DIR \ 20 | --model_name_or_path $MODEL_NAME \ 21 | --doc_id_col $DOC_ID_COL \ 22 | --doc_a_col $DOC_A_COL \ 23 | --doc_b_col $DOC_B_COL \ 24 | --nlp_dataset $NLP_DATASET \ 25 | --nlp_cache_dir $NLP_CACHE_DIR \ 26 | --cache_dir $CACHE_DIR \ 27 | --num_train_epochs $EPOCHS \ 28 | --seed $SEED \ 29 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 30 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 31 | --learning_rate $LR \ 32 | --logging_steps 100 \ 33 | --save_steps 0 \ 34 | --save_total_limit 3 \ 35 | --do_train \ 36 | --save_predictions 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /sbin/acl/baseline-lstm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=baseline-rnn 10 | export EPOCHS=10 11 | export SPACY_MODEL=~/datasets/spacy/en_glove_6b_300d 12 | export SPACY_MODEL=/Volumes/data/repo/data/spacy/en_glove_6b_300d 13 | 14 | for CV_FOLD in 1 15 | do 16 | python trainer_cli.py --cv_fold $CV_FOLD \ 17 | --output_dir $OUTPUT_DIR \ 18 | --model_name_or_path $MODEL_NAME \ 19 | --doc_id_col $DOC_ID_COL \ 20 | --doc_a_col $DOC_A_COL \ 21 | --doc_b_col $DOC_B_COL \ 22 | --nlp_dataset $NLP_DATASET \ 23 | --nlp_cache_dir $NLP_CACHE_DIR \ 24 | --cache_dir $CACHE_DIR \ 25 | --num_train_epochs $EPOCHS \ 26 | --seed $SEED \ 27 | --learning_rate $LR \ 28 | --logging_steps 500 \ 29 | --save_steps 0 \ 30 | --save_total_limit 3 \ 31 | --do_train \ 32 | --save_predictions \ 33 | --spacy_model $SPACY_MODEL \ 34 | --rnn_type lstm \ 35 | --evaluate_during_training \ 36 | --no_cuda 37 | done 38 | 39 | export PYTHONUNBUFFERED="" 40 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import torch.nn as nn 4 | 5 | from models.utils import get_concat, get_mlp 6 | 7 | __all__ = [ 8 | 'ExperimentModel', 9 | ] 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class ExperimentModel(nn.Module): 15 | labels_count = 1 16 | 17 | def forward(self, *input): 18 | raise NotImplementedError() 19 | 20 | def get_single_device(self): 21 | """ 22 | If all parameters are on a single device, use this method to get current device. 
23 | See: https://github.com/pytorch/pytorch/issues/7460 24 | """ 25 | return next(self.parameters()).device 26 | 27 | def get_classification_probability_layer(self, mode='auto'): 28 | logger.debug(f'Classification probability layer with {mode}') 29 | if mode == 'auto': 30 | logger.debug(f'Auto-mode; labels count = {self.labels_count}') 31 | if self.labels_count == 1: 32 | return self.get_classification_probability_layer('sigmoid') 33 | else: 34 | return self.get_classification_probability_layer('softmax') 35 | elif mode == 'sigmoid': 36 | return nn.Sigmoid() 37 | elif mode == 'softmax': 38 | return nn.Softmax(dim=0) 39 | elif mode == 'none': 40 | return None 41 | else: 42 | raise ValueError('Unsupported mode') 43 | 44 | -------------------------------------------------------------------------------- /models/roberta.py: -------------------------------------------------------------------------------- 1 | from torch.nn import BCEWithLogitsLoss 2 | from transformers import RobertaForSequenceClassification 3 | 4 | 5 | class RobertaForMultiLabelSequenceClassification(RobertaForSequenceClassification): 6 | def forward( 7 | self, 8 | input_ids=None, 9 | attention_mask=None, 10 | token_type_ids=None, 11 | position_ids=None, 12 | head_mask=None, 13 | inputs_embeds=None, 14 | labels=None, 15 | ): 16 | outputs = self.roberta( 17 | input_ids, 18 | attention_mask=attention_mask, 19 | token_type_ids=token_type_ids, 20 | position_ids=position_ids, 21 | head_mask=head_mask, 22 | inputs_embeds=inputs_embeds, 23 | ) 24 | sequence_output = outputs[0] 25 | logits = self.classifier(sequence_output) 26 | 27 | outputs = (logits,) + outputs[2:] 28 | if labels is not None: 29 | # Single-label classification (as in RobertaForSequenceClassification) 30 | # loss_fct = CrossEntropyLoss() 31 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 32 | loss_fct = BCEWithLogitsLoss() 33 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 34 | 35 | outputs = (loss,) + outputs 36 | 37 | return outputs # (loss), logits, (hidden_states), (attentions) -------------------------------------------------------------------------------- /tests/test_data_helper.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | 4 | # TODO implement these tests 5 | import numpy as np 6 | import pandas as pd 7 | import torch 8 | from torch.utils.data import WeightedRandomSampler, TensorDataset, DataLoader 9 | 10 | from sci.data_helpers import BaseSciDataHelper 11 | 12 | 13 | class DataHelperTests(TestCase): 14 | def test_negative_sampling(self): 15 | raise NotImplementedError() 16 | 17 | def test_weighted_sampler(self): 18 | 19 | items_a = ['a'] * 10 20 | items_b = ['b'] * 3 21 | items_c = ['c'] * 5 22 | items = items_a + items_b + items_c 23 | 24 | dh = BaseSciDataHelper(label_col='label', labels=['a', 'b', 'c'], none_label=None) 25 | 26 | df = pd.DataFrame({'label': items}) 27 | dh.set_label_encoder(df) 28 | 29 | label_weights, weights = dh.get_sampler_weights(df) 30 | 31 | ys = torch.tensor(dh.label_encoder.transform(items)) 32 | 33 | sampler = WeightedRandomSampler(weights, num_samples=int(weights.sum()), replacement=True) 34 | 35 | dl = DataLoader(TensorDataset(ys), sampler=sampler, batch_size=4) 36 | 37 | out = [] 38 | 39 | for batch in dl: 40 | yss = batch[0].numpy() 41 | out += dh.label_encoder.inverse_transform(yss).tolist() 42 | 43 | odf = pd.DataFrame({'label': out}) 44 | 45 | print(df['label'].value_counts()) 46 | 
print(odf['label'].value_counts()) 47 | -------------------------------------------------------------------------------- /tests/test_acl.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from unittest import TestCase 4 | 5 | from acl.__data_prep import load_parscit_file, get_citation_context 6 | from experiments.environment import get_env 7 | 8 | 9 | class ACLTest(TestCase): 10 | def __init__(self, *args, **kwargs): 11 | super().__init__(*args, **kwargs) 12 | self.env = get_env() 13 | self.data_dir = data_dir = os.path.join(self.env['datasets_dir'], 'acl-anthology') 14 | 15 | def test_get_cits(self): 16 | title2acl_ids = {} 17 | author_last2titles = {} 18 | year2titles = defaultdict(list) 19 | 20 | fp = self.data_dir + '/parscit/D/D15/D15-1312-parscit.130908.xml' 21 | fn = 'D15-1312-parscit.130908.xml' 22 | 23 | error_files = [] 24 | out = [] 25 | acl_id2sects = {} 26 | acl_id2markers = {} 27 | 28 | sects, cits, markers = load_parscit_file(fp) 29 | 30 | from_id = '-'.join(fn.split('-', 2)[:2]) # ACL ID 31 | 32 | acl_id2sects[from_id] = sects 33 | acl_id2markers[from_id] = markers 34 | 35 | print('----') 36 | 37 | print(sects) 38 | 39 | print('----') 40 | 41 | print(cits) 42 | 43 | print('---') 44 | 45 | cits_with_context = get_citation_context(cits, sects, title2acl_ids, year2titles, author_last2titles) 46 | # 47 | # out += [(from_id, to_id, context[0], context[1], context[2]) for to_id, context in cits_with_context] 48 | 49 | -------------------------------------------------------------------------------- /cord19/utils.py: -------------------------------------------------------------------------------- 1 | # normalize section title 2 | def normalize_section(title): 3 | return title.strip().lower()\ 4 | .replace('conclusions', 'conclusion')\ 5 | .replace('concluding remarks', 'conclusion')\ 6 | .replace('future perspectives', 'future work')\ 7 | .replace('future directions', 'future work')\ 8 | .replace('viruses.', 'virus')\ 9 | .replace('viruses', 'virus') 10 | #.replace('conclusion and future perspectives', 'conclusion')\ 11 | #.replace('materials and methods', 'methods') 12 | 13 | 14 | def resolve_and_sect_titles(cits): 15 | for from_doi, to_doi, sect_title in cits: 16 | for t in normalize_section(sect_title).split(' and '): 17 | yield (from_doi, to_doi, t) 18 | 19 | 20 | def get_text_from_doi(doi, doi2paper, raise_not_found_error=True): 21 | text = '' 22 | sep = '\n' 23 | 24 | # if doi in doi2s2paper: 25 | # # from s2 scraper 26 | # # text += doi2s2paper[doi]['title'] 27 | # 28 | # if doi2s2paper[doi]['abstract']: 29 | # # text += '\n' + doi2s2paper[doi]['abstract'] 30 | # text = doi2s2paper[doi]['title'] + sep + doi2s2paper[doi]['abstract'] 31 | 32 | if doi in doi2paper: 33 | # text += doi2paper[doi]['metadata']['title'] 34 | 35 | if doi2paper[doi]['abstract'] and len(doi2paper[doi]['abstract']) > 10: 36 | # text += doi2paper[doi]['metadata']['title'] + '\n' + doi2paper[doi]['abstract'][0]['text'] 37 | text = doi2paper[doi]['title'] + sep + doi2paper[doi]['abstract'] 38 | 39 | elif raise_not_found_error: 40 | raise ValueError('DOI not found') 41 | 42 | return text 43 | -------------------------------------------------------------------------------- /models/electra.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import BCEWithLogitsLoss 4 | from transformers import BertModel, BertPreTrainedModel, 
BertForSequenceClassification, ElectraForSequenceClassification 5 | 6 | 7 | class ElectraForMultiLabelSequenceClassification(ElectraForSequenceClassification): 8 | """Electra model for classification. 9 | This module is composed of Electra BERT model with a linear layer on top of 10 | the pooled output. 11 | """ 12 | 13 | def forward( 14 | self, 15 | input_ids=None, 16 | attention_mask=None, 17 | token_type_ids=None, 18 | position_ids=None, 19 | head_mask=None, 20 | inputs_embeds=None, 21 | labels=None, 22 | ): 23 | discriminator_hidden_states = self.electra( 24 | input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds 25 | ) 26 | sequence_output = discriminator_hidden_states[0] 27 | logits = self.classifier(sequence_output) 28 | 29 | outputs = (logits,) + discriminator_hidden_states[2:] # add hidden states and attention if they are here 30 | 31 | if labels is not None: 32 | # Single-label classification (as in BertForSequenceClassification) 33 | # loss_fct = CrossEntropyLoss() 34 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 35 | 36 | loss_fct = BCEWithLogitsLoss() 37 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 38 | 39 | outputs = (loss,) + outputs 40 | 41 | return outputs # (loss), logits, (hidden_states), (attentions) 42 | -------------------------------------------------------------------------------- /models/bart.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn import BCEWithLogitsLoss 3 | from transformers import BartForSequenceClassification 4 | 5 | 6 | class BartForMultiLabelSequenceClassification(BartForSequenceClassification): 7 | def forward( 8 | self, 9 | input_ids, 10 | attention_mask=None, 11 | encoder_outputs=None, 12 | decoder_input_ids=None, 13 | decoder_attention_mask=None, 14 | labels=None, 15 | ): 16 | outputs = self.model( 17 | input_ids, 18 | attention_mask=attention_mask, 19 | decoder_input_ids=decoder_input_ids, 20 | decoder_attention_mask=decoder_attention_mask, 21 | encoder_outputs=encoder_outputs, 22 | ) 23 | x = outputs[0] # last hidden state 24 | eos_mask = input_ids.eq(self.config.eos_token_id) 25 | if len(torch.unique(eos_mask.sum(1))) > 1: 26 | raise ValueError("All examples must have the same number of tokens.") 27 | sentence_representation = x[eos_mask, :].view(x.size(0), -1, x.size(-1))[:, -1, :] 28 | logits = self.classification_head(sentence_representation) 29 | # Prepend logits 30 | outputs = (logits,) + outputs[1:] # Add hidden states and attention if they are here 31 | if labels is not None: # prepend loss to output, 32 | # Single label 33 | # loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) 34 | loss_fct = BCEWithLogitsLoss() 35 | loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1, self.config.num_labels)) 36 | 37 | outputs = (loss,) + outputs 38 | 39 | return outputs 40 | 41 | -------------------------------------------------------------------------------- /demo_utils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from transformers import AutoTokenizer 3 | 4 | from models.bert import BertForMultiLabelSequenceClassification 5 | 6 | 7 | def get_paper(doc_id): 8 | res = requests.get(f'https://api.semanticscholar.org/v1/paper/{doc_id}') 9 | 10 | if res.status_code == 200: 11 | return res.json() 12 | else: 13 | raise ValueError(f'Cannot load paper from S2 API: {doc_id}') 14 | 15 
| 16 | def get_prediction(model_name_or_path: str, from_id, to_id): 17 | from_doc = get_paper(from_id) 18 | to_doc = get_paper(to_id) 19 | 20 | if 'acl' in model_name_or_path: 21 | labels = ['introduction', 'related work', 'experiment', 'conclusion', 'results', 'background', 'discussion', 22 | 'evaluation', 'method', 'other', 'none'] 23 | else: 24 | labels = ['discussion', 'introduction', 'conclusion', 'results', 'methods', 'background', 'materials', 'virus', 25 | 'future work', 'other', 'none'] 26 | 27 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) 28 | model = BertForMultiLabelSequenceClassification.from_pretrained(model_name_or_path) 29 | 30 | model_input = tokenizer.batch_encode_plus( 31 | [(from_doc['title'] + '\n' + from_doc['abstract'], to_doc['title'] + '\n' + to_doc['abstract'])], 32 | pad_to_max_length=True, truncation_strategy='longest_first', return_token_type_ids=True, 33 | return_attention_masks=True, return_tensors='pt', max_length=512 34 | ) 35 | 36 | model_out = model(**model_input) 37 | 38 | pred_scores = model_out[0].detach().numpy()[0] 39 | pred_labels = [label for idx, label in enumerate(labels) if pred_scores[idx] > 0.] 40 | 41 | return pred_scores, pred_labels, from_doc, to_doc 42 | -------------------------------------------------------------------------------- /models/bert.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import BCEWithLogitsLoss 4 | from transformers import BertModel, BertPreTrainedModel, BertForSequenceClassification 5 | 6 | 7 | class BertForMultiLabelSequenceClassification(BertForSequenceClassification): 8 | """BERT model for classification. 9 | This module is composed of the BERT model with a linear layer on top of 10 | the pooled output. 
11 | """ 12 | 13 | def forward( 14 | self, 15 | input_ids=None, 16 | attention_mask=None, 17 | token_type_ids=None, 18 | position_ids=None, 19 | head_mask=None, 20 | inputs_embeds=None, 21 | labels=None, 22 | ): 23 | outputs = self.bert( 24 | input_ids, 25 | attention_mask=attention_mask, 26 | token_type_ids=token_type_ids, 27 | position_ids=position_ids, 28 | head_mask=head_mask, 29 | inputs_embeds=inputs_embeds, 30 | ) 31 | pooled_output = outputs[1] 32 | 33 | pooled_output = self.dropout(pooled_output) 34 | logits = self.classifier(pooled_output) 35 | 36 | outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here 37 | 38 | if labels is not None: 39 | # Single-label classification (as in BertForSequenceClassification) 40 | # loss_fct = CrossEntropyLoss() 41 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 42 | 43 | loss_fct = BCEWithLogitsLoss() 44 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 45 | 46 | outputs = (loss,) + outputs 47 | 48 | return outputs # (loss), logits, (hidden_states), (attentions) 49 | -------------------------------------------------------------------------------- /models/xlnet.py: -------------------------------------------------------------------------------- 1 | from torch.nn import BCEWithLogitsLoss 2 | from transformers import XLNetForSequenceClassification 3 | 4 | 5 | class XLNetForMultiLabelSequenceClassification(XLNetForSequenceClassification): 6 | def forward( 7 | self, 8 | input_ids=None, 9 | attention_mask=None, 10 | mems=None, 11 | perm_mask=None, 12 | target_mapping=None, 13 | token_type_ids=None, 14 | input_mask=None, 15 | head_mask=None, 16 | inputs_embeds=None, 17 | use_cache=True, 18 | labels=None, 19 | ): 20 | 21 | transformer_outputs = self.transformer( 22 | input_ids, 23 | attention_mask=attention_mask, 24 | mems=mems, 25 | perm_mask=perm_mask, 26 | target_mapping=target_mapping, 27 | token_type_ids=token_type_ids, 28 | input_mask=input_mask, 29 | head_mask=head_mask, 30 | inputs_embeds=inputs_embeds, 31 | use_cache=use_cache, 32 | ) 33 | output = transformer_outputs[0] 34 | 35 | output = self.sequence_summary(output) 36 | logits = self.logits_proj(output) 37 | 38 | outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it 39 | 40 | if labels is not None: 41 | # Single-label classification (as in XLNetForSequenceClassification 42 | # loss_fct = CrossEntropyLoss() 43 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 44 | 45 | loss_fct = BCEWithLogitsLoss() 46 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 47 | 48 | outputs = (loss,) + outputs 49 | 50 | return outputs # return (loss), logits, (mems), (hidden states), (attentions) -------------------------------------------------------------------------------- /sbin/compress_data_and_upload.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Compress data for upload 4 | 5 | # ACL: S2 papers 6 | tar -cvf acl_s2.tar title2dblp_hits.json.gz acl_id2s2.json.gz arxiv2s2.json.gz doi2s2.json.gz 7 | 8 | # CORD-19 9 | tar -cvf cord19_s2.tar metadata.csv doi2s2paper.json.gz 10 | bzip2 cord19_s2.tar 11 | 12 | # Models (SciBERT) 13 | tar -cvzf ./cord19_fold-1_scibert-scivocab-uncased.tar.gz --directory=../output/cord19_docrel/folds/1/ scibert-scivocab-uncased 14 | tar -cvzf ./acl_fold-1_scibert-scivocab-uncased.tar.gz 
--directory=../output/acl_docrel/folds/1/ scibert-scivocab-uncased 15 | 16 | # Results 17 | tar -cvzf acl_output.tar.gz --exclude='*.bin' --exclude='__*' ../output/acl_docrel/* 18 | tar -cvzf cord19_output.tar.gz --exclude='*.bin' --exclude='__*' ../output/cord19_docrel/* 19 | 20 | 21 | ### Upload to GitHub release (with https://github.com/github-release/github-release) 22 | export GITHUB_TOKEN= 23 | export GITHUB_USER= 24 | export GITHUB_REPO= 25 | 26 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name acl_s2.tar --file acl_s2.tar 27 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name cord19_s2.tar.bz2 --file cord19_s2.tar.bz2 28 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name acl_fold-1_scibert-scivocab-uncased.tar.gz --file acl_fold-1_scibert-scivocab-uncased.tar.gz 29 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name cord19_fold-1_scibert-scivocab-uncased.tar.gz --file cord19_fold-1_scibert-scivocab-uncased.tar.gz 30 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name acl_output.tar.gz --file acl_output.tar.gz 31 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name cord19_output.tar.gz --file cord19_output.tar.gz 32 | ~/github-release upload --user $GITHUB_USER --repo $GITHUB_REPO --tag 1.0 --name scibert-vocab.txt --file ~/datasets/BERT_pre_trained_models/pytorch/scibert-scivocab-uncased/vocab.txt 33 | 34 | -------------------------------------------------------------------------------- /models/longformer.py: -------------------------------------------------------------------------------- 1 | from torch.nn import BCEWithLogitsLoss 2 | from transformers import RobertaForSequenceClassification, BertPreTrainedModel, LongformerConfig, LongformerModel, \ 3 | LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP 4 | from transformers.modeling_roberta import RobertaClassificationHead 5 | 6 | 7 | class LongformerForMultiLabelSequenceClassification(BertPreTrainedModel): 8 | config_class = LongformerConfig 9 | pretrained_model_archive_map = LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP 10 | base_model_prefix = "longformer" 11 | 12 | def __init__(self, config): 13 | super().__init__(config) 14 | self.num_labels = config.num_labels 15 | 16 | self.roberta = LongformerModel(config) 17 | self.classifier = RobertaClassificationHead(config) 18 | 19 | def forward( 20 | self, 21 | input_ids=None, 22 | attention_mask=None, 23 | token_type_ids=None, 24 | position_ids=None, 25 | head_mask=None, 26 | inputs_embeds=None, 27 | labels=None, 28 | ): 29 | outputs = self.roberta( 30 | input_ids, 31 | attention_mask=attention_mask, 32 | token_type_ids=token_type_ids, 33 | position_ids=position_ids, 34 | head_mask=head_mask, 35 | inputs_embeds=inputs_embeds, 36 | ) 37 | sequence_output = outputs[0] 38 | logits = self.classifier(sequence_output) 39 | 40 | outputs = (logits,) + outputs[2:] 41 | if labels is not None: 42 | # Single-label classification (as in RobertaForSequenceClassification) 43 | # loss_fct = CrossEntropyLoss() 44 | # loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) 45 | loss_fct = BCEWithLogitsLoss() 46 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 47 | 48 | outputs = (loss,) + outputs 49 | 50 | return outputs # (loss), logits, (hidden_states), (attentions) -------------------------------------------------------------------------------- /tests/test_experiment.py: 
-------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from torch.nn import BCELoss 4 | from tqdm import tqdm 5 | 6 | from experiments import Experiment 7 | from experiments.utils import flatten 8 | from models.transformers import JointBERT 9 | from wiki.data_helpers import JointBERTWikiDataHelper 10 | 11 | 12 | class ExperimentTest(TestCase): 13 | def test_cls_init(self): 14 | 15 | exp = Experiment( 16 | # random_seed=0, 17 | epochs=1, 18 | model_cls='models.JointBERT', 19 | model_params={ 20 | 'bert_model_path': '/Volumes/data/repo/data/bert/bert-base-cased', 21 | 'labels_count': 3, 22 | }, 23 | loss_func_cls='torch.nn.BCELoss', # loss, 24 | model_output_to_loss_input=lambda ys: ys.double(), 25 | data_helper_cls='wiki.data_helpers.JointBERTDataHelper', 26 | data_helper_params={ 27 | 'wiki_relations_path': '../wiki/relations.csv', 28 | 'wiki_articles_path': '../wiki/docs.pickle', 29 | 'labels': ['employer', 'country_of_citizenship'], 30 | # 'employer' # 'capital' # 'country_of_citizenship' #'educated_at' # 'opposite_of' 31 | 'label_col': 'relation_name', 32 | 'negative_sampling_ratio': 1., 33 | 'train_test_split': 0.7, 34 | 'max_seq_length': 512, 35 | 'train_batch_size': 4, 36 | 'test_batch_size': 4, 37 | 'bert_model_path': '/Volumes/data/repo/data/bert/bert-base-cased', 38 | # 'bert_tokenizer_cls': '', 39 | 'bert_tokenizer_params': { 40 | 'do_lower_case': False, 41 | }, 42 | 'df_limit': 3, 43 | }, 44 | tqdm_cls='tqdm.tqdm', 45 | output_dir='../output', 46 | ) 47 | 48 | assert isinstance(exp.model, JointBERT) 49 | assert isinstance(exp.data_helper, JointBERTWikiDataHelper) 50 | assert isinstance(exp.loss_func, BCELoss) 51 | assert tqdm == exp.tqdm_cls 52 | 53 | print(flatten(exp.to_dict())) 54 | 55 | exp.run() -------------------------------------------------------------------------------- /tests/test_rnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from unittest import TestCase 4 | 5 | import spacy 6 | import torch 7 | from transformers import AutoTokenizer, AutoConfig, RobertaTokenizer, RobertaForSequenceClassification 8 | 9 | from acl.__data_prep import load_parscit_file, get_citation_context 10 | from acl.trainer_utils import get_vectors_from_spacy_model 11 | from experiments.environment import get_env 12 | from models.auto_modeling import AutoModelForMultiLabelSequenceClassification 13 | from models.rnn import RNNForMultiLabelSequenceClassification 14 | from trainer_cli import ExperimentArguments 15 | 16 | 17 | class AutoModelingTest(TestCase): 18 | env = None 19 | 20 | def setUp(self) -> None: 21 | os.environ["WANDB_DISABLED"] = "true" 22 | os.environ["WANDB_WATCH"] = "false" 23 | self.env = get_env() 24 | 25 | self.cache_dir = '../data/transformers_cache' 26 | self.sample_text = ' '.join(['Hello world! 
'] * 10) 27 | self.num_labels = 5 28 | 29 | def test_rnn_model(self): 30 | 31 | # tokenizer = AutoTokenizer.from_pretrained(tokenizer_name if tokenizer_name else model_name_or_path, cache_dir=self.cache_dir) 32 | # model_config = AutoConfig.from_pretrained(model_name_or_path, num_labels=self.num_labels, cache_dir=self.cache_dir) 33 | 34 | # model = AutoModelForMultiLabelSequenceClassification.from_pretrained(model_name_or_path, config=model_config, cache_dir=self.cache_dir) 35 | 36 | experiment_args = ExperimentArguments('s2_id', 'from_s2_id', 'to_s2_id', 1, 'acl_docrel') 37 | # label_classes 38 | spacy_nlp = spacy.load(experiment_args.spacy_model, disable=["tagger", "ner", "textcat"]) 39 | 40 | model = RNNForMultiLabelSequenceClassification( 41 | word_vectors=get_vectors_from_spacy_model(spacy_nlp), 42 | hidden_size=experiment_args.rnn_hidden_size, 43 | rnn=experiment_args.rnn_type, 44 | num_labels=self.num_labels, 45 | num_layers=experiment_args.rnn_num_layers, 46 | dropout=experiment_args.rnn_dropout, 47 | ) 48 | # 49 | # model.eval() 50 | # 51 | # encodings = tokenizer.batch_encode_plus([text], return_tensors='pt') 52 | # 53 | # return model(encodings['input_ids']), model, tokenizer 54 | -------------------------------------------------------------------------------- /sbin/cord19/baseline-lstm-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=baseline-rnn 10 | export EPOCHS=10 11 | export SPACY_MODEL=~/datasets/spacy/en_glove_6b_300d 12 | 13 | export MODEL_NAME=baseline-rnn__fasttext 14 | export SPACY_MODEL=~/datasets/spacy/en_fasttext_wiki-news-300d-1m 15 | 16 | export MODEL_NAME=baseline-rnn__fasttext__custom 17 | export SPACY_MODEL=./output/cord19_docrel/spacy/en_cord19_fasttext_300d 18 | 19 | export EPOCHS=10 20 | export CV_FOLD=1 21 | export LR=1e-5 22 | export RNN_NUM_LAYERS=2 23 | export RNN_HIDDEN_SIZE=100 24 | export RNN_DROPOUT=0.1 25 | 26 | # [1] Reimers, N. and Gurevych, I. 2016. Optimal Hyperparameters for Deep LSTM-Networks for Sequence Labeling Tasks. (2016). 27 | # - 28 | # A value of about 100 for each LSTM-network appears to be a good rule of thumb for the tested tasks 29 | # - 30 | # For tasks with small training sets appears a mini-batch size of 8 a robust selection. 31 | # For tasks with larger training sets appears a mini-batch size of 32 a robust selection. 32 | # - 33 | # Except for the reduced POS tagging task, two BiLSTM-layers produced the best re- sults. 34 | # - 35 | # Variational dropout was on all tasks superior to no-dropout or naive dropout. 36 | # Applying dropout along the vertical as well as the recurrent dimension achieved on all benchmark tasks the best result. 
37 | # 0.1 => same as in transformers 38 | 39 | for CV_FOLD in 1 2 3 4 40 | do 41 | python trainer_cli.py --cv_fold $CV_FOLD \ 42 | --output_dir $OUTPUT_DIR \ 43 | --model_name_or_path $MODEL_NAME \ 44 | --doc_id_col $DOC_ID_COL \ 45 | --doc_a_col $DOC_A_COL \ 46 | --doc_b_col $DOC_B_COL \ 47 | --nlp_dataset $NLP_DATASET \ 48 | --nlp_cache_dir $NLP_CACHE_DIR \ 49 | --cache_dir $CACHE_DIR \ 50 | --num_train_epochs $EPOCHS \ 51 | --seed $SEED \ 52 | --learning_rate $LR \ 53 | --logging_steps 500 \ 54 | --save_steps 0 \ 55 | --save_total_limit 3 \ 56 | --do_train \ 57 | --save_predictions \ 58 | --spacy_model $SPACY_MODEL \ 59 | --rnn_type lstm \ 60 | --rnn_num_layers $RNN_NUM_LAYERS \ 61 | --rnn_hidden_size $RNN_HIDDEN_SIZE \ 62 | --rnn_dropout $RNN_DROPOUT \ 63 | --per_gpu_eval_batch_size 32 \ 64 | --per_gpu_train_batch_size 8 \ 65 | --evaluate_during_training 66 | done 67 | 68 | export PYTHONUNBUFFERED="" 69 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/python 3 | # Edit at https://www.gitignore.io/?templates=python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 94 | # install all needed dependencies. 
95 | #Pipfile.lock 96 | 97 | # celery beat schedule file 98 | celerybeat-schedule 99 | 100 | # SageMath parsed files 101 | *.sage.py 102 | 103 | # Environments 104 | .env 105 | .venv 106 | env/ 107 | venv/ 108 | ENV/ 109 | env.bak/ 110 | venv.bak/ 111 | 112 | # Spyder project settings 113 | .spyderproject 114 | .spyproject 115 | 116 | # Rope project settings 117 | .ropeproject 118 | 119 | # mkdocs documentation 120 | /site 121 | 122 | # mypy 123 | .mypy_cache/ 124 | .dmypy.json 125 | dmypy.json 126 | 127 | # Pyre type checker 128 | .pyre/ 129 | 130 | # End of https://www.gitignore.io/api/python 131 | .idea/ 132 | -------------------------------------------------------------------------------- /sbin/acl/baseline-lstm-gpu.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | export MODEL_NAME=baseline-rnn 10 | export EPOCHS=10 11 | export SPACY_MODEL=~/datasets/spacy/en_glove_6b_300d 12 | 13 | export MODEL_NAME=baseline-rnn__fasttext 14 | export SPACY_MODEL=~/datasets/spacy/en_fasttext_wiki-news-300d-1m 15 | 16 | export MODEL_NAME=baseline-rnn__fasttext__custom 17 | export SPACY_MODEL=./output/acl_docrel/spacy/en_acl_fasttext_300d 18 | 19 | export EVAL_BATCH_SIZE=12 20 | export TRAIN_BATCH_SIZE=8 21 | 22 | export EPOCHS=10 23 | export CV_FOLD=1 24 | export LR=1e-5 25 | export RNN_NUM_LAYERS=2 26 | export RNN_HIDDEN_SIZE=100 27 | export RNN_DROPOUT=0.1 28 | 29 | # [1] Reimers, N. and Gurevych, I. 2016. Optimal Hyperparameters for Deep LSTM-Networks for Sequence Labeling Tasks. (2016). 30 | # - 31 | # A value of about 100 for each LSTM-network appears to be a good rule of thumb for the tested tasks 32 | # - 33 | # For tasks with small training sets appears a mini-batch size of 8 a robust selection. 34 | # For tasks with larger training sets appears a mini-batch size of 32 a robust selection. 35 | # - 36 | # Except for the reduced POS tagging task, two BiLSTM-layers produced the best re- sults. 37 | # - 38 | # Variational dropout was on all tasks superior to no-dropout or naive dropout. 39 | # Applying dropout along the vertical as well as the recurrent dimension achieved on all benchmark tasks the best result. 
40 | # 0.1 => same as in transformers 41 | 42 | for CV_FOLD in 1 2 3 4 43 | do 44 | python trainer_cli.py --cv_fold $CV_FOLD \ 45 | --output_dir $OUTPUT_DIR \ 46 | --model_name_or_path $MODEL_NAME \ 47 | --doc_id_col $DOC_ID_COL \ 48 | --doc_a_col $DOC_A_COL \ 49 | --doc_b_col $DOC_B_COL \ 50 | --nlp_dataset $NLP_DATASET \ 51 | --nlp_cache_dir $NLP_CACHE_DIR \ 52 | --cache_dir $CACHE_DIR \ 53 | --num_train_epochs $EPOCHS \ 54 | --seed $SEED \ 55 | --learning_rate $LR \ 56 | --logging_steps 500 \ 57 | --save_steps 0 \ 58 | --save_total_limit 3 \ 59 | --do_train \ 60 | --save_predictions \ 61 | --spacy_model $SPACY_MODEL \ 62 | --rnn_type lstm \ 63 | --rnn_num_layers $RNN_NUM_LAYERS \ 64 | --rnn_hidden_size $RNN_HIDDEN_SIZE \ 65 | --rnn_dropout $RNN_DROPOUT \ 66 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 67 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 68 | --evaluate_during_training 69 | done 70 | 71 | export PYTHONUNBUFFERED="" 72 | -------------------------------------------------------------------------------- /acl/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | def get_sorted_pair(a, b): 8 | # ensure citation pair is always in same order 9 | if a > b: 10 | return (a, b) 11 | else: 12 | return (b, a) 13 | 14 | 15 | def to_label(t, labels): 16 | if t in labels: 17 | return t 18 | else: 19 | return 'other' 20 | 21 | 22 | def normalize_title(t): 23 | if t: 24 | t = t.replace('.', ' ').replace('-', ' ').strip().lower() 25 | #t = re.sub(r'\W+', '', t) 26 | return t 27 | 28 | 29 | def normalize_section(title): 30 | if title: 31 | return re.sub(r'[\.0-9]', '', 32 | title. 33 | strip() \ 34 | .lower() \ 35 | .replace('conclusions', 'conclusion') \ 36 | .replace('methodology', 'method') \ 37 | .replace('methods', 'method') \ 38 | .replace('related works', 'related work') \ 39 | .replace('models', 'model') \ 40 | .replace('datasets', 'dataset') \ 41 | .replace('our ', '') \ 42 | .replace('evaluations', 'evaluation') \ 43 | .replace('experiments', 'experiment') 44 | ).strip() 45 | # .replace('conclusion and future perspectives', 'conclusion')\ 46 | # .replace('materials and methods', 'methods') 47 | 48 | 49 | def get_text_from_doc(doc) -> str: 50 | """ 51 | Build document text from title + abstract 52 | 53 | :param doc: S2 paper 54 | :return: Document text 55 | """ 56 | 57 | text = '' 58 | 59 | if 'title' in doc: 60 | text += doc['title'] 61 | 62 | if doc['abstract']: 63 | text += '\n' + doc['abstract'] 64 | 65 | return text 66 | 67 | 68 | def get_text_from_doc_id(doc_id: str, doc_index) -> str: 69 | """ 70 | 71 | Build document text from title + abstract 72 | 73 | :param doc_id: S2-id 74 | :param doc_index: S2-id to S2-paper data 75 | :return: Document text 76 | """ 77 | 78 | if doc_id in doc_index: 79 | return get_text_from_doc(doc_index[doc_id]) 80 | else: 81 | raise ValueError(f'Document not found in index: {doc_id}') 82 | 83 | 84 | # resolve 'and' titles and filter for out-of-index docs 85 | def resolve_and_sect_titles(items, doc_index=None): 86 | for from_s2_id, to_s2_id, sect_generic, sect_title, sect_marker in items: 87 | if doc_index and (from_s2_id not in doc_index or to_s2_id not in doc_index): 88 | # One of the IDs does not exist in document index 89 | continue 90 | 91 | sect_title = normalize_section(sect_title) 92 | 93 | if sect_title: 94 | # Resolve combined sections 95 | for t in sect_title.split(' and '): 96 | if t: 97 | yield (from_s2_id, 
to_s2_id, t, sect_marker) 98 | -------------------------------------------------------------------------------- /acl/preprocessing/scraper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import tqdm 3 | 4 | 5 | def scrape_s2(job_name, needed_ids, id2s2, id2s2_errors, id_prefix='', sleep=2.5, save_every_n=1000, offset=0): 6 | api_url = 'http://api.semanticscholar.org/v1/paper/' 7 | 8 | try: 9 | for i, needed_id in enumerate(tqdm(needed_ids, total=len(needed_ids))): 10 | if i < offset: # skip 11 | continue 12 | 13 | if needed_id in id2s2 or needed_id in id2s2_errors: 14 | continue 15 | 16 | res = requests.get(api_url + id_prefix + needed_id) 17 | 18 | if res.status_code == 200: 19 | try: 20 | id2s2[needed_id] = res.json() 21 | except ValueError as e: 22 | print(f'Error cannot parse JSON: {needed_id}') 23 | id2s2_errors[needed_id] = str(e) 24 | elif res.status_code == 429: 25 | print(f'Stop! Rate limit reached at: {i}') 26 | break 27 | elif res.status_code == 403: 28 | print(f'Stop! Forbidden / rate limit reached at: {i}') 29 | break 30 | elif res.status_code == 404: 31 | id2s2_errors[needed_id] = 404 32 | else: 33 | print(f'Error status: {res.status_code} - {needed_id}') 34 | id2s2_errors[needed_id] = res.text 35 | 36 | if save_every_n > 0 and (i % save_every_n) == 0 and i > 0: 37 | json.dump(id2s2, open(output_dir / f'{job_name}.json', 'w')) 38 | json.dump(id2s2_errors, open(output_dir / f'{job_name}_errors.json', 'w')) 39 | 40 | time.sleep(sleep) 41 | except KeyboardInterrupt: 42 | print('Aborting...') 43 | pass 44 | 45 | return id2s2, id2s2_errors 46 | 47 | 48 | 49 | def scrape_dblp(): 50 | missing_titles = set(filtered_cits.keys()).difference(set(title2dblp_hits.keys())) 51 | print(f'Missing titles: {len(missing_titles):,}') 52 | 53 | title2dblp_hits = {} 54 | dblp_errors = {} 55 | 56 | url = 'https://dblp.org/search/publ/api' 57 | 58 | for i, (title, idxs) in tqdm(enumerate(filtered_cits.items()), total=len(filtered_cits)): 59 | if title in title2dblp_hits or title in dblp_errors: 60 | continue 61 | 62 | q = title 63 | res = requests.get(url, params={'query': q, 'format': 'json'}) 64 | 65 | if res.status_code == 200: 66 | title2dblp_hits[title] = res.json()['result']['hits'] 67 | elif res.status_code == 422: 68 | dblp_errors[title] = res.status_code 69 | print(f'422: unprocesseble entity: {title}') 70 | else: 71 | # dblp_errors[title] = res.status_code 72 | print(f'Error: {res.text}') 73 | break 74 | 75 | time.sleep(0.5) 76 | 77 | # if i > 3: 78 | # break 79 | 80 | print(f'Scraped data for {len(title2dblp_hits)} papers from DBPL (errors: {len(dblp_errors)})') 81 | -------------------------------------------------------------------------------- /sbin/acl/1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . 
$APP_ROOT/config.sh 8 | 9 | # models: albert-base-v1 bert-base-german-cased biobert-v1-1 longformer-base-4096.tar.gz pytorch 10 | # scibert-scivocab-uncased 11 | #albert-base-v2 bert-base-multilingual-cased distilbert-base-uncased 12 | # longformer-large-4096 roberta-base xlnet-base-cased 13 | #bert-base-cased bert-large-cased 14 | # longformer-base-4096 longformer-large-4096.tar.gz 15 | # roberta-large 16 | 17 | export MODEL_NAME=bert-base-cased 18 | export MODEL_NAME=bert-large-cased 19 | 20 | export MODEL_NAME=roberta-base 21 | export MODEL_NAME=longformer-base-4096 22 | export MODEL_NAME=xlnet-base 23 | 24 | export CV_FOLD=1 25 | 26 | # longformer 27 | export EVAL_BATCH_SIZE=4 28 | export TRAIN_BATCH_SIZE=4 29 | 30 | # large 31 | export EVAL_BATCH_SIZE=4 32 | export TRAIN_BATCH_SIZE=2 33 | 34 | # bert-base 35 | export EVAL_BATCH_SIZE=16 36 | export TRAIN_BATCH_SIZE=8 37 | 38 | # xlnet-base 39 | export EVAL_BATCH_SIZE=12 40 | export TRAIN_BATCH_SIZE=6 41 | 42 | 43 | 44 | for CV_FOLD in 1 2 3 4 45 | do 46 | python trainer_cli.py --cv_fold $CV_FOLD \ 47 | --output_dir $OUTPUT_DIR \ 48 | --model_name_or_path $MODEL_NAME \ 49 | --doc_id_col $DOC_ID_COL \ 50 | --doc_a_col $DOC_A_COL \ 51 | --doc_b_col $DOC_B_COL \ 52 | --nlp_dataset $NLP_DATASET \ 53 | --nlp_cache_dir $NLP_CACHE_DIR \ 54 | --cache_dir $CACHE_DIR \ 55 | --num_train_epochs $EPOCHS \ 56 | --seed $SEED \ 57 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 58 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 59 | --learning_rate $LR \ 60 | --logging_steps 100 \ 61 | --save_steps 0 \ 62 | --save_total_limit 3 \ 63 | --do_train \ 64 | --save_predictions 65 | done 66 | 67 | ###### 68 | 69 | export EVAL_BATCH_SIZE=16 70 | export TRAIN_BATCH_SIZE=8 71 | 72 | for MODEL_NAME in "bert-base-cased" "scibert-scivocab-uncased" "roberta-base" "xlnet-base-cased" "google/electra-base-discriminator" "deepset/covid_bert_base" 73 | do 74 | echo $MODEL_NAME 75 | export CV_FOLD=1 76 | python trainer_cli.py --cv_fold $CV_FOLD \ 77 | --output_dir $OUTPUT_DIR \ 78 | --model_name_or_path $MODEL_NAME \ 79 | --doc_id_col $DOC_ID_COL \ 80 | --doc_a_col $DOC_A_COL \ 81 | --doc_b_col $DOC_B_COL \ 82 | --nlp_dataset $NLP_DATASET \ 83 | --nlp_cache_dir $NLP_CACHE_DIR \ 84 | --cache_dir $CACHE_DIR \ 85 | --num_train_epochs $EPOCHS \ 86 | --seed $SEED \ 87 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 88 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 89 | --learning_rate $LR \ 90 | --logging_steps 100 \ 91 | --save_steps 0 \ 92 | --save_total_limit 3 \ 93 | --do_train \ 94 | --save_predictions 95 | done 96 | 97 | 98 | export PYTHONUNBUFFERED="" 99 | 100 | -------------------------------------------------------------------------------- /acl/preprocessing/citation_mapping.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import List 4 | 5 | 6 | def get_title2s2_id(id2s2__title_list: List): 7 | title2s2_id = {} 8 | 9 | for id2s2, id2title in id2s2__title_list: 10 | title2s2_id.update({id2title[_id]: s2['paperId'] for _id, s2 in id2s2.items() if _id in id2title}) 11 | 12 | return title2s2_id 13 | 14 | 15 | def get_dblp_titles(fp): 16 | """ 17 | 18 | 19 | :param fp: Path to DBLP scraper results (JSON) 20 | :return: acl_id2title, doi2title, arxiv2title 21 | """ 22 | title2dblp_hits = json.load(open(fp, 'r')) 23 | 24 | title2doi = {} 25 | doi2title = {} 26 | 27 | title2arxiv = {} 28 | arxiv2title = {} 29 | 30 | title2acl_id = {} 31 | acl_id2title = {} 32 | 33 | for i, (title, hits) in 
enumerate(title2dblp_hits.items()): 34 | if hits['@total'] == '1': # igore multi matches 35 | hit = hits['hit'][0] 36 | 37 | if 'doi' in hit['info']: 38 | doi = hit['info']['doi'].replace('https://doi.org/', '') 39 | 40 | doi2title[doi] = title 41 | title2doi[title] = doi 42 | continue 43 | 44 | if 'ee' in hit['info']: 45 | ee = hit['info']['ee'] 46 | if 'aclweb.org/anthology/' in ee: 47 | match = re.search(r'anthology/([-a-zA-Z0-9]+)', ee) 48 | if match: 49 | acl_id = match.group(1) 50 | title2acl_id[title] = acl_id 51 | acl_id2title[acl_id] = title 52 | continue 53 | 54 | # print(acl_id) 55 | 56 | if 'arxiv.org' in ee: 57 | match = re.search(r'arxiv.org\/abs\/(.+)', ee) 58 | if match: 59 | arxiv_id = match.group(1) 60 | title2arxiv[title] = arxiv_id 61 | arxiv2title[arxiv_id] = title 62 | continue 63 | 64 | # print(arxiv_id) 65 | # other 66 | # print(hit['info']['ee']) 67 | 68 | # print(hits) 69 | # print('----') 70 | # if i > 100: 71 | # break 72 | 73 | found = len(doi2title) + len(arxiv2title) + len(acl_id2title) 74 | 75 | print(f'Found DOIs: {len(doi2title)} ({len(title2doi)})') 76 | print(f'Found arXiv: {len(arxiv2title)}') 77 | print(f'Found ACL: {len(acl_id2title)}') 78 | 79 | print(f'-- Found all: {found:,} / {len(title2dblp_hits):,}') 80 | 81 | return acl_id2title, doi2title, arxiv2title 82 | 83 | 84 | def get_s2_pairs_from_cits(cit_pairs, acl_id2s2): 85 | s2_pairs = [] 86 | not_found = [] 87 | 88 | for from_s2_id, from_acl_id, to_s2_id, sect_generic, sect_title, sect_marker in cit_pairs: 89 | if from_s2_id == None: 90 | if from_acl_id in acl_id2s2: 91 | from_s2_id = acl_id2s2[from_acl_id]['paperId'] 92 | else: 93 | not_found.append((from_acl_id, to_s2_id)) 94 | continue 95 | 96 | s2_pairs.append(( 97 | from_s2_id, 98 | to_s2_id, 99 | sect_generic, 100 | sect_title, 101 | sect_marker, 102 | ), ) 103 | 104 | return s2_pairs, not_found 105 | -------------------------------------------------------------------------------- /models/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import logging 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | def get_concat(concat: str, embedding_dim: int): 11 | """ 12 | 13 | :param concat: Concatenation style 14 | :param embedding_dim: Size of inputs that are subject to concatenation 15 | :return: Function that performs concatenation, Size of concatenation output 16 | """ 17 | concat_func = None 18 | concat_dim = None 19 | 20 | if concat == 'simple': 21 | concat_func = lambda a, b: torch.cat((a, b), dim=1) 22 | concat_dim = 2 * embedding_dim 23 | elif concat == 'dif': 24 | # x = np.abs(a-b) 25 | concat_func = lambda a, b: (a - b).abs() 26 | concat_dim = 1 * embedding_dim 27 | elif concat == 'prod': 28 | # x = a * b 29 | concat_func = lambda a, b: a * b 30 | concat_dim = 1 * embedding_dim 31 | elif concat == 'dif-prod': 32 | # x = np.hstack((np.abs(a-b), a * b)) 33 | concat_func = lambda a, b: torch.cat(((a - b).abs(), a * b), dim=1) 34 | concat_dim = 2 * embedding_dim 35 | 36 | elif concat == '3d-prod': 37 | # x = np.hstack((a, b, a*b)) 38 | concat_func = lambda a, b: torch.cat((a, b, a * b), dim=1) 39 | concat_dim = 3 * embedding_dim 40 | 41 | elif concat == '3d-dif': 42 | # x = np.hstack((a, b, np.abs(a-b))) 43 | concat_func = lambda a, b: torch.cat((a, b, (a - b).abs()), dim=1) 44 | concat_dim = 3 * embedding_dim 45 | elif concat == '4d-prod-dif': 46 | # x = np.hstack((a, b, a*b, np.abs(a-b))) 47 | concat_func = lambda a, b: torch.cat((a, b, a 
* b, (a - b).abs()), dim=1) 48 | concat_dim = 4 * embedding_dim 49 | 50 | else: 51 | raise ValueError('Unsupported concat mode') 52 | 53 | logger.debug(f'concat_dim = {concat_dim} ({concat})') 54 | 55 | return concat_func, concat_dim 56 | 57 | 58 | def get_mlp(input_dim, output_dim, hidden_dim, hidden_layers_count=1, dropout_p=0., activation_cls=nn.ReLU): 59 | """ 60 | Generate a fully-connected layer (MLP) with dynamic input, output and hidden dimension, and hidden layer count. 61 | 62 | - when dropout_p > 0, then dropout is applied with given probability after the activation function. 63 | 64 | :param input_dim: 65 | :return: Sequential layer 66 | """ 67 | layers = [ 68 | # first layer 69 | nn.Linear(input_dim, hidden_dim), 70 | activation_cls(), 71 | ] 72 | 73 | if dropout_p > 0: 74 | layers.append(nn.Dropout(dropout_p)) 75 | 76 | for layer_idx in range(1, hidden_layers_count): 77 | layers.append(nn.Linear(hidden_dim, hidden_dim)), 78 | layers.append(activation_cls()), 79 | 80 | if dropout_p > 0: 81 | layers.append(nn.Dropout(dropout_p)) 82 | 83 | # last layer 84 | layers.append(nn.Linear(hidden_dim, output_dim)) 85 | 86 | # TODO fill linear layers 87 | # nn.init.xavier_normal_(self.classifier.weight) 88 | # Fills the input Tensor with values according to the method described in “Understanding the difficulty of training deep feedforward neural networks” - Glorot, X. & Bengio, Y. (2010), using a normal distribution. 89 | # kaiming_normal_ 90 | # Fills the input Tensor with values according to the method described in “Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification” - He, K. et al. (2015), using a normal distribution. 91 | 92 | return nn.Sequential(*layers) -------------------------------------------------------------------------------- /tests/test_auto_modeling.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from unittest import TestCase 4 | 5 | import torch 6 | from transformers import AutoTokenizer, AutoConfig, RobertaTokenizer, RobertaForSequenceClassification 7 | 8 | from acl.__data_prep import load_parscit_file, get_citation_context 9 | from experiments.environment import get_env 10 | from models.auto_modeling import AutoModelForMultiLabelSequenceClassification 11 | 12 | 13 | class AutoModelingTest(TestCase): 14 | env = None 15 | 16 | def setUp(self) -> None: 17 | os.environ["WANDB_DISABLED"] = "true" 18 | os.environ["WANDB_WATCH"] = "false" 19 | self.env = get_env() 20 | 21 | self.cache_dir = '../data/transformers_cache' 22 | self.sample_text = ' '.join(['Hello world! 
'] * 10) 23 | self.num_labels = 5 24 | 25 | def forward_model(self, model_name_or_path, text, tokenizer_name=None): 26 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name if tokenizer_name else model_name_or_path, cache_dir=self.cache_dir) 27 | model_config = AutoConfig.from_pretrained(model_name_or_path, num_labels=self.num_labels, cache_dir=self.cache_dir) 28 | 29 | model = AutoModelForMultiLabelSequenceClassification.from_pretrained(model_name_or_path, config=model_config, cache_dir=self.cache_dir) 30 | 31 | model.eval() 32 | 33 | encodings = tokenizer.batch_encode_plus([text], return_tensors='pt') 34 | 35 | return model(encodings['input_ids']), model, tokenizer 36 | 37 | def test_bert_auto(self): 38 | model_name_or_path = self.env['bert_dir'] + '/bert-base-cased' 39 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text) 40 | 41 | print(out) 42 | print(type(model)) 43 | 44 | print(model.config.max_position_embeddings) 45 | 46 | def test_distilbert_auto(self): 47 | model_name_or_path = self.env['bert_dir'] + '/distilbert-base-uncased' 48 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text) 49 | 50 | print(out) 51 | print(type(model)) 52 | 53 | 54 | print(model.config.max_position_embeddings) 55 | 56 | 57 | def test_xlnet_auto(self): 58 | model_name_or_path = 'xlnet-base-cased' 59 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text) 60 | 61 | print(out) 62 | print(type(model)) 63 | 64 | self.assertEqual(self.num_labels, out[0].shape[1]) 65 | 66 | print(model.config.max_position_embeddings) 67 | print(tokenizer.model_max_length) 68 | 69 | def test_roberta_auto(self): 70 | model_name_or_path = 'roberta-base' 71 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text) 72 | 73 | print(out) 74 | print(type(model)) 75 | 76 | self.assertEqual(self.num_labels, out[0].shape[1]) 77 | 78 | print(model.roberta) 79 | print(model.config.max_position_embeddings) 80 | # model.save_pretrained(self.cache_dir) 81 | 82 | def test_roberta_manual(self): 83 | tokenizer = RobertaTokenizer.from_pretrained('roberta-base', cache_dir=self.cache_dir) 84 | model = RobertaForSequenceClassification.from_pretrained('roberta-base', cache_dir=self.cache_dir) 85 | 86 | encodings = tokenizer.batch_encode_plus(['foo bar'], return_tensors='pt') 87 | 88 | print(model(encodings['input_ids'])) 89 | 90 | def test_longformer_auto(self): 91 | model_name_or_path = 'longformer-base-4096' 92 | out, model, tokenizer = self.forward_model(model_name_or_path, self.sample_text, 'roberta-base') 93 | 94 | print(out) 95 | print(type(model)) 96 | 97 | self.assertEqual(self.num_labels, out[0].shape[1]) 98 | 99 | print(model.roberta) 100 | print(model.config.max_position_embeddings) 101 | # model.save_pretrained(self.cache_dir) -------------------------------------------------------------------------------- /cord19/preprocessing/negative_sampling.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import random 4 | from typing import List 5 | 6 | from fuzzywuzzy import fuzz 7 | 8 | from acl.utils import get_sorted_pair 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def get_authors(doi, doi2paper): 14 | if doi in doi2paper: 15 | paper = doi2paper[doi] 16 | 17 | if 'authors' in paper: 18 | last_names = [a.split()[-1].lower() for a in paper['authors']] 19 | return last_names 20 | else: 21 | return [] 22 | 23 | # elif doi in doi2paper: 24 | # 
paper = doi2paper[doi] 25 | # last_names = [a['last'].lower() for a in paper['metadata']['authors']] 26 | # return last_names 27 | else: 28 | raise ValueError(f'DOI not found: {doi}') 29 | 30 | 31 | def have_no_shared_authors(a_doi, b_doi, doi2paper): 32 | try: 33 | a_authors = set(get_authors(a_doi, doi2paper)) 34 | b_authors = set(get_authors(b_doi, doi2paper)) 35 | 36 | overlap = a_authors & b_authors 37 | 38 | if len(overlap) == 0: 39 | return True 40 | else: 41 | return False 42 | 43 | except ValueError: 44 | return False 45 | 46 | 47 | # has same venue 48 | def get_venue(doi, doi2paper): 49 | if doi in doi2paper: 50 | paper = doi2paper[doi] 51 | return str(paper['venue']).lower().strip() if 'venue' in paper else None 52 | else: 53 | raise ValueError(f'DOI not found: {doi}') 54 | 55 | 56 | def have_not_same_venue(a_doi, b_doi, doi2paper): 57 | a_venue = get_venue(a_doi, doi2paper) 58 | b_venue = get_venue(b_doi, doi2paper) 59 | 60 | if a_venue is None or b_venue is None or a_venue == "" or b_venue == "": 61 | # cant answer if venue is not set 62 | return False 63 | 64 | if fuzz.ratio(a_venue, b_venue) < 0.75: 65 | # fuzzy string matching score must be low! 66 | return True 67 | else: 68 | return False 69 | 70 | 71 | def get_negative_pairs(doi2paper, candidate_doc_ids: List[str], positive_pairs, cits_set, cocits_set, negative_ratio=0.5, negative_count=0): 72 | # negative_label = 'none' 73 | # negative_needed = 10000 #105492 # len(df) 74 | 75 | if negative_count > 0: 76 | negative_needed = negative_count 77 | else: 78 | negative_needed = math.ceil(len(positive_pairs) * negative_ratio) 79 | 80 | negative_rows = [] 81 | negative_pairs = set() 82 | tries = 0 83 | 84 | print(f'Negatives needed: {negative_needed:,} (ratio: {negative_ratio})') 85 | 86 | while len(negative_pairs) < negative_needed: 87 | a = random.choice(candidate_doc_ids) 88 | b = random.choice(candidate_doc_ids) 89 | 90 | if a == b: 91 | tries += 1 92 | continue 93 | 94 | pair = tuple((a, b)) 95 | 96 | if pair in negative_pairs: 97 | continue 98 | 99 | cit_pair = get_sorted_pair(a, b) 100 | 101 | if cit_pair in cits_set: 102 | tries += 1 103 | continue 104 | 105 | if cit_pair in cocits_set: 106 | tries += 1 107 | continue 108 | 109 | if not have_no_shared_authors(a, b, doi2paper): 110 | tries += 1 111 | continue 112 | 113 | if not have_not_same_venue(a, b, doi2paper): 114 | tries += 1 115 | continue 116 | 117 | # text = get_text_from_doi(a) 118 | # text_b = get_text_from_doi(b) 119 | # if text == '' or text_b == '': 120 | # continue 121 | 122 | # None of the criteria above matches... 
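        # i.e. the two DOIs are distinct, the pair has not been drawn before, it is neither
        # a citation nor a co-citation, the papers share no authors, and both venues are set
        # but do not match => keep the pair as a negative ('none') example.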
123 | negative_pairs.add(pair) 124 | # negative_rows.append(( 125 | # a, 126 | # b, 127 | # text, 128 | # text_b, 129 | # negative_label, 130 | # )) 131 | 132 | logger.info(f'Found {len(negative_pairs):,} negative rows (tried {tries:,} random samples)') 133 | 134 | return negative_pairs 135 | -------------------------------------------------------------------------------- /tests/test_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import pandas as pd 5 | from nlp import load_dataset 6 | 7 | from acl.trainer_utils import get_label_classes_from_nlp_dataset 8 | from datasets.acl_docrel.acl_docrel import get_train_split, get_test_split 9 | from experiments.environment import get_env 10 | 11 | 12 | class TrainerTest(TestCase): 13 | def __init__(self, *args, **kwargs): 14 | os.environ["WANDB_DISABLED"] = "true" 15 | os.environ["WANDB_WATCH"] = "false" 16 | 17 | super().__init__(*args, **kwargs) 18 | self.env = get_env() 19 | 20 | def test_label_classes(self): 21 | ds = "./datasets/acl_docrel/acl_docrel.py" 22 | 23 | ls = get_label_classes_from_nlp_dataset(ds) 24 | 25 | self.assertEqual(['introduction', 26 | 'related work', 27 | 'experiment', 28 | 'conclusion', 29 | 'results', 30 | 'background', 31 | 'discussion', 32 | 'evaluation', 33 | 'method', 34 | #'previous work', 35 | 'other', 36 | 'none'], ls) 37 | 38 | def test_load_dataset(self): 39 | pass 40 | 41 | def test_load_datasets_and_compare_label_class_distribution(self): 42 | cache_dir = '../data/nlp_cache' 43 | acl_ds = "../datasets/acl_docrel/acl_docrel.py" 44 | cv_fold = 1 45 | 46 | train_ds = load_dataset(acl_ds, 47 | name='relations', 48 | cache_dir=cache_dir, 49 | split=get_train_split(cv_fold)) 50 | test_ds = load_dataset(acl_ds, 51 | name='relations', 52 | cache_dir=cache_dir, 53 | split=get_test_split(cv_fold)) 54 | 55 | labels = [l for r in test_ds for l in r['label']] + [l for r in train_ds for l in r['label']] 56 | df = pd.DataFrame(labels, columns=['label']) 57 | 58 | print('ACL') 59 | print(df['label'].value_counts()) 60 | 61 | print('Pairs: %s '(len(train_ds) + len(test_ds))) 62 | 63 | 64 | ###### 65 | 66 | cord19_ds = "../datasets/cord19_docrel/cord19_docrel.py" 67 | train_ds = load_dataset(cord19_ds, 68 | name='relations', 69 | cache_dir=cache_dir, 70 | split=get_train_split(cv_fold)) 71 | test_ds = load_dataset(cord19_ds, 72 | name='relations', 73 | cache_dir=cache_dir, 74 | split=get_test_split(cv_fold)) 75 | 76 | labels = [l for r in test_ds for l in r['label']] + [l for r in train_ds for l in r['label']] 77 | df = pd.DataFrame(labels, columns=['label']) 78 | 79 | print('CORD19') 80 | print(df['label'].value_counts()) 81 | 82 | print('Pairs: %s ' (len(train_ds) + len(test_ds))) 83 | 84 | 85 | def test_dataset_splits(self): 86 | cache_dir = '../data/nlp_cache' 87 | 88 | for ds in ["../datasets/acl_docrel/acl_docrel.py", "../datasets/cord19_docrel/cord19_docrel.py"]: 89 | print(ds) 90 | 91 | train_count = 0 92 | test_count = 0 93 | 94 | for cv_fold in [1,2,3,4]: 95 | train_ds = load_dataset(ds, 96 | name='relations', 97 | cache_dir=cache_dir, 98 | split=get_train_split(cv_fold)) 99 | 100 | train_count += len(train_ds) 101 | 102 | test_ds = load_dataset(ds, 103 | name='relations', 104 | cache_dir=cache_dir, 105 | split=get_test_split(cv_fold)) 106 | test_count += len(test_ds) 107 | 108 | print('Train: %s' % (train_count / 4)) 109 | print('Test: %s' % (test_count / 4)) 110 | print() 
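
# Not part of the test suite: a minimal sketch of how the label lists stored in the
# 'relations' records (e.g. ['introduction', 'related work']) can be turned into
# multi-hot target vectors with the helper from experiments/utils.py. It assumes the
# ACL label set from datasets/acl_docrel/acl_docrel.py; for CORD-19 use its LABEL_CLASSES.
if __name__ == '__main__':
    from datasets.acl_docrel.acl_docrel import LABEL_CLASSES
    from experiments.utils import get_categorical_one_hot_encoding_from_str

    labels = ['introduction', 'related work']  # label list of a single document pair
    one_hot = get_categorical_one_hot_encoding_from_str(','.join(labels), LABEL_CLASSES)

    # -> numpy array of length len(LABEL_CLASSES) with 1.0 at the two label positions
    print(one_hot)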
-------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": {} 7 | }, 8 | "source": "# Demo for Aspect-oriented Similarity between Research Papers\n\n### Download models \u0026 install dependencies" 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "pycharm": {} 15 | }, 16 | "outputs": [], 17 | "source": "!git clone https://github.com/malteos/aspect-document-similarity.git repo\n%cd repo" 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "pycharm": {} 24 | }, 25 | "outputs": [], 26 | "source": "# Download models (scibert-scivocab-uncased, trained on first CV fold)\n!mkdir -p models/acl models/cord19\n!wget https://github.com/malteos/aspect-document-similarity/releases/download/1.0/acl_fold-1_scibert-scivocab-uncased.tar.gz \n!tar -xzvf acl_fold-1_scibert-scivocab-uncased.tar.gz\n!mv scibert-scivocab-uncased models/acl\n\n!wget https://github.com/malteos/aspect-document-similarity/releases/download/1.0/cord19_fold-1_scibert-scivocab-uncased.tar.gz\n!tar -xzvf cord19_fold-1_scibert-scivocab-uncased.tar.gz\n!mv scibert-scivocab-uncased models/cord19\n\n!wget -O models/cord19/scibert-scivocab-uncased/vocab.txt https://github.com/malteos/aspect-document-similarity/releases/download/1.0/scibert-vocab.txt\n!cp models/cord19/scibert-scivocab-uncased/vocab.txt models/acl/scibert-scivocab-uncased/vocab.txt" 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "pycharm": {} 33 | }, 34 | "outputs": [], 35 | "source": "# Install dependencies (for colab)\n!pip install requests transformers\u003d\u003d2.10.0\n\n# Install all dependencies\n#!pip install -r requirements.txt " 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 31, 40 | "metadata": { 41 | "pycharm": {} 42 | }, 43 | "outputs": [], 44 | "source": "from IPython.core.display import display, HTML\nfrom demo_utils import get_prediction" 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": { 49 | "pycharm": {} 50 | }, 51 | "source": "### Try your own papers\n\n1. Select model (either trained on ACL Anthology or CORD-19)\n2. Select input documents by paper IDs. 
\n\nAll IDs from [Semantic Scholar API](https://api.semanticscholar.org/) are supported (DOI, ArXiv ID, PubMed ID, ACL ID)\n" 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": { 57 | "pycharm": {} 58 | }, 59 | "outputs": [], 60 | "source": "selected_model \u003d \u0027acl\u0027 #@param [\"acl\", \"cord19\"]\nfrom_id \u003d \u002710.3115/1667583.1667640\u0027 #@param {type:\"string\"}\nto_id \u003d \u002710.1145/1367497.1367545\u0027 #@param {type:\"string\"}" 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 15, 65 | "metadata": { 66 | "pycharm": {} 67 | }, 68 | "outputs": [], 69 | "source": "# Perform predictions\npred_scores, pred_labels, from_doc, to_doc \u003d get_prediction(f\u0027./models/{selected_model}/scibert-scivocab-uncased\u0027, from_id, to_id)\n\ndisplay(HTML(f\u0027\u003ch3\u003eSeed: \u003ca href\u003d\"{from_doc[\"url\"]}\"\u003e{from_doc[\"title\"]}\u003c/a\u003e\u003c/h3\u003e\u0027))\ndisplay(HTML(f\u0027\u003ch3\u003eTarget: \u003ca href\u003d\"{to_doc[\"url\"]}\"\u003e{to_doc[\"title\"]}\u003c/a\u003e\u003c/h3\u003e\u0027))\ndisplay(HTML(f\u0027\u003ch4\u003ePredicted labels:\u003c/h4\u003e\u003cpre\u003e{\", \".join(pred_labels)}\u003c/pre\u003e\u0027))" 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.7.4" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } -------------------------------------------------------------------------------- /acl/preprocessing/negative_sampling.py: -------------------------------------------------------------------------------- 1 | # shared author 2 | import logging 3 | import math 4 | import random 5 | from collections import defaultdict 6 | from typing import List, Tuple, Set 7 | 8 | from fuzzywuzzy import fuzz 9 | 10 | from acl.utils import get_sorted_pair 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def get_cocitations(cits: List[Tuple[str, str]]) -> Set[Tuple[str, str]]: 16 | from_to_cits = defaultdict(set) 17 | 18 | for from_id, to_id in cits: 19 | from_to_cits[from_id].add(to_id) 20 | 21 | cocits_set = set() 22 | 23 | for from_cit, to_cits in from_to_cits.items(): 24 | for a in to_cits: 25 | for b in to_cits: 26 | cocits_set.add(get_sorted_pair(a, b)) 27 | 28 | logger.info(f'total co-citation count: {len(cocits_set):,}') 29 | 30 | return cocits_set 31 | 32 | 33 | def get_authors(doc_id, doc_index): 34 | if doc_id in doc_index: 35 | s2paper = doc_index[doc_id] 36 | last_names = [a['name'].split()[-1].lower() for a in s2paper['authors']] 37 | return last_names 38 | else: 39 | raise ValueError(f'Doc ID not found: {doc_id}') 40 | 41 | 42 | def have_no_shared_authors(a_id, b_id, doc_index): 43 | try: 44 | a_authors = set(get_authors(a_id, doc_index)) 45 | b_authors = set(get_authors(b_id, doc_index)) 46 | 47 | overlap = a_authors & b_authors 48 | 49 | if len(overlap) == 0: 50 | return True 51 | else: 52 | return False 53 | 54 | except ValueError: 55 | return False 56 | 57 | 58 | # has same venue 59 | def get_venue(doc_id, doc_index): 60 | if doc_id in doc_index: 61 | s2paper = doc_index[doc_id] 62 | return s2paper['venue'].lower().strip() 63 | else: 64 | raise ValueError(f'Doc ID not 
found: {doc_id}') 65 | 66 | 67 | def have_not_same_venue(a_id, b_id, doc_index): 68 | a_venue = get_venue(a_id, doc_index) 69 | b_venue = get_venue(b_id, doc_index) 70 | 71 | if a_venue == "" or b_venue == "": 72 | # cant answer if venue is not set 73 | return False 74 | 75 | if fuzz.ratio(a_venue, b_venue) < 0.75: 76 | # fuzzy string matching score must be low! 77 | return True 78 | else: 79 | return False 80 | 81 | 82 | def get_negative_pairs(s2_id2s2_paper, positive_pairs, cits_set, cocits_set, negative_ratio=0.5, negative_count=0): 83 | # negative_label = 'none' 84 | # negative_needed = 10000 #105492 # len(df) 85 | 86 | if negative_count > 0: 87 | negative_needed = negative_count 88 | else: 89 | negative_needed = math.ceil(len(positive_pairs) * negative_ratio) 90 | 91 | # negative_rows = [] 92 | negative_pairs = set() 93 | tries = 0 94 | all_doc_ids = list(s2_id2s2_paper.keys()) 95 | 96 | logger.info(f'Negatives needed: {negative_needed:,} (ratio: {negative_ratio}, fixed: {negative_count})') 97 | 98 | while len(negative_pairs) < negative_needed: 99 | a = random.choice(all_doc_ids) 100 | b = random.choice(all_doc_ids) 101 | 102 | if a == b: 103 | tries += 1 104 | continue 105 | 106 | if not have_no_shared_authors(a, b, s2_id2s2_paper): 107 | tries += 1 108 | continue 109 | 110 | if not have_not_same_venue(a, b, s2_id2s2_paper): 111 | tries += 1 112 | continue 113 | 114 | cit_pair = get_sorted_pair(a, b) 115 | if cit_pair in cits_set: 116 | tries += 1 117 | continue 118 | 119 | if cit_pair in cocits_set: 120 | tries += 1 121 | continue 122 | 123 | # text = get_text_from_doc_id(a, s2_id2s2_paper) 124 | # text_b = get_text_from_doc_id(b, s2_id2s2_paper) 125 | # 126 | # if text == '' or text_b == '': 127 | # continue 128 | 129 | pair = tuple((a, b)) 130 | 131 | if pair in negative_pairs: 132 | continue 133 | 134 | negative_pairs.add(pair) 135 | 136 | # negative_rows.append(( 137 | # text, 138 | # text_b, 139 | # negative_label, 140 | # )) 141 | 142 | logger.info(f'Found {len(negative_pairs):,} negative rows (tried {tries:,} random samples)') 143 | 144 | return negative_pairs -------------------------------------------------------------------------------- /cord19/preprocessing/cord19_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from typing import Dict 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | def get_dict_value(d, key, default=None): 10 | if key in d: 11 | return d[key] 12 | else: 13 | return default 14 | 15 | 16 | def get_papers_and_citations_from_cord19(input_dir, id2meta): 17 | 18 | subsets = ['biorxiv_medrxiv', 'comm_use_subset', 'custom_license', 'noncomm_use_subset'] 19 | id2paper = {} 20 | 21 | has_doi = 0 22 | bib_count = 0 23 | cits = [] # from_doi, to_doi,
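    #  and the title of the section in which the citation marker occurs
    #  (see the cits.append(...) call below)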
24 | 25 | for ss in subsets: 26 | ss_dir = os.path.join(input_dir, ss) 27 | 28 | # iterate over files 29 | for fn in os.listdir(ss_dir): 30 | if not fn.endswith('.json'): 31 | continue 32 | 33 | fp = os.path.join(ss_dir, fn) 34 | with open(fp, 'r') as f: 35 | paper = json.load(f) 36 | 37 | if paper['paper_id'] not in id2meta: 38 | continue 39 | 40 | meta = id2meta[paper['paper_id']] 41 | 42 | paper['_meta'] = dict(meta) 43 | 44 | id2paper[paper['paper_id']] = paper 45 | 46 | # has valid DOI 47 | if isinstance(meta['doi'], str) and len(meta['doi']) > 10: 48 | # iterate over body text 49 | for paragraph in paper['body_text']: 50 | # iterate over each citation marker 51 | for cit in paragraph['cite_spans']: 52 | # find corresponding bib entry 53 | if cit['ref_id'] in paper['bib_entries']: 54 | bib = paper['bib_entries'][cit['ref_id']] 55 | bib_count += 1 56 | 57 | # only use bib entries with DOI 58 | if 'DOI' in bib['other_ids']: 59 | has_doi += 1 60 | 61 | for out_doi in bib['other_ids']['DOI']: 62 | cits.append(( 63 | meta['doi'], 64 | out_doi, 65 | paragraph['section'] 66 | )) 67 | # break 68 | # break 69 | 70 | logger.info(f'Paper count: {len(id2paper)}') 71 | logger.info(f'DOI exists: {has_doi / bib_count} (total: {bib_count}; doi: {has_doi})') 72 | logger.info(f'Citation pairs: {len(cits)}') 73 | 74 | return id2paper, cits 75 | 76 | 77 | def merge_cord19_and_s2_papers(id2paper, id2meta, doi2s2paper: Dict[str, Dict]) -> Dict[str, Dict]: 78 | """ 79 | 80 | Merge CORD-19 + S2 81 | 82 | :param id2meta: 83 | :param id2paper: 84 | :param doi2s2paper: 85 | :return: DOI => Paper 86 | """ 87 | doi2paper = {} 88 | 89 | for pid, cord_paper in id2paper.items(): 90 | if pid in id2meta: 91 | doi = id2meta[pid]['doi'] 92 | 93 | paper = { 94 | 'cord19_id': cord_paper['paper_id'], 95 | 's2_id': None, 96 | 'title': cord_paper['metadata']['title'], 97 | 'abstract': cord_paper['abstract'][0]['text'] if len(cord_paper['abstract']) == 1 else None, 98 | 'arxivId': None, 99 | 'doi': doi, 100 | 'venue': cord_paper['_meta']['journal'], 101 | 'year': int(cord_paper['_meta']['publish_time'].split('-')[0]), 102 | 'citations_count': None, 103 | 'references_count': len(cord_paper['bib_entries']), 104 | 'authors': [author['first'] + ' ' + author['last'] for author in cord_paper['metadata']['authors']], 105 | } 106 | doi2paper[doi] = paper 107 | 108 | for doi, s2 in doi2s2paper.items(): 109 | 110 | paper = { 111 | 'cord19_id': None, 112 | 's2_id': get_dict_value(s2, 'paperId'), 113 | 'title': get_dict_value(s2, 'title'), 114 | 'abstract': get_dict_value(s2, 'abstract'), 115 | 'doi': doi, 116 | 'arxivId': get_dict_value(s2, 'arxivId'), 117 | 'venue': get_dict_value(s2, 'venue'), 118 | 'year': get_dict_value(s2, 'year', 0), 119 | 'citations_count': len(get_dict_value(s2, 'citations', [])), 120 | 'references_count': len(get_dict_value(s2, 'references', [])), 121 | 'authors': [a['name'] for a in get_dict_value(s2, 'authors', []) if 'name' in a], 122 | } 123 | 124 | if doi in doi2paper: 125 | logger.warning(f'Overriding CORD19 with S2 paper data: {doi}') 126 | 127 | paper['cord19_id'] = doi2paper[doi]['cord19_id'] 128 | 129 | doi2paper[doi] = paper 130 | 131 | return doi2paper 132 | -------------------------------------------------------------------------------- /experiments/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | from typing import List 3 | 4 | import numpy as np 5 | 6 | from torch.optim.lr_scheduler import LambdaLR 7 | 8 | 9 | def chunks(lst, 
n): 10 | """Yield successive n-sized chunks from lst.""" 11 | for i in range(0, len(lst), n): 12 | yield lst[i:i + n] 13 | 14 | 15 | def chunk(seq, num): 16 | avg = len(seq) / float(num) 17 | out = [] 18 | last = 0.0 19 | 20 | while last < len(seq): 21 | out.append(seq[int(last):int(last + avg)]) 22 | last += avg 23 | 24 | return out 25 | 26 | 27 | def flatten(d, parent_key='', sep='__'): 28 | items = [] 29 | for k, v in d.items(): 30 | new_key = parent_key + sep + k if parent_key else k 31 | if isinstance(v, collections.MutableMapping): 32 | items.extend(flatten(v, new_key, sep=sep).items()) 33 | else: 34 | items.append((new_key, v)) 35 | return dict(items) 36 | 37 | 38 | def unflatten(dictionary, sep='__'): 39 | out_dict = dict() 40 | for key, value in dictionary.items(): 41 | parts = key.split(sep) 42 | d = out_dict 43 | for part in parts[:-1]: 44 | if part not in d: 45 | d[part] = dict() 46 | d = d[part] 47 | d[parts[-1]] = value 48 | return out_dict 49 | 50 | 51 | 52 | def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): 53 | """ Create a schedule with a learning rate that decreases linearly after 54 | linearly increasing during a warmup period. 55 | """ 56 | def lr_lambda(current_step): 57 | if current_step < num_warmup_steps: 58 | return float(current_step) / float(max(1, num_warmup_steps)) 59 | return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) 60 | 61 | return LambdaLR(optimizer, lr_lambda, last_epoch) 62 | 63 | 64 | def get_categorical_one_hot_encoding_from_str(label_str, label_classes: List[str], label_sep=',', return_list=False): 65 | """ 66 | Converts a single or list categorical labels into a one-hot-encoded vectors. 67 | (multi-label multi-class classification) 68 | 69 | good,bad => [1.0, 1.0] 70 | good => [1.0, 0.0] 71 | 72 | [good,bad], [good] => [ [1.0, 1.0], [1.0, 0.0] ] 73 | 74 | :param return_list: 75 | :param label_str: 76 | :param label_classes: Label classes 77 | :param label_sep: Label separator (default: ,) 78 | :return: np.array or List 79 | """ 80 | if isinstance(label_str, List): 81 | # If input is a list of strings 82 | ls = [get_categorical_one_hot_encoding_from_str(ls, label_classes, label_sep, return_list) for ls in label_str] 83 | 84 | if return_list: 85 | return ls 86 | else: 87 | return np.array(ls) 88 | 89 | numerical_labels = [label_classes.index(l) for l in label_str.split(label_sep)] 90 | one_hot = np.zeros(len(label_classes)) 91 | 92 | one_hot[numerical_labels] = 1. 93 | 94 | if return_list: 95 | return one_hot.tolist() 96 | else: 97 | return one_hot 98 | 99 | 100 | def get_categorical_one_hot_encoding_from_str(label_str, label_classes: List[str], label_sep=',', return_list=False): 101 | """ 102 | Converts a single or list categorical labels into a one-hot-encoded vectors. 
103 | (multi-label multi-class classification) 104 | 105 | good,bad => [1.0, 1.0] 106 | good => [1.0, 0.0] 107 | 108 | [good,bad], [good] => [ [1.0, 1.0], [1.0, 0.0] ] 109 | 110 | :param return_list: 111 | :param label_str: 112 | :param label_classes: Label classes 113 | :param label_sep: Label separator (default: ,) 114 | :return: np.array or List 115 | """ 116 | if isinstance(label_str, List): 117 | # If input is a list of strings 118 | ls = [get_categorical_one_hot_encoding(ls, label_classes, label_sep, return_list) for ls in label_str] 119 | 120 | if return_list: 121 | return ls 122 | else: 123 | return np.array(ls) 124 | 125 | numerical_labels = [label_classes.index(l) for l in label_str.split(label_sep)] 126 | one_hot = np.zeros(len(label_classes)) 127 | 128 | one_hot[numerical_labels] = 1. 129 | 130 | if return_list: 131 | return one_hot.tolist() 132 | else: 133 | return one_hot 134 | 135 | 136 | def highlight_max(data, color='green'): 137 | ''' 138 | highlight the maximum in a Series or DataFrame 139 | ''' 140 | attr = 'background-color: {}'.format(color) 141 | #remove % and cast to float 142 | data = data.replace('%','', regex=True).astype(float) 143 | if data.ndim == 1: # Series from .apply(axis=0) or axis=1 144 | is_max = data == data.max() 145 | return [attr if v else '' for v in is_max] 146 | else: # from .apply(axis=None) 147 | is_max = data == data.max().max() 148 | return pd.DataFrame(np.where(is_max, attr, ''), 149 | index=data.index, columns=data.columns) -------------------------------------------------------------------------------- /models/rnn.py: -------------------------------------------------------------------------------- 1 | from torch.nn import BCEWithLogitsLoss 2 | from transformers import PreTrainedModel 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, PackedSequence 8 | 9 | 10 | class RNNForMultiLabelSequenceClassification(nn.Module): 11 | """ 12 | 13 | LSTM/GRU with GloVe/FastText word embeddings 14 | 15 | forward() compatible with Tranformers Trainer 16 | 17 | """ 18 | 19 | def __init__(self, word_vectors, hidden_size=50, num_labels=2, num_layers=1, dropout=0., rnn='lstm'): 20 | super(RNNForMultiLabelSequenceClassification, self).__init__() 21 | 22 | self.num_labels = num_labels 23 | self.word_hidden_state = torch.zeros(2, 1, hidden_size) 24 | 25 | self.word_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 2 * hidden_size)) 26 | self.word_bias = nn.Parameter(torch.Tensor(1, 2 * hidden_size)) 27 | self.context_weight = nn.Parameter(torch.Tensor(2 * hidden_size, 1)) 28 | 29 | self.lookup = nn.Embedding(num_embeddings=word_vectors.shape[0], embedding_dim=word_vectors.shape[1]) 30 | 31 | if rnn == 'gru': 32 | self.rnn = nn.GRU( 33 | input_size=word_vectors.shape[1], 34 | hidden_size=hidden_size, 35 | num_layers=num_layers, 36 | bidirectional=True, 37 | batch_first=True, 38 | dropout=dropout, 39 | ) 40 | elif rnn == 'lstm': 41 | self.rnn = nn.LSTM( 42 | input_size=word_vectors.shape[1], 43 | hidden_size=hidden_size, 44 | num_layers=num_layers, 45 | bidirectional=True, 46 | batch_first=True, 47 | dropout=dropout, 48 | ) 49 | else: 50 | raise ValueError('Unknown RNN type') 51 | 52 | self._create_weights(mean=0.0, std=0.05) 53 | 54 | self.word_attention = nn.Linear(2 * hidden_size, 50) 55 | 56 | # Word context vector to take dot-product with 57 | self.word_context_vector = nn.Linear(50, 1, bias=False) 58 | 59 | self.classifier = nn.Linear(2 * 
hidden_size, self.num_labels) 60 | 61 | # torch.set_printoptions(threshold=10000) 62 | 63 | def _create_weights(self, mean=0.0, std=0.05): 64 | self.word_weight.data.normal_(mean, std) 65 | self.context_weight.data.normal_(mean, std) 66 | 67 | def forward( 68 | self, 69 | input_ids=None, 70 | attention_mask=None, 71 | token_type_ids=None, 72 | position_ids=None, 73 | head_mask=None, 74 | inputs_embeds=None, 75 | labels=None, 76 | ): 77 | 78 | word_ids_lengths = attention_mask.sum(axis=1) 79 | word_embeddings = self.lookup(input_ids) 80 | 81 | packed_word_embeddings = pack_padded_sequence(word_embeddings, 82 | lengths=word_ids_lengths, 83 | batch_first=True, 84 | enforce_sorted=False) 85 | 86 | words_representation, _ = self.rnn(packed_word_embeddings) 87 | # This implementation uses the feature sentence_embeddings. Paper uses hidden state 88 | word_attention = self.word_attention(words_representation.data) 89 | word_attention = torch.tanh(word_attention) 90 | 91 | # Take the dot-product of the attention vectors with the context vector (i.e. parameter of linear layer) 92 | word_attention = self.word_context_vector(word_attention).squeeze(1) # (n_words) 93 | 94 | # Compute softmax over the dot-product manually 95 | # Manually because they have to be computed only over words in the same sentence 96 | 97 | # First, take the exponent 98 | max_value = word_attention.max() # scalar, for numerical stability during exponent calculation 99 | word_attention = torch.exp(word_attention - max_value) # (n_words) 100 | 101 | # Re-arrange as sentences by re-padding with 0s (WORDS -> SENTENCES) 102 | word_attention, _ = pad_packed_sequence(PackedSequence(data=word_attention, 103 | batch_sizes=words_representation.batch_sizes, 104 | sorted_indices=words_representation.sorted_indices, 105 | unsorted_indices=words_representation.unsorted_indices), 106 | batch_first=True) # (n_sentences, max(words_per_sentence)) 107 | 108 | # Calculate softmax values as now words are arranged in their respective sentences 109 | word_alphas = word_attention / torch.sum(word_attention, dim=1, 110 | keepdim=True) # (n_sentences, max(words_per_sentence)) 111 | 112 | # Similarly re-arrange word-level RNN outputs as sentences by re-padding with 0s (WORDS -> SENTENCES) 113 | sentences, _ = pad_packed_sequence(words_representation, 114 | batch_first=True) # (n_sentences, max(words_per_sentence), 2 * word_rnn_size) 115 | 116 | # Find sentence embeddings 117 | sentences = sentences * word_alphas.unsqueeze(2) # (n_sentences, max(words_per_sentence), 2 * word_rnn_size) 118 | 119 | # gets the representation for the sentence 120 | sentences = sentences.sum(dim=1) # (n_sentences) 121 | 122 | logits = self.classifier(sentences) 123 | 124 | outputs = (logits, sentences) 125 | 126 | if labels is not None: 127 | loss_fct = nn.BCEWithLogitsLoss() 128 | loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels)) 129 | 130 | outputs = (loss,) + outputs 131 | 132 | return outputs 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Aspect-based Document Similarity for Research Papers 2 | 3 | Implementation, trained models and result data for the paper **Aspect-based Document Similarity for Research Papers** [(PDF on Arxiv)](https://arxiv.org/abs/2010.06395). 
4 | The supplemental material is available for download under [GitHub Releases](https://github.com/malteos/aspect-document-similarity/releases) or [Zenodo](http://doi.org/10.5281/zenodo.4087898). 5 | 6 | - Datasets are compatible with 🤗 [Huggingface NLP library](https://github.com/huggingface/nlp) (now known as [datasets](https://github.com/huggingface/datasets)). 7 | - Models are available on 🤗 [Huggingface Transformers models](https://huggingface.co/malteos). 8 | 9 | 10 | 11 | ## Demo 12 | 13 | 14 | 15 | You can try our trained models directly on Google Colab on all papers available on Semantic Scholar (via DOI, ArXiv ID, ACL ID, PubMed ID): 16 | 17 | Open [`demo.ipynb`](demo.ipynb) from this repository in Google Colab to run the demo. 18 | 19 | ## Requirements 20 | 21 | - Python 3.7 22 | - CUDA GPU (for Transformers) 23 | 24 | Datasets: 25 | - [ACL Anthology Reference Corpus (ACL ARC)](http://acl-arc.comp.nus.edu.sg/) 26 | - [COVID-19 Open Research Dataset (CORD 19)](https://www.semanticscholar.org/cord19) 27 | 28 | ## Installation 29 | 30 | Create a new virtual environment for Python 3.7 with Conda: 31 | 32 | ```bash 33 | conda create -n paper python=3.7 34 | conda activate paper 35 | ``` 36 | 37 | Clone repository and install dependencies: 38 | ```bash 39 | git clone https://github.com/malteos/aspect-document-similarity.git repo 40 | cd repo 41 | pip install -r requirements.txt 42 | ``` 43 | 44 | ## Experiments 45 | 46 | To reproduce our experiments, follow these steps (if you just want to train and test the models, skip the first two steps): 47 | 48 | ### Prepare 49 | 50 | ```bash 51 | export DIR=./output 52 | 53 | # ACL Anthology 54 | # Get parscit files from: https://acl-arc.comp.nus.edu.sg/archives/acl-arc-160301-parscit/ 55 | sh ./sbin/download_parsecit.sh 56 | 57 | # CORD-19 58 | wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2020-03-13.tar.gz 59 | 60 | # Get additional data (collected from Semantic Scholar API) 61 | wget https://github.com/malteos/aspect-document-similarity/releases/download/1.0/acl_s2.tar 62 | wget https://github.com/malteos/aspect-document-similarity/releases/download/1.0/cord19_s2.tar 63 | ``` 64 | 65 | ### Build datasets 66 | 67 | ```bash 68 | # ACL 69 | python -m acl.dataset save_dataset 70 | 71 | # CORD-19 72 | python -m cord19.dataset save_dataset 73 | 74 | ``` 75 | 76 | ### Use dataset 77 | 78 | The datasets are built on the Huggingface NLP library (soon available on the official repository): 79 | 80 | ```python 81 | from nlp import load_dataset 82 | 83 | # Training data for first CV split 84 | train_dataset = load_dataset( 85 | './datasets/cord19_docrel/cord19_docrel.py', 86 | name='relations', 87 | split='fold_1_train' 88 | ) 89 | ``` 90 | 91 | ### Use models 92 | 93 | ```python 94 | from models.auto_modeling import AutoModelForMultiLabelSequenceClassification 95 | 96 | # Load models with pretrained weights from Huggingface model hub 97 | acl_model = AutoModelForMultiLabelSequenceClassification.from_pretrained('malteos/aspect-acl-scibert-scivocab-uncased') 98 | cord19_model = AutoModelForMultiLabelSequenceClassification.from_pretrained('malteos/aspect-cord19-scibert-scivocab-uncased') 99 | 100 | # Use the models in standard Huggingface fashion ... 101 | # acl_model(input_ids, token_type_ids, ...) 102 | # cord19_model(input_ids, token_type_ids, ...) 
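
# A minimal end-to-end sketch (not the paper's evaluation code). Assumptions: the model
# head has one logit per entry in LABEL_CLASSES, the SciBERT tokenizer is used (the
# released models are SciBERT-based), and 0.5 is taken as the decision threshold.
# The forward call with only input_ids follows tests/test_auto_modeling.py.
import torch
from transformers import AutoTokenizer
from datasets.acl_docrel.acl_docrel import LABEL_CLASSES

tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

doc_a = 'Title of the seed paper. Abstract of the seed paper ...'
doc_b = 'Title of the target paper. Abstract of the target paper ...'

# Encode the document pair (title + abstract of each paper) as a single sequence
enc = tokenizer.encode_plus(doc_a, doc_b, max_length=512, return_tensors='pt')

acl_model.eval()
logits = acl_model(enc['input_ids'])[0]        # shape: (1, num_labels)
probs = torch.sigmoid(logits)[0]               # independent per-label probabilities (BCE training)

predicted = [label for label, p in zip(LABEL_CLASSES, probs.tolist()) if p > 0.5]
print(predicted)                               # e.g. ['related work'] or ['none']

# For a full pipeline that fetches both papers by ID from the Semantic Scholar API,
# see get_prediction() in demo_utils.py (used in demo.ipynb).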
103 | 104 | ``` 105 | 106 | ### Train models 107 | 108 | All models are trained with the `trainer_cli.py` script: 109 | 110 | ```bash 111 | python trainer_cli.py --cv_fold $CV_FOLD \ 112 | --output_dir $OUTPUT_DIR \ 113 | --model_name_or_path $MODEL_NAME \ 114 | --doc_id_col $DOC_ID_COL \ 115 | --doc_a_col $DOC_A_COL \ 116 | --doc_b_col $DOC_B_COL \ 117 | --nlp_dataset $NLP_DATASET \ 118 | --nlp_cache_dir $NLP_CACHE_DIR \ 119 | --cache_dir $CACHE_DIR \ 120 | --num_train_epochs $EPOCHS \ 121 | --seed $SEED \ 122 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 123 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 124 | --learning_rate $LR \ 125 | --do_train \ 126 | --save_predictions 127 | ``` 128 | 129 | The exact parameters are available in `sbin/acl` and `sbin/cord19`. 130 | 131 | 132 | 133 | ### Evaluation 134 | 135 | The results can be computed and viewed with a Jupyter notebook. 136 | Figures and tables from the paper are part of the notebook. 137 | 138 | ```bash 139 | jupyter notebook evaluation.ipynb 140 | ``` 141 | 142 | Due to the space constraints some results could not be included in the paper. 143 | The full results for all methods and all test samples are available as 144 | CSV files under `Releases` 145 | (or via the Jupyter notebook). 146 | 147 | ## How to cite 148 | 149 | If you are using our code, please cite [our paper](https://arxiv.org/abs/2010.06395): 150 | 151 | ```bibtex 152 | @InProceedings{Ostendorff2020c, 153 | title = {Aspect-based Document Similarity for Research Papers}, 154 | booktitle = {Proceedings of the 28th International Conference on Computational Linguistics (COLING 2020)}, 155 | author = {Ostendorff, Malte and Ruas, Terry and Blume, Till and Gipp, Bela and Rehm, Georg}, 156 | year = {2020}, 157 | month = {Dec.}, 158 | } 159 | ``` 160 | 161 | ## License 162 | 163 | MIT 164 | 165 | 166 | -------------------------------------------------------------------------------- /datasets/acl_docrel/acl_docrel.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import json 4 | import os 5 | 6 | import nlp 7 | from pyarrow import csv 8 | 9 | _DESCRIPTION = """Aspect-oriented Document Similarity from the ACL-Anthology dataset""" 10 | 11 | _HOMEPAGE = "https://github.com/malteos/aspect-document-similarity" 12 | 13 | _CITATION = """ 14 | @InProceedings{Ostendorff2020b, 15 | title = {Aspect-based Document Similarity for Research Papers}, 16 | booktitle = {Proceedings of the 28th International Conference on Computational Linguistics, COLING 2020}, 17 | author = {Ostendorff, Malte and Ruas, Terry and Blume, Till and Gipp, Bela and Rehm, Georg}, 18 | year = {2020}, 19 | month = {Dec.}, 20 | } 21 | """ 22 | 23 | LABEL_CLASSES = ['introduction', 24 | 'related work', 25 | 'experiment', 26 | 'conclusion', 27 | 'results', 28 | 'background', 29 | 'discussion', 30 | 'evaluation', 31 | 'method', 32 | #'previous work', 33 | 'other', 34 | 'none'] 35 | 36 | DATA_URL = "http://datasets.fiq.de/acl_docrel.tar.gz" 37 | 38 | def get_train_split(k): 39 | return nlp.Split(f'fold_{k}_train') 40 | 41 | 42 | def get_test_split(k): 43 | return nlp.Split(f'fold_{k}_test') 44 | 45 | 46 | class AclDocrelConfig(nlp.BuilderConfig): 47 | def __init__(self, features, data_url, **kwargs): 48 | super(AclDocrelConfig, self).__init__(version=nlp.Version("0.1.0"), **kwargs) 49 | self.features = features 50 | self.data_url = data_url 51 | 52 | 53 | class AclDocrel(nlp.GeneratorBasedBuilder): 54 | """ACL 
anthology document relation dataset.""" 55 | 56 | BUILDER_CONFIGS = [ 57 | AclDocrelConfig( 58 | name="docs", 59 | description="document text and meta data", 60 | features={ 61 | "s2_id": nlp.Value("string"), 62 | "title": nlp.Value("string"), 63 | "abstract": nlp.Value("string"), 64 | "arxivId": nlp.Value("string"), 65 | "doi": nlp.Value("string"), 66 | "venue": nlp.Value("string"), 67 | "year": nlp.Value("int16"), 68 | "citations_count": nlp.Value("int32"), 69 | "references_count": nlp.Value("int32"), 70 | "authors": nlp.Sequence(nlp.Value('string', id='author_name')), 71 | }, 72 | data_url=DATA_URL, 73 | ), 74 | AclDocrelConfig( 75 | name="relations", 76 | description=" relation data", 77 | features={ 78 | "from_s2_id": nlp.Value("string"), 79 | "to_s2_id": nlp.Value("string"), 80 | "label": nlp.Sequence(nlp.Value('string', id='label')) 81 | }, 82 | data_url=DATA_URL, 83 | ), 84 | ] 85 | 86 | def _info(self): 87 | return nlp.DatasetInfo( 88 | description=_DESCRIPTION + self.config.description, 89 | features=nlp.Features(self.config.features), 90 | homepage=_HOMEPAGE, 91 | citation=_CITATION, 92 | ) 93 | 94 | def _split_generators(self, dl_manager): 95 | arch_path = dl_manager.download_and_extract(self.config.data_url) 96 | 97 | if self.config.name == "relations": 98 | train_file = "train.csv" 99 | test_file = "test.csv" 100 | 101 | generators = [] 102 | 103 | for k in [1, 2, 3, 4]: 104 | folds_path = os.path.join(arch_path, 'folds', str(k)) 105 | generators += [ 106 | nlp.SplitGenerator( 107 | name=get_train_split(k), 108 | gen_kwargs={'filepath': os.path.join(folds_path, train_file)} 109 | ), 110 | nlp.SplitGenerator( 111 | name=get_test_split(k), 112 | gen_kwargs={'filepath': os.path.join(folds_path, test_file)} 113 | ) 114 | ] 115 | return generators 116 | 117 | elif self.config.name == "docs": 118 | # docs 119 | docs_file = os.path.join(arch_path, "docs.jsonl") 120 | 121 | return [ 122 | nlp.SplitGenerator(name=nlp.Split('docs'), gen_kwargs={"filepath": docs_file}), 123 | ] 124 | else: 125 | raise ValueError() 126 | 127 | @staticmethod 128 | def get_s2_value(s2, key, default=None): 129 | if key in s2: 130 | return s2[key] 131 | else: 132 | return default 133 | 134 | def _generate_examples(self, filepath): 135 | """Generate docs + rel examples.""" 136 | 137 | if self.config.name == "relations": 138 | df = csv.read_csv(filepath).to_pandas() 139 | 140 | for idx, row in df.iterrows(): 141 | yield idx, dict(from_s2_id=row['from_s2_id'], to_s2_id=row['to_s2_id'], label=row['label'].split(',')) 142 | 143 | elif self.config.name == "docs": 144 | 145 | with open(filepath, 'r') as f: 146 | for i, line in enumerate(f): 147 | s2 = json.loads(line) 148 | 149 | yield i, { 150 | 's2_id': self.get_s2_value(s2, 'paperId'), 151 | 'title': self.get_s2_value(s2, 'title'), 152 | 'abstract': self.get_s2_value(s2, 'abstract'), 153 | 'doi': self.get_s2_value(s2, 'doi'), 154 | 'arxivId': self.get_s2_value(s2, 'arxivId'), 155 | 'venue': self.get_s2_value(s2, 'venue'), 156 | 'year': self.get_s2_value(s2, 'year', 0), 157 | 'citations_count': len(self.get_s2_value(s2, 'citations', [])), 158 | 'references_count': len(self.get_s2_value(s2, 'references', [])), 159 | 'authors': [a['name'] for a in self.get_s2_value(s2, 'authors', []) if 'name' in a], 160 | } 161 | -------------------------------------------------------------------------------- /datasets/cord19_docrel/cord19_docrel.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, 
division, print_function 2 | 3 | import json 4 | import os 5 | 6 | import nlp 7 | from pyarrow import csv 8 | 9 | _DESCRIPTION = """Aspect-oriented Document Similarity from the CORD-19 dataset""" 10 | 11 | _HOMEPAGE = "https://github.com/malteos/aspect-document-similarity" 12 | 13 | _CITATION = """ 14 | @InProceedings{Ostendorff2020b, 15 | title = {Aspect-based Document Similarity for Research Papers}, 16 | booktitle = {Proceedings of the 28th International Conference on Computational Linguistics, COLING 2020}, 17 | author = {Ostendorff, Malte and Ruas, Terry and Blume, Till and Gipp, Bela and Rehm, Georg}, 18 | year = {2020}, 19 | month = {Dec.}, 20 | } 21 | """ 22 | 23 | LABEL_CLASSES = ['discussion', 24 | 'introduction', 25 | 'conclusion', 26 | 'results', 27 | 'methods', 28 | 'background', 29 | 'materials', 30 | 'virus', 31 | 'future work', 32 | 'other', 33 | 'none'] 34 | 35 | DATA_URL = "http://datasets.fiq.de/cord19_docrel.tar.gz" 36 | 37 | DOC_A_COL = "from_doi" 38 | 39 | DOC_B_COL = "to_doi" 40 | 41 | LABEL_COL = "label" 42 | 43 | 44 | def get_train_split(k): 45 | return nlp.Split(f'fold_{k}_train') 46 | 47 | 48 | def get_test_split(k): 49 | return nlp.Split(f'fold_{k}_test') 50 | 51 | 52 | class Cord19DocrelConfig(nlp.BuilderConfig): 53 | def __init__(self, features, data_url, **kwargs): 54 | super(Cord19DocrelConfig, self).__init__(version=nlp.Version("0.1.0"), **kwargs) 55 | self.features = features 56 | self.data_url = data_url 57 | 58 | 59 | class Cord19Docrel(nlp.GeneratorBasedBuilder): 60 | """CORD-19 document relation dataset.""" 61 | 62 | BUILDER_CONFIGS = [ 63 | Cord19DocrelConfig( 64 | name="docs", 65 | description="document text and meta data", 66 | features={ 67 | "doi": nlp.Value("string"), 68 | "cord19_id": nlp.Value("string"), 69 | "s2_id": nlp.Value("string"), 70 | "title": nlp.Value("string"), 71 | "abstract": nlp.Value("string"), 72 | "arxivId": nlp.Value("string"), 73 | "venue": nlp.Value("string"), 74 | "year": nlp.Value("int16"), 75 | "citations_count": nlp.Value("int32"), 76 | "references_count": nlp.Value("int32"), 77 | "authors": nlp.Sequence(nlp.Value('string', id='author_name')), 78 | }, 79 | data_url=DATA_URL, 80 | ), 81 | Cord19DocrelConfig( 82 | name="relations", 83 | description=" relation data", 84 | features={ 85 | DOC_A_COL: nlp.Value("string"), 86 | DOC_B_COL: nlp.Value("string"), 87 | LABEL_COL: nlp.Sequence(nlp.Value('string', id='label')) 88 | }, 89 | data_url=DATA_URL, 90 | ), 91 | ] 92 | 93 | def _info(self): 94 | return nlp.DatasetInfo( 95 | description=_DESCRIPTION + self.config.description, 96 | features=nlp.Features(self.config.features), 97 | homepage=_HOMEPAGE, 98 | citation=_CITATION, 99 | ) 100 | 101 | def _split_generators(self, dl_manager): 102 | arch_path = dl_manager.download_and_extract(self.config.data_url) 103 | 104 | if "relations" in self.config.name: 105 | train_file = "train.csv" 106 | test_file = "test.csv" 107 | 108 | generators = [] 109 | 110 | for k in [1, 2, 3, 4]: 111 | folds_path = os.path.join(arch_path, 'folds', str(k)) 112 | generators += [ 113 | nlp.SplitGenerator( 114 | name=get_train_split(k), 115 | gen_kwargs={'filepath': os.path.join(folds_path, train_file)} 116 | ), 117 | nlp.SplitGenerator( 118 | name=get_test_split(k), 119 | gen_kwargs={'filepath': os.path.join(folds_path, test_file)} 120 | ) 121 | ] 122 | return generators 123 | 124 | elif "docs" in self.config.name: 125 | # docs 126 | docs_file = os.path.join(arch_path, "docs.jsonl") 127 | 128 | return [ 129 | nlp.SplitGenerator(name=nlp.Split('docs'), 
gen_kwargs={"filepath": docs_file}), 130 | ] 131 | else: 132 | raise ValueError() 133 | 134 | @staticmethod 135 | def get_dict_value(d, key, default=None): 136 | if key in d: 137 | return d[key] 138 | else: 139 | return default 140 | 141 | def _generate_examples(self, filepath): 142 | """Generate docs + rel examples.""" 143 | 144 | if "relations" in self.config.name: 145 | df = csv.read_csv(filepath).to_pandas() 146 | 147 | for idx, row in df.iterrows(): 148 | yield idx, { 149 | DOC_A_COL: row[DOC_A_COL], 150 | DOC_B_COL: row[DOC_B_COL], 151 | LABEL_COL: row[LABEL_COL].split(','), 152 | } 153 | 154 | elif self.config.name == "docs": 155 | 156 | with open(filepath, 'r') as f: 157 | for i, line in enumerate(f): 158 | doc = json.loads(line) 159 | 160 | yield i, { 161 | 'doi': str(self.get_dict_value(doc, 'doi')), # cast to str otherwise float 162 | 'cord19_id': self.get_dict_value(doc, 'cord19_id'), 163 | 's2_id': self.get_dict_value(doc, 's2_id'), 164 | 'title': self.get_dict_value(doc, 'title'), 165 | 'abstract': self.get_dict_value(doc, 'abstract'), 166 | 'arxivId': self.get_dict_value(doc, 'arxivId'), 167 | 'venue': str(self.get_dict_value(doc, 'venue') or ''), 168 | 'year': int(self.get_dict_value(doc, 'year', 0) or 0), 169 | 'citations_count': int(self.get_dict_value(doc, 'citations_count', 0) or 0), 170 | 'references_count': int(self.get_dict_value(doc, 'references_count', 0) or 0), 171 | 'authors': self.get_dict_value(doc, 'authors', []), 172 | } 173 | 174 | -------------------------------------------------------------------------------- /acl/preprocessing/parsecit.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | import lxml 3 | import re 4 | import logging 5 | import os 6 | 7 | from lxml.etree import LxmlError 8 | from tqdm import tqdm 9 | 10 | from acl.utils import normalize_title 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def get_parsecit_files(parscit_dir): 16 | parscit_files = [] 17 | 18 | for d in os.listdir(parscit_dir): 19 | if os.path.isdir(os.path.join(parscit_dir, d)): # subdir 20 | for dd in os.listdir(os.path.join(parscit_dir, d)): # subdir 2 21 | if os.path.isdir(os.path.join(parscit_dir, d, dd)): 22 | for fn in os.listdir(os.path.join(parscit_dir, d, dd)): # files 23 | fp = os.path.join(parscit_dir, d, dd, fn) 24 | 25 | parscit_files.append((fn, fp)) 26 | # Total files: 14,714 (server 21,520) 27 | logger.info(f'Total files: {len(parscit_files):,}') 28 | 29 | return parscit_files 30 | 31 | 32 | def load_parscit_file(fp, include_contexts=False): 33 | # read from file path 34 | tree = etree.parse(fp) 35 | 36 | # sections 37 | algo_sect = tree.getroot().cssselect('algorithm[name="SectLabel"] > variant')[0] 38 | sects = [] 39 | sect = None 40 | 41 | for child in algo_sect.getchildren(): 42 | if child.tag == 'sectionHeader': 43 | sects.append({ 44 | 'title': child.text.strip(), 45 | 'generic': child.get('genericHeader'), 46 | 'text': '', 47 | }) 48 | 49 | elif child.tag == 'bodyText': 50 | # Create untitled section if none exist 51 | if len(sects) == 0: 52 | sects.append({ 53 | 'title': None, 54 | 'generic': None, 55 | 'text': '', 56 | }) 57 | 58 | # Append to last section 59 | sects[-1]['text'] += child.text.strip() 60 | 61 | # replace line breaks within sentence (could be improved) 62 | for i, sect in enumerate(sects): 63 | sects[i]['text'] = re.sub(r'([A-Za-z],;)([\r\n]+)([A-Za-z])', r'\1 \3', sect['text']) 64 | 65 | # Iterate over all valid citations 66 | cits = [] 67 | 68 | def 
get_text_with_cssselect(ele, selector, default=None, ith=0): 69 | s = ele.cssselect(selector) 70 | 71 | if len(s) > ith: 72 | return s[ith].text 73 | else: 74 | return default 75 | 76 | for cit_ele in tree.getroot().cssselect('algorithm[name="ParsCit"] > citationList > citation[valid="true"]'): 77 | try: 78 | 79 | title = get_text_with_cssselect(cit_ele, 'title') 80 | marker = get_text_with_cssselect(cit_ele, 'marker') 81 | date = get_text_with_cssselect(cit_ele, 'date') # str 82 | book_title = get_text_with_cssselect(cit_ele, 'booktitle') 83 | 84 | authors = [e.text for e in cit_ele.cssselect('authors > author')] 85 | 86 | if date and len(date) != 4: 87 | raise ValueError(f'Invalid date: {date}') 88 | cit = dict(title=title, authors=authors, marker=marker, date=date, book_title=book_title) 89 | 90 | if include_contexts: 91 | cit['contexts'] = cit_ele.cssselect('contexts > context') 92 | 93 | cits.append(cit) 94 | except IndexError as e: 95 | print(f'Cannot parse citation: {e}; {etree.tostring(cit_ele)[:100]}') 96 | 97 | # Extract all citation markers (for later cleaning from section text) 98 | markers = [] 99 | for cit_context in tree.getroot().cssselect( 100 | 'algorithm[name="ParsCit"] > citationList > citation > contexts > context'): 101 | if 'citStr' in cit_context.attrib: 102 | markers.append(cit_context.get('citStr')) 103 | 104 | return sects, cits, markers 105 | 106 | 107 | def get_citation_pairs_from_parscit(parscit_files, acl_id2s2, title2s2_id): 108 | # Load citations with s2 109 | error_files = [] 110 | acl_id2sects = {} 111 | acl_id2markers = {} 112 | cit_pairs = [] 113 | 114 | # Iterate over papers 115 | for i, (fn, fp) in enumerate(tqdm(parscit_files, total=len(parscit_files), desc='Reading Parscit files')): 116 | try: 117 | sects, cits, markers = load_parscit_file(fp, include_contexts=True) 118 | 119 | from_acl_id = '-'.join(fn.split('-', 2)[:2]) # ACL ID 120 | acl_id2sects[from_acl_id] = sects 121 | acl_id2markers[from_acl_id] = markers 122 | 123 | from_s2_id = acl_id2s2[from_acl_id]['paperId'] if from_acl_id in acl_id2s2 else None 124 | 125 | # if from_s2_id not in s2_id2s2_paper: 126 | # logger.warning(f'From paper not in index') 127 | # continue 128 | 129 | # Citations in paper 130 | for cit in cits: 131 | if cit['title'] is None or cit['book_title'] is None or cit['date'] is None: 132 | continue 133 | 134 | # Find citing section context 135 | sect_contexts = [] 136 | for context in cit['contexts']: 137 | for i, sect in enumerate(sects): # Try to find citation string in all sections 138 | if context.get('citStr') in sect['text']: 139 | # found! 
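# The matched context is recorded as a (generic header, section title, citation string)
# triple; in acl/dataset.py these citing-section values are later normalized via
# resolve_and_sect_titles() and mapped onto the fixed label classes
# (introduction, related work, experiment, ...).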
140 | sect_contexts.append((sect['generic'], sect['title'], context.get('citStr'))) 141 | 142 | # Skip citation if context is not available 143 | if len(sect_contexts) == 0: 144 | continue 145 | 146 | # Find to_s2_id 147 | cit_title = normalize_title(cit['title']) 148 | if cit_title in title2s2_id: 149 | to_s2_id = title2s2_id[cit_title] 150 | 151 | for context in sect_contexts: 152 | cit_pairs.append( 153 | # from_s2_id, (from_acl_id,) to_s2_id, sect_generic, sect_title, sect_marker 154 | ( 155 | from_s2_id, 156 | # from_acl_id, 157 | to_s2_id, 158 | ) + context 159 | ) 160 | else: 161 | # print('Not found:' + cit_title) 162 | pass 163 | 164 | except LxmlError as e: 165 | error_files.append((fn, fp)) 166 | # if i > 10: 167 | # break 168 | 169 | return cit_pairs, error_files 170 | -------------------------------------------------------------------------------- /acl/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | 7 | import fire 8 | import pandas as pd 9 | from sklearn.model_selection import StratifiedKFold 10 | from smart_open import open 11 | from tqdm import tqdm 12 | 13 | from acl.preprocessing.negative_sampling import get_cocitations, get_negative_pairs 14 | from acl.preprocessing.parsecit import get_parsecit_files, get_citation_pairs_from_parscit 15 | from acl.utils import resolve_and_sect_titles, to_label, get_sorted_pair, get_text_from_doc, \ 16 | normalize_title 17 | 18 | logging.basicConfig(level=logging.DEBUG) 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def save_dataset(input_dir, parscit_dir, output_dir, cv_folds=4): 23 | """ 24 | 25 | Run with: $ python -m acl.dataset save_dataset 26 | 27 | Required parscit directory (from ACL-anthology): 28 | - Download and extract from: https://acl-arc.comp.nus.edu.sg/archives/acl-arc-160301-parscit/ 29 | - parscit/A/A00/A00-1000-parscit.130908.xml 30 | - ... 31 | 32 | Required input files (.json or .json.gz): 33 | - title2dblp_hits.json 34 | - acl_id2s2.json 35 | - arxiv2s2.json 36 | - doi2s2.json.gz 37 | 38 | Output structure 39 | - docs.jsonl: each line is a S2-paper 40 | - folds/1/ 41 | - folds/2/ 42 | - ... 
43 | - folds/k/train.csv: actual training samples 44 | - folds/k/test.csv 45 | 46 | Samples are provided as CSV files with the following columns: 47 | - doc_a: S2-id 48 | - doc_b: S2-id 49 | - label: List of labels (comma separated) 50 | 51 | After dataset creation use the following commands to compress and upload all files: 52 | 53 | cd 54 | tar -cvzf acl_docrel.tar.gz docs.jsonl folds/ 55 | curl --upload-file acl_docrel.tar.gz ftp://$FTP_LOGIN:$FTP_PASSWORD@$FTP_HOST/$FTP_DIR 56 | 57 | :param input_dir: S2 paper files 58 | :param output_dir: Dataset files written to this directory 59 | :param parscit_dir: 60 | :param cv_folds: 61 | :return: 62 | """ 63 | negative_label = 'none' 64 | min_text_length = 50 65 | negative_sampling_ratio = 0.5 66 | 67 | # Fixed labels 68 | """ 69 | introduction 20515 70 | related work 14883 71 | experiment 5749 72 | conclusion 1914 73 | results 1828 74 | background 1748 75 | discussion 1627 76 | evaluation 1386 77 | method 927 78 | (previous work 902) 79 | """ 80 | labels = [ 81 | 'introduction', 82 | 'related work', 83 | 'experiment', 84 | 'conclusion', 85 | 'results', 86 | 'background', 87 | 'discussion', 88 | 'evaluation', 89 | 'method', 90 | # Only top-9 label classes for v1.1 (equal to CORD-19) 91 | # 'previous work' 92 | ] 93 | 94 | doc_a_col = 'from_s2_id' 95 | doc_b_col = 'to_s2_id' 96 | label_col = 'label' 97 | 98 | # Convert dirs to Path if is string 99 | if isinstance(output_dir, str): 100 | output_dir = Path(output_dir) 101 | 102 | if isinstance(input_dir, str): 103 | input_dir = Path(input_dir) 104 | 105 | # Load paper data from various sources 106 | # acl_id2title, doi2title, arxiv2title = get_dblp_titles(input_dir / 'title2dblp_hits.json.gz') # TODO 107 | acl_id2s2 = json.load(open(input_dir / 'acl_id2s2.json.gz', 'r')) 108 | arxiv2s2 = json.load(open(input_dir / 'arxiv2s2.json.gz', 'r')) 109 | doi2s2 = json.load(open(input_dir / 'doi2s2.json.gz', 'r')) 110 | 111 | # Merge S2 data 112 | s2_id2s2_paper = {} 113 | s2_id2s2_paper.update({s2['paperId']: s2 for _id, s2 in acl_id2s2.items()}) 114 | s2_id2s2_paper.update({s2['paperId']: s2 for _id, s2 in arxiv2s2.items()}) 115 | s2_id2s2_paper.update({s2['paperId']: s2 for _id, s2 in doi2s2.items()}) 116 | 117 | # Filter by empty text 118 | s2_id2s2_paper = {s2_id: p for s2_id, p in s2_id2s2_paper.items() if len(get_text_from_doc(p)) >= min_text_length} 119 | 120 | # Title mapping from document index 121 | title2s2_id = {normalize_title(p['title']): s2_id for s2_id, p in s2_id2s2_paper.items()} 122 | 123 | parscit_files = get_parsecit_files(parscit_dir) 124 | cit_pairs, error_files = get_citation_pairs_from_parscit(parscit_files, acl_id2s2, title2s2_id) 125 | 126 | # s2_pairs, s2_pairs_not_found = get_s2_pairs_from_cits(cit_pairs, acl_id2s2) 127 | normalized_s2_pairs = resolve_and_sect_titles(cit_pairs, doc_index=s2_id2s2_paper) 128 | 129 | # Convert to dataframe 130 | df = pd.DataFrame(normalized_s2_pairs, columns=['from_s2_id', 'to_s2_id', 'citing_section', 'marker']) 131 | 132 | # Auto-determine top labels 133 | pre_label_col = 'citing_section' 134 | # top_sections = 10 135 | # labels = list(filter(lambda t: t, df[pre_label_col].value_counts()[:top_sections].keys())) 136 | 137 | # Remove duplicates 138 | logger.info(f'Before drop duplications: {len(df)}') 139 | 140 | df[label_col] = [to_label(t, labels) for t in df[pre_label_col]] 141 | df.drop_duplicates([doc_a_col, doc_b_col, label_col], keep='first', inplace=True) 142 | 143 | logger.info(f'After drop duplications: {len(df)}') 144 | 145 | # join 
multi-labels 146 | # df = df.groupby([doc_a_col, doc_b_col]).label.agg( 147 | # [('label_count', 'count'), (label_col, ','.join)]).reset_index() 148 | df = df.groupby([doc_a_col, doc_b_col]).label.agg( 149 | [(label_col, ','.join)]).reset_index() 150 | 151 | # Positive samples 152 | # pos_rows = [] 153 | # 154 | # for idx, r in df.iterrows(): 155 | # text = get_text_from_doc_id(r[doc_a_col], s2_id2s2_paper) 156 | # text_b = get_text_from_doc_id(r[doc_b_col], s2_id2s2_paper) 157 | # 158 | # # Filter out empty texts 159 | # if text != '' and text_b != '': 160 | # pos_rows.append((text, text_b, r[label_col])) 161 | cits_list = df[[doc_a_col, doc_b_col]].values.tolist() 162 | cits_set = {get_sorted_pair(from_id, to_id) for from_id, to_id in cits_list} 163 | 164 | logger.info(f'Total citation count: {len(cits_set):,}') 165 | 166 | # co cits 167 | cocits_set = get_cocitations(df[[doc_a_col, doc_b_col]].values.tolist()) 168 | 169 | # Negative sampling 170 | negative_pairs = get_negative_pairs(s2_id2s2_paper, cits_list, cits_set, cocits_set, 171 | negative_ratio=negative_sampling_ratio) 172 | 173 | # construct dataset frame 174 | logger.info('Constructing dataset data frame...') 175 | dataset = df[[doc_a_col, doc_b_col, label_col]].values.tolist()\ 176 | + list(map(lambda p: (p[0], p[1], negative_label), negative_pairs)) # positive + negative pairs 177 | 178 | dataset_df = pd.DataFrame(dataset, columns=[doc_a_col, doc_b_col, label_col]) 179 | 180 | # Verify 181 | missing_doc_ids = [doc_id for doc_id in dataset_df[doc_a_col].values if doc_id not in s2_id2s2_paper] 182 | missing_doc_ids += [doc_id for doc_id in dataset_df[doc_b_col].values if doc_id not in s2_id2s2_paper] 183 | 184 | if len(missing_doc_ids) > 0: 185 | raise ValueError(f'Document IDs are missing in index: {missing_doc_ids}') 186 | 187 | # Full training and test set 188 | logger.info(f'Creating {cv_folds}-Folds ') 189 | kf = StratifiedKFold(n_splits=cv_folds, random_state=0, shuffle=True) 190 | 191 | # Stratified K-Folds cross-validator 192 | for k, (train_index, test_index) in enumerate( 193 | kf.split(dataset_df.index.tolist(), dataset_df[label_col].values.tolist()), 1): 194 | fold_dir = os.path.join(output_dir, 'folds', str(k)) 195 | 196 | if not os.path.exists(fold_dir): 197 | logger.info(f'Create new fold dir: {fold_dir}') 198 | os.makedirs(fold_dir) 199 | 200 | split_train_df = dataset_df.iloc[train_index] 201 | split_test_df = dataset_df.iloc[test_index] 202 | 203 | logger.info(f'Total: {len(dataset_df):,}; Train: {len(split_train_df):,}; Test: {len(split_test_df):,}') 204 | 205 | split_train_df.to_csv(os.path.join(fold_dir, 'train.csv'), index=False) 206 | split_test_df.to_csv(os.path.join(fold_dir, 'test.csv'), index=False) 207 | 208 | # Write doc output 209 | with open(str(output_dir / 'docs.jsonl'), 'w') as f: 210 | for paper in tqdm(s2_id2s2_paper.values(), desc='Writing document data', total=len(s2_id2s2_paper)): 211 | f.write(json.dumps(paper) + '\n') 212 | 213 | logger.info('Done') 214 | 215 | 216 | if __name__ == '__main__': 217 | fire.Fire() 218 | sys.exit(0) 219 | -------------------------------------------------------------------------------- /cord19/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import sys 5 | from pathlib import Path 6 | from typing import Union 7 | 8 | import fire 9 | import pandas as pd 10 | from sklearn.model_selection import StratifiedKFold 11 | from smart_open import open 12 | from tqdm import 
tqdm 13 | 14 | from acl.preprocessing.negative_sampling import get_cocitations 15 | from acl.utils import get_sorted_pair, to_label 16 | from cord19.preprocessing.cord19_reader import get_papers_and_citations_from_cord19, merge_cord19_and_s2_papers 17 | from cord19.preprocessing.negative_sampling import get_negative_pairs 18 | from cord19.utils import normalize_section, resolve_and_sect_titles, get_text_from_doi 19 | 20 | logging.basicConfig(level=logging.DEBUG) 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def save_dataset(input_dir: Union[str, Path], output_dir: Union[str, Path], cv_folds: int = 4): 25 | """ 26 | 27 | Run with: $ python -m cord19.dataset save_dataset 28 | 29 | input_dir = '/home/mostendorff/datasets/cord-19/' 30 | output_dir = '/home/mostendorff/datasets/cord-19/dataset/' 31 | cv_folds = 4 32 | 33 | input_dir/metadata.csv 34 | input_dir/doi2paper.json.gz 35 | input_dir/ = ['biorxiv_medrxiv', 'comm_use_subset', 'custom_license', 'noncomm_use_subset'] 36 | 37 | output_dir/docs.jsonl 38 | output_dir/folds/1/train.csv 39 | output_dir/folds/1/test.csv 40 | 41 | tar -cvzf cord19_docrel.tar.gz docs.jsonl folds/ 42 | curl --upload-file cord19_docrel.tar.gz ftp://$FTP_LOGIN:$FTP_PASSWORD@ostendorff.org/cloud.ostendorff.org/static/ 43 | 44 | :param input_dir: Path to directory with input files 45 | :param output_dir: Output files are written to this dir 46 | :param cv_folds: Number of folds in k-fold cross validation 47 | """ 48 | label_col = 'label' 49 | negative_label = 'none' 50 | min_text_length = 50 51 | negative_sampling_ratio = 0.5 52 | 53 | doc_a_col = 'from_doi' 54 | doc_b_col = 'to_doi' 55 | 56 | labels = [ 57 | 'discussion', 58 | 'introduction', 59 | 'conclusion', 60 | 'results', 61 | 'methods', 62 | 'background', 63 | 'materials', 64 | 'virus', 65 | 'future work' 66 | ] 67 | 68 | # input_dir = os.path.join(env['datasets_dir'], 'cord-19') 69 | 70 | # Convert dirs to Path if is string 71 | if isinstance(output_dir, str): 72 | output_dir = Path(output_dir) 73 | 74 | if isinstance(input_dir, str): 75 | input_dir = Path(input_dir) 76 | 77 | # Read meta data 78 | meta_df = pd.read_csv(input_dir / 'metadata.csv', index_col=0, dtype={'doi': str, 'journal': str}) 79 | id2meta = {row['sha']: row for idx, row in meta_df.iterrows() if row['sha']} 80 | 81 | logger.info('Unique DOIs in meta data: %s' % (len(meta_df['doi'].unique()) / len(meta_df))) 82 | 83 | # Load paper data and citations from CORD-19 84 | id2paper, cits = get_papers_and_citations_from_cord19(input_dir, id2meta) 85 | 86 | # Load paper data from disk (scraped from S2) 87 | if os.path.exists(input_dir / 'doi2s2paper.json.gz'): 88 | with open(str(input_dir / 'doi2s2paper.json.gz'), 'r') as f: 89 | doi2s2paper = json.load(f) 90 | 91 | logger.info(f'Loaded {len(doi2s2paper):,} scraped papers from disk') 92 | else: 93 | logger.error('Cannot load S2 papers from: %s' % (input_dir / 'doi2paper.json.gz')) 94 | doi2s2paper = {} 95 | 96 | # Merge CORD-19 papers and S2 papers 97 | doi2paper = merge_cord19_and_s2_papers(id2paper, id2meta, doi2s2paper) 98 | 99 | logger.info(f'Loaded {len(doi2paper)} from CORD-19') 100 | 101 | all_dois = list(doi2paper.keys()) 102 | 103 | # DOIs with text 104 | doi2text = {} 105 | for doi in all_dois: 106 | text = get_text_from_doi(doi, doi2paper, raise_not_found_error=False) 107 | if len(text) > min_text_length: 108 | doi2text[doi] = text 109 | 110 | logger.info(f'Total DOIs: {len(all_dois):,}') 111 | logger.info(f'With text DOIs: {len(doi2text):,}') 112 | 113 | # Filter citations with 
existing DOI 114 | cits_with_doi = [c for c in cits if c[0] in doi2paper and c[1] in doi2paper] 115 | 116 | # CORD-19 only: Citations with DOI: 30655 (0.09342419246206499) 117 | # + S2: Citations with DOI: 170454 (0.5194756908148369) 118 | 119 | logger.info(f'Citations with DOI: {len(cits_with_doi)} ({len(cits_with_doi) / len(cits)})') 120 | 121 | missing_papers = [c[0] for c in cits if c[0] not in doi2paper] 122 | missing_papers += [c[1] for c in cits if c[1] not in doi2paper] 123 | 124 | logger.info(f'Missing paper data, but DOI: {len(missing_papers)}') 125 | 126 | unique_missing_papers = set(missing_papers) 127 | 128 | logger.info(f'Unique DOIs of missing papers: {len(unique_missing_papers)}') 129 | 130 | # resolve 'and' titles 131 | normalized_cits_with_doi = resolve_and_sect_titles(cits_with_doi) 132 | 133 | cits_df = pd.DataFrame(normalized_cits_with_doi, columns=[doc_a_col, doc_b_col, 'citing_section']) 134 | # cits_df 135 | 136 | logger.info(f'After normalization: {len(cits_df):,} (before: {len(cits_with_doi):,})') 137 | 138 | # top_sections = 10 139 | # labels = list(filter(lambda t: t, cits_df['citing_section'].value_counts()[:top_sections].keys())) 140 | 141 | # Remove duplicates 142 | cits_df[label_col] = [to_label(normalize_section(t), labels) for t in cits_df['citing_section']] 143 | cits_df.drop_duplicates([doc_a_col, doc_b_col, 'label'], keep='first', inplace=True) 144 | 145 | # Both documents must have text 146 | cits_df = cits_df[(cits_df[doc_a_col].isin(doi2text.keys())) & (cits_df[doc_b_col].isin(doi2text.keys()))] 147 | 148 | # Merge multi-labels 149 | df = cits_df.groupby([doc_a_col, doc_b_col]).label.agg([(label_col, ','.join)]).reset_index() 150 | 151 | # # Positive samples 152 | # pos_rows = [] 153 | # 154 | # for idx, r in df.iterrows(): 155 | # text = get_text_from_doi(r[doc_a_col], doi2s2paper, doi2paper) 156 | # text_b = get_text_from_doi(r[doc_b_col], doi2s2paper, doi2paper) 157 | # 158 | # # Filter out empty texts 159 | # if text != '' and text_b != '': 160 | # pos_rows.append((r[doc_a_col], r[doc_b_col], text, text_b, r[label_col])) 161 | 162 | cits_set = set([get_sorted_pair(from_doi, to_doi) for from_doi, to_doi, label in cits_with_doi]) 163 | 164 | logger.info(f'Total citation count: {len(cits_set):,}') 165 | 166 | cocits_set = get_cocitations([(from_doi, to_doi) for from_doi, to_doi, label in cits_with_doi]) 167 | 168 | # Negatives needed: 52,746 (ratio: 0.5) 169 | negative_pairs = get_negative_pairs( 170 | doi2paper, 171 | candidate_doc_ids=list(doi2text.keys()), 172 | positive_pairs=df[[doc_a_col, doc_b_col]].values.tolist(), 173 | cits_set=cits_set, 174 | cocits_set=cocits_set, 175 | negative_ratio=negative_sampling_ratio 176 | ) 177 | 178 | ### 179 | 180 | # construct dataset frame 181 | logger.info('Constructing dataset data frame...') 182 | dataset = df[[doc_a_col, doc_b_col, label_col]].values.tolist()\ 183 | + list(map(lambda p: (p[0], p[1], negative_label), negative_pairs)) # positive + negative pairs 184 | 185 | dataset_df = pd.DataFrame(dataset, columns=[doc_a_col, doc_b_col, label_col]) 186 | 187 | # TODO debug sample set? 
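# At this point every row of dataset_df is one document pair with its comma-joined
# (multi-)label, for example (illustrative values only):
#   from_doi, to_doi, label
#   10.1000/aaa, 10.1000/bbb, "discussion,results"
#   10.1000/aaa, 10.1000/ccc, "none"
# The StratifiedKFold split below uses these joined label strings as the
# stratification target, so each label combination keeps roughly the same
# proportion in every fold.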
188 | 189 | # Full training and test set 190 | logger.info(f'Creating {cv_folds}-Folds ') 191 | kf = StratifiedKFold(n_splits=cv_folds, random_state=0, shuffle=True) 192 | 193 | # Stratified K-Folds cross-validator 194 | for k, (train_index, test_index) in enumerate( 195 | kf.split(dataset_df.index.tolist(), dataset_df[label_col].values.tolist()), 1): 196 | fold_dir = os.path.join(output_dir, 'folds', str(k)) 197 | 198 | if not os.path.exists(fold_dir): 199 | logger.info(f'Create new fold dir: {fold_dir}') 200 | os.makedirs(fold_dir) 201 | 202 | split_train_df = dataset_df.iloc[train_index] 203 | split_test_df = dataset_df.iloc[test_index] 204 | 205 | logger.info(f'Total: {len(dataset_df):,}; Train: {len(split_train_df):,}; Test: {len(split_test_df):,}') 206 | 207 | split_train_df.to_csv(os.path.join(fold_dir, 'train.csv'), index=False) 208 | split_test_df.to_csv(os.path.join(fold_dir, 'test.csv'), index=False) 209 | 210 | # Write doc output 211 | with open(str(output_dir / 'docs.jsonl'), 'w') as f: 212 | for paper in tqdm(doi2paper.values(), desc='Writing document data', total=len(doi2paper)): 213 | f.write(json.dumps(paper) + '\n') 214 | 215 | logger.info('Done') 216 | 217 | 218 | if __name__ == '__main__': 219 | fire.Fire() 220 | sys.exit(0) 221 | -------------------------------------------------------------------------------- /sbin/acl/gpu1.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export PYTHONUNBUFFERED=1 4 | 5 | export APP_ROOT=$(dirname "$0") 6 | 7 | . $APP_ROOT/config.sh 8 | 9 | # models: albert-base-v1 bert-base-german-cased biobert-v1-1 longformer-base-4096.tar.gz pytorch 10 | # scibert-scivocab-uncased 11 | #albert-base-v2 bert-base-multilingual-cased distilbert-base-uncased 12 | # longformer-large-4096 roberta-base xlnet-base-cased 13 | #bert-base-cased bert-large-cased 14 | # longformer-base-4096 longformer-large-4096.tar.gz 15 | # roberta-large 16 | 17 | export EVAL_BATCH_SIZE=16 18 | export TRAIN_BATCH_SIZE=8 19 | 20 | export EVAL_BATCH_SIZE=16 21 | export TRAIN_BATCH_SIZE=8 22 | 23 | # serv 9212; gpu 0 24 | export CUDA_VISIBLE_DEVICES=0,1 25 | export MODEL_NAME="bert-base-cased" 26 | 27 | echo $MODEL_NAME 28 | for CV_FOLD in 1 2 3 4 29 | do 30 | echo $CV_FOLD 31 | python trainer_cli.py --cv_fold $CV_FOLD \ 32 | --output_dir $OUTPUT_DIR \ 33 | --model_name_or_path $MODEL_NAME \ 34 | --doc_id_col $DOC_ID_COL \ 35 | --doc_a_col $DOC_A_COL \ 36 | --doc_b_col $DOC_B_COL \ 37 | --nlp_dataset $NLP_DATASET \ 38 | --nlp_cache_dir $NLP_CACHE_DIR \ 39 | --cache_dir $CACHE_DIR \ 40 | --num_train_epochs $EPOCHS \ 41 | --seed $SEED \ 42 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 43 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 44 | --learning_rate $LR \ 45 | --logging_steps 100 \ 46 | --save_steps 0 \ 47 | --save_total_limit 3 \ 48 | --do_train \ 49 | --save_predictions 50 | done 51 | 52 | # serv 9212; gpu 1 53 | export CUDA_VISIBLE_DEVICES=1 54 | export MODEL_NAME="scibert-scivocab-uncased" 55 | echo $MODEL_NAME 56 | for CV_FOLD in 1 2 3 4 57 | do 58 | echo $CV_FOLD 59 | python trainer_cli.py --cv_fold $CV_FOLD \ 60 | --output_dir $OUTPUT_DIR \ 61 | --model_name_or_path $MODEL_NAME \ 62 | --doc_id_col $DOC_ID_COL \ 63 | --doc_a_col $DOC_A_COL \ 64 | --doc_b_col $DOC_B_COL \ 65 | --nlp_dataset $NLP_DATASET \ 66 | --nlp_cache_dir $NLP_CACHE_DIR \ 67 | --cache_dir $CACHE_DIR \ 68 | --num_train_epochs $EPOCHS \ 69 | --seed $SEED \ 70 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 71 | 
--per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 72 | --learning_rate $LR \ 73 | --logging_steps 100 \ 74 | --save_steps 0 \ 75 | --save_total_limit 3 \ 76 | --do_train \ 77 | --save_predictions 78 | done 79 | 80 | # serv 9212; gpu 1 81 | export CUDA_VISIBLE_DEVICES=1 82 | 83 | export EVAL_BATCH_SIZE=16 84 | export TRAIN_BATCH_SIZE=12 85 | export EPOCHS=8 86 | export CV_FOLD=1 87 | export LR=1e-5 88 | export RNN_NUM_LAYERS=2 89 | export RNN_HIDDEN_SIZE=100 90 | export RNN_DROPOUT=0.1 91 | export SPACY_MODEL=./output/acl_docrel/spacy/en_acl_fasttext_300d 92 | export MODEL_NAME=baseline-rnn__fasttext__custom 93 | 94 | for CV_FOLD in 1 2 3 4 95 | do 96 | echo $CV_FOLD 97 | python trainer_cli.py --cv_fold $CV_FOLD \ 98 | --output_dir $OUTPUT_DIR \ 99 | --model_name_or_path $MODEL_NAME \ 100 | --doc_id_col $DOC_ID_COL \ 101 | --doc_a_col $DOC_A_COL \ 102 | --doc_b_col $DOC_B_COL \ 103 | --nlp_dataset $NLP_DATASET \ 104 | --nlp_cache_dir $NLP_CACHE_DIR \ 105 | --cache_dir $CACHE_DIR \ 106 | --num_train_epochs $EPOCHS \ 107 | --seed $SEED \ 108 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 109 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 110 | --learning_rate $LR \ 111 | --logging_steps 100 \ 112 | --save_steps 0 \ 113 | --save_total_limit 3 \ 114 | --spacy_model $SPACY_MODEL \ 115 | --rnn_type lstm \ 116 | --rnn_num_layers $RNN_NUM_LAYERS \ 117 | --rnn_hidden_size $RNN_HIDDEN_SIZE \ 118 | --rnn_dropout $RNN_DROPOUT \ 119 | --do_train \ 120 | --save_predictions 121 | done 122 | 123 | 124 | 125 | ###### 126 | ###### 127 | ###### 128 | ###### 129 | 130 | 131 | # serv 9200; gpu 2 132 | export CUDA_VISIBLE_DEVICES=2 133 | export EVAL_BATCH_SIZE=16 134 | export TRAIN_BATCH_SIZE=8 135 | export MODEL_NAME="roberta-base" 136 | for CV_FOLD in 1 2 3 4 137 | do 138 | echo $CV_FOLD 139 | python trainer_cli.py --cv_fold $CV_FOLD \ 140 | --output_dir $OUTPUT_DIR \ 141 | --model_name_or_path $MODEL_NAME \ 142 | --doc_id_col $DOC_ID_COL \ 143 | --doc_a_col $DOC_A_COL \ 144 | --doc_b_col $DOC_B_COL \ 145 | --nlp_dataset $NLP_DATASET \ 146 | --nlp_cache_dir $NLP_CACHE_DIR \ 147 | --cache_dir $CACHE_DIR \ 148 | --num_train_epochs $EPOCHS \ 149 | --seed $SEED \ 150 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 151 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 152 | --learning_rate $LR \ 153 | --logging_steps 100 \ 154 | --save_steps 0 \ 155 | --save_total_limit 3 \ 156 | --do_train \ 157 | --save_predictions 158 | done 159 | 160 | # serv 9200; gpu 3 161 | export CUDA_VISIBLE_DEVICES=3 162 | export EVAL_BATCH_SIZE=12 163 | export TRAIN_BATCH_SIZE=6 164 | export MODEL_NAME="xlnet-base-cased" 165 | for CV_FOLD in 1 2 3 4 166 | do 167 | echo $CV_FOLD 168 | python trainer_cli.py --cv_fold $CV_FOLD \ 169 | --output_dir $OUTPUT_DIR \ 170 | --model_name_or_path $MODEL_NAME \ 171 | --doc_id_col $DOC_ID_COL \ 172 | --doc_a_col $DOC_A_COL \ 173 | --doc_b_col $DOC_B_COL \ 174 | --nlp_dataset $NLP_DATASET \ 175 | --nlp_cache_dir $NLP_CACHE_DIR \ 176 | --cache_dir $CACHE_DIR \ 177 | --num_train_epochs $EPOCHS \ 178 | --seed $SEED \ 179 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 180 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 181 | --learning_rate $LR \ 182 | --logging_steps 100 \ 183 | --save_steps 0 \ 184 | --save_total_limit 3 \ 185 | --do_train \ 186 | --save_predictions 187 | done 188 | 189 | 190 | # serv 9200; gpu 4 191 | export CUDA_VISIBLE_DEVICES=4 192 | export EVAL_BATCH_SIZE=12 193 | export TRAIN_BATCH_SIZE=8 194 | export MODEL_NAME="google/electra-base-discriminator" 195 | for CV_FOLD in 1 2 3 4 196 | 
do 197 | echo $MODEL_NAME 198 | echo $CV_FOLD 199 | python trainer_cli.py --cv_fold $CV_FOLD \ 200 | --output_dir $OUTPUT_DIR \ 201 | --model_name_or_path $MODEL_NAME \ 202 | --doc_id_col $DOC_ID_COL \ 203 | --doc_a_col $DOC_A_COL \ 204 | --doc_b_col $DOC_B_COL \ 205 | --nlp_dataset $NLP_DATASET \ 206 | --nlp_cache_dir $NLP_CACHE_DIR \ 207 | --cache_dir $CACHE_DIR \ 208 | --num_train_epochs $EPOCHS \ 209 | --seed $SEED \ 210 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 211 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 212 | --learning_rate $LR \ 213 | --logging_steps 100 \ 214 | --save_steps 0 \ 215 | --save_total_limit 3 \ 216 | --do_train \ 217 | --save_predictions 218 | done 219 | 220 | 221 | # serv 9200; gpu 5 222 | export CUDA_VISIBLE_DEVICES=5 223 | export EVAL_BATCH_SIZE=16 224 | export TRAIN_BATCH_SIZE=8 225 | export MODEL_NAME="deepset/covid_bert_base" 226 | for CV_FOLD in 1 2 3 4 227 | do 228 | echo $MODEL_NAME 229 | echo $CV_FOLD 230 | python trainer_cli.py --cv_fold $CV_FOLD \ 231 | --output_dir $OUTPUT_DIR \ 232 | --model_name_or_path $MODEL_NAME \ 233 | --doc_id_col $DOC_ID_COL \ 234 | --doc_a_col $DOC_A_COL \ 235 | --doc_b_col $DOC_B_COL \ 236 | --nlp_dataset $NLP_DATASET \ 237 | --nlp_cache_dir $NLP_CACHE_DIR \ 238 | --cache_dir $CACHE_DIR \ 239 | --num_train_epochs $EPOCHS \ 240 | --seed $SEED \ 241 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 242 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 243 | --learning_rate $LR \ 244 | --logging_steps 100 \ 245 | --save_steps 0 \ 246 | --save_total_limit 3 \ 247 | --do_train \ 248 | --save_predictions 249 | done 250 | 251 | 252 | ##### 253 | 254 | # serv 9200; gpu 3 255 | export CUDA_VISIBLE_DEVICES=2,4,5 256 | export EVAL_BATCH_SIZE=12 257 | export TRAIN_BATCH_SIZE=6 258 | export MODEL_NAME="xlnet-base-cased" 259 | for CV_FOLD in 4 260 | do 261 | echo $CV_FOLD 262 | python trainer_cli.py --cv_fold $CV_FOLD \ 263 | --output_dir $OUTPUT_DIR \ 264 | --model_name_or_path $MODEL_NAME \ 265 | --doc_id_col $DOC_ID_COL \ 266 | --doc_a_col $DOC_A_COL \ 267 | --doc_b_col $DOC_B_COL \ 268 | --nlp_dataset $NLP_DATASET \ 269 | --nlp_cache_dir $NLP_CACHE_DIR \ 270 | --cache_dir $CACHE_DIR \ 271 | --num_train_epochs $EPOCHS \ 272 | --seed $SEED \ 273 | --per_gpu_eval_batch_size $EVAL_BATCH_SIZE \ 274 | --per_gpu_train_batch_size $TRAIN_BATCH_SIZE \ 275 | --learning_rate $LR \ 276 | --logging_steps 100 \ 277 | --save_steps 0 \ 278 | --save_total_limit 3 \ 279 | --do_train \ 280 | --save_predictions 281 | done 282 | 283 | export PYTHONUNBUFFERED="" 284 | 285 | -------------------------------------------------------------------------------- /acl/__data_prep.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import bibtexparser 4 | from lxml import etree 5 | import lxml 6 | from fuzzywuzzy import fuzz 7 | from fuzzywuzzy import process 8 | import pandas as pd 9 | import os 10 | import pickle 11 | import time 12 | import json 13 | import re 14 | import numpy as np 15 | from tqdm import tqdm_notebook as tqdm 16 | from collections import defaultdict 17 | import requests 18 | from lxml.etree import LxmlError 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def load_acl_corpus(data_dir): 24 | title2acl_ids = defaultdict(list) 25 | acl_id2meta = {} 26 | 27 | year2titles = defaultdict(list) 28 | author_last2titles = defaultdict(list) 29 | 30 | parser = etree.XMLParser(recover=True) 31 | 32 | for d in os.listdir(os.path.join(data_dir, 'aclxml')): 33 | if 
os.path.isdir(os.path.join(data_dir, 'aclxml', d)): 34 | for vol in os.listdir(os.path.join(data_dir, 'aclxml', d)): 35 | if os.path.isdir(os.path.join(data_dir, 'aclxml', d, vol)): 36 | xml_fp = os.path.join(data_dir, 'aclxml', d, vol, vol + '.xml') 37 | # print(vol) 38 | 39 | tree = etree.parse(xml_fp, parser=parser) 40 | 41 | # Parse volume 42 | papers = tree.getroot().cssselect('paper') 43 | 44 | for paper in papers: 45 | title = next(iter(paper.xpath('./title/text()')), None) 46 | year = next(iter(paper.xpath('./year/text()')), None) 47 | authors_first = paper.xpath('./author/first/text()') 48 | authors_last = paper.xpath('./author/last/text()') 49 | 50 | if title is None or year is None: 51 | continue 52 | 53 | acl_id = vol + '-' + paper.get('id') 54 | 55 | acl_id2meta[acl_id] = dict( 56 | title=title, 57 | year=year, 58 | book_title=next(iter(paper.xpath('./booktitle/text()')), None), 59 | bibkey=next(iter(paper.xpath('./bibkey/text()')), None), 60 | authors_first=authors_first, 61 | authors_last=authors_last, 62 | 63 | ) 64 | title2acl_ids[title].append(acl_id) 65 | year2titles[year].append(title) 66 | 67 | for last in authors_last: 68 | author_last2titles[last].append(title) 69 | 70 | # Extracted titles: 14,760 71 | print(f'Extracted titles: {len(title2acl_ids):,}') 72 | 73 | return title2acl_ids, acl_id2meta, year2titles, author_last2titles 74 | 75 | 76 | def get_text_with_cssselect(ele, selector, default=None, ith=0): 77 | s = ele.cssselect(selector) 78 | 79 | if len(s) > ith: 80 | return s[ith].text 81 | else: 82 | return default 83 | 84 | 85 | def load_parscit_file(fp, include_contexts=False): 86 | # read from file path 87 | tree = etree.parse(fp) 88 | 89 | # sections 90 | algo_sect = tree.getroot().cssselect('algorithm[name="SectLabel"] > variant')[0] 91 | sects = [] 92 | sect = None 93 | 94 | for child in algo_sect.getchildren(): 95 | if child.tag == 'sectionHeader': 96 | sects.append({ 97 | 'title': child.text.strip(), 98 | 'generic': child.get('genericHeader'), 99 | 'text': '', 100 | }) 101 | 102 | elif child.tag == 'bodyText': 103 | # Create untitled section if none exist 104 | if len(sects) == 0: 105 | sects.append({ 106 | 'title': None, 107 | 'generic': None, 108 | 'text': '', 109 | }) 110 | 111 | # Append to last section 112 | sects[-1]['text'] += child.text.strip() 113 | 114 | # replace line breaks within sentence (could be improved) 115 | for i, sect in enumerate(sects): 116 | sects[i]['text'] = re.sub(r'([A-Za-z],;)([\r\n]+)([A-Za-z])', r'\1 \3', sect['text']) 117 | 118 | # Iterate over all valid citations 119 | cits = [] 120 | 121 | def get_text_with_cssselect(ele, selector, default=None, ith=0): 122 | s = ele.cssselect(selector) 123 | 124 | if len(s) > ith: 125 | return s[ith].text 126 | else: 127 | return default 128 | 129 | for cit_ele in tree.getroot().cssselect('algorithm[name="ParsCit"] > citationList > citation[valid="true"]'): 130 | try: 131 | 132 | title = get_text_with_cssselect(cit_ele, 'title') 133 | marker = get_text_with_cssselect(cit_ele, 'marker') 134 | date = get_text_with_cssselect(cit_ele, 'date') # str 135 | book_title = get_text_with_cssselect(cit_ele, 'booktitle') 136 | 137 | authors = [e.text for e in cit_ele.cssselect('authors > author')] 138 | 139 | if date and len(date) != 4: 140 | raise ValueError(f'Invalid date: {date}') 141 | cit = dict(title=title, authors=authors, marker=marker, date=date, book_title=book_title) 142 | 143 | if include_contexts: 144 | cit['contexts'] = cit_ele.cssselect('contexts > context') 145 | 146 | 
cits.append(cit) 147 | except IndexError as e: 148 | print(f'Cannot parse citation: {e}; {etree.tostring(cit_ele)[:100]}') 149 | 150 | # Extract all citation markers (for later cleaning from section text) 151 | markers = [] 152 | for cit_context in tree.getroot().cssselect( 153 | 'algorithm[name="ParsCit"] > citationList > citation > contexts > context'): 154 | if 'citStr' in cit_context.attrib: 155 | markers.append(cit_context.get('citStr')) 156 | 157 | return sects, cits, markers 158 | 159 | 160 | # Extract citation context 161 | # - find section in which the citation markers can be found 162 | # - find the corresponding ACL paper 163 | # - fuzzy title search is expensive, therefore, we check on year + authors first to decrease search space. 164 | def get_citation_context(cits, sects, title2acl_ids, year2titles, author_last2titles): 165 | cits_with_context = [] # (bib_idx, sect_context) 166 | 167 | for cit in cits: 168 | if cit['title'] is None or cit['book_title'] is None or cit['date'] is None: 169 | continue 170 | 171 | # Find section context 172 | sect_contexts = [] 173 | for context in cit['contexts']: 174 | for i, sect in enumerate(sects): # Try to find citation string in all sections 175 | if context.get('citStr') in sect['text']: 176 | # found! 177 | # print(sect['title']) 178 | # print(sect['generic']) 179 | sect_contexts.append((sect['generic'], sect['title'], context.get('citStr'))) 180 | 181 | # print(context.get('citStr')) 182 | # print(context.get('position')) 183 | # print(context.get('startWordPos')) 184 | 185 | if len(sect_contexts) == 0: 186 | continue 187 | 188 | # Filter for ACL proceedings 189 | # TODO could be improved 190 | if 'ACL' in cit['book_title'] or 'Linguistics' in cit['book_title']: 191 | year_candidates = set(year2titles[cit['date']]) # papers from the same year 192 | 193 | if len(year_candidates) > 0: 194 | # papers from authors with same name 195 | # note: all name parts are used, bc we do not know what the first or last name is. 196 | author_names = [name for author in cit['authors'] for name in author.split()] 197 | author_candidates = [] 198 | for name in author_names: 199 | if name in author_last2titles: 200 | author_candidates += author_last2titles[name] 201 | author_candidates = set(author_candidates) 202 | 203 | if len(author_candidates) > 0: 204 | # candidate must be in both sets 205 | candidates = year_candidates & author_candidates 206 | 207 | if len(candidates) > 0: 208 | match_title, score = process.extractOne(cit['title'], candidates) 209 | 210 | # Candidate must be above threshold 211 | if score > .95 and match_title in title2acl_ids: 212 | for acl_id in title2acl_ids[match_title]: 213 | # Citation found in bib 214 | for sc in sect_contexts: 215 | cits_with_context.append((acl_id, sc)) 216 | 217 | # bib_candidates = process.extract(cit['title'], candidate_titles, limit=1) 218 | # for c_title, score in bib_candidates: 219 | # for acl_id in title2acl_ids[c_title]: 220 | # # Citation found in bib 221 | # for sc in sect_contexts: 222 | # cits_with_context.append((acl_id, sc)) 223 | 224 | # TODO multi title matches? 
-> check for year 225 | 226 | # print(c_idx) 227 | # print(bib_database.entries[c_idx]['title']) 228 | # print(marker) 229 | # break 230 | return cits_with_context 231 | 232 | 233 | -------------------------------------------------------------------------------- /experiments/data_helpers.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC 3 | 4 | import torch 5 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 6 | from torch.nn.utils.rnn import pad_sequence 7 | from torch.utils.data import DataLoader, TensorDataset 8 | from transformers import BertTokenizer 9 | 10 | from experiments.data_loaders import DefaultXYDataLoader 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class DataHelper(object): 16 | """ 17 | 18 | Helps to load experimental data as PyTorch data loaders 19 | 20 | """ 21 | train_test_split = 0.8 22 | train_batch_size = None 23 | test_batch_size = None 24 | random_seed = None 25 | tqdm_cls = None 26 | 27 | def __init__(self, **kwargs): 28 | for k, v in kwargs.items(): 29 | if hasattr(self, k): 30 | setattr(self, k, v) 31 | else: 32 | raise ValueError(f'Unknown attribute: {k}') 33 | 34 | def get_data_loaders(self): 35 | raise NotImplementedError() 36 | 37 | @staticmethod 38 | def get_item_lengths(data_loader: DataLoader, masks_idx): 39 | """ 40 | Extract the length of data items in data loader (with masks) 41 | 42 | Inspect output with Pandas like this: `pd.Series(lengths).describe()` 43 | 44 | :param data_loader: 45 | :param masks_idx: Index of mask data in batch 46 | :return: List of length 47 | """ 48 | lengths = [] 49 | 50 | for batch in data_loader: 51 | for mask in batch[masks_idx]: 52 | lengths.append(int(mask.sum())) 53 | 54 | return lengths 55 | 56 | def get_train_test_split(self, df): 57 | split_at = int(len(df) * self.train_test_split) 58 | 59 | split_df = df.sample(frac=1., random_state=self.random_seed).reset_index(drop=True) 60 | 61 | train_df = split_df[:split_at] 62 | test_df = split_df[split_at:] 63 | 64 | logger.info(f'Train: {len(train_df)}; Test: {len(test_df)} (ratio: {self.train_test_split})') 65 | 66 | return train_df, test_df 67 | 68 | def get_data_sampler(self, sampler=None, dataset=None, sampler_cls=None): 69 | """ 70 | 71 | Handle different ways to sample data from data loader (Random, sequential, weighted, ..) 72 | 73 | :param sampler: 74 | :param dataset: 75 | :param sampler_cls: 76 | :return: 77 | """ 78 | if sampler is not None: 79 | return sampler # WeightedRandomSampler 80 | elif sampler_cls is not None: 81 | return sampler_cls(dataset) # Sequential or RandomSampler 82 | else: 83 | raise ValueError('Either `sampler` or `sampler_cls` must be set!') 84 | 85 | 86 | class BERTDataHelper(DataHelper): 87 | """ 88 | For BERT/Transformer specific input (tokenizer, ...) 89 | """ 90 | doc_a_col = None # type: str 91 | doc_b_col = None # type: str 92 | 93 | tokenizer = None 94 | bert_model_path = None 95 | bert_tokenizer_cls = BertTokenizer 96 | bert_tokenizer_params = { 97 | 'do_lower_case': True, 98 | } 99 | 100 | negative_sampling_ratio = 1. 101 | max_seq_length = 512 102 | 103 | def get_tokenizer(self): 104 | if self.tokenizer is None: 105 | self.tokenizer = self.bert_tokenizer_cls.from_pretrained(self.bert_model_path, **self.bert_tokenizer_params) 106 | 107 | return self.tokenizer 108 | 109 | def get_joint_token_ids_and_types(self, pairs, token_ids_map): 110 | """ 111 | Converts document pairs into a joint set of tokens for JointBERT models. 
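Both documents share the max_seq_length budget: if both sides are too long, each is cut to (max_seq_length - 3) / 2 tokens; if only one side is too long, it may additionally use the budget left unused by the shorter side (see the length balancing below).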
112 | 113 | Token format: [CLS] doc_a [SEP] doc_b [SEP] 114 | Token type ids: 0 0 0 1 1 115 | 116 | :param pairs: list of tuples with A + B (title or ids depending on keys of token_ids_map) 117 | :param token_ids_map: 118 | :return: joint_token_ids (tensor), masks (tensor), token_type_ids (tensor) 119 | """ 120 | 121 | reserved_tokens_count = 3 122 | max_side_length = int((self.max_seq_length - reserved_tokens_count) / 2) 123 | 124 | joint_ids = [] 125 | token_types = [] 126 | 127 | logger.info(f'Joining token pairs with max_side_length={max_side_length}') 128 | 129 | if self.tqdm_cls: 130 | pairs = self.tqdm_cls(pairs, total=len(pairs), desc='Joining documents') 131 | 132 | for a, b in pairs: 133 | token_ids_a = token_ids_map[a] 134 | token_ids_b = token_ids_map[b] 135 | 136 | len_a = len(token_ids_a) 137 | len_b = len(token_ids_b) 138 | 139 | if len_a > max_side_length and len_b > max_side_length: # both a too long 140 | token_ids_a = token_ids_a[:max_side_length] 141 | token_ids_b = token_ids_b[:max_side_length] 142 | elif len_a > max_side_length and len_b <= max_side_length: # a is long, b is short 143 | token_ids_a = token_ids_a[:max_side_length + max_side_length - len_b] 144 | token_ids_b = token_ids_b 145 | elif len_a <= max_side_length and len_b > max_side_length: # a is short, b is long 146 | token_ids_a = token_ids_a 147 | token_ids_b = token_ids_b[:max_side_length + max_side_length - len_a] 148 | else: 149 | token_ids_a = token_ids_a 150 | token_ids_b = token_ids_b 151 | 152 | # joint = [self.get_tokenizer().cls_token_id] + token_ids_a + \ 153 | # [self.get_tokenizer().sep_token_id] + token_ids_b + [self.get_tokenizer().sep_token_id] 154 | joint = self.get_tokenizer().build_inputs_with_special_tokens(token_ids_a, token_ids_b) 155 | 156 | joint_ids.append(torch.tensor(joint)) 157 | 158 | # [CLS] ids, .. [SEP] ... 
[SEP] 159 | # token_types.append(torch.tensor([0] * (2 + len(token_ids_a)) + [1] * (1 + len(token_ids_b)))) 160 | token_types.append(torch.tensor(self.get_tokenizer().create_token_type_ids_from_sequences(token_ids_a, token_ids_b))) 161 | 162 | joint_ids = pad_sequence(joint_ids, batch_first=True, padding_value=self.get_tokenizer().pad_token_id) 163 | #joint_ids.size() 164 | 165 | masks = torch.tensor([[float(i > 0) for i in ii] for ii in joint_ids]) 166 | 167 | token_types = pad_sequence(token_types, batch_first=True, padding_value=1) 168 | 169 | return joint_ids, masks, token_types 170 | 171 | 172 | def to_siamese_data_loader(self, df, token_ids_map, batch_size, sampler_cls=None, sampler=None): 173 | ys = self.get_ys_as_tensor(df) 174 | 175 | doc_ids = df[[self.doc_a_col, self.doc_b_col]].values 176 | 177 | if self.tqdm_cls: 178 | doc_ids = self.tqdm_cls(doc_ids, total=len(doc_ids), desc='Building tensor data set') 179 | 180 | #self.get_tokenizer() 181 | token_ids_a = [torch.tensor([self.get_tokenizer().cls_token_id] + token_ids_map[a][:self.max_seq_length - 2] + [ 182 | self.get_tokenizer().sep_token_id]) for a, b in doc_ids] 183 | token_ids_b = [torch.tensor([self.get_tokenizer().cls_token_id] + token_ids_map[b][:self.max_seq_length - 2] + [ 184 | self.get_tokenizer().sep_token_id]) for a, b in doc_ids] 185 | 186 | # token_ids_a = [torch.tensor([self.get_tokenizer().cls_token_id] + token_ids_map[a][:self.max_seq_length - 2] + [self.get_tokenizer().sep_token_id]) for a, b in doc_ids] 187 | # token_ids_b = [torch.tensor([self.get_tokenizer().cls_token_id] + token_ids_map[b][:self.max_seq_length - 2] + [self.get_tokenizer().sep_token_id]) for a, b in doc_ids] 188 | 189 | token_ids_a = pad_sequence(token_ids_a, batch_first=True, padding_value=self.get_tokenizer().pad_token_id) 190 | token_ids_b = pad_sequence(token_ids_b, batch_first=True, padding_value=self.get_tokenizer().pad_token_id) 191 | 192 | masks_a = torch.tensor([[float(i > 0) for i in ii] for ii in token_ids_a]) 193 | masks_b = torch.tensor([[float(i > 0) for i in ii] for ii in token_ids_b]) 194 | 195 | # build dataset 196 | dataset = TensorDataset( 197 | token_ids_a, 198 | masks_a, 199 | token_ids_b, 200 | masks_b, 201 | ys) 202 | 203 | return DefaultXYDataLoader(dataset, sampler=self.get_data_sampler(sampler, dataset, sampler_cls), batch_size=batch_size) 204 | 205 | def to_joint_data_loader(self, df, token_ids_map, batch_size, sampler_cls=None, sampler=None): 206 | ys = self.get_ys_as_tensor(df) 207 | 208 | doc_ids = df[[self.doc_a_col, self.doc_b_col]].values 209 | joint_ids, masks, token_types = self.get_joint_token_ids_and_types(doc_ids, token_ids_map) 210 | 211 | # build dataset 212 | dataset = TensorDataset( 213 | joint_ids, 214 | masks, 215 | token_types, 216 | ys) 217 | 218 | return DefaultXYDataLoader(dataset, sampler=self.get_data_sampler(sampler, dataset, sampler_cls), batch_size=batch_size) 219 | 220 | 221 | class DocRelDataHelper(object): 222 | labels = ['employer'] # 'employer' # 'capital' # 'country_of_citizenship' #'educated_at' # 'opposite_of' 223 | label_col = None 224 | none_label = 'none' 225 | label_encoder = None 226 | labels_integer_encoded = None 227 | onehot_encoder = None 228 | labels_onehot_encoded = None 229 | 230 | def get_labels_count(self): 231 | """ 232 | If "none label" is set, count is increased by one. 
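For example, with labels = ['introduction', 'related work'] and none_label = 'none', this returns 3.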
233 | 234 | :return: 235 | """ 236 | if self.none_label: 237 | return len(self.labels) + 1 238 | else: 239 | return len(self.labels) 240 | 241 | def set_label_encoder(self, df): 242 | self.label_encoder = LabelEncoder() 243 | # self.labels_integer_encoded = self.label_encoder.fit_transform(list(df[self.label_col].values)) 244 | label_values = list(df[self.label_col].values) 245 | 246 | if self.none_label: 247 | label_values.append(self.none_label) 248 | 249 | self.labels_integer_encoded = self.label_encoder.fit_transform(label_values) 250 | 251 | self.onehot_encoder = OneHotEncoder(sparse=False, categories='auto') 252 | self.labels_onehot_encoded = self.onehot_encoder.fit_transform( 253 | self.labels_integer_encoded.reshape(len(self.labels_integer_encoded), 1)) 254 | 255 | def is_binary_classification(self): 256 | return len(self.labels) == 1 257 | 258 | def get_ys_as_tensor(self, df): 259 | # convert categorical labels into numbers (one hot vectors) 260 | if self.is_binary_classification(): 261 | return torch.tensor(self.label_encoder.transform(df[self.label_col].values).reshape(len(df), 1)).double() 262 | else: 263 | onehot_encoded = self.onehot_encoder.transform( 264 | self.label_encoder.transform(df[self.label_col].values).reshape(len(df), 1) 265 | ) 266 | return torch.tensor(onehot_encoded) -------------------------------------------------------------------------------- /acl/trainer_utils.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import os 3 | from dataclasses import dataclass 4 | from typing import List, Dict, Callable, Optional, Any 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import spacy 9 | import torch 10 | from sklearn.metrics import classification_report 11 | from transformers import DataCollator, PreTrainedTokenizer 12 | from transformers import EvalPrediction 13 | 14 | from experiments.utils import flatten 15 | 16 | 17 | def get_label_classes_from_nlp_dataset(cls_path: str, attr_name='LABEL_CLASSES') -> List[str]: 18 | if not cls_path.endswith('.py'): 19 | raise ValueError('data path must point to .py-file') 20 | 21 | if not cls_path.startswith('./'): 22 | raise ValueError('Must be relative path') 23 | 24 | # Make absolute path from app root 25 | cls_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), cls_path[2:]) 26 | 27 | # Get file name, remove .py 28 | cls_name = cls_path[:-3].split('/')[-1] 29 | 30 | spec = importlib.util.spec_from_file_location(cls_name, cls_path) 31 | dataset_module = importlib.util.module_from_spec(spec) 32 | 33 | spec.loader.exec_module(dataset_module) 34 | 35 | if hasattr(dataset_module, attr_name): 36 | return getattr(dataset_module, attr_name) 37 | else: 38 | raise ValueError(f'dataset module does not have attribute: {attr_name}') 39 | 40 | 41 | def get_vectors_from_spacy_model(spacy_nlp): 42 | unk_token_vector = np.zeros((1, spacy_nlp.vocab.vectors.shape[1])) 43 | sep_token_vector = np.ones((1, spacy_nlp.vocab.vectors.shape[1])) 44 | return np.concatenate((spacy_nlp.vocab.vectors.data, 45 | unk_token_vector, 46 | sep_token_vector), axis=0) 47 | 48 | 49 | class DocRelTrainerHelper(object): 50 | def __init__(self, 51 | id2doc: Dict[str, Dict], 52 | label_classes: List[str], 53 | doc_a_col: str, 54 | doc_b_col: str, 55 | label_col: str, 56 | text_from_doc_func: Callable, 57 | max_length=512, 58 | spacy_nlp: Optional[Any] = None, 59 | transformers_tokenizer: Optional[PreTrainedTokenizer] = None, 60 | classification_threshold: float = 0.): 61 | self.id2doc 
= id2doc 62 | self.transformers_tokenizer = transformers_tokenizer 63 | self.spacy_nlp = spacy_nlp 64 | self.label_classes = label_classes 65 | self.doc_a_col = doc_a_col 66 | self.doc_b_col = doc_b_col 67 | self.label_col = label_col 68 | self.max_length = max_length 69 | self.classification_threshold = classification_threshold 70 | self.text_from_doc_func = text_from_doc_func 71 | 72 | if self.transformers_tokenizer and (self.transformers_tokenizer.max_len is None or self.transformers_tokenizer.max_len < 1): 73 | raise ValueError('Tokenizer max_length is not set!') 74 | 75 | if self.spacy_nlp: 76 | # Extend vocabulary with UNK + SEP token 77 | self.spacy_unk_token_id = len(self.spacy_nlp.vocab.vectors) + 0 78 | self.spacy_sep_token_id = len(self.spacy_nlp.vocab.vectors) + 1 79 | else: 80 | self.spacy_unk_token_id = self.spacy_sep_token_id = None 81 | 82 | def convert_to_features(self, batch): 83 | if self.transformers_tokenizer: 84 | return self.convert_to_features_transformers(batch) 85 | elif self.spacy_nlp: 86 | return self.convert_to_features_spacy(batch) 87 | else: 88 | raise ValueError('Neither Transformers tokenizer nor Spacy is set!') 89 | 90 | def convert_to_features_spacy(self, batch): 91 | snlp = self.spacy_nlp 92 | label_encodings = [] 93 | input_ids = [] 94 | attention_masks = [] 95 | 96 | for from_id, to_id, label in zip(batch[self.doc_a_col], batch[self.doc_b_col], batch[self.label_col]): 97 | if from_id not in self.id2doc: 98 | raise ValueError(f'Document not found. from_id={from_id}; label={label}') 99 | elif to_id not in self.id2doc: 100 | raise ValueError(f'Document not found. to_id={to_id}; label={label}') 101 | 102 | from_doc = self.id2doc[from_id] 103 | from_tokens = snlp(self.text_from_doc_func(from_doc))[:np.floor(self.max_length / 2)] 104 | from_token_ids = [snlp.vocab.vectors.key2row[t.norm] if t.has_vector and t.norm in snlp.vocab.vectors.key2row else self.spacy_unk_token_id for t in from_tokens] 105 | 106 | to_doc = self.id2doc[to_id] 107 | to_tokens = snlp(self.text_from_doc_func(to_doc))[:np.floor(self.max_length / 2)] 108 | to_token_ids = [snlp.vocab.vectors.key2row[t.norm] if t.has_vector and t.norm in snlp.vocab.vectors.key2row else self.spacy_unk_token_id for t in 109 | to_tokens] 110 | 111 | # Join with SEP token 112 | token_ids = from_token_ids + [self.spacy_sep_token_id] + to_token_ids 113 | token_ids = token_ids[:self.max_length] 114 | 115 | attention_mask = np.zeros(self.max_length) 116 | attention_mask[list(range(len(token_ids)))] = 1. 117 | 118 | # Zero-padding 119 | if len(token_ids) < self.max_length: 120 | token_ids += [0] * (self.max_length - len(token_ids)) 121 | 122 | one_hot_encoded_label = np.zeros(len(self.label_classes)) 123 | one_hot_encoded_label[[self.label_classes.index(l) for l in label]] = 1. 124 | 125 | # To list 126 | attention_masks.append(attention_mask.tolist()) 127 | input_ids.append(token_ids) 128 | label_encodings.append(one_hot_encoded_label) 129 | 130 | encodings = { 131 | 'input_ids': input_ids, 132 | 'attention_mask': attention_masks, 133 | 'token_type_ids': [[0] * self.max_length] * len(input_ids), 134 | 'labels': label_encodings, 135 | } 136 | 137 | return encodings 138 | 139 | def convert_to_features_transformers(self, batch): 140 | text_pairs = [] 141 | label_encodings = [] 142 | 143 | for from_id, to_id, label in zip(batch[self.doc_a_col], batch[self.doc_b_col], batch[self.label_col]): 144 | if from_id not in self.id2doc: 145 | raise ValueError(f'Document not found. 
from_id={from_id}; label={label}') 146 | elif to_id not in self.id2doc: 147 | raise ValueError(f'Document not found. to_id={to_id}; label={label}') 148 | else: 149 | from_doc = self.id2doc[from_id] 150 | to_doc = self.id2doc[to_id] 151 | 152 | text_pairs.append(( 153 | self.text_from_doc_func(from_doc), self.text_from_doc_func(to_doc) 154 | )) 155 | 156 | one_hot_encoded_label = np.zeros(len(self.label_classes)) 157 | one_hot_encoded_label[[self.label_classes.index(l) for l in label]] = 1. 158 | 159 | label_encodings.append(one_hot_encoded_label) 160 | 161 | input_encodings = self.transformers_tokenizer.batch_encode_plus( 162 | text_pairs, 163 | pad_to_max_length=True, 164 | truncation_strategy='longest_first', 165 | return_token_type_ids=True, 166 | return_attention_masks=True, 167 | max_length=self.max_length 168 | ) 169 | 170 | # RoBERTa does not make use of token type ids, therefore a list of zeros is returned. 171 | encodings = { 172 | 'input_ids': input_encodings['input_ids'], 173 | 'attention_mask': input_encodings['attention_mask'], 174 | 'token_type_ids': input_encodings['token_type_ids'], 175 | 'labels': label_encodings, 176 | } 177 | 178 | # if 'token_type_ids' in input_encodings: 179 | # input_encodings['token_type_ids'] = input_encodings['token_type_ids'] 180 | 181 | return encodings 182 | 183 | def compute_metrics(self, p: EvalPrediction) -> Dict: 184 | predicted_labels = np.where(p.predictions > self.classification_threshold, 1., 0.) 185 | 186 | return flatten(classification_report( 187 | y_true=p.label_ids, 188 | y_pred=predicted_labels, 189 | target_names=self.label_classes, 190 | zero_division=0, 191 | output_dict=True)) 192 | 193 | def get_df_from_predictions(self, relations_dataset, docs_dataset, predictions, exclude_columns: List=None): 194 | if exclude_columns is None: 195 | exclude_columns = [] 196 | 197 | # To dataframe with IDs ... 198 | true_dict = {'true_' + label: predictions.label_ids[:, idx] for idx, label in enumerate(self.label_classes)} 199 | predictions_dict = {'predicted_' + label: predictions.predictions[:, idx] for idx, label in 200 | enumerate(self.label_classes)} 201 | predictions_label_lists = [ 202 | [label for idx, label in enumerate(self.label_classes) if item[idx] > self.classification_threshold] for item in 203 | predictions.predictions] 204 | 205 | # Document meta data 206 | from_dict = { 207 | 'from_' + col: [self.id2doc[s2_id][col] if s2_id in self.id2doc else None for s2_id in relations_dataset[self.doc_a_col]] 208 | for col in docs_dataset.column_names if col not in exclude_columns} 209 | to_dict = {'to_' + col: [self.id2doc[s2_id][col] if s2_id in self.id2doc else None for s2_id in relations_dataset[self.doc_b_col]] 210 | for col in docs_dataset.column_names if col not in exclude_columns} 211 | 212 | df_dict = {} 213 | df_dict.update(from_dict) 214 | df_dict.update(to_dict) 215 | 216 | df_dict.update({ 217 | # Labels 218 | 'true': [','.join(label_list) for label_list in relations_dataset[self.label_col]], 219 | 'predicted': [','.join(label_list) for label_list in predictions_label_lists], 220 | }) 221 | df_dict.update(true_dict) 222 | df_dict.update(predictions_dict) 223 | 224 | return pd.DataFrame.from_dict(df_dict) 225 | 226 | 227 | @dataclass 228 | class DocRelDataCollator(DataCollator): 229 | def collate_batch(self, batch: List) -> Dict[str, torch.Tensor]: 230 | """ 231 | Take a list of samples from a Dataset and collate them into a batch. 
232 | Returns: 233 | A dictionary of tensors 234 | """ 235 | 236 | input_ids = torch.stack([example['input_ids'] for example in batch]) 237 | token_type_ids = torch.stack([example['token_type_ids'] for example in batch]) 238 | attention_mask = torch.stack([example['attention_mask'] for example in batch]) 239 | labels = torch.stack([example['labels'].squeeze() for example in batch]) 240 | 241 | model_kwargs = { 242 | 'input_ids': input_ids, 243 | 'attention_mask': attention_mask, 244 | 'token_type_ids': token_type_ids, 245 | 'labels': labels, 246 | } 247 | 248 | return model_kwargs 249 | 250 | 251 | def get_non_empty_text_from_doc(doc) -> str: 252 | """ 253 | Build document text from title + abstract 254 | 255 | :param doc: S2 paper 256 | :return: Document text 257 | """ 258 | 259 | text = '' 260 | 261 | if 'title' in doc: 262 | text += doc['title'] 263 | 264 | if doc['abstract']: 265 | text += '\n' + doc['abstract'] 266 | 267 | if len(text) == 0: 268 | # Ensure text is at least one char to make tokenizers work. 269 | text = ' ' 270 | 271 | return text 272 | -------------------------------------------------------------------------------- /models/auto_modeling.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | #from longformer.longformer import LongformerConfig 4 | from transformers import PretrainedConfig, AutoConfig, \ 5 | RobertaConfig, BertConfig, XLNetConfig, BartConfig, ElectraConfig 6 | 7 | from models.bart import BartForMultiLabelSequenceClassification 8 | from models.bert import BertForMultiLabelSequenceClassification 9 | from models.electra import ElectraForMultiLabelSequenceClassification 10 | from models.longformer import LongformerForMultiLabelSequenceClassification 11 | from models.roberta import RobertaForMultiLabelSequenceClassification 12 | from models.xlnet import XLNetForMultiLabelSequenceClassification 13 | 14 | MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( 15 | [ 16 | # (DistilBertConfig, DistilBertForSequenceClassification), 17 | # (AlbertConfig, AlbertForSequenceClassification), 18 | # (CamembertConfig, CamembertForSequenceClassification), 19 | # (XLMRobertaConfig, XLMRobertaForSequenceClassification), 20 | # (BartConfig, BartForSequenceClassification), 21 | (RobertaConfig, RobertaForMultiLabelSequenceClassification), 22 | (BertConfig, BertForMultiLabelSequenceClassification), 23 | (XLNetConfig, XLNetForMultiLabelSequenceClassification), 24 | #(LongformerConfig, LongformerForMultiLabelSequenceClassification), 25 | (BartConfig, BartForMultiLabelSequenceClassification), 26 | # (FlaubertConfig, FlaubertForSequenceClassification), 27 | # (XLMConfig, XLMForSequenceClassification), 28 | (ElectraConfig, ElectraForMultiLabelSequenceClassification), 29 | ] 30 | ) 31 | 32 | 33 | class AutoModelForMultiLabelSequenceClassification: 34 | r""" 35 | :class:`~transformers.AutoModelForSequenceClassification` is a generic model class 36 | that will be instantiated as one of the sequence classification model classes of the library 37 | when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` 38 | class method. 39 | 40 | This class cannot be instantiated using `__init__()` (throws an error). 
41 | """ 42 | 43 | def __init__(self): 44 | raise EnvironmentError( 45 | "AutoModelForMultiLabelSequenceClassification is designed to be instantiated " 46 | "using the `AutoModelForMultiLabelSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " 47 | "`AutoModelForMultiLabelSequenceClassification.from_config(config)` methods." 48 | ) 49 | 50 | @classmethod 51 | def from_config(cls, config): 52 | r""" Instantiates one of the base model classes of the library 53 | from a configuration. 54 | 55 | Args: 56 | config (:class:`~transformers.PretrainedConfig`): 57 | The model class to instantiate is selected based on the configuration class: 58 | 59 | - isInstance of `distilbert` configuration class: :class:`~transformers.DistilBertForSequenceClassification` (DistilBERT model) 60 | - isInstance of `albert` configuration class: :class:`~transformers.AlbertForSequenceClassification` (ALBERT model) 61 | - isInstance of `camembert` configuration class: :class:`~transformers.CamembertForSequenceClassification` (CamemBERT model) 62 | - isInstance of `xlm roberta` configuration class: :class:`~transformers.XLMRobertaForSequenceClassification` (XLM-RoBERTa model) 63 | - isInstance of `roberta` configuration class: :class:`~transformers.RobertaForSequenceClassification` (RoBERTa model) 64 | - isInstance of `bert` configuration class: :class:`~transformers.BertForSequenceClassification` (Bert model) 65 | - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetForSequenceClassification` (XLNet model) 66 | - isInstance of `xlm` configuration class: :class:`~transformers.XLMForSequenceClassification` (XLM model) 67 | - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model) 68 | 69 | 70 | Examples:: 71 | 72 | config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. 73 | model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` 74 | """ 75 | for config_class, model_class in MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING.items(): 76 | if isinstance(config, config_class): 77 | return model_class(config) 78 | raise ValueError( 79 | "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" 80 | "Model type should be one of {}.".format( 81 | config.__class__, 82 | cls.__name__, 83 | ", ".join(c.__name__ for c in MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING.keys()), 84 | ) 85 | ) 86 | 87 | @classmethod 88 | def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): 89 | r""" Instantiates one of the sequence classification model classes of the library 90 | from a pre-trained model configuration. 91 | 92 | The `from_pretrained()` method takes care of returning the correct model class instance 93 | based on the `model_type` property of the config object, or when it's missing, 94 | falling back to using pattern matching on the `pretrained_model_name_or_path` string. 
95 | 96 | The model class to instantiate is selected as the first pattern matching 97 | in the `pretrained_model_name_or_path` string (in the following order): 98 | - contains `distilbert`: :class:`~transformers.DistilBertForSequenceClassification` (DistilBERT model) 99 | - contains `albert`: :class:`~transformers.AlbertForSequenceClassification` (ALBERT model) 100 | - contains `camembert`: :class:`~transformers.CamembertForSequenceClassification` (CamemBERT model) 101 | - contains `xlm-roberta`: :class:`~transformers.XLMRobertaForSequenceClassification` (XLM-RoBERTa model) 102 | - contains `roberta`: :class:`~transformers.RobertaForSequenceClassification` (RoBERTa model) 103 | - contains `bert`: :class:`~transformers.BertForSequenceClassification` (Bert model) 104 | - contains `xlnet`: :class:`~transformers.XLNetForSequenceClassification` (XLNet model) 105 | - contains `flaubert`: :class:`~transformers.FlaubertForSequenceClassification` (Flaubert model) 106 | 107 | The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). 108 | To train the model, you should first set it back in training mode with `model.train()` 109 | 110 | Args: 111 | pretrained_model_name_or_path: either: 112 | 113 | - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 114 | - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. 115 | - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. 116 | - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint into a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. 117 | 118 | model_args: (`optional`) Sequence of positional arguments: 119 | All remaining positional arguments will be passed to the underlying model's ``__init__`` method 120 | 121 | config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: 122 | Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when: 123 | 124 | - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or 125 | - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory. 126 | - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. 127 | 128 | state_dict: (`optional`) dict: 129 | an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. 130 | This option can be used if you want to create a model from a pretrained configuration but load your own weights. 131 | In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
132 | 133 | cache_dir: (`optional`) string: 134 | Path to a directory in which a downloaded pre-trained model 135 | configuration should be cached if the standard cache should not be used. 136 | 137 | force_download: (`optional`) boolean, default False: 138 | Force to (re-)download the model weights and configuration files and override the cached versions if they exist. 139 | 140 | resume_download: (`optional`) boolean, default False: 141 | Do not delete an incompletely received file. Attempt to resume the download if such a file exists. 142 | 143 | proxies: (`optional`) dict, default None: 144 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 145 | The proxies are used on each request. 146 | 147 | output_loading_info: (`optional`) boolean: 148 | Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. 149 | 150 | kwargs: (`optional`) Remaining dictionary of keyword arguments: 151 | These arguments will be passed to the configuration and the model. 152 | 153 | Examples:: 154 | 155 | model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. 156 | model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` 157 | assert model.config.output_attentions == True 158 | # Loading from a TF checkpoint file instead of a PyTorch model (slower) 159 | config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') 160 | model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) 161 | 162 | """ 163 | config = kwargs.pop("config", None) 164 | if not isinstance(config, PretrainedConfig): 165 | config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) 166 | 167 | for config_class, model_class in MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING.items(): 168 | if isinstance(config, config_class): 169 | return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) 170 | raise ValueError( 171 | "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" 172 | "Model type should be one of {}.".format( 173 | config.__class__, 174 | cls.__name__, 175 | ", ".join(c.__name__ for c in MODEL_FOR_MULTI_LABEL_SEQUENCE_CLASSIFICATION_MAPPING.keys()), 176 | ) 177 | ) 178 | -------------------------------------------------------------------------------- /word_vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "pycharm": {} 7 | }, 8 | "source": "# Word vectors (FastText) for Baseline\n\n#### Create Spacy model from word vectors\n\n```bash\npython -m spacy init-model en output/cord19_docrel/spacy/en_cord19_fasttext_300d --vectors-loc output/cord19_docrel/cord19.fasttext.w2v.txt\npython -m spacy init-model en output/acl_docrel/spacy/en_acl_fasttext_300d --vectors-loc output/acl_docrel/acl.fasttext.w2v.txt\n```\n" 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "pycharm": {} 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "wandb: WARNING W\u0026B installed but not logged in.
Run `wandb login` or set the WANDB_API_KEY env variable.\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "import gensim\n", 27 | "import json\n", 28 | "import os\n", 29 | "import requests\n", 30 | "import pickle\n", 31 | "import pandas as pd\n", 32 | "import logging\n", 33 | "from pathlib import Path\n", 34 | "from tqdm import tqdm_notebook as tqdm\n", 35 | "from smart_open import open\n", 36 | "from nlp import load_dataset\n", 37 | "import nlp\n", 38 | "import acl.utils\n", 39 | "from trainer_cli import ExperimentArguments" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "pycharm": {} 46 | }, 47 | "source": [ 48 | "## CORD19" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 22, 54 | "metadata": { 55 | "pycharm": {} 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "data_dir \u003d Path(\u0027./output/cord19_docrel\u0027)\n", 60 | "\n", 61 | "experiment_args \u003d ExperimentArguments(\n", 62 | " nlp_dataset\u003d\u0027./datasets/cord19_docrel/cord19_docrel.py\u0027,\n", 63 | " nlp_cache_dir\u003d\u0027./data/nlp_cache\u0027,\n", 64 | " doc_id_col\u003d\u0027doi\u0027,\n", 65 | " doc_a_col\u003d\u0027from_doi\u0027,\n", 66 | " doc_b_col\u003d\u0027to_doi\u0027,\n", 67 | " cv_fold\u003d1,\n", 68 | ")\n", 69 | "\n", 70 | "docs_ds \u003d load_dataset(experiment_args.nlp_dataset,\n", 71 | " name\u003d\u0027docs\u0027,\n", 72 | " cache_dir\u003dexperiment_args.nlp_cache_dir,\n", 73 | " split\u003dnlp.Split(\u0027docs\u0027))" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 23, 79 | "metadata": { 80 | "pycharm": {} 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Total tokens: 16,181,414\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# Extract tokens from each document and create token file.\n", 93 | "tokens_count \u003d 0\n", 94 | "with open(data_dir / \u0027tokens.txt\u0027, \u0027w\u0027) as f:\n", 95 | " for idx, doc in docs_ds.data.to_pandas().iterrows():\n", 96 | " text \u003d acl.utils.get_text_from_doc(doc) \n", 97 | " for token in gensim.utils.simple_preprocess(text, min_len\u003d2, max_len\u003d15):\n", 98 | " f.write(token + \u0027 \u0027)\n", 99 | " tokens_count +\u003d 1\n", 100 | " f.write(\u0027\\n\u0027)\n", 101 | "print(f\u0027Total tokens: {tokens_count:,}\u0027)\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 26, 107 | "metadata": { 108 | "pycharm": {} 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "import fasttext\n", 113 | "\n", 114 | "model \u003d fasttext.train_unsupervised(str(data_dir / \u0027tokens.txt\u0027), \n", 115 | " model\u003d\u0027skipgram\u0027, \n", 116 | " lr\u003d0.05, # learning rate [0.05]\n", 117 | " dim\u003d300, # size of word vectors [100]\n", 118 | " ws\u003d5, # size of the context window [5]\n", 119 | " epoch\u003d5, # number of epochs [5]\n", 120 | " thread\u003d4, # number of threads [number of cpus]\n", 121 | " )" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 27, 127 | "metadata": { 128 | "pycharm": {} 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "model.save_model(str(data_dir / \u0027cord19.fasttext.bin\u0027))" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 28, 138 | "metadata": { 139 | "pycharm": {} 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "from gensim.models.wrappers import FastText\n", 144 | "\n", 145 | "ft_model \u003d FastText.load_fasttext_format(str(data_dir / 
\u0027cord19.fasttext.bin\u0027))\n", 146 | "ft_model.wv.save_word2vec_format(data_dir / \u0027cord19.fasttext.w2v.txt\u0027)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "pycharm": {} 154 | }, 155 | "outputs": [], 156 | "source": [ 157 | "# Unset\n", 158 | "del ft_model\n", 159 | "del model\n", 160 | "del docs_ds\n", 161 | "del experiment_args\n", 162 | "del data_dir" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "pycharm": {} 169 | }, 170 | "source": [ 171 | "## ACL" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 2, 177 | "metadata": { 178 | "pycharm": {} 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "Downloading and preparing dataset acl_docrel/docs (download: Unknown size, generated: Unknown size, total: Unknown size) to ./data/nlp_cache/acl_docrel/docs/0.1.0...\n" 186 | ] 187 | }, 188 | { 189 | "data": { 190 | "application/vnd.jupyter.widget-view+json": { 191 | "model_id": "5212702e85614bef8a3c2add3e36093e", 192 | "version_major": 2, 193 | "version_minor": 0 194 | }, 195 | "text/plain": [ 196 | "HBox(children\u003d(IntProgress(value\u003d0, description\u003d\u0027Downloading\u0027, max\u003d312525939, style\u003dProgressStyle(description_…" 197 | ] 198 | }, 199 | "metadata": {}, 200 | "output_type": "display_data" 201 | }, 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "\n" 207 | ] 208 | }, 209 | { 210 | "data": { 211 | "application/vnd.jupyter.widget-view+json": { 212 | "model_id": "", 213 | "version_major": 2, 214 | "version_minor": 0 215 | }, 216 | "text/plain": [ 217 | "HBox(children\u003d(IntProgress(value\u003d1, bar_style\u003d\u0027info\u0027, max\u003d1), HTML(value\u003d\u0027\u0027)))" 218 | ] 219 | }, 220 | "metadata": {}, 221 | "output_type": "display_data" 222 | }, 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "\r", 228 | "Dataset acl_docrel downloaded and prepared to ./data/nlp_cache/acl_docrel/docs/0.1.0. 
Subsequent calls will reuse this data.\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "data_dir \u003d Path(\u0027./output/acl_docrel\u0027)\n", 234 | "\n", 235 | "experiment_args \u003d ExperimentArguments(\n", 236 | " nlp_dataset\u003d\u0027./datasets/acl_docrel/acl_docrel.py\u0027,\n", 237 | " nlp_cache_dir\u003d\u0027./data/nlp_cache\u0027,\n", 238 | " doc_id_col\u003d\u0027s2_id\u0027,\n", 239 | " doc_a_col\u003d\u0027from_s2_id\u0027,\n", 240 | " doc_b_col\u003d\u0027to_s2_id\u0027,\n", 241 | " cv_fold\u003d1,\n", 242 | ")\n", 243 | "\n", 244 | "docs_ds \u003d load_dataset(experiment_args.nlp_dataset,\n", 245 | " name\u003d\u0027docs\u0027,\n", 246 | " cache_dir\u003dexperiment_args.nlp_cache_dir,\n", 247 | " split\u003dnlp.Split(\u0027docs\u0027))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 3, 253 | "metadata": { 254 | "pycharm": {} 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "Total tokens: 2,194,010\n" 262 | ] 263 | } 264 | ], 265 | "source": [ 266 | "# Extract tokens from each document and create token file.\n", 267 | "tokens_count \u003d 0\n", 268 | "with open(data_dir / \u0027tokens.txt\u0027, \u0027w\u0027) as f:\n", 269 | " for idx, doc in docs_ds.data.to_pandas().iterrows():\n", 270 | " text \u003d acl.utils.get_text_from_doc(doc) \n", 271 | " for token in gensim.utils.simple_preprocess(text, min_len\u003d2, max_len\u003d15):\n", 272 | " f.write(token + \u0027 \u0027)\n", 273 | " tokens_count +\u003d 1\n", 274 | " f.write(\u0027\\n\u0027)\n", 275 | " \n", 276 | "# Total tokens: 2,194,010\n", 277 | "print(f\u0027Total tokens: {tokens_count:,}\u0027)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 4, 283 | "metadata": { 284 | "pycharm": {} 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "import fasttext\n", 289 | "\n", 290 | "model \u003d fasttext.train_unsupervised(str(data_dir / \u0027tokens.txt\u0027), \n", 291 | " model\u003d\u0027skipgram\u0027, \n", 292 | " lr\u003d0.05, # learning rate [0.05]\n", 293 | " dim\u003d300, # size of word vectors [100]\n", 294 | " ws\u003d5, # size of the context window [5]\n", 295 | " epoch\u003d5, # number of epochs [5]\n", 296 | " thread\u003d4, # number of threads [number of cpus]\n", 297 | " )" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 5, 303 | "metadata": { 304 | "pycharm": {} 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "model.save_model(str(data_dir / \u0027acl.fasttext.bin\u0027))" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 6, 314 | "metadata": { 315 | "pycharm": {} 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "from gensim.models.wrappers import FastText\n", 320 | "\n", 321 | "ft_model \u003d FastText.load_fasttext_format(str(data_dir / \u0027acl.fasttext.bin\u0027))\n", 322 | "ft_model.wv.save_word2vec_format(data_dir / \u0027acl.fasttext.w2v.txt\u0027)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "pycharm": {} 330 | }, 331 | "outputs": [], 332 | "source": [] 333 | } 334 | ], 335 | "metadata": { 336 | "kernelspec": { 337 | "display_name": "Python [conda env:acl-anthology] *", 338 | "language": "python", 339 | "name": "conda-env-acl-anthology-py" 340 | }, 341 | "language_info": { 342 | "codemirror_mode": { 343 | "name": "ipython", 344 | "version": 3 345 | }, 346 | "file_extension": ".py", 347 | "mimetype": "text/x-python", 348 | "name": 
"python", 349 | "nbconvert_exporter": "python", 350 | "pygments_lexer": "ipython3", 351 | "version": "3.7.4" 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 2 356 | } --------------------------------------------------------------------------------