├── src ├── models │ ├── .gitkeep │ └── __init__.py ├── contrastive │ ├── lspc │ │ ├── run_pretraining.sh │ │ ├── run_finetune_baseline.sh │ │ ├── run_finetune_baseline_multi.sh │ │ ├── run_finetune_siamese.sh │ │ └── run_finetune_multi.sh │ ├── models │ │ ├── metrics.py │ │ └── loss.py │ └── data │ │ └── data_collators.py ├── data │ ├── download_datasets.py │ └── utils.py └── processing │ ├── contrastive │ └── prepare-data.py │ ├── process-wordocc │ └── process-to-wordocc-multi.py │ ├── process-magellan │ └── process_to_magellan.py │ └── process-wordcooc │ └── process-to-wordcooc.py ├── hiergat ├── model │ ├── __init__.py │ ├── eval.py │ ├── ceval.py │ ├── summarize.py │ ├── layer.py │ ├── model.py │ ├── cmodel.py │ └── dataset.py ├── .gitignore ├── README.md ├── all_runs.py ├── result_collection.ipynb ├── hiergat_env.yml ├── dataset.py ├── train.py ├── train_n.py └── task.json ├── ditto ├── ditto_light │ ├── __init__.py │ ├── exceptions.py │ ├── dataset.py │ ├── augment.py │ ├── summarize.py │ ├── knowledge.py │ └── ditto.py ├── .gitignore ├── .gitmodules ├── README.md ├── all_runs.py ├── blocking │ ├── README.md │ ├── train_blocker.py │ └── blocker.py ├── result_collection.ipynb ├── ditto_env.yml ├── train_ditto.py ├── configs.json └── LICENSE ├── notebooks └── processing │ └── benchmark2020 │ ├── .gitkeep │ └── dbscan-clustering.ipynb ├── setup.py ├── LICENSE ├── .gitignore ├── preprocess_data_for_ditto_hiergat.py └── README.md /src/models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hiergat/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ditto/ditto_light/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/processing/benchmark2020/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hiergat/.gitignore: -------------------------------------------------------------------------------- 1 | ._* 2 | /checkpoints 3 | /report 4 | -------------------------------------------------------------------------------- /ditto/.gitignore: -------------------------------------------------------------------------------- 1 | ditto_light/__pycache__/* 2 | report/ 3 | results_wdc3/ 4 | -------------------------------------------------------------------------------- /ditto/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "blocking/sentence-transformers"] 2 | path = blocking/sentence-transformers 3 | url = https://github.com/UKPLab/sentence-transformers 4 | -------------------------------------------------------------------------------- /ditto/ditto_light/exceptions.py: -------------------------------------------------------------------------------- 1 | class ModelNotFoundError(Exception): 2 | def __init__(self, path): 3 | super().__init__("Model {} was not found".format(path)) 4 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='src', 5 | packages=find_packages(), 6 | version='0.1.0', 7 | description='Data Integration Research', 8 | author='Ralph Peeters', 9 | license='BSD-3', 10 | ) 11 | -------------------------------------------------------------------------------- /hiergat/README.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | * Python 3.7 3 | * PyTorch 1.4 4 | * HuggingFace Transformers 5 | * NLTK (for 1-N ER problem) 6 | 7 | Install required packages 8 | ``` 9 | conda env create -f hiergat_env.yml 10 | ``` 11 | 12 | ## Activate corresponding environment 13 | ``` 14 | conda deactivate 15 | conda activate hiergat_env 16 | ``` 17 | 18 | ## To train and test hiergat 19 | ``` 20 | python all_runs.py 21 | ``` -------------------------------------------------------------------------------- /ditto/README.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | * Python 3.7.7 3 | * PyTorch 1.9 4 | * HuggingFace Transformers 4.9.2 5 | * Spacy with the ``en_core_web_lg`` models 6 | 7 | Install required packages 8 | ``` 9 | conda env create -f ditto_env.yml 10 | conda activate ditto_env 11 | python -m spacy download en_core_web_lg 12 | ``` 13 | 14 | ## Activate corresponding environment 15 | ``` 16 | conda deactivate 17 | conda activate ditto_env 18 | ``` 19 | 20 | ## To train and test ditto 21 | ``` 22 | python all_runs.py 23 | ``` -------------------------------------------------------------------------------- /hiergat/all_runs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | sizes = ['small', 'medium', 'large'] 4 | difficulties = ['20cc80rnd', '50cc50rnd', '80cc20rnd'] 5 | unseens = ['000un', '050un', '100un'] 6 | 7 | for seed in range(3): 8 | for size in sizes: 9 | for difficulty in difficulties: 10 | for unseen in unseens: 11 | cmd = """CUDA_VISIBLE_DEVICES=3 python train.py \ 12 | --task final_%s_%s%s \ 13 | --run_id %d \ 14 | --batch_size 16 \ 15 | --max_len 256 \ 16 | --lr 5e-6 \ 17 | --n_epochs 50 \ 18 | --finetuning \ 19 | --split \ 20 | --lm roberta""" % (size, difficulty, unseen, seed) 21 | print(cmd) 22 | os.system(cmd) 23 | -------------------------------------------------------------------------------- /ditto/all_runs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | sizes = ['small', 'medium', 'large'] 4 | difficulties = ['20cc80rnd', '50cc50rnd', '80cc20rnd'] 5 | unseens = ['000un', '050un', '100un'] 6 | 7 | for seed in range(3): 8 | for size in sizes: 9 | for difficulty in difficulties: 10 | for unseen in unseens: 11 | cmd = """CUDA_VISIBLE_DEVICES=2 python train_ditto.py \ 12 | --task final_%s_%s%s \ 13 | --logdir results_wdc3/ \ 14 | --run_id %d \ 15 | --batch_size 64 \ 16 | --max_len 256 \ 17 | --lr 5e-5 \ 18 | --n_epochs 50 \ 19 | --finetuning \ 20 | --lm roberta \ 21 | --da del""" % (size, difficulty, unseen, seed) 22 | print(cmd) 23 | os.system(cmd) -------------------------------------------------------------------------------- /src/contrastive/lspc/run_pretraining.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH 
--nodes=1 5 | #SBATCH --time=23:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | TEMP=$5 12 | CATEGORY=$6 13 | SIZE=$7 14 | AUG=$8 15 | python run_pretraining.py \ 16 | --do_train \ 17 | --train_file ../../data/processed/wdc-lspc/contrastive/pre-train/$CATEGORY/${CATEGORY}_train_$SIZE.pkl.gz \ 18 | --id_deduction_set ../../data/raw/wdc-lspc/training-sets/${CATEGORY}_train_$SIZE.json.gz \ 19 | --tokenizer=$MODEL \ 20 | --grad_checkpoint=$CHECKPOINT \ 21 | --output_dir ../../reports/contrastive/$CATEGORY-$SIZE-$AUG$BATCH-$LR-$TEMP-${MODEL##*/}/ \ 22 | --temperature=$TEMP \ 23 | --per_device_train_batch_size=$BATCH \ 24 | --learning_rate=$LR \ 25 | --weight_decay=0.01 \ 26 | --num_train_epochs=200 \ 27 | --lr_scheduler_type="linear" \ 28 | --warmup_ratio=0.05 \ 29 | --max_grad_norm=1.0 \ 30 | --fp16 \ 31 | --dataloader_num_workers=4 \ 32 | --disable_tqdm=True \ 33 | --save_strategy="epoch" \ 34 | --logging_strategy="epoch" \ 35 | --augment=$AUG \ -------------------------------------------------------------------------------- /src/contrastive/lspc/run_finetune_baseline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=12:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | CATEGORY=$5 12 | SIZE=$6 13 | AUG=$7 14 | python run_finetune_baseline.py \ 15 | --do_train \ 16 | --train_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 17 | --train_size=$SIZE \ 18 | --validation_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 19 | --test_file ../../data/interim/wdc-lspc/gold-standards/preprocessed_${CATEGORY}_gs.pkl.gz \ 20 | --evaluation_strategy=epoch \ 21 | --tokenizer=$MODEL \ 22 | --grad_checkpoint=$CHECKPOINT \ 23 | --output_dir ../../reports/baseline/$CATEGORY-$SIZE-$AUG$BATCH-$LR-${MODEL##*/}/ \ 24 | --per_device_train_batch_size=$BATCH \ 25 | --learning_rate=$LR \ 26 | --weight_decay=0.01 \ 27 | --num_train_epochs=50 \ 28 | --lr_scheduler_type="linear" \ 29 | --warmup_ratio=0.05 \ 30 | --max_grad_norm=1.0 \ 31 | --fp16 \ 32 | --metric_for_best_model=f1 \ 33 | --dataloader_num_workers=4 \ 34 | --disable_tqdm=True \ 35 | --save_strategy="epoch" \ 36 | --load_best_model_at_end \ 37 | --augment=$AUG \ 38 | #--do_param_opt \ -------------------------------------------------------------------------------- /src/contrastive/lspc/run_finetune_baseline_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=12:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | CATEGORY=$5 12 | SIZE=$6 13 | AUG=$7 14 | python run_finetune_baseline_multi.py \ 15 | --do_train \ 16 | --train_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 17 | --train_size=$SIZE \ 18 | --validation_file ../../data/interim/wdc-lspc/validation-sets/preprocessed_${CATEGORY}_valid_$SIZE.pkl.gz \ 19 | --test_file ../../data/interim/wdc-lspc/gold-standards/preprocessed_${CATEGORY}_gs.pkl.gz \ 20 | --evaluation_strategy=epoch \ 21 | --tokenizer=$MODEL \ 22 | --grad_checkpoint=$CHECKPOINT \ 23 | --output_dir ../../reports/baseline-multi/$CATEGORY-$SIZE-$AUG$BATCH-$LR-${MODEL##*/}/ \ 24 | 
--per_device_train_batch_size=$BATCH \ 25 | --learning_rate=$LR \ 26 | --weight_decay=0.01 \ 27 | --num_train_epochs=50 \ 28 | --lr_scheduler_type="linear" \ 29 | --warmup_ratio=0.05 \ 30 | --max_grad_norm=1.0 \ 31 | --fp16 \ 32 | --metric_for_best_model=f1_micro \ 33 | --dataloader_num_workers=4 \ 34 | --disable_tqdm=True \ 35 | --save_strategy="epoch" \ 36 | --load_best_model_at_end \ 37 | --augment=$AUG \ 38 | #--do_param_opt \ -------------------------------------------------------------------------------- /src/contrastive/lspc/run_finetune_siamese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=12:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | TEMP=$5 12 | FROZEN=$6 13 | CATEGORY=$7 14 | SIZE=$8 15 | AUG=$9 16 | PREAUG=${10} 17 | python run_finetune_siamese.py \ 18 | --model_pretrained_checkpoint ../../reports/contrastive/$CATEGORY-$SIZE-$PREAUG$BATCH-$LR-$TEMP-${MODEL##*/}/pytorch_model.bin \ 19 | --do_train \ 20 | --frozen=$FROZEN \ 21 | --train_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 22 | --train_size=$SIZE \ 23 | --validation_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 24 | --test_file ../../data/interim/wdc-lspc/gold-standards/preprocessed_${CATEGORY}_gs.pkl.gz \ 25 | --evaluation_strategy=epoch \ 26 | --tokenizer=$MODEL \ 27 | --grad_checkpoint=$CHECKPOINT \ 28 | --output_dir ../../reports/contrastive-ft-siamese/$CATEGORY-$SIZE-$AUG$BATCH-$PREAUG$LR-$TEMP-$FROZEN-${MODEL##*/}/ \ 29 | --per_device_train_batch_size=64 \ 30 | --learning_rate=$LR \ 31 | --weight_decay=0.01 \ 32 | --num_train_epochs=50 \ 33 | --lr_scheduler_type="linear" \ 34 | --warmup_ratio=0.05 \ 35 | --max_grad_norm=1.0 \ 36 | --fp16 \ 37 | --metric_for_best_model=loss \ 38 | --dataloader_num_workers=4 \ 39 | --disable_tqdm=True \ 40 | --save_strategy="epoch" \ 41 | --load_best_model_at_end \ 42 | --augment=$AUG \ 43 | #--do_param_opt \ -------------------------------------------------------------------------------- /src/contrastive/lspc/run_finetune_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=12:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | TEMP=$5 12 | FROZEN=$6 13 | CATEGORY=$7 14 | SIZE=$8 15 | AUG=$9 16 | PREAUG=${10} 17 | python run_finetune_multi.py \ 18 | --model_pretrained_checkpoint ../../reports/contrastive/${CATEGORY/multi/}-$SIZE-$PREAUG$BATCH-$LR-$TEMP-${MODEL##*/}/pytorch_model.bin \ 19 | --do_train \ 20 | --frozen=$FROZEN \ 21 | --train_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 22 | --train_size=$SIZE \ 23 | --validation_file ../../data/interim/wdc-lspc/validation-sets/preprocessed_${CATEGORY}_valid_$SIZE.pkl.gz \ 24 | --test_file ../../data/interim/wdc-lspc/gold-standards/preprocessed_${CATEGORY}_gs.pkl.gz \ 25 | --evaluation_strategy=epoch \ 26 | --tokenizer=$MODEL \ 27 | --grad_checkpoint=$CHECKPOINT \ 28 | --output_dir ../../reports/contrastive-ft-multi/$CATEGORY-$SIZE-$AUG$BATCH-$PREAUG$LR-$TEMP-$FROZEN-${MODEL##*/}/ \ 29 | --per_device_train_batch_size=64 \ 30 | --learning_rate=$LR \ 31 | --weight_decay=0.01 \ 32 | --num_train_epochs=50 \ 33 | 
--lr_scheduler_type="linear" \ 34 | --warmup_ratio=0.05 \ 35 | --max_grad_norm=1.0 \ 36 | --fp16 \ 37 | --metric_for_best_model=loss \ 38 | --dataloader_num_workers=4 \ 39 | --disable_tqdm=True \ 40 | --save_strategy="epoch" \ 41 | --load_best_model_at_end \ 42 | --augment=$AUG \ 43 | #--do_param_opt \ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2019, Ralph Peeters 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, this 12 | list of conditions and the following disclaimer in the documentation and/or 13 | other materials provided with the distribution. 14 | 15 | * Neither the name of di-research nor the names of its 16 | contributors may be used to endorse or promote products derived from this 17 | software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 26 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 27 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 28 | OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | -------------------------------------------------------------------------------- /src/data/download_datasets.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from zipfile import ZipFile 3 | from pathlib import Path 4 | 5 | DATASETS = [ 6 | 'http://data.dws.informatik.uni-mannheim.de/largescaleproductcorpus/data/wdc-products/data.zip' 7 | ] 8 | 9 | 10 | def download_datasets(): 11 | for link in DATASETS: 12 | 13 | '''iterate through all links in DATASETS 14 | and download them one by one''' 15 | 16 | # obtain filename by splitting url and getting 17 | # last string 18 | file_name = link.split('/')[-1] 19 | 20 | print("Downloading file:%s" % file_name) 21 | 22 | # create response object 23 | r = requests.get(link, stream=True) 24 | 25 | # download started 26 | with open(f'../../{file_name}', 'wb') as f: 27 | for chunk in r.iter_content(chunk_size=1024 * 1024): 28 | if chunk: 29 | f.write(chunk) 30 | 31 | print("%s downloaded!\n" % file_name) 32 | 33 | print("All files downloaded!") 34 | return 35 | 36 | 37 | def unzip_files(): 38 | for link in DATASETS: 39 | file_name = link.split('/')[-1] 40 | # opening the zip file in READ mode 41 | with ZipFile(f'../../{file_name}', 'r') as zip: 42 | # printing all the contents of the zip file 43 | zip.printdir() 44 | 45 | # extracting all the files 46 | print('Extracting all the files now...') 47 | zip.extractall(path='../../') 48 | print('Done!') 49 | 50 | 51 | if __name__ == "__main__": 52 | Path('../../data/').mkdir(parents=True, exist_ok=True) 53 | download_datasets() 54 | unzip_files() 55 | -------------------------------------------------------------------------------- /ditto/blocking/README.md: -------------------------------------------------------------------------------- 1 | # The optional SentenceBERT fine-tuning for advanced blocking 2 | 3 | We leverage the [Sentence Transformers](https://github.com/UKPLab/sentence-transformers) library to fine-tune the LMs for entity record representation. 4 | 5 | ## Train the advanced blocking model 6 | 7 | The following command fine-tunes the BERT model on an entity pair dataset to generate vector representations of entity records: 8 | ``` 9 | CUDA_VISIBLE_DEVICES=0 python train_blocker.py \ 10 | --train_fn ../data/er_magellan/Structured/Beer/train.txt \ 11 | --valid_fn ../data/er_magellan/Structured/Beer/valid.txt \ 12 | --model_fn model.pth \ 13 | --batch_size 64 \ 14 | --n_epochs 40 \ 15 | --lm bert \ 16 | --fp16 17 | ``` 18 | 19 | Parameters: 20 | * ``--train_fn``: the training dataset (serialized) 21 | * ``--valid_fn``: the validation dataset (serialized) 22 | * ``--model_fn``: the path to the output model (see sentence-transformers) 23 | * ``--batch_size``, ``--n_epochs``, ``--lm``: batch size, number of epochs, the language model 24 | * ``--fp16``: whether to train with fp16 acceleration 25 | 26 | ## Run the blocking model 27 | 28 | To run the trained blocking model: 29 | ``` 30 | CUDA_VISIBLE_DEVICES=0 python blocker.py \ 31 | --input_path input/ \ 32 | --left_fn table_a.txt \ 33 | --right_fn table_b.txt \ 34 | --output_fn candidates.jsonl \ 35 | --model_fn model.pth \ 36 | --k 10 37 | ``` 38 | where 39 | * ``--input_path``, ``--left_fn``, ``--right_fn``: the path to the data directory and the names of the two files it contains, ``left_fn`` and ``right_fn``. 
The two files are serialized and contain one entry per line 40 | * ``--output_fn``: the output file in jsonline format 41 | * ``--model_fn``: the trained model 42 | * ``--k`` (optional): if this parameter is set, then the candidates will be the top-k most similar entries for each row in ``right_fn`` 43 | * ``--threshold`` (optional): if this parameter is set, then the candidates will be all entry pairs of similarity above the threshold 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data from source control by default 79 | /data/ 80 | 81 | # exclude cache from source control by default 82 | /cache/ 83 | 84 | # exclude models from source control by default 85 | /models/ 86 | 87 | /reports/baseline/ 88 | /reports/baseline-multi/ 89 | /reports/contrastive/ 90 | /reports/contrastive-ft-siamese/ 91 | /reports/contrastive-ft-multi/ 92 | /reports/contrastive-ft-siamese-preaug/ 93 | /reports/contrastive-blocking-archive/ 94 | /reports/matrix/ 95 | /reports/contrastive-onlywrong/ 96 | /reports/contrastive-ft-siamese-onlywrong/ 97 | /fasttext/ 98 | 99 | # Mac OS-specific storage files 100 | .DS_Store 101 | 102 | # vim 103 | *.swp 104 | *.swo 105 | 106 | # Mypy cache 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /hiergat/result_collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "982e523c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import re \n", 12 | "import pandas as pd\n", 13 | "import json\n", 14 | "import ast\n", 15 | "directory = os.fsencode('./output')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "5fdc7b80", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "result_list = []\n", 26 | "for file in os.listdir(directory):\n", 27 | " filename = os.fsdecode(file)\n", 28 | " if (filename.endswith(\".txt\") and filename.startswith(\"final\")): \n", 29 | " with open(os.path.join(directory, file), \"r\") as myfile: \n", 30 | " dictionary = 
ast.literal_eval(myfile.read())\n", 31 | " f1 = (\"{:.2f}\".format(dictionary['best_test_f1'] * 100))\n", 32 | " filename = filename[6:-4]\n", 33 | " regexp_1 = re.compile(r\"(.*)_lr=5e-06_id=(.*)_batch=16\")\n", 34 | " re_match = regexp_1.match(filename)\n", 35 | " if re_match:\n", 36 | " list_match = list(re_match.groups())\n", 37 | " list_match.append(float(f1))\n", 38 | " result_list.append(list_match)\n", 39 | "\n", 40 | "df = pd.DataFrame(result_list, columns=['data', 'id', 'f1'])\n", 41 | "df = df.sort_values(by=['data', 'id'])\n", 42 | "df = df.groupby(['data'])['f1'].mean().reset_index()" 43 | ] 44 | } 45 | ], 46 | "metadata": { 47 | "kernelspec": { 48 | "display_name": "Python 3 (ipykernel)", 49 | "language": "python", 50 | "name": "python3" 51 | }, 52 | "language_info": { 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "file_extension": ".py", 58 | "mimetype": "text/x-python", 59 | "name": "python", 60 | "nbconvert_exporter": "python", 61 | "pygments_lexer": "ipython3", 62 | "version": "3.9.12" 63 | }, 64 | "vscode": { 65 | "interpreter": { 66 | "hash": "88691fdbb5fc570d622944683b753479941aed12af1b86d101217ab42f5d39c2" 67 | } 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 5 72 | } 73 | -------------------------------------------------------------------------------- /src/data/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | np.random.seed(42) 4 | import random 5 | random.seed(42) 6 | 7 | import nltk 8 | from nltk import PorterStemmer 9 | from nltk.corpus import stopwords 10 | 11 | from copy import deepcopy 12 | 13 | from gensim.parsing.preprocessing import lower_to_unicode, preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric 14 | 15 | import re 16 | PATTERN1 = re.compile("\"@\S+\s+") 17 | PATTERN2 = re.compile("\s+") 18 | CUSTOM_FILTERS = [strip_tags, strip_multiple_whitespaces] 19 | 20 | def clean_string_wdcv2(words): 21 | if not words: 22 | return None 23 | words = words.partition('"')[2] 24 | words = words.rpartition('"')[0] 25 | words = re.sub(PATTERN1, ' ', words) 26 | words = re.sub(PATTERN2, ' ', words) 27 | words = words.replace('"', '') 28 | words = words.strip() 29 | return words 30 | 31 | def clean_string_2020(words): 32 | if not words: 33 | return None 34 | words = preprocess_string(words, CUSTOM_FILTERS) 35 | words = ' '.join(words) 36 | return words 37 | 38 | def clean_specTableContent_wdcv2(words): 39 | if not words: 40 | return None 41 | words = re.sub(PATTERN2, ' ', words) 42 | words = words.strip() 43 | return words 44 | 45 | def tokenize(words, delimiter=None): 46 | #check for NaN 47 | if isinstance(words, float): 48 | if words != words: 49 | return [] 50 | words = str(words) 51 | return words.split(sep=delimiter) 52 | 53 | def remove_stopwords(words, lower=False): 54 | #check for NaN 55 | if isinstance(words, float): 56 | if words != words: 57 | return words 58 | stop_words_list = deepcopy(stopwords.words('english')) 59 | if lower: 60 | stop_words_list = list(map(lambda x: x.lower(), stop_words_list)) 61 | word_list = tokenize(words) 62 | word_list_stopwords_removed = [x for x in word_list if x not in stop_words_list] 63 | words_processed = ' '.join(word_list_stopwords_removed) 64 | return words_processed 65 | 66 | def stem(words): 67 | stemmer = PorterStemmer() 68 | word_list = tokenize(words) 69 | stemmed_words = [stemmer.stem(x) for x in word_list] 70 | words_processed = ' 
'.join(stemmed_words) 71 | return words_processed -------------------------------------------------------------------------------- /hiergat/model/eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import numpy as np 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import sklearn.metrics as metrics 7 | 8 | import time 9 | 10 | 11 | def eval_classifier(model, iterator): 12 | model.eval() 13 | 14 | Y = [] 15 | Y_hat = [] 16 | loss_list = [] 17 | total_size = 0 18 | with torch.no_grad(): 19 | for i, batch in enumerate(iterator): 20 | _, x, y, _, masks = batch 21 | logits, y1, y_hat = model(x, y, masks) 22 | 23 | logits = logits.view(-1, logits.shape[-1]) 24 | y1 = y1.view(-1) 25 | loss = nn.CrossEntropyLoss()(logits, y1) 26 | 27 | loss_list.append(loss.item() * y.shape[0]) 28 | total_size += y.shape[0] 29 | 30 | Y.extend(y.numpy().tolist()) 31 | Y_hat.extend(y_hat.cpu().numpy().tolist()) 32 | 33 | loss = sum(loss_list) / total_size 34 | print("======================================") 35 | 36 | accuracy = metrics.accuracy_score(Y, Y_hat) 37 | precision = metrics.precision_score(Y, Y_hat) 38 | recall = metrics.recall_score(Y, Y_hat) 39 | f1 = metrics.f1_score(Y, Y_hat) 40 | print("accuracy=%.4f" % accuracy) 41 | print("precision=%.4f" % precision) 42 | print("recall=%.4f" % recall) 43 | print("f1=%.4f" % f1) 44 | print("======================================") 45 | 46 | return accuracy, precision, recall, f1, loss 47 | 48 | 49 | def eval_on_task(epoch, model, valid_iter, test_iter, 50 | writer, run_tag): 51 | print('Validation:') 52 | start = time.time() 53 | v_output = eval_classifier(model, valid_iter) 54 | print("valid time: ", time.time()-start) 55 | 56 | print('Test:') 57 | t_output = eval_classifier(model, test_iter) 58 | 59 | acc, prec, recall, f1, v_loss = v_output 60 | t_acc, t_prec, t_recall, t_f1, t_loss = t_output 61 | scalars = {'acc': acc, 62 | 'precision': prec, 63 | 'recall': recall, 64 | 'f1': f1, 65 | 'v_loss': v_loss, 66 | 't_acc': t_acc, 67 | 't_precision': t_prec, 68 | 't_recall': t_recall, 69 | 't_f1': t_f1, 70 | 't_loss': t_loss} 71 | 72 | # logging 73 | writer.add_scalars(run_tag, scalars, epoch) 74 | return f1, t_f1 75 | -------------------------------------------------------------------------------- /ditto/result_collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "982e523c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import re \n", 12 | "import pandas as pd\n", 13 | "import json\n", 14 | "import ast\n", 15 | "directory = os.fsencode('./output')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "5fdc7b80", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "result_list = []\n", 26 | "for file in os.listdir(directory):\n", 27 | " filename = os.fsdecode(file)\n", 28 | " if filename.endswith(\".txt\"): \n", 29 | " with open(os.path.join(directory, file), \"r\") as myfile: \n", 30 | " dictionary = ast.literal_eval(myfile.read())\n", 31 | " f1 = (\"{:.2f}\".format(dictionary['best_f1'] * 100))\n", 32 | "\n", 33 | " filename = filename[6:-4]\n", 34 | "\n", 35 | " regexp_1 = re.compile(r\"(.*)un_lm=roberta_da=del_dk=None_su=False_size=None_id=(.*)\")\n", 36 | " re_match = regexp_1.match(filename)\n", 37 | " if (re_match):\n", 38 | " list_match = 
list(re_match.groups())\n", 39 | " list_match.append(float(f1))\n", 40 | " result_list.append(list_match)\n", 41 | "\n", 42 | "df = pd.DataFrame(result_list, columns=['data', 'id', 'f1'])\n", 43 | "df = df.sort_values(by=['data', 'id'])\n", 44 | "df = df.groupby(['data'])['f1'].mean().reset_index()\n", 45 | "\n", 46 | "df.to_csv('results.csv', encoding='utf-8', index=False)" 47 | ] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3 (ipykernel)", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.9.12" 67 | }, 68 | "vscode": { 69 | "interpreter": { 70 | "hash": "378f5ca2fb65fb71205b60ca0e5dd58b8abec09bd391cd47886dadc212764ff3" 71 | } 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 5 76 | } 77 | -------------------------------------------------------------------------------- /hiergat/model/ceval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import sklearn.metrics as metrics 4 | 5 | import time 6 | 7 | 8 | def eval_classifier(model, iterator, su_iterator): 9 | model.eval() 10 | 11 | Y = [] 12 | Y_hat = [] 13 | loss_list = [] 14 | total_size = 0 15 | with torch.no_grad(): 16 | for i, (batch, su_batch) in enumerate(zip(iterator, su_iterator)): 17 | _, x, y, _, masks = batch 18 | _, _, z, _, _, _ = su_batch 19 | logits, y1, y_hat = model(x, z, y, masks) 20 | 21 | logits = logits.view(-1, logits.shape[-1]) 22 | y1 = y1.view(-1) 23 | loss = nn.CrossEntropyLoss()(logits, y1) 24 | 25 | loss_list.append(loss.item() * y.shape[0]) 26 | total_size += y.shape[0] 27 | 28 | Y.extend(y.numpy().tolist()) 29 | Y_hat.extend(y_hat.cpu().numpy().tolist()) 30 | 31 | loss = sum(loss_list) / total_size 32 | print("======================================") 33 | 34 | accuracy = metrics.accuracy_score(Y, Y_hat) 35 | precision = metrics.precision_score(Y, Y_hat) 36 | recall = metrics.recall_score(Y, Y_hat) 37 | f1 = metrics.f1_score(Y, Y_hat) 38 | print("accuracy=%.4f" % accuracy) 39 | print("precision=%.4f" % precision) 40 | print("recall=%.4f" % recall) 41 | print("f1=%.4f" % f1) 42 | print("======================================") 43 | 44 | return accuracy, precision, recall, f1, loss 45 | 46 | 47 | def eval_on_task(epoch, model, valid_iter, test_iter, valid_su_iter, test_su_iter, 48 | writer, run_tag): 49 | print('Validation:') 50 | start = time.time() 51 | v_output = eval_classifier(model, valid_iter, valid_su_iter) 52 | print("valid time: ", time.time() - start) 53 | 54 | print('Test:') 55 | t_output = eval_classifier(model, test_iter, test_su_iter) 56 | 57 | acc, prec, recall, f1, v_loss = v_output 58 | t_acc, t_prec, t_recall, t_f1, t_loss = t_output 59 | scalars = {'acc': acc, 60 | 'precision': prec, 61 | 'recall': recall, 62 | 'f1': f1, 63 | 'v_loss': v_loss, 64 | 't_acc': t_acc, 65 | 't_precision': t_prec, 66 | 't_recall': t_recall, 67 | 't_f1': t_f1, 68 | 't_loss': t_loss} 69 | 70 | # logging 71 | writer.add_scalars(run_tag, scalars, epoch) 72 | return f1, t_f1 73 | -------------------------------------------------------------------------------- /ditto/ditto_env.yml: -------------------------------------------------------------------------------- 1 | name: ditto_env 2 | channels: 3 | - defaults 4 | 
dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - backcall=0.2.0=pyhd3eb1b0_0 8 | - ca-certificates=2022.10.11=h06a4308_0 9 | - certifi=2022.9.24=py37h06a4308_0 10 | - cudatoolkit=11.3.1=h2bc3f7f_2 11 | - debugpy=1.5.1=py37h295c915_0 12 | - decorator=5.1.1=pyhd3eb1b0_0 13 | - entrypoints=0.4=py37h06a4308_0 14 | - ipykernel=6.15.2=py37h06a4308_0 15 | - ipython=7.31.1=py37h06a4308_1 16 | - jedi=0.18.1=py37h06a4308_1 17 | - jupyter_client=7.3.5=py37h06a4308_0 18 | - jupyter_core=4.11.2=py37h06a4308_0 19 | - ld_impl_linux-64=2.38=h1181459_1 20 | - libffi=3.3=he6710b0_2 21 | - libgcc-ng=11.2.0=h1234567_1 22 | - libgomp=11.2.0=h1234567_1 23 | - libsodium=1.0.18=h7b6447c_0 24 | - libstdcxx-ng=11.2.0=h1234567_1 25 | - matplotlib-inline=0.1.6=py37h06a4308_0 26 | - ncurses=6.3=h5eee18b_3 27 | - nest-asyncio=1.5.5=py37h06a4308_0 28 | - openssl=1.1.1s=h7f8727e_0 29 | - packaging=21.3=pyhd3eb1b0_0 30 | - parso=0.8.3=pyhd3eb1b0_0 31 | - pexpect=4.8.0=pyhd3eb1b0_3 32 | - pickleshare=0.7.5=pyhd3eb1b0_1003 33 | - pip=22.2.2=py37h06a4308_0 34 | - prompt-toolkit=3.0.20=pyhd3eb1b0_0 35 | - psutil=5.9.0=py37h5eee18b_0 36 | - ptyprocess=0.7.0=pyhd3eb1b0_2 37 | - pygments=2.11.2=pyhd3eb1b0_0 38 | - pyparsing=3.0.9=py37h06a4308_0 39 | - python=3.7.15=haa1d7c7_0 40 | - python-dateutil=2.8.2=pyhd3eb1b0_0 41 | - pyzmq=23.2.0=py37h6a678d5_0 42 | - readline=8.2=h5eee18b_0 43 | - setuptools=65.5.0=py37h06a4308_0 44 | - six=1.16.0=pyhd3eb1b0_1 45 | - sqlite=3.39.3=h5082296_0 46 | - tk=8.6.12=h1ccaba5_0 47 | - tornado=6.2=py37h5eee18b_0 48 | - traitlets=5.1.1=pyhd3eb1b0_0 49 | - wcwidth=0.2.5=pyhd3eb1b0_0 50 | - wheel=0.37.1=pyhd3eb1b0_0 51 | - xz=5.2.6=h5eee18b_0 52 | - zeromq=4.3.4=h2531618_0 53 | - zlib=1.2.13=h5eee18b_0 54 | - pip: 55 | - blis==0.7.9 56 | - catalogue==2.0.8 57 | - charset-normalizer==2.1.1 58 | - click==7.1.2 59 | - cymem==2.0.7 60 | - filelock==3.8.0 61 | - fuzzywuzzy==0.18.0 62 | - gensim==3.8.1 63 | - huggingface-hub==0.10.1 64 | - idna==3.4 65 | - importlib-metadata==5.0.0 66 | - jinja2==3.1.2 67 | - joblib==1.2.0 68 | - jsonlines==1.2.0 69 | - markupsafe==2.1.1 70 | - murmurhash==1.0.9 71 | - nltk==3.7 72 | - numpy==1.19.2 73 | - nvidia-cublas-cu11==11.10.3.66 74 | - nvidia-cuda-nvrtc-cu11==11.7.99 75 | - nvidia-cuda-runtime-cu11==11.7.99 76 | - nvidia-cudnn-cu11==8.5.0.96 77 | - pandas==1.3.5 78 | - pathy==0.6.2 79 | - pillow==9.3.0 80 | - preshed==3.0.8 81 | - protobuf==3.20.1 82 | - pydantic==1.8.2 83 | - pytz==2022.6 84 | - pyyaml==6.0 85 | - regex==2022.10.31 86 | - requests==2.28.1 87 | - sacremoses==0.0.53 88 | - scikit-learn==1.0.2 89 | - scipy==1.3.2 90 | - sentencepiece==0.1.85 91 | - sklearn==0.0 92 | - smart-open==5.2.1 93 | - spacy==3.1.0 94 | - spacy-legacy==3.0.10 95 | - srsly==2.4.5 96 | - tensorboardx==2.5.1 97 | - thinc==8.0.17 98 | - threadpoolctl==3.1.0 99 | - tokenizers==0.13.1 100 | - torch==1.13.0 101 | - torchaudio==0.13.0 102 | - torchvision==0.14.0 103 | - tqdm==4.41.0 104 | - transformers==4.24.0 105 | - typer==0.3.2 106 | - typing-extensions==3.10.0.2 107 | - urllib3==1.26.12 108 | - wasabi==0.10.1 109 | - zipp==3.10.0 110 | prefix: /home/ma/ma_ma/ma_rder/anaconda3/envs/ditto_env 111 | -------------------------------------------------------------------------------- /hiergat/hiergat_env.yml: -------------------------------------------------------------------------------- 1 | name: hiergat_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - _pytorch_select=0.2=gpu_0 8 | - 
backcall=0.2.0=pyhd3eb1b0_0 9 | - blas=1.0=mkl 10 | - ca-certificates=2022.10.11=h06a4308_0 11 | - certifi=2022.12.7=py37h06a4308_0 12 | - cffi=1.15.1=py37h74dc2b5_0 13 | - cudatoolkit=10.1.243=h6bb024c_0 14 | - cudnn=7.6.5=cuda10.1_0 15 | - debugpy=1.5.1=py37h295c915_0 16 | - decorator=5.1.1=pyhd3eb1b0_0 17 | - entrypoints=0.4=py37h06a4308_0 18 | - intel-openmp=2022.0.1=h06a4308_3633 19 | - ipykernel=6.15.2=py37h06a4308_0 20 | - ipython=7.31.1=py37h06a4308_1 21 | - jedi=0.18.1=py37h06a4308_1 22 | - jupyter_client=7.4.8=py37h06a4308_0 23 | - jupyter_core=4.11.2=py37h06a4308_0 24 | - ld_impl_linux-64=2.38=h1181459_1 25 | - libffi=3.4.2=h6a678d5_6 26 | - libgcc-ng=11.2.0=h1234567_1 27 | - libgomp=11.2.0=h1234567_1 28 | - libsodium=1.0.18=h7b6447c_0 29 | - libstdcxx-ng=11.2.0=h1234567_1 30 | - matplotlib-inline=0.1.6=py37h06a4308_0 31 | - mkl=2020.2=256 32 | - mkl-service=2.3.0=py37he8ac12f_0 33 | - mkl_fft=1.3.0=py37h54f3939_0 34 | - mkl_random=1.1.1=py37h0573a6f_0 35 | - ncurses=6.3=h5eee18b_3 36 | - nest-asyncio=1.5.6=py37h06a4308_0 37 | - ninja=1.10.2=h06a4308_5 38 | - ninja-base=1.10.2=hd09550d_5 39 | - numpy-base=1.19.2=py37hfa32c7d_0 40 | - openssl=1.1.1s=h7f8727e_0 41 | - packaging=22.0=py37h06a4308_0 42 | - parso=0.8.3=pyhd3eb1b0_0 43 | - pexpect=4.8.0=pyhd3eb1b0_3 44 | - pickleshare=0.7.5=pyhd3eb1b0_1003 45 | - pip=22.3.1=py37h06a4308_0 46 | - prompt-toolkit=3.0.36=py37h06a4308_0 47 | - psutil=5.9.0=py37h5eee18b_0 48 | - ptyprocess=0.7.0=pyhd3eb1b0_2 49 | - pycparser=2.21=pyhd3eb1b0_0 50 | - pygments=2.11.2=pyhd3eb1b0_0 51 | - python=3.7.15=h7a1cb2a_1 52 | - python-dateutil=2.8.2=pyhd3eb1b0_0 53 | - pytorch=1.4.0=cuda101py37h02f0884_0 54 | - pyzmq=23.2.0=py37h6a678d5_0 55 | - readline=8.2=h5eee18b_0 56 | - setuptools=65.6.3=py37h06a4308_0 57 | - six=1.16.0=pyhd3eb1b0_1 58 | - sqlite=3.40.1=h5082296_0 59 | - tk=8.6.12=h1ccaba5_0 60 | - tornado=6.2=py37h5eee18b_0 61 | - traitlets=5.7.1=py37h06a4308_0 62 | - wcwidth=0.2.5=pyhd3eb1b0_0 63 | - wheel=0.37.1=pyhd3eb1b0_0 64 | - xz=5.2.8=h5eee18b_0 65 | - zeromq=4.3.4=h2531618_0 66 | - zlib=1.2.13=h5eee18b_0 67 | - pip: 68 | - blis==0.4.1 69 | - boto3==1.24.56 70 | - botocore==1.27.56 71 | - catalogue==1.0.0 72 | - charset-normalizer==2.1.1 73 | - click==8.1.3 74 | - cymem==2.0.6 75 | - filelock==3.8.0 76 | - gensim==3.8.1 77 | - idna==3.3 78 | - importlib-metadata==4.12.0 79 | - jmespath==1.0.1 80 | - joblib==1.1.0 81 | - jsonlines==1.2.0 82 | - murmurhash==1.0.8 83 | - nltk==3.5 84 | - numpy==1.17.4 85 | - pandas==1.3.5 86 | - plac==1.1.3 87 | - preshed==3.0.7 88 | - protobuf==3.20.1 89 | - pytz==2022.7.1 90 | - regex==2019.12.20 91 | - requests==2.28.1 92 | - s3transfer==0.6.0 93 | - sacremoses==0.0.53 94 | - scikit-learn==1.0.2 95 | - scipy==1.3.2 96 | - sentencepiece==0.1.85 97 | - sklearn==0.0 98 | - smart-open==6.0.0 99 | - spacy==2.2.3 100 | - srsly==1.0.5 101 | - tensorboardx==2.0 102 | - thinc==7.3.1 103 | - threadpoolctl==3.1.0 104 | - tokenizers==0.5.2 105 | - tqdm==4.41.0 106 | - transformers==2.8.0 107 | - typing-extensions==4.3.0 108 | - urllib3==1.26.11 109 | - wasabi==0.10.1 110 | - zipp==3.8.1 111 | prefix: /home/ma/ma_ma/ma_rder/anaconda3/envs/hiergat_env 112 | -------------------------------------------------------------------------------- /hiergat/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import os 6 | from pathlib import Path 7 | import glob 8 | import gzip 9 | from copy import deepcopy 10 | 11 | 
import pickle 12 | 13 | import argparse 14 | import re 15 | import csv 16 | 17 | def combine_row(left, right, label): 18 | def func(row): 19 | col_names = left.columns # assume left and right always have same attributes 20 | list_ = ['COL' + ' ' + str(b) + ' ' + 'VAL' + ' ' + str(a) + ' ' for a, b in zip(row, col_names.values.tolist())] 21 | list_ = ''.join(str(m) for m in list_) 22 | return list_ 23 | 24 | left_list = list(map(func, left.values.tolist())) 25 | right_list = list(map(func, right.values.tolist())) 26 | label_list = [str(l) for l in label] 27 | 28 | left_df = pd.DataFrame({'left': pd.Series(left_list)}) 29 | right_df = pd.DataFrame({'right': pd.Series(right_list)}) 30 | label_df = pd.DataFrame({'label': pd.Series(label_list)}) 31 | 32 | # using tab separator here 33 | # https://github.com/megagonlabs/ditto - \t \t
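# A minimal illustration of the serialization that combine_row's inner func produces,
# assuming hypothetical toy column names and values (not taken from the corpus):
#   row = ['iphone 7 32gb', 'apple'] with columns ['title', 'brand']
#   func(row) -> 'COL title VAL iphone 7 32gb COL brand VAL apple '
# Each left/right record is serialized this way; following the Ditto reference above,
# a pair is presumably written out as one tab-separated line: left '\t' right '\t' label.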