├── src ├── models │ ├── .gitkeep │ └── __init__.py ├── contrastive │ ├── lspc │ │ ├── run_pretraining.sh │ │ ├── run_finetune_baseline.sh │ │ ├── run_finetune_baseline_multi.sh │ │ ├── run_finetune_siamese.sh │ │ └── run_finetune_multi.sh │ ├── models │ │ ├── metrics.py │ │ └── loss.py │ └── data │ │ └── data_collators.py ├── data │ ├── download_datasets.py │ └── utils.py └── processing │ ├── contrastive │ └── prepare-data.py │ ├── process-wordocc │ └── process-to-wordocc-multi.py │ ├── process-magellan │ └── process_to_magellan.py │ └── process-wordcooc │ └── process-to-wordcooc.py ├── hiergat ├── model │ ├── __init__.py │ ├── eval.py │ ├── ceval.py │ ├── summarize.py │ ├── layer.py │ ├── model.py │ ├── cmodel.py │ └── dataset.py ├── .gitignore ├── README.md ├── all_runs.py ├── result_collection.ipynb ├── hiergat_env.yml ├── dataset.py ├── train.py ├── train_n.py └── task.json ├── ditto ├── ditto_light │ ├── __init__.py │ ├── exceptions.py │ ├── dataset.py │ ├── augment.py │ ├── summarize.py │ ├── knowledge.py │ └── ditto.py ├── .gitignore ├── .gitmodules ├── README.md ├── all_runs.py ├── blocking │ ├── README.md │ ├── train_blocker.py │ └── blocker.py ├── result_collection.ipynb ├── ditto_env.yml ├── train_ditto.py ├── configs.json └── LICENSE ├── notebooks └── processing │ └── benchmark2020 │ ├── .gitkeep │ └── dbscan-clustering.ipynb ├── setup.py ├── LICENSE ├── .gitignore ├── preprocess_data_for_ditto_hiergat.py └── README.md /src/models/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hiergat/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ditto/ditto_light/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/processing/benchmark2020/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hiergat/.gitignore: -------------------------------------------------------------------------------- 1 | ._* 2 | /checkpoints 3 | /report 4 | -------------------------------------------------------------------------------- /ditto/.gitignore: -------------------------------------------------------------------------------- 1 | ditto_light/__pycache__/* 2 | report/ 3 | results_wdc3/ 4 | -------------------------------------------------------------------------------- /ditto/.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "blocking/sentence-transformers"] 2 | path = blocking/sentence-transformers 3 | url = https://github.com/UKPLab/sentence-transformers 4 | -------------------------------------------------------------------------------- /ditto/ditto_light/exceptions.py: -------------------------------------------------------------------------------- 1 | class ModelNotFoundError(Exception): 2 | def __init__(self, path): 3 | super().__init__("Model {} was not found".format(path)) 4 | 
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='src', 5 | packages=find_packages(), 6 | version='0.1.0', 7 | description='Data Integration Research', 8 | author='Ralph Peeters', 9 | license='BSD-3', 10 | ) 11 | -------------------------------------------------------------------------------- /hiergat/README.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | * Python 3.7 3 | * PyTorch 1.4 4 | * HuggingFace Transformers 5 | * NLTK (for 1-N ER problem) 6 | 7 | Install required packages 8 | ``` 9 | conda env create -f hiergat_env.yml 10 | ``` 11 | 12 | ## Activate corresponding environment 13 | ``` 14 | conda deactivate 15 | conda activate hiergat_env 16 | ``` 17 | 18 | ## To train and test hiergat 19 | ``` 20 | python all_runs.py 21 | ``` -------------------------------------------------------------------------------- /ditto/README.md: -------------------------------------------------------------------------------- 1 | ## Requirements 2 | * Python 3.7.7 3 | * PyTorch 1.9 4 | * HuggingFace Transformers 4.9.2 5 | * Spacy with the ``en_core_web_lg`` models 6 | 7 | Install required packages 8 | ``` 9 | conda env create -f ditto_env.yml 10 | conda activate ditto_env 11 | python -m spacy download en_core_web_lg 12 | ``` 13 | 14 | ## Activate corresponding environment 15 | ``` 16 | conda deactivate 17 | conda activate ditto_env 18 | ``` 19 | 20 | ## To train and test ditto 21 | ``` 22 | python all_runs.py 23 | ``` -------------------------------------------------------------------------------- /hiergat/all_runs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | sizes = ['small', 'medium', 'large'] 4 | difficulties = ['20cc80rnd', '50cc50rnd', '80cc20rnd'] 5 | unseens = ['000un', '050un', '100un'] 6 | 7 | for seed in range(3): 8 | for size in sizes: 9 | for difficulty in difficulties: 10 | for unseen in unseens: 11 | cmd = """CUDA_VISIBLE_DEVICES=3 python train.py \ 12 | --task final_%s_%s%s \ 13 | --run_id %d \ 14 | --batch_size 16 \ 15 | --max_len 256 \ 16 | --lr 5e-6 \ 17 | --n_epochs 50 \ 18 | --finetuning \ 19 | --split \ 20 | --lm roberta""" % (size, difficulty, unseen, seed) 21 | print(cmd) 22 | os.system(cmd) 23 | -------------------------------------------------------------------------------- /ditto/all_runs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | sizes = ['small', 'medium', 'large'] 4 | difficulties = ['20cc80rnd', '50cc50rnd', '80cc20rnd'] 5 | unseens = ['000un', '050un', '100un'] 6 | 7 | for seed in range(3): 8 | for size in sizes: 9 | for difficulty in difficulties: 10 | for unseen in unseens: 11 | cmd = """CUDA_VISIBLE_DEVICES=2 python train_ditto.py \ 12 | --task final_%s_%s%s \ 13 | --logdir results_wdc3/ \ 14 | --run_id %d \ 15 | --batch_size 64 \ 16 | --max_len 256 \ 17 | --lr 5e-5 \ 18 | --n_epochs 50 \ 19 | --finetuning \ 20 | --lm roberta \ 21 | --da del""" % (size, difficulty, unseen, seed) 22 | print(cmd) 23 | os.system(cmd) -------------------------------------------------------------------------------- /src/contrastive/lspc/run_pretraining.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH 
--nodes=1 5 | #SBATCH --time=23:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | TEMP=$5 12 | CATEGORY=$6 13 | SIZE=$7 14 | AUG=$8 15 | python run_pretraining.py \ 16 | --do_train \ 17 | --train_file ../../data/processed/wdc-lspc/contrastive/pre-train/$CATEGORY/${CATEGORY}_train_$SIZE.pkl.gz \ 18 | --id_deduction_set ../../data/raw/wdc-lspc/training-sets/${CATEGORY}_train_$SIZE.json.gz \ 19 | --tokenizer=$MODEL \ 20 | --grad_checkpoint=$CHECKPOINT \ 21 | --output_dir ../../reports/contrastive/$CATEGORY-$SIZE-$AUG$BATCH-$LR-$TEMP-${MODEL##*/}/ \ 22 | --temperature=$TEMP \ 23 | --per_device_train_batch_size=$BATCH \ 24 | --learning_rate=$LR \ 25 | --weight_decay=0.01 \ 26 | --num_train_epochs=200 \ 27 | --lr_scheduler_type="linear" \ 28 | --warmup_ratio=0.05 \ 29 | --max_grad_norm=1.0 \ 30 | --fp16 \ 31 | --dataloader_num_workers=4 \ 32 | --disable_tqdm=True \ 33 | --save_strategy="epoch" \ 34 | --logging_strategy="epoch" \ 35 | --augment=$AUG \ -------------------------------------------------------------------------------- /src/contrastive/lspc/run_finetune_baseline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=12:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | CATEGORY=$5 12 | SIZE=$6 13 | AUG=$7 14 | python run_finetune_baseline.py \ 15 | --do_train \ 16 | --train_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 17 | --train_size=$SIZE \ 18 | --validation_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 19 | --test_file ../../data/interim/wdc-lspc/gold-standards/preprocessed_${CATEGORY}_gs.pkl.gz \ 20 | --evaluation_strategy=epoch \ 21 | --tokenizer=$MODEL \ 22 | --grad_checkpoint=$CHECKPOINT \ 23 | --output_dir ../../reports/baseline/$CATEGORY-$SIZE-$AUG$BATCH-$LR-${MODEL##*/}/ \ 24 | --per_device_train_batch_size=$BATCH \ 25 | --learning_rate=$LR \ 26 | --weight_decay=0.01 \ 27 | --num_train_epochs=50 \ 28 | --lr_scheduler_type="linear" \ 29 | --warmup_ratio=0.05 \ 30 | --max_grad_norm=1.0 \ 31 | --fp16 \ 32 | --metric_for_best_model=f1 \ 33 | --dataloader_num_workers=4 \ 34 | --disable_tqdm=True \ 35 | --save_strategy="epoch" \ 36 | --load_best_model_at_end \ 37 | --augment=$AUG \ 38 | #--do_param_opt \ -------------------------------------------------------------------------------- /src/contrastive/lspc/run_finetune_baseline_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=12:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | CATEGORY=$5 12 | SIZE=$6 13 | AUG=$7 14 | python run_finetune_baseline_multi.py \ 15 | --do_train \ 16 | --train_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 17 | --train_size=$SIZE \ 18 | --validation_file ../../data/interim/wdc-lspc/validation-sets/preprocessed_${CATEGORY}_valid_$SIZE.pkl.gz \ 19 | --test_file ../../data/interim/wdc-lspc/gold-standards/preprocessed_${CATEGORY}_gs.pkl.gz \ 20 | --evaluation_strategy=epoch \ 21 | --tokenizer=$MODEL \ 22 | --grad_checkpoint=$CHECKPOINT \ 23 | --output_dir ../../reports/baseline-multi/$CATEGORY-$SIZE-$AUG$BATCH-$LR-${MODEL##*/}/ \ 24 | 
--per_device_train_batch_size=$BATCH \ 25 | --learning_rate=$LR \ 26 | --weight_decay=0.01 \ 27 | --num_train_epochs=50 \ 28 | --lr_scheduler_type="linear" \ 29 | --warmup_ratio=0.05 \ 30 | --max_grad_norm=1.0 \ 31 | --fp16 \ 32 | --metric_for_best_model=f1_micro \ 33 | --dataloader_num_workers=4 \ 34 | --disable_tqdm=True \ 35 | --save_strategy="epoch" \ 36 | --load_best_model_at_end \ 37 | --augment=$AUG \ 38 | #--do_param_opt \ -------------------------------------------------------------------------------- /src/contrastive/lspc/run_finetune_siamese.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=12:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | TEMP=$5 12 | FROZEN=$6 13 | CATEGORY=$7 14 | SIZE=$8 15 | AUG=$9 16 | PREAUG=${10} 17 | python run_finetune_siamese.py \ 18 | --model_pretrained_checkpoint ../../reports/contrastive/$CATEGORY-$SIZE-$PREAUG$BATCH-$LR-$TEMP-${MODEL##*/}/pytorch_model.bin \ 19 | --do_train \ 20 | --frozen=$FROZEN \ 21 | --train_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 22 | --train_size=$SIZE \ 23 | --validation_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 24 | --test_file ../../data/interim/wdc-lspc/gold-standards/preprocessed_${CATEGORY}_gs.pkl.gz \ 25 | --evaluation_strategy=epoch \ 26 | --tokenizer=$MODEL \ 27 | --grad_checkpoint=$CHECKPOINT \ 28 | --output_dir ../../reports/contrastive-ft-siamese/$CATEGORY-$SIZE-$AUG$BATCH-$PREAUG$LR-$TEMP-$FROZEN-${MODEL##*/}/ \ 29 | --per_device_train_batch_size=64 \ 30 | --learning_rate=$LR \ 31 | --weight_decay=0.01 \ 32 | --num_train_epochs=50 \ 33 | --lr_scheduler_type="linear" \ 34 | --warmup_ratio=0.05 \ 35 | --max_grad_norm=1.0 \ 36 | --fp16 \ 37 | --metric_for_best_model=loss \ 38 | --dataloader_num_workers=4 \ 39 | --disable_tqdm=True \ 40 | --save_strategy="epoch" \ 41 | --load_best_model_at_end \ 42 | --augment=$AUG \ 43 | #--do_param_opt \ -------------------------------------------------------------------------------- /src/contrastive/lspc/run_finetune_multi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=gpu_8 3 | #SBATCH --gres=gpu:1 4 | #SBATCH --nodes=1 5 | #SBATCH --time=12:00:00 6 | #SBATCH --export=NONE 7 | MODEL=$1 8 | CHECKPOINT=$2 9 | BATCH=$3 10 | LR=$4 11 | TEMP=$5 12 | FROZEN=$6 13 | CATEGORY=$7 14 | SIZE=$8 15 | AUG=$9 16 | PREAUG=${10} 17 | python run_finetune_multi.py \ 18 | --model_pretrained_checkpoint ../../reports/contrastive/${CATEGORY/multi/}-$SIZE-$PREAUG$BATCH-$LR-$TEMP-${MODEL##*/}/pytorch_model.bin \ 19 | --do_train \ 20 | --frozen=$FROZEN \ 21 | --train_file ../../data/interim/wdc-lspc/training-sets/preprocessed_${CATEGORY}_train_$SIZE.pkl.gz \ 22 | --train_size=$SIZE \ 23 | --validation_file ../../data/interim/wdc-lspc/validation-sets/preprocessed_${CATEGORY}_valid_$SIZE.pkl.gz \ 24 | --test_file ../../data/interim/wdc-lspc/gold-standards/preprocessed_${CATEGORY}_gs.pkl.gz \ 25 | --evaluation_strategy=epoch \ 26 | --tokenizer=$MODEL \ 27 | --grad_checkpoint=$CHECKPOINT \ 28 | --output_dir ../../reports/contrastive-ft-multi/$CATEGORY-$SIZE-$AUG$BATCH-$PREAUG$LR-$TEMP-$FROZEN-${MODEL##*/}/ \ 29 | --per_device_train_batch_size=64 \ 30 | --learning_rate=$LR \ 31 | --weight_decay=0.01 \ 32 | --num_train_epochs=50 \ 33 | 
--lr_scheduler_type="linear" \ 34 | --warmup_ratio=0.05 \ 35 | --max_grad_norm=1.0 \ 36 | --fp16 \ 37 | --metric_for_best_model=loss \ 38 | --dataloader_num_workers=4 \ 39 | --disable_tqdm=True \ 40 | --save_strategy="epoch" \ 41 | --load_best_model_at_end \ 42 | --augment=$AUG \ 43 | #--do_param_opt \ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2019, Ralph Peeters 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, this 12 | list of conditions and the following disclaimer in the documentation and/or 13 | other materials provided with the distribution. 14 | 15 | * Neither the name of di-research nor the names of its 16 | contributors may be used to endorse or promote products derived from this 17 | software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 23 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 24 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 26 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 27 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 28 | OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | 30 | -------------------------------------------------------------------------------- /src/data/download_datasets.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from zipfile import ZipFile 3 | from pathlib import Path 4 | 5 | DATASETS = [ 6 | 'http://data.dws.informatik.uni-mannheim.de/largescaleproductcorpus/data/wdc-products/data.zip' 7 | ] 8 | 9 | 10 | def download_datasets(): 11 | for link in DATASETS: 12 | 13 | '''iterate through all links in DATASETS 14 | and download them one by one''' 15 | 16 | # obtain filename by splitting url and getting 17 | # last string 18 | file_name = link.split('/')[-1] 19 | 20 | print("Downloading file:%s" % file_name) 21 | 22 | # create response object 23 | r = requests.get(link, stream=True) 24 | 25 | # download started 26 | with open(f'../../{file_name}', 'wb') as f: 27 | for chunk in r.iter_content(chunk_size=1024 * 1024): 28 | if chunk: 29 | f.write(chunk) 30 | 31 | print("%s downloaded!\n" % file_name) 32 | 33 | print("All files downloaded!") 34 | return 35 | 36 | 37 | def unzip_files(): 38 | for link in DATASETS: 39 | file_name = link.split('/')[-1] 40 | # opening the zip file in READ mode 41 | with ZipFile(f'../../{file_name}', 'r') as zip: 42 | # printing all the contents of the zip file 43 | zip.printdir() 44 | 45 | # extracting all the files 46 | print('Extracting all the files now...') 47 | zip.extractall(path='../../') 48 | print('Done!') 49 | 50 | 51 | if __name__ == "__main__": 52 | Path('../../data/').mkdir(parents=True, exist_ok=True) 53 | download_datasets() 54 | unzip_files() 55 | -------------------------------------------------------------------------------- /ditto/blocking/README.md: -------------------------------------------------------------------------------- 1 | # The optional SentenceBERT fine-tuning for advanced blocking 2 | 3 | We leverage the [Sentence Transformers](https://github.com/UKPLab/sentence-transformers) library to fine-tune the LMs for entity record representation. 4 | 5 | ## Train the advanced blocking model 6 | 7 | The following command fine-tunes the BERT model on an entity pair dataset to generate vector representations of entity records: 8 | ``` 9 | CUDA_VISIBLE_DEVICES=0 python train_blocker.py \ 10 | --train_fn ../data/er_magellan/Structured/Beer/train.txt \ 11 | --valid_fn ../data/er_magellan/Structured/Beer/valid.txt \ 12 | --model_fn model.pth \ 13 | --batch_size 64 \ 14 | --n_epochs 40 \ 15 | --lm bert \ 16 | --fp16 17 | ``` 18 | 19 | Parameters: 20 | * ``--train_fn``: the training dataset (serialized) 21 | * ``--valid_fn``: the validation dataset (serialized) 22 | * ``--model_fn``: the path to the output model (see sentence-transformers) 23 | * ``--batch_size``, ``--n_epochs``, ``--lm``: batch size, number of epochs, the language model 24 | * ``--fp16``: whether to train with fp16 acceleration 25 | 26 | ## Run the blocking model 27 | 28 | To run the trained blocking model: 29 | ``` 30 | CUDA_VISIBLE_DEVICES=0 python blocker.py \ 31 | --input_path input/ \ 32 | --left_fn table_a.txt \ 33 | --right_fn table_b.txt \ 34 | --output_fn candidates.jsonl \ 35 | --model_fn model.pth \ 36 | --k 10 37 | ``` 38 | where 39 | * ``--input_path``, ``--left_fn``, ``--right_fn``: the path to the data directory and the names of the two files it contains, ``left_fn`` and ``right_fn``. 
The two files are serialized and contain one entry per line 40 | * ``--output_fn``: the output file in jsonline format 41 | * ``--model_fn``: the trained model 42 | * ``--k`` (optional): if this parameter is set, then the candidates will be the top-k most similar entries for each row in ``right_fn`` 43 | * ``--threshold`` (optional): if this parameter is set, then the candidates will be all entry pairs of similarity above the threshold 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *.cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # DotEnv configuration 60 | .env 61 | 62 | # Database 63 | *.db 64 | *.rdb 65 | 66 | # Pycharm 67 | .idea 68 | 69 | # VS Code 70 | .vscode/ 71 | 72 | # Spyder 73 | .spyproject/ 74 | 75 | # Jupyter NB Checkpoints 76 | .ipynb_checkpoints/ 77 | 78 | # exclude data from source control by default 79 | /data/ 80 | 81 | # exclude cache from source control by default 82 | /cache/ 83 | 84 | # exclude models from source control by default 85 | /models/ 86 | 87 | /reports/baseline/ 88 | /reports/baseline-multi/ 89 | /reports/contrastive/ 90 | /reports/contrastive-ft-siamese/ 91 | /reports/contrastive-ft-multi/ 92 | /reports/contrastive-ft-siamese-preaug/ 93 | /reports/contrastive-blocking-archive/ 94 | /reports/matrix/ 95 | /reports/contrastive-onlywrong/ 96 | /reports/contrastive-ft-siamese-onlywrong/ 97 | /fasttext/ 98 | 99 | # Mac OS-specific storage files 100 | .DS_Store 101 | 102 | # vim 103 | *.swp 104 | *.swo 105 | 106 | # Mypy cache 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /hiergat/result_collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "982e523c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import re \n", 12 | "import pandas as pd\n", 13 | "import json\n", 14 | "import ast\n", 15 | "directory = os.fsencode('./output')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "5fdc7b80", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "result_list = []\n", 26 | "for file in os.listdir(directory):\n", 27 | " filename = os.fsdecode(file)\n", 28 | " if (filename.endswith(\".txt\") and filename.startswith(\"final\")): \n", 29 | " with open(os.path.join(directory, file), \"r\") as myfile: \n", 30 | " dictionary = 
ast.literal_eval(myfile.read())\n", 31 | " f1 = (\"{:.2f}\".format(dictionary['best_test_f1'] * 100))\n", 32 | " filename = filename[6:-4]\n", 33 | " regexp_1 = re.compile(r\"(.*)_lr=5e-06_id=(.*)_batch=16\")\n", 34 | " re_match = regexp_1.match(filename)\n", 35 | " if re_match:\n", 36 | " list_match = list(re_match.groups())\n", 37 | " list_match.append(float(f1))\n", 38 | " result_list.append(list_match)\n", 39 | "\n", 40 | "df = pd.DataFrame(result_list, columns=['data', 'id', 'f1'])\n", 41 | "df = df.sort_values(by=['data', 'id'])\n", 42 | "df = df.groupby(['data'])['f1'].mean().reset_index()" 43 | ] 44 | } 45 | ], 46 | "metadata": { 47 | "kernelspec": { 48 | "display_name": "Python 3 (ipykernel)", 49 | "language": "python", 50 | "name": "python3" 51 | }, 52 | "language_info": { 53 | "codemirror_mode": { 54 | "name": "ipython", 55 | "version": 3 56 | }, 57 | "file_extension": ".py", 58 | "mimetype": "text/x-python", 59 | "name": "python", 60 | "nbconvert_exporter": "python", 61 | "pygments_lexer": "ipython3", 62 | "version": "3.9.12" 63 | }, 64 | "vscode": { 65 | "interpreter": { 66 | "hash": "88691fdbb5fc570d622944683b753479941aed12af1b86d101217ab42f5d39c2" 67 | } 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 5 72 | } 73 | -------------------------------------------------------------------------------- /src/data/utils.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | np.random.seed(42) 4 | import random 5 | random.seed(42) 6 | 7 | import nltk 8 | from nltk import PorterStemmer 9 | from nltk.corpus import stopwords 10 | 11 | from copy import deepcopy 12 | 13 | from gensim.parsing.preprocessing import lower_to_unicode, preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric 14 | 15 | import re 16 | PATTERN1 = re.compile("\"@\S+\s+") 17 | PATTERN2 = re.compile("\s+") 18 | CUSTOM_FILTERS = [strip_tags, strip_multiple_whitespaces] 19 | 20 | def clean_string_wdcv2(words): 21 | if not words: 22 | return None 23 | words = words.partition('"')[2] 24 | words = words.rpartition('"')[0] 25 | words = re.sub(PATTERN1, ' ', words) 26 | words = re.sub(PATTERN2, ' ', words) 27 | words = words.replace('"', '') 28 | words = words.strip() 29 | return words 30 | 31 | def clean_string_2020(words): 32 | if not words: 33 | return None 34 | words = preprocess_string(words, CUSTOM_FILTERS) 35 | words = ' '.join(words) 36 | return words 37 | 38 | def clean_specTableContent_wdcv2(words): 39 | if not words: 40 | return None 41 | words = re.sub(PATTERN2, ' ', words) 42 | words = words.strip() 43 | return words 44 | 45 | def tokenize(words, delimiter=None): 46 | #check for NaN 47 | if isinstance(words, float): 48 | if words != words: 49 | return [] 50 | words = str(words) 51 | return words.split(sep=delimiter) 52 | 53 | def remove_stopwords(words, lower=False): 54 | #check for NaN 55 | if isinstance(words, float): 56 | if words != words: 57 | return words 58 | stop_words_list = deepcopy(stopwords.words('english')) 59 | if lower: 60 | stop_words_list = list(map(lambda x: x.lower(), stop_words_list)) 61 | word_list = tokenize(words) 62 | word_list_stopwords_removed = [x for x in word_list if x not in stop_words_list] 63 | words_processed = ' '.join(word_list_stopwords_removed) 64 | return words_processed 65 | 66 | def stem(words): 67 | stemmer = PorterStemmer() 68 | word_list = tokenize(words) 69 | stemmed_words = [stemmer.stem(x) for x in word_list] 70 | words_processed = ' 
'.join(stemmed_words) 71 | return words_processed -------------------------------------------------------------------------------- /hiergat/model/eval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import os 3 | import numpy as np 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import sklearn.metrics as metrics 7 | 8 | import time 9 | 10 | 11 | def eval_classifier(model, iterator): 12 | model.eval() 13 | 14 | Y = [] 15 | Y_hat = [] 16 | loss_list = [] 17 | total_size = 0 18 | with torch.no_grad(): 19 | for i, batch in enumerate(iterator): 20 | _, x, y, _, masks = batch 21 | logits, y1, y_hat = model(x, y, masks) 22 | 23 | logits = logits.view(-1, logits.shape[-1]) 24 | y1 = y1.view(-1) 25 | loss = nn.CrossEntropyLoss()(logits, y1) 26 | 27 | loss_list.append(loss.item() * y.shape[0]) 28 | total_size += y.shape[0] 29 | 30 | Y.extend(y.numpy().tolist()) 31 | Y_hat.extend(y_hat.cpu().numpy().tolist()) 32 | 33 | loss = sum(loss_list) / total_size 34 | print("======================================") 35 | 36 | accuracy = metrics.accuracy_score(Y, Y_hat) 37 | precision = metrics.precision_score(Y, Y_hat) 38 | recall = metrics.recall_score(Y, Y_hat) 39 | f1 = metrics.f1_score(Y, Y_hat) 40 | print("accuracy=%.4f" % accuracy) 41 | print("precision=%.4f" % precision) 42 | print("recall=%.4f" % recall) 43 | print("f1=%.4f" % f1) 44 | print("======================================") 45 | 46 | return accuracy, precision, recall, f1, loss 47 | 48 | 49 | def eval_on_task(epoch, model, valid_iter, test_iter, 50 | writer, run_tag): 51 | print('Validation:') 52 | start = time.time() 53 | v_output = eval_classifier(model, valid_iter) 54 | print("valid time: ", time.time()-start) 55 | 56 | print('Test:') 57 | t_output = eval_classifier(model, test_iter) 58 | 59 | acc, prec, recall, f1, v_loss = v_output 60 | t_acc, t_prec, t_recall, t_f1, t_loss = t_output 61 | scalars = {'acc': acc, 62 | 'precision': prec, 63 | 'recall': recall, 64 | 'f1': f1, 65 | 'v_loss': v_loss, 66 | 't_acc': t_acc, 67 | 't_precision': t_prec, 68 | 't_recall': t_recall, 69 | 't_f1': t_f1, 70 | 't_loss': t_loss} 71 | 72 | # logging 73 | writer.add_scalars(run_tag, scalars, epoch) 74 | return f1, t_f1 75 | -------------------------------------------------------------------------------- /ditto/result_collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "982e523c", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "import re \n", 12 | "import pandas as pd\n", 13 | "import json\n", 14 | "import ast\n", 15 | "directory = os.fsencode('./output')" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "5fdc7b80", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "result_list = []\n", 26 | "for file in os.listdir(directory):\n", 27 | " filename = os.fsdecode(file)\n", 28 | " if filename.endswith(\".txt\"): \n", 29 | " with open(os.path.join(directory, file), \"r\") as myfile: \n", 30 | " dictionary = ast.literal_eval(myfile.read())\n", 31 | " f1 = (\"{:.2f}\".format(dictionary['best_f1'] * 100))\n", 32 | "\n", 33 | " filename = filename[6:-4]\n", 34 | "\n", 35 | " regexp_1 = re.compile(r\"(.*)un_lm=roberta_da=del_dk=None_su=False_size=None_id=(.*)\")\n", 36 | " re_match = regexp_1.match(filename)\n", 37 | " if (re_match):\n", 38 | " list_match = 
list(re_match.groups())\n", 39 | " list_match.append(float(f1))\n", 40 | " result_list.append(list_match)\n", 41 | "\n", 42 | "df = pd.DataFrame(result_list, columns=['data', 'id', 'f1'])\n", 43 | "df = df.sort_values(by=['data', 'id'])\n", 44 | "df = df.groupby(['data'])['f1'].mean().reset_index()\n", 45 | "\n", 46 | "df.to_csv('results.csv', encoding='utf-8', index=False)" 47 | ] 48 | } 49 | ], 50 | "metadata": { 51 | "kernelspec": { 52 | "display_name": "Python 3 (ipykernel)", 53 | "language": "python", 54 | "name": "python3" 55 | }, 56 | "language_info": { 57 | "codemirror_mode": { 58 | "name": "ipython", 59 | "version": 3 60 | }, 61 | "file_extension": ".py", 62 | "mimetype": "text/x-python", 63 | "name": "python", 64 | "nbconvert_exporter": "python", 65 | "pygments_lexer": "ipython3", 66 | "version": "3.9.12" 67 | }, 68 | "vscode": { 69 | "interpreter": { 70 | "hash": "378f5ca2fb65fb71205b60ca0e5dd58b8abec09bd391cd47886dadc212764ff3" 71 | } 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 5 76 | } 77 | -------------------------------------------------------------------------------- /hiergat/model/ceval.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import sklearn.metrics as metrics 4 | 5 | import time 6 | 7 | 8 | def eval_classifier(model, iterator, su_iterator): 9 | model.eval() 10 | 11 | Y = [] 12 | Y_hat = [] 13 | loss_list = [] 14 | total_size = 0 15 | with torch.no_grad(): 16 | for i, (batch, su_batch) in enumerate(zip(iterator, su_iterator)): 17 | _, x, y, _, masks = batch 18 | _, _, z, _, _, _ = su_batch 19 | logits, y1, y_hat = model(x, z, y, masks) 20 | 21 | logits = logits.view(-1, logits.shape[-1]) 22 | y1 = y1.view(-1) 23 | loss = nn.CrossEntropyLoss()(logits, y1) 24 | 25 | loss_list.append(loss.item() * y.shape[0]) 26 | total_size += y.shape[0] 27 | 28 | Y.extend(y.numpy().tolist()) 29 | Y_hat.extend(y_hat.cpu().numpy().tolist()) 30 | 31 | loss = sum(loss_list) / total_size 32 | print("======================================") 33 | 34 | accuracy = metrics.accuracy_score(Y, Y_hat) 35 | precision = metrics.precision_score(Y, Y_hat) 36 | recall = metrics.recall_score(Y, Y_hat) 37 | f1 = metrics.f1_score(Y, Y_hat) 38 | print("accuracy=%.4f" % accuracy) 39 | print("precision=%.4f" % precision) 40 | print("recall=%.4f" % recall) 41 | print("f1=%.4f" % f1) 42 | print("======================================") 43 | 44 | return accuracy, precision, recall, f1, loss 45 | 46 | 47 | def eval_on_task(epoch, model, valid_iter, test_iter, valid_su_iter, test_su_iter, 48 | writer, run_tag): 49 | print('Validation:') 50 | start = time.time() 51 | v_output = eval_classifier(model, valid_iter, valid_su_iter) 52 | print("valid time: ", time.time() - start) 53 | 54 | print('Test:') 55 | t_output = eval_classifier(model, test_iter, test_su_iter) 56 | 57 | acc, prec, recall, f1, v_loss = v_output 58 | t_acc, t_prec, t_recall, t_f1, t_loss = t_output 59 | scalars = {'acc': acc, 60 | 'precision': prec, 61 | 'recall': recall, 62 | 'f1': f1, 63 | 'v_loss': v_loss, 64 | 't_acc': t_acc, 65 | 't_precision': t_prec, 66 | 't_recall': t_recall, 67 | 't_f1': t_f1, 68 | 't_loss': t_loss} 69 | 70 | # logging 71 | writer.add_scalars(run_tag, scalars, epoch) 72 | return f1, t_f1 73 | -------------------------------------------------------------------------------- /ditto/ditto_env.yml: -------------------------------------------------------------------------------- 1 | name: ditto_env 2 | channels: 3 | - defaults 4 | 
dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - backcall=0.2.0=pyhd3eb1b0_0 8 | - ca-certificates=2022.10.11=h06a4308_0 9 | - certifi=2022.9.24=py37h06a4308_0 10 | - cudatoolkit=11.3.1=h2bc3f7f_2 11 | - debugpy=1.5.1=py37h295c915_0 12 | - decorator=5.1.1=pyhd3eb1b0_0 13 | - entrypoints=0.4=py37h06a4308_0 14 | - ipykernel=6.15.2=py37h06a4308_0 15 | - ipython=7.31.1=py37h06a4308_1 16 | - jedi=0.18.1=py37h06a4308_1 17 | - jupyter_client=7.3.5=py37h06a4308_0 18 | - jupyter_core=4.11.2=py37h06a4308_0 19 | - ld_impl_linux-64=2.38=h1181459_1 20 | - libffi=3.3=he6710b0_2 21 | - libgcc-ng=11.2.0=h1234567_1 22 | - libgomp=11.2.0=h1234567_1 23 | - libsodium=1.0.18=h7b6447c_0 24 | - libstdcxx-ng=11.2.0=h1234567_1 25 | - matplotlib-inline=0.1.6=py37h06a4308_0 26 | - ncurses=6.3=h5eee18b_3 27 | - nest-asyncio=1.5.5=py37h06a4308_0 28 | - openssl=1.1.1s=h7f8727e_0 29 | - packaging=21.3=pyhd3eb1b0_0 30 | - parso=0.8.3=pyhd3eb1b0_0 31 | - pexpect=4.8.0=pyhd3eb1b0_3 32 | - pickleshare=0.7.5=pyhd3eb1b0_1003 33 | - pip=22.2.2=py37h06a4308_0 34 | - prompt-toolkit=3.0.20=pyhd3eb1b0_0 35 | - psutil=5.9.0=py37h5eee18b_0 36 | - ptyprocess=0.7.0=pyhd3eb1b0_2 37 | - pygments=2.11.2=pyhd3eb1b0_0 38 | - pyparsing=3.0.9=py37h06a4308_0 39 | - python=3.7.15=haa1d7c7_0 40 | - python-dateutil=2.8.2=pyhd3eb1b0_0 41 | - pyzmq=23.2.0=py37h6a678d5_0 42 | - readline=8.2=h5eee18b_0 43 | - setuptools=65.5.0=py37h06a4308_0 44 | - six=1.16.0=pyhd3eb1b0_1 45 | - sqlite=3.39.3=h5082296_0 46 | - tk=8.6.12=h1ccaba5_0 47 | - tornado=6.2=py37h5eee18b_0 48 | - traitlets=5.1.1=pyhd3eb1b0_0 49 | - wcwidth=0.2.5=pyhd3eb1b0_0 50 | - wheel=0.37.1=pyhd3eb1b0_0 51 | - xz=5.2.6=h5eee18b_0 52 | - zeromq=4.3.4=h2531618_0 53 | - zlib=1.2.13=h5eee18b_0 54 | - pip: 55 | - blis==0.7.9 56 | - catalogue==2.0.8 57 | - charset-normalizer==2.1.1 58 | - click==7.1.2 59 | - cymem==2.0.7 60 | - filelock==3.8.0 61 | - fuzzywuzzy==0.18.0 62 | - gensim==3.8.1 63 | - huggingface-hub==0.10.1 64 | - idna==3.4 65 | - importlib-metadata==5.0.0 66 | - jinja2==3.1.2 67 | - joblib==1.2.0 68 | - jsonlines==1.2.0 69 | - markupsafe==2.1.1 70 | - murmurhash==1.0.9 71 | - nltk==3.7 72 | - numpy==1.19.2 73 | - nvidia-cublas-cu11==11.10.3.66 74 | - nvidia-cuda-nvrtc-cu11==11.7.99 75 | - nvidia-cuda-runtime-cu11==11.7.99 76 | - nvidia-cudnn-cu11==8.5.0.96 77 | - pandas==1.3.5 78 | - pathy==0.6.2 79 | - pillow==9.3.0 80 | - preshed==3.0.8 81 | - protobuf==3.20.1 82 | - pydantic==1.8.2 83 | - pytz==2022.6 84 | - pyyaml==6.0 85 | - regex==2022.10.31 86 | - requests==2.28.1 87 | - sacremoses==0.0.53 88 | - scikit-learn==1.0.2 89 | - scipy==1.3.2 90 | - sentencepiece==0.1.85 91 | - sklearn==0.0 92 | - smart-open==5.2.1 93 | - spacy==3.1.0 94 | - spacy-legacy==3.0.10 95 | - srsly==2.4.5 96 | - tensorboardx==2.5.1 97 | - thinc==8.0.17 98 | - threadpoolctl==3.1.0 99 | - tokenizers==0.13.1 100 | - torch==1.13.0 101 | - torchaudio==0.13.0 102 | - torchvision==0.14.0 103 | - tqdm==4.41.0 104 | - transformers==4.24.0 105 | - typer==0.3.2 106 | - typing-extensions==3.10.0.2 107 | - urllib3==1.26.12 108 | - wasabi==0.10.1 109 | - zipp==3.10.0 110 | prefix: /home/ma/ma_ma/ma_rder/anaconda3/envs/ditto_env 111 | -------------------------------------------------------------------------------- /hiergat/hiergat_env.yml: -------------------------------------------------------------------------------- 1 | name: hiergat_env 2 | channels: 3 | - defaults 4 | dependencies: 5 | - _libgcc_mutex=0.1=main 6 | - _openmp_mutex=5.1=1_gnu 7 | - _pytorch_select=0.2=gpu_0 8 | - 
backcall=0.2.0=pyhd3eb1b0_0 9 | - blas=1.0=mkl 10 | - ca-certificates=2022.10.11=h06a4308_0 11 | - certifi=2022.12.7=py37h06a4308_0 12 | - cffi=1.15.1=py37h74dc2b5_0 13 | - cudatoolkit=10.1.243=h6bb024c_0 14 | - cudnn=7.6.5=cuda10.1_0 15 | - debugpy=1.5.1=py37h295c915_0 16 | - decorator=5.1.1=pyhd3eb1b0_0 17 | - entrypoints=0.4=py37h06a4308_0 18 | - intel-openmp=2022.0.1=h06a4308_3633 19 | - ipykernel=6.15.2=py37h06a4308_0 20 | - ipython=7.31.1=py37h06a4308_1 21 | - jedi=0.18.1=py37h06a4308_1 22 | - jupyter_client=7.4.8=py37h06a4308_0 23 | - jupyter_core=4.11.2=py37h06a4308_0 24 | - ld_impl_linux-64=2.38=h1181459_1 25 | - libffi=3.4.2=h6a678d5_6 26 | - libgcc-ng=11.2.0=h1234567_1 27 | - libgomp=11.2.0=h1234567_1 28 | - libsodium=1.0.18=h7b6447c_0 29 | - libstdcxx-ng=11.2.0=h1234567_1 30 | - matplotlib-inline=0.1.6=py37h06a4308_0 31 | - mkl=2020.2=256 32 | - mkl-service=2.3.0=py37he8ac12f_0 33 | - mkl_fft=1.3.0=py37h54f3939_0 34 | - mkl_random=1.1.1=py37h0573a6f_0 35 | - ncurses=6.3=h5eee18b_3 36 | - nest-asyncio=1.5.6=py37h06a4308_0 37 | - ninja=1.10.2=h06a4308_5 38 | - ninja-base=1.10.2=hd09550d_5 39 | - numpy-base=1.19.2=py37hfa32c7d_0 40 | - openssl=1.1.1s=h7f8727e_0 41 | - packaging=22.0=py37h06a4308_0 42 | - parso=0.8.3=pyhd3eb1b0_0 43 | - pexpect=4.8.0=pyhd3eb1b0_3 44 | - pickleshare=0.7.5=pyhd3eb1b0_1003 45 | - pip=22.3.1=py37h06a4308_0 46 | - prompt-toolkit=3.0.36=py37h06a4308_0 47 | - psutil=5.9.0=py37h5eee18b_0 48 | - ptyprocess=0.7.0=pyhd3eb1b0_2 49 | - pycparser=2.21=pyhd3eb1b0_0 50 | - pygments=2.11.2=pyhd3eb1b0_0 51 | - python=3.7.15=h7a1cb2a_1 52 | - python-dateutil=2.8.2=pyhd3eb1b0_0 53 | - pytorch=1.4.0=cuda101py37h02f0884_0 54 | - pyzmq=23.2.0=py37h6a678d5_0 55 | - readline=8.2=h5eee18b_0 56 | - setuptools=65.6.3=py37h06a4308_0 57 | - six=1.16.0=pyhd3eb1b0_1 58 | - sqlite=3.40.1=h5082296_0 59 | - tk=8.6.12=h1ccaba5_0 60 | - tornado=6.2=py37h5eee18b_0 61 | - traitlets=5.7.1=py37h06a4308_0 62 | - wcwidth=0.2.5=pyhd3eb1b0_0 63 | - wheel=0.37.1=pyhd3eb1b0_0 64 | - xz=5.2.8=h5eee18b_0 65 | - zeromq=4.3.4=h2531618_0 66 | - zlib=1.2.13=h5eee18b_0 67 | - pip: 68 | - blis==0.4.1 69 | - boto3==1.24.56 70 | - botocore==1.27.56 71 | - catalogue==1.0.0 72 | - charset-normalizer==2.1.1 73 | - click==8.1.3 74 | - cymem==2.0.6 75 | - filelock==3.8.0 76 | - gensim==3.8.1 77 | - idna==3.3 78 | - importlib-metadata==4.12.0 79 | - jmespath==1.0.1 80 | - joblib==1.1.0 81 | - jsonlines==1.2.0 82 | - murmurhash==1.0.8 83 | - nltk==3.5 84 | - numpy==1.17.4 85 | - pandas==1.3.5 86 | - plac==1.1.3 87 | - preshed==3.0.7 88 | - protobuf==3.20.1 89 | - pytz==2022.7.1 90 | - regex==2019.12.20 91 | - requests==2.28.1 92 | - s3transfer==0.6.0 93 | - sacremoses==0.0.53 94 | - scikit-learn==1.0.2 95 | - scipy==1.3.2 96 | - sentencepiece==0.1.85 97 | - sklearn==0.0 98 | - smart-open==6.0.0 99 | - spacy==2.2.3 100 | - srsly==1.0.5 101 | - tensorboardx==2.0 102 | - thinc==7.3.1 103 | - threadpoolctl==3.1.0 104 | - tokenizers==0.5.2 105 | - tqdm==4.41.0 106 | - transformers==2.8.0 107 | - typing-extensions==4.3.0 108 | - urllib3==1.26.11 109 | - wasabi==0.10.1 110 | - zipp==3.8.1 111 | prefix: /home/ma/ma_ma/ma_rder/anaconda3/envs/hiergat_env 112 | -------------------------------------------------------------------------------- /hiergat/dataset.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import os 6 | from pathlib import Path 7 | import glob 8 | import gzip 9 | from copy import deepcopy 10 | 11 | 
import pickle 12 | 13 | import argparse 14 | import re 15 | import csv 16 | 17 | def combine_row(left, right, label): 18 | def func(row): 19 | col_names = left.columns # assume left and right always have same attributes 20 | list_ = ['COL' + ' ' + str(b) + ' ' + 'VAL' + ' ' + str(a) + ' ' for a, b in zip(row, col_names.values.tolist())] 21 | list_ = ''.join(str(m) for m in list_) 22 | return list_ 23 | 24 | left_list = list(map(func, left.values.tolist())) 25 | right_list = list(map(func, right.values.tolist())) 26 | label_list = [str(l) for l in label] 27 | 28 | left_df = pd.DataFrame({'left': pd.Series(left_list)}) 29 | right_df = pd.DataFrame({'right': pd.Series(right_list)}) 30 | label_df = pd.DataFrame({'label': pd.Series(label_list)}) 31 | 32 | # using tab separator here 33 | # https://github.com/megagonlabs/ditto - \t \t
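# A minimal illustration of the serialization that combine_row's inner func produces,
# assuming hypothetical toy column names and values (not taken from the corpus):
#   row = ['iphone 7 32gb', 'apple'] with columns ['title', 'brand']
#   func(row) -> 'COL title VAL iphone 7 32gb COL brand VAL apple '
# Each left/right record is serialized this way; following the Ditto reference above,
# a pair is presumably written out as one tab-separated line: left '\t' right '\t' label.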