├── convinse
    ├── __init__.py
    ├── library
    │   ├── __init__.py
    │   ├── wikipedia_library.py
    │   ├── utils.py
    │   └── custom_trainer.py
    ├── distant_supervision
    │   ├── __init__.py
    │   ├── README.md
    │   ├── turn_relevance_annotator.py
    │   └── structured_representation_annotator.py
    ├── evidence_retrieval_scoring
    │   ├── wikipedia_retriever
    │   │   ├── __init__.py
    │   │   ├── text_parser.py
    │   │   ├── table_parser.py
    │   │   ├── infobox_parser.py
    │   │   └── wikipedia_retriever.py
    │   ├── bm25_es.py
    │   ├── clocq_bm25.py
    │   ├── evidence_retrieval_scoring.py
    │   ├── README.md
    │   └── clocq_er.py
    ├── heterogeneous_answering
    │   ├── heterogeneous_answering.py
    │   ├── README.md
    │   └── fid_module
    │   │   ├── fid_utils.py
    │   │   └── fid_module.py
    ├── question_understanding
    │   ├── question_understanding.py
    │   ├── README.md
    │   ├── question_rewriting
    │   │   ├── dataset_question_rewriting.py
    │   │   ├── question_rewriting_model.py
    │   │   └── question_rewriting_module.py
    │   ├── naive_concat
    │   │   └── naive_concat.py
    │   ├── structured_representation
    │   │   ├── dataset_structured_representation.py
    │   │   ├── structured_representation_module.py
    │   │   └── structured_representation_model.py
    │   └── question_resolution
    │   │   ├── question_resolution_module.py
    │   │   └── question_resolution_utils.py
    ├── evaluation.py
    └── pipeline.py
├── Makefile
├── requirements.txt
├── scripts
    ├── silver_annotation.sh
    ├── initialize.sh
    ├── pipeline.sh
    └── download.sh
├── .gitignore
├── setup.py
├── LICENSE
└── config
    └── convmix
        ├── nc_all-clocq_bm25-fid.yml
        ├── nc_init-clocq_bm25-fid.yml
        ├── nc_prev-clocq_bm25-fid.yml
        ├── nc_init_prev-clocq_bm25-fid.yml
        ├── qres-clocq_bm25-fid.yml
        ├── convinse.yml
        └── qrew-clocq_bm25-fid.yml


/convinse/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/convinse/library/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/convinse/distant_supervision/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | style:
2 | 	black --line-length 100 --target-version py38 .


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/wikipedia_retriever/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | black
 2 | bs4
 3 | datasets
 4 | matplotlib
 5 | networkx
 6 | numpy
 7 | nltk
 8 | pybind11
 9 | python-Levenshtein
10 | torch_transformers
11 | pyyaml
12 | rank-bm25
13 | requests
14 | scikit-learn
15 | sentencepiece
16 | tensorboardX
17 | tqdm
18 | transformers
19 | wikitables


--------------------------------------------------------------------------------
/scripts/silver_annotation.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash 
 2 | # read config parameter: if no present, stick to convinse.yaml
 3 | CONFIG=${1:-config/convmix/convinse.yml}
 4 | 
 5 | # adjust name to log
 6 | IFS='/' read -ra NAME <<< "$CONFIG"
 7 | DATA=${NAME[1]}
 8 | IFS='.' read -ra NAME <<< "${NAME[2]}"
 9 | NAME=${NAME[0]}
10 | OUT=out/${DATA}/silver_annotation_${NAME}.out
11 | mkdir -p out/${DATA}
12 | 
13 | # start script
14 | nohup python -u convinse/distant_supervision/silver_annotation.py --inference $CONFIG  > $OUT 2>&1 &


--------------------------------------------------------------------------------
/scripts/initialize.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash 
 2 | 
 3 | # initialize root dir
 4 | CONVINSE_ROOT=$(pwd)
 5 | 
 6 | # create directories
 7 | mkdir -p _benchmarks
 8 | mkdir -p _data
 9 | mkdir -p _intermediate_representations
10 | mkdir -p _results
11 | mkdir -p _results/convmix
12 | mkdir -p out
13 | mkdir -p out/convmix
14 | mkdir -p out/slurm
15 | 
16 | # download 
17 | bash scripts/download.sh convmix
18 | bash scripts/download.sh data
19 | bash scripts/download.sh wikipedia
20 | bash scripts/download.sh convinse
21 | bash scripts/download.sh annotated
22 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # meta data
 2 | *__pycache__
 3 | convinse.egg-info/
 4 | .DS_Store
 5 | 
 6 | # data folders
 7 | _data/
 8 | _intermediate_representations/
 9 | _benchmarks/
10 | 
11 | # specific folders
12 | *results/
13 | *logs/
14 | *cache/
15 | *examples/
16 | *out/
17 | *runs/
18 | *tmp/
19 | 
20 | # data files
21 | *.pickle
22 | *.out
23 | *.json
24 | *.txt
25 | 
26 | # specific paths
27 | clocq/
28 | convmix/
29 | test.py
30 | 
31 | # other repos
32 | convinse/heterogeneous_answering/fid_module/FiD/
33 | convinse/question_understanding/question_resolution/quretec/


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import find_packages, setup
 2 | 
 3 | with open("requirements.txt", "r") as f:
 4 |     requirements = list(f.read().splitlines())
 5 | 
 6 | setup(
 7 |     name="convinse",
 8 |     version="1.0",
 9 |     description="Code for the CONVINSE project (published in SIGIR 2022).",
10 |     long_description=open("README.md").read(),
11 |     long_description_content_type="text/markdown",
12 |     author="Philipp Christmann",
13 |     author_email="pchristm@mpi-inf.mpg.de",
14 |     url="https://convinse.mpi-inf.mpg.de",
15 |     packages=find_packages(),
16 |     include_package_data=False,
17 |     keywords=["qa", "question answering", "heterogeneous QA", "conversational", "ConvQA", "knowledge bases", "heterogeneous sources"],
18 |     classifiers=[
19 |         "Programming Language :: Python :: 3.8"
20 |     ],
21 |     install_requires=requirements
22 | )
23 | 


--------------------------------------------------------------------------------
/convinse/library/wikipedia_library.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Library for different string and path functions
 3 | for the Wikipedia retriever.
 4 | """
 5 | 
 6 | 
 7 | def format_wiki_path(value):
 8 |     """Reformat Wikipedia entity link."""
 9 |     return value.replace("/wiki/", "")
10 | 
11 | 
12 | def is_wikipedia_path(value):
13 |     """Check if the value is a Wikipedia entity."""
14 |     if not value:
15 |         return False
16 |     elif not value.startswith("/wiki"):
17 |         return False
18 |     elif value.startswith("/wiki/File:"):
19 |         return False
20 |     elif "Category:" in value:
21 |         return False
22 |     elif "Special:" in value:
23 |         return False
24 |     return True
25 | 
26 | 
27 | def _wiki_title_to_path(wiki_title):
28 |     wiki_path = wiki_title.replace(" ", "_")
29 |     wiki_path = wiki_path.replace("'", "%27")
30 |     wiki_path = wiki_path.replace("-", "_")
31 |     return wiki_path
32 | 
33 | 
34 | def _wiki_path_to_title(wiki_path):
35 |     wiki_title = wiki_path.replace("_", " ")
36 |     wiki_title = wiki_title.replace("%27", "'")
37 |     return wiki_title
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Philipp Christmann, Rishiraj Saha Roy, Gerhard Weikum
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/scripts/pipeline.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash
 2 | #SBATCH -o out/slurm/$OUT
 3 | 
 4 | ## check argument length
 5 | if [[ $# -lt 1 ]]
 6 | then
 7 | 	echo "Error: Invalid number of options: Please specify at least the pipeline-function."
 8 | 	echo "Usage: bash scripts/pipeline.sh --train/--pred-answers/--gold-answers/--main-results/--example [<PATH_TO_CONFIG>] [<SOURCES_STR>]"
 9 | 	exit 0
10 | fi
11 | 
12 | ## read config parameter: if no present, stick to default (default.yaml)
13 | FUNCTION=$1
14 | CONFIG=${2:-"config/convmix/convinse.yml"}
15 | SOURCES=${3:-"kb_text_table_info"}
16 | 
17 | ## set path for output
18 | # get function name
19 | FUNCTION_NAME=${FUNCTION#"--"}
20 | # get data name
21 | IFS='/' read -ra NAME <<< "$CONFIG"
22 | DATA=${NAME[1]}
23 | # get config name
24 | CFG_NAME=${NAME[2]%".yml"}
25 | 
26 | # set output path (include sources only if not default value)
27 | if [[ $# -lt 3 ]]
28 | then
29 | 	OUT="out/${DATA}/pipeline-${FUNCTION_NAME}-${CFG_NAME}.out"
30 | else
31 | 	OUT="out/${DATA}/pipeline-${FUNCTION_NAME}-${CFG_NAME}-${SOURCES}.out"
32 | fi
33 | 
34 | 
35 | ## fix global vars (required for FiD)
36 | export SLURM_NTASKS=1
37 | export TOKENIZERS_PARALLELISM=false
38 | 
39 | ## start script
40 | if ! command -v sbatch &> /dev/null
41 | then
42 | 	# no slurm setup: run via nohup
43 | 	nohup python -u convinse/pipeline.py $FUNCTION $CONFIG $SOURCES > $OUT 2>&1 &
44 | else
45 | 	# run with sbatch
46 | 	sbatch <<EOT
47 | #!/bin/bash
48 | 
49 | #SBATCH -p gpu20
50 | #SBATCH --gres gpu:1
51 | #SBATCH -t 30:00:00
52 | #SBATCH -o $OUT
53 | 
54 | python -u convinse/pipeline.py $FUNCTION $CONFIG $SOURCES 
55 | EOT
56 | fi


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/wikipedia_retriever/text_parser.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | 
 4 | def extract_text_snippets(wiki_md, wiki_title, nlp):
 5 |     """
 6 |     Extract text snippets from the given
 7 |     markdown text.
 8 |     """
 9 |     if not wiki_md or not wiki_md.get("extract"):
10 |         return []
11 | 
12 |     # load content
13 |     content = wiki_md["extract"]
14 | 
15 |     # remove noise and load doc
16 |     clean_content = _filter_noise(content)
17 |     doc = nlp(clean_content)
18 | 
19 |     # split the given document into sentences
20 |     evidences = list()
21 |     for sent in doc.sents:
22 |         # drop empty sentences
23 |         if not sent.text.strip():
24 |             continue
25 | 
26 |         # prepend wiki_title for context
27 |         evidence_text = f"{wiki_title}, {sent.text.strip()}"
28 | 
29 |         # create evidence object
30 |         evidence = {
31 |             # entities are added later by EvidenceAnnotator
32 |             "evidence_text": evidence_text,
33 |             "source": "text",
34 |         }
35 |         evidences.append(evidence)
36 |     return evidences
37 | 
38 | 
39 | def _filter_noise(wiki_content):
40 |     """
41 |     Filter headings and whitespaces from the document.
42 |     """
43 |     # remove sections
44 |     content = wiki_content.split("== Citations ==")[0]
45 |     content = wiki_content.split("== Footnotes ==")[0]
46 |     content = wiki_content.split("== References ==")[0]
47 |     content = wiki_content.split("== Further reading ==")[0]
48 |     # clean text
49 |     content = re.sub(r"==.*?==+", "", content)
50 |     content = content.replace("\n", " ")
51 |     while "  " in content:
52 |         content = content.replace("  ", " ")
53 |     return content
54 | 


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/wikipedia_retriever/table_parser.py:
--------------------------------------------------------------------------------
 1 | import mwparserfromhell as mwp
 2 | from wikitables import WikiTable
 3 | from wikitables.util import ftag
 4 | 
 5 | 
 6 | def json_tables_to_evidences(tables, wiki_title):
 7 |     """
 8 |     Convert the table parsed by wikitables-module to evidences.
 9 |     """
10 |     evidences = list()
11 |     # for each table in document
12 |     for table in tables:
13 |         # row-wise table processing
14 |         for row in table.rows:
15 |             evidence_text = f"{wiki_title}"
16 |             for key, value in row.items():
17 |                 evidence_text += f", {key} is {value}"
18 | 
19 |             # create evidence
20 |             evidence = {"evidence_text": evidence_text, "source": "table"}
21 |             evidences.append(evidence)
22 |     return evidences
23 | 
24 | 
25 | def extract_wikipedia_tables(wiki_md):
26 |     """
27 |     Retrieve json-tables from the wikipedia page.
28 |     """
29 |     if not wiki_md or not wiki_md.get("revisions"):
30 |         return []
31 | 
32 |     # load content
33 |     content = wiki_md["revisions"][0]["*"]
34 |     title = wiki_md["title"]
35 | 
36 |     # extract tables using wikitables-module
37 |     try:
38 |         tables = _import_tables(content, title)
39 |     except:
40 |         tables = []
41 |     return tables
42 | 
43 | 
44 | def _import_tables(content, title, lang="en"):
45 |     """
46 |     Extract tables from the given markdown content
47 |     using the wikitables module and mwparser.
48 |     """
49 |     raw_tables = mwp.parse(content).filter_tags(matches=ftag("table"))
50 | 
51 |     def _table_gen():
52 |         for idx, table in enumerate(raw_tables):
53 |             name = "%s[%s]" % (title, idx)
54 |             yield WikiTable(name, table)
55 | 
56 |     return list(_table_gen())
57 | 


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/bm25_es.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | from rank_bm25 import BM25Okapi
 4 | 
 5 | 
 6 | class BM25Scoring:
 7 |     def __init__(self, config):
 8 |         with open(config["path_to_stopwords"], "r") as fp:
 9 |             self.stopwords = fp.read().split("\n")
10 | 
11 |         self.max_evidences = config["evs_max_evidences"]
12 |         if config["qu"] == "sr":
13 |             self.sr_delimiter = config["sr_delimiter"].strip()
14 |         else:
15 |             self.sr_delimiter = " "
16 | 
17 |     def get_top_evidences(self, structured_representation, evidences):
18 |         """
19 |         Retrieve the top-100 evidences among the retrieved ones,
20 |         for the given AR.
21 |         """
22 | 
23 |         def tokenize(string):
24 |             """Function to tokenize string (word-level)."""
25 |             string = string.replace(",", " ")
26 |             string = string.replace(self.sr_delimiter, " ")
27 |             string = string.strip()
28 |             return [word.lower() for word in string.split() if not word in self.stopwords]
29 | 
30 |         if not evidences:
31 |             return evidences
32 | 
33 |         # tokenize
34 |         mapping = {
35 |             " ".join(tokenize(evidence["evidence_text"])): evidence for evidence in evidences
36 |         }
37 |         tokenized_sr = tokenize(structured_representation)
38 | 
39 |         # create corpus
40 |         tokenized_corpus = [tokenize(evidence["evidence_text"]) for evidence in evidences]
41 |         bm25_module = BM25Okapi(tokenized_corpus)
42 | 
43 |         # scoring
44 |         scores = bm25_module.get_scores(tokenized_sr)
45 | 
46 |         # retrieve top-k
47 |         ranked_indices = sorted(
48 |             range(len(tokenized_corpus)), key=lambda i: scores[i], reverse=True
49 |         )[: self.max_evidences]
50 | 
51 |         scored_evidences = [
52 |             mapping[" ".join(tokenized_corpus[index])] for i, index in enumerate(ranked_indices)
53 |         ]
54 |         return scored_evidences
55 | 


--------------------------------------------------------------------------------
/scripts/download.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/bash 
 2 | 
 3 | # initialize root dir
 4 | CONVINSE_ROOT=$(pwd)
 5 | 
 6 | ## check argument length
 7 | if [[ $# -lt 1 ]]
 8 | then
 9 | 	echo "Error: Invalid number of options: Please specify the data which should be downloaded."
10 | 	echo "Usage: bash scripts/download.sh <DATA_FOR_DOWNLOAD>"
11 | 	exit 0
12 | fi
13 | 
14 | case "$1" in
15 | "convinse")
16 |     echo "Downloading CONVINSE data..."
17 |     wget http://qa.mpi-inf.mpg.de/convinse/convmix_data/convinse.zip
18 |     mkdir -p _data/convmix/
19 |     unzip convinse.zip -d _data/convmix/
20 |     rm convinse.zip
21 |     echo "Successfully downloaded CONVINSE data!"
22 |     ;;
23 | "convmix")
24 |     echo "Downloading ConvMix dataset..."
25 |     mkdir -p _benchmarks/convmix
26 |     cd _benchmarks/convmix
27 |     wget http://qa.mpi-inf.mpg.de/convinse/train_set.zip
28 |     unzip train_set.zip
29 |     rm train_set.zip
30 |     wget http://qa.mpi-inf.mpg.de/convinse/dev_set.zip
31 |     unzip dev_set.zip
32 |     rm dev_set.zip
33 |     wget http://qa.mpi-inf.mpg.de/convinse/test_set.zip
34 |     unzip test_set.zip
35 |     rm test_set.zip
36 |     echo "Successfully downloaded ConvMix dataset!"
37 |     ;;
38 | "wikipedia")
39 |     echo "Downloading Wikipedia dump..."
40 |     wget http://qa.mpi-inf.mpg.de/convinse/convmix_data/wikipedia.zip
41 |     mkdir -p _data/convmix/
42 |     unzip wikipedia.zip -d _data/convmix/
43 |     rm wikipedia.zip
44 |     echo "Successfully downloaded Wikipedia dump!"
45 |     ;;
46 | "annotated")
47 |     echo "Downloading annotated ConvMix data..."
48 |     wget http://qa.mpi-inf.mpg.de/convinse/convmix_data/annotated.zip
49 |     mkdir -p _intermediate_representations/convmix/
50 |     unzip annotated.zip -d _intermediate_representations/convmix/
51 |     rm annotated.zip
52 |     echo "Successfully downloaded annotated ConvMix data!"
53 |     ;;
54 | "data")
55 |     echo "Downloading general repo data..."
56 |     wget http://qa.mpi-inf.mpg.de/convinse/data.zip
57 |     unzip data.zip -d _data
58 |     rm data.zip
59 |     echo "Successfully downloaded general repo data!"
60 |     ;;
61 | *)
62 |     echo "Error: Invalid specification of the data. Data $1 could not be found."
63 | 	exit 0
64 |     ;;
65 | esac
66 | 


--------------------------------------------------------------------------------
/convinse/heterogeneous_answering/heterogeneous_answering.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | 
 4 | from convinse.library.utils import store_json_with_mkdir, get_logger
 5 | 
 6 | 
 7 | class HeterogeneousAnswering:
 8 |     def __init__(self, config):
 9 |         """Initialize HA module."""
10 |         self.config = config
11 |         self.logger = get_logger(__name__, config)
12 | 
13 |     def train(self, sources=["kb", "text", "table", "info"]):
14 |         """ Method used in case no training required for HA phase. """
15 |         self.logger.info("Module used does not require training.")
16 | 
17 |     def inference(self):
18 |         """Run HA on data and add answers for each source combination."""
19 |         input_dir = self.config["path_to_annotated"]
20 |         output_dir = self.config["path_to_intermediate_results"]
21 | 
22 |         qu = self.config["qu"]
23 |         ers = self.config["ers"]
24 |         ha = self.config["ha"]
25 | 
26 |         source_combinations = self.config["source_combinations"]
27 |         for sources in source_combinations:
28 |             sources_string = "_".join(sources)
29 | 
30 |             input_path = os.path.join(input_dir, qu, ers, sources_string, "test_ers.jsonl")
31 |             output_path = os.path.join(input_dir, qu, ers, sources_string, ha, "test_ha.json")
32 |             self.inference_on_data_split(input_path, output_path, sources)
33 | 
34 |     def inference_on_data_split(self, input_path, output_path):
35 |         """Run HA on given data split."""
36 |         # open data
37 |         input_turns = list()
38 |         data = list()
39 |         with open(input_path, "r") as fp:
40 |             line = fp.readline()
41 |             while line:
42 |                 conversation = json.loads(line)
43 |                 input_turns += [turn for turn in conversation["questions"]]
44 |                 data.append(conversation)
45 |                 line = fp.readline()
46 | 
47 |         # inference
48 |         self.inference_on_turns(input_turns)
49 | 
50 |         # store processed data
51 |         store_json_with_mkdir(data, output_path)
52 | 
53 |     def inference_on_data(self, input_data):
54 |         """Run HA on given data."""
55 |         input_turns = [turn for conv in input_data for turn in conv["questions"]]
56 |         self.inference_on_turns(input_turns)
57 |         return input_data
58 | 
59 |     def inference_on_turns(self, input_turns):
60 |         """Run HA on a set of turns."""
61 |         for turn in turns:
62 |             self.inference_on_turn(turn)
63 | 
64 |     def inference_on_turn(self, turn):
65 |         raise Exception(
66 |             "This is an abstract function which should be overwritten in a derived class!"
67 |         )
68 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/question_understanding.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | 
 4 | from tqdm import tqdm
 5 | 
 6 | from convinse.library.utils import store_json_with_mkdir, get_logger
 7 | 
 8 | 
 9 | class QuestionUnderstanding:
10 |     """Abstract class for QU phase."""
11 | 
12 |     def __init__(self, config, use_gold_answers):
13 |         """Initialize QU module."""
14 |         self.config = config
15 |         self.logger = get_logger(__name__, config)
16 |         self.use_gold_answers = use_gold_answers
17 | 
18 |     def train(self):
19 |         """Method used in case no training required for QU phase."""
20 |         self.logger.info("QU - Module used does not require training.")
21 | 
22 |     def inference(self):
23 |         """Run model on data and add predictions."""
24 |         # inference: add predictions to data
25 |         qu = self.config["qu"]
26 |         input_dir = self.config["path_to_annotated"]
27 |         output_dir = self.config["path_to_intermediate_results"]
28 | 
29 |         input_path = os.path.join(input_dir, "annotated_train.json")
30 |         output_path = os.path.join(output_dir, qu, "train_qu.json")
31 |         self.inference_on_data_split(input_path, output_path)
32 | 
33 |         input_path = os.path.join(input_dir, "annotated_dev.json")
34 |         output_path = os.path.join(output_dir, qu, "dev_qu.json")
35 |         self.inference_on_data_split(input_path, output_path)
36 | 
37 |         input_path = os.path.join(input_dir, "annotated_test.json")
38 |         output_path = os.path.join(output_dir, qu, "test_qu.json")
39 |         self.inference_on_data_split(input_path, output_path)
40 | 
41 |     def inference_on_data_split(self, input_path, output_path):
42 |         """Run model on data and add predictions."""
43 |         self.logger.info(f"QU - Starting inference on {input_path}.")
44 | 
45 |         # open data
46 |         with open(input_path, "r") as fp:
47 |             data = json.load(fp)
48 | 
49 |         # model inference on given data
50 |         self.inference_on_data(data)
51 | 
52 |         # store data
53 |         store_json_with_mkdir(data, output_path)
54 | 
55 |         # log
56 |         self.logger.info(f"QU - Inference done on {input_path}.")
57 | 
58 |     def inference_on_data(self, input_data):
59 |         """Run model on data and add predictions."""
60 |         # model inference on given data
61 |         for conversation in tqdm(input_data):
62 |             self.inference_on_conversation(conversation)
63 |         return input_data
64 | 
65 |     def inference_on_conversation(self, conversation):
66 |         raise Exception(
67 |             "This is an abstract function which should be overwritten in a derived class!"
68 |         )
69 | 
70 |     def inference_on_turn(self, turn, history_turns):
71 |         raise Exception(
72 |             "This is an abstract function which should be overwritten in a derived class!"
73 |         )
74 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/README.md:
--------------------------------------------------------------------------------
 1 | # Question Understanding (QU)
 2 | 
 3 | Module to create an intent-explicit form of the current question and the corresponding conversational history.
 4 | 
 5 | - [Create your own QU module](#create-your-own-qu-module)
 6 |   - [`inference_on_turn` function](#inference_on_turn-function)
 7 |   - [`inference_on_conversation` function](#inference_on_conversation-function)
 8 |   - [`train` function](#optional-train-function)
 9 | 
10 | ## Create your own QU module
11 | You can inherit from the [QuestionUnderstanding](question_understanding.py) class and create your own QU module. Implementing the functions `inference_on_turn` and `inference_on_conversation` is sufficient for the pipeline to run properly. You might want to implement your own training procedure for your module via the `train` function though.
12 | 
13 | Further, you need to instantiate a logger in the class, which will be used in the parent class.
14 | Alternatively, you can call the __init__ method of the parent class.  
15 | Also, make sure to use the `use_gold_answers` parameter properly in your derived class.
16 | This parameter will be given as parameter when initializing the module.
17 | 
18 | ## `inference_on_turn` function
19 | 
20 | **Inputs**:
21 | - `turn`: the current turn for which the intent-explicit representation should be generated.
22 | - `history_turns`: the previous turns in the conversation. Can be used to generate the intent-explicit form. List of turn dictionaries.
23 | 
24 | **Description**:  
25 | This method is supposed to generate an intent-explicit form of the current question, given the conversational history.
26 | Please make sure that the class parameter `use_gold_answers` controls whether the gold answer(s) (in `turn["answers"]`) or the predicted answer(s) (in `turn["pred_answers"]`) are used.
27 | 
28 | **Output**:  
29 | Returns the turn. Make sure to store the intent-explicit representation of the information need in `turn["structured_representation"]`. 
30 | 
31 | ## `inference_on_conversation` function
32 | 
33 | **Inputs**:
34 | - `conversation`: the conversation for which the intent-explicit representation should be generated.
35 | 
36 | **Description**:  
37 | This method is supposed to generate intent-explicit forms for all turns in the conversation. In the method, you can keep track of the conversational history (e.g. using a list). Please make sure that the class parameter `use_gold_answers` controls whether the gold answer(s) (in `turn["answers"]`) or the predicted answer(s) (in `turn["pred_answers"]`) are used.
38 | 
39 | **Output**:  
40 | Returns the conversation. Make sure to store the intent-explicit representation of the information need for every turn in the conversation, in `turn["structured_representation"]`. 
41 | 
42 | ## [Optional] `train` function
43 | 
44 | **Inputs**: NONE
45 | 
46 | **Description**:  
47 | If required, you can train your QU module here. You can make use of whatever parameters are stored in your .yml file.
48 | 
49 | **Output**: NONE
50 | 


--------------------------------------------------------------------------------
/convinse/heterogeneous_answering/README.md:
--------------------------------------------------------------------------------
 1 | # Heterogeneous Answering (HA)
 2 | 
 3 | Module to answer the intent-explicit representation of the question, using the top-*e* retrieved evidences.
 4 | 
 5 | - [Create your own Ha module](#create-your-own-ha-module)
 6 |   - [`inference_on_turn` function](#inference_on_turn-function)
 7 |   - [`inference_on_conversation` function](#inference_on_conversation-function)
 8 |   - [`train` function](#optional-train-function)
 9 | - [Answer format](#answer-format)
10 | 
11 | ## Create your own HA module
12 | You can inherit from the [HeterogeneousAnswering](heterogeneous_answering.py) class and create your own QU module. Implementing the function `inference_on_turn` is sufficient for the pipeline to run properly. You might want to implement your own training procedure for your module via the `train` function though.
13 | 
14 | Further, you need to instantiate a logger in the class, which will be used in the parent class.
15 | Alternatively, you can call the __init__ method of the parent class.  
16 | 
17 | ## `inference_on_turn` function
18 | 
19 | **Inputs**:
20 | - `turn`: the turn, for which the answer should be predicted. You can access the intent-explicit representation of the information need via `turn["structured_representation"]`, and the top-*e* evidences via `turn["top_evidences"]`.
21 | 
22 | **Description**:  
23 | Run the HA module on the information need, and predict the answer(s).
24 | 
25 | **Output**:
26 | Returns the turn. Make sure to add the predicted answers to `turn["pred_answers"]`. You can find additional information on the [expected answer format](#answer-format) below.
27 | 
28 | ## [Optional] `train` function
29 | 
30 | **Inputs**:
31 | - `sources`: list of sources for which the HA module should be trained. The default setting is to train a single model for all sources (and combinations of sources) for generalizability.
32 | 
33 | **Description**:  
34 | If required, you can train your HA module here. You can make use of whatever parameters are stored in your .yml file.
35 | 
36 | **Output**: NONE
37 | 
38 | ## Answer format
39 | The predicted answers are given as list of answer-dictionaries, and should be stored in `turn["pred_answers"]`.
40 | Note, that the answers should be normalized to Wikidata. This allows for fair comparison beyond plain string matching.
41 | Further, in a real use-case this has the advantage that knowledge cards can be shown for the given KB items.
42 | In case a date or year is returned, give the corresponding timestamp as the ID ("2011-04-17T00:00:00Z"; standard format in Wikidata, and the CLOCQ API), and a verbalized version as label ("17 April 2011"). You can make use of the timestamp-related functions in the [StringLibrary](../library/string_library.py).
43 | ``` json
44 | [{
45 | 	"id": "<WIKIDATA ITEM ID>",
46 | 	"label": "<WIKIDATA ITEM LABEL>",
47 | 	"rank": "<RANK INTEGER>" 
48 | }]
49 | ```
50 | 
51 | `rank` starts with 1, and has exactly one answer at every rank (for comparison on ConvMix).
52 | 
53 | Example:
54 | ``` json
55 | [
56 |   {
57 |     "id": "Q23633",
58 |     "label": "HBO",
59 |     "rank": "1" 
60 |   },
61 |   {
62 |     "id": "2011-04-17T00:00:00Z",
63 |     "label": "17 April 2011",
64 |     "rank": "2" 
65 |   }
66 | ]
67 | ```
68 | 
69 | 


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/clocq_bm25.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import json
 4 | import time
 5 | import logging
 6 | 
 7 | from tqdm import tqdm
 8 | from pathlib import Path
 9 | 
10 | from convinse.library.utils import get_config, get_logger
11 | from convinse.evidence_retrieval_scoring.evidence_retrieval_scoring import EvidenceRetrievalScoring
12 | from convinse.evidence_retrieval_scoring.clocq_er import ClocqRetriever
13 | from convinse.evidence_retrieval_scoring.bm25_es import BM25Scoring
14 | 
15 | 
16 | class ClocqBM25(EvidenceRetrievalScoring):
17 |     def __init__(self, config):
18 |         self.config = config
19 |         self.logger = get_logger(__name__, config)
20 |         self.evr = ClocqRetriever(config)
21 |         self.evs = BM25Scoring(config)
22 | 
23 |     def inference_on_turn(self, turn, sources=["kb", "text", "table", "info"]):
24 |         """Retrieve best evidences for SR."""
25 |         structured_representation = turn["structured_representation"]
26 |         evidences, _ = self.evr.retrieve_evidences(structured_representation, sources)
27 |         top_evidences = self.evs.get_top_evidences(structured_representation, evidences)
28 |         turn["top_evidences"] = top_evidences
29 |         return top_evidences
30 | 
31 |     def store_cache(self):
32 |         """Store cache of evidence retriever."""
33 |         self.evr.store_cache()
34 | 
35 | 
36 | #######################################################################################################################
37 | #######################################################################################################################
38 | if __name__ == "__main__":
39 |     if len(sys.argv) != 2:
40 |         raise Exception("python convinse/evidence_retrieval_scoring/clocq_bm25.py <PATH_TO_CONFIG>")
41 | 
42 |     # load config
43 |     config_path = sys.argv[1]
44 |     config = get_config(config_path)
45 |     ers = ClocqBM25(config)
46 | 
47 |     # inference: add predictions to data
48 |     input_dir = config["path_to_annotated"]
49 |     output_dir = config["path_to_intermediate_results"]
50 | 
51 |     qu = config["qu"]
52 |     source_combinations = config["source_combinations"]
53 | 
54 |     for sources in source_combinations:
55 |         sources_string = "_".join(sources)
56 | 
57 |         input_path = os.path.join(input_dir, qu, "train_qu.json")
58 |         if os.path.exists(input_path):
59 |             output_path = os.path.join(
60 |                 output_dir, qu, "clocq_bm25", sources_string, "train_ers.jsonl"
61 |             )
62 |             ers.inference_on_data_split(input_path, output_path, sources)
63 | 
64 |         input_path = os.path.join(input_dir, qu, "dev_qu.json")
65 |         if os.path.exists(input_path):
66 |             output_path = os.path.join(
67 |                 output_dir, qu, "clocq_bm25", sources_string, "dev_ers.jsonl"
68 |             )
69 |             ers.inference_on_data_split(input_path, output_path, sources)
70 | 
71 |         input_path = os.path.join(input_dir, qu, "test_qu.json")
72 |         output_path = os.path.join(output_dir, qu, "clocq_bm25", sources_string, "test_ers.jsonl")
73 |         ers.inference_on_data_split(input_path, output_path, sources)
74 | 
75 |     # store results in cache
76 |     ers.store_cache()
77 | 


--------------------------------------------------------------------------------
/convinse/distant_supervision/README.md:
--------------------------------------------------------------------------------
  1 | # Distant Supervision
  2 | 
  3 | - [Usage](#usage)
  4 | - [Input format](#input-format)
  5 | - [Output format](#output-format)
  6 | 
  7 | ## Usage
  8 | For running the distant supervision on a given dataset, simply run:
  9 | ```
 10 | 	bash scripts/silver_annotation.sh [<PATH_TO_CONFIG>]
 11 | ```
 12 | from the ROOT directory of the project.  
 13 | The paths to the input files will be read from the given config values for `train_input_path`, `dev_input_path`, and `test_input_path`.
 14 | This will create annotated versions of the benchmark in `_intermediate_representations/<BENCHMARK>/`.
 15 | 
 16 | ## Input format
 17 | The annotation script expects the benchmark in the following (minimal) format:
 18 | ```
 19 | [
 20 | 	// first conversation
 21 | 	{	
 22 | 		"conversation_id": "<INT>",
 23 | 		"questions": [
 24 | 			// question 1 (complete)
 25 | 			{
 26 | 				"turn": 0, 
 27 | 				"question_id": "<QUESTION-ID>", 
 28 | 				"question": "<QUESTION>", 
 29 | 				"answers": [
 30 | 					{
 31 | 						"id": "<Wikidata ID>",
 32 | 						"label": "<Item Label>
 33 | 					},
 34 | 				]
 35 | 			},
 36 | 			// question 2 (incomplete)
 37 | 			{
 38 | 				"turn": 1,
 39 | 				"question_id": "<QUESTION-ID>", 
 40 | 				"question": "<QUESTION>",
 41 | 				"answers": [
 42 | 				{
 43 | 					"id": "<Wikidata ID>",
 44 | 					"label": "<Item Label>
 45 | 				},
 46 | 				]
 47 | 		]
 48 | 	},
 49 | 	// second conversation
 50 | 	{
 51 | 		...
 52 | 	},
 53 | 	// ...
 54 | ]
 55 | ```
 56 | Any other keys can be provided, and will be written to the output.
 57 | You can see [here](../heterogeneous_answering#answer-format) for additional information of the expected format of the answer IDs and labels.
 58 | 
 59 | ## Output format
 60 | The result will be stored in a .json file:
 61 | 
 62 | ```
 63 | [
 64 | 	// first conversation
 65 | 	{	
 66 | 		"conversation_id": "<INT>",
 67 | 		"questions": [
 68 | 			// question 1 (complete)
 69 | 			{
 70 | 				"turn": 0, 
 71 | 				"question_id": "<QUESTION-ID>", 
 72 | 				"question": "<QUESTION>", 
 73 | 				"answers": [
 74 | 				{
 75 | 					"id": "<Wikidata ID>",
 76 | 					"label": "<Item Label>
 77 | 				},
 78 | 				// supervision signals from weak supervision
 79 | 				"silver_SR": [
 80 | 					// SR 1
 81 | 					["<STR1>", "<STR2>",]
 82 | 				],
 83 | 				"silver_relevant_turns": [
 84 | 					// list of integers referring to the relevant turns
 85 | 					// -> this data is not used in current framework
 86 | 					0
 87 | 				]
 88 | 			},
 89 | 			// question 2 (incomplete)
 90 | 			{
 91 | 				"turn": 1,
 92 | 				"question_id": "<QUESTION-ID>", 
 93 | 				"question": "<QUESTION>", 
 94 | 				"completed_question": "<QUESTION>", 
 95 | 				"answers": [
 96 | 				{
 97 | 					"id": "<Wikidata ID>",
 98 | 					"label": "<Item Label>
 99 | 				},
100 | 				// supervision signals from weak supervision
101 | 				"silver_SR": [
102 | 					// SR 1
103 | 					["<STR1>", "<STR2>",]
104 | 				],
105 | 				"silver_relevant_turns": [
106 | 					// list of integers referring to the relevant turns
107 | 					// -> this data is not used in current framework
108 | 					0
109 | 				]
110 | 			},
111 | 			// ...
112 | 		]
113 | 		// ...
114 | 	},
115 | 	// second conversation
116 | 	{
117 | 		...
118 | 	},
119 | 	// ...
120 | ]
121 | ```
122 | 


--------------------------------------------------------------------------------
/convinse/library/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import yaml
  4 | import json
  5 | import logging
  6 | from pathlib import Path
  7 | 
  8 | 
  9 | def get_config(path):
 10 |     """Load the config dict from the given .yml file."""
 11 |     with open(path, "r") as fp:
 12 |         config = yaml.safe_load(fp)
 13 |     return config
 14 | 
 15 | 
 16 | def store_json_with_mkdir(data, output_path, indent=True):
 17 |     """Store the JSON data in the given path."""
 18 |     # create path if not exists
 19 |     output_dir = os.path.dirname(output_path)
 20 |     Path(output_dir).mkdir(parents=True, exist_ok=True)
 21 |     with open(output_path, "w") as fp:
 22 |         fp.write(json.dumps(data, indent=4))
 23 | 
 24 | 
 25 | def get_logger(mod_name, config):
 26 |     """Get a logger instance for the given module name."""
 27 |     # create logger
 28 |     logger = logging.getLogger(mod_name)
 29 |     # add handler and format
 30 |     handler = logging.StreamHandler(sys.stdout)
 31 |     formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
 32 |     handler.setFormatter(formatter)
 33 |     logger.addHandler(handler)
 34 |     # set log level
 35 |     log_level = config["log_level"]
 36 |     logger.setLevel(getattr(logging, log_level))
 37 |     return logger
 38 | 
 39 | 
 40 | def get_result_logger(config):
 41 |     """Get a logger instance for the given module name."""
 42 |     # create logger
 43 |     logger = logging.getLogger("result_logger")
 44 |     # add handler and format
 45 |     method_name = config["name"]
 46 |     benchmark = config["benchmark"]
 47 |     result_file = f"_results/{benchmark}/{method_name}.res"
 48 |     result_dir = os.path.dirname(result_file)
 49 |     Path(result_dir).mkdir(parents=True, exist_ok=True)
 50 |     handler = logging.FileHandler(result_file)
 51 |     formatter = logging.Formatter('%(asctime)s %(message)s')
 52 |     handler.setFormatter(formatter)
 53 |     logger.addHandler(handler)
 54 |     # set log level
 55 |     logger.setLevel("INFO")
 56 |     return logger
 57 | 
 58 | 
 59 | def plot_flow_graph(graph):
 60 |     """
 61 |     Predict turn relevances among the given conversation.
 62 |     The method will plot the resulting flow graph.
 63 |     """
 64 |     nx.nx_agraph.write_dot(graph, "test.dot")
 65 |     # same layout using matplotlib with no labels
 66 |     pos = graphviz_layout(graph, prog="dot")
 67 |     pos = pos
 68 |     plt.figure(figsize=(18, 20))
 69 |     nx.draw(graph, pos, with_labels=True, arrows=True, node_size=100)
 70 |     # nx.draw(G, pos, with_labels=True, arrows=True, node_size=100, figsize=(20, 20), dpi=150)
 71 |     plt.xlim([-1, 800])
 72 |     plt.show()
 73 | 
 74 | 
 75 | def print_dict(python_dict):
 76 |     """Print python dict as json-string."""
 77 |     json_string = json.dumps(python_dict)
 78 |     print(json_string)
 79 | 
 80 | 
 81 | def print_verbose(config, string):
 82 |     """Print the given string if verbose is set."""
 83 |     if config["verbose"]:
 84 |         print(str(string))
 85 | 
 86 | 
 87 | def extract_mapping_incomplete_complete(data_paths):
 88 |     """
 89 |     Extract mapping from incomplete questions to complete
 90 |     questions for all follow-up questions.
 91 |     """
 92 |     mapping_incomplete_to_complete = dict()
 93 |     for data_path in data_paths:
 94 |         with open(data_path, "r") as fp:
 95 |             dataset = json.load(fp)
 96 | 
 97 |         for conversation in dataset:
 98 |             for turn in conversation["questions"]:
 99 |                 if turn["turn"] == 0:
100 |                     continue
101 |                 question = turn["question"]
102 |                 completed = turn["completed"]
103 |                 mapping_incomplete_to_complete[question] = completed
104 |     return mapping_incomplete_to_complete
105 | 
106 | 


--------------------------------------------------------------------------------
/config/convmix/nc_all-clocq_bm25-fid.yml:
--------------------------------------------------------------------------------
  1 | name: "nc_all-clocq_bm25-fid"
  2 | log_level: "INFO"
  3 | 
  4 | # Construct pipeline
  5 | qu: nc_all
  6 | ers: clocq_bm25
  7 | ha: fid
  8 | 
  9 | # Define source combinations
 10 | source_combinations:
 11 |   - - kb
 12 |   - - text
 13 |   - - table
 14 |   - - info
 15 |   - - kb
 16 |     - text
 17 |   - - kb
 18 |     - table
 19 |   - - kb 
 20 |     - info
 21 |   - - text
 22 |     - table
 23 |   - - text 
 24 |     - info
 25 |   - - table
 26 |     - info
 27 |   - - kb
 28 |     - text
 29 |     - table
 30 |     - info
 31 | 
 32 | #################################################################
 33 | #  General file paths
 34 | #################################################################
 35 | path_to_stopwords: "_data/stopwords.txt"
 36 | path_to_labels: "_data/labels.json"
 37 | path_to_wikipedia_mappings: "_data/wikipedia_mappings.json"
 38 | path_to_wikidata_mappings: "_data/wikidata_mappings.json"
 39 | 
 40 | #################################################################
 41 | #  Benchmark specific settings
 42 | #################################################################
 43 | benchmark: "convmix"
 44 | benchmark_path: "_benchmarks/convmix"
 45 | 
 46 | train_input_path: "train_set/train_set_ALL.json"
 47 | dev_input_path: "dev_set/dev_set_ALL.json"
 48 | test_input_path: "test_set/test_set_ALL.json"
 49 | 
 50 | path_to_annotated: "_intermediate_representations/convmix" # where annotated inputs come from
 51 | path_to_intermediate_results: "_intermediate_representations/convmix"
 52 | 
 53 | #################################################################
 54 | #  Parameters - CLOCQ
 55 | #################################################################
 56 | clocq_params:
 57 |   h_match: 0.4
 58 |   h_rel: 0.2
 59 |   h_conn: 0.3
 60 |   h_coh: 0.1
 61 |   d: 20
 62 |   k: "AUTO"
 63 |   p_setting: 1000 # setting for search_space function
 64 |   bm25_limit: False
 65 | clocq_p: 1000 #  setting for neighborhood function(s) 
 66 | clocq_use_api: True # using CLOCQClientInterface
 67 | clocq_host: "https://clocq.mpi-inf.mpg.de/api" # host for client
 68 | clocq_port: "443" # port for client
 69 | 
 70 | #################################################################
 71 | #  Parameters - Silver annotation
 72 | #################################################################
 73 | # annotation - SR
 74 | sr_relation_shared_active: True
 75 | sr_remove_stopwords: True
 76 | 
 77 | # OPTIONAL: annotation - turn relevance 
 78 | tr_transitive_relevances: False
 79 | tr_extract_dataset: True
 80 | 
 81 | #################################################################
 82 | #  Parameters - QU
 83 | #################################################################
 84 | naive_concat: all
 85 | 
 86 | #################################################################
 87 | #  Parameters - ERS
 88 | #################################################################  
 89 | # cache path
 90 | ers_use_cache: True
 91 | ers_cache_path: "_data/convmix/nc_all/er_cache.pickle"
 92 | ers_wikipedia_dump: "_data/convmix/wikipedia_dump.pickle"
 93 | ers_on_the_fly: True
 94 | 
 95 | # evidence retrieval
 96 | evr_min_evidence_length: 3
 97 | evr_max_evidence_length: 200
 98 | evr_max_entities: 10 # max entities per evidence
 99 | evr_max_pos_evidences: 10
100 | 
101 | # evidence scoring
102 | evs_max_evidences: 100
103 | 
104 | #################################################################
105 | #  Parameters - HA
106 | #################################################################
107 | # general
108 | ha_max_answers: 50
109 | 
110 | fid_model_path: "_data/convmix/nc_all/fid/best_dev"
111 | fid_per_gpu_batch_size: 1
112 | fid_max_evidences: 100
113 | 
114 | # train
115 | fid_lr: 0.00005
116 | fid_optim: adamw
117 | fid_scheduler: linear
118 | fid_weight_decay: 0.01
119 | fid_text_maxlength: 250
120 | fid_answer_maxlength: 10
121 | fid_total_step: 15000
122 | fid_warmup_step: 1000
123 | 
124 | # inference
125 | fid_max_evidences: 100
126 | fid_num_beams: 20
127 | 


--------------------------------------------------------------------------------
/config/convmix/nc_init-clocq_bm25-fid.yml:
--------------------------------------------------------------------------------
  1 | name: "nc_init-clocq_bm25-fid"
  2 | log_level: "INFO"
  3 | 
  4 | # Construct pipeline
  5 | qu: nc_init
  6 | ers: clocq_bm25
  7 | ha: fid
  8 | 
  9 | # Define source combinations
 10 | source_combinations:
 11 |   - - kb
 12 |   - - text
 13 |   - - table
 14 |   - - info
 15 |   - - kb
 16 |     - text
 17 |   - - kb
 18 |     - table
 19 |   - - kb 
 20 |     - info
 21 |   - - text
 22 |     - table
 23 |   - - text 
 24 |     - info
 25 |   - - table
 26 |     - info
 27 |   - - kb
 28 |     - text
 29 |     - table
 30 |     - info
 31 | 
 32 | #################################################################
 33 | #  General file paths
 34 | #################################################################
 35 | path_to_stopwords: "_data/stopwords.txt"
 36 | path_to_labels: "_data/labels.json"
 37 | path_to_wikipedia_mappings: "_data/wikipedia_mappings.json"
 38 | path_to_wikidata_mappings: "_data/wikidata_mappings.json"
 39 | 
 40 | #################################################################
 41 | #  Benchmark specific settings
 42 | #################################################################
 43 | benchmark: "convmix"
 44 | benchmark_path: "_benchmarks/convmix"
 45 | 
 46 | train_input_path: "train_set/train_set_ALL.json"
 47 | dev_input_path: "dev_set/dev_set_ALL.json"
 48 | test_input_path: "test_set/test_set_ALL.json"
 49 | 
 50 | path_to_annotated: "_intermediate_representations/convmix" # where annotated inputs come from
 51 | path_to_intermediate_results: "_intermediate_representations/convmix"
 52 | 
 53 | #################################################################
 54 | #  Parameters - CLOCQ
 55 | #################################################################
 56 | clocq_params:
 57 |   h_match: 0.4
 58 |   h_rel: 0.2
 59 |   h_conn: 0.3
 60 |   h_coh: 0.1
 61 |   d: 20
 62 |   k: "AUTO"
 63 |   p_setting: 1000 # setting for search_space function
 64 |   bm25_limit: False
 65 | clocq_p: 1000 #  setting for neighborhood function(s) 
 66 | clocq_use_api: True # using CLOCQClientInterface
 67 | clocq_host: "https://clocq.mpi-inf.mpg.de/api" # host for client
 68 | clocq_port: "443" # port for client
 69 | 
 70 | #################################################################
 71 | #  Parameters - Silver annotation
 72 | #################################################################
 73 | # annotation - SR
 74 | sr_relation_shared_active: True
 75 | sr_remove_stopwords: True
 76 | 
 77 | # OPTIONAL: annotation - turn relevance 
 78 | tr_transitive_relevances: False
 79 | tr_extract_dataset: True
 80 | 
 81 | #################################################################
 82 | #  Parameters - QU
 83 | #################################################################
 84 | naive_concat: init
 85 | 
 86 | #################################################################
 87 | #  Parameters - ERS
 88 | #################################################################  
 89 | # cache path
 90 | ers_use_cache: True
 91 | ers_cache_path: "_data/convmix/nc_init/er_cache.pickle"
 92 | ers_wikipedia_dump: "_data/convmix/wikipedia_dump.pickle"
 93 | ers_on_the_fly: True
 94 | 
 95 | # evidence retrieval
 96 | evr_min_evidence_length: 3
 97 | evr_max_evidence_length: 200
 98 | evr_max_entities: 10 # max entities per evidence
 99 | evr_max_pos_evidences: 10
100 | 
101 | # evidence scoring
102 | evs_max_evidences: 100
103 | 
104 | #################################################################
105 | #  Parameters - HA
106 | #################################################################
107 | # general
108 | ha_max_answers: 50
109 | 
110 | fid_model_path: "_data/convmix/nc_init/fid/best_dev"
111 | fid_per_gpu_batch_size: 1
112 | fid_max_evidences: 100
113 | 
114 | # train
115 | fid_lr: 0.00005
116 | fid_optim: adamw
117 | fid_scheduler: linear
118 | fid_weight_decay: 0.01
119 | fid_text_maxlength: 250
120 | fid_answer_maxlength: 10
121 | fid_total_step: 15000
122 | fid_warmup_step: 1000
123 | 
124 | # inference
125 | fid_max_evidences: 100
126 | fid_num_beams: 20


--------------------------------------------------------------------------------
/config/convmix/nc_prev-clocq_bm25-fid.yml:
--------------------------------------------------------------------------------
  1 | name: "nc_prev-clocq_bm25-fid"
  2 | log_level: "INFO"
  3 | 
  4 | # Construct pipeline
  5 | qu: nc_prev
  6 | ers: clocq_bm25
  7 | ha: fid
  8 | 
  9 | # Define source combinations
 10 | source_combinations:
 11 |   - - kb
 12 |   - - text
 13 |   - - table
 14 |   - - info
 15 |   - - kb
 16 |     - text
 17 |   - - kb
 18 |     - table
 19 |   - - kb 
 20 |     - info
 21 |   - - text
 22 |     - table
 23 |   - - text 
 24 |     - info
 25 |   - - table
 26 |     - info
 27 |   - - kb
 28 |     - text
 29 |     - table
 30 |     - info
 31 | 
 32 | #################################################################
 33 | #  General file paths
 34 | #################################################################
 35 | path_to_stopwords: "_data/stopwords.txt"
 36 | path_to_labels: "_data/labels.json"
 37 | path_to_wikipedia_mappings: "_data/wikipedia_mappings.json"
 38 | path_to_wikidata_mappings: "_data/wikidata_mappings.json"
 39 | 
 40 | #################################################################
 41 | #  Benchmark specific settings
 42 | #################################################################
 43 | benchmark: "convmix"
 44 | benchmark_path: "_benchmarks/convmix"
 45 | 
 46 | train_input_path: "train_set/train_set_ALL.json"
 47 | dev_input_path: "dev_set/dev_set_ALL.json"
 48 | test_input_path: "test_set/test_set_ALL.json"
 49 | 
 50 | path_to_annotated: "_intermediate_representations/convmix" # where annotated inputs come from
 51 | path_to_intermediate_results: "_intermediate_representations/convmix"
 52 | 
 53 | #################################################################
 54 | #  Parameters - CLOCQ
 55 | #################################################################
 56 | clocq_params:
 57 |   h_match: 0.4
 58 |   h_rel: 0.2
 59 |   h_conn: 0.3
 60 |   h_coh: 0.1
 61 |   d: 20
 62 |   k: "AUTO"
 63 |   p_setting: 1000 # setting for search_space function
 64 |   bm25_limit: False
 65 | clocq_p: 1000 #  setting for neighborhood function(s) 
 66 | clocq_use_api: True # using CLOCQClientInterface
 67 | clocq_host: "https://clocq.mpi-inf.mpg.de/api" # host for client
 68 | clocq_port: "443" # port for client
 69 | 
 70 | #################################################################
 71 | #  Parameters - Silver annotation
 72 | #################################################################
 73 | # annotation - SR
 74 | sr_relation_shared_active: True
 75 | sr_remove_stopwords: True
 76 | 
 77 | # OPTIONAL: annotation - turn relevance 
 78 | tr_transitive_relevances: False
 79 | tr_extract_dataset: True
 80 | 
 81 | #################################################################
 82 | #  Parameters - QU
 83 | #################################################################
 84 | naive_concat: prev
 85 | 
 86 | #################################################################
 87 | #  Parameters - ERS
 88 | #################################################################  
 89 | # cache path
 90 | ers_use_cache: True
 91 | ers_cache_path: "_data/convmix/nc_prev/er_cache.pickle"
 92 | ers_wikipedia_dump: "_data/convmix/wikipedia_dump.pickle"
 93 | ers_on_the_fly: True
 94 | 
 95 | # evidence retrieval
 96 | evr_min_evidence_length: 3
 97 | evr_max_evidence_length: 200
 98 | evr_max_entities: 10 # max entities per evidence
 99 | evr_max_pos_evidences: 10
100 | 
101 | # evidence scoring
102 | evs_max_evidences: 100
103 | 
104 | #################################################################
105 | #  Parameters - HA
106 | #################################################################
107 | # general
108 | ha_max_answers: 50
109 | 
110 | fid_model_path: "_data/convmix/nc_prev/fid/best_dev"
111 | fid_per_gpu_batch_size: 1
112 | fid_max_evidences: 100
113 | 
114 | # train
115 | fid_lr: 0.00005
116 | fid_optim: adamw
117 | fid_scheduler: linear
118 | fid_weight_decay: 0.01
119 | fid_text_maxlength: 250
120 | fid_answer_maxlength: 10
121 | fid_total_step: 15000
122 | fid_warmup_step: 1000
123 | 
124 | # inference
125 | fid_max_evidences: 100
126 | fid_num_beams: 20
127 | 


--------------------------------------------------------------------------------
/config/convmix/nc_init_prev-clocq_bm25-fid.yml:
--------------------------------------------------------------------------------
  1 | name: "nc_init_prev-clocq_bm25-fid"
  2 | log_level: "INFO"
  3 | 
  4 | # Construct pipeline
  5 | qu: nc_init_prev
  6 | ers: clocq_bm25
  7 | ha: fid
  8 | 
  9 | # Define source combinations
 10 | source_combinations:
 11 |   - - kb
 12 |   - - text
 13 |   - - table
 14 |   - - info
 15 |   - - kb
 16 |     - text
 17 |   - - kb
 18 |     - table
 19 |   - - kb 
 20 |     - info
 21 |   - - text
 22 |     - table
 23 |   - - text 
 24 |     - info
 25 |   - - table
 26 |     - info
 27 |   - - kb
 28 |     - text
 29 |     - table
 30 |     - info
 31 | 
 32 | #################################################################
 33 | #  General file paths
 34 | #################################################################
 35 | path_to_stopwords: "_data/stopwords.txt"
 36 | path_to_labels: "_data/labels.json"
 37 | path_to_wikipedia_mappings: "_data/wikipedia_mappings.json"
 38 | path_to_wikidata_mappings: "_data/wikidata_mappings.json"
 39 | 
 40 | #################################################################
 41 | #  Benchmark specific settings
 42 | #################################################################
 43 | benchmark: "convmix"
 44 | benchmark_path: "_benchmarks/convmix"
 45 | 
 46 | train_input_path: "train_set/train_set_ALL.json"
 47 | dev_input_path: "dev_set/dev_set_ALL.json"
 48 | test_input_path: "test_set/test_set_ALL.json"
 49 | 
 50 | path_to_annotated: "_intermediate_representations/convmix" # where annotated inputs come from
 51 | path_to_intermediate_results: "_intermediate_representations/convmix"
 52 | 
 53 | #################################################################
 54 | #  Parameters - CLOCQ
 55 | #################################################################
 56 | clocq_params:
 57 |   h_match: 0.4
 58 |   h_rel: 0.2
 59 |   h_conn: 0.3
 60 |   h_coh: 0.1
 61 |   d: 20
 62 |   k: "AUTO"
 63 |   p_setting: 1000 # setting for search_space function
 64 |   bm25_limit: False
 65 | clocq_p: 1000 #  setting for neighborhood function(s) 
 66 | clocq_use_api: True # using CLOCQClientInterface
 67 | clocq_host: "https://clocq.mpi-inf.mpg.de/api" # host for client
 68 | clocq_port: "443" # port for client
 69 | 
 70 | #################################################################
 71 | #  Parameters - Silver annotation
 72 | #################################################################
 73 | # annotation - SR
 74 | sr_relation_shared_active: True
 75 | sr_remove_stopwords: True
 76 | 
 77 | # OPTIONAL: annotation - turn relevance 
 78 | tr_transitive_relevances: False
 79 | tr_extract_dataset: True
 80 | 
 81 | #################################################################
 82 | #  Parameters - QU
 83 | #################################################################
 84 | naive_concat: init_prev
 85 | 
 86 | #################################################################
 87 | #  Parameters - ERS
 88 | #################################################################  
 89 | # cache path
 90 | ers_use_cache: True
 91 | ers_cache_path: "_data/convmix/nc_init_prev/er_cache.pickle"
 92 | ers_wikipedia_dump: "_data/convmix/wikipedia_dump.pickle"
 93 | ers_on_the_fly: True
 94 | 
 95 | # evidence retrieval
 96 | evr_min_evidence_length: 3
 97 | evr_max_evidence_length: 200
 98 | evr_max_entities: 10 # max entities per evidence
 99 | evr_max_pos_evidences: 10
100 | 
101 | # evidence scoring
102 | evs_max_evidences: 100
103 | 
104 | #################################################################
105 | #  Parameters - HA
106 | #################################################################
107 | # general
108 | ha_max_answers: 50
109 | 
110 | fid_model_path: "_data/convmix/nc_init_prev/fid/best_dev"
111 | fid_per_gpu_batch_size: 1
112 | fid_max_evidences: 100
113 | 
114 | # train
115 | fid_lr: 0.00005
116 | fid_optim: adamw
117 | fid_scheduler: linear
118 | fid_weight_decay: 0.01
119 | fid_text_maxlength: 250
120 | fid_answer_maxlength: 10
121 | fid_total_step: 15000
122 | fid_warmup_step: 1000
123 | 
124 | # inference
125 | fid_max_evidences: 100
126 | fid_num_beams: 20


--------------------------------------------------------------------------------
/config/convmix/qres-clocq_bm25-fid.yml:
--------------------------------------------------------------------------------
  1 | name: "qres-clocq_bm25-fid"
  2 | log_level: "INFO"
  3 | 
  4 | # Construct pipeline
  5 | qu: qres
  6 | ers: clocq_bm25
  7 | ha: fid
  8 | 
  9 | # Define source combinations
 10 | source_combinations:
 11 |   - - kb
 12 |   - - text
 13 |   - - table
 14 |   - - info
 15 |   - - kb
 16 |     - text
 17 |   - - kb
 18 |     - table
 19 |   - - kb 
 20 |     - info
 21 |   - - text
 22 |     - table
 23 |   - - text 
 24 |     - info
 25 |   - - table
 26 |     - info
 27 |   - - kb
 28 |     - text
 29 |     - table
 30 |     - info
 31 | 
 32 | #################################################################
 33 | #  General file paths
 34 | #################################################################
 35 | path_to_stopwords: "_data/stopwords.txt"
 36 | path_to_labels: "_data/labels.json"
 37 | path_to_wikipedia_mappings: "_data/wikipedia_mappings.json"
 38 | path_to_wikidata_mappings: "_data/wikidata_mappings.json"
 39 | 
 40 | #################################################################
 41 | #  Benchmark specific settings
 42 | #################################################################
 43 | benchmark: "convmix"
 44 | benchmark_path: "_benchmarks/convmix"
 45 | 
 46 | train_input_path: "train_set/train_set_ALL.json"
 47 | dev_input_path: "dev_set/dev_set_ALL.json"
 48 | test_input_path: "test_set/test_set_ALL.json"
 49 | 
 50 | path_to_annotated: "_intermediate_representations/convmix" # where annotated inputs come from
 51 | path_to_intermediate_results: "_intermediate_representations/convmix"
 52 | 
 53 | #################################################################
 54 | #  Parameters - CLOCQ
 55 | #################################################################
 56 | clocq_params:
 57 |   h_match: 0.4
 58 |   h_rel: 0.2
 59 |   h_conn: 0.3
 60 |   h_coh: 0.1
 61 |   d: 20
 62 |   k: "AUTO"
 63 |   p_setting: 1000 # setting for search_space function
 64 |   bm25_limit: False
 65 | clocq_p: 1000 #  setting for neighborhood function(s) 
 66 | clocq_use_api: True # using CLOCQClientInterface
 67 | clocq_host: "https://clocq.mpi-inf.mpg.de/api" # host for client
 68 | clocq_port: "443" # port for client
 69 | 
 70 | #################################################################
 71 | #  Parameters - Silver annotation
 72 | #################################################################
 73 | # annotation - SR
 74 | sr_relation_shared_active: True
 75 | sr_remove_stopwords: True
 76 | 
 77 | # OPTIONAL: annotation - turn relevance 
 78 | tr_transitive_relevances: False
 79 | tr_extract_dataset: True
 80 | 
 81 | #################################################################
 82 | #  Parameters - QU
 83 | #################################################################
 84 | qres_input_separator: "[SEP]"
 85 | qres_model_id: "convmix_qres"
 86 | qres_model_dir: "_data/convmix/qres/"
 87 | 
 88 | #################################################################
 89 | #  Parameters - ERS
 90 | #################################################################  
 91 | # cache path
 92 | ers_use_cache: True
 93 | ers_cache_path: "_data/convmix/qres/er_cache.pickle"
 94 | ers_wikipedia_dump: "_data/convmix/wikipedia_dump.pickle"
 95 | ers_on_the_fly: True
 96 | 
 97 | # evidence retrieval
 98 | evr_min_evidence_length: 3
 99 | evr_max_evidence_length: 200
100 | evr_max_entities: 10 # max entities per evidence
101 | evr_max_pos_evidences: 10
102 | 
103 | # evidence scoring
104 | evs_max_evidences: 100
105 | 
106 | #################################################################
107 | #  Parameters - HA
108 | #################################################################
109 | # general
110 | ha_max_answers: 50
111 | 
112 | fid_model_path: "_data/convmix/qres/fid/best_dev"
113 | fid_per_gpu_batch_size: 1
114 | fid_max_evidences: 100
115 | 
116 | # train
117 | fid_lr: 0.00005
118 | fid_optim: adamw
119 | fid_scheduler: linear
120 | fid_weight_decay: 0.01
121 | fid_text_maxlength: 250
122 | fid_answer_maxlength: 10
123 | fid_total_step: 15000
124 | fid_warmup_step: 1000
125 | 
126 | # inference
127 | fid_max_evidences: 100
128 | fid_num_beams: 20


--------------------------------------------------------------------------------
/convinse/heterogeneous_answering/fid_module/fid_utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import json
  4 | import random
  5 | import logging
  6 | 
  7 | from pathlib import Path
  8 | 
  9 | from convinse.library.utils import get_config
 10 | from convinse.evaluation import evidence_has_answer, question_is_existential
 11 | 
 12 | 
 13 | def prepare_turn(config, input_turn, output_path, train=False):
 14 |     """
 15 |     Prepare the given turn for input into FiD.
 16 |     Input will be top-100 evidences per question
 17 |     as predicted by ERS stage.
 18 |     Writes the result in the given output path.
 19 |     """
 20 |     # create output dir
 21 |     output_dir = os.path.dirname(output_path)
 22 |     Path(output_dir).mkdir(parents=True, exist_ok=True)
 23 | 
 24 |     # prepare
 25 |     res = _prepare_turn(config, input_turn, train)
 26 |     if res is None:
 27 |         sr = input_turn["structured_representation"]
 28 |         raise Exception(f"No evidences found for this turn! SR: {sr}.")
 29 |     
 30 |     # store
 31 |     with open(output_path, "w") as fp:
 32 |         fp.write(json.dumps(res))
 33 |         fp.write("\n")
 34 | 
 35 | 
 36 | def prepare_data(config, input_turns, output_path, train=False):
 37 |     """
 38 |     Prepare the given data for input into FiD.
 39 |     Input will be top-100 evidences per question
 40 |     as predicted by ERS stage.
 41 |     """
 42 |     # create output dir
 43 |     output_dir = os.path.dirname(output_path)
 44 |     Path(output_dir).mkdir(parents=True, exist_ok=True)
 45 | 
 46 |     # process data
 47 |     with open(output_path, "w") as fp_out:
 48 |         # transform
 49 |         for turn in input_turns:
 50 |             # skip instances that are already processed
 51 |             if not turn.get("pred_answers") is None:
 52 |                 continue
 53 | 
 54 |             res = _prepare_turn(config, turn, train)
 55 |             # skip turns for which no evidences were found
 56 |             if res is None:
 57 |                 continue
 58 | 
 59 |             # write new
 60 |             fp_out.write(json.dumps(res))
 61 |             fp_out.write("\n")
 62 | 
 63 | 
 64 | def _prepare_turn(config, input_turn, train):
 65 |     """
 66 |     Prepare the given turn for input into FiD.
 67 |     Input will be top-100 evidences per question
 68 |     as predicted by ERS stage.
 69 |     Returns the object. For internal usage!
 70 |     """
 71 |     # construct set of answers that are present (from silver evidences)
 72 |     answer_ids = [answer["id"] for answer in input_turn["answers"]]
 73 | 
 74 |     # prepare target answers
 75 |     target_answers = set()
 76 |     # retrieve target answers from answering evidences -> preserve order!
 77 |     evidences = input_turn["top_evidences"]
 78 |     for evidence in evidences:
 79 |         if evidence_has_answer(evidence, input_turn["answers"]):
 80 |             for disambiguation in evidence["disambiguations"]:
 81 |                 if disambiguation[1] in answer_ids:
 82 |                     target_answers.add(disambiguation[0])
 83 | 
 84 |     # if no answer can be found, skip instance during train/dev!
 85 |     if train and not target_answers:
 86 |         return None
 87 | 
 88 |     # if no answer in dataset, skip (fix for TimeQuestions dataset)
 89 |     if not input_turn["answers"]:
 90 |         return None
 91 | 
 92 |     evidences = input_turn["top_evidences"]
 93 |     evidences = evidences[: config["fid_max_evidences"]]
 94 | 
 95 |     # create data
 96 |     answers = list(target_answers) + [answer["label"] for answer in input_turn["answers"]]
 97 |     target_answer = answers[0]  # always first element of target_answers
 98 |     evidences = [
 99 |         {"title": evidence["retrieved_for_entity"]["label"], "text": evidence["evidence_text"]}
100 |         for evidence in input_turn["top_evidences"]
101 |     ]
102 | 
103 |     # if there are no evidences, return None (=skip instance)
104 |     if evidences == []:
105 |         return None
106 | 
107 |     # return transformed instance
108 |     return {
109 |         "id": input_turn["question_id"],
110 |         "question": input_turn["structured_representation"],
111 |         "target": target_answer,
112 |         "answers": answers,
113 |         "ctxs": evidences,
114 |     }


--------------------------------------------------------------------------------
/convinse/question_understanding/question_rewriting/dataset_question_rewriting.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import torch
  4 | 
  5 | from convinse.library.string_library import StringLibrary as string_lib
  6 | from convinse.library.utils import extract_mapping_incomplete_complete
  7 | 
  8 | def input_to_text(history_turns, current_turn, history_separator):
  9 |     """
 10 |     Transform the relevant turns and current turn into the input text.
 11 |     """
 12 |     history_text = history_separator.join(
 13 |         [_history_turn_to_text(history_turn, history_separator) for history_turn in history_turns]
 14 |     )
 15 | 
 16 |     # create input
 17 |     current_question = current_turn["question"]
 18 |     input_text = f"{history_text}{history_separator}{current_question}"
 19 |     return input_text
 20 | 
 21 | 
 22 | def _history_turn_to_text(history_turn, history_separator):
 23 |     """
 24 |     Transform the given history turn to text.
 25 |     """
 26 |     question = history_turn["question"]
 27 |     answers = history_turn["answers"]
 28 |     answers_text = " ".join([answer["label"] for answer in answers])
 29 |     history_turn_text = f"{question}{history_separator}{answers_text}"
 30 |     return history_turn_text
 31 |     
 32 | 
 33 | class DatasetQuestionRewriting(torch.utils.data.Dataset):
 34 |     def __init__(self, config, tokenizer, path):
 35 |         self.config = config
 36 |         self.tokenizer = tokenizer
 37 |         self.history_separator = config["history_separator"]
 38 | 
 39 |         benchmark_path = config["benchmark_path"]
 40 |         train_path = os.path.join(benchmark_path, config["train_input_path"])
 41 |         dev_path = os.path.join(benchmark_path, config["dev_input_path"])
 42 |         data_paths = [train_path, dev_path]
 43 |         self.mapping_incomplete_to_complete = extract_mapping_incomplete_complete(data_paths)
 44 | 
 45 |         input_encodings, output_encodings, dataset_length = self._load_data(path)
 46 |         self.input_encodings = input_encodings
 47 |         self.output_encodings = output_encodings
 48 |         self.dataset_length = dataset_length
 49 | 
 50 |     def __getitem__(self, idx):
 51 |         item = {key: torch.tensor(val[idx]) for key, val in self.input_encodings.items()}
 52 |         labels = self.output_encodings["input_ids"][idx]
 53 |         item = {
 54 |             "input_ids": item["input_ids"],
 55 |             "attention_mask": item["attention_mask"],
 56 |             "labels": labels,
 57 |         }
 58 |         return item
 59 | 
 60 |     def __len__(self):
 61 |         return self.dataset_length
 62 | 
 63 |     def _load_data(self, path):
 64 |         """
 65 |         Opens the file, and loads the data into
 66 |         a format that can be put into the model.
 67 | 
 68 |         The whole history is given as input.
 69 |         The complete question, as annotated in the dataset,
 70 |         is the gold output.
 71 |         """
 72 |         # open data
 73 |         with open(path, "r") as fp:
 74 |             dataset = json.load(fp)
 75 | 
 76 |         inputs = list()
 77 |         outputs = list()
 78 | 
 79 |         for conversation in dataset:
 80 |             history = list()
 81 |             for turn in conversation["questions"]:
 82 |                 # skip initial turn: no rewrite required!
 83 |                 if turn["turn"] == 0:
 84 |                     continue
 85 | 
 86 |                 # create input
 87 |                 inputs.append(input_to_text(history, turn, self.history_separator))
 88 | 
 89 |                 # create output
 90 |                 question = turn["question"]
 91 |                 complete = self.mapping_incomplete_to_complete.get(question)
 92 |                 outputs.append(complete)
 93 | 
 94 |                 # append to history
 95 |                 history.append(turn)
 96 | 
 97 |         input_encodings = self.tokenizer(
 98 |             inputs, padding=True, truncation=True, max_length=self.config["qrew_max_input_length"]
 99 |         )
100 |         output_encodings = self.tokenizer(
101 |             outputs, padding=True, truncation=True, max_length=self.config["qrew_max_input_length"]
102 |         )
103 |         dataset_length = len(inputs)
104 | 
105 |         return input_encodings, output_encodings, dataset_length
106 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/question_rewriting/question_rewriting_model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import transformers
  4 | 
  5 | from pathlib import Path
  6 | import convinse.question_understanding.question_rewriting.dataset_question_rewriting as dataset
  7 | 
  8 | 
  9 | class QuestionRewritingModel(torch.nn.Module):
 10 |     def __init__(self, config):
 11 |         super(QuestionRewritingModel, self).__init__()
 12 |         self.config = config
 13 |         self.model = transformers.T5ForConditionalGeneration.from_pretrained(
 14 |             "castorini/t5-base-canard"
 15 |         )
 16 |         self.tokenizer = transformers.T5TokenizerFast.from_pretrained("castorini/t5-base-canard")
 17 | 
 18 |     def set_eval_mode(self):
 19 |         """Set model to eval mode."""
 20 |         self.model.eval()
 21 | 
 22 |     def save(self):
 23 |         """Save model."""
 24 |         model_path = self.config["qrew_model_path"]
 25 |         # create dir if not exists
 26 |         model_dir = os.path.dirname(model_path)
 27 |         Path(model_dir).mkdir(parents=True, exist_ok=True)
 28 |         torch.save(self.model.state_dict(), model_path)
 29 | 
 30 |     def load(self):
 31 |         """Load model."""
 32 |         state_dict = torch.load(self.config["qrew_model_path"])
 33 |         self.model.load_state_dict(state_dict)
 34 |         # move to GPU (if possible)
 35 |         if torch.cuda.is_available():
 36 |             self.model = self.model.cuda()
 37 | 
 38 |     def train(self, train_path, dev_path):
 39 |         """Train model."""
 40 |         # load datasets
 41 |         train_dataset = dataset.DatasetQuestionRewriting(self.config, self.tokenizer, train_path)
 42 |         dev_dataset = dataset.DatasetQuestionRewriting(self.config, self.tokenizer, dev_path)
 43 |         # arguments for training
 44 |         training_args = transformers.Seq2SeqTrainingArguments(
 45 |             output_dir="convinse/question_understanding/question_rewriting/results",  # output directory
 46 |             num_train_epochs=self.config[
 47 |                 "qrew_num_train_epochs"
 48 |             ],  # total number of training epochs
 49 |             per_device_train_batch_size=self.config[
 50 |                 "qrew_per_device_train_batch_size"
 51 |             ],  # batch size per device during training
 52 |             per_device_eval_batch_size=self.config[
 53 |                 "qrew_per_device_eval_batch_size"
 54 |             ],  # batch size for evaluation
 55 |             warmup_steps=self.config[
 56 |                 "qrew_warmup_steps"
 57 |             ],  # number of warmup steps for learning rate scheduler
 58 |             weight_decay=self.config["qrew_weight_decay"],  # strength of weight decay
 59 |             logging_dir="convinse/question_understanding/question_rewriting/logs",  # directory for storing logs
 60 |             logging_steps=1000,
 61 |             evaluation_strategy="epoch",
 62 |             save_strategy="epoch",
 63 |             load_best_model_at_end="True"
 64 |             # predict_with_generate=True
 65 |         )
 66 |         # create the object for training
 67 |         trainer = transformers.Seq2SeqTrainer(
 68 |             model=self.model,
 69 |             args=training_args,
 70 |             train_dataset=train_dataset,
 71 |             eval_dataset=dev_dataset,
 72 |         )
 73 |         # training progress
 74 |         trainer.train()
 75 |         # store model
 76 |         self.save()
 77 | 
 78 |     def inference(self, inputs):
 79 |         """
 80 |         Run the model on the given input.
 81 |         Snippet taken from: https://github.com/gonced8/rachael-scai/blob/main/demo.py
 82 |         """
 83 |         # encode
 84 |         rewrite_input_ids = self.tokenizer.encode(
 85 |             inputs,
 86 |             truncation=False,
 87 |             return_tensors="pt",
 88 |         )
 89 |         if torch.cuda.is_available():
 90 |             rewrite_input_ids = rewrite_input_ids.cuda()
 91 |         # generate
 92 |         output = self.model.generate(
 93 |             rewrite_input_ids,
 94 |             max_length=self.config["qrew_max_output_length"],
 95 |             do_sample=self.config["qrew_do_sample"],
 96 |         )
 97 |         # decoding
 98 |         model_rewrite = self.tokenizer.batch_decode(
 99 |             output,
100 |             skip_special_tokens=True,
101 |             clean_up_tokenization_spaces=True,
102 |         )[0]
103 |         return model_rewrite
104 | 


--------------------------------------------------------------------------------
/config/convmix/convinse.yml:
--------------------------------------------------------------------------------
  1 | name: "convinse"
  2 | log_level: "INFO"
  3 | 
  4 | # Construct pipeline
  5 | qu: sr
  6 | ers: clocq_bm25
  7 | ha: fid
  8 | 
  9 | # Define source combinations
 10 | source_combinations:
 11 |   - - kb
 12 |   - - text
 13 |   - - table
 14 |   - - info
 15 |   - - kb
 16 |     - text
 17 |   - - kb
 18 |     - table
 19 |   - - kb 
 20 |     - info
 21 |   - - text
 22 |     - table
 23 |   - - text 
 24 |     - info
 25 |   - - table
 26 |     - info
 27 |   - - kb
 28 |     - text
 29 |     - table
 30 |     - info
 31 | 
 32 | #################################################################
 33 | #  General file paths
 34 | #################################################################
 35 | path_to_stopwords: "_data/stopwords.txt"
 36 | path_to_labels: "_data/labels.json"
 37 | path_to_wikipedia_mappings: "_data/wikipedia_mappings.json"
 38 | path_to_wikidata_mappings: "_data/wikidata_mappings.json"
 39 | 
 40 | #################################################################
 41 | #  Benchmark specific settings
 42 | #################################################################
 43 | benchmark: "convmix"
 44 | benchmark_path: "_benchmarks/convmix"
 45 | 
 46 | train_input_path: "train_set/train_set_ALL.json"
 47 | dev_input_path: "dev_set/dev_set_ALL.json"
 48 | test_input_path: "test_set/test_set_ALL.json"
 49 | 
 50 | path_to_annotated: "_intermediate_representations/convmix" # where annotated inputs come from
 51 | path_to_intermediate_results: "_intermediate_representations/convmix"
 52 | 
 53 | #################################################################
 54 | #  Parameters - CLOCQ
 55 | #################################################################
 56 | clocq_params:
 57 |   h_match: 0.4
 58 |   h_rel: 0.2
 59 |   h_conn: 0.3
 60 |   h_coh: 0.1
 61 |   d: 20
 62 |   k: "AUTO"
 63 |   p_setting: 1000 # setting for search_space function
 64 |   bm25_limit: False
 65 | clocq_p: 1000 #  setting for neighborhood function(s) 
 66 | clocq_use_api: True # using CLOCQClientInterface
 67 | clocq_host: "https://clocq.mpi-inf.mpg.de/api" # host for client
 68 | clocq_port: "443" # port for client
 69 | 
 70 | #################################################################
 71 | #  Parameters - Silver annotation
 72 | #################################################################
 73 | # annotation - SR
 74 | sr_relation_shared_active: True
 75 | sr_remove_stopwords: True
 76 | 
 77 | # OPTIONAL: annotation - turn relevance 
 78 | tr_transitive_relevances: False
 79 | tr_extract_dataset: False
 80 | 
 81 | #################################################################
 82 | #  Parameters - QU
 83 | #################################################################
 84 | sr_architecture: BART
 85 | sr_model_path: "_data/convmix/convinse/sr_model.bin"
 86 | sr_max_input_length: 512
 87 | 
 88 | history_separator: " ||| "
 89 | sr_separator: " || "
 90 | 
 91 | # training parameters
 92 | sr_num_train_epochs: 5
 93 | sr_per_device_train_batch_size: 10
 94 | sr_per_device_eval_batch_size: 10
 95 | sr_warmup_steps: 500
 96 | sr_weight_decay: 0.01
 97 | 
 98 | # generation parameters
 99 | sr_no_repeat_ngram_size: 2
100 | sr_num_beams: 20
101 | sr_early_stopping: True
102 | 
103 | sr_delimiter: "||"
104 | 
105 | #################################################################
106 | #  Parameters - ERS
107 | #################################################################  
108 | # cache path
109 | ers_use_cache: True
110 | ers_cache_path: "_data/convmix/convinse/er_cache.pickle"
111 | ers_wikipedia_dump: "_data/convmix/wikipedia_dump.pickle"
112 | ers_on_the_fly: True
113 | 
114 | # evidence retrieval
115 | evr_min_evidence_length: 3
116 | evr_max_evidence_length: 200
117 | evr_max_entities: 10 # max entities per evidence
118 | evr_max_pos_evidences: 10
119 | 
120 | # evidence scoring
121 | evs_max_evidences: 100
122 | 
123 | #################################################################
124 | #  Parameters - HA
125 | #################################################################
126 | # general
127 | ha_max_answers: 50
128 | 
129 | fid_model_path: "_data/convmix/convinse/fid/best_dev"
130 | fid_per_gpu_batch_size: 1
131 | fid_max_evidences: 100
132 | 
133 | # train
134 | fid_lr: 0.00005
135 | fid_optim: adamw
136 | fid_scheduler: linear
137 | fid_weight_decay: 0.01
138 | fid_text_maxlength: 250
139 | fid_answer_maxlength: 10
140 | fid_total_step: 15000
141 | fid_warmup_step: 1000
142 | 
143 | # inference
144 | fid_max_evidences: 100
145 | fid_num_beams: 20
146 | 
147 | 
148 | 


--------------------------------------------------------------------------------
/config/convmix/qrew-clocq_bm25-fid.yml:
--------------------------------------------------------------------------------
  1 | name: "qrew-clocq_bm25-fid"
  2 | log_level: "INFO"
  3 | 
  4 | # Construct pipeline
  5 | qu: qrew
  6 | ers: clocq_bm25
  7 | ha: fid
  8 | 
  9 | # Define source combinations
 10 | source_combinations:
 11 |   - - kb
 12 |   - - text
 13 |   - - table
 14 |   - - info
 15 |   - - kb
 16 |     - text
 17 |   - - kb
 18 |     - table
 19 |   - - kb 
 20 |     - info
 21 |   - - text
 22 |     - table
 23 |   - - text 
 24 |     - info
 25 |   - - table
 26 |     - info
 27 |   - - kb
 28 |     - text
 29 |     - table
 30 |     - info
 31 | 
 32 | #################################################################
 33 | #  General file paths
 34 | #################################################################
 35 | path_to_stopwords: "_data/stopwords.txt"
 36 | path_to_labels: "_data/labels.json"
 37 | path_to_wikipedia_mappings: "_data/wikipedia_mappings.json"
 38 | path_to_wikidata_mappings: "_data/wikidata_mappings.json"
 39 | 
 40 | #################################################################
 41 | #  Benchmark specific settings
 42 | #################################################################
 43 | benchmark: "convmix"
 44 | benchmark_path: "_benchmarks/convmix"
 45 | seed_conversations_path: "_benchmarks/convmix/ConvMixSeed.json"
 46 | 
 47 | train_input_path: "train_set/train_set_ALL.json"
 48 | dev_input_path: "dev_set/dev_set_ALL.json"
 49 | test_input_path: "test_set/test_set_ALL.json"
 50 | 
 51 | path_to_annotated: "_intermediate_representations/convmix" # where annotated inputs come from
 52 | path_to_intermediate_results: "_intermediate_representations/convmix"
 53 | 
 54 | #################################################################
 55 | #  Parameters - CLOCQ
 56 | #################################################################
 57 | clocq_params:
 58 |   h_match: 0.4
 59 |   h_rel: 0.2
 60 |   h_conn: 0.3
 61 |   h_coh: 0.1
 62 |   d: 20
 63 |   k: "AUTO"
 64 |   p_setting: 1000 # setting for search_space function
 65 |   bm25_limit: False
 66 | clocq_p: 1000 #  setting for neighborhood function(s) 
 67 | clocq_use_api: True # using CLOCQClientInterface
 68 | clocq_host: "https://clocq.mpi-inf.mpg.de/api" # host for client
 69 | clocq_port: "443" # port for client
 70 | 
 71 | #################################################################
 72 | #  Parameters - Silver annotation
 73 | #################################################################
 74 | # annotation - SR
 75 | sr_relation_shared_active: True
 76 | sr_remove_stopwords: True
 77 | 
 78 | # OPTIONAL: annotation - turn relevance 
 79 | tr_transitive_relevances: False
 80 | tr_extract_dataset: True
 81 | 
 82 | #################################################################
 83 | #  Parameters - QU
 84 | #################################################################
 85 | qrew_model_path: "_data/convmix/qrew/qrew.bin"
 86 | qrew_max_input_length: 512
 87 | 
 88 | history_separator: " ||| "
 89 | 
 90 | # training parameters
 91 | qrew_num_train_epochs: 3
 92 | qrew_per_device_train_batch_size: 10
 93 | qrew_per_device_eval_batch_size: 10
 94 | qrew_warmup_steps: 500
 95 | qrew_weight_decay: 0.01
 96 | 
 97 | # generation parameters
 98 | qrew_no_repeat_ngram_size: 2
 99 | qrew_max_output_length: 100
100 | qrew_do_sample: True
101 | 
102 | #################################################################
103 | #  Parameters - ERS
104 | #################################################################  
105 | # cache path
106 | ers_use_cache: True
107 | ers_cache_path: "_data/convmix/qrew/er_cache.pickle"
108 | ers_wikipedia_dump: "_data/convmix/wikipedia_dump.pickle"
109 | ers_on_the_fly: True
110 | 
111 | # evidence retrieval
112 | evr_min_evidence_length: 3
113 | evr_max_evidence_length: 200
114 | evr_max_entities: 10 # max entities per evidence
115 | evr_max_pos_evidences: 10
116 | 
117 | # evidence scoring
118 | evs_max_evidences: 100
119 | 
120 | #################################################################
121 | #  Parameters - HA
122 | #################################################################
123 | # general
124 | ha_max_answers: 50
125 | 
126 | fid_model_path: "_data/convmix/qrew/fid/best_dev"
127 | fid_per_gpu_batch_size: 1
128 | fid_max_evidences: 100
129 | 
130 | # train
131 | fid_lr: 0.00005
132 | fid_optim: adamw
133 | fid_scheduler: linear
134 | fid_weight_decay: 0.01
135 | fid_text_maxlength: 250
136 | fid_answer_maxlength: 10
137 | fid_total_step: 15000
138 | fid_warmup_step: 1000
139 | 
140 | # inference
141 | fid_max_evidences: 100
142 | fid_num_beams: 20
143 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/naive_concat/naive_concat.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import json
  3 | import logging
  4 | 
  5 | from tqdm import tqdm
  6 | 
  7 | from pathlib import Path
  8 | from convinse.question_understanding.question_understanding import QuestionUnderstanding
  9 | from convinse.library.utils import get_config
 10 | 
 11 | 
 12 | class NaiveConcat(QuestionUnderstanding):
 13 |     """
 14 |     Prepend various parts of the ongoing conversation to the current turn.
 15 |     A turn refers to the question and the answers.
 16 |     Answers can be predicted answers or generated answers.
 17 |         - Option 1 (init): Prepend initial turn
 18 |         - Option 2 (prev): Prepend previous turn
 19 |         - Option 3 (init_prev): Prepend initial and previous turn
 20 |         - Option 4 (all): Prepend ALL previous turns
 21 |     The option can be set in the config file.
 22 |     """
 23 | 
 24 |     def inference_on_turn(self, turn, history_turns):
 25 |         """Run model on single turn and add predictions."""
 26 |         intent_explicit = self._preprend_history(history_turns, turn)
 27 |         turn["structured_representation"] = intent_explicit
 28 |         return turn
 29 | 
 30 |     def inference_on_conversation(self, conversation):
 31 |         """Run inference on a single conversation."""
 32 |         history_turns = list()
 33 |         for turn in conversation["questions"]:
 34 |             # concat history to question
 35 |             question = self._preprend_history(history_turns, turn)
 36 |             turn["structured_representation"] = question
 37 | 
 38 |             # append to history
 39 |             history_turns.append(turn)
 40 |         return conversation
 41 | 
 42 |     def _preprend_history(self, history_turns, current_turn):
 43 |         """
 44 |         Transform the relevant turns and current turn into the input text.
 45 |         """
 46 |         ## consider last turn and first turn only
 47 |         if self.config["naive_concat"] == "init_prev":
 48 |             if len(history_turns) > 2:
 49 |                 history_turns = [history_turns[0], history_turns[-1]]
 50 | 
 51 |         ## consider first turn only
 52 |         elif self.config["naive_concat"] == "init":
 53 |             if len(history_turns) > 1:
 54 |                 history_turns = [history_turns[0]]
 55 | 
 56 |         ## consider last turn only
 57 |         elif self.config["naive_concat"] == "prev":
 58 |             if len(history_turns) > 1:
 59 |                 history_turns = [history_turns[-1]]
 60 | 
 61 |         ## consider ALL turns
 62 |         elif self.config["naive_concat"] == "all":
 63 |             history_turns = history_turns
 64 | 
 65 |         ## consider only current turn
 66 |         elif self.config["naive_concat"] == "none":
 67 |             history_turns = []
 68 | 
 69 |         else:
 70 |             raise Exception("Unknown value for naive_concat!")
 71 | 
 72 |         # create history text
 73 |         history_text = " ".join(
 74 |             [self._history_turn_to_text(history_turn) for history_turn in history_turns]
 75 |         )
 76 | 
 77 |         # create input
 78 |         current_question = current_turn["question"]
 79 |         input_text = f"{history_text} {current_question}"
 80 |         return input_text
 81 | 
 82 |     def _history_turn_to_text(self, history_turn):
 83 |         """
 84 |         Transform the given history turn to text.
 85 |         """
 86 |         turn = history_turn["turn"]
 87 |         question = history_turn["question"]
 88 | 
 89 |         # use predicted answer in end-to-end evaluation
 90 |         if self.use_gold_answers:
 91 |             answers = history_turn["answers"]
 92 |             answers_text = ", ".join([answer["label"] for answer in answers])
 93 |         else:
 94 |             answer = history_turn["pred_answers"][0]
 95 |             answers_text = answer["label"]
 96 | 
 97 |         history_turn_text = f"{question} {answers_text}"
 98 |         return history_turn_text
 99 | 
100 | 
101 | #######################################################################################################################
102 | #######################################################################################################################
103 | if __name__ == "__main__":
104 |     if len(sys.argv) != 2:
105 |         raise Exception(
106 |             "Invalid number of options provided.\nUsage: python convinse/question_understanding/naive_concat/naive_concat.py <PATH_TO_CONFIG>"
107 |         )
108 | 
109 |     # load config
110 |     config_path = sys.argv[1]
111 |     config = get_config(config_path)
112 |     naive_concat = NaiveConcat(config, use_gold_answers=True)
113 |     naive_concat.inference()
114 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/structured_representation/dataset_structured_representation.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import torch
  3 | 
  4 | from convinse.library.string_library import StringLibrary as string_lib
  5 | 
  6 | 
  7 | def input_to_text(history_turns, current_turn, history_separator):
  8 |     """
  9 |     Transform the history turns and current turn into the input text.
 10 |     """
 11 |     # create history text
 12 |     history_text = history_separator.join(
 13 |         [_history_turn_to_text(history_turn, history_separator) for history_turn in history_turns]
 14 |     )
 15 | 
 16 |     # create input
 17 |     current_question = current_turn["question"]
 18 |     input_text = f"{history_text}{history_separator}{current_question}"
 19 |     return input_text
 20 | 
 21 | 
 22 | def _history_turn_to_text(history_turn, history_separator):
 23 |     """
 24 |     Transform the given history turn to text.
 25 |     """
 26 |     question = history_turn["question"]
 27 |     answers = history_turn["answers"]
 28 |     answers_text = " ".join([answer["label"] for answer in answers])
 29 |     history_turn_text = f"{question}{history_separator}{answers_text}"
 30 |     return history_turn_text
 31 | 
 32 | 
 33 | def output_to_text(silver_SR, SR_delimiter):
 34 |     """
 35 |     Transform the given silver abstract representation to text.
 36 |     The (recursive) list data structure is resolved and flattened.
 37 |     """
 38 |     sep = ", "
 39 |     topic, entities, relation, ans_type = silver_SR[0]
 40 | 
 41 |     # create individual components
 42 |     topic = " ".join(topic).strip()
 43 |     entities = " ".join(entities).strip()
 44 |     relation = " ".join(relation).strip()
 45 |     ans_type = ans_type.strip() if ans_type else ""
 46 | 
 47 |     # create ar text
 48 |     sr_text = f"{topic}{SR_delimiter}{entities}{SR_delimiter}{relation}{SR_delimiter}{ans_type}"
 49 | 
 50 |     # remove whitespaces in AR
 51 |     while "  " in sr_text:
 52 |         sr_text = sr_text.replace("  ", " ")
 53 |     sr_text.replace(" , ", ", ")
 54 |     sr_text = sr_text.strip()
 55 |     return sr_text
 56 | 
 57 | 
 58 | class DatasetStructuredRepresentation(torch.utils.data.Dataset):
 59 |     def __init__(self, config, tokenizer, path):
 60 |         self.config = config
 61 |         self.tokenizer = tokenizer
 62 |         self.history_separator = config["history_separator"]
 63 |         self.sr_separator = config["sr_separator"]
 64 | 
 65 |         input_encodings, output_encodings, dataset_length = self._load_data(path)
 66 |         self.input_encodings = input_encodings
 67 |         self.output_encodings = output_encodings
 68 |         self.dataset_length = dataset_length
 69 | 
 70 |     def __getitem__(self, idx):
 71 |         item = {key: torch.tensor(val[idx]) for key, val in self.input_encodings.items()}
 72 |         labels = self.output_encodings["input_ids"][idx]
 73 |         item = {
 74 |             "input_ids": item["input_ids"],
 75 |             "attention_mask": item["attention_mask"],
 76 |             "labels": labels,
 77 |         }
 78 |         return item
 79 | 
 80 |     def __len__(self):
 81 |         return self.dataset_length
 82 | 
 83 |     def _load_data(self, path):
 84 |         """
 85 |         Opens the file, and loads the data into
 86 |         a format that can be put into the model.
 87 | 
 88 |         The input dataset should be annotated using
 89 |         the silver_annotation.py class.
 90 | 
 91 |         The whole history is given as input.
 92 |         """
 93 |         # open data
 94 |         with open(path, "r") as fp:
 95 |             dataset = json.load(fp)
 96 | 
 97 |         inputs = list()
 98 |         outputs = list()
 99 | 
100 |         for conversation in dataset:
101 |             history = list()
102 |             for turn in conversation["questions"]:
103 |                 # skip examples for which no gold SR was found, or for first turn
104 |                 if not turn["silver_SR"]:
105 |                     continue
106 | 
107 |                 inputs.append(input_to_text(history, turn, self.history_separator))
108 |                 outputs.append(output_to_text(turn["silver_SR"], self.sr_separator))
109 | 
110 |                 # append to history
111 |                 history.append(turn)
112 | 
113 |         # encode
114 |         input_encodings = self.tokenizer(
115 |             inputs, padding=True, truncation=True, max_length=self.config["sr_max_input_length"]
116 |         )
117 |         output_encodings = self.tokenizer(
118 |             outputs, padding=True, truncation=True, max_length=self.config["sr_max_input_length"]
119 |         )
120 |         dataset_length = len(inputs)
121 | 
122 |         return input_encodings, output_encodings, dataset_length
123 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/question_rewriting/question_rewriting_module.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import logging
  4 | 
  5 | from convinse.library.utils import get_config, get_logger
  6 | from convinse.question_understanding.question_understanding import QuestionUnderstanding
  7 | from convinse.question_understanding.question_rewriting.question_rewriting_model import (
  8 |     QuestionRewritingModel,
  9 | )
 10 | import convinse.question_understanding.question_rewriting.dataset_question_rewriting as dataset
 11 | 
 12 | 
 13 | class QuestionRewritingModule(QuestionUnderstanding):
 14 |     def __init__(self, config, use_gold_answers):
 15 |         """Initialize QR module."""
 16 |         self.config = config
 17 |         self.logger = get_logger(__name__, config)
 18 |         self.use_gold_answers = use_gold_answers
 19 |         
 20 |         # create model
 21 |         self.qr_model = QuestionRewritingModel(config)
 22 |         self.model_loaded = False
 23 | 
 24 |         self.history_separator = config["history_separator"]
 25 | 
 26 |     def train(self):
 27 |         """Train the model on silver AR data."""
 28 |         # create paths
 29 |         self.logger.info(f"Starting training...")
 30 |         data_dir = self.config["path_to_annotated"]
 31 |         train_path = os.path.join(data_dir, "annotated_train.json")
 32 |         dev_path = os.path.join(data_dir, "annotated_dev.json")
 33 |         self.qr_model.train(train_path, dev_path)
 34 |         self.logger.info(f"Finished training.")
 35 | 
 36 |     def inference_on_conversation(self, conversation):
 37 |         """Run inference on a single conversation."""
 38 |         # load QR model (if required)
 39 |         self._load()
 40 | 
 41 |         # QR model inference
 42 |         history_turns = list()
 43 |         for i, turn in enumerate(conversation["questions"]):
 44 |             # append to history
 45 |             question = turn["question"]
 46 |             history_turns.append(question)
 47 | 
 48 |             # prepare input (omitt gold answer(s))
 49 |             rewrite_input = self.history_separator.join(history_turns)
 50 | 
 51 |             # run inference
 52 |             qrew = self.qr_model.inference(rewrite_input)
 53 |             turn["structured_representation"] = qrew
 54 | 
 55 |             # only append answer if there is a next question
 56 |             if i + 1 < len(conversation["questions"]):
 57 |                 if self.use_gold_answers:
 58 |                     answer_text = " ".join([answer["label"] for answer in turn["answers"]])
 59 |                 else:
 60 |                     # answer_text = ", ".join([answer["label"] for answer in turn["pred_answers"]])
 61 |                     answer_text = turn["pred_answers"][0]["label"]
 62 |                 history_turns.append(answer_text)
 63 |         return conversation
 64 | 
 65 |     def inference_on_turn(self, turn, history_turns):
 66 |         """Run inference on a single turn (and history)."""
 67 |         # load QR model (if required)
 68 |         self._load()
 69 | 
 70 |         # SR model inference
 71 |         question = turn["question"]
 72 |         history_turns.append(question)
 73 | 
 74 |         # prepare input (omitt gold answer(s))
 75 |         rewrite_input = self.history_separator.join(history_turns)
 76 | 
 77 |         # run inference
 78 |         intent_explicit = self.qr_model.inference(rewrite_input)
 79 |         turn["structured_representation"] = intent_explicit
 80 |         return turn
 81 | 
 82 |     def _load(self):
 83 |         """Load the QRes model."""
 84 |         # only load if not already done so
 85 |         if not self.model_loaded:
 86 |             self.qr_model.load()
 87 |             self.qr_model.set_eval_mode()
 88 |             self.model_loaded = True
 89 | 
 90 | 
 91 | #######################################################################################################################
 92 | #######################################################################################################################
 93 | if __name__ == "__main__":
 94 |     if len(sys.argv) != 3:
 95 |         raise Exception(
 96 |             "Invalid number of options provided.\nUsage: python convinse/question_understanding/question_rewriting/question_rewriting_module.py <PATH_TO_CONFIG>"
 97 |         )
 98 | 
 99 |     function = sys.argv[1]
100 |     config_path = sys.argv[2]
101 |     config = get_config(config_path)
102 | 
103 |     # train: train model
104 |     if function == "--train":
105 |         qrm = QuestionRewritingModule(config, use_gold_answers=True)
106 |         qrm.train()
107 | 
108 |     # inference: add predictions to data
109 |     elif function == "--inference":
110 |         # load config
111 |         qrm = QuestionRewritingModule(config, use_gold_answers=True)
112 |         qrm.inference()
113 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/question_resolution/question_resolution_module.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import json
  4 | import logging
  5 | 
  6 | from subprocess import Popen, PIPE
  7 | 
  8 | from convinse.library.utils import get_config, get_logger, store_json_with_mkdir
  9 | from convinse.question_understanding.question_understanding import QuestionUnderstanding
 10 | import convinse.question_understanding.question_resolution.question_resolution_utils as qres_utils
 11 | 
 12 | 
 13 | class QuestionResolutionModule(QuestionUnderstanding):
 14 |     def __init__(self, config, use_gold_answers):
 15 |         """Initialize SR module."""
 16 |         self.config = config
 17 |         self.logger = get_logger(__name__, config)
 18 |         self.use_gold_answers = use_gold_answers
 19 |         self.path_to_quretec = "convinse/question_understanding/question_resolution/quretec"
 20 |         self.model_id = config["qres_model_id"]
 21 |         self.model_dir = config["qres_model_dir"]
 22 | 
 23 |     def train(self):
 24 |         """Train the model on silver SR data."""
 25 |         # train model
 26 |         self.logger.info(f"Starting training...")
 27 |         input_dir = self.config["path_to_annotated"]
 28 |         train_path = os.path.join(input_dir, "annotated_train.json")
 29 |         dev_path = os.path.join(input_dir, "annotated_dev.json")
 30 |         qres_utils.prepare_data_for_training(self.config, train_path, dev_path)
 31 |         benchmark = self.config["benchmark"]
 32 |         data_dir = f"_intermediate_representations/{benchmark}/qres/data"
 33 | 
 34 |         # run training
 35 |         COMMAND = ["python", f"{self.path_to_quretec}/run_ner.py"]
 36 |         COMMAND += ["--task_name", "ner"]
 37 |         COMMAND += ["--bert_model", "bert-large-uncased"]
 38 |         COMMAND += ["--max_seq_length", "300"]
 39 |         COMMAND += ["--train_batch_size", "20"]
 40 |         COMMAND += ["--train_on", "train"]
 41 |         COMMAND += ["--hidden_dropout_prob", "0.4"]
 42 |         COMMAND += ["--dev_on", "dev"]
 43 |         COMMAND += ["--do_train"]
 44 |         COMMAND += ["--data_dir", data_dir]
 45 |         COMMAND += ["--base_dir", self.model_dir]
 46 |         COMMAND += ["--model_id", self.model_id]
 47 |         process = Popen(COMMAND, stdout=sys.stdout, stderr=sys.stderr)
 48 |         self.logger.info(f"Finished training.")
 49 | 
 50 |     def inference_on_data(self, input_data):
 51 |         """Run model on data and add predictions."""
 52 |         benchmark = self.config["benchmark"]
 53 |         data_dir = f"_intermediate_representations/{benchmark}/qres/data"
 54 |         output_path = os.path.join(data_dir, "data_for_inference.json")
 55 | 
 56 |         # model inference on given data
 57 |         qres_utils.prepare_data_for_inference(
 58 |             self.config, input_data, output_path, use_gold_answers=self.use_gold_answers
 59 |         )
 60 |         self._inference()
 61 | 
 62 |         # postprocess predictions
 63 |         quretec_pred_path = os.path.join(
 64 |             self.model_dir, self.model_id, "eval_results_data_for_inference_epoch0.json"
 65 |         )
 66 |         qres_utils.postprocess_data(input_data, quretec_pred_path)
 67 |         return input_data
 68 | 
 69 |     def inference_on_turn(self, turn, history_turns):
 70 |         """Run inference on a single turn (and history)."""
 71 |         if not history_turns:
 72 |             turn["structured_representation"] = turn["question"]
 73 |             return turn
 74 | 
 75 |         benchmark = self.config["benchmark"]
 76 |         data_dir = f"_intermediate_representations/{benchmark}/qres/data"
 77 |         output_path = os.path.join(data_dir, "data_for_inference.json")
 78 |         
 79 |         # model inference on given data
 80 |         qres_utils.prepare_turn_for_inference(self.config, turn, history_turns, output_path, self.use_gold_answers)
 81 |         self._inference()
 82 | 
 83 |         # postprocess predictions
 84 |         quretec_pred_path = os.path.join(
 85 |             self.model_dir, self.model_id, "eval_results_data_for_inference_epoch0.json"
 86 |         )
 87 |         qres_utils.postprocess_turn(turn, quretec_pred_path)
 88 |         return turn
 89 | 
 90 |     def _inference(self):
 91 |         """Run QuReTeC model on given input via separate script."""
 92 |         benchmark = self.config["benchmark"]
 93 |         data_dir = f"_intermediate_representations/{benchmark}/qres/data"
 94 | 
 95 |         # run inference
 96 |         COMMAND = ["python", f"{self.path_to_quretec}/run_ner.py"]
 97 |         COMMAND += ["--task_name", "ner"]
 98 |         COMMAND += ["--do_eval"]
 99 |         COMMAND += ["--do_lower_case"]
100 |         COMMAND += ["--data_dir", data_dir]
101 |         COMMAND += ["--base_dir", self.model_dir]
102 |         COMMAND += ["--dev_on", "data_for_inference"]
103 |         COMMAND += ["--model_id", self.model_id]
104 |         COMMAND += ["--no_cuda"]
105 |         process = Popen(COMMAND, stdout=sys.stdout, stderr=sys.stderr)
106 |         process.communicate()
107 | 
108 | 
109 | #######################################################################################################################
110 | #######################################################################################################################
111 | if __name__ == "__main__":
112 |     if len(sys.argv) != 3:
113 |         raise Exception(
114 |             "Usage: python convinse/question_understanding/question_resolution/question_resolution_module.py --<FUNCTION> <PATH_TO_CONFIG>"
115 |         )
116 | 
117 |     function = sys.argv[1]
118 |     config_path = sys.argv[2]
119 |     config = get_config(config_path)
120 | 
121 |     # train: train model
122 |     if function == "--train":
123 |         qrm = QuestionResolutionModule(config, use_gold_answers=True)
124 |         qrm.train()
125 | 
126 |     # inference: add predictions to data
127 |     elif function == "--inference":
128 |         # load config
129 |         qrm = QuestionResolutionModule(config, use_gold_answers=True)
130 |         qrm.inference()
131 | 


--------------------------------------------------------------------------------
/convinse/library/custom_trainer.py:
--------------------------------------------------------------------------------
  1 | import collections
  2 | import inspect
  3 | import math
  4 | import os
  5 | import random
  6 | import re
  7 | import shutil
  8 | import sys
  9 | import time
 10 | import json
 11 | import warnings
 12 | from logging import StreamHandler
 13 | from pathlib import Path
 14 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 15 | 
 16 | from tqdm.auto import tqdm
 17 | 
 18 | import numpy as np
 19 | import torch
 20 | from packaging import version
 21 | from torch import nn
 22 | from torch.utils.data.dataloader import DataLoader
 23 | from torch.utils.data.dataset import Dataset, IterableDataset
 24 | from torch.utils.data.distributed import DistributedSampler
 25 | from torch.utils.data.sampler import RandomSampler, SequentialSampler
 26 | 
 27 | from transformers import Trainer
 28 | from transformers.trainer_utils import speed_metrics
 29 | from transformers.debug_utils import DebugOption, DebugUnderflowOverflow
 30 | 
 31 | 
 32 | class CustomTrainer(Trainer):
 33 |     def __init__(
 34 |         self,
 35 |         model,
 36 |         args=None,
 37 |         data_collator=None,
 38 |         train_dataset=None,
 39 |         eval_dataset=None,
 40 |         tokenizer=None,
 41 |         model_init=None,
 42 |         compute_metrics=None,
 43 |         callbacks=None,
 44 |         optimizers=(None, None),
 45 |         path_to_best_model="models/model",
 46 |     ):
 47 |         super().__init__(
 48 |             model,
 49 |             args,
 50 |             data_collator,
 51 |             train_dataset,
 52 |             eval_dataset,
 53 |             tokenizer,
 54 |             model_init,
 55 |             compute_metrics,
 56 |             callbacks,
 57 |             optimizers,
 58 |         )
 59 |         self.path_to_best_model = path_to_best_model
 60 | 
 61 |     def evaluate(
 62 |         self,
 63 |         train_dataset=None,
 64 |         eval_dataset: Optional[Dataset] = None,
 65 |         ignore_keys: Optional[List[str]] = None,
 66 |         metric_key_prefix: str = "eval",
 67 |     ) -> Dict[str, float]:
 68 | 
 69 |         self._memory_tracker.start()
 70 | 
 71 |         train_dataloader = self.get_train_dataloader()
 72 |         eval_dataloader = self.get_eval_dataloader(eval_dataset)
 73 |         start_time = time.time()
 74 | 
 75 |         eval_loop = (
 76 |             self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
 77 |         )
 78 | 
 79 |         train_output = eval_loop(
 80 |             train_dataloader,
 81 |             description="Evaluation on train",
 82 |             # No point gathering the predictions if there are no metrics, otherwise we defer to
 83 |             # self.args.prediction_loss_only
 84 |             prediction_loss_only=True if self.compute_metrics is None else None,
 85 |             ignore_keys=ignore_keys,
 86 |             metric_key_prefix="train",
 87 |         )
 88 | 
 89 |         eval_output = eval_loop(
 90 |             eval_dataloader,
 91 |             description="Evaluation on dev",
 92 |             # No point gathering the predictions if there are no metrics, otherwise we defer to
 93 |             # self.args.prediction_loss_only
 94 |             prediction_loss_only=True if self.compute_metrics is None else None,
 95 |             ignore_keys=ignore_keys,
 96 |             metric_key_prefix="eval",
 97 |         )
 98 | 
 99 |         total_batch_size = self.args.eval_batch_size * self.args.world_size
100 | 
101 |         train_output.metrics.update(
102 |             speed_metrics(metric_key_prefix, start_time, train_output.num_samples)
103 |         )
104 | 
105 |         eval_output.metrics.update(
106 |             speed_metrics(metric_key_prefix, start_time, eval_output.num_samples)
107 |         )
108 | 
109 |         self.log(train_output.metrics)
110 |         self.log(eval_output.metrics)
111 | 
112 |         if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
113 |             # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
114 |             xm.master_print(met.metrics_report())
115 | 
116 |         self.control = self.callback_handler.on_evaluate(
117 |             self.args, self.state, self.control, train_output.metrics
118 |         )
119 |         self.control = self.callback_handler.on_evaluate(
120 |             self.args, self.state, self.control, eval_output.metrics
121 |         )
122 | 
123 |         self._memory_tracker.stop_and_update_metrics(train_output.metrics)
124 |         self._memory_tracker.stop_and_update_metrics(eval_output.metrics)
125 | 
126 |         dic = {
127 |             "Training metrics": train_output.metrics,
128 |             "Validation metrics": eval_output.metrics,
129 |         }
130 |         print(eval_output.metrics.keys())
131 |         eval_accuracy = eval_output.metrics["eval_accuracy"]
132 | 
133 |         # store model if performance improved
134 |         if self.state.best_model_checkpoint is None or eval_accuracy > self.state.best_metric:
135 |             self.state.best_model_checkpoint = self.state.global_step
136 |             self.state.best_metric = eval_accuracy
137 |             self._save_model(self.path_to_best_model)
138 |             self._store_metadata_best_model()
139 |         return dic
140 | 
141 |     def _store_metadata_best_model(self):
142 |         """
143 |         Store metadata of best model to .txt file.
144 |         """
145 |         # change extension of path
146 |         path, ext = os.path.splitext(self.path_to_best_model)
147 |         path_to_metadata = f"{path}.txt"
148 | 
149 |         # create metadata string
150 |         metadata = f"Best metric: {self.state.best_metric}, global_step: {self.state.best_model_checkpoint}"
151 | 
152 |         # store metadata
153 |         with open(path_to_metadata, "w") as fp:
154 |             fp.write(metadata)
155 | 
156 |     def _save_model(self, output_dir: Optional[str] = None):
157 |         """
158 |         Stores the best model found so far.
159 |         """
160 |         print("Storing best model")
161 |         super().save_model(output_dir)
162 | 


--------------------------------------------------------------------------------
/convinse/evaluation.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | 
  3 | from tqdm import tqdm
  4 | 
  5 | from convinse.library.string_library import StringLibrary
  6 | from Levenshtein import distance as levenshtein_distance
  7 | 
  8 | 
  9 | def answer_presence(evidences, answers):
 10 |     """
 11 |     Compute the answer presence for a set of evidences
 12 |     and a parsed answer dict, and return a list of
 13 |     answering evidences.
 14 |     Return format: (boolean, [evidence-dict, ...])
 15 |     """
 16 |     # initialize
 17 |     answer_present = False
 18 |     answering_evidences = list()
 19 | 
 20 |     # go through evidences
 21 |     for evidence in evidences:
 22 |         if evidence_has_answer(evidence, answers):
 23 |             # remember evidence
 24 |             answer_present = True
 25 |             answering_evidences.append(evidence)
 26 |     # return results
 27 |     return (answer_present, answering_evidences)
 28 | 
 29 | 
 30 | def evidence_has_answer(evidence, gold_answers):
 31 |     """Check whether the given evidence has any of the answers."""
 32 |     for answer_candidate in evidence["wikidata_entities"]:
 33 |         # check if answering candidate
 34 |         if candidate_in_answers(answer_candidate, gold_answers):
 35 |             return True
 36 |     return False
 37 | 
 38 | 
 39 | def candidate_in_answers(answer_candidate, gold_answers):
 40 |     """Check if candidate is answer."""
 41 |     # get ids
 42 |     answer_candidate_id = answer_candidate["id"]
 43 |     gold_answer_ids = [answer["id"] for answer in gold_answers]
 44 | 
 45 |     # normalize
 46 |     answer_candidate_id = answer_candidate_id.lower().strip().replace('"', "").replace("+", "")
 47 |     gold_answer_ids = [answer.lower().strip().replace('"', "") for answer in gold_answer_ids]
 48 | 
 49 |     # perform check
 50 |     if answer_candidate_id in gold_answer_ids:
 51 |         return True
 52 | 
 53 |     # no match found
 54 |     return False
 55 | 
 56 | 
 57 | def mrr_score(answers, gold_answers):
 58 |     """Compute MRR score for given answers and gold answers."""
 59 |     # check if any answer was given
 60 |     if not answers:
 61 |         return 0.0
 62 |     # go through answer candidates
 63 |     for answer in answers:
 64 |         if candidate_in_answers(answer["answer"], gold_answers):
 65 |             return 1.0 / float(answer["rank"])
 66 |     return 0.0
 67 | 
 68 | 
 69 | def precision_at_1(answers, gold_answers):
 70 |     """Compute P@1 score for given answers and gold answers."""
 71 |     # check if any answer was given
 72 |     if not answers:
 73 |         return 0.0
 74 |     # go through answer candidates
 75 |     for answer in answers:
 76 |         if float(answer["rank"]) > float(1.0):
 77 |             break
 78 |         elif candidate_in_answers(answer["answer"], gold_answers):
 79 |             return 1.0
 80 |     return 0.0
 81 | 
 82 | 
 83 | def hit_at_5(answers, gold_answers):
 84 |     """Compute Hit@5 score for given answers and gold answers."""
 85 |     # check if any answer was given
 86 |     if not answers:
 87 |         return 0.0
 88 |     # go through answer candidates
 89 |     for answer in answers:
 90 |         if float(answer["rank"]) > float(5.0):
 91 |             break
 92 |         elif candidate_in_answers(answer["answer"], gold_answers):
 93 |             return 1.0
 94 |     return 0.0
 95 | 
 96 | 
 97 | def get_ranked_answers(config, generated_answer, turn):
 98 |     """
 99 |     Convert the predicted answer text to a Wikidata ID (or Yes/No),
100 |     and return the ranked answers.
101 |     Can be used for any method that predicts an answer string (instead of a KB item).
102 |     """
103 |     # check if existential (special treatment)
104 |     question = turn["question"]
105 |     if question_is_existential(question):
106 |         ranked_answers = [
107 |             {"answer": {"id": "yes", "label": "yes"}, "score": 1.0, "rank": 1},
108 |             {"answer": {"id": "no", "label": "no"}, "score": 0.5, "rank": 2},
109 |         ]
110 |     # no existential
111 |     else:
112 |         # return dummy answer in case None was found (if no evidences found)
113 |         if generated_answer is None:
114 |             return [{"answer": {"id": "None", "label": "None"}, "rank": 1, "score": 0.0}]
115 |         smallest_diff = 100000
116 |         all_answers = list()
117 |         mentions = set()
118 |         for evidence in turn["top_evidences"]:
119 |             for disambiguation in evidence["disambiguations"]:
120 |                 mention = disambiguation[0]
121 |                 id = disambiguation[1]
122 |                 if id is None or id == False:
123 |                     continue
124 | 
125 |                 # skip duplicates
126 |                 ans = str(mention) + str(id)
127 |                 if ans in mentions:
128 |                     continue
129 |                 mentions.add(ans)
130 |                 # exact match
131 |                 if generated_answer == mention:
132 |                     diff = 0
133 |                 # otherwise compute edit distance
134 |                 else:
135 |                     diff = levenshtein_distance(generated_answer, mention)
136 | 
137 |                 all_answers.append({"answer": {"id": id, "label": mention}, "score": diff})
138 | 
139 |         sorted_answers = sorted(all_answers, key = lambda j: j['score'])
140 |         ranked_answers = [
141 |             {"answer": answer["answer"], "score": answer["score"], "rank": i+1}
142 |             for i, answer in enumerate(sorted_answers)
143 |         ]
144 | 
145 |     # don't return all answers
146 |     max_answers = config["ha_max_answers"]
147 |     ranked_answers = ranked_answers[:max_answers]
148 |     if not ranked_answers:
149 |         ranked_answers = [{"answer": {"id": "None", "label": "None"}, "rank": 1, "score": 0.0}]
150 |     return ranked_answers
151 | 
152 | 
153 | def question_is_existential(question):
154 |     existential_keywords = [
155 |         "is",
156 |         "are",
157 |         "was",
158 |         "were",
159 |         "am",
160 |         "be",
161 |         "being",
162 |         "been",
163 |         "did",
164 |         "do",
165 |         "does",
166 |         "done",
167 |         "doing",
168 |         "has",
169 |         "have",
170 |         "had",
171 |         "having",
172 |     ]
173 |     lowercase_question = question.lower()
174 |     lowercase_question = lowercase_question.strip()
175 |     for keyword in existential_keywords:
176 |         if lowercase_question.startswith(keyword):
177 |             return True
178 |     return False
179 | 


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/evidence_retrieval_scoring.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | 
  4 | from pathlib import Path
  5 | from tqdm import tqdm
  6 | 
  7 | from convinse.library.utils import get_config, get_logger
  8 | from convinse.evaluation import answer_presence
  9 | 
 10 | 
 11 | class EvidenceRetrievalScoring:
 12 |     """Abstract class for ERs phase."""
 13 | 
 14 |     def __init__(self, config):
 15 |         """Initialize ERS module."""
 16 |         self.config = config
 17 |         self.logger = get_logger(__name__, config)
 18 | 
 19 |     def train(self, sources=None):
 20 |         """Method used in case no training required for ERS phase."""
 21 |         self.logger.info("Module used does not require training.")
 22 | 
 23 |     def inference(self, sources=None):
 24 |         """Run ERS on data and add retrieve top-e evidences for each source combination."""
 25 |         input_dir = self.config["path_to_annotated"]
 26 |         output_dir = self.config["path_to_intermediate_results"]
 27 | 
 28 |         qu = self.config["qu"]
 29 |         ers = self.config["ers"]
 30 | 
 31 |         # either use given option, or from config
 32 |         if not sources is None:
 33 |             source_combinations = [sources]
 34 |         else:
 35 |             source_combinations = self.config["source_combinations"]
 36 | 
 37 |         # go through all combinations
 38 |         for sources in source_combinations:
 39 |             sources_string = "_".join(sources)
 40 | 
 41 |             input_path = os.path.join(input_dir, qu, "train_qu.json")
 42 |             output_path = os.path.join(output_dir, qu, ers, sources_string, "train_ers.jsonl")
 43 |             self.inference_on_data_split(input_path, output_path, sources)
 44 | 
 45 |             input_path = os.path.join(input_dir, qu, "dev_qu.json")
 46 |             output_path = os.path.join(output_dir, qu, ers, sources_string, "dev_ers.jsonl")
 47 |             self.inference_on_data_split(input_path, output_path, sources)
 48 | 
 49 |             input_path = os.path.join(input_dir, qu, "test_qu.json")
 50 |             output_path = os.path.join(output_dir, qu, ers, sources_string, "test_ers.jsonl")
 51 |             self.inference_on_data_split(input_path, output_path, sources)
 52 | 
 53 |         # store results in cache (if applicable)
 54 |         self.store_cache()
 55 | 
 56 |     def inference_on_data_split(self, input_path, output_path, sources):
 57 |         """
 58 |         Run ERS on the dataset to predict
 59 |         answering evidences for each SR in the dataset.
 60 |         """
 61 |         # open data
 62 |         with open(input_path, "r") as fp:
 63 |             data = json.load(fp)
 64 |         self.logger.info(f"Input data loaded from: {input_path}.")
 65 | 
 66 |         # score
 67 |         answer_presences = list()
 68 |         source_to_ans_pres = {"kb": 0, "text": 0, "table": 0, "info": 0, "all": 0}
 69 | 
 70 |         # create folder if not exists
 71 |         output_dir = os.path.dirname(output_path)
 72 |         Path(output_dir).mkdir(parents=True, exist_ok=True)
 73 | 
 74 |         # process data
 75 |         with open(output_path, "w") as fp:
 76 |             for conversation in tqdm(data):
 77 |                 for turn in conversation["questions"]:
 78 |                     top_evidences = self.inference_on_turn(turn, sources)
 79 |                     turn["top_evidences"] = top_evidences
 80 | 
 81 |                     # answer presence
 82 |                     hit, answering_evidences = answer_presence(top_evidences, turn["answers"])
 83 |                     turn["answer_presence"] = hit
 84 |                     turn["answer_presence_per_src"] = {
 85 |                         evidence["source"]: 1 for evidence in answering_evidences
 86 |                     }
 87 | 
 88 |                 # write conversation to file
 89 |                 fp.write(json.dumps(conversation))
 90 |                 fp.write("\n")
 91 | 
 92 |                 # accumulate results
 93 |                 c_answer_presences = [turn["answer_presence"] for turn in conversation["questions"]]
 94 |                 answer_presences += c_answer_presences
 95 |                 for turn in conversation["questions"]:
 96 |                     answer_presence_per_src = turn["answer_presence_per_src"]
 97 |                     # add per source answer presence
 98 |                     for src, ans_presence in answer_presence_per_src.items():
 99 |                         source_to_ans_pres[src] += ans_presence
100 |                     # aggregate overall answer presence for validation
101 |                     if len(answer_presence_per_src.items()):
102 |                         source_to_ans_pres["all"] += 1
103 | 
104 |         # print results
105 |         res_path = output_path.replace(".jsonl", ".res")
106 |         with open(res_path, "w") as fp:
107 |             avg_answer_presence = sum(answer_presences) / len(answer_presences)
108 |             fp.write(f"Avg. answer presence: {avg_answer_presence}\n")
109 |             answer_presence_per_src = {
110 |                 src: (num / len(answer_presences)) for src, num in source_to_ans_pres.items()
111 |             }
112 |             fp.write(f"Answer presence per source: {answer_presence_per_src}")
113 | 
114 |         # log
115 |         self.logger.info(f"Done with processing: {input_path}.")
116 | 
117 |     def inference_on_data(self, input_data, sources=["kb", "text", "table", "info"]):
118 |         """Run ERS on given data."""
119 |         input_turns = [turn for conv in input_data for turn in conv["questions"]]
120 |         self.inference_on_turns(input_turns, sources)
121 |         return input_data
122 | 
123 |     def inference_on_turns(self, input_turns, sources=["kb", "text", "table", "info"]):
124 |         """Run ERS on given turns."""
125 |         for turn in input_turns:
126 |             top_evidences = self.inference_on_turn(turn, sources)
127 |             turn["top_evidences"] = top_evidences
128 | 
129 |             # answer presence
130 |             hit, answering_evidences = answer_presence(top_evidences, turn["answers"])
131 |             turn["answer_presence"] = hit
132 |             turn["answer_presence_per_src"] = {
133 |                 evidence["source"]: 1 for evidence in answering_evidences
134 |             }
135 |         return input_turns
136 | 
137 |     def inference_on_turn(self):
138 |         raise Exception(
139 |             "This is an abstract function which should be overwritten in a derived class!"
140 |         )
141 | 
142 |     def store_cache(self):
143 |         pass
144 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/structured_representation/structured_representation_module.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import torch
  4 | import logging
  5 | import random
  6 | 
  7 | from convinse.library.utils import get_config, get_logger
  8 | from convinse.question_understanding.question_understanding import QuestionUnderstanding
  9 | from convinse.question_understanding.structured_representation.structured_representation_model import (
 10 |     StructuredRepresentationModel,
 11 | )
 12 | import convinse.question_understanding.structured_representation.dataset_structured_representation as dataset
 13 | 
 14 | 
 15 | class StructuredRepresentationModule(QuestionUnderstanding):
 16 |     def __init__(self, config, use_gold_answers):
 17 |         """Initialize SR module."""
 18 |         self.config = config
 19 |         self.logger = get_logger(__name__, config)
 20 |         self.use_gold_answers = use_gold_answers
 21 |         
 22 |         # create model
 23 |         self.sr_model = StructuredRepresentationModel(config)
 24 |         self.model_loaded = False
 25 | 
 26 |         self.history_separator = config["history_separator"]
 27 |         self.sr_delimiter = config["sr_delimiter"]
 28 | 
 29 |     def train(self):
 30 |         """Train the model on silver SR data."""
 31 |         # train model
 32 |         self.logger.info(f"Starting training...")
 33 |         data_dir = self.config["path_to_annotated"]
 34 |         train_path = os.path.join(data_dir, "annotated_train.json")
 35 |         dev_path = os.path.join(data_dir, "annotated_dev.json")
 36 |         self.sr_model.train(train_path, dev_path)
 37 |         self.logger.info(f"Finished training.")
 38 | 
 39 |     def inference_on_conversation(self, conversation):
 40 |         """Run inference on a single conversation."""
 41 |         # load SR model (if required)
 42 |         self._load()
 43 | 
 44 |         with torch.no_grad():
 45 |             # SR model inference
 46 |             history_turns = list()
 47 |             for i, turn in enumerate(conversation["questions"]):
 48 |                 self.inference_on_turn(turn, history_turns)
 49 | 
 50 |                 # only append answer if there is a next question
 51 |                 if i + 1 < len(conversation["questions"]):
 52 |                     if self.use_gold_answers:
 53 |                         answer_text = ", ".join([answer["label"] for answer in turn["answers"]])
 54 |                     else:
 55 |                         # answer_text = ", ".join([answer["label"] for answer in turn["pred_answers"]])
 56 |                         answer_text = turn["pred_answers"][0]["label"]
 57 |                     history_turns.append(answer_text)
 58 |             return conversation
 59 | 
 60 |     def inference_on_turn(self, turn, history_turns):
 61 |         """Run inference on a single turn."""
 62 |         # load SR model (if required)
 63 |         self._load()
 64 | 
 65 |         with torch.no_grad():
 66 |             # SR model inference
 67 |             question = turn["question"]
 68 |             history_turns.append(question)
 69 | 
 70 |             # prepare input (omitt gold answer(s))
 71 |             rewrite_input = self.history_separator.join(history_turns)
 72 | 
 73 |             # run inference
 74 |             sr = self.sr_model.inference(rewrite_input)
 75 |             turn["structured_representation"] = sr
 76 |             return turn
 77 | 
 78 |     def _load(self):
 79 |         """Load the SR model."""
 80 |         # only load if not already done so
 81 |         if not self.model_loaded:
 82 |             self.sr_model.load()
 83 |             self.sr_model.set_eval_mode()
 84 |             self.model_loaded = True
 85 | 
 86 |     def adjust_sr_for_ablation(self, sr, ablation_type):
 87 |         """
 88 |         Adjust the given SR based on the specific ablation type.
 89 |         """
 90 |         slots = sr.split(self.sr_delimiter, 3)
 91 |         if len(slots) < 4 and not slots[0]:
 92 |             # type missing
 93 |             slots = slots + [""]
 94 |         elif len(slots) < 4:
 95 |             # context missing
 96 |             slots = [""] + slots
 97 |         if len(slots) < 4:
 98 |             # fix other (strange) cases
 99 |             slots = slots + (4 - len(slots)) * [""]
100 |         context, entity, pred, ans_type = slots
101 |         if ablation_type == "nocontext":
102 |             sr = f"{entity.strip()} {self.sr_delimiter} {pred.strip()} {self.sr_delimiter} {ans_type.strip()}"
103 |         elif ablation_type == "noentity":
104 |             sr = f"{context.strip()} {self.sr_delimiter} {pred.strip()} {self.sr_delimiter} {ans_type.strip()}"
105 |         elif ablation_type == "nopred":
106 |             sr = f"{context.strip()} {self.sr_delimiter} {entity.strip()} {self.sr_delimiter} {ans_type.strip()}"
107 |         elif ablation_type == "notype":
108 |             sr = f"{context.strip()} {self.sr_delimiter} {entity.strip()} {self.sr_delimiter} {pred.strip()}"
109 |         elif ablation_type == "nostructure":
110 |             slots = [context, entity, pred, ans_type]
111 |             random.shuffle(slots)
112 |             sr = f"{slots[0].strip()} {self.sr_delimiter} {slots[1].strip()} {self.sr_delimiter} {slots[2].strip()} {self.sr_delimiter} {slots[3].strip()}"
113 |         elif ablation_type == "full":
114 |             sr = f"{context.strip()} {self.sr_delimiter} {entity.strip()} {self.sr_delimiter} {pred.strip()} {self.sr_delimiter} {ans_type.strip()}"
115 |         else:
116 |             raise Exception(f"Unknown ablation type: {ablation_type}")
117 |         return sr
118 | 
119 | 
120 | #######################################################################################################################
121 | #######################################################################################################################
122 | if __name__ == "__main__":
123 |     if len(sys.argv) != 3:
124 |         raise Exception(
125 |             "Usage: python convinse/question_understanding/structured_representation/structured_representation_module.py --<FUNCTION> <PATH_TO_CONFIG>"
126 |         )
127 | 
128 |     function = sys.argv[1]
129 |     config_path = sys.argv[2]
130 |     config = get_config(config_path)
131 | 
132 |     # train: train model
133 |     if function == "--train":
134 |         srm = StructuredRepresentationModule(config, use_gold_answers=True)
135 |         srm.train()
136 | 
137 |     # inference: add predictions to data
138 |     elif function == "--inference":
139 |         # load config
140 |         srm = StructuredRepresentationModule(config, use_gold_answers=True)
141 |         srm.inference()
142 | 


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/README.md:
--------------------------------------------------------------------------------
 1 | # Evidence Retrieval and Scoring (ERS)
 2 | 
 3 | Module to retrieve relevant evidences from (heterogeneous) information sources.
 4 | 
 5 | - [Create your own ERS module](#create-your-own-ers-module)
 6 | 	- [`inference_on_turn` function](#inference_on_turn-function)
 7 | 	- [`store_cache` function](#optional-store_cache-function)
 8 | 	- [`train` function](#optional-train-function)
 9 | - [Available information sources](#available-information-sources)
10 | - [Wikidata access](#wikidata-access)
11 | - [Wikipedia access](#wikipedia-access)
12 | - [Evidences format](#evidences-format)
13 | 
14 | 
15 | ## Create your own ERS module
16 | You can inherit from the [`EvidenceRetrievalScoring`](evidence_retrieval_scoring.py) class and create your own ERS module.
17 | Implementing the function `inference_on_turn` is sufficient for the pipeline to run.
18 | In case you would like to store intermediate retrieval results, make sure to implement the `store_cache` function, which is called after the ERS module is run to store any data.
19 | Further, you need to instantiate a logger in the class, which will be used in the parent class.
20 | Alternatively, you can call the __init__ method of the parent class. 
21 | Please find further details below.
22 | 
23 | 
24 | ## `inference_on_turn` function
25 | 
26 | **Inputs**:
27 | - `turn`: turn that evidences are retrieved for. Will have the intent-explicit form of the current question in `turn["structured_representation"]`.
28 | - `sources`: list of input sources.
29 | 
30 | **Description**:  
31 | For the given intent-explicit representation of the question, retrieve relevant evidences from heterogeneous information sources. Depending on your individual module implemented, the initially retrieved evidences require to be scored to identify the top-*e* most relevant ones (*e* is defined by `evs_max_evidences` in the config).
32 | 
33 | **Output**:  
34 | Returns the top-*e* evidences. However, the current pipeline does not make use of the return value.
35 | Make sure to also store these evidences in `turn["top_evidences"]`. In your implementation, make sure that the config parameter `evs_max_evidences` controls the amount of evidences going into the HA part.
36 | 
37 | 
38 | ## [Optional] `store_cache` function
39 | 
40 | **Inputs**: NONE
41 | 
42 | **Description**:  
43 | Whatever intermediate retrieval results you obtain in your implementation of the class, you can store these on disk to re-use them in a future run (e.g. for efficiency or reproducability). The default implementation (in [`EvidenceRetrievalScoring`](evidence_retrieval_scoring.py)) does not do anything. If you do not require storing any data, you can simply skip this function.
44 | 
45 | **Output**: NONE
46 | 
47 | ## [Optional] `train` function
48 | 
49 | **Inputs**: NONE
50 | 
51 | **Description**:  
52 | If required, you can train your ERS module here. You can make use of whatever parameters are stored in your .yml file.
53 | 
54 | **Output**: NONE
55 | 
56 | ## Available information sources
57 | The following information sources are implemented in the native CONVINSE pipeline:
58 | - `"kb"`: KB-facts from Wikidata,
59 | - `"text"`: text-snippets (sentence-level) from Wikipedia,
60 | - `"table"`: table-records (row-level) from Wikipedia,
61 | - `"info"`: infobox-entries (attribute-value-pairs) from Wikipedia.
62 | 
63 | For specifying specific combinations of information sources (e.g. for retrieval, training,...), you can either adjust the respective parameters in the config, or provide them as argument to the bash script. E.g. giving the option "kb_text_info" specifies that "kb", "text" and "info" should be set.
64 | 
65 | ## Wikidata access
66 | For accessing Wikidata, you can make use of the [`ClocqRetriever`](clocq_er.py) class.
67 | You can:
68 | 1) retrieve relevant KB-facts for a given input snippet, using [CLOCQ](https://clocq.mpi-inf.mpg.de)'s search space retrieval functionality via the `retrieve_evidences` function, specifying the desired [input information sources](#available-information-sources) in a list,
69 | 2) retrieve relevant KB-facts for a given input snippet, using [CLOCQ](https://clocq.mpi-inf.mpg.de)'s search space retrieval functionality via the `retrieve_KB_facts` function, which will only return evidences from the KB, or 
70 | 4) retrieve KB-facts for a given Wikidata item ID via the `retrieve_kb_facts_for_item` function.
71 | 
72 | The CLOCQ parameters in the config will be used as input for the CLOCQ functions.
73 | For quickly getting started, you can make use of the publicly available [CLOCQ API](https://clocq.mpi-inf.mpg.de), which is the default setup.
74 | For more efficient access, you can run the CLOCQ algorithm on your local machine. Note, that this comes with quite some memory requirements of \~400 GB.
75 | 
76 | ## Wikipedia access
77 | For accessing Wikipedia text, tables and infoboxes, you can use the [`ClocqRetriever`](clocq_er.py) class, or directly use the [`WikipediaRetriever`](wikipedia_retriever/wikipedia_retriever.py) package.
78 | You can:
79 | 1) retrieve facts from Wikipedia for a given input snippet, using [CLOCQ](https://clocq.mpi-inf.mpg.de)'s search space retrieval functionality via the `retrieve_evidences` function, specifying the desired [input information sources](#available-information-sources) in a list,
80 | 2) retrieve facts from Wikipedia for a given Wikidata item ID, using the `retrieve_wikipedia_evidences` function in the [`ClocqRetriever`](clocq_er.py) class,
81 | 3) retrieve facts from Wikipedia for a given Wikidata item ID, using the `retrieve_wp_evidences` function in the [`WikipediaRetriever`](wikipedia_retriever/wikipedia_retriever.py) package. You can adjust this function as required. Make sure to include the `retrieved_for_entity` key to the resulting evidences (not taken care of in this function).
82 | 
83 | Either way, the pipeline would try to read evidences from the cache, or the Wikipedia dump (specified with the `ers_wikipedia_dump` keyword in the config).
84 | The parameter `ers_on_the_fly` controls, whether the Wikipedia API is called on-the-fly to retrieve evidences for entities that are not included in the specified Wikipedia dump. If `ers_on_the_fly=False`, an empty list of evidences will be returned in case an entity is not included.
85 | 
86 | ## Evidences format
87 | Evidences are stored and processed in the following format. If you plan your own implementation of the ERS module, make sure that you match this format.
88 | 
89 | ``` json
90 | {
91 | 	"evidence_text": "<TEXT>",
92 | 	"source": "kb|text|table|infobox",
93 | 	"disambiguations": [["<SURFACE_FORM_IN_EVIDENCE>", "ITEM_ID>"], ],
94 | 	"wikidata_entities": [{"id": "<ITEM_ID>", "label": "<LABEL>"}, ],
95 | 	"retrieved_for_entity": {"id": "<ITEM_ID>", "label": "<LABEL>"}
96 | }
97 | ```
98 | 
99 | 


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/wikipedia_retriever/infobox_parser.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Infobox parser for Wikipedia tables.
  3 | Inspired by https://github.com/roskoff/HTMLTableParser/blob/master/HTMLTableParser.py.
  4 | """
  5 | from html.parser import HTMLParser
  6 | 
  7 | import convinse.library.wikipedia_library as wiki
  8 | 
  9 | CELL_SEPARATOR = ", "
 10 | COMPONENT_SEPARATOR = ", "
 11 | 
 12 | 
 13 | def infobox_to_evidences(parsed_infobox, wiki_title):
 14 |     """
 15 |     Extracts the evidences from the given parsed infobox.
 16 |     The parsed infobox is output of the InfoboxParser class
 17 |     below.
 18 |     """
 19 |     # initialize
 20 |     wiki_path = wiki._wiki_title_to_path(wiki_title)
 21 |     evidences = list()
 22 |     last_header_row = ""
 23 | 
 24 |     # iterate through rows
 25 |     for row in parsed_infobox:
 26 |         if len(row) == 1 and row[0]["cell_type"] == "header":
 27 |             # remember header
 28 |             last_header_row = row[0]["text"]
 29 |         else:
 30 |             # create evidence text
 31 |             row_text = CELL_SEPARATOR.join([cell["text"] for cell in row])
 32 |             evidence_components = [wiki_title, last_header_row, row_text]
 33 |             evidence_text = COMPONENT_SEPARATOR.join(evidence_components)
 34 | 
 35 |             # wikipedia entities (=paths)
 36 |             wiki_paths = [entity for cell in row for entity in cell["entities"]]
 37 | 
 38 |             # dict from string to wiki_path
 39 |             disambiguations = [
 40 |                 (string, wiki_path)
 41 |                 for cell in row
 42 |                 for string, wiki_path in (cell["anchor_dict"]).items()
 43 |             ]
 44 | 
 45 |             ## do not consider wiki_paths with hashtags
 46 |             # hashtag indicates a paragraph on entity, rather than entity
 47 |             wiki_paths = [wiki_path for wiki_path in wiki_paths if not "#" in wiki_path]
 48 | 
 49 |             # add current Wikipedia page entity
 50 |             if not wiki_path in wiki_paths:
 51 |                 wiki_paths.append(wiki_path)
 52 |                 wiki_title = wiki._wiki_path_to_title(wiki_path)
 53 |                 disambiguations.append((wiki_title, wiki_path))
 54 | 
 55 |             # create evidence
 56 |             evidence = {
 57 |                 "evidence_text": evidence_text,
 58 |                 "wikipedia_paths": wiki_paths,
 59 |                 "wp_disambiguations": disambiguations,
 60 |                 "source": "info",
 61 |             }
 62 |             evidences.append(evidence)
 63 |     return evidences
 64 | 
 65 | 
 66 | class InfoboxParser(HTMLParser):
 67 |     """
 68 |     This class serves as a html infobox parser.
 69 |     Assumes the following table format:
 70 |             header_row1
 71 |             attribute_name | attribute_value
 72 |             ...
 73 |             header_row2
 74 |             attribute_name | attribute_value
 75 |     """
 76 | 
 77 |     def __init__(
 78 |         self,
 79 |         anchor_dict,
 80 |         decode_html_entities=False,
 81 |         data_separator=" ",
 82 |     ):
 83 |         HTMLParser.__init__(self, convert_charrefs=decode_html_entities)
 84 |         self._data_separator = data_separator
 85 |         self._in_td = False
 86 |         self._in_th = False
 87 |         self._current_table = []
 88 |         self._current_row = []
 89 |         self._current_cell = {"entities": [], "text": [], "anchor_dict": dict()}
 90 | 
 91 |         self._after_href = False
 92 |         self._last_href = None
 93 | 
 94 |         # initialize tables object
 95 |         self.tables = []
 96 | 
 97 |         # multiple mentions of same entity are not all tagged with URL
 98 |         # -> remember such entities (phrase->entity entries)
 99 |         self.anchor_dict = anchor_dict
100 | 
101 |     def get_anchor_dict(self):
102 |         return self.anchor_dict
103 | 
104 |     def handle_starttag(self, tag, attrs):
105 |         """
106 |         We need to remember the opening point for the content of interest.
107 |         The other tags (<table>, <tr>) are only handled at the closing point.
108 |         """
109 |         if tag == "td":
110 |             self._in_td = True
111 |         if tag == "th":
112 |             self._in_th = True
113 | 
114 |         for name, value in attrs:
115 |             if name == "href" and wiki.is_wikipedia_path(value):
116 |                 wiki_path = wiki.format_wiki_path(value)
117 |                 if wiki.is_wikipedia_path(wiki_path):
118 |                     self._current_cell["entities"].append(wiki_path)
119 |                 self._after_href = True
120 |                 self._last_href = wiki_path
121 | 
122 |     def handle_data(self, data):
123 |         """This is where we save content to a cell."""
124 |         if self._in_td or self._in_th:
125 |             self._current_cell["text"].append(data)
126 |         if self._after_href:
127 |             # store data->entity in dicts
128 |             self.anchor_dict[data] = self._last_href
129 |             if wiki.is_wikipedia_path(self._last_href):
130 |                 self._current_cell["anchor_dict"][data] = self._last_href
131 |             # delete flag
132 |             self._after_href = False
133 | 
134 |     def handle_endtag(self, tag):
135 |         """
136 |         Here we exit the tags. If the closing tag is </tr>, we know that we
137 |         can save our currently parsed cells to the current table as a row and
138 |         prepare for a new row. If the closing tag is </table>, we save the
139 |         current table and prepare for a new one.
140 |         """
141 |         if tag == "td":
142 |             self._in_td = False
143 |         elif tag == "th":
144 |             self._in_th = False
145 | 
146 |         if tag in ["td", "th"]:
147 |             cell_text = self._data_separator.join(self._current_cell["text"]).strip()
148 |             cell_text = cell_text.replace("\n", ", ")
149 |             cell_text = cell_text.replace("\xa0", " ")
150 |             cell_entities = self._current_cell["entities"]
151 |             cell_anchor_dict = self._current_cell["anchor_dict"]
152 |             if not cell_entities:
153 |                 for anchor in self.anchor_dict:
154 |                     if anchor in cell_text:
155 |                         anchor_entity = self.anchor_dict[anchor]
156 |                         cell_entities.append(anchor_entity)
157 |                         cell_anchor_dict[anchor] = anchor_entity
158 |             cell_type = "data" if tag == "td" else "header"
159 |             final_cell = {
160 |                 "entities": cell_entities,
161 |                 "text": cell_text,
162 |                 "anchor_dict": cell_anchor_dict,
163 |                 "cell_type": cell_type,
164 |             }
165 |             self._current_row.append(final_cell)
166 |             self._current_cell = {"entities": [], "text": [], "anchor_dict": dict()}
167 |         elif tag == "tr":
168 |             self._current_table.append(self._current_row)
169 |             self._current_row = []
170 |         elif tag == "table":
171 |             self.tables.append(self._current_table)
172 |             self._current_table = []
173 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/structured_representation/structured_representation_model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import transformers
  4 | 
  5 | from pathlib import Path
  6 | import convinse.question_understanding.structured_representation.dataset_structured_representation as dataset
  7 | 
  8 | 
  9 | class StructuredRepresentationModel(torch.nn.Module):
 10 |     def __init__(self, config):
 11 |         super(StructuredRepresentationModel, self).__init__()
 12 |         self.config = config
 13 |         self.sr_delimiter = self.config["sr_delimiter"]
 14 |         
 15 |         # select model architecture
 16 |         if config["sr_architecture"] == "BART":
 17 |             self.model = transformers.BartForConditionalGeneration.from_pretrained(
 18 |                 "facebook/bart-base"
 19 |             )
 20 |             self.tokenizer = transformers.BartTokenizerFast.from_pretrained("facebook/bart-base")
 21 |         elif config["sr_architecture"] == "T5":
 22 |             self.model = transformers.T5ForConditionalGeneration.from_pretrained("t5-base")
 23 |             self.tokenizer = transformers.T5TokenizerFast.from_pretrained("t5-base")
 24 |         else:
 25 |             raise Exception(
 26 |                 "Unknown architecture for SR module specified in config: currently, only T5-base (=T5) and BART-base (=BART) are supported."
 27 |             )
 28 | 
 29 |     def set_eval_mode(self):
 30 |         """Set model to eval mode."""
 31 |         self.model.eval()
 32 | 
 33 |     def save(self):
 34 |         """Save model."""
 35 |         model_path = self.config["sr_model_path"]
 36 |         # create dir if not exists
 37 |         model_dir = os.path.dirname(model_path)
 38 |         Path(model_dir).mkdir(parents=True, exist_ok=True)
 39 |         torch.save(self.model.state_dict(), model_path)
 40 | 
 41 |     def load(self):
 42 |         """Load model."""
 43 |         if torch.cuda.is_available():
 44 |             state_dict = torch.load(self.config["sr_model_path"])
 45 |         else:
 46 |             state_dict = torch.load(self.config["sr_model_path"], torch.device('cpu'))
 47 |         self.model.load_state_dict(state_dict)
 48 |         if torch.cuda.is_available():
 49 |             self.model = self.model.cuda()
 50 | 
 51 |     def train(self, train_path, dev_path):
 52 |         """Train model."""
 53 |         # load datasets
 54 |         train_dataset = dataset.DatasetStructuredRepresentation(
 55 |             self.config, self.tokenizer, train_path
 56 |         )
 57 |         dev_dataset = dataset.DatasetStructuredRepresentation(self.config, self.tokenizer, dev_path)
 58 |         # arguments for training
 59 |         training_args = transformers.Seq2SeqTrainingArguments(
 60 |             output_dir="convinse/question_understanding/structured_representation/results",  # output directory
 61 |             num_train_epochs=self.config["sr_num_train_epochs"],  # total number of training epochs
 62 |             per_device_train_batch_size=self.config[
 63 |                 "sr_per_device_train_batch_size"
 64 |             ],  # batch size per device during training
 65 |             per_device_eval_batch_size=self.config[
 66 |                 "sr_per_device_eval_batch_size"
 67 |             ],  # batch size for evaluation
 68 |             warmup_steps=self.config[
 69 |                 "sr_warmup_steps"
 70 |             ],  # number of warmup steps for learning rate scheduler
 71 |             weight_decay=self.config["sr_weight_decay"],  # strength of weight decay
 72 |             logging_dir="convinse/question_understanding/structured_representation/logs",  # directory for storing logs
 73 |             logging_steps=1000,
 74 |             evaluation_strategy="epoch",
 75 |             save_strategy="epoch",
 76 |             load_best_model_at_end="True"
 77 |             # predict_with_generate=True
 78 |         )
 79 |         # create the object for training
 80 |         trainer = transformers.Seq2SeqTrainer(
 81 |             model=self.model,
 82 |             args=training_args,
 83 |             train_dataset=train_dataset,
 84 |             eval_dataset=dev_dataset,
 85 |         )
 86 |         # training progress
 87 |         trainer.train()
 88 |         # store model
 89 |         self.save()
 90 | 
 91 |     def inference(self, input):
 92 |         """Run the model on the given input."""
 93 |         # encode
 94 |         input_encodings = self.tokenizer(
 95 |             input,
 96 |             padding=True,
 97 |             truncation=True,
 98 |             max_length=self.config["sr_max_input_length"],
 99 |             return_tensors="pt",
100 |         )
101 |         if torch.cuda.is_available():
102 |             input_encodings = input_encodings.to(torch.device("cuda"))
103 |         # generate
104 |         output = self.model.generate(
105 |             input_ids=input_encodings["input_ids"],
106 |             attention_mask=input_encodings["attention_mask"],
107 |             no_repeat_ngram_size=self.config["sr_no_repeat_ngram_size"],
108 |             num_beams=self.config["sr_num_beams"],
109 |             early_stopping=self.config["sr_early_stopping"],
110 |         )
111 |         # decoding
112 |         sr = self.tokenizer.batch_decode(
113 |             output,
114 |             skip_special_tokens=True,
115 |             clean_up_tokenization_spaces=True,
116 |         )[0]
117 | 
118 |         # format SR properly
119 |         sr = self._format_sr(sr)
120 |         return sr
121 | 
122 |     def _format_sr(self, sr):
123 |         """Make sure the SR has 3 delimiters."""
124 |         slots = sr.split(self.sr_delimiter.strip(), 3)
125 |         if len(slots) < 4 and not slots[0]:
126 |             # type missing
127 |             slots = slots + [""]
128 |         elif len(slots) < 4:
129 |             # topic missing
130 |             slots = [""] + slots
131 |         if len(slots) < 4:
132 |             # in case there are still less than 4 slots
133 |             slots = slots + (4 - len(slots)) * [""]
134 |         sr = self.sr_delimiter.join(slots)
135 |         return sr
136 | 
137 |     def inference_on_batch(self, inputs):
138 |         """Run the model on the given inputs (batch)."""
139 |         # encode inputs
140 |         input_encodings = self.tokenizer(
141 |             inputs,
142 |             padding=True,
143 |             truncation=True,
144 |             max_length=self.config["sr_max_input_length"],
145 |             return_tensors="pt",
146 |         )
147 |         # generation
148 |         summary_ids = self.model.generate(
149 |             input_ids=input_encodings["input_ids"],
150 |             attention_mask=input_encodings["attention_mask"],
151 |             no_repeat_ngram_size=self.config["sr_no_repeat_ngram_size"],
152 |             num_beams=self.config["sr_num_beams"],
153 |             early_stopping=self.config["sr_early_stopping"],
154 |         )
155 |         # decoding
156 |         output = [
157 |             self.tokenizer.decode(
158 |                 g,
159 |                 skip_special_tokens=True,
160 |                 clean_up_tokenization_spaces=False,
161 |             )
162 |             for g in summary_ids
163 |         ]
164 |         return output
165 | 


--------------------------------------------------------------------------------
/convinse/distant_supervision/turn_relevance_annotator.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import random
  3 | 
  4 | from convinse.library.utils import get_logger
  5 | 
  6 | class TurnRelevanceAnnotator:
  7 |     def __init__(self, config):
  8 |         self.config = config
  9 |         self.logger = get_logger(__name__, config)
 10 | 
 11 |     def annotate_turn_relevances(self, flow_graph, conversation):
 12 |         """
 13 |         Annotate turn relevances for the conversation from the graph.
 14 |         This method also extracts a dataset to train the turn relevance
 15 |         module in the following form:
 16 |         [relevance, (turn1, question1, answer_str1), (turn2, question2, answer_str2)]
 17 |         """
 18 |         if not flow_graph:
 19 |             return []
 20 |         questions = dict()
 21 |         answers = dict()
 22 |         explored_turns = set()
 23 |         # extract positive examples
 24 |         positive_examples = list()
 25 |         leafs = flow_graph["leafs"]
 26 |         while leafs:
 27 |             for node in leafs:
 28 |                 turn_id = node["turn"]
 29 |                 node_type = node["type"]
 30 |                 node_str = str(turn_id) + node_type
 31 |                 # skip if node already seen
 32 |                 if node_str in explored_turns:
 33 |                     continue
 34 |                 # remember explored turn
 35 |                 explored_turns.add(node_str)
 36 |                 if node["type"] == "answer":
 37 |                     answer = [
 38 |                         {"id": id_, "label": labels[0]}
 39 |                         for (id_, labels, surface_form) in node["relevant_disambiguations"]
 40 |                     ]
 41 |                     answer_turn = node["turn"]
 42 |                     answers[answer_turn] = answer
 43 |                 elif node["type"] == "question":
 44 |                     question = node["question"]
 45 |                     question_turn = node["turn"]
 46 |                     questions[question_turn] = question
 47 |                     self._initialize_turn_relevance(
 48 |                         question_turn, conversation
 49 |                     )  # remember that node was seen
 50 |                     parent_questions = self._get_parent_nodes(node)
 51 |                     for (parent_question_turn, parent_question) in parent_questions:
 52 |                         positive_examples.append(
 53 |                             [
 54 |                                 1,
 55 |                                 (parent_question_turn, parent_question),
 56 |                                 (question_turn, question),
 57 |                             ]
 58 |                         )
 59 |                         self._add_relevant_turn(question_turn, parent_question_turn, conversation)
 60 |             leafs = [node for leaf in leafs for node in leaf["parents"]]
 61 | 
 62 |         # extract transitive turn_relevances from the positive examples
 63 |         turn_relevances = {turn: list() for turn in range(10)}
 64 |         if self.config["tr_transitive_relevances"]:
 65 |             for example in positive_examples:
 66 |                 _, question1, question2 = example
 67 |                 turn1, _ = question1
 68 |                 turn2, _ = question2
 69 |                 turn_relevances[turn2].append(turn1)
 70 |             turn_relevances = self._add_transitive_turn_relevances(turn_relevances, conversation)
 71 | 
 72 |         # extract negative examples
 73 |         negative_examples = list()
 74 |         for turn1 in questions:
 75 |             for turn2 in questions:
 76 |                 if turn1 == turn2:
 77 |                     continue
 78 |                 if not turn2 in turn_relevances[turn1]:
 79 |                     turn1_question = questions[turn1]
 80 |                     turn2_question = questions[turn2]
 81 |                     negative_examples.append([0, (turn2, turn2_question), (turn1, turn1_question)])
 82 | 
 83 |         # augment data with answers
 84 |         instances = positive_examples + negative_examples
 85 |         tr_data = list()
 86 |         for instance in instances:
 87 |             label, (turn1, question1), (turn2, question2) = instance
 88 |             tr_data.append(
 89 |                 {
 90 |                     "relevance": label,
 91 |                     "history_turn": {
 92 |                         "turn": turn1,
 93 |                         "question": question1,
 94 |                         "answers": answers[turn1],
 95 |                     },
 96 |                     "current_turn": {
 97 |                         "turn": turn2,
 98 |                         "question": question2,
 99 |                         "answers": answers[turn2],
100 |                     },
101 |                 }
102 |             )
103 | 
104 |         # return data for turn relevance (required for training turn r module)
105 |         return tr_data
106 | 
107 |     def _add_relevant_turn(self, current_turn, relevant_turn, conversation):
108 |         """
109 |         Add relevant_turn as relevant for the current turn.
110 |         """
111 |         if not relevant_turn in conversation["questions"][current_turn]["silver_relevant_turns"]:
112 |             conversation["questions"][current_turn]["silver_relevant_turns"].append(relevant_turn)
113 | 
114 |     def _initialize_turn_relevance(self, current_turn, conversation):
115 |         """
116 |         Remember that node was found in graph.
117 |         Aims to distinguish between turns for which no information was found
118 |         due to their answer type (e.g. existentials; relevant turns: None),
119 |         and turns which are found to be self-sufficient (relevant turns: empty list).
120 |         """
121 |         if conversation["questions"][current_turn]["silver_relevant_turns"] is None:
122 |             conversation["questions"][current_turn]["silver_relevant_turns"] = list()
123 | 
124 |     def _get_parent_questions(self, node):
125 |         """
126 |         NOT IN USE: Extract parent question of the given node.
127 |         Was used in earlier version!
128 |         """
129 |         if not node["parents"]:
130 |             return list()
131 |         parent_nodes = node["parents"]
132 |         parent_nodes_copy = list()
133 |         for parent_node in parent_nodes:
134 |             if not parent_node["type"] == "question":
135 |                 new_parent_nodes = [
136 |                     node for node in parent_node["parents"] if parent_node["type"] == "question"
137 |                 ]
138 |                 parent_nodes_copy += new_parent_nodes
139 |             else:
140 |                 parent_nodes_copy.append(parent_node)
141 |         parent_nodes = parent_nodes_copy
142 |         parent_questions = [
143 |             (parent_node["turn"], parent_node["question"]) for parent_node in parent_nodes
144 |         ]
145 |         # return question and turn
146 |         return parent_questions
147 | 
148 |     def _get_parent_nodes(self, node):
149 |         """
150 |         Extract parent nodes of the given node.
151 |         """
152 |         if not node["parents"]:
153 |             return list()
154 |         parent_nodes = node["parents"]
155 |         parent_nodes = [
156 |             (parent_node["turn"], parent_node["question"]) for parent_node in parent_nodes
157 |         ]
158 |         return parent_nodes
159 | 
160 |     def _add_transitive_turn_relevances(self, turn_relevances, conversation):
161 |         """
162 |         Add the transitive turn_relevances from the single hop turn_relevances.
163 |         """
164 |         has_changed = True
165 |         # iterate until nothing has changed in a loop
166 |         while has_changed:
167 |             has_changed = False
168 |             new_turn_relevances = turn_relevances.copy()
169 |             for child in turn_relevances:
170 |                 for parent in turn_relevances[child]:
171 |                     for grandparent in turn_relevances[parent]:
172 |                         if not grandparent in turn_relevances[child]:
173 |                             new_turn_relevances[child].append(grandparent)
174 |                             self._add_relevant_turn(child, grandparent, conversation)
175 |                             has_changed = True
176 |             turn_relevances = new_turn_relevances
177 |         return turn_relevances
178 | 
179 |     def _answers_to_string(self, answers):
180 |         """
181 |         Transform the answer list string into text.
182 |         """
183 |         answers = answers.replace("[", "").replace("]", "").replace("'", "")
184 |         return answers
185 | 


--------------------------------------------------------------------------------
/convinse/question_understanding/question_resolution/question_resolution_utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import json
  4 | 
  5 | from nltk.tokenize import word_tokenize
  6 | from pathlib import Path
  7 | 
  8 | from convinse.library.utils import get_config
  9 | from convinse.question_understanding.structured_representation.dataset_structured_representation import (
 10 |     output_to_text,
 11 | )
 12 | 
 13 | 
 14 | def postprocess_data(input_data, quretec_output_path):
 15 |     """Postprocess predictions by Question resolution model."""
 16 |     # open outputs
 17 |     with open(quretec_output_path, "r") as fp:
 18 |         quretec_pred = json.load(fp)
 19 | 
 20 |     # process predictions
 21 |     counter = 0
 22 |     for conversation in input_data:
 23 |         history = list()
 24 |         for turn in conversation["questions"]:
 25 |             # load input
 26 |             question = turn["question"]
 27 |             if turn["turn"] == 0:
 28 |                 turn["structured_representation"] = question
 29 |                 continue
 30 |             history_words = quretec_pred["x_input"][counter]
 31 |             predictions = quretec_pred["y_pred"][counter]
 32 |             # create completed
 33 |             qres = _process_prediction(question, history_words, predictions)
 34 |             # create instance
 35 |             turn["structured_representation"] = qres
 36 |             # next instance
 37 |             counter += 1
 38 |     return input_data
 39 | 
 40 | 
 41 | def postprocess_turn(turn, quretec_output_path):
 42 |     """Postprocess predictions by Question resolution model."""
 43 |     # open outputs
 44 |     with open(quretec_output_path, "r") as fp:
 45 |         quretec_pred = json.load(fp)
 46 | 
 47 |     # process predictions
 48 |     history_words = quretec_pred["x_input"][0]
 49 |     predictions = quretec_pred["y_pred"][0]
 50 |     
 51 |     # create completed
 52 |     question = turn["question"]
 53 |     qres = _process_prediction(question, history_words, predictions)
 54 |     # create instance
 55 |     turn["structured_representation"] = qres
 56 |     return turn
 57 | 
 58 | 
 59 | def _process_prediction(question, history_words, predictions):
 60 |     """Construct completed question from QuReTeC predictions."""
 61 |     # set of question words
 62 |     question_words_set = set(word_tokenize(question))
 63 | 
 64 |     # add words that were found relevant
 65 |     for i, pred in enumerate(predictions):
 66 |         if pred == "REL":  # word found relevant
 67 |             word = history_words[i]
 68 |             # skip if word already in question
 69 |             if word in question_words_set:
 70 |                 continue
 71 | 
 72 |             # add word to question
 73 |             question += " " + word
 74 |             question_words_set.add(word)
 75 |     # return completed question
 76 |     return question
 77 | 
 78 | 
 79 | def prepare_data_for_inference(config, data, output_path, use_gold_answers=False):
 80 |     """Prepare data in the given split."""
 81 |     dataset_for_inference = list()
 82 | 
 83 |     question_id = 0
 84 | 
 85 |     for conversation in data:
 86 |         history = list()
 87 |         for i, turn in enumerate(conversation["questions"]):
 88 |             # append question to history
 89 |             question = turn["question"]
 90 |             history.append(question)
 91 | 
 92 |             # first turn does not have a rewrite
 93 |             if turn["turn"] == 0:
 94 |                 continue
 95 | 
 96 |             turn_instance = _prepare_turn_for_inference(config, turn, history, question_id, use_gold_answers)
 97 |             dataset_for_inference.append(turn_instance)
 98 |             question_id += 1
 99 | 
100 |             # append answer to history (if there is a next turn)
101 |             if i + 1 < len(conversation["questions"]):
102 |                 if use_gold_answers:
103 |                     answer_text = ", ".join([answer["label"] for answer in turn["answers"]])
104 |                 else:
105 |                     answer_text = turn["pred_answers"][0]["label"]
106 |                 # append to history
107 |                 history.append(answer_text)
108 | 
109 |     # store in file
110 |     with open(output_path, "w") as fp:
111 |         json.dump(dataset_for_inference, fp)
112 | 
113 | 
114 | def _prepare_turn_for_inference(config, turn, history, question_id, use_gold_answers=False):
115 |     """Prepare a turn for inference and return result."""
116 |     # append question to history
117 |     question = turn["question"]
118 |     qres_input_separator = config["qres_input_separator"]
119 | 
120 |     # prepare input
121 |     history_text = " ".join(history)
122 |     tokenized_history = word_tokenize(history_text)
123 |     tokenized_question = word_tokenize(question)
124 | 
125 |     # create instance for distant supervision
126 |     sr = turn.get("silver_SR")
127 |     if sr:
128 |         sr_text = output_to_text(sr, " ")
129 |         tokenized_sr = word_tokenize(sr_text)
130 |         bert_ner_overlap = _create_bert_ner_overlap(
131 |             tokenized_history, tokenized_question, tokenized_sr, qres_input_separator
132 |         )
133 | 
134 |         instance_distant_supervision = {
135 |             "prev_questions": history_text,
136 |             "cur_question": question,
137 |             "answer_text": sr_text,
138 |             "bert_ner_overlap": bert_ner_overlap,
139 |             "id": str(question_id),
140 |         }
141 |         return instance_distant_supervision
142 |     else:
143 |         tokenized_input = tokenized_history + [qres_input_separator] + tokenized_question
144 |         ner_info = (
145 |             len(tokenized_history) * ["O"]
146 |             + [qres_input_separator]
147 |             + (len(tokenized_question) * ["O"])
148 |         )
149 |         bert_ner_overlap = [tokenized_input, ner_info]
150 | 
151 |         instance_distant_supervision = {
152 |             "prev_questions": history_text,
153 |             "cur_question": question,
154 |             "answer_text": question,
155 |             "bert_ner_overlap": bert_ner_overlap,
156 |             "id": str(question_id),
157 |         }
158 |         return instance_distant_supervision
159 | 
160 | 
161 | def prepare_turn_for_inference(config, turn, history_turns, output_path, use_gold_answers=False):
162 |     """Prepare a turn for inference and store in file."""
163 |     # process history
164 |     history = list()
165 |     for turn in history_turns:
166 |         # append question
167 |         question = turn["question"]
168 |         history.append(question)
169 | 
170 |         # append answer
171 |         if use_gold_answers:
172 |             answer_text = ", ".join([answer["label"] for answer in turn["answers"]])
173 |         else:
174 |             answer_text = turn["pred_answers"][0]["label"]
175 |         history.append(answer_text)
176 | 
177 |     # prepare turn
178 |     turn_instance = _prepare_turn_for_inference(config, turn, history, 0, use_gold_answers)
179 | 
180 |     # store in file
181 |     output_dir = os.path.dirname(output_path)
182 |     Path(output_dir).mkdir(parents=True, exist_ok=True)
183 |     with open(output_path, "w") as fp:
184 |         json.dump([turn_instance], fp)
185 | 
186 | 
187 | def prepare_data_for_training(config, train_path, dev_path):
188 |     """Prepare data in the given split."""
189 |     input_dir = config["path_to_annotated"]
190 |     benchmark = config["benchmark"]
191 |     intermediate_res = config["path_to_intermediate_results"]
192 |     output_dir = os.path.join(intermediate_res, "qres", "data")
193 |     Path(output_dir).mkdir(parents=True, exist_ok=True)
194 | 
195 |     output_for_train = os.path.join(output_dir, "train.json")
196 |     _prepare_data_split_for_training(config, train_path, output_for_train)
197 | 
198 |     output_for_train = os.path.join(output_dir, "dev.json")
199 |     _prepare_data_split_for_training(config, dev_path, output_for_train)
200 | 
201 | 
202 | def _prepare_data_split_for_training(config, input_path, output_path):
203 |     """Prepare data in the given split."""
204 |     with open(input_path, "r") as fp:
205 |         data = json.load(fp)
206 | 
207 |     dataset_for_train = list()
208 |     qres_input_separator = config["qres_input_separator"]
209 | 
210 |     question_id = 0
211 | 
212 |     for conversation in data:
213 |         history = list()
214 |         for turn in conversation["questions"]:
215 |             question = turn["question"]
216 |             answer_text = " ".join([answer["label"] for answer in turn["answers"]])
217 | 
218 |             # append to history
219 |             history.append(f"{question} {qres_input_separator} {answer_text}")
220 |             # first turn does not have a rewrite
221 |             if turn["turn"] == 0:
222 |                 continue
223 | 
224 |             # prepare input
225 |             history_text = " ".join(history[:-1])
226 |             tokenized_history = word_tokenize(history_text)
227 |             tokenized_question = word_tokenize(question)
228 | 
229 |             # create instance for distant supervision
230 |             sr = turn.get("silver_SR")
231 |             if sr:
232 |                 sr_text = output_to_text(sr, " ")
233 |                 tokenized_sr = word_tokenize(sr_text)
234 |                 bert_ner_overlap = _create_bert_ner_overlap(
235 |                     tokenized_history, tokenized_question, tokenized_sr, qres_input_separator
236 |                 )
237 | 
238 |                 instance_distant_supervision = {
239 |                     "prev_questions": history_text,
240 |                     "cur_question": question,
241 |                     "answer_text": sr_text,
242 |                     "bert_ner_overlap": bert_ner_overlap,
243 |                     "id": str(question_id),
244 |                 }
245 |                 dataset_for_train.append(instance_distant_supervision)
246 | 
247 |     # store in files
248 |     with open(output_path, "w") as fp:
249 |         json.dump(dataset_for_train, fp)
250 | 
251 | 
252 | def _create_bert_ner_overlap(
253 |     tokenized_history, tokenized_question, tokenized_gold, qres_input_separator
254 | ):
255 |     """Create the required 'bert_ner_overlap' for the QuReTeC method."""
256 |     gold_tokens = set(tokenized_gold)
257 |     tokenized_input = tokenized_history + [qres_input_separator] + tokenized_question
258 |     ner_info = list()
259 |     for token in tokenized_history:
260 |         if token in gold_tokens:
261 |             ner_info.append("REL")
262 |         else:
263 |             ner_info.append("O")
264 |     # can not provide any ner_info here
265 |     ner_info = ner_info + [qres_input_separator] + (len(tokenized_question) * ["O"])
266 |     return [tokenized_input, ner_info]
267 | 


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/clocq_er.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import os
  3 | import re
  4 | import time
  5 | import pickle
  6 | import logging
  7 | from pathlib import Path
  8 | from filelock import FileLock
  9 | 
 10 | from clocq.CLOCQ import CLOCQ
 11 | from clocq.interface.CLOCQInterfaceClient import CLOCQInterfaceClient
 12 | 
 13 | from convinse.library.utils import print_verbose, get_logger
 14 | from convinse.evidence_retrieval_scoring.wikipedia_retriever.wikipedia_retriever import (
 15 | 	WikipediaRetriever,
 16 | )
 17 | 
 18 | ENT_PATTERN = re.compile("^Q[0-9]+$")
 19 | PRE_PATTERN = re.compile("^P[0-9]+$")
 20 | 
 21 | KB_ITEM_SEPARATOR = ", "
 22 | 
 23 | 
 24 | class ClocqRetriever:
 25 | 	def __init__(self, config):
 26 | 		self.config = config
 27 | 		self.logger = get_logger(__name__, config)
 28 | 
 29 | 		# load cache
 30 | 		self.use_cache = config["ers_use_cache"]
 31 | 		if self.use_cache:
 32 | 			self.cache_path = config["ers_cache_path"]
 33 | 			self._init_cache()
 34 | 			self.cache_changed = False
 35 | 
 36 | 		# initialize clocq for KB-facts and disambiguations
 37 | 		if config["clocq_use_api"]:
 38 | 			self.clocq = CLOCQInterfaceClient(host=config["clocq_host"], port=config["clocq_port"])
 39 | 		else:
 40 | 			self.clocq = CLOCQ()
 41 | 
 42 | 		# initialize wikipedia-retriever
 43 | 		self.wiki_retriever = WikipediaRetriever(config)
 44 | 		if config["qu"] == "sr":
 45 | 			self.sr_delimiter = config["sr_delimiter"].strip()
 46 | 		else:
 47 | 			self.sr_delimiter = " "
 48 | 
 49 | 	def retrieve_evidences(self, structured_representation, sources):
 50 | 		"""
 51 | 		Retrieve evidences and question entities
 52 | 		for the given SR (or other question/text).
 53 | 
 54 | 		This function is used for initial evidence
 55 | 		retrieval. These evidences are filtered in the
 56 | 		next step.
 57 | 
 58 | 		Can also be used from external modules to access
 59 | 		all evidences for the given SR (if possible from cache).
 60 | 		"""
 61 | 		# KB-facts (always required for question entities)
 62 | 		all_question_entities = list()
 63 | 		all_question_entity_ids = set()
 64 | 		all_evidences = list()
 65 | 
 66 | 		# remove delimiter from SR
 67 | 		structured_representation = structured_representation.replace(self.sr_delimiter, " ")
 68 | 
 69 | 		evidences, question_entities = self.retrieve_KB_facts(structured_representation)
 70 | 		all_evidences += evidences
 71 | 
 72 | 		## TODO: might not be required any more, since only single SR considered
 73 | 		# identify new entities (avoid duplicates)
 74 | 		new_entities = [
 75 | 			entity
 76 | 			for entity in question_entities
 77 | 			if not entity["item"]["id"] in all_question_entity_ids
 78 | 		]
 79 | 		all_question_entities += new_entities
 80 | 
 81 | 		# update set of entity ids
 82 | 		all_question_entity_ids.update([entity["item"]["id"] for entity in new_entities])
 83 | 
 84 | 		# wikipedia evidences (only if required)
 85 | 		if any(src in sources for src in ["text", "table", "info"]):
 86 | 			for question_entity in all_question_entities:
 87 | 				all_evidences += self.retrieve_wikipedia_evidences(question_entity)
 88 | 
 89 | 		# config-based filtering
 90 | 		all_evidences = self.filter_evidences(all_evidences, sources)
 91 | 		return all_evidences, all_question_entities
 92 | 
 93 | 	def retrieve_wikipedia_evidences(self, question_entity):
 94 | 		"""
 95 | 		Retrieve evidences from Wikipedia for the given question entity.
 96 | 		"""
 97 | 		question_entity_id = question_entity["item"]["id"]
 98 | 
 99 | 		# look-up cache
100 | 		# if self.use_cache and question_entity_id in self.cache["wikipedia"]:
101 | 			# return self.cache["wikipedia"][question_entity_id]
102 | 
103 | 		# used for debugging
104 | 		# self.logger.debug(f"No cache hit: Retrieving wikipedia evidences for: {question_entity_id}.")
105 | 		# cache_res = self.cache["wikipedia"].get(question_entity_id)
106 | 		# if self.use_cache:
107 | 		# 	cache_len = len(self.cache["wikipedia"])
108 | 		# 	# self.logger.debug(f"Cache result: {cache_res}.")
109 | 		# 	self.logger.debug(f"Cache length: {cache_len}.")
110 | 	 
111 | 		 # retrieve result
112 | 		evidences = self.wiki_retriever.retrieve_wp_evidences(question_entity_id)
113 | 		for evidence in evidences:
114 | 			evidence["retrieved_for_entity"] = question_entity["item"]
115 |  
116 | 		assert not evidences is None # evidences should never be None
117 | 		# store result in cache
118 | 		# if self.use_cache:
119 | 			# self.cache_changed = True
120 | 			# self.cache["wikipedia"][question_entity_id] = evidences
121 | 		return evidences
122 | 
123 | 	def retrieve_KB_facts(self, structured_representation):
124 | 		"""
125 | 		Retrieve KB facts for the given SR (or other question/text).
126 | 		Also returns the question entities, for usage in Wikipedia retriever.
127 | 		"""
128 | 
129 | 		def _is_potential_answer(item_id):
130 | 			"""Return if item_id could be answer."""
131 | 			# keep all KB-items except for predicates
132 | 			if PRE_PATTERN.match(item_id):
133 | 				return False
134 | 			return True
135 | 
136 | 		# look-up cache
137 | 		if self.use_cache and structured_representation in self.cache["kb"]:
138 | 			return self.cache["kb"][structured_representation]
139 | 
140 | 		self.logger.debug(f"No cache hit: Retrieving search space for: {structured_representation}.")
141 | 
142 | 		# apply CLOCQ
143 | 		clocq_result = self.clocq.get_search_space(
144 | 			structured_representation, parameters=self.config["clocq_params"], include_labels=True
145 | 		)
146 | 
147 | 		# get question entities (predicates dropped)
148 | 		question_entities = [
149 | 			item
150 | 			for item in clocq_result["kb_item_tuple"]
151 | 			if not item["item"]["id"] is None and ENT_PATTERN.match(item["item"]["id"])
152 | 		]
153 | 
154 | 		question_items_set = set([item["item"]["id"] for item in clocq_result["kb_item_tuple"]])
155 | 
156 | 		# remember potential duplicate facts
157 | 		potential_duplicates = set()
158 | 
159 | 		# transform facts to evidences
160 | 		evidences = list()
161 | 		for fact in clocq_result["search_space"]:
162 | 			# evidence text
163 | 			evidence_text = self._kb_fact_to_text(fact)
164 | 
165 | 			# entities the fact was retrieved for from clocq
166 | 			retrieved_for = [item for item in fact if item["id"] in question_items_set][0]
167 | 
168 | 			# potential duplicate
169 | 			if len(retrieved_for) > 1:
170 | 				# skip duplicate
171 | 				if evidence_text in potential_duplicates:
172 | 					continue
173 | 				# remember evidence
174 | 				potential_duplicates.add(evidence_text)
175 | 
176 | 			evidence = {
177 | 				"evidence_text": evidence_text,
178 | 				"wikidata_entities": [item for item in fact if _is_potential_answer(item["id"])],
179 | 				"disambiguations": [
180 | 					(item["label"], item["id"]) for item in fact if ENT_PATTERN.match(item["id"])
181 | 				],
182 | 				"retrieved_for_entity": retrieved_for,
183 | 				"source": "kb",
184 | 			}
185 | 			evidences.append(evidence)
186 | 
187 | 		# store result in cache
188 | 		if self.use_cache:
189 | 			self.cache_changed = True
190 | 			self.cache["kb"][structured_representation] = (evidences, question_entities)
191 | 		return evidences, question_entities
192 | 
193 | 	def retrieve_kb_facts_for_item(self, item_id):
194 | 		"""
195 | 		Retrieve KB facts with the given KB item ID.
196 | 		Returns a list of KB facts, with each KB fact being a tuple of {"id", "label"} dictionaries.
197 | 		"""
198 | 		# retrieve KB facts via clocq (API)
199 | 		return self.clocq.get_neighborhood(item_id, p=self.config["clocq_p"], include_labels=True)
200 | 
201 | 	def filter_evidences(self, evidences, sources):
202 | 		"""
203 | 		Filter the set of evidences according to their source.
204 | 		"""
205 | 		filtered_evidences = list()
206 | 		for evidence in evidences:
207 | 			if len(evidence["wikidata_entities"]) == 1:
208 | 				continue
209 | 			if len(evidence["wikidata_entities"]) > self.config["evr_max_entities"]:
210 | 				continue
211 | 			if evidence["source"] in sources:
212 | 				filtered_evidences.append(evidence)
213 | 
214 | 		return filtered_evidences
215 | 
216 | 	def _kb_fact_to_text(self, fact):
217 | 		"""Verbalize the KB-fact."""
218 | 		return KB_ITEM_SEPARATOR.join([item["label"] for item in fact])
219 | 
220 | 	def store_cache(self):
221 | 		"""Store the cache to disk."""
222 | 		if not self.use_cache: # store only if cache in use
223 | 			return
224 | 		if not self.cache_changed: # store only if cache changed
225 | 			return
226 | 		# check if the cache was updated by other processes
227 | 		if self._read_cache_version() == self.cache_version:
228 | 			# no updates: store and update version
229 | 			self.logger.info(f"Writing ER cache at path {self.cache_path}.")
230 | 			with FileLock(f"{self.cache_path}.lock"):
231 | 				self._write_cache(self.cache)
232 | 				self._write_cache_version()
233 | 		else:
234 | 			# update! read updated version and merge the caches
235 | 			self.logger.info(f"Merging ER cache at path {self.cache_path}.")
236 | 			with FileLock(f"{self.cache_path}.lock"):
237 | 				# read updated version
238 | 				updated_cache = self._read_cache()
239 | 				# overwrite with changes in current process (most recent)
240 | 				updated_cache["kb"].update(self.cache["kb"])
241 | 				updated_cache["wikipedia"].update(self.cache["wikipedia"])
242 | 				# store
243 | 				self._write_cache(updated_cache)
244 | 				self._write_cache_version()
245 | 		# store extended wikipedia dump (if any changes occured)
246 | 		self.wiki_retriever.store_dump()
247 | 
248 | 	def reset_cache(self):
249 | 		"""Reset the cache for new population."""
250 | 		self.logger.warn(f"Resetting ER cache at path {self.cache_path}.")
251 | 		with FileLock(f"{self.cache_path}.lock"):
252 | 			self.cache = {"kb": {}, "wikipedia": {}}
253 | 			self._write_cache(self.cache)
254 | 			self._write_cache_version()
255 | 
256 | 	def _init_cache(self):
257 | 		"""Initialize the cache."""
258 | 		if os.path.isfile(self.cache_path):
259 | 			# remember version read initially
260 | 			self.logger.info(f"Loading ER cache from path {self.cache_path}.")
261 | 			with FileLock(f"{self.cache_path}.lock"):
262 | 				self.cache_version = self._read_cache_version()
263 | 				self.logger.debug(self.cache_version)
264 | 				self.cache = self._read_cache()
265 | 			self.logger.info(f"ER cache successfully loaded.")
266 | 		else:
267 | 			self.logger.info(f"Could not find an existing ER cache at path {self.cache_path}.")
268 | 			self.logger.info("Populating ER cache from scratch!")
269 | 			self.cache = {"kb": {}, "wikipedia": {}}
270 | 			self._write_cache(self.cache)
271 | 			self._write_cache_version()
272 | 
273 | 	def _read_cache(self):
274 | 		"""
275 | 		Read the current version of the cache.
276 | 		This can be different from the version used in this file,
277 | 		given that multiple processes may access it simultaneously.
278 | 		"""
279 | 		# read file content from cache shared across QU methods
280 | 		with open(self.cache_path, "rb") as fp:
281 | 			cache = pickle.load(fp)
282 | 		return cache
283 | 
284 | 	def _write_cache(self, cache):
285 | 		"""Write to the cache."""
286 | 		cache_dir = os.path.dirname(self.cache_path)
287 | 		Path(cache_dir).mkdir(parents=True, exist_ok=True)
288 | 		with open(self.cache_path, "wb") as fp:
289 | 			pickle.dump(cache, fp)
290 | 		return cache
291 | 
292 | 	def _read_cache_version(self):
293 | 		"""Read the cache version (hashed timestamp of last update) from a dedicated file."""
294 | 		if not os.path.isfile(f"{self.cache_path}.version"):
295 | 			self._write_cache_version()
296 | 		with open(f"{self.cache_path}.version", "r") as fp:
297 | 			cache_version = fp.readline().strip()
298 | 		return cache_version
299 | 
300 | 	def _write_cache_version(self):
301 | 		"""Write the current cache version (hashed timestamp of current update)."""
302 | 		with open(f"{self.cache_path}.version", "w") as fp:
303 | 			version = str(time.time())
304 | 			fp.write(version)
305 | 		self.cache_version = version
306 | 


--------------------------------------------------------------------------------
/convinse/evidence_retrieval_scoring/wikipedia_retriever/wikipedia_retriever.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | import requests
  4 | import spacy
  5 | import sys
  6 | import time
  7 | import pickle
  8 | import json
  9 | from bs4 import BeautifulSoup
 10 | 
 11 | from convinse.library.utils import get_config, get_logger
 12 | import convinse.library.wikipedia_library as wiki
 13 | 
 14 | from convinse.evidence_retrieval_scoring.wikipedia_retriever.text_parser import (
 15 |     extract_text_snippets,
 16 | )
 17 | from convinse.evidence_retrieval_scoring.wikipedia_retriever.table_parser import (
 18 |     extract_wikipedia_tables,
 19 |     json_tables_to_evidences,
 20 | )
 21 | from convinse.evidence_retrieval_scoring.wikipedia_retriever.infobox_parser import (
 22 |     InfoboxParser,
 23 |     infobox_to_evidences,
 24 | )
 25 | from convinse.evidence_retrieval_scoring.wikipedia_retriever.evidence_annotator import (
 26 |     EvidenceAnnotator,
 27 | )
 28 | 
 29 | 
 30 | API_URL = "http://en.wikipedia.org/w/api.php"
 31 | PARAMS = {
 32 |     "prop": "extracts|revisions",
 33 |     "format": "json",
 34 |     "action": "query",
 35 |     "explaintext": "",
 36 |     "rvprop": "content",
 37 | }
 38 | 
 39 | YEAR_PATTERN = re.compile("^[0-9][0-9][0-9][0-9]$")
 40 | WIKI_DATE_PATTERN = re.compile("[0-9]+ [A-Z][a-z]* [0-9][0-9][0-9][0-9]")
 41 | 
 42 | 
 43 | class WikipediaRetriever:
 44 |     def __init__(self, config):
 45 |         self.config = config
 46 |         self.logger = get_logger(__name__, config)
 47 | 
 48 |         # whether Wikipedia evidences are retrieved on the fly (i.e. from the Wikipedia API)
 49 |         self.on_the_fly = config["ers_on_the_fly"]
 50 | 
 51 |         # initialize dump
 52 |         self._init_wikipedia_dump()
 53 | 
 54 |         if self.on_the_fly:
 55 |             # open dicts
 56 |             with open(config["path_to_wikidata_mappings"], "r") as fp:
 57 |                 self.wikidata_mappings = json.load(fp)
 58 |             with open(config["path_to_wikipedia_mappings"], "r") as fp:
 59 |                 self.wikipedia_mappings = json.load(fp)
 60 | 
 61 |             # initialize evidence annotator (used for (text)->Wikipedia->Wikidata)
 62 |             self.annotator = EvidenceAnnotator(config, self.wikidata_mappings)
 63 | 
 64 |             # load nlp pipeline
 65 |             self.nlp = spacy.blank("en")
 66 |             self.nlp.add_pipe("sentencizer")
 67 |         self.logger.debug("WikipediaRetriever successfully initialized!")
 68 | 
 69 |     def retrieve_wp_evidences(self, question_entity_id):
 70 |         """
 71 |         Retrieve evidences from Wikipedia for the given Wikipedia title.
 72 |         Always returns the full set of evidences (text, table, infobox).
 73 |         Filtering is done via filter_evidences function.
 74 |         """
 75 |         if question_entity_id in self.wikipedia_dump:
 76 |             self.logger.debug(f"Found Wikipedia evidences in dump!")
 77 |             return self.wikipedia_dump.get(question_entity_id)
 78 | 
 79 |         if not self.on_the_fly:
 80 |             self.logger.debug(f"No Wikipedia evidences in dump, but on-the-fly retrieval not active!")
 81 |             return []
 82 | 
 83 |         # get Wikipedia title
 84 |         wiki_path = self.wikipedia_mappings.get(question_entity_id)
 85 |         if not wiki_path:
 86 |             self.logger.debug(f"No Wikipedia link found for this Wikidata ID: {question_entity_id}.")
 87 |             self.wikipedia_dump[question_entity_id] = [] # remember
 88 |             return []
 89 |         self.logger.debug(f"Retrieving Wikipedia evidences for: {wiki_path}.")
 90 | 
 91 |         # retrieve Wikipedia soup
 92 |         wiki_title = wiki._wiki_path_to_title(wiki_path)
 93 |         soup = self._retrieve_soup(wiki_title)
 94 |         if soup is None:
 95 |             self.wikipedia_dump[question_entity_id] = [] # remember
 96 |             return []
 97 | 
 98 |         # retrieve Wikipedia markdown
 99 |         wiki_md = self._retrieve_markdown(wiki_title)
100 | 
101 |         # extract anchors
102 |         doc_anchor_dict = self._build_document_anchor_dict(soup)
103 | 
104 |         # retrieve evidences
105 |         infobox_evidences = self._retrieve_infobox_entries(wiki_title, soup, doc_anchor_dict)
106 |         table_records = self._retrieve_table_records(wiki_title, wiki_md)
107 |         text_snippets = self._retrieve_text_snippets(wiki_title, wiki_md)
108 | 
109 |         # prune e.g. too long evidences
110 |         evidences = infobox_evidences + table_records + text_snippets
111 |         evidences = self.filter_and_clean_evidences(evidences)
112 | 
113 |         ## add wikidata entities (for table and text)
114 |         # evidences with no wikidata entities (except for the wiki_path) are dropped
115 |         self.annotator.annotate_wikidata_entities(wiki_path, evidences, doc_anchor_dict)
116 | 
117 |         # store result in dump
118 |         self.wikipedia_dump[question_entity_id] = evidences
119 | 
120 |         self.logger.debug(f"Evidences successfully retrieved for {question_entity_id}.")
121 |         return evidences
122 | 
123 |     def filter_and_clean_evidences(self, evidences):
124 |         """
125 |         Drop evidences which do not suffice specific
126 |         criteria. E.g. such evidences could be too
127 |         short, long, or contain too many symbols.
128 |         """
129 |         filtered_evidences = list()
130 |         for evidence in evidences:
131 |             evidence_text = evidence["evidence_text"]
132 |             ## filter evidences
133 |             # too short
134 |             if len(evidence_text) < self.config["evr_min_evidence_length"]:
135 |                 continue
136 |             # too long
137 |             if len(evidence_text) > self.config["evr_max_evidence_length"]:
138 |                 continue
139 |             # ratio of letters very low
140 |             letters = sum(c.isalpha() for c in evidence_text)
141 |             if letters < len(evidence_text) / 2:
142 |                 continue
143 | 
144 |             ## clean evidence
145 |             evidence_text = self.clean_evidence(evidence_text)
146 |             evidence["evidence_text"] = evidence_text
147 |             filtered_evidences.append(evidence)
148 |         return filtered_evidences
149 | 
150 |     def clean_evidence(self, evidence_text):
151 |         """Clean the given evidence text."""
152 |         evidence_text = re.sub(r"\[[0-9]*\]", "", evidence_text)
153 |         return evidence_text
154 | 
155 |     def _retrieve_infobox_entries(self, wiki_title, soup, doc_anchor_dict):
156 |         """
157 |         Retrieve infobox entries for the given Wikipedia entity.
158 |         """
159 |         # get infobox (only one infobox possible)
160 |         infoboxes = soup.find_all("table", {"class": "infobox"})
161 |         if not infoboxes:
162 |             return []
163 |         infobox = infoboxes[0]
164 | 
165 |         # parse infobox content
166 |         p = InfoboxParser(doc_anchor_dict)
167 |         infobox_html = str(infobox)
168 |         p.feed(infobox_html)
169 | 
170 |         # transform parsed infobox to evidences
171 |         infobox_parsed = p.tables[0]
172 |         evidences = infobox_to_evidences(infobox_parsed, wiki_title)
173 |         return evidences
174 | 
175 |     def _retrieve_table_records(self, wiki_title, wiki_md):
176 |         """
177 |         Retrieve table records for the given Wikipedia entity.
178 |         """
179 |         # extract wikipedia tables
180 |         tables = extract_wikipedia_tables(wiki_md)
181 | 
182 |         # extract evidences from tables
183 |         evidences = json_tables_to_evidences(tables, wiki_title)
184 |         return evidences
185 | 
186 |     def _retrieve_text_snippets(self, wiki_title, wiki_md):
187 |         """
188 |         Retrieve text snippets for the given Wikidata entity.
189 |         """
190 |         evidences = extract_text_snippets(wiki_md, wiki_title, self.nlp)
191 |         return evidences
192 | 
193 |     def _build_document_anchor_dict(self, soup):
194 |         """
195 |         Establishes a dictionary that maps from Wikipedia text
196 |         to the Wikipedia entity (=link). Is used to map to
197 |         Wikidata entities (via Wikipedia) later.
198 |         Format: text -> Wikidata entity.
199 |         """
200 |         # prune navigation bar
201 |         for div in soup.find_all("div", {"class": "navbox"}):
202 |             div.decompose()
203 | 
204 |         # go through links
205 |         anchor_dict = dict()
206 |         for tag in soup.find_all("a"):
207 |             # anchor text
208 |             text = tag.text.strip()
209 |             if len(text) < 3:
210 |                 continue
211 |             # duplicate anchor text (keep first)
212 |             # -> later ones can be more specific/incorrect
213 |             if anchor_dict.get(text):
214 |                 continue
215 | 
216 |             # wiki title (=entity)
217 |             href = tag.attrs.get("href")
218 |             if not wiki.is_wikipedia_path(href):
219 |                 continue
220 |             wiki_path = wiki.format_wiki_path(href)
221 | 
222 |             anchor_dict[text] = wiki_path
223 |         return anchor_dict
224 | 
225 |     def _retrieve_soup(self, wiki_title):
226 |         """
227 |         Retrieve Wikipedia html for the given Wikipedia Title.
228 |         """
229 |         wiki_path = wiki._wiki_title_to_path(wiki_title)
230 |         link = f"https://en.wikipedia.org/wiki/{wiki_path}"
231 |         try:
232 |             html = requests.get(link).text
233 |             soup = BeautifulSoup(html, features="html.parser")
234 |         except:
235 |             return None
236 |         return soup
237 | 
238 |     def _retrieve_markdown(self, wiki_title):
239 |         """
240 |         Retrieve the content of the given wikipedia title.
241 |         """
242 |         params = PARAMS.copy()
243 |         params["titles"] = wiki_title
244 |         try:
245 |             # make request
246 |             r = requests.get(API_URL, params=params)
247 |             res = r.json()
248 |         except:
249 |             return None
250 |         pages = res["query"]["pages"]
251 |         page = list(pages.values())[0]
252 |         return page
253 | 
254 |     def _init_wikipedia_dump(self):
255 |         """
256 |         Initialize the Wikipedia dump. The consists of a mapping
257 |         from Wikidata IDs to Wikipedia evidences in the expected format.
258 |         """
259 |         path_to_dump = self.config["ers_wikipedia_dump"]
260 |         with open(path_to_dump, "rb") as fp:
261 |             self.wikipedia_dump = pickle.load(fp)
262 |         self.wikipedia_dump_version = len(self.wikipedia_dump)
263 | 
264 |     def store_dump(self):
265 |         """Store the updated Wikipedia dump."""
266 |         if len(self.wikipedia_dump) > self.wikipedia_dump_version:
267 |             self.logger.info("Wikipedia dump extended! Storing data on disk.")
268 |             path_to_dump = self.config["ers_wikipedia_dump"]
269 |             with open(path_to_dump, "wb") as fp:
270 |                 pickle.dump(self.wikipedia_dump, fp)
271 | 
272 | #######################################################################################################################
273 | #######################################################################################################################
274 | if __name__ == "__main__":
275 |     # RUN: python convinse.evidence_retrieval_scoring/wikipedia_retriever/wikipedia_retriever.py config/convmix/convinse.yml
276 |     if len(sys.argv) != 2:
277 |         raise Exception(
278 |             "python convinse.evidence_retrieval_scoring/wikipedia_retriever/wikipedia_retriever.py <PATH_TO_CONFIG>"
279 |         )
280 | 
281 |     # load config
282 |     config_path = sys.argv[1]
283 |     config = get_config(config_path)
284 | 
285 |     # create retriever
286 |     retriever = WikipediaRetriever(config)
287 | 
288 |     # retrieve evidences
289 |     start = time.time()
290 |     question_entity = {"id": "Q23572", "label": "Game of Thrones"}
291 |     evidences = retriever.retrieve_wp_evidences(question_entity["id"])
292 |     print("Time consumed", time.time() - start)
293 | 
294 |     # show evidences
295 |     for evidence in evidences:
296 |         print(evidence)
297 |         break
298 | 


--------------------------------------------------------------------------------
/convinse/pipeline.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import json
  4 | import copy
  5 | 
  6 | from convinse.evaluation import answer_presence
  7 | from convinse.library.utils import get_config, get_logger, get_result_logger, store_json_with_mkdir
  8 | 
  9 | # qu
 10 | from convinse.question_understanding.naive_concat.naive_concat import NaiveConcat
 11 | from convinse.question_understanding.question_rewriting.question_rewriting_module import (
 12 | 	QuestionRewritingModule,
 13 | )
 14 | from convinse.question_understanding.question_resolution.question_resolution_module import (
 15 | 	QuestionResolutionModule,
 16 | )
 17 | from convinse.question_understanding.structured_representation.structured_representation_module import (
 18 | 	StructuredRepresentationModule,
 19 | )
 20 | 
 21 | # ers
 22 | from convinse.evidence_retrieval_scoring.clocq_bm25 import ClocqBM25
 23 | from convinse.evidence_retrieval_scoring.clocq_er import ClocqRetriever
 24 | 
 25 | # ha
 26 | from convinse.heterogeneous_answering.fid_module.fid_module import FiDModule
 27 | 
 28 | 
 29 | class Pipeline:
 30 | 	def __init__(self, config, use_gold_answers):
 31 | 		"""Create the pipeline based on the config."""
 32 | 		# load config
 33 | 		self.config = config
 34 | 		self.logger = get_logger(__name__, config)
 35 | 		self.result_logger = get_result_logger(config)
 36 | 
 37 | 		# load individual modules
 38 | 		self.qu = self._load_qu(use_gold_answers)
 39 | 		self.ers = self._load_ers()
 40 | 		self.ha = self._load_ha()
 41 | 
 42 | 		self.name = config["name"]
 43 | 
 44 | 	def train(self, sources_str):
 45 | 		"""
 46 | 		Train the given pipeline in the standard manner.
 47 | 		First, train the QU phase (if required), run the inference on all sets,
 48 | 		and then train the ERS (if required), run inference in all sets, and
 49 | 		finally train the HA model.
 50 | 		"""
 51 | 		sources = sources_str.split("_")
 52 | 
 53 | 		# Question Understanding (QU)
 54 | 		self.qu.train()
 55 | 		self.qu.inference()
 56 | 		self.qu = None  # free up memory
 57 | 
 58 | 		# Evidence Retrieval and Scoring (ERS)
 59 | 		self.ers.train()
 60 | 		self.ers.inference(sources)
 61 | 		self.ers = None  # free up memory
 62 | 
 63 | 		# Heterogeneous Answering (HA)
 64 | 		self.ha.train()
 65 | 
 66 | 	def main_results(self):
 67 | 		"""
 68 | 		Run the pipeline using gold answers on all source combinations in the CONVINSE paper.
 69 | 		"""
 70 | 		source_combinations = ["kb_text_table_info", "kb", "text", "table", "info", "kb_text", "kb_table", "kb_info", "text_table", "text_info", "table_info"]
 71 | 		self.run_with_gold_answers(source_combinations, clean_up=False)
 72 | 
 73 | 	def run_with_gold_answers(self, sources_str, clean_up=False):
 74 | 		"""
 75 | 		Run the pipeline using gold answers for the next turns.
 76 | 		`sources_str` can either be a single string, or a list of strings (several source combinations).
 77 | 		The parameter `clean_up` controls whether the QU and ERS modules would
 78 | 		be unset after their inference, for freeing up some memory.
 79 | 		"""
 80 | 		# define output path
 81 | 		if not isinstance(sources_str, list):
 82 | 			source_combinations = [sources_str]
 83 | 		else:
 84 | 			source_combinations = sources_str
 85 | 		output_dir = self.set_output_dir(source_combinations[0])
 86 | 
 87 | 		# open data
 88 | 		input_dir = config["path_to_intermediate_results"]
 89 | 		input_path = os.path.join(input_dir, "annotated_test.json")
 90 | 		with open(input_path, "r") as fp:
 91 | 			data = json.load(fp)
 92 | 
 93 | 		self.logger.debug(f"len(input_data) {len(data)}")
 94 | 
 95 | 		self.qu.inference_on_data(data)
 96 | 		if clean_up: self.qu = None  # free up memory
 97 | 		output_path = f"{output_dir}/res_{self.name}_qu.json"
 98 | 		store_json_with_mkdir(data, output_path)
 99 | 
100 | 		# run inference on data
101 | 		for sources_str in source_combinations:
102 | 			input_data = copy.deepcopy(data)
103 | 
104 | 			# define output path
105 | 			output_dir = self.set_output_dir(sources_str)
106 | 
107 | 			self.ers.inference_on_data(input_data, sources_str.split("_"))
108 | 			self.ers.store_cache()
109 | 			if clean_up: self.ers = None  # free up memory
110 | 			output_path = f"{output_dir}/res_{self.name}_ers.json"
111 | 			store_json_with_mkdir(input_data, output_path)
112 | 
113 | 			self.ha.inference_on_data(input_data)
114 | 			output_path = f"{output_dir}/res_{self.name}_gold_answers.json"
115 | 			store_json_with_mkdir(input_data, output_path)
116 | 
117 | 			# compute results
118 | 			p_at_1_list = [turn["p_at_1"] for conv in input_data for turn in conv["questions"]]
119 | 			p_at_1 = sum(p_at_1_list) / len(p_at_1_list)
120 | 			p_at_1 = round(p_at_1, 3)
121 | 			ans_pres_list = [turn["answer_presence"] for conv in input_data for turn in conv["questions"]]
122 | 			ans_pres = sum(ans_pres_list) / len(ans_pres_list)
123 | 			ans_pres = round(ans_pres, 3)
124 | 			num_questions = len(p_at_1_list)
125 | 
126 | 			# log result
127 | 			res_str = f"Gold answers - {sources_str} - P@1 ({num_questions}): {p_at_1}"
128 | 			self.logger.info(res_str)
129 | 			self.result_logger.info(res_str)
130 | 
131 | 	def run_with_predicted_answers(self, sources_str):
132 | 		"""
133 | 		Run the instantiated pipeline, using the predicted answers of previous turns
134 | 		for generating the output of the QU phase.
135 | 		"""
136 | 
137 | 		def _next_turn_exists(current_turn_id, benchmark):
138 | 			"""Check if there is a next turn. Assumes that all conversations have same length."""
139 | 			return any([(current_turn_id < len(conv["questions"])) for conv in benchmark])
140 | 
141 | 		def _first_k_turns(conversation, k):
142 | 			"""Get first k turns of conversation."""
143 | 			return conversation["questions"][:k]
144 | 
145 | 		# define output path
146 | 		output_dir = self.set_output_dir(sources_str)
147 | 
148 | 		# open data
149 | 		input_dir = config["path_to_intermediate_results"]
150 | 		input_path = os.path.join(input_dir, "annotated_test.json")
151 | 		with open(input_path, "r") as fp:
152 | 			benchmark = json.load(fp)
153 | 
154 | 		# prepare input for first turn
155 | 		current_turn_id = 0
156 | 		output_prev_turn = benchmark.copy()
157 | 
158 | 		# iterate through turns
159 | 		while _next_turn_exists(current_turn_id, benchmark):
160 | 			# while current_turn_id < 5:
161 | 			self.logger.info(f"Starting with turn {current_turn_id}")
162 | 			# prepare for next turn
163 | 			input_data = copy.deepcopy(benchmark)
164 | 			for i, conv in enumerate(input_data):
165 | 				if current_turn_id >= len(conv["questions"]):
166 | 					input_data[i] = output_prev_turn[i]
167 | 					continue
168 | 				conv_output_prev_turn = output_prev_turn[i].copy()
169 | 				conv["questions"] = _first_k_turns(conv_output_prev_turn, current_turn_id) + [
170 | 					conv["questions"][current_turn_id]
171 | 				]
172 | 
173 | 			### QU
174 | 			self.logger.info(f"QU for turn {current_turn_id}")
175 | 			output_path = f"{output_dir}/qu_input_turn_{current_turn_id}.json"
176 | 			store_json_with_mkdir(input_data, output_path)
177 | 			self.qu.inference_on_data(input_data)
178 | 
179 | 			### ERS
180 | 			self.logger.info(f"ERS for turn {current_turn_id}")
181 | 			output_path = f"{output_dir}/ers_input_turn_{current_turn_id}.json"
182 | 			input_turns = [
183 | 				conv["questions"][current_turn_id]
184 | 				for conv in input_data
185 | 				if current_turn_id < len(conv["questions"])
186 | 			]
187 | 			store_json_with_mkdir(input_turns, output_path)
188 | 			self.ers.inference_on_turns(input_turns)
189 | 
190 | 			### HA
191 | 			self.logger.info(f"HA for turn {current_turn_id}")
192 | 			self.ha.inference_on_turns(input_turns)
193 | 
194 | 			# prepare for next turn
195 | 			output_prev_turn = input_data
196 | 			current_turn_id += 1
197 | 
198 | 			# store intermediate res
199 | 			output_path = f"{output_dir}/res_turn_{current_turn_id}.json"
200 | 			store_json_with_mkdir(output_prev_turn, output_path)
201 | 
202 | 		# compute results
203 | 		p_at_1_list = [turn["p_at_1"] for conv in output_prev_turn for turn in conv["questions"]]
204 | 		p_at_1 = sum(p_at_1_list) / len(p_at_1_list)
205 | 		p_at_1 = round(p_at_1, 3)
206 | 		num_questions = len(p_at_1_list)
207 | 
208 | 		# log result
209 | 		res_str = f"Pred. answers - {sources_str} - P@1 ({num_questions}): {p_at_1}"
210 | 		self.logger.info(res_str)
211 | 
212 | 		# store cache
213 | 		self.ers.store_cache()
214 | 
215 | 	def example(self):
216 | 		"""Run pipeline on a single input turn."""
217 | 		turn = {
218 | 			"question_id": "0",
219 | 			"turn": 0,
220 | 			"answers": [{"id": "Q445772", "label": "Nikolaj Coster-Waldau"}],
221 | 			"question": "Who played Jaime Lannister in Game of Thrones?",
222 | 		}
223 | 		self.logger.info(f"Running QU")
224 | 		self.qu.inference_on_turn(turn, [])
225 | 		self.logger.debug(turn)
226 | 		self.logger.info(f"Running ERS")
227 | 		self.ers.inference_on_turn(turn)
228 | 		self.logger.debug(turn)
229 | 		self.logger.info(f"Running HA")
230 | 		self.ha.inference_on_turn(turn)
231 | 		self.logger.info(turn)
232 | 		self.result_logger.info("hi")
233 | 
234 | 	def set_output_dir(self, sources_str):
235 | 		"""Define path for outputs."""
236 | 		qu = self.config["qu"]
237 | 		ers = self.config["ers"]
238 | 		ha = self.config["ha"]
239 | 		path_to_intermediate_results = self.config["path_to_intermediate_results"]
240 | 
241 | 		output_dir = os.path.join(path_to_intermediate_results, qu, ers, sources_str, ha)
242 | 		return output_dir
243 | 
244 | 	def _load_qu(self, use_gold_answers):
245 | 		"""Instantiate QU stage of CONVINSE pipeline."""
246 | 		qu = self.config["qu"]
247 | 		self.logger.info("Loading QU module")
248 | 		if qu.startswith("nc_"):
249 | 			return NaiveConcat(self.config, use_gold_answers)
250 | 		elif qu == "sr":
251 | 			return StructuredRepresentationModule(self.config, use_gold_answers)
252 | 		elif qu == "qrew":
253 | 			return QuestionRewritingModule(self.config, use_gold_answers)
254 | 		elif qu == "qres":
255 | 			return QuestionResolutionModule(self.config, use_gold_answers)
256 | 		else:
257 | 			raise ValueError(
258 | 				f"There is no available module for instantiating the QU phase called {qu}."
259 | 			)
260 | 
261 | 	def _load_ers(self):
262 | 		"""Instantiate ERS stage of CONVINSE pipeline."""
263 | 		ers = self.config["ers"]
264 | 		self.logger.info("Loading ERS module")
265 | 		if ers == "clocq_bm25":
266 | 			return ClocqBM25(self.config)
267 | 		else:
268 | 			raise ValueError(
269 | 				f"There is no available module for instantiating the ERS phase called {ers}."
270 | 			)
271 | 
272 | 	def _load_ha(self):
273 | 		"""Instantiate HA stage of CONVINSE pipeline."""
274 | 		ha = self.config["ha"]
275 | 		self.logger.info("Loading HA module")
276 | 		if ha == "fid":
277 | 			return FiDModule(self.config)
278 | 		else:
279 | 			raise ValueError(
280 | 				f"There is no available module for instantiating the HA phase called {ha}."
281 | 			)
282 | 
283 | 
284 | #######################################################################################################################
285 | #######################################################################################################################
286 | if __name__ == "__main__":
287 | 	if len(sys.argv) < 3:
288 | 		raise Exception("Usage: python convinse/pipeline.py <FUNCTION> <PATH_TO_CONFIG> [<SOURCES_STRING>]")
289 | 
290 | 	# load config
291 | 	function = sys.argv[1]
292 | 	config_path = sys.argv[2]
293 | 	config = get_config(config_path)
294 | 
295 | 	# inference using predicted answers
296 | 	if function == "--train":
297 | 		sources_str = sys.argv[3] if len(sys.argv) > 3 else "kb_text_table_info"
298 | 		pipeline = Pipeline(config, use_gold_answers=True)
299 | 		pipeline.train(sources_str)
300 | 
301 | 	elif function == "--main-results":
302 | 		pipeline = Pipeline(config, use_gold_answers=True)
303 | 		pipeline.main_results()
304 | 
305 | 	elif function == "--gold-answers":
306 | 		sources_str = sys.argv[3] if len(sys.argv) > 3 else "kb_text_table_info"
307 | 		pipeline = Pipeline(config, use_gold_answers=True)
308 | 		pipeline.run_with_gold_answers(sources_str)
309 | 
310 | 	elif function == "--pred-answers":
311 | 		sources_str = sys.argv[3] if len(sys.argv) > 3 else "kb_text_table_info"
312 | 		pipeline = Pipeline(config, use_gold_answers=False)
313 | 		pipeline.run_with_predicted_answers(sources_str)
314 | 
315 | 	elif function == "--example":
316 | 		pipeline = Pipeline(config, use_gold_answers=False)
317 | 		pipeline.example()
318 | 
319 | 	else:
320 | 		raise Exception(f"Unknown function {function}!")
321 | 


--------------------------------------------------------------------------------
/convinse/distant_supervision/structured_representation_annotator.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import json
  3 | import sys
  4 | import time
  5 | 
  6 | from convinse.library.utils import get_logger
  7 | from convinse.library.string_library import StringLibrary
  8 | 
  9 | 
 10 | class StructuredRepresentationAnnotator:
 11 |     def __init__(self, clocq, config):
 12 |         self.clocq = clocq
 13 |         self.config = config
 14 | 
 15 |         self.string_lib = StringLibrary(config)
 16 |         with open(config["path_to_labels"], "r") as fp:
 17 |             self.labels_dict = json.load(fp)
 18 | 
 19 |         self.type_relevance_cache = dict()
 20 | 
 21 |     def annotate_structured_representations(self, flow_graph, conversation):
 22 |         """
 23 |         Get the abstract representations for the questions in the flow_graph.
 24 |         """
 25 |         # initialize
 26 |         structured_representations = dict()
 27 |         explored_turns = set()
 28 |         leafs = flow_graph["leafs"]
 29 | 
 30 |         # search tree bottom-up
 31 |         while leafs:
 32 |             for node in leafs:
 33 |                 turn_id = node["turn"]
 34 |                 node_type = node["type"]
 35 |                 node_str = str(turn_id) + node_type
 36 |                 if node_str in explored_turns:
 37 |                     continue
 38 |                 elif node["type"] == "question":
 39 |                     # extract SR
 40 |                     structured_representation = self._extract_structured_representation(
 41 |                         node, turn_id, conversation, structured_representations
 42 |                     )
 43 |                     structured_representations[turn_id] = structured_representation
 44 | 
 45 |                     # add SR to data
 46 |                     conversation["questions"][turn_id]["silver_SR"].append(
 47 |                         structured_representation
 48 |                     )
 49 | 
 50 |                 explored_turns.add(node_str)
 51 |             leafs = [node for leaf in leafs for node in leaf["parents"]]
 52 | 
 53 |     def _extract_structured_representation(
 54 |         self, node, turn_id, conversation, structured_representations
 55 |     ):
 56 |         """
 57 |         Extract a structured representation for the question.
 58 |         """
 59 |         question = node["question"]
 60 |         if self.config["benchmark"] == "convquestions":
 61 |             answers = conversation["questions"][turn_id]["answer"]
 62 |             answers = self.string_lib.parse_answers_to_dicts(answers, self.labels_dict)
 63 |         else:
 64 |             answers = conversation["questions"][turn_id]["answers"]
 65 | 
 66 |         disambiguations_context = node["relevant_context"]
 67 |         disambiguations_current = node["relevant_disambiguations"]
 68 | 
 69 |         # extract common disambiguations
 70 |         # -> avoid that same disambiguation is in context and entity slot!
 71 |         res = self._extract_common_disambiguations(disambiguations_context, disambiguations_current)
 72 |         entities, common_disambiguations, context = res
 73 | 
 74 |         # remove entity surface forms from question
 75 |         question = self._remove_surface_forms(question, disambiguations_current)
 76 | 
 77 |         # derive SR context and entities
 78 |         sr_context = self._get_context(entities, common_disambiguations, context)
 79 |         sr_entities = self._get_entities(entities, common_disambiguations, context)
 80 | 
 81 |         ## derive SR relation
 82 |         # remove stopwords
 83 |         if self.config["sr_remove_stopwords"]:
 84 |             words = self.string_lib.get_question_words(question, ner=None)
 85 |             sr_relation = [" ".join(words)]
 86 |         else:
 87 |             # remove symbols
 88 |             sr_relation = [self._normalize_relation_str(question)]
 89 |         # shared relation
 90 |         if self.config["sr_relation_shared_active"]:
 91 |             # if the previous turn had the same predicate, append the previous turns relation here
 92 |             if node.get("relation_shared_with"):
 93 |                 prev_turn = node["relation_shared_with"]["turn"]
 94 |                 prev_sr = structured_representations[prev_turn]
 95 |                 prev_relation = prev_sr[2][-1]
 96 |                 sr_relation.append(prev_relation)
 97 | 
 98 |         # derive SR answer type
 99 |         sr_answer_type = self._get_answer_type(question, answers)
100 | 
101 |         # create SR
102 |         structured_representation = (
103 |             sr_context,
104 |             sr_entities,
105 |             sr_relation,
106 |             sr_answer_type,
107 |         )
108 |         return structured_representation
109 | 
110 |     def _remove_surface_forms(self, question, relevant_disambiguations):
111 |         """
112 |         Remove disambiguated surface forms from question. Sort surface forms by
113 |         length to avoid problems: e.g. removing 'unicorn' before removing 'last unicorn'
114 |         leads to a problem.
115 |         """
116 |         # derive set of surface forms
117 |         distinct_surface_forms = set()
118 |         for (item_id, surface_forms, label) in relevant_disambiguations:
119 |             distinct_surface_forms.update(surface_forms)
120 |         # sort surface forms by string length
121 |         distinct_surface_forms = sorted(distinct_surface_forms, key=lambda j: len(j), reverse=True)
122 |         for surface_form in distinct_surface_forms:
123 |             # mechanism to avoid lowering full question at this point
124 |             start_index = question.lower().find(surface_form.lower())
125 |             if not start_index == -1:
126 |                 end_index = start_index + len(surface_form)
127 |                 question = question[:start_index] + question[end_index:]
128 |         return question
129 | 
130 |     def _extract_common_disambiguations(self, disambiguations_context, disambiguations_current):
131 |         """
132 |         Returns the common disambiguations. We care only about surface forms here,
133 |         but compare common items.
134 |         Parameters:
135 |         - disambiguations_context: disambiguations in previous turns
136 |         - disambiguations_current: disambiguations in current turns
137 |         """
138 |         entities = set()
139 |         common_disambiguations = set()
140 |         common_disambiguations_items = set()
141 |         context = set()
142 | 
143 |         # disambiguated item-ids from current question
144 |         question_item_ids = [item for item, surface_forms, label in disambiguations_current]
145 | 
146 |         # go through disambiguations in context and check if any common disambiguations exist
147 |         for item, surface_forms, label in disambiguations_context:
148 |             if item in question_item_ids:
149 |                 for surface_form in surface_forms: # same entity can occur with different surface forms
150 |                     common_disambiguations.add(surface_form)
151 |                 common_disambiguations_items.add(item)
152 |             else:
153 |                 for surface_form in surface_forms:
154 |                     context.add(surface_form)
155 | 
156 |         # other disambiguations, that are not in common, are entities
157 |         for item, surface_forms, label in disambiguations_current:
158 |             # check if entity was already in context (common disambiguation)
159 |             if not item in common_disambiguations_items:
160 |                 for surface_form in surface_forms:
161 |                     entities.add(surface_form)
162 |         return list(entities), list(common_disambiguations), list(context)
163 | 
164 |     def _get_context(self, entities, common_disambiguations, context):
165 |         """
166 |         Get the mentions that provide additional context for the information need,
167 |         beyond the mentioned entity.
168 |         """
169 |         if entities:
170 |             return common_disambiguations + context
171 |         elif common_disambiguations:
172 |             return context
173 |         else:
174 |             return []
175 | 
176 |     def _get_entities(self, entities, common_disambiguations, context):
177 |         """ Get the mentions, that are most relevant to the information need. """
178 |         if entities:
179 |             return entities
180 |         elif common_disambiguations:
181 |             return common_disambiguations
182 |         else:
183 |             return context
184 | 
185 |     def _get_answer_type(self, question, answers):
186 |         """
187 |         Get the answer_type from the answer.
188 |         In case the answer has multiple types, compute the most relevant type
189 |         to the question using word2vec similarities
190 |         """
191 |         if self.string_lib.is_year(answers[0]["label"]):
192 |             return "year"
193 |         elif self.string_lib.is_timestamp(answers[0]["id"]):
194 |             return "date"
195 |         elif self.string_lib.is_number(answers[0]["id"]):
196 |             return "number"
197 |         elif self.string_lib.is_entity(answers[0]["id"]):
198 |             type_ = self._get_most_relevant_type(answers)
199 |             if type_ is None:
200 |                 return ""
201 |             return type_["label"]
202 |         else:
203 |             return "string"
204 | 
205 |     def _type_relevance(self, type_id):
206 |         """
207 |         Score the relevance of the type.
208 |         """
209 |         if self.type_relevance_cache.get(type_id):
210 |             return self.type_relevance_cache.get(type_id)
211 |         freq1, freq2 = self.clocq.get_frequency(type_id)
212 |         type_relevance = freq1 + freq2
213 |         self.type_relevance_cache[type_id] = type_relevance
214 |         return type_relevance
215 | 
216 |     def _get_most_relevant_type(self, answers):
217 |         """
218 |         Get the most relevant type for the item, as given by the type_relevance funtion.
219 |         """
220 |         # fetch types
221 |         all_types = list()
222 |         for item in answers:
223 |             item_id = item["id"]
224 |             types = self.clocq.get_types(item_id)
225 |             if not types:
226 |                 continue
227 |             for type_ in types:
228 |                 if type_ != "None":
229 |                     all_types.append(type_)
230 |         if not all_types:
231 |             return None
232 |         # sort types by relevance, and take top one
233 |         most_relevant_type = sorted(
234 |             all_types, key=lambda j: self._type_relevance(j["id"]), reverse=True
235 |         )[0]
236 |         return most_relevant_type
237 | 
238 |     def _get_relevant_types(self, item):
239 |         """
240 |         NOT IN USE: Get only the relevant types for the item.
241 |         E.g. Christopher Nolan has 11 different occupations, but only 3-4 are important.
242 |         Implemented by matching with description, if no exact match found return all types.
243 |         """
244 |         all_types = self.clocq.get_types(item)
245 |         description = descriptions.get(item)
246 |         if not all_types:
247 |             return "unknown"
248 |         elif not description:
249 |             return all_types
250 |         # extract types that have an exact match in the description
251 |         relevant_types = list()
252 |         for candidate in all_types:
253 |             candidate_label = candidate["label"]
254 |             if candidate_label in description:
255 |                 relevant_types.append(candidate)
256 |         # if no such exact match found, return all
257 |         if not relevant_types:
258 |             return all_types
259 |         return relevant_types
260 | 
261 |     def _normalize_relation_str(self, relation_str):
262 |         """Remove punctuation, whitespaces and lower the string."""
263 |         relation_str = (
264 |             relation_str.replace(",", "")
265 |             .replace("!", "")
266 |             .replace("?", "")
267 |             .replace(".", "")
268 |             .replace("'", "")
269 |             .replace('"', "")
270 |             .replace(":", "")
271 |             .replace("’", "")
272 |             .replace("{", "")
273 |             .replace("}", "")
274 |             .replace(" s ", " ")
275 |         )
276 |         relation_str = relation_str.lower()
277 |         relation_str = relation_str.strip()
278 |         return relation_str
279 | 


--------------------------------------------------------------------------------
/convinse/heterogeneous_answering/fid_module/fid_module.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import json
  4 | import time
  5 | import torch
  6 | import random
  7 | 
  8 | from subprocess import Popen, PIPE
  9 | 
 10 | from convinse.library.utils import get_config, store_json_with_mkdir
 11 | import convinse.heterogeneous_answering.fid_module.fid_utils as fid_utils
 12 | from convinse.heterogeneous_answering.heterogeneous_answering import HeterogeneousAnswering
 13 | import convinse.evaluation as evaluation
 14 | 
 15 | 
 16 | class FiDModule(HeterogeneousAnswering):
 17 |     def __init__(self, config):
 18 |         """Initialize the FiD module."""
 19 |         self.config = config
 20 |         self.path_to_fid = "convinse/heterogeneous_answering/fid_module/FiD"
 21 |         self._initialize_conda_dir()
 22 | 
 23 |     def train(self, sources=["kb", "text", "table", "info"]):
 24 |         """ Train the FiD model on the dataset. """
 25 |         # set paths
 26 |         sources_string = "_".join(sources)
 27 |         input_dir = self.config["path_to_intermediate_results"]
 28 |         qu = self.config["qu"]
 29 |         ers = self.config["ers"]
 30 |         train_path = os.path.join(input_dir, qu, ers, sources_string, "train_ers.jsonl")
 31 |         dev_path = os.path.join(input_dir, qu, ers, sources_string, "dev_ers.jsonl")
 32 | 
 33 |         # load train data
 34 |         with open(train_path, "r") as fp:
 35 |             train_data = list()
 36 |             line = fp.readline()
 37 |             while line:
 38 |                 train_data.append(json.loads(line))
 39 |                 line = fp.readline()
 40 |             train_input_turns = [turn for conv in train_data for turn in conv["questions"]]
 41 |         # load dev data
 42 |         with open(dev_path, "r") as fp:
 43 |             dev_data = list()
 44 |             line = fp.readline()
 45 |             while line:
 46 |                 dev_data.append(json.loads(line))
 47 |                 line = fp.readline()
 48 |             dev_input_turns = [turn for conv in dev_data for turn in conv["questions"]]
 49 | 
 50 |         # prepare paths
 51 |         prepared_train_path, _ = self._prepare_paths()
 52 |         prepared_dev_path, _ = self._prepare_paths()
 53 | 
 54 |         # prepare data
 55 |         fid_utils.prepare_data(self.config, train_input_turns, prepared_train_path, train=True)
 56 |         fid_utils.prepare_data(self.config, dev_input_turns, prepared_dev_path, train=False)
 57 | 
 58 |         # free up memory
 59 |         del dev_input_turns
 60 |         del train_input_turns
 61 |         del dev_data
 62 |         del train_data
 63 | 
 64 |         # train
 65 |         self._train(prepared_train_path, prepared_dev_path, sources_string)
 66 | 
 67 |     def inference_on_turns(self, input_turns):
 68 |         """Run HA on given turns."""
 69 |         # paths
 70 |         prepared_input_path, res_name = self._prepare_paths()
 71 | 
 72 |         # prepare data
 73 |         fid_utils.prepare_data(self.config, input_turns, prepared_input_path)
 74 | 
 75 |         # inference
 76 |         self._inference(res_name, prepared_input_path)
 77 | 
 78 |         # parse result
 79 |         path_to_result = f"{self.path_to_fid}/tmp_output_data/{res_name}/final_output.txt"
 80 |         generated_answers = self._parse_result(path_to_result)
 81 | 
 82 |         # add predicted answers to turns
 83 |         for turn in input_turns:
 84 |             self._postprocess_turn(turn, generated_answers)
 85 |         return input_turns
 86 | 
 87 |     def inference_on_turn(self, turn):
 88 |         """Run HA on a single turn."""
 89 |         # paths
 90 |         prepared_input_path, res_name = self._prepare_paths()
 91 | 
 92 |         # prepare data
 93 |         fid_utils.prepare_turn(self.config, turn, prepared_input_path, train=False)
 94 | 
 95 |         # inference
 96 |         self._inference(res_name, prepared_input_path)
 97 | 
 98 |         # parse result
 99 |         path_to_result = f"{self.path_to_fid}/tmp_output_data/{res_name}/final_output.txt"
100 |         generated_answers = self._parse_result(path_to_result)
101 | 
102 |         # add predicted answers to turns
103 |         self._postprocess_turn(turn, generated_answers)
104 |         return turn
105 | 
106 |     def _train(self, prepared_train_path, prepared_dev_path, sources_string):
107 |         benchmark = self.config["benchmark"]
108 |         method_name = self.config["name"]
109 |         qu = self.config["qu"]
110 |         name = f"{method_name}_{sources_string}"
111 |         COMMAND = [self.path_to_fid_python_env, f"{self.path_to_fid}/train_reader.py"]
112 |         COMMAND += ["--name", name]
113 |         COMMAND += ["--checkpoint_dir", f"_data/{benchmark}/{method_name}/fid"]
114 |         COMMAND += ["--train_data", prepared_train_path]
115 |         COMMAND += ["--eval_data", prepared_dev_path]
116 |         COMMAND += ["--model_size", "base"]
117 |         COMMAND += ["--lr", str(self.config["fid_lr"])]
118 |         COMMAND += ["--optim", str(self.config["fid_optim"])]
119 |         COMMAND += ["--scheduler", str(self.config["fid_scheduler"])]
120 |         COMMAND += ["--weight_decay", str(self.config["fid_weight_decay"])]
121 |         COMMAND += ["--text_maxlength", str(self.config["fid_text_maxlength"])]
122 |         COMMAND += ["--answer_maxlength", str(self.config["fid_answer_maxlength"])]
123 |         COMMAND += ["--per_gpu_batch_size", str(self.config["fid_per_gpu_batch_size"])]
124 |         COMMAND += ["--n_context", str(self.config["fid_max_evidences"])]
125 |         COMMAND += ["--total_step", str(self.config["fid_total_step"])]
126 |         COMMAND += ["--warmup_step", str(self.config["fid_warmup_step"])]
127 |         process = Popen(COMMAND, stdout=sys.stdout, stderr=sys.stderr)
128 |         process.communicate()
129 | 
130 |     def _inference(self, res_name, prepared_input_path):
131 |         """Run inference on a given question (or SR), and a set of evidences."""
132 |         COMMAND = [self.path_to_fid_python_env, f"{self.path_to_fid}/test_reader.py"]
133 |         COMMAND += ["--name", res_name]
134 |         COMMAND += ["--model_path", self.config["fid_model_path"]]
135 |         COMMAND += ["--checkpoint_dir", f"{self.path_to_fid}/tmp_output_data"]
136 |         COMMAND += ["--eval_data", prepared_input_path]
137 |         COMMAND += ["--n_context", str(self.config["fid_max_evidences"])]
138 |         COMMAND += ["--per_gpu_batch_size", str(self.config["fid_per_gpu_batch_size"])]
139 |         COMMAND += ["--write_results"]
140 |         process = Popen(COMMAND, stdout=sys.stdout, stderr=sys.stderr)
141 |         process.communicate()
142 | 
143 |     def _postprocess_turn(self, turn, generated_answers):
144 |         ques_id = turn["question_id"]
145 |         generated_answer = generated_answers.get(ques_id)
146 |         turn["generated_answer"] = generated_answer
147 | 
148 |         # get ranked answers
149 |         ranked_answers = evaluation.get_ranked_answers(
150 |             self.config, generated_answer, turn
151 |         )
152 |         try:
153 |             turn["pred_answers"] = [
154 |                 {
155 |                     "id": ans["answer"]["id"],
156 |                     "label": ans["answer"]["label"],
157 |                     "rank": ans["rank"],
158 |                     "score": ans["score"],
159 |                 }
160 |                 for ans in ranked_answers
161 |             ]
162 |         except:
163 |             print(f"Fail with: {ranked_answers}")
164 |             turn["pred_answers"] = [
165 |                 {"id": ans["answer"]["id"], "label": ans["answer"]["label"], "rank": ans["rank"]}
166 |                 for ans in ranked_answers
167 |             ]
168 |         # eval
169 |         p_at_1 = evaluation.precision_at_1(ranked_answers, turn["answers"])
170 |         turn["p_at_1"] = p_at_1
171 |         mrr = evaluation.mrr_score(ranked_answers, turn["answers"])
172 |         turn["mrr"] = mrr
173 |         h_at_5 = evaluation.hit_at_5(ranked_answers, turn["answers"])
174 |         turn["h_at_5"] = h_at_5
175 | 
176 |         # delete noise
177 |         if turn.get("top_evidences"):
178 |             del turn["top_evidences"]
179 |         if turn.get("question_entities"):
180 |             del turn["question_entities"]
181 |         if turn.get("silver_SR"):
182 |             del turn["silver_SR"]
183 |         if turn.get("silver_relevant_turns"):
184 |             del turn["silver_relevant_turns"]
185 |         if turn.get("silver_answering_evidences"):
186 |             del turn["silver_answering_evidences"]
187 | 
188 |     def _parse_result(self, path_to_result):
189 |         """
190 |         Parse the output generated by FiD, and add predicted
191 |         (and generated) answers to the data.
192 |         """
193 |         # get answers from output file
194 |         generated_answers = dict()
195 |         with open(path_to_result, "r") as fp:
196 |             line = fp.readline()
197 |             while line:
198 |                 try:
199 |                     ques_id, answer = line.split(None, 1)
200 |                 except:
201 |                     ques_id = line.strip()
202 |                     answer = ""
203 |                 ques_id = ques_id.strip()
204 |                 answer = answer.strip()
205 |                 generated_answers[ques_id] = answer
206 |                 line = fp.readline()
207 |         return generated_answers
208 | 
209 |     def _prepare_paths(self):
210 |         """ Prepare random path for handling input/output with piped FiD process. """
211 |         random_num = str(random.randint(0, 10000))
212 |         prepared_input_path = f"{self.path_to_fid}/tmp_input_data/data_{random_num}.jsonl"
213 |         res_name = f"output_{random_num}"
214 |         return prepared_input_path, res_name
215 | 
216 |     def _initialize_conda_dir(self):
217 |         """ Code to automatically detect and set the path to the FiD environment."""
218 |         conda_dir = os.environ.get("CONDA_PREFIX", None)
219 |         if not conda_dir:
220 |             raise Exception("Something went wrong! Tried accessing the value of the CONDA_PREFIX variable, but failed. Please make sure that you have a valid conda installation.")
221 |         
222 |         # in case some environment is activated (which should be `convinse`), move one dir upwards
223 |         if "envs" in conda_dir:
224 |             conda_dir = os.path.dirname(conda_dir)
225 |         else:
226 |             conda_dir = os.path.join(conda_dir, "envs")
227 |         self.path_to_fid_python_env = os.path.join(conda_dir, "fid", "bin", "python")
228 | 
229 | 
230 | #######################################################################################################################
231 | #######################################################################################################################
232 | if __name__ == "__main__":
233 |     if len(sys.argv) < 2:
234 |         raise Exception(
235 |             "python convinse/heterogeneous_answering/fid_module/fid_module.py --<FUNCTION> <PATH_TO_CONFIG>"
236 |         )
237 | 
238 |     function = sys.argv[1]
239 |     config_path = sys.argv[2]
240 |     config = get_config(config_path)
241 | 
242 |     if function == "--train":
243 |         # set paths
244 |         qu = config["qu"]
245 |         ers = config["ers"]
246 |         input_dir = config["path_to_intermediate_results"]
247 |         data_sources_str = config["fid_train_sources"]
248 |         path = os.path.join(input_dir, qu, ers, data_sources_str)
249 |         train_path = os.path.join(path, "train_ers.jsonl")
250 |         dev_path = os.path.join(path, "dev_ers.jsonl")
251 | 
252 |         # train
253 |         fid = FiDModule(config)
254 |         fid.train(train_path, dev_path)
255 | 
256 |     elif function == "--example":
257 |         # set paths
258 |         qu = config["qu"]
259 |         ers = config["ers"]
260 |         input_dir = config["path_to_intermediate_results"]
261 |         data_sources_str = "kb_text_table_info"
262 |         path = os.path.join(input_dir, qu, ers, data_sources_str)
263 |         input_path = os.path.join(path, "dev_ers.jsonl")
264 | 
265 |         with open(input_path, "r") as fp:
266 |             line = fp.readline()
267 |             conv = json.loads(line)
268 |         turn = conv["questions"][0]
269 | 
270 |         # run inference on example
271 |         fid = FiDModule(config)
272 | 
273 |         start = time.time()
274 |         res = fid.inference_on_turn(turn)
275 |         print(res)
276 |         print(f"Spent {time.time()-start} seconds!")
277 | 


--------------------------------------------------------------------------------