├── README.md
├── data
    ├── result.json
    ├── s56075423.txt
    ├── s58951365.txt
    └── s59358936.txt
├── dygie
    ├── __init__.py
    ├── data
    │   ├── __init__.py
    │   ├── dataset_readers
    │   │   ├── document.py
    │   │   └── dygie.py
    │   └── fields
    │   │   └── adjacency_field_assym.py
    ├── models
    │   ├── __init__.py
    │   ├── coref.py
    │   ├── dygie.py
    │   ├── entity_beam_pruner.py
    │   ├── events.py
    │   ├── ner.py
    │   ├── relation.py
    │   └── shared.py
    ├── predictors
    │   ├── __init__.py
    │   └── dygie.py
    ├── pytest.ini
    ├── spacy_interface
    │   ├── __init__.py
    │   └── spacy_interface.py
    ├── tests
    │   ├── data
    │   │   ├── __init__.py
    │   │   ├── annotated_doc_test.py
    │   │   ├── collate_test.py
    │   │   ├── document_test.py
    │   │   ├── dygie_test.py
    │   │   └── spacy_interface_test.py
    │   ├── fixtures
    │   │   ├── ace_event_article.json
    │   │   ├── ace_event_coref_article.json
    │   │   ├── collate
    │   │   │   ├── ace-event
    │   │   │   │   ├── dev.json
    │   │   │   │   ├── test.json
    │   │   │   │   └── train.json
    │   │   │   └── scierc
    │   │   │   │   ├── dev.json
    │   │   │   │   ├── test.json
    │   │   │   │   └── train.json
    │   │   ├── dygie_test.jsonnet
    │   │   ├── dygie_test_full.jsonnet
    │   │   ├── multi_dataset
    │   │   │   ├── dev.jsonl
    │   │   │   ├── test.jsonl
    │   │   │   └── train.jsonl
    │   │   └── scierc_article.json
    │   └── models
    │   │   ├── __init__.py
    │   │   ├── coref_test.py
    │   │   ├── dygie_test.py
    │   │   ├── multi_dataset_test.sh
    │   │   └── relation_test.py
    └── training
    │   ├── event_metrics.py
    │   ├── f1.py
    │   ├── ner_metrics.py
    │   └── relation_metrics.py
├── models
    ├── README.txt
    └── inference.py
├── temp_dygie_input.json
├── temp_dygie_output.json
└── temp_file_list.json


/README.md:
--------------------------------------------------------------------------------
1 | # RadGraph
2 | RadGraph: Extracting Clinical Entities and Relations from Radiology Reports
3 | 


--------------------------------------------------------------------------------
/data/result.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/data/result.json


--------------------------------------------------------------------------------
/data/s56075423.txt:
--------------------------------------------------------------------------------
 1 |                                  FINAL REPORT
 2 |  HISTORY:  Intubated for overdose.  
 3 |  
 4 |  COMPARISON:  None. 
 5 |  
 6 |  TECHNIQUE:  Supine AP view of the chest.  
 7 |  
 8 |  FINDINGS:
 9 |  
10 |  Endotracheal tube terminates approximately 4.4 cm from the carina, in standard
11 |  position.  Nasogastric tube tip is within the stomach, as is the side port. 
12 |  Cardiac, mediastinal and hilar contours are normal.  Lungs are clear and the
13 |  pulmonary vascularity is normal.  No pleural effusion or pneumothorax is
14 |  present.  No acute osseous abnormalities are seen.  
15 |  
16 |  IMPRESSION:
17 |  
18 |  Standard positioning of the endotracheal tube and nasogastric tube.  No acute
19 |  cardiopulmonary process.
20 | 


--------------------------------------------------------------------------------
/data/s58951365.txt:
--------------------------------------------------------------------------------
 1 |                                  FINAL REPORT
 2 |  EXAMINATION:
 3 |  Chest:  Frontal and lateral views
 4 |  
 5 |  INDICATION:  History: ___F with cough  // Pneumonia
 6 |  
 7 |  TECHNIQUE:  Chest:  Frontal and Lateral
 8 |  
 9 |  COMPARISON:  None.
10 |  
11 |  FINDINGS: 
12 |  
13 |  The lungs are clear without focal consolidation.  No pleural effusion or
14 |  pneumothorax is seen. The cardiac and mediastinal silhouettes are
15 |  unremarkable.
16 |  
17 |  IMPRESSION: 
18 |  
19 |  No acute cardiopulmonary process.
20 | 


--------------------------------------------------------------------------------
/data/s59358936.txt:
--------------------------------------------------------------------------------
 1 |                                  FINAL REPORT
 2 |  EXAMINATION:  CHEST (AP AND LAT)
 3 |  
 4 |  INDICATION:  History: ___M with subacute CVA seen on MRI
 5 |  
 6 |  TECHNIQUE:  Upright AP and lateral views of the chest
 7 |  
 8 |  COMPARISON:  None.
 9 |  
10 |  FINDINGS: 
11 |  
12 |  Heart size is normal.  The aorta is tortuous.  The pulmonary vasculature and
13 |  hilar contours are normal.  Lungs are hyperinflated but clear. No focal
14 |  consolidation, pleural effusion or pneumothorax is present.  No acute osseous
15 |  abnormality is identified.
16 |  
17 |  IMPRESSION: 
18 |  
19 |  No acute cardiopulmonary abnormality.
20 | 


--------------------------------------------------------------------------------
/dygie/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/dygie/__init__.py


--------------------------------------------------------------------------------
/dygie/data/__init__.py:
--------------------------------------------------------------------------------
1 | from dygie.data.dataset_readers.dygie import DyGIEReader
2 | from dygie.data.dataset_readers.document import Document
3 | 


--------------------------------------------------------------------------------
/dygie/data/dataset_readers/dygie.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from typing import Any, Dict, List, Optional, Tuple, DefaultDict, Set, Union
  3 | import json
  4 | import pickle as pkl
  5 | import warnings
  6 | 
  7 | from overrides import overrides
  8 | 
  9 | from allennlp.common.file_utils import cached_path
 10 | from allennlp.data.dataset_readers.dataset_reader import DatasetReader
 11 | from allennlp.data.fields import (ListField, TextField, SpanField, MetadataField,
 12 |                                   SequenceLabelField, AdjacencyField, LabelField)
 13 | from allennlp.data.instance import Instance
 14 | from allennlp.data.tokenizers import Token
 15 | from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
 16 | from allennlp.data.dataset_readers.dataset_utils import enumerate_spans
 17 | 
 18 | from dygie.data.fields.adjacency_field_assym import AdjacencyFieldAssym
 19 | from dygie.data.dataset_readers.document import Document, Sentence
 20 | 
 21 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 22 | 
 23 | 
 24 | class DyGIEDataException(Exception):
 25 |     pass
 26 | 
 27 | 
 28 | @DatasetReader.register("dygie")
 29 | class DyGIEReader(DatasetReader):
 30 |     """
 31 |     Reads a single JSON-formatted file. This is the same file format as used in the
 32 |     scierc, but is preprocessed
 33 |     """
 34 |     def __init__(self,
 35 |                  max_span_width: int,
 36 |                  token_indexers: Dict[str, TokenIndexer] = None,
 37 |                  **kwargs) -> None:
 38 |         super().__init__(**kwargs)
 39 |         self._max_span_width = max_span_width
 40 |         self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
 41 | 
 42 |     @overrides
 43 |     def _read(self, file_path: str):
 44 |         # if `file_path` is a URL, redirect to the cache
 45 |         file_path = cached_path(file_path)
 46 | 
 47 |         with open(file_path, "r") as f:
 48 |             lines = f.readlines()
 49 | 
 50 |         for line in lines:
 51 |             # Loop over the documents.
 52 |             doc_text = json.loads(line)
 53 |             instance = self.text_to_instance(doc_text)
 54 |             yield instance
 55 | 
 56 |     def _too_long(self, span):
 57 |         return span[1] - span[0] + 1 > self._max_span_width
 58 | 
 59 |     def _process_ner(self, span_tuples, sent):
 60 |         ner_labels = [""] * len(span_tuples)
 61 | 
 62 |         for span, label in sent.ner_dict.items():
 63 |             if self._too_long(span):
 64 |                 continue
 65 |             ix = span_tuples.index(span)
 66 |             ner_labels[ix] = label
 67 | 
 68 |         return ner_labels
 69 | 
 70 |     def _process_coref(self, span_tuples, sent):
 71 |         coref_labels = [-1] * len(span_tuples)
 72 | 
 73 |         for span, label in sent.cluster_dict.items():
 74 |             if self._too_long(span):
 75 |                 continue
 76 |             ix = span_tuples.index(span)
 77 |             coref_labels[ix] = label
 78 |         return coref_labels
 79 | 
 80 |     def _process_relations(self, span_tuples, sent):
 81 |         relations = []
 82 |         relation_indices = []
 83 | 
 84 |         # Loop over the gold spans. Look up their indices in the list of span tuples and store
 85 |         # values.
 86 |         for (span1, span2), label in sent.relation_dict.items():
 87 |             # If either span is beyond the max span width, skip it.
 88 |             if self._too_long(span1) or self._too_long(span2):
 89 |                 continue
 90 |             ix1 = span_tuples.index(span1)
 91 |             ix2 = span_tuples.index(span2)
 92 |             relation_indices.append((ix1, ix2))
 93 |             relations.append(label)
 94 | 
 95 |         return relations, relation_indices
 96 | 
 97 |     def _process_events(self, span_tuples, sent):
 98 |         n_tokens = len(sent.text)
 99 | 
100 |         trigger_labels = [""] * n_tokens
101 |         for tok_ix, trig_label in sent.events.trigger_dict.items():
102 |             trigger_labels[tok_ix] = trig_label
103 | 
104 |         arguments = []
105 |         argument_indices = []
106 | 
107 |         for (trig_ix, arg_span), arg_label in sent.events.argument_dict.items():
108 |             if self._too_long(arg_span):
109 |                 continue
110 |             arg_span_ix = span_tuples.index(arg_span)
111 |             argument_indices.append((trig_ix, arg_span_ix))
112 |             arguments.append(arg_label)
113 | 
114 |         return trigger_labels, arguments, argument_indices
115 | 
116 |     def _process_sentence(self, sent: Sentence, dataset: str):
117 |         # Get the sentence text and define the `text_field`.
118 |         sentence_text = [self._normalize_word(word) for word in sent.text]
119 |         text_field = TextField([Token(word) for word in sentence_text], self._token_indexers)
120 | 
121 |         # Enumerate spans.
122 |         spans = []
123 |         for start, end in enumerate_spans(sentence_text, max_span_width=self._max_span_width):
124 |             spans.append(SpanField(start, end, text_field))
125 |         span_field = ListField(spans)
126 |         span_tuples = [(span.span_start, span.span_end) for span in spans]
127 | 
128 |         # Convert data to fields.
129 |         # NOTE: The `ner_labels` and `coref_labels` would ideally have type
130 |         # `ListField[SequenceLabelField]`, where the sequence labels are over the `SpanField` of
131 |         # `spans`. But calling `as_tensor_dict()` fails on this specific data type. Matt G
132 |         # recognized that this is an AllenNLP API issue and suggested that represent these as
133 |         # `ListField[ListField[LabelField]]` instead.
134 |         fields = {}
135 |         fields["text"] = text_field
136 |         fields["spans"] = span_field
137 |         if sent.ner is not None:
138 |             ner_labels = self._process_ner(span_tuples, sent)
139 |             fields["ner_labels"] = ListField(
140 |                 [LabelField(entry, label_namespace=f"{dataset}__ner_labels")
141 |                  for entry in ner_labels])
142 |         if sent.cluster_dict is not None:
143 |             # Skip indexing for coref labels, which are ints.
144 |             coref_labels = self._process_coref(span_tuples, sent)
145 |             fields["coref_labels"] = ListField(
146 |                 [LabelField(entry, label_namespace="coref_labels", skip_indexing=True)
147 |                  for entry in coref_labels])
148 |         if sent.relations is not None:
149 |             relation_labels, relation_indices = self._process_relations(span_tuples, sent)
150 |             fields["relation_labels"] = AdjacencyField(
151 |                 indices=relation_indices, sequence_field=span_field, labels=relation_labels,
152 |                 label_namespace=f"{dataset}__relation_labels")
153 |         if sent.events is not None:
154 |             trigger_labels, argument_labels, argument_indices = self._process_events(span_tuples, sent)
155 |             fields["trigger_labels"] = SequenceLabelField(
156 |                 trigger_labels, text_field, label_namespace=f"{dataset}__trigger_labels")
157 |             fields["argument_labels"] = AdjacencyFieldAssym(
158 |                 indices=argument_indices, row_field=text_field, col_field=span_field,
159 |                 labels=argument_labels, label_namespace=f"{dataset}__argument_labels")
160 | 
161 |         return fields
162 | 
163 |     def _process_sentence_fields(self, doc: Document):
164 |         # Process each sentence.
165 |         sentence_fields = [self._process_sentence(sent, doc.dataset) for sent in doc.sentences]
166 | 
167 |         # Make sure that all sentences have the same set of keys.
168 |         first_keys = set(sentence_fields[0].keys())
169 |         for entry in sentence_fields:
170 |             if set(entry.keys()) != first_keys:
171 |                 raise DyGIEDataException(
172 |                     f"Keys do not match across sentences for document {doc.doc_key}.")
173 | 
174 |         # For each field, store the data from all sentences together in a ListField.
175 |         fields = {}
176 |         keys = sentence_fields[0].keys()
177 |         for key in keys:
178 |             this_field = ListField([sent[key] for sent in sentence_fields])
179 |             fields[key] = this_field
180 | 
181 |         return fields
182 | 
183 |     @overrides
184 |     def text_to_instance(self, doc_text: Dict[str, Any]):
185 |         """
186 |         Convert a Document object into an instance.
187 |         """
188 |         doc = Document.from_json(doc_text)
189 | 
190 |         # Make sure there are no single-token sentences; these break things.
191 |         sent_lengths = [len(x) for x in doc.sentences]
192 |         if min(sent_lengths) < 2:
193 |             msg = (f"Document {doc.doc_key} has a sentence with a single token or no tokens. "
194 |                    "This may break the modeling code.")
195 |             warnings.warn(msg)
196 | 
197 |         fields = self._process_sentence_fields(doc)
198 |         fields["metadata"] = MetadataField(doc)
199 | 
200 |         return Instance(fields)
201 | 
202 |     @overrides
203 |     def _instances_from_cache_file(self, cache_filename):
204 |         with open(cache_filename, "rb") as f:
205 |             for entry in pkl.load(f):
206 |                 yield entry
207 | 
208 |     @overrides
209 |     def _instances_to_cache_file(self, cache_filename, instances):
210 |         with open(cache_filename, "wb") as f:
211 |             pkl.dump(instances, f, protocol=pkl.HIGHEST_PROTOCOL)
212 | 
213 |     @staticmethod
214 |     def _normalize_word(word):
215 |         if word == "/." or word == "/?":
216 |             return word[1:]
217 |         else:
218 |             return word
219 | 


--------------------------------------------------------------------------------
/dygie/data/fields/adjacency_field_assym.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, List, Set, Tuple
  2 | import logging
  3 | import textwrap
  4 | 
  5 | from overrides import overrides
  6 | import torch
  7 | 
  8 | from allennlp.common.checks import ConfigurationError
  9 | from allennlp.data.fields.field import Field
 10 | from allennlp.data.fields.sequence_field import SequenceField
 11 | from allennlp.data.vocabulary import Vocabulary
 12 | 
 13 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 14 | 
 15 | 
 16 | class AdjacencyFieldAssym(Field[torch.Tensor]):
 17 |     """
 18 |     There are cases where we need to express adjacency relations between elements in two different
 19 |     fields - for instance a TextField and a SpanField. This implements an "asymmetric" adjacency field.
 20 | 
 21 |     Parameters
 22 |     ----------
 23 |     indices : ``List[Tuple[int, int]]``
 24 |     row_field : ``SequenceField``
 25 |         The field with the sequence that the rows of `indices` index into.
 26 |     col_field : ``SequenceField``
 27 |         The field with the sequence that the columns of `indices` index into.
 28 |     labels : ``List[str]``, optional, default = None
 29 |         Optional labels for the edges of the adjacency matrix.
 30 |     label_namespace : ``str``, optional (default='labels')
 31 |         The namespace to use for converting tag strings into integers.  We convert tag strings to
 32 |         integers for you, and this parameter tells the ``Vocabulary`` object which mapping from
 33 |         strings to integers to use (so that "O" as a tag doesn't get the same id as "O" as a word).
 34 |     padding_value : ``int``, (optional, default = -1)
 35 |         The value to use as padding.
 36 |     """
 37 |     # It is possible that users want to use this field with a namespace which uses OOV/PAD tokens.
 38 |     # This warning will be repeated for every instantiation of this class (i.e for every data
 39 |     # instance), spewing a lot of warnings so this class variable is used to only log a single
 40 |     # warning per namespace.
 41 |     _already_warned_namespaces: Set[str] = set()
 42 | 
 43 |     def __init__(self,
 44 |                  indices: List[Tuple[int, int]],
 45 |                  row_field: SequenceField,
 46 |                  col_field: SequenceField,
 47 |                  labels: List[str] = None,
 48 |                  label_namespace: str = 'labels',
 49 |                  padding_value: int = -1) -> None:
 50 |         self.indices = indices
 51 |         self.labels = labels
 52 |         self.row_field = row_field
 53 |         self.col_field = col_field
 54 |         self._label_namespace = label_namespace
 55 |         self._padding_value = padding_value
 56 |         self._indexed_labels: List[int] = None
 57 | 
 58 |         self._maybe_warn_for_namespace(label_namespace)
 59 |         row_length = row_field.sequence_length()
 60 |         col_length = col_field.sequence_length()
 61 | 
 62 |         if len(set(indices)) != len(indices):
 63 |             raise ConfigurationError(f"Indices must be unique, but found {indices}")
 64 | 
 65 |         if not all([0 <= index[1] < col_length and 0 <= index[0] < row_length for index in indices]):
 66 |             raise ConfigurationError(f"Label indices and sequence length "
 67 |                                      f"are incompatible: {indices} and {row_length} or {col_length}")
 68 | 
 69 |         if labels is not None and len(indices) != len(labels):
 70 |             raise ConfigurationError(f"Labelled indices were passed, but their lengths do not match: "
 71 |                                      f" {labels}, {indices}")
 72 | 
 73 |     def _maybe_warn_for_namespace(self, label_namespace: str) -> None:
 74 |         if not (self._label_namespace.endswith("labels") or self._label_namespace.endswith("tags")):
 75 |             if label_namespace not in self._already_warned_namespaces:
 76 |                 logger.warning("Your label namespace was '%s'. We recommend you use a namespace "
 77 |                                "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by "
 78 |                                "default to your vocabulary.  See documentation for "
 79 |                                "`non_padded_namespaces` parameter in Vocabulary.",
 80 |                                self._label_namespace)
 81 |                 self._already_warned_namespaces.add(label_namespace)
 82 | 
 83 |     @overrides
 84 |     def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
 85 |         if self._indexed_labels is None and self.labels is not None:
 86 |             for label in self.labels:
 87 |                 counter[self._label_namespace][label] += 1  # type: ignore
 88 | 
 89 |     @overrides
 90 |     def index(self, vocab: Vocabulary):
 91 |         if self._indexed_labels is None and self.labels is not None:
 92 |             self._indexed_labels = [vocab.get_token_index(label, self._label_namespace)
 93 |                                     for label in self.labels]
 94 | 
 95 |     @overrides
 96 |     def get_padding_lengths(self) -> Dict[str, int]:
 97 |         return {'num_rows': self.row_field.sequence_length(),
 98 |                 'num_cols': self.col_field.sequence_length()}
 99 | 
100 |     @overrides
101 |     def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
102 |         desired_num_rows = padding_lengths['num_rows']
103 |         desired_num_cols = padding_lengths['num_cols']
104 |         tensor = torch.ones(desired_num_rows, desired_num_cols) * self._padding_value
105 |         labels = self._indexed_labels or [1 for _  in range(len(self.indices))]
106 | 
107 |         for index, label in zip(self.indices, labels):
108 |             tensor[index] = label
109 |         return tensor
110 | 
111 |     @overrides
112 |     def empty_field(self) -> 'AdjacencyFieldAssym':
113 |         # pylint: disable=protected-access
114 |         # The empty_list here is needed for mypy
115 |         empty_list: List[Tuple[int, int]] = []
116 |         adjacency_field = AdjacencyFieldAssym(empty_list,
117 |                                               self.row_field.empty_field(),
118 |                                               self.col_field.empty_field(),
119 |                                               padding_value=self._padding_value)
120 |         return adjacency_field
121 | 
122 |     def __str__(self) -> str:
123 |         row_length = self.row_field.sequence_length()
124 |         col_length = self.col_field.sequence_length()
125 |         formatted_labels = "".join(["\t\t" + labels + "\n"
126 |                                     for labels in textwrap.wrap(repr(self.labels), 100)])
127 |         formatted_indices = "".join(["\t\t" + index + "\n"
128 |                                      for index in textwrap.wrap(repr(self.indices), 100)])
129 |         return f"AdjacencyFieldAssym of row length {row_length} and col length {col_length}\n" \
130 |                f"\t\twith indices:\n {formatted_indices}\n" \
131 |                f"\t\tand labels:\n {formatted_labels} \t\tin namespace: '{self._label_namespace}'."
132 | 


--------------------------------------------------------------------------------
/dygie/models/__init__.py:
--------------------------------------------------------------------------------
1 | from dygie.models.dygie import DyGIE
2 | 


--------------------------------------------------------------------------------
/dygie/models/entity_beam_pruner.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This is basically a copy of AllenNLP's Pruner module, but with support for entity beams.
  3 | """
  4 | 
  5 | from typing import Tuple, Union
  6 | 
  7 | from overrides import overrides
  8 | import torch
  9 | 
 10 | from allennlp.nn import util
 11 | from allennlp.modules import TimeDistributed
 12 | 
 13 | 
 14 | def make_pruner(scorer, entity_beam=False, gold_beam=False):
 15 |     """
 16 |     Create a pruner that either takes outputs of other scorers (i.e. entity beam), or uses its own
 17 |     scorer (the `default_scorer`).
 18 |     """
 19 |     item_scorer = torch.nn.Sequential(
 20 |         TimeDistributed(scorer),
 21 |         TimeDistributed(torch.nn.Linear(scorer.get_output_dim(), 1)))
 22 |     min_score_to_keep = 1e-10 if entity_beam else None
 23 | 
 24 |     return Pruner(item_scorer, entity_beam, gold_beam, min_score_to_keep)
 25 | 
 26 | 
 27 | class Pruner(torch.nn.Module):
 28 |     """
 29 |     This module scores and prunes items in a list using a parameterised scoring function and a
 30 |     threshold.
 31 | 
 32 |     Parameters
 33 |     ----------
 34 |     scorer : ``torch.nn.Module``, required.
 35 |         A module which, given a tensor of shape (batch_size, num_items, embedding_size),
 36 |         produces a tensor of shape (batch_size, num_items, 1), representing a scalar score
 37 |         per item in the tensor.
 38 |     entity_beam: bool, optional.
 39 |         If True, use class scores output from another module instead of using own scorer.
 40 |     gold_beam: bool, optional.
 41 |        If True, use gold arguments.
 42 |     min_score_to_keep : float, optional.
 43 |         If given, only keep items that score at least this high.
 44 |     """
 45 |     def __init__(self, scorer: torch.nn.Module, entity_beam: bool = False, gold_beam: bool = False,
 46 |                  min_score_to_keep: float = None) -> None:
 47 |         super().__init__()
 48 |         # If gold beam is on, then entity beam must be off and min_score_to_keep must be None.
 49 |         assert not (gold_beam and ((min_score_to_keep is not None) or entity_beam))
 50 |         self._scorer = scorer
 51 |         self._entity_beam = entity_beam
 52 |         self._gold_beam = gold_beam
 53 |         self._min_score_to_keep = min_score_to_keep
 54 | 
 55 |     @overrides
 56 |     def forward(self, # pylint: disable=arguments-differ
 57 |                 embeddings: torch.FloatTensor,
 58 |                 mask: torch.LongTensor,
 59 |                 num_items_to_keep: Union[int, torch.LongTensor],
 60 |                 class_scores: torch.FloatTensor = None,
 61 |                 gold_labels: torch.long = None) -> Tuple[torch.FloatTensor, torch.LongTensor,
 62 |                                                          torch.LongTensor, torch.FloatTensor]:
 63 |         """
 64 |         Extracts the top-k scoring items with respect to the scorer. We additionally return
 65 |         the indices of the top-k in their original order, not ordered by score, so that downstream
 66 |         components can rely on the original ordering (e.g., for knowing what spans are valid
 67 |         antecedents in a coreference resolution model). May use the same k for all sentences in
 68 |         minibatch, or different k for each.
 69 | 
 70 |         Parameters
 71 |         ----------
 72 |         embeddings : ``torch.FloatTensor``, required.
 73 |             A tensor of shape (batch_size, num_items, embedding_size), containing an embedding for
 74 |             each item in the list that we want to prune.
 75 |         mask : ``torch.LongTensor``, required.
 76 |             A tensor of shape (batch_size, num_items), denoting unpadded elements of
 77 |             ``embeddings``.
 78 |         num_items_to_keep : ``Union[int, torch.LongTensor]``, required.
 79 |             If a tensor of shape (batch_size), specifies the number of items to keep for each
 80 |             individual sentence in minibatch.
 81 |             If an int, keep the same number of items for all sentences.
 82 |         class_scores:
 83 |            Class scores to be used with entity beam.
 84 |         candidate_labels: If in debugging mode, use gold labels to get beam.
 85 | 
 86 |         Returns
 87 |         -------
 88 |         top_embeddings : ``torch.FloatTensor``
 89 |             The representations of the top-k scoring items.
 90 |             Has shape (batch_size, max_num_items_to_keep, embedding_size).
 91 |         top_mask : ``torch.LongTensor``
 92 |             The corresponding mask for ``top_embeddings``.
 93 |             Has shape (batch_size, max_num_items_to_keep).
 94 |         top_indices : ``torch.IntTensor``
 95 |             The indices of the top-k scoring items into the original ``embeddings``
 96 |             tensor. This is returned because it can be useful to retain pointers to
 97 |             the original items, if each item is being scored by multiple distinct
 98 |             scorers, for instance. Has shape (batch_size, max_num_items_to_keep).
 99 |         top_item_scores : ``torch.FloatTensor``
100 |             The values of the top-k scoring items.
101 |             Has shape (batch_size, max_num_items_to_keep, 1).
102 |         num_items_kept
103 |         """
104 |         # If an int was given for number of items to keep, construct tensor by repeating the value.
105 |         if isinstance(num_items_to_keep, int):
106 |             batch_size = mask.size(0)
107 |             # Put the tensor on same device as the mask.
108 |             num_items_to_keep = num_items_to_keep * torch.ones([batch_size], dtype=torch.long,
109 |                                                                device=mask.device)
110 | 
111 |         mask = mask.unsqueeze(-1)
112 |         num_items = embeddings.size(1)
113 | 
114 |         # Shape: (batch_size, num_items, 1)
115 |         # If entity beam is one, use the class scores. Else ignore them and use the scorer.
116 |         if self._entity_beam:
117 |             scores, _ = class_scores.max(dim=-1)
118 |             scores = scores.unsqueeze(-1)
119 |         # If gold beam is one, give a score of 0 wherever the gold label is non-zero (indicating a
120 |         # non-null label), otherwise give a large negative number.
121 |         elif self._gold_beam:
122 |             scores = torch.where(gold_labels > 0,
123 |                                  torch.zeros_like(gold_labels, dtype=torch.float),
124 |                                  -1e20 * torch.ones_like(gold_labels, dtype=torch.float))
125 |             scores = scores.unsqueeze(-1)
126 |         else:
127 |             scores = self._scorer(embeddings)
128 | 
129 |         # If we're only keeping items that score above a given threshold, change the number of kept
130 |         # items here.
131 |         if self._min_score_to_keep is not None:
132 |             num_good_items = torch.sum(scores > self._min_score_to_keep, dim=1).squeeze()
133 |             num_items_to_keep = torch.min(num_items_to_keep, num_good_items)
134 |         # If gold beam is on, keep the gold items.
135 |         if self._gold_beam:
136 |             num_items_to_keep = torch.sum(gold_labels > 0, dim=1)
137 | 
138 |         # Always keep at least one item to avoid edge case with empty matrix.
139 |         max_items_to_keep = max(num_items_to_keep.max().item(), 1)
140 | 
141 |         if scores.size(-1) != 1 or scores.dim() != 3:
142 |             raise ValueError(f"The scorer passed to Pruner must produce a tensor of shape"
143 |                              f"(batch_size, num_items, 1), but found shape {scores.size()}")
144 |         # Make sure that we don't select any masked items by setting their scores to be very
145 |         # negative.  These are logits, typically, so -1e20 should be plenty negative.
146 |         # NOTE(`mask` needs to be a byte tensor now.)
147 |         scores = util.replace_masked_values(scores, mask.bool(), -1e20)
148 | 
149 |         # Shape: (batch_size, max_num_items_to_keep, 1)
150 |         _, top_indices = scores.topk(max_items_to_keep, 1)
151 | 
152 |         # Mask based on number of items to keep for each sentence.
153 |         # Shape: (batch_size, max_num_items_to_keep)
154 |         top_indices_mask = util.get_mask_from_sequence_lengths(num_items_to_keep, max_items_to_keep)
155 |         top_indices_mask = top_indices_mask.bool()
156 | 
157 |         # Shape: (batch_size, max_num_items_to_keep)
158 |         top_indices = top_indices.squeeze(-1)
159 | 
160 |         # Fill all masked indices with largest "top" index for that sentence, so that all masked
161 |         # indices will be sorted to the end.
162 |         # Shape: (batch_size, 1)
163 |         fill_value, _ = top_indices.max(dim=1)
164 |         fill_value = fill_value.unsqueeze(-1)
165 |         # Shape: (batch_size, max_num_items_to_keep)
166 |         top_indices = torch.where(top_indices_mask, top_indices, fill_value)
167 | 
168 |         # Now we order the selected indices in increasing order with
169 |         # respect to their indices (and hence, with respect to the
170 |         # order they originally appeared in the ``embeddings`` tensor).
171 |         top_indices, _ = torch.sort(top_indices, 1)
172 | 
173 |         # Shape: (batch_size * max_num_items_to_keep)
174 |         # torch.index_select only accepts 1D indices, but here
175 |         # we need to select items for each element in the batch.
176 |         flat_top_indices = util.flatten_and_batch_shift_indices(top_indices, num_items)
177 | 
178 |         # Shape: (batch_size, max_num_items_to_keep, embedding_size)
179 |         top_embeddings = util.batched_index_select(embeddings, top_indices, flat_top_indices)
180 | 
181 |         # Combine the masks on spans that are out-of-bounds, and the mask on spans that are outside
182 |         # the top k for each sentence.
183 |         # Shape: (batch_size, max_num_items_to_keep)
184 |         sequence_mask = util.batched_index_select(mask, top_indices, flat_top_indices)
185 |         sequence_mask = sequence_mask.squeeze(-1).bool()
186 |         top_mask = top_indices_mask & sequence_mask
187 |         top_mask = top_mask.long()
188 | 
189 |         # Shape: (batch_size, max_num_items_to_keep, 1)
190 |         top_scores = util.batched_index_select(scores, top_indices, flat_top_indices)
191 | 
192 |         return top_embeddings, top_mask, top_indices, top_scores, num_items_to_keep
193 | 


--------------------------------------------------------------------------------
/dygie/models/ner.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from typing import Any, Dict, List, Optional, Callable
  3 | 
  4 | import torch
  5 | from torch.nn import functional as F
  6 | from overrides import overrides
  7 | 
  8 | from allennlp.data import Vocabulary
  9 | from allennlp.models.model import Model
 10 | from allennlp.modules import TimeDistributed
 11 | from allennlp.nn import util, InitializerApplicator, RegularizerApplicator
 12 | 
 13 | from dygie.training.ner_metrics import NERMetrics
 14 | from dygie.data.dataset_readers import document
 15 | 
 16 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 17 | 
 18 | 
 19 | class NERTagger(Model):
 20 |     """
 21 |     Named entity recognition module of DyGIE model.
 22 | 
 23 |     Parameters
 24 |     ----------
 25 |     mention_feedforward : ``FeedForward``
 26 |         This feedforward network is applied to the span representations which is then scored
 27 |         by a linear layer.
 28 |     feature_size: ``int``
 29 |         The embedding size for all the embedded features, such as distances or span widths.
 30 |     lexical_dropout: ``int``
 31 |         The probability of dropping out dimensions of the embedded text.
 32 |     regularizer : ``RegularizerApplicator``, optional (default=``None``)
 33 |         If provided, will be used to calculate the regularization penalty during training.
 34 |     """
 35 | 
 36 |     def __init__(self,
 37 |                  vocab: Vocabulary,
 38 |                  make_feedforward: Callable,
 39 |                  span_emb_dim: int,
 40 |                  regularizer: Optional[RegularizerApplicator] = None) -> None:
 41 |         super(NERTagger, self).__init__(vocab, regularizer)
 42 | 
 43 |         self._namespaces = [entry for entry in vocab.get_namespaces() if "ner_labels" in entry]
 44 | 
 45 |         # Number of classes determine the output dimension of the final layer
 46 |         self._n_labels = {name: vocab.get_vocab_size(name) for name in self._namespaces}
 47 | 
 48 |         # Null label is needed to keep track of when calculating the metrics
 49 |         for namespace in self._namespaces:
 50 |             null_label = vocab.get_token_index("", namespace)
 51 |             assert null_label == 0  # If not, the dummy class won't correspond to the null label.
 52 | 
 53 |         # The output dim is 1 less than the number of labels because we don't score the null label;
 54 |         # we just give it a score of 0 by default.
 55 | 
 56 |         # Create a separate scorer and metric for each dataset we're dealing with.
 57 |         self._ner_scorers = torch.nn.ModuleDict()
 58 |         self._ner_metrics = {}
 59 | 
 60 |         for namespace in self._namespaces:
 61 |             mention_feedforward = make_feedforward(input_dim=span_emb_dim)
 62 |             self._ner_scorers[namespace] = torch.nn.Sequential(
 63 |                 TimeDistributed(mention_feedforward),
 64 |                 TimeDistributed(torch.nn.Linear(
 65 |                     mention_feedforward.get_output_dim(),
 66 |                     self._n_labels[namespace] - 1)))
 67 | 
 68 |             self._ner_metrics[namespace] = NERMetrics(self._n_labels[namespace], null_label)
 69 | 
 70 |         self._active_namespace = None
 71 | 
 72 |         self._loss = torch.nn.CrossEntropyLoss(reduction="sum")
 73 | 
 74 |     @overrides
 75 |     def forward(self,  # type: ignore
 76 |                 spans: torch.IntTensor,
 77 |                 span_mask: torch.IntTensor,
 78 |                 span_embeddings: torch.IntTensor,
 79 |                 sentence_lengths: torch.Tensor,
 80 |                 ner_labels: torch.IntTensor = None,
 81 |                 metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
 82 |         """
 83 |         TODO(dwadden) Write documentation.
 84 |         """
 85 | 
 86 |         # Shape: (Batch size, Number of Spans, Span Embedding Size)
 87 |         # span_embeddings
 88 | 
 89 |         self._active_namespace = f"{metadata.dataset}__ner_labels"
 90 |         if self._active_namespace not in self._ner_scorers:
 91 |             return {"loss": 0}
 92 | 
 93 |         scorer = self._ner_scorers[self._active_namespace]
 94 | 
 95 |         ner_scores = scorer(span_embeddings)
 96 |         # Give large negative scores to masked-out elements.
 97 |         mask = span_mask.unsqueeze(-1)
 98 |         ner_scores = util.replace_masked_values(ner_scores, mask.bool(), -1e20)
 99 |         # The dummy_scores are the score for the null label.
100 |         dummy_dims = [ner_scores.size(0), ner_scores.size(1), 1]
101 |         dummy_scores = ner_scores.new_zeros(*dummy_dims)
102 |         ner_scores = torch.cat((dummy_scores, ner_scores), -1)
103 | 
104 |         _, predicted_ner = ner_scores.max(2)
105 | 
106 |         predictions = self.predict(ner_scores.detach().cpu(),
107 |                                    spans.detach().cpu(),
108 |                                    span_mask.detach().cpu(),
109 |                                    metadata)
110 |         output_dict = {"predictions": predictions}
111 | 
112 |         if ner_labels is not None:
113 |             metrics = self._ner_metrics[self._active_namespace]
114 |             metrics(predicted_ner, ner_labels, span_mask)
115 |             ner_scores_flat = ner_scores.view(-1, self._n_labels[self._active_namespace])
116 |             ner_labels_flat = ner_labels.view(-1)
117 |             mask_flat = span_mask.view(-1).bool()
118 | 
119 |             loss = self._loss(ner_scores_flat[mask_flat], ner_labels_flat[mask_flat])
120 | 
121 |             output_dict["loss"] = loss
122 | 
123 |         return output_dict
124 | 
125 |     def predict(self, ner_scores, spans, span_mask, metadata):
126 |         # TODO(dwadden) Make sure the iteration works in documents with a single sentence.
127 |         # Zipping up and iterating iterates over the zeroth dimension of each tensor; this
128 |         # corresponds to iterating over sentences.
129 |         predictions = []
130 |         zipped = zip(ner_scores, spans, span_mask, metadata)
131 |         for ner_scores_sent, spans_sent, span_mask_sent, sentence in zipped:
132 |             predicted_scores_raw, predicted_labels = ner_scores_sent.max(dim=1)
133 |             softmax_scores = F.softmax(ner_scores_sent, dim=1)
134 |             predicted_scores_softmax, _ = softmax_scores.max(dim=1)
135 |             ix = (predicted_labels != 0) & span_mask_sent.bool()
136 | 
137 |             predictions_sent = []
138 |             zip_pred = zip(predicted_labels[ix], predicted_scores_raw[ix],
139 |                            predicted_scores_softmax[ix], spans_sent[ix])
140 |             for label, label_score_raw, label_score_softmax, label_span in zip_pred:
141 |                 label_str = self.vocab.get_token_from_index(label.item(), self._active_namespace)
142 |                 span_start, span_end = label_span.tolist()
143 |                 ner = [span_start, span_end, label_str, label_score_raw.item(),
144 |                        label_score_softmax.item()]
145 |                 prediction = document.PredictedNER(ner, sentence, sentence_offsets=True)
146 |                 predictions_sent.append(prediction)
147 | 
148 |             predictions.append(predictions_sent)
149 | 
150 |         return predictions
151 | 
152 |     # TODO(dwadden) This code is repeated elsewhere. Refactor.
153 |     @overrides
154 |     def get_metrics(self, reset: bool = False) -> Dict[str, float]:
155 |         "Loop over the metrics for all namespaces, and return as dict."
156 |         res = {}
157 |         for namespace, metrics in self._ner_metrics.items():
158 |             precision, recall, f1 = metrics.get_metric(reset)
159 |             prefix = namespace.replace("_labels", "")
160 |             to_update = {f"{prefix}_precision": precision,
161 |                          f"{prefix}_recall": recall,
162 |                          f"{prefix}_f1": f1}
163 |             res.update(to_update)
164 | 
165 |         res_avg = {}
166 |         for name in ["precision", "recall", "f1"]:
167 |             values = [res[key] for key in res if name in key]
168 |             res_avg[f"MEAN__ner_{name}"] = sum(values) / len(values) if values else 0
169 |             res.update(res_avg)
170 | 
171 |         return res
172 | 


--------------------------------------------------------------------------------
/dygie/models/relation.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from typing import Any, Dict, List, Optional, Callable
  3 | 
  4 | import torch
  5 | import torch.nn.functional as F
  6 | from overrides import overrides
  7 | 
  8 | from allennlp.data import Vocabulary
  9 | from allennlp.models.model import Model
 10 | from allennlp.nn import util, RegularizerApplicator
 11 | from allennlp.modules import TimeDistributed
 12 | 
 13 | from dygie.training.relation_metrics import RelationMetrics
 14 | from dygie.models.entity_beam_pruner import Pruner
 15 | from dygie.data.dataset_readers import document
 16 | 
 17 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 18 | 
 19 | 
 20 | # TODO(dwadden) add tensor dimension comments.
 21 | # TODO(dwadden) Different sentences should have different number of relation candidates depending on
 22 | # length.
 23 | class RelationExtractor(Model):
 24 |     """
 25 |     Relation extraction module of DyGIE model.
 26 |     """
 27 |     # TODO(dwadden) add option to make `mention_feedforward` be the NER tagger.
 28 | 
 29 |     def __init__(self,
 30 |                  vocab: Vocabulary,
 31 |                  make_feedforward: Callable,
 32 |                  span_emb_dim: int,
 33 |                  feature_size: int,
 34 |                  spans_per_word: float,
 35 |                  positive_label_weight: float = 1.0,
 36 |                  regularizer: Optional[RegularizerApplicator] = None) -> None:
 37 |         super().__init__(vocab, regularizer)
 38 | 
 39 |         self._namespaces = [entry for entry in vocab.get_namespaces() if "relation_labels" in entry]
 40 |         self._n_labels = {name: vocab.get_vocab_size(name) for name in self._namespaces}
 41 | 
 42 |         self._mention_pruners = torch.nn.ModuleDict()
 43 |         self._relation_feedforwards = torch.nn.ModuleDict()
 44 |         self._relation_scorers = torch.nn.ModuleDict()
 45 |         self._relation_metrics = {}
 46 | 
 47 |         for namespace in self._namespaces:
 48 |             mention_feedforward = make_feedforward(input_dim=span_emb_dim)
 49 |             feedforward_scorer = torch.nn.Sequential(
 50 |                 TimeDistributed(mention_feedforward),
 51 |                 TimeDistributed(torch.nn.Linear(mention_feedforward.get_output_dim(), 1)))
 52 |             self._mention_pruners[namespace] = Pruner(feedforward_scorer)
 53 | 
 54 |             relation_scorer_dim = 3 * span_emb_dim
 55 |             relation_feedforward = make_feedforward(input_dim=relation_scorer_dim)
 56 |             self._relation_feedforwards[namespace] = relation_feedforward
 57 |             relation_scorer = torch.nn.Linear(
 58 |                 relation_feedforward.get_output_dim(), self._n_labels[namespace])
 59 |             self._relation_scorers[namespace] = relation_scorer
 60 | 
 61 |             self._relation_metrics[namespace] = RelationMetrics()
 62 | 
 63 |         self._spans_per_word = spans_per_word
 64 |         self._active_namespace = None
 65 | 
 66 |         self._loss = torch.nn.CrossEntropyLoss(reduction="sum", ignore_index=-1)
 67 | 
 68 |     @overrides
 69 |     def forward(self,  # type: ignore
 70 |                 spans: torch.IntTensor,
 71 |                 span_mask,
 72 |                 span_embeddings,  # TODO(dwadden) add type.
 73 |                 sentence_lengths,
 74 |                 relation_labels: torch.IntTensor = None,
 75 |                 metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
 76 |         """
 77 |         TODO(dwadden) Write documentation.
 78 |         """
 79 |         self._active_namespace = f"{metadata.dataset}__relation_labels"
 80 | 
 81 |         if self._active_namespace not in self._relation_scorers:
 82 |             return {"loss": 0}
 83 | 
 84 |         (top_span_embeddings, top_span_mention_scores,
 85 |          num_spans_to_keep, top_span_mask,
 86 |          top_span_indices, top_spans) = self._prune_spans(
 87 |              spans, span_mask, span_embeddings, sentence_lengths)
 88 | 
 89 |         relation_scores = self._compute_relation_scores(
 90 |             self._compute_span_pair_embeddings(top_span_embeddings), top_span_mention_scores)
 91 | 
 92 |         prediction_dict, predictions = self.predict(top_spans.detach().cpu(),
 93 |                                                     relation_scores.detach().cpu(),
 94 |                                                     num_spans_to_keep.detach().cpu(),
 95 |                                                     metadata)
 96 | 
 97 |         output_dict = {"predictions": predictions}
 98 | 
 99 |         # Evaluate loss and F1 if labels were provided.
100 |         if relation_labels is not None:
101 |             # Compute cross-entropy loss.
102 |             gold_relations = self._get_pruned_gold_relations(
103 |                 relation_labels, top_span_indices, top_span_mask)
104 | 
105 |             cross_entropy = self._get_cross_entropy_loss(relation_scores, gold_relations)
106 | 
107 |             # Compute F1.
108 |             assert len(prediction_dict) == len(metadata)  # Make sure length of predictions is right.
109 |             relation_metrics = self._relation_metrics[self._active_namespace]
110 |             relation_metrics(prediction_dict, metadata)
111 | 
112 |             output_dict["loss"] = cross_entropy
113 |         return output_dict
114 | 
115 |     def _prune_spans(self, spans, span_mask, span_embeddings, sentence_lengths):
116 |         # Prune
117 |         num_spans = spans.size(1)  # Max number of spans for the minibatch.
118 | 
119 |         # Keep different number of spans for each minibatch entry.
120 |         num_spans_to_keep = torch.ceil(sentence_lengths.float() * self._spans_per_word).long()
121 | 
122 |         pruner = self._mention_pruners[self._active_namespace]
123 |         (top_span_embeddings, top_span_mask,
124 |          top_span_indices, top_span_mention_scores, num_spans_kept) = pruner(
125 |              span_embeddings, span_mask, num_spans_to_keep)
126 | 
127 |         top_span_mask = top_span_mask.unsqueeze(-1)
128 | 
129 |         flat_top_span_indices = util.flatten_and_batch_shift_indices(top_span_indices, num_spans)
130 |         top_spans = util.batched_index_select(spans,
131 |                                               top_span_indices,
132 |                                               flat_top_span_indices)
133 | 
134 |         return top_span_embeddings, top_span_mention_scores, num_spans_to_keep, top_span_mask, top_span_indices, top_spans
135 | 
136 |     def predict(self, top_spans, relation_scores, num_spans_to_keep, metadata):
137 |         preds_dict = []
138 |         predictions = []
139 |         zipped = zip(top_spans, relation_scores, num_spans_to_keep, metadata)
140 | 
141 |         for top_spans_sent, relation_scores_sent, num_spans_sent, sentence in zipped:
142 |             pred_dict_sent, predictions_sent = self._predict_sentence(
143 |                 top_spans_sent, relation_scores_sent, num_spans_sent, sentence)
144 |             preds_dict.append(pred_dict_sent)
145 |             predictions.append(predictions_sent)
146 | 
147 |         return preds_dict, predictions
148 | 
149 |     def _predict_sentence(self, top_spans, relation_scores, num_spans_to_keep, sentence):
150 |         keep = num_spans_to_keep.item()
151 |         top_spans = [tuple(x) for x in top_spans.tolist()]
152 | 
153 |         # Iterate over all span pairs and labels. Record the span if the label isn't null.
154 |         predicted_scores_raw, predicted_labels = relation_scores.max(dim=-1)
155 |         softmax_scores = F.softmax(relation_scores, dim=-1)
156 |         predicted_scores_softmax, _ = softmax_scores.max(dim=-1)
157 |         predicted_labels -= 1  # Subtract 1 so that null labels get -1.
158 | 
159 |         keep_mask = torch.zeros(len(top_spans))
160 |         keep_mask[:keep] = 1
161 |         keep_mask = keep_mask.bool()
162 | 
163 |         ix = (predicted_labels >= 0) & keep_mask
164 | 
165 |         res_dict = {}
166 |         predictions = []
167 | 
168 |         for i, j in ix.nonzero(as_tuple=False):
169 |             span_1 = top_spans[i]
170 |             span_2 = top_spans[j]
171 |             label = predicted_labels[i, j].item()
172 |             raw_score = predicted_scores_raw[i, j].item()
173 |             softmax_score = predicted_scores_softmax[i, j].item()
174 | 
175 |             label_name = self.vocab.get_token_from_index(label, namespace=self._active_namespace)
176 |             res_dict[(span_1, span_2)] = label_name
177 |             list_entry = (span_1[0], span_1[1], span_2[0], span_2[1], label_name, raw_score, softmax_score)
178 |             predictions.append(document.PredictedRelation(list_entry, sentence, sentence_offsets=True))
179 | 
180 |         return res_dict, predictions
181 | 
182 |     # TODO(dwadden) This code is repeated elsewhere. Refactor.
183 |     @overrides
184 |     def get_metrics(self, reset: bool = False) -> Dict[str, float]:
185 |         "Loop over the metrics for all namespaces, and return as dict."
186 |         res = {}
187 |         for namespace, metrics in self._relation_metrics.items():
188 |             precision, recall, f1 = metrics.get_metric(reset)
189 |             prefix = namespace.replace("_labels", "")
190 |             to_update = {f"{prefix}_precision": precision,
191 |                          f"{prefix}_recall": recall,
192 |                          f"{prefix}_f1": f1}
193 |             res.update(to_update)
194 | 
195 |         res_avg = {}
196 |         for name in ["precision", "recall", "f1"]:
197 |             values = [res[key] for key in res if name in key]
198 |             res_avg[f"MEAN__relation_{name}"] = sum(values) / len(values) if values else 0
199 |             res.update(res_avg)
200 | 
201 |         return res
202 | 
203 |     @staticmethod
204 |     def _compute_span_pair_embeddings(top_span_embeddings: torch.FloatTensor):
205 |         """
206 |         TODO(dwadden) document me and add comments.
207 |         """
208 |         # Shape: (batch_size, num_spans_to_keep, num_spans_to_keep, embedding_size)
209 |         num_candidates = top_span_embeddings.size(1)
210 | 
211 |         embeddings_1_expanded = top_span_embeddings.unsqueeze(2)
212 |         embeddings_1_tiled = embeddings_1_expanded.repeat(1, 1, num_candidates, 1)
213 | 
214 |         embeddings_2_expanded = top_span_embeddings.unsqueeze(1)
215 |         embeddings_2_tiled = embeddings_2_expanded.repeat(1, num_candidates, 1, 1)
216 | 
217 |         similarity_embeddings = embeddings_1_expanded * embeddings_2_expanded
218 | 
219 |         pair_embeddings_list = [embeddings_1_tiled, embeddings_2_tiled, similarity_embeddings]
220 |         pair_embeddings = torch.cat(pair_embeddings_list, dim=3)
221 | 
222 |         return pair_embeddings
223 | 
224 |     def _compute_relation_scores(self, pairwise_embeddings, top_span_mention_scores):
225 |         relation_feedforward = self._relation_feedforwards[self._active_namespace]
226 |         relation_scorer = self._relation_scorers[self._active_namespace]
227 | 
228 |         batch_size = pairwise_embeddings.size(0)
229 |         max_num_spans = pairwise_embeddings.size(1)
230 |         feature_dim = relation_feedforward.input_dim
231 | 
232 |         embeddings_flat = pairwise_embeddings.view(-1, feature_dim)
233 | 
234 |         relation_projected_flat = relation_feedforward(embeddings_flat)
235 |         relation_scores_flat = relation_scorer(relation_projected_flat)
236 | 
237 |         relation_scores = relation_scores_flat.view(batch_size, max_num_spans, max_num_spans, -1)
238 | 
239 |         # Add the mention scores for each of the candidates.
240 | 
241 |         relation_scores += (top_span_mention_scores.unsqueeze(-1) +
242 |                             top_span_mention_scores.transpose(1, 2).unsqueeze(-1))
243 | 
244 |         shape = [relation_scores.size(0), relation_scores.size(1), relation_scores.size(2), 1]
245 |         dummy_scores = relation_scores.new_zeros(*shape)
246 | 
247 |         relation_scores = torch.cat([dummy_scores, relation_scores], -1)
248 |         return relation_scores
249 | 
250 |     @staticmethod
251 |     def _get_pruned_gold_relations(relation_labels, top_span_indices, top_span_masks):
252 |         """
253 |         Loop over each slice and get the labels for the spans from that slice.
254 |         All labels are offset by 1 so that the "null" label gets class zero. This is the desired
255 |         behavior for the softmax. Labels corresponding to masked relations keep the label -1, which
256 |         the softmax loss ignores.
257 |         """
258 |         # TODO(dwadden) Test and possibly optimize.
259 |         relations = []
260 | 
261 |         zipped = zip(relation_labels, top_span_indices, top_span_masks.bool())
262 |         for sliced, ixs, top_span_mask in zipped:
263 |             entry = sliced[ixs][:, ixs].unsqueeze(0)
264 |             mask_entry = top_span_mask & top_span_mask.transpose(0, 1).unsqueeze(0)
265 |             entry[mask_entry] += 1
266 |             entry[~mask_entry] = -1
267 |             relations.append(entry)
268 | 
269 |         return torch.cat(relations, dim=0)
270 | 
271 |     def _get_cross_entropy_loss(self, relation_scores, relation_labels):
272 |         """
273 |         Compute cross-entropy loss on relation labels. Ignore diagonal entries and entries giving
274 |         relations between masked out spans.
275 |         """
276 |         # Need to add one for the null class.
277 |         n_labels = self._n_labels[self._active_namespace] + 1
278 |         scores_flat = relation_scores.view(-1, n_labels)
279 |         # Need to add 1 so that the null label is 0, to line up with indices into prediction matrix.
280 |         labels_flat = relation_labels.view(-1)
281 |         # Compute cross-entropy loss.
282 |         loss = self._loss(scores_flat, labels_flat)
283 |         return loss
284 | 


--------------------------------------------------------------------------------
/dygie/models/shared.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Short utility functions.
 3 | """
 4 | 
 5 | import torch
 6 | 
 7 | 
 8 | def cumsum_shifted(xs):
 9 |     """
10 |     Assumes `xs` is a 1-d array.
11 |     The usual cumsum has elements [x[1], x[1] + x[2], ...]. This one has elements
12 |     [0, x[1], x[1] + x[2], ...]. Useful for calculating sentence offsets.
13 |     """
14 |     cs = xs.cumsum(dim=0)
15 |     shift = torch.zeros(1, dtype=torch.long, device=cs.device)  # Put on correct device.
16 |     return torch.cat([shift, cs[:-1]], dim=0)
17 | 
18 | 
19 | def batch_identity(batch_size, matrix_size, *args, **kwargs):
20 |     """
21 |     Tile the identity matrix along axis 0, `batch_size` times.
22 |     """
23 |     ident = torch.eye(matrix_size, *args, **kwargs).unsqueeze(0)
24 |     res = ident.repeat(batch_size, 1, 1)
25 |     return res
26 | 
27 | 
28 | def fields_to_batches(d, keys_to_ignore=[]):
29 |     """
30 |     The input is a dict whose items are batched tensors. The output is a list of dictionaries - one
31 |     per entry in the batch - with the slices of the tensors for that entry. Here's an example.
32 |     Input:
33 |     d = {"a": [[1, 2], [3,4]], "b": [1, 2]}
34 |     Output:
35 |     res = [{"a": [1, 2], "b": 1}, {"a": [3, 4], "b": 2}].
36 |     """
37 |     keys = [key for key in d.keys() if key not in keys_to_ignore]
38 | 
39 |     # Make sure all input dicts have same length. If they don't, there's a problem.
40 |     lengths = {k: len(d[k]) for k in keys}
41 |     if len(set(lengths.values())) != 1:
42 |         msg = f"fields have different lengths: {lengths}."
43 |         # If there's a doc key, add it to specify where the error is.
44 |         if "doc_key" in d:
45 |             msg = f"For document {d['doc_key']}, " + msg
46 |         raise ValueError(msg)
47 | 
48 |     length = list(lengths.values())[0]
49 |     res = [{k: d[k][i] for k in keys} for i in range(length)]
50 |     return res
51 | 
52 | 
53 | def batches_to_fields(batches):
54 |     """
55 |     The inverse of `fields_to_batches`.
56 |     """
57 |     # Make sure all the keys match.
58 |     first_keys = batches[0].keys()
59 |     for entry in batches[1:]:
60 |         if set(entry.keys()) != set(first_keys):
61 |             raise ValueError("Keys to not match on all entries.")
62 | 
63 |     res = {k: [] for k in first_keys}
64 |     for batch in batches:
65 |         for k, v in batch.items():
66 |             res[k].append(v)
67 | 
68 |     return res
69 | 


--------------------------------------------------------------------------------
/dygie/predictors/__init__.py:
--------------------------------------------------------------------------------
1 | from dygie.predictors.dygie import DyGIEPredictor
2 | 


--------------------------------------------------------------------------------
/dygie/predictors/dygie.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | import numpy as np
 3 | import warnings
 4 | 
 5 | from overrides import overrides
 6 | import numpy
 7 | import json
 8 | 
 9 | from allennlp.common.util import JsonDict
10 | from allennlp.nn import util
11 | from allennlp.data import Batch
12 | from allennlp.data import DatasetReader
13 | from allennlp.models import Model
14 | from allennlp.predictors.predictor import Predictor
15 | 
16 | 
17 | @Predictor.register("dygie")
18 | class DyGIEPredictor(Predictor):
19 |     """
20 |     Predictor for DyGIE model.
21 | 
22 |     If model was trained on coref, prediction is done on a whole document at
23 |     once. This risks overflowing memory on large documents.
24 |     If the model was trained without coref, prediction is done by sentence.
25 |     """
26 |     def __init__(
27 |             self, model: Model, dataset_reader: DatasetReader) -> None:
28 |         super().__init__(model, dataset_reader)
29 | 
30 |     def predict(self, document):
31 |         return self.predict_json({"document": document})
32 | 
33 |     def predict_tokenized(self, tokenized_document: List[str]) -> JsonDict:
34 |         instance = self._words_list_to_instance(tokenized_document)
35 |         return self.predict_instance(instance)
36 | 
37 |     @overrides
38 |     def dump_line(self, outputs):
39 |         # Need to override to tell Python how to deal with Numpy ints.
40 |         return json.dumps(outputs, default=int) + "\n"
41 | 
42 |     # TODO(dwadden) Can this be implemented in `forward_on_instance`  instead?
43 |     @overrides
44 |     def predict_instance(self, instance):
45 |         """
46 |         An instance is an entire document, represented as a list of sentences.
47 |         """
48 |         model = self._model
49 |         cuda_device = model._get_prediction_device()
50 | 
51 |         # Try to predict this batch.
52 |         try:
53 |             dataset = Batch([instance])
54 |             dataset.index_instances(model.vocab)
55 |             model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
56 |             prediction = model.make_output_human_readable(model(**model_input)).to_json()
57 |         # If we run out of GPU memory, warn user and indicate that this document failed.
58 |         # This way, prediction doesn't grind to a halt every time we run out of GPU.
59 |         except RuntimeError as err:
60 |             # doc_key, dataset, sentences, message
61 |             metadata = instance["metadata"].metadata
62 |             doc_key = metadata.doc_key
63 |             msg = (f"Encountered a RunTimeError on document {doc_key}. Skipping this example."
64 |                    f" Error message:\n{err.args[0]}.")
65 |             warnings.warn(msg)
66 |             prediction = metadata.to_json()
67 |             prediction["_FAILED_PREDICTION"] = True
68 | 
69 |         return prediction
70 | 


--------------------------------------------------------------------------------
/dygie/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | testpaths = tests/
3 | python_paths = ./
4 | addopts = -p no:warnings


--------------------------------------------------------------------------------
/dygie/spacy_interface/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/dygie/spacy_interface/__init__.py


--------------------------------------------------------------------------------
/dygie/spacy_interface/spacy_interface.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, List
  2 | from allennlp.data import Batch
  3 | from dygie.models.dygie import DyGIE
  4 | from dygie.data.dataset_readers.dygie import DyGIEReader
  5 | from allennlp.models.archival import load_archive
  6 | from allennlp.nn import util
  7 | from spacy.language import Language
  8 | from spacy.tokens import Span
  9 | from spacy.tokens.doc import Doc
 10 | from spacy.tokens.span import Span
 11 | 
 12 | Doc.set_extension("rels", default=[], force=True)
 13 | Span.set_extension("rels", default=[], force=True)
 14 | Doc.set_extension("span_ents", default=[], force=True)
 15 | Span.set_extension("label_", default=[], force=True)
 16 | Doc.set_extension("events", default=[], force=True)
 17 | Span.set_extension("events", default=[], force=True)
 18 | 
 19 | 
 20 | def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc:
 21 |     doc_rels = []
 22 |     doc_evs = []
 23 |     # store events as relations. include confidence scores in the relation tuple (TODO: add relation property)
 24 |     for evs, ds in zip(prediction.get("predicted_events", []), doc.sents):
 25 |         sent_evs = []
 26 |         for ev in evs:
 27 |             if len(ev)>=3:
 28 |                 trig = [r for r in ev if r[1]=="TRIGGER"]
 29 |                 arg0s = [r for r in ev if r[2]=="ARG0"]
 30 |                 #example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0], [45, 45, 'ARG0', 11.3498, 1.0]]
 31 |                 arg1s = [r for r in ev if r[2]=="ARG1"]
 32 |                 e_trig = doc[trig[0][0]:trig[0][0]+1]
 33 |                 for arg0 in arg0s:
 34 |                     e_arg0 = doc[arg0[0] : arg0[1] + 1]
 35 |                     for arg1 in arg1s:
 36 |                         e_arg1 = doc[arg1[0] : arg1[1] + 1]
 37 |                         #here confidence is set as the minimum among {trigger,args}, as a conservative measure.
 38 |                         sent_evs.append({"ARG0":e_arg0,"ARG1":e_arg1,"RELATION_TRIGGER":e_trig,"CONF":min([arg0[4],arg1[4],trig[0][3]])})
 39 |                         
 40 |         doc_evs.append(sent_evs)
 41 |         ds._.events = sent_evs
 42 |     doc._.events = doc_evs
 43 |     #TODO add doc._.span_ents too. 
 44 | 
 45 |     for rels, ds in zip(prediction.get("predicted_relations", []), doc.sents):
 46 |         sent_rels = []
 47 |         for rel in rels:
 48 |             e1 = doc[rel[0] : rel[1] + 1]
 49 |             e2 = doc[rel[2] : rel[3] + 1]
 50 |             tag = rel[4]
 51 |             sent_rels.append((e1, e2, tag))
 52 |         doc_rels.append(sent_rels)
 53 |         ds._.rels = sent_rels
 54 |     doc._.rels = doc_rels
 55 |     if "predicted_ner" not in prediction:
 56 |         return doc
 57 |     preds = [p for r in prediction.get("predicted_ner", []) for p in r]
 58 |     # storing all span based entitis to doc._.span_ents
 59 |     span_ents = []
 60 |     for sent in prediction["predicted_ner"]:
 61 |         ent_sent = []
 62 |         for ent in sent:
 63 |             d = doc[ent[0] : ent[1] + 1]
 64 |             d._.label_ = ent[2]
 65 |             ent_sent.append(d)
 66 |         span_ents.append(ent_sent)
 67 |     doc._.span_ents = span_ents
 68 |     # store entities to doc.ents of spacy
 69 |     # because spacy can't support the overlapped entities we have to merge overlapped entities
 70 |     # to the longest ones.
 71 |     dist_ents = []
 72 |     prc = []
 73 |     for i, p1 in enumerate(preds):
 74 |         t = [p1]
 75 |         if i in prc:
 76 |             continue
 77 |         for j, p2 in enumerate(preds[i + 1 :]):
 78 |             if p2[0] <= p1[1]:
 79 |                 t.append(p1)
 80 |                 prc.append(j + i + 1)
 81 |         dist_ents.append(t)
 82 |     res = []
 83 |     for t in dist_ents:
 84 |         if len(t) == 1:
 85 |             res.append(t[0])
 86 |         elif len(t) > 1:
 87 |             mn = t[0][0]
 88 |             mx = t[0][1]
 89 |             for p in t[1:]:
 90 |                 if p[0] < mn:
 91 |                     mn = p[0]
 92 |                 if p[1] > mx:
 93 |                     mx = p[1]
 94 |             res.append([mn, mx, t[0][2], t[0][3], t[0][4]])
 95 |     sel_ents = []
 96 |     for ent in res:
 97 |         try:
 98 |             d = doc[ent[0] : ent[1] + 1]
 99 |             s = doc.char_span(d.start_char, d.end_char, label=ent[2])
100 |             if s:
101 |                 sel_ents.append(s)
102 |         except Exception as e:
103 |             print("error in spacy span", e)
104 |             raise e
105 |     doc.ents = sel_ents
106 |     return doc
107 | 
108 | 
109 | class DygieppPipe:
110 |     name = "dygiepp"
111 | 
112 |     def __init__(
113 |         self,
114 |         nlp: Language,
115 |         pretrained_filepath: str = "./pretrained/scierc-lightweight.tar.gz",
116 |         dataset_name: str = "scierc",
117 |     ) -> None:
118 |         """spacy factory class for adding information to spacy document. For now just entities and relations.
119 |         It adds entities to doc.ents and relations to doc._.rels: List[List[Token,Token,str]] which is a list of relations
120 |         as  entity1, entity2, relation name
121 | 
122 |         Args:
123 |             nlp (Language): Spacy Language instance
124 |             name (str, optional): Pipe name. Defaults to "dygiepp".
125 |             pretrained_filepath (str, optional): Address of pre-trained model to extract information. Defaults to "./pretrained/scierc-lightweight.tar.gz".
126 |             dataset_name (str, optional): Dataset name used for model. Defaults to "scierc".
127 |         """
128 |         # TODO add events and cluster information to spacy doc too
129 |         archive = load_archive(pretrained_filepath)
130 |         self._model = archive.model
131 |         self._model.eval()
132 |         archive.config["dataset_reader"].pop("type")  # it's stupid but was necessary!
133 |         self._dataset_reader = DyGIEReader.from_params(archive.config["dataset_reader"])
134 |         self.dataset_name = dataset_name
135 | 
136 |     def __call__(self, doc: Doc) -> Doc:
137 |         cuda_device = self._model._get_prediction_device()
138 |         sentences = [[tok.text for tok in sent] for sent in doc.sents]
139 |         ins = self._dataset_reader.text_to_instance(
140 |             {"sentences": sentences, "doc_key": "test", "dataset": self.dataset_name}
141 |         )
142 |         dataset = Batch([ins])
143 |         dataset.index_instances(self._model.vocab)
144 |         model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
145 |         prediction = self._model.make_output_human_readable(
146 |             self._model(**model_input)
147 |         ).to_json()
148 |         # prepare and store ent/relation information to spacy Doc
149 |         return prepare_spacy_doc(doc, prediction)
150 | 


--------------------------------------------------------------------------------
/dygie/tests/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/dygie/tests/data/__init__.py


--------------------------------------------------------------------------------
/dygie/tests/data/annotated_doc_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Spot checks for the classes defined in annotated_doc.py.
  3 | 
  4 | Uses the example provided in data.md, with index modifications to account for
  5 | the fact that spacy tokenizes contracted words into two tokens.
  6 | 
  7 | Author: Serena G. Lotreck
  8 | """
  9 | import unittest
 10 | import os
 11 | import shutil
 12 | import sys
 13 | 
 14 | sys.path.append('../../../scripts/new-dataset')
 15 | 
 16 | import annotated_doc as ad
 17 | import spacy
 18 | 
 19 | 
 20 | class TestEnt(unittest.TestCase):
 21 |     def setUp(self):
 22 | 
 23 |         # Set up tempdir
 24 |         self.tmpdir = "tmp"
 25 |         os.makedirs(self.tmpdir, exist_ok=True)
 26 | 
 27 |         # Set up document text
 28 |         nlp = spacy.load("en_core_web_sm")
 29 |         dataset = 'scierc'
 30 |         text = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
 31 |                 "She was elected in 2017.")
 32 |         text_path = f'{self.tmpdir}/myfile.txt'
 33 |         with open(text_path, 'w') as f:
 34 |             f.write(text)
 35 |         ann = ("T1\tCity 0 7\tSeattle\n"
 36 |                "T2\tPerson 25 37\tJenny Durkan\n"
 37 |                "T3\tCity 41 51\tthe city's\n"
 38 |                "T4\tPerson 59 62\tShe\n"
 39 |                "T5\tPersonnel.Election 67 74\telected\n"
 40 |                "T6\tYear 78 82\t2017\n"
 41 |                "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
 42 |                "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
 43 |                "*\tEQUIV T1 T3\n"
 44 |                "*\tEQUIV T2 T4\n")
 45 |         ann_path = f'{self.tmpdir}/myfile.ann'
 46 |         with open(ann_path, 'w') as f:
 47 |             f.write(ann)
 48 |         self.sent_idx_tups = [(0, 6), (6, 14), (14, 19)]
 49 |         # NOTE: spacy tokenizes words with apostrophes into separate words.
 50 | 
 51 |         # Set up annotated_doc object
 52 |         self.annotated_doc = ad.AnnotatedDoc.parse_ann(text_path,
 53 |                                                        ann_path,
 54 |                                                        nlp,
 55 |                                                        dataset,
 56 |                                                        coref=True)
 57 |         self.annotated_doc.char_to_token()
 58 | 
 59 |         # Right answer
 60 |         self.ner = [[[0, 0, "City"]], [[6, 7, "Person"], [9, 11, "City"]],
 61 |                     [[14, 14, "Person"], [16, 16, "Personnel.Election"],
 62 |                      [18, 18, "Year"]]]
 63 | 
 64 |     def tearDown(self):
 65 | 
 66 |         shutil.rmtree(self.tmpdir)
 67 | 
 68 |     def test_format_ner_dygiepp(self):
 69 | 
 70 |         ner = ad.Ent.format_ner_dygiepp(self.annotated_doc.ents,
 71 |                                         self.sent_idx_tups)
 72 | 
 73 |         self.assertEqual(ner, self.ner)
 74 | 
 75 | 
 76 | class TestBinRel(unittest.TestCase):
 77 |     def setUp(self):
 78 | 
 79 |         # Set up tempdir
 80 |         self.tmpdir = "tmp"
 81 |         os.makedirs(self.tmpdir, exist_ok=True)
 82 | 
 83 |         # Set up document text
 84 |         nlp = spacy.load("en_core_web_sm")
 85 |         dataset = 'scierc'
 86 |         text = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
 87 |                 "She was elected in 2017.")
 88 |         text_path = f'{self.tmpdir}/myfile.txt'
 89 |         with open(text_path, 'w') as f:
 90 |             f.write(text)
 91 |         ann = ("T1\tCity 0 7\tSeattle\n"
 92 |                "T2\tPerson 25 37\tJenny Durkan\n"
 93 |                "T3\tCity 41 51\tthe city's\n"
 94 |                "T4\tPerson 59 62\tShe\n"
 95 |                "T5\tPersonnel.Election 67 74\telected\n"
 96 |                "T6\tYear 78 82\t2017\n"
 97 |                "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
 98 |                "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
 99 |                "*\tEQUIV T1 T3\n"
100 |                "*\tEQUIV T2 T4\n")
101 |         ann_path = f'{self.tmpdir}/myfile.ann'
102 |         with open(ann_path, 'w') as f:
103 |             f.write(ann)
104 |         self.sent_idx_tups = [(0, 6), (6, 14), (14, 19)]
105 |         # NOTE: spacy tokenizes words with apostrophes into separate words.
106 | 
107 |         # Set up annotated_doc object
108 |         self.annotated_doc = ad.AnnotatedDoc.parse_ann(text_path,
109 |                                                        ann_path,
110 |                                                        nlp,
111 |                                                        dataset,
112 |                                                        coref=True)
113 |         self.annotated_doc.char_to_token()
114 | 
115 |         # Set up relation
116 |         self.rel1 = ad.BinRel("R1\tMayor-Of Arg1:T2 Arg2:T3".split())
117 | 
118 |         # Right answer
119 |         self.relations = [[], [[6, 7, 9, 11, "Mayor-Of"]], []]
120 | 
121 |     def tearDown(self):
122 | 
123 |         shutil.rmtree(self.tmpdir)
124 | 
125 |     def test_set_arg_objects(self):
126 | 
127 |         self.rel1.set_arg_objects(self.annotated_doc.ents)
128 | 
129 |         self.assertEqual(self.rel1.arg1, self.annotated_doc.ents[1])
130 |         self.assertEqual(self.rel1.arg2, self.annotated_doc.ents[2])
131 | 
132 |     def test_format_bin_rels_dygiepp(self):
133 | 
134 |         self.rel1.set_arg_objects(self.annotated_doc.ents)
135 |         relations = ad.BinRel.format_bin_rels_dygiepp([self.rel1],
136 |                                                       self.sent_idx_tups)
137 | 
138 |         self.assertEqual(relations, self.relations)
139 | 
140 | 
141 | class TestEvent(unittest.TestCase):
142 |     def setUp(self):
143 | 
144 |         # Set up tempdir
145 |         self.tmpdir = "tmp"
146 |         os.makedirs(self.tmpdir, exist_ok=True)
147 | 
148 |         # Set up document text
149 |         nlp = spacy.load("en_core_web_sm")
150 |         dataset = 'scierc'
151 |         text = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
152 |                 "She was elected in 2017.")
153 |         text_path = f'{self.tmpdir}/myfile.txt'
154 |         with open(text_path, 'w') as f:
155 |             f.write(text)
156 |         ann = ("T1\tCity 0 7\tSeattle\n"
157 |                "T2\tPerson 25 37\tJenny Durkan\n"
158 |                "T3\tCity 41 51\tthe city's\n"
159 |                "T4\tPerson 59 62\tShe\n"
160 |                "T5\tPersonnel.Election 67 74\telected\n"
161 |                "T6\tYear 78 82\t2017\n"
162 |                "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
163 |                "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
164 |                "*\tEQUIV T1 T3\n"
165 |                "*\tEQUIV T2 T4\n")
166 |         ann_path = f'{self.tmpdir}/myfile.ann'
167 |         with open(ann_path, 'w') as f:
168 |             f.write(ann)
169 |         self.sent_idx_tups = [(0, 6), (6, 14), (14, 19)]
170 |         # NOTE: spacy tokenizes words with apostrophes into separate words.
171 | 
172 |         # Set up annotated_doc object
173 |         self.annotated_doc = ad.AnnotatedDoc.parse_ann(text_path,
174 |                                                        ann_path,
175 |                                                        nlp,
176 |                                                        dataset,
177 |                                                        coref=True)
178 |         self.annotated_doc.char_to_token()
179 | 
180 |         # Set up events
181 |         self.event1 = ad.Event(
182 |             "E1\tPersonnel.Election:T5 Person:T4 Year:T6".split())
183 | 
184 |         # Right answer
185 |         self.events = [[], [],
186 |                        [[[16, "Personnel.Election"], [14, 14, "Person"],
187 |                          [18, 18, "Year"]]]]
188 | 
189 |     def tearDown(self):
190 | 
191 |         shutil.rmtree(self.tmpdir)
192 | 
193 |     def test_set_arg_objects(self):
194 | 
195 |         self.event1.set_arg_objects(self.annotated_doc.ents)
196 | 
197 |         self.assertEqual(self.event1.trigger, self.annotated_doc.ents[4])
198 |         self.assertEqual(
199 |             self.event1.args,
200 |             [self.annotated_doc.ents[3], self.annotated_doc.ents[5]])
201 | 
202 |     def test_format_events_dygiepp(self):
203 | 
204 |         self.event1.set_arg_objects(self.annotated_doc.ents)
205 |         events = ad.Event.format_events_dygiepp([self.event1],
206 |                                                 self.sent_idx_tups)
207 | 
208 |         self.assertEqual(events, self.events)
209 | 
210 | 
211 | class TestEquivRel(unittest.TestCase):
212 |     def setUp(self):
213 | 
214 |         # Set up tempdir
215 |         self.tmpdir = "tmp"
216 |         os.makedirs(self.tmpdir, exist_ok=True)
217 | 
218 |         # Set up document text
219 |         nlp = spacy.load("en_core_web_sm")
220 |         dataset = 'scierc'
221 |         text = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
222 |                 "She was elected in 2017.")
223 |         text_path = f'{self.tmpdir}/myfile.txt'
224 |         with open(text_path, 'w') as f:
225 |             f.write(text)
226 |         ann = ("T1\tCity 0 7\tSeattle\n"
227 |                "T2\tPerson 25 37\tJenny Durkan\n"
228 |                "T3\tCity 41 51\tthe city's\n"
229 |                "T4\tPerson 59 62\tShe\n"
230 |                "T5\tPersonnel.Election 67 74\telected\n"
231 |                "T6\tYear 78 82\t2017\n"
232 |                "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
233 |                "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
234 |                "*\tEQUIV T1 T3\n"
235 |                "*\tEQUIV T2 T4\n")
236 |         ann_path = f'{self.tmpdir}/myfile.ann'
237 |         with open(ann_path, 'w') as f:
238 |             f.write(ann)
239 | 
240 |         # Set up annotated_doc object
241 |         self.annotated_doc = ad.AnnotatedDoc.parse_ann(text_path,
242 |                                                        ann_path,
243 |                                                        nlp,
244 |                                                        dataset,
245 |                                                        coref=True)
246 |         self.annotated_doc.char_to_token()
247 | 
248 |         # Set up equivalence relations
249 |         self.equivrel1 = ad.EquivRel("*\tEQUIV T1 T3".split())
250 |         self.equivrel2 = ad.EquivRel("*\tEQUIV T2 T4".split())
251 | 
252 |         # The dygiepp-formatted correct answer
253 |         self.corefs = [[[0, 0], [9, 11]], [[6, 7], [14, 14]]]
254 | 
255 |     def tearDown(self):
256 | 
257 |         shutil.rmtree(self.tmpdir)
258 | 
259 |     def test_set_arg_objects(self):
260 | 
261 |         self.equivrel1.set_arg_objects(self.annotated_doc.ents)
262 |         self.equivrel2.set_arg_objects(self.annotated_doc.ents)
263 | 
264 |         self.assertEqual(
265 |             self.equivrel1.args,
266 |             [self.annotated_doc.ents[0], self.annotated_doc.ents[2]])
267 |         self.assertEqual(
268 |             self.equivrel2.args,
269 |             [self.annotated_doc.ents[1], self.annotated_doc.ents[3]])
270 | 
271 |     def test_format_corefs_dygiepp(self):
272 | 
273 |         self.equivrel1.set_arg_objects(self.annotated_doc.ents)
274 |         self.equivrel2.set_arg_objects(self.annotated_doc.ents)
275 |         corefs = ad.EquivRel.format_corefs_dygiepp(
276 |             [self.equivrel1, self.equivrel2])
277 | 
278 |         self.assertEqual(corefs, self.corefs)
279 | 
280 | 
281 | class TestAnnotatedDoc(unittest.TestCase):
282 |     """
283 |     Tests the functionality of char_to_token and format_dygiepp.
284 |     """
285 |     def setUp(self):
286 | 
287 |         # Set up temp dir and test docs
288 |         self.tmpdir = "tmp"
289 |         os.makedirs(self.tmpdir, exist_ok=True)
290 | 
291 |         txt = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. "
292 |                "She was elected in 2017.")
293 | 
294 |         self.txt = f'{self.tmpdir}/myfile.txt'
295 |         with open(self.txt, 'w') as f:
296 |             f.write(txt)
297 | 
298 |         ann = ("T1\tCity 0 7\tSeattle\n"
299 |                "T2\tPerson 25 37\tJenny Durkan\n"
300 |                "T3\tCity 41 51\tthe city's\n"
301 |                "T4\tPerson 59 62\tShe\n"
302 |                "T5\tPersonnel.Election 67 74\telected\n"
303 |                "T6\tYear 78 82\t2017\n"
304 |                "R1\tMayor-Of Arg1:T2 Arg2:T3\n"
305 |                "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n"
306 |                "*\tEQUIV T1 T3\n"
307 |                "*\tEQUIV T2 T4\n")
308 | 
309 |         self.ann = f'{self.tmpdir}/myfile.ann'
310 |         with open(self.ann, 'w') as f:
311 |             f.write(ann)
312 | 
313 |         # Define other attributes
314 |         self.nlp = spacy.load("en_core_web_sm")
315 |         self.dataset = 'scierc'
316 | 
317 |         # Define right answer
318 |         self.dygiepp_dict = {
319 |             "doc_key":
320 |             "myfile",
321 |             "dataset":
322 |             self.dataset,
323 |             "sentences":
324 |             [[tok.text for tok in sent] for sent in self.nlp(txt).sents],
325 |             "ner": [[[0, 0, "City"]], [[6, 7, "Person"], [9, 11, "City"]],
326 |                     [[14, 14, "Person"], [16, 16, "Personnel.Election"],
327 |                      [18, 18, "Year"]]],
328 |             "relations": [[], [[6, 7, 9, 11, "Mayor-Of"]], []],
329 |             "clusters": [[[0, 0], [9, 11]], [[6, 7], [14, 14]]],
330 |             "events": [[], [],
331 |                        [[[16, "Personnel.Election"], [14, 14, "Person"],
332 |                          [18, 18, "Year"]]]]
333 |         }
334 | 
335 |     def tearDown(self):
336 | 
337 |         shutil.rmtree(self.tmpdir)
338 | 
339 |     def test_char_to_token(self):
340 | 
341 |         annotated_doc = ad.AnnotatedDoc.parse_ann(self.txt,
342 |                                                   self.ann,
343 |                                                   self.nlp,
344 |                                                   self.dataset,
345 |                                                   coref=True)
346 |         annotated_doc.char_to_token()
347 | 
348 |         self.assertEqual(annotated_doc.ents[0].tok_start, 0)
349 |         self.assertEqual(annotated_doc.ents[1].tok_start, 6)
350 |         self.assertEqual(annotated_doc.ents[2].tok_start, 9)
351 | 
352 |         self.assertEqual(annotated_doc.ents[0].tok_end, 0)
353 |         self.assertEqual(annotated_doc.ents[1].tok_end, 7)
354 |         self.assertEqual(annotated_doc.ents[2].tok_end, 11)
355 | 
356 |     def test_format_dygiepp(self):
357 | 
358 |         annotated_doc = ad.AnnotatedDoc.parse_ann(self.txt,
359 |                                                   self.ann,
360 |                                                   self.nlp,
361 |                                                   self.dataset,
362 |                                                   coref=True)
363 |         annotated_doc.char_to_token()
364 |         res = annotated_doc.format_dygiepp()
365 | 
366 |         self.assertEqual(res, self.dygiepp_dict)
367 | 
368 | 
369 | if __name__ == "__main__":
370 |     unittest.main()
371 | 


--------------------------------------------------------------------------------
/dygie/tests/data/collate_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Test that a dataset doesn't change when it's collated and then de-collated.
  3 | """
  4 | 
  5 | import unittest
  6 | import json
  7 | import os
  8 | import shutil
  9 | import sys
 10 | from pathlib import Path
 11 | 
 12 | 
 13 | # Since the collating code isn't inside the `dygie` package, I need to do a little work to import
 14 | # it.
 15 | current_dir = Path(os.path.dirname(os.path.realpath(__file__)))
 16 | common_root = current_dir.parent.parent.parent
 17 | collate_dir = f"{common_root}/scripts/data/shared"
 18 | sys.path.append(collate_dir)
 19 | 
 20 | # Now import the code
 21 | import collate
 22 | import uncollate
 23 | 
 24 | 
 25 | # Utility function.
 26 | def load_jsonl(fname):
 27 |     with open(fname) as f:
 28 |         return [json.loads(x) for x in f]
 29 | 
 30 | 
 31 | # The actual tests.
 32 | class TestCollate(unittest.TestCase):
 33 |     def setUp(self):
 34 |         self.collated_dir = "tmp/collated"
 35 |         self.uncollated_dir = "tmp/uncollated"
 36 |         os.makedirs(self.collated_dir, exist_ok=True)
 37 |         os.makedirs(self.uncollated_dir, exist_ok=True)
 38 | 
 39 |     def tearDown(self):
 40 |         shutil.rmtree("tmp")
 41 | 
 42 |     @staticmethod
 43 |     def is_same(x1, x2):
 44 |         "Compare the fields in two dicts loaded from json."
 45 |         # Check if keys are same.
 46 |         if sorted(x1.keys()) != sorted(x2.keys()):
 47 |             return False
 48 | 
 49 |         # Loop over all fields. If not same, return False.
 50 |         for key in x1:
 51 |             if x1[key] != x2[key]:
 52 |                 return False
 53 | 
 54 |         # If we get to the end, they're the same.
 55 |         return True
 56 | 
 57 |     def files_same(self, f1, f2):
 58 |         "Check that contests of two files are the same."
 59 |         data1 = load_jsonl(f1)
 60 |         data2 = load_jsonl(f2)
 61 | 
 62 |         # Ignore these in the comparison; `dataset` gets added, while `sentence_start` and
 63 |         # `clusters` get removed.
 64 |         fields_to_ignore = ["dataset", "sentence_start", "clusters"]
 65 |         for data in [data1, data2]:
 66 |             for entry in data:
 67 |                 # Since the input data doesn't have a `dataset` field, we don't want to compare on
 68 |                 # this.
 69 |                 for field_to_ignore in fields_to_ignore:
 70 |                     if field_to_ignore in entry:
 71 |                         del entry[field_to_ignore]
 72 | 
 73 |         if len(data1) != len(data2):
 74 |             return False
 75 | 
 76 |         for entry1, entry2 in zip(data1, data2):
 77 |             if not self.is_same(entry1, entry2):
 78 |                 return False
 79 | 
 80 |         return True
 81 | 
 82 |     def check_collate(self, dirname):
 83 |         input_dir = f"fixtures/collate/{dirname}"
 84 | 
 85 |         # Make the collator.
 86 |         collator_args = collate.get_args([input_dir, self.collated_dir, "--file_extension=json",
 87 |                                           f"--dataset={dirname}"])
 88 |         collator_runner = collate.CollateRunner(**vars(collator_args))
 89 | 
 90 |         # Make the uncollator.
 91 |         uncollator_args = uncollate.get_args(
 92 |             [self.collated_dir, self.uncollated_dir, f"--order_like_directory={input_dir}",
 93 |              "--file_extension=json"])
 94 |         uncollator_runner = uncollate.UnCollateRunner(**vars(uncollator_args))
 95 | 
 96 |         # Run both.
 97 |         collator_runner.run()
 98 |         uncollator_runner.run()
 99 | 
100 |         for name in ["train", "dev", "test"]:
101 |             assert self.files_same(f"{input_dir}/{name}.json", f"{self.uncollated_dir}/{name}.json")
102 | 
103 |     def test_collate(self):
104 |         "Make sure that our Document class can read and write data without changing it."
105 |         for dirname in ["ace-event", "scierc"]:
106 |             self.check_collate(dirname)
107 | 
108 | 
109 | if __name__ == "__main__":
110 |     unittest.main()
111 | 


--------------------------------------------------------------------------------
/dygie/tests/data/document_test.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Spot-checks for the Document class.
 3 | """
 4 | 
 5 | import unittest
 6 | import json
 7 | import os
 8 | import shutil
 9 | 
10 | from dygie.data import Document
11 | 
12 | 
13 | class TestDocument(unittest.TestCase):
14 |     def setUp(self):
15 |         self.tmpdir = "tmp"
16 |         os.makedirs(self.tmpdir, exist_ok=True)
17 | 
18 |     def tearDown(self):
19 |         shutil.rmtree(self.tmpdir)
20 | 
21 |     @staticmethod
22 |     def is_same(x1, x2):
23 |         "Compare the fields in two dicts loaded from json."
24 |         # Check if keys are same.
25 |         if x1.keys() != x2.keys():
26 |             return False
27 | 
28 |         # Loop over all fields. If not same, return False.
29 |         for key in x1:
30 |             if x1[key] != x2[key]:
31 |                 return False
32 | 
33 |         # If we get to the end, they're the same.
34 |         return True
35 | 
36 |     def check_document(self, document_name):
37 |         # Load the original file.
38 |         with open(f"fixtures/{document_name}.json") as f:
39 |             js = json.load(f)
40 |         doc = Document.from_json(js)
41 | 
42 |         # Dump to file.
43 |         tmpfile = f"{self.tmpdir}/{document_name}.json"
44 |         dumped = doc.to_json()
45 |         with open(tmpfile, "w") as f:
46 |             json.dump(dumped, f)
47 | 
48 |         # Reload and compare.
49 |         with open(tmpfile) as f:
50 |             reloaded = json.load(f)
51 |         assert self.is_same(js, reloaded)
52 | 
53 |     def test_document(self):
54 |         "Make sure that our Document class can read and write data without changing it."
55 |         for document_name in ["ace_event_article", "scierc_article", "ace_event_coref_article"]:
56 |             self.check_document(document_name)
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     unittest.main()
61 | 


--------------------------------------------------------------------------------
/dygie/tests/data/dygie_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Short unit tests to make sure our dataset readers are behaving correctly.
  3 | Checks a sample from the scierc data
  4 | """
  5 | 
  6 | import unittest
  7 | from allennlp.data.vocabulary import Vocabulary
  8 | 
  9 | from dygie.data import DyGIEReader
 10 | 
 11 | 
 12 | class TestDygieReader(unittest.TestCase):
 13 | 
 14 |     def setUp(self):
 15 |         # scierc
 16 |         # Sentence lengths: [20, 23, 36, 14, 14, 30, 31, 15].
 17 |         # Cumulative sentence lengths: [20, 43, 79, 93, 107, 137, 168, 183].
 18 |         self.reader = DyGIEReader(max_span_width=5)
 19 |         self.dataset = self.reader.read("dygie/tests/fixtures/scierc_article.json")
 20 | 
 21 |     def tearDown(self):
 22 |         pass
 23 | 
 24 |     def test_tokens_correct_scierc(self):
 25 |         # instances are now entire documents instead of sentences
 26 |         instance = self.dataset.instances[0]
 27 |         tokens = instance["text"][4][0:]
 28 |         assert len(tokens) == 14
 29 |         text = [token.text for token in tokens]
 30 |         assert text[:6] == ["Thirdly", "the", "learned", "intrinsic", "object", "structure"]
 31 | 
 32 |     def test_ner_correct_scierc(self):
 33 |         instance = self.dataset.instances[0]
 34 |         ner_field = instance["ner_labels"][3]
 35 |         spans = instance["spans"][3]
 36 | 
 37 |         for label, span in zip(ner_field, spans):
 38 |             start, end = span.span_start, span.span_end
 39 |             if start == 2 and end == 3:
 40 |                 assert label.label == "Method"
 41 |             elif start == 11 and end == 12:
 42 |                 assert label.label == "Method"
 43 |             else:
 44 |                 assert label.label == ""
 45 | 
 46 |     def test_relation_correct_scierc(self):
 47 |         instance = self.dataset.instances[0]
 48 |         relation_field = instance["relation_labels"][5]
 49 |         span_list = relation_field.sequence_field
 50 |         # There should be one relation in this sentence,
 51 |         indices = relation_field.indices
 52 |         labels = relation_field.labels
 53 |         assert len(indices) == len(labels) == 1
 54 |         ix = indices[0]
 55 |         label = labels[0]
 56 |         # Check that the relation refers to the correct spans
 57 |         span1 = span_list[ix[0]]
 58 |         span2 = span_list[ix[1]]
 59 |         assert ((span1.span_start == 19 and span1.span_end == 20 and
 60 |                  span2.span_start == 22 and span2.span_end == 24))
 61 |         # Check that the label's correct.
 62 |         assert label == "USED-FOR"
 63 | 
 64 |     def test_coref_correct_scierc(self):
 65 |         instance = self.dataset.instances[0]
 66 |         coref_field = instance["coref_labels"]
 67 |         spans = instance["spans"]
 68 |         # A list, one entry per sentence. For each sentence, a dict mapping spans to cluster id's.
 69 |         cluster_mappings = [{(6, 6): 1},
 70 |                             {},
 71 |                             {(19, 21): 0},
 72 |                             {(11, 12): 0, (2, 3): 2},
 73 |                             {(3, 5): 0},
 74 |                             {(5, 7): 0, (19, 20): 2, (22, 24): 3},
 75 |                             {(5, 5): 3},
 76 |                             {(2, 2): 1}]
 77 |         for instance, cluster_mapping, span in zip(coref_field, cluster_mappings, spans):
 78 |             curr_coref_field = instance
 79 |             curr_span = span
 80 |             for label, span in zip(curr_coref_field, curr_span):
 81 |                 start, end = span.span_start, span.span_end
 82 |                 if (start, end) in cluster_mapping:
 83 |                     # print(start, end)
 84 |                     # print(label.label)
 85 |                     assert cluster_mapping[(start, end)] == label.label
 86 |                 else:
 87 |                     assert label.label == -1
 88 | 
 89 |     def test_vocab_size_correct_scierc(self):
 90 |         vocab = Vocabulary.from_instances(self.dataset.instances)
 91 |         # There are 4 unique NER labels and 6 relation labels in the text fixture doc. For the ner
 92 |         # labels, there is an extra category for the null label. For the relation labels, there
 93 |         # isn't. This is due to the way their respective `Field`s represent labels.
 94 |         assert vocab.get_vocab_size("None__ner_labels") == 5
 95 |         assert vocab.get_vocab_size("None__relation_labels") == 6
 96 |         # For numeric labels, vocab size is 0.
 97 |         assert vocab.get_vocab_size("coref_labels") == 0
 98 | 
 99 | 
100 | if __name__ == "__main__":
101 |     unittest.main()
102 | 


--------------------------------------------------------------------------------
/dygie/tests/data/spacy_interface_test.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from dygie.spacy_interface.spacy_interface import prepare_spacy_doc
 3 | import spacy
 4 | 
 5 | class TestSpacyInterface(unittest.TestCase):
 6 |     
 7 |     def setUp(self) -> None:
 8 |         nlp = spacy.load('en_core_web_sm')
 9 |         text = "Title: VocGAN: A High-Fidelity Real-time Vocoder with a Hierarchically-nested Adversarial Network\nSection:"
10 |         doc = nlp(text)
11 |         sentences = [[tok.text for tok in sent] for sent in doc.sents]
12 |         self.prediction = {'doc_key': 'test',
13 |                             'dataset': 'scierc',
14 |                             'sentences': sentences,
15 |                             'predicted_ner': [[[2, 2, 'Method', 15.5283, 1.0],
16 |                             [5, 11, 'Method', 3.0847, 0.9563],
17 |                             [6, 11, 'Method', 3.8185, 0.9672],
18 |                             [14, 18, 'Method', 3.4321, 0.9686],
19 |                             [15, 18, 'Method', 11.8431, 1.0],
20 |                             [19, 19, 'Generic', 4.7359, 0.7531]]],
21 |                             'predicted_relations': [[[2, 2, 6, 11, 'HYPONYM-OF', 2.0108, 0.8819],
22 |                             [19, 19, 19, 19, 'USED-FOR', 0.8034, 0.2309]]]}
23 |         self.doc = doc
24 |         return super().setUp()
25 | 
26 |     def test_relation(self):
27 |         doc = prepare_spacy_doc(self.doc, self.prediction)
28 |         # number of sentences
29 |         self.assertEqual(len(doc._.rels),1)
30 |         # number of relations
31 |         self.assertEqual(len(doc._.rels[0]),2)
32 |         # type of relations
33 |         self.assertEqual(doc._.rels[0][0][2], 'HYPONYM-OF')
34 |         self.assertEqual(doc._.rels[0][1][2], 'USED-FOR')
35 |         
36 | 
37 |     def test_span_based_entity(self):
38 |         doc = prepare_spacy_doc(self.doc, self.prediction)
39 |         # number of sentences
40 |         self.assertEqual(len(doc._.span_ents),1)
41 |         # number of span based entities 
42 |         self.assertEqual(len(doc._.span_ents[0]),6)
43 | 
44 |     def test_spacy_entity(self):
45 |         doc = prepare_spacy_doc(self.doc, self.prediction)
46 |         # number of proned merged entities
47 |         self.assertEqual(len(doc.ents),4)
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | if __name__ == '__main__':
56 |     unittest.main()


--------------------------------------------------------------------------------
/dygie/tests/fixtures/ace_event_article.json:
--------------------------------------------------------------------------------
1 | {"events": [[], [[[22, "conflict_attack"]]], [], [], [[[99, "movement_transport"], [97, 98, "vehicle"], [97, 97, "artifact"], [101, 102, "destination"]]], [[[119, "movement_transport"], [114, 114, "vehicle"], [118, 118, "artifact"], [121, 124, "destination"]]], [[[137, "conflict_attack"]]], [], [], [], [], [], [], [], [], [], [[[235, "conflict_attack"], [232, 236, "attacker"]]], [], [], [], [[[273, "conflict_attack"], [270, 271, "target"]]], [], [[[291, "movement_transport"], [288, 289, "artifact"], [293, 293, "destination"]]], [], [], [], [], [], [], [], [], [[[412, "conflict_demonstrate"], [408, 408, "entity"], [409, 410, "time"]], [[415, "conflict_attack"]]], [[[430, "conflict_demonstrate"]]], [], [[[468, "conflict_demonstrate"], [454, 455, "place"], [457, 464, "entity"], [471, 471, "time"]]], [], [[[499, "conflict_demonstrate"], [491, 492, "place"], [496, 496, "time"]]], [], [], [[[559, "conflict_demonstrate"], [563, 567, "time"]]], [[[579, "conflict_demonstrate"], [573, 585, "place"]], [[585, "conflict_attack"], [584, 584, "place"]]], [], [], [], [], [], [], [], [[[693, "conflict_demonstrate"]]], [[[705, "conflict_demonstrate"], [716, 717, "place"]], [[721, "conflict_attack"], [720, 720, "place"]]], [], [], [], [], [], [], [[[834, "life_injure"], [833, 837, "victim"]], [[837, "conflict_attack"], [833, 837, "target"]]], [[[855, "conflict_attack"], [857, 860, "attacker"], [876, 876, "place"], [844, 845, "target"], [848, 852, "target"]]], [], [], [], [], [], [], [], [[[1007, "conflict_attack"]]], [[[1012, "conflict_attack"], [1013, 1015, "target"]]], [], [], [], [[[1063, "conflict_demonstrate"]]], [], [], [], [[[1088, "conflict_demonstrate"]]], [], []], "ner": [[[0, 0, "PER"], [1, 1, "PER"], [3, 3, "PER"]], [[10, 10, "PER"], [28, 31, "PER"], [29, 29, "PER"], [34, 34, "PER"], [37, 37, "PER"]], [[47, 48, "LOC"], [56, 65, "PER"], [63, 65, "VEH"], [64, 64, "VEH"]], [[74, 76, "PER"], [74, 74, "PER"], [81, 85, "GPE"], [82, 82, "GPE"], [85, 85, "GPE"]], [[91, 92, "PER"], [94, 95, "PER"], [94, 94, "ORG"], [97, 97, "PER"], [97, 98, "VEH"], [101, 102, "LOC"], [106, 108, "PER"], [108, 108, "LOC"]], [[114, 114, "VEH"], [118, 118, "PER"], [121, 124, "GPE"], [124, 124, "GPE"]], [[126, 127, "PER"]], [[140, 140, "time"], [141, 141, "PER"]], [[147, 155, "PER"], [148, 148, "GPE"], [151, 151, "PER"], [159, 160, "PER"]], [[168, 170, "ORG"]], [[179, 180, "PER"]], [[183, 186, "PER"], [183, 186, "PER"]], [[193, 194, "PER"], [196, 197, "PER"]], [[208, 210, "GPE"], [209, 209, "GPE"]], [[218, 219, "PER"], [220, 220, "GPE"], [222, 222, "GPE"]], [[224, 224, "GPE"], [226, 226, "GPE"], [228, 228, "GPE"]], [[230, 230, "GPE"], [232, 236, "GPE"], [236, 236, "GPE"]], [[238, 238, "PER"], [241, 241, "PER"], [244, 246, "ORG"], [244, 244, "PER"]], [[248, 249, "PER"], [250, 250, "PER"], [252, 254, "PER"]], [[256, 256, "PER"]], [[270, 271, "PER"]], [[275, 281, "VEH"], [277, 281, "VEH"], [280, 280, "PER"], [280, 281, "VEH"]], [[288, 289, "PER"], [293, 293, "GPE"]], [[295, 295, "PER"]], [], [[321, 321, "GPE"], [324, 327, "LOC"], [327, 327, "GPE"], [337, 339, "PER"], [339, 339, "LOC"], [341, 341, "LOC"], [344, 349, "LOC"]], [[351, 351, "time"], [352, 352, "LOC"], [353, 354, "PER"]], [[358, 359, "PER"]], [[364, 365, "PER"], [367, 368, "ORG"], [370, 372, "GPE"], [371, 372, "LOC"]], [[378, 378, "PER"], [383, 383, "ORG"], [388, 389, "GPE"]], [[393, 393, "PER"], [396, 396, "PER"], [400, 400, "GPE"], [402, 402, "LOC"]], [[405, 405, "ORG"], [408, 408, "PER"], [409, 410, "time"], [420, 421, "PER"]], [[423, 423, "ORG"]], [[432, 432, "GPE"], [434, 436, "GPE"], [437, 438, "time"], [441, 441, "PER"], [443, 444, "time"], [446, 447, "PER"], [450, 451, "FAC"]], [[454, 455, "GPE"], [457, 464, "ORG"], [471, 471, "time"]], [[473, 473, "PER"], [473, 475, "PER"], [478, 481, "PER"]], [[489, 489, "GPE"], [491, 492, "GPE"], [496, 496, "time"], [506, 513, "PER"], [507, 507, "GPE"], [510, 513, "LOC"], [511, 512, "LOC"]], [[515, 515, "ORG"], [515, 518, "PER"], [521, 521, "PER"], [523, 524, "GPE"]], [[526, 526, "PER"], [531, 531, "PER"], [535, 535, "GPE"]], [[537, 538, "PER"], [540, 540, "ORG"], [540, 541, "PER"], [543, 543, "PER"], [545, 545, "PER"], [547, 567, "PER"], [551, 551, "PER"], [557, 558, "PER"], [563, 567, "time"]], [[571, 571, "GPE"], [573, 585, "GPE"], [584, 584, "GPE"], [589, 595, "PER"]], [[597, 597, "PER"], [603, 604, "PER"]], [[606, 606, "PER"], [611, 611, "PER"]], [[617, 618, "PER"], [624, 626, "PER"]], [[628, 628, "PER"], [632, 640, "PER"], [634, 640, "PER"], [634, 638, "ORG"], [634, 635, "GPE"], [637, 637, "ORG"]], [[642, 642, "PER"], [645, 645, "PER"], [647, 647, "PER"]], [[651, 651, "PER"], [653, 653, "PER"], [658, 658, "PER"]], [[667, 667, "PER"], [669, 670, "ORG"], [678, 678, "ORG"]], [], [[696, 696, "PER"], [699, 700, "PER"], [703, 703, "PER"], [709, 709, "PER"], [716, 717, "GPE"], [720, 720, "GPE"]], [], [[730, 730, "PER"], [741, 748, "PER"], [743, 748, "PER"], [750, 751, "PER"], [753, 755, "PER"]], [[757, 757, "PER"], [760, 763, "ORG"]], [[767, 767, "PER"], [771, 772, "PER"]], [[776, 779, "ORG"]], [[782, 787, "PER"], [785, 787, "ORG"], [789, 791, "ORG"], [799, 799, "ORG"], [805, 805, "PER"], [809, 809, "ORG"], [822, 822, "ORG"]], [[824, 824, "PER"], [827, 827, "PER"], [833, 837, "PER"]], [[839, 839, "PER"], [840, 840, "ORG"], [844, 845, "PER"], [848, 852, "PER"], [857, 860, "ORG"], [858, 858, "GPE"], [860, 860, "PER"], [862, 862, "PER"], [866, 867, "GPE"], [872, 873, "GPE"], [876, 876, "GPE"]], [[878, 878, "PER"], [891, 893, "LOC"], [892, 892, "PER"], [895, 897, "GPE"], [896, 896, "GPE"]], [[899, 899, "PER"], [901, 901, "PER"], [903, 903, "PER"], [907, 910, "PER"], [907, 907, "PER"], [910, 910, "GPE"], [914, 914, "PER"], [920, 921, "PER"]], [[923, 923, "PER"], [926, 926, "PER"], [928, 930, "GPE"], [929, 929, "GPE"], [933, 939, "GPE"]], [[942, 942, "PER"], [942, 943, "PER"], [945, 945, "PER"]], [[952, 952, "PER"], [954, 954, "PER"], [957, 958, "FAC"]], [[960, 960, "PER"], [969, 969, "PER"], [979, 980, "FAC"]], [[982, 982, "PER"], [986, 986, "PER"], [989, 989, "PER"], [992, 992, "PER"], [994, 994, "GPE"]], [], [[1013, 1015, "ORG"], [1014, 1014, "GPE"]], [[1017, 1017, "GPE"]], [[1026, 1026, "PER"], [1028, 1028, "PER"], [1030, 1034, "PER"], [1032, 1034, "PER"], [1039, 1040, "PER"]], [[1043, 1043, "PER"], [1045, 1045, "PER"], [1048, 1048, "PER"], [1051, 1051, "PER"]], [], [[1065, 1065, "PER"]], [[1069, 1069, "PER"]], [[1073, 1073, "PER"], [1075, 1075, "PER"]], [[1077, 1077, "PER"], [1078, 1079, "PER"]], [[1094, 1095, "time"], [1105, 1106, "PER"], [1114, 1115, "time"]], [[1117, 1117, "PER"]]], "sentences": [["WOODRUFF", "I", "hope", "they", "get", "a", "little", "rest", "."], ["When", "we", "come", "back", ",", "one", "of", "the", "many", "sad", "aspects", "of", "this", "war", ",", "humanitarian", "aid", "rushed", "to", "people", "who", "need", "it", "but", "not", "everybody", "gets", "what", "they", "came", "for", "."], ["Slowly", "humanitarian", "aid", "is", "rolling", "into", "southern", "Iraq", "but", "dramatic", "scenes", "like", "this", "one", "of", "a", "crowd", "tearing", "into", "the", "supplies", "on", "a", "convoy", "truck", "only", "underscores", "how", "desperate", "the", "need", "is", "."], ["Correspondent", "Martin", "Geissler", "files", "this", "report", "from", "the", "Iraqi", "town", "of", "Safwan", "."], ["-LRB-", "BEGIN", "VIDEOTAPE", "-RRB-", "MARTIN", "GEISSLER", ",", "CNN", "CORRESPONDENT", "As", "our", "convoy", "rolled", "through", "southern", "Iraq", "the", "desperation", "of", "the", "people", "here", "soon", "became", "evident", "."], ["In", "trucks", "and", "on", "foot", "they", "came", "to", "the", "town", "of", "Safwan", "."], ["These", "people", "have", "been", "without", "food", "or", "water", "supplies", "since", "the", "war", "began", "."], ["Now", "they", "are", "desperate", "."], ["Within", "seconds", "the", "Kuwaiti", "aid", "workers", "who", "had", "organized", "this", "trip", "were", "overpowered", "by", "the", "mob", "."], ["These", "desperate", "scenes", "are", "exactly", "what", "the", "aide", "agencies", "wanted", "to", "avoid", "."], ["This", "is", "survival", "of", "the", "fittest", "."], ["Only", "the", "healthy", "and", "strong", "can", "get", "to", "the", "food", "."], ["The", "weak", "and", "the", "ill", "are", "left", "with", "nothing", "."], ["Despite", "this", "effort", "to", "help", "the", "Iraqi", "people", ",", "resentment", "is", "never", "far", "away", "."], ["UNIDENTIFIED", "MALE", "We", "hate", "U.S.", "."], ["We", "hate", "British", ",", "England", "."], ["We", "hate", "any", "state", "in", "war", "here", "."], ["GEISSLER", "What", "do", "you", "think", "about", "Saddam", "'s", "regime", "?"], ["UNIDENTIFIED", "MALE", "Saddam", "'s", "very", "good", "man", "."], ["GEISSLER", "As", "the", "supplies", "ran", "out", "the", "mood", "swung", "from", "frantic", "to", "ugly", "."], ["Delivery", "drivers", "were", "threatened", "."], ["One", "of", "the", "buses", "in", "our", "convoy", "was", "held", "up", "at", "knifepoint", "."], ["The", "troops", "have", "moved", "into", "Safwan", "."], ["We", ",", "as", "a", "consequence", ",", "have", "had", "to", "move", "out", "."], ["It", "'s", "simply", "too", "dangerous", "."], ["This", "is", "a", "clear", "indication", "that", "despite", "the", "coalition", "reassurances", "that", "this", "part", "of", "Iraq", "is", "safe", "and", "despite", "the", "aid", "being", "brought", "into", "the", "people", "here", ",", "it", "is", "still", "a", "very", ",", "very", "volatile", "area", "."], ["Tonight", "here", "the", "strong", "are", "eating", "."], ["The", "weak", "still", "go", "hungry", "."], ["Martin", "Geissler", ",", "ITV", "News", ",", "Safwan", "southern", "Iraq", "."], ["-LRB-", "END", "VIDEOTAPE", "-RRB-", "WOODRUFF", "So", "many", "different", "pictures", "we", "are", "getting", "from", "across", "that", "country", "."], ["Well", ",", "they", "are", "making", "their", "voices", "heard", "at", "home", "and", "abroad", "."], ["When", "we", "return", ",", "Americans", "this", "weekend", "are", "marching", "against", "the", "war", "and", "in", "support", "of", "the", "troops", "."], ["We", "'ll", "check", "out", "some", "of", "the", "demonstrations", "."], ["Here", "in", "the", "United", "States", "this", "weekend", "just", "as", "they", "did", "last", "weekend", ",", "anti-war", "protesters", "taking", "to", "the", "street", "."], ["In", "Los", "Angeles", ",", "the", "International", "Black", "Coalition", "for", "Peace", "and", "Justice", "is", "sponsoring", "a", "rally", "for", "peace", "today", "."], ["Congresswoman", "Maxine", "Waters", "was", "among", "those", "scheduled", "to", "attend", "."], ["In", "the", "meantime", "further", "north", "in", "California", ",", "San", "Francisco", "is", "the", "setting", "today", "for", "a", "rally", "aimed", "at", "boosting", "the", "moral", "of", "the", "American", "troops", "in", "the", "Persian", "Gulf", "region", "."], ["CNN", "'s", "Rusty", "Dornin", "is", "with", "us", "from", "San", "Francisco", "."], ["Rusty", ",", "what", "sort", "of", "crowd", "is", "showing", "up", "there", "?"], ["RUSTY", "DORNIN", ",", "CNN", "CORRESPONDENT", "Well", "Judy", ",", "this", "is", "the", "largest", "group", "really", "we", "'ve", "seen", "of", "the", "support", "the", "troops", "rallies", "that", "have", "been", "over", "the", "last", "few", "weeks", "."], ["Of", "course", "this", "is", "the", "home", "of", "the", "anti-", "war", "demonstrations", "stemming", "back", "to", "the", "Vietnam", "War", "but", "there", "are", "close", "to", "between", "500", "and", "1,000", "people", "."], ["I", "'m", "getting", "various", "estimates", "on", "the", "crowd", "."], ["We", "did", "want", "to", "show", "you", "an", "interesting", "thing", "here", "."], ["Some", "folks", "are", "showing", "some", "solidarity", "with", "the", "speakers", "here", "."], ["You", "'re", "looking", "at", "some", "of", "San", "Francisco", "'s", "police", "department", "'s", "officers", "."], ["I", "did", "ask", "them", "why", "they", "did", "that", "."], ["They", "said", "they", "were", "wearing", "skullcaps", "so", "they", "decided", "to", "adopt", "this", "to", "show", "solidarity", "."], ["They", "said", "the", "department", "has", "not", "made", "any", "statement", "about", "whether", "they", "think", "that", "'s", "all", "right", "or", "not", "."], ["This", "has", "been", "a", "very", "peaceful", "demonstration", "."], ["As", "I", "said", ",", "the", "officers", "did", "tell", "me", "that", "this", "is", "the", "largest", "pro-troops", "demonstration", "that", "has", "ever", "been", "in", "San", "Francisco", "since", "the", "Vietnam", "War", "."], ["So", "far", ",", "very", "peaceful", "."], ["As", "I", "said", ",", "there", "have", "been", "a", "few", "verbal", "exchanges", "but", "one", "of", "the", "most", "enthusiastically", "received", "speakers", "here", "was", "Bessam", "Al-Husaini", ",", "an", "Iraqi", "American", "."], ["He", "'s", "with", "the", "Iraqi", "American", "Council", "."], ["How", "do", "you", "feel", "about", "supporting", "the", "troops", "?"], ["How", "does", "the", "Iraqi", "American", "Council", "feel", "?"], ["BESSAM", "AL-HUSAINI", ",", "IRAQI", "AMERICAN", "COUNCIL", "Well", "the", "Iraqi", "American", "have", "been", "waiting", "for", "this", "liberation", "and", "they", "want", "to", "get", "rid", "of", "Saddam", "so", "bad", "and", "they", "will", "have", "to", "take", "it", "the", "way", "it", "'s", "been", "offered", "to", "us", "."], ["DORNIN", "How", "do", "you", "feel", "though", "seeing", "pictures", "of", "civilians", "injured", "in", "the", "bombings", "?"], ["AL-HUSAINI", "We", "get", "reports", "that", "these", "civilians", "especially", "in", "the", "-LRB-", "INAUDIBLE", "-RRB-", "population", "have", "been", "attacked", "by", "the", "Iraqi", "regime", "themselves", "so", "they", "ca", "n't", "blame", "the", "American", "and", "said", "look", "what", "the", "American", "doing", "to", "us", "."], ["He", "would", "love", "to", "see", "this", "and", "would", "draw", "on", "this", "emotion", "from", "the", "Arab", "world", "and", "the", "European", "country", "."], ["DORNIN", "Now", "you", "said", "you", "did", "talk", "to", "your", "family", "in", "Baghdad", "as", "well", "and", "they", "also", "are", "somewhat", "suspicious", "of", "the", "Americans", "."], ["AL-HUSAINI", "Well", ",", "I", "mean", "the", "Iraqi", "people", "have", "been", "the", "primary", "victim", "from", "the", "whole", "thing", "."], ["Yeah", "my", "family", ",", "you", "know", ",", "still", "in", "fear", "."], ["You", "know", "they", "stay", "in", "one", "room", "."], ["They", "boarded", "all", "the", "-", "all", "the", "windows", "and", "they", "built", "with", "a", "brick", "one", "of", "the", "windows", "facing", "the", "street", "."], ["I", "mean", "yes", ",", "you", "know", ",", "my", "prayer", "to", "them", "but", "we", "can", "only", "do", "so", "much", "."], ["Hopefully", "it", "will", "be", "a", "short", "war", "."], ["It", "will", "be", "attack", "the", "Iraqi", "regime", "."], ["We", "can", "live", "in", "liberty", "and", "freedom", "soon", "."], ["DORNIN", "Now", "I", "understand", "some", "of", "the", "people", "here", "have", "been", "threatened", "by", "anti-war", "protesters", "."], ["Have", "you", "had", "anyone", "either", "threatening", "you", "or", "...", "AL-HUSAINI", "No", ",", "no", "problem", "."], ["This", "has", "been", "a", "very", "peaceful", "demonstration", "."], ["DORNIN", "OK", "."], ["Thank", "you", "very", "much", "."], ["AL-HUSAINI", "Thank", "you", "."], ["DORNIN", "Bessam", "Al-Husaini", "here", "and", "it", "has", "been", "a", "very", "peaceful", "demonstration", "."], ["It", "is", "wrapping", "up", "this", "afternoon", "and", "it", "looks", "like", "the", "civil", "disobedience", "acts", "by", "the", "anti-protesters", "are", "scheduled", "to", "get", "underway", "once", "again", "next", "week", "."], ["Judy", "..."]], "clusters": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "relations": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "doc_key": "CNN_IP_20030329.1600.01-3", "dataset": "ace05_event"}
2 | 


--------------------------------------------------------------------------------
/dygie/tests/fixtures/collate/scierc/dev.json:
--------------------------------------------------------------------------------
 1 | {"clusters": [[[6, 17], [32, 32]], [[4, 4], [55, 55], [91, 91]], [[58, 62], [64, 64], [79, 79]]], "sentences": [["This", "paper", "presents", "an", "algorithm", "for", "computing", "optical", "flow", ",", "shape", ",", "motion", ",", "lighting", ",", "and", "albedo", "from", "an", "image", "sequence", "of", "a", "rigidly-moving", "Lambertian", "object", "under", "distant", "illumination", "."], ["The", "problem", "is", "formulated", "in", "a", "manner", "that", "subsumes", "structure", "from", "motion", ",", "multi-view", "stereo", ",", "and", "photo-metric", "stereo", "as", "special", "cases", "."], ["The", "algorithm", "utilizes", "both", "spatial", "and", "temporal", "intensity", "variation", "as", "cues", ":", "the", "former", "constrains", "flow", "and", "the", "latter", "constrains", "surface", "orientation", ";", "combining", "both", "cues", "enables", "dense", "reconstruction", "of", "both", "textured", "and", "texture-less", "surfaces", "."], ["The", "algorithm", "works", "by", "iteratively", "estimating", "affine", "camera", "parameters", ",", "illumination", ",", "shape", ",", "and", "albedo", "in", "an", "alternating", "fashion", "."], ["Results", "are", "demonstrated", "on", "videos", "of", "hand-held", "objects", "moving", "in", "front", "of", "a", "fixed", "light", "and", "camera", "."]], "ner": [[[4, 4, "Generic"], [6, 17, "Task"], [20, 21, "Material"], [24, 26, "Material"], [28, 29, "OtherScientificTerm"]], [[32, 32, "Generic"], [42, 42, "Material"], [44, 45, "Material"], [48, 49, "Material"]], [[55, 55, "Generic"], [58, 62, "OtherScientificTerm"], [64, 64, "Generic"], [67, 67, "Generic"], [69, 69, "OtherScientificTerm"], [72, 72, "Generic"], [74, 75, "OtherScientificTerm"], [79, 79, "Generic"], [81, 88, "Task"]], [[91, 91, "Generic"], [95, 105, "Method"]], [[115, 118, "Material"]]], "relations": [[[4, 4, 6, 17, "USED-FOR"], [20, 21, 4, 4, "USED-FOR"], [24, 26, 20, 21, "FEATURE-OF"], [28, 29, 24, 26, "FEATURE-OF"]], [[42, 42, 44, 45, "CONJUNCTION"], [44, 45, 48, 49, "CONJUNCTION"]], [[58, 62, 55, 55, "USED-FOR"], [67, 67, 64, 64, "HYPONYM-OF"], [67, 67, 69, 69, "USED-FOR"], [67, 67, 72, 72, "CONJUNCTION"], [72, 72, 64, 64, "HYPONYM-OF"], [72, 72, 74, 75, "USED-FOR"], [79, 79, 81, 88, "USED-FOR"]], [[95, 105, 91, 91, "USED-FOR"]], []], "doc_key": "ICCV_2003_158_abs"}
 2 | {"clusters": [[[90, 91], [107, 107]]], "sentences": [["Past", "work", "of", "generating", "referring", "expressions", "mainly", "utilized", "attributes", "of", "objects", "and", "binary", "relations", "between", "objects", "."], ["However", ",", "such", "an", "approach", "does", "not", "work", "well", "when", "there", "is", "no", "distinctive", "attribute", "among", "objects", "."], ["To", "overcome", "this", "limitation", ",", "this", "paper", "proposes", "a", "method", "utilizing", "the", "perceptual", "groups", "of", "objects", "and", "n-ary", "relations", "among", "them", "."], ["The", "key", "is", "to", "identify", "groups", "of", "objects", "that", "are", "naturally", "recognized", "by", "humans", "."], ["We", "conducted", "psychological", "experiments", "with", "42", "subjects", "to", "collect", "referring", "expressions", "in", "such", "situations", ",", "and", "built", "a", "generation", "algorithm", "based", "on", "the", "results", "."], ["The", "evaluation", "using", "another", "23", "subjects", "showed", "that", "the", "proposed", "method", "could", "effectively", "generate", "proper", "referring", "expressions", "."]], "ner": [[[4, 5, "OtherScientificTerm"], [12, 13, "OtherScientificTerm"]], [], [[52, 53, "OtherScientificTerm"]], [], [[81, 82, "OtherScientificTerm"], [90, 91, "Method"]], [[107, 107, "Generic"], [112, 113, "OtherScientificTerm"]]], "relations": [[], [], [], [], [], []], "doc_key": "C04-1096"}
 3 | {"clusters": [[[32, 32], [44, 44]], [[1, 2], [11, 11]], [[103, 104], [125, 126]], [[95, 96], [108, 109], [121, 122]]], "sentences": [["An", "entity-oriented", "approach", "to", "restricted-domain", "parsing", "is", "proposed", "."], ["In", "this", "approach", ",", "the", "definitions", "of", "the", "structure", "and", "surface", "representation", "of", "domain", "entities", "are", "grouped", "together", "."], ["Like", "semantic", "grammar", ",", "this", "allows", "easy", "exploitation", "of", "limited", "domain", "semantics", "."], ["In", "addition", ",", "it", "facilitates", "fragmentary", "recognition", "and", "the", "use", "of", "multiple", "parsing", "strategies", ",", "and", "so", "is", "particularly", "useful", "for", "robust", "recognition", "of", "extra-grammatical", "input", "."], ["Several", "advantages", "from", "the", "point", "of", "view", "of", "language", "definition", "are", "also", "noted", "."], ["Representative", "samples", "from", "an", "entity-oriented", "language", "definition", "are", "presented", ",", "along", "with", "a", "control", "structure", "for", "an", "entity-oriented", "parser", ",", "some", "parsing", "strategies", "that", "use", "the", "control", "structure", ",", "and", "worked", "examples", "of", "parses", "."], ["A", "parser", "incorporating", "the", "control", "structure", "and", "the", "parsing", "strategies", "is", "currently", "under", "implementation", "."]], "ner": [[[1, 2, "Method"], [4, 5, "Task"]], [[11, 11, "Generic"], [17, 23, "OtherScientificTerm"]], [[29, 30, "Method"], [32, 32, "Generic"], [37, 39, "OtherScientificTerm"]], [[44, 44, "Generic"], [46, 47, "Task"], [52, 54, "Method"], [63, 66, "OtherScientificTerm"]], [], [[86, 88, "OtherScientificTerm"], [95, 96, "OtherScientificTerm"], [99, 100, "Method"], [103, 104, "Method"], [108, 109, "OtherScientificTerm"]], [[118, 118, "Method"], [121, 122, "OtherScientificTerm"], [125, 126, "Method"]]], "relations": [[[1, 2, 4, 5, "USED-FOR"]], [], [[32, 32, 37, 39, "USED-FOR"]], [[44, 44, 46, 47, "USED-FOR"], [44, 44, 52, 54, "USED-FOR"], [52, 54, 63, 66, "USED-FOR"]], [], [[95, 96, 99, 100, "USED-FOR"], [108, 109, 103, 104, "USED-FOR"]], [[121, 122, 118, 118, "PART-OF"]]], "doc_key": "P84-1047"}
 4 | {"clusters": [[[6, 11], [21, 21], [53, 53]], [[15, 16], [69, 69], [94, 94]], [[4, 11], [82, 83]]], "sentences": [["This", "paper", "summarizes", "the", "formalism", "of", "Category", "Cooccurrence", "Restrictions", "-LRB-", "CCRs", "-RRB-", "and", "describes", "two", "parsing", "algorithms", "that", "interpret", "it", "."], ["CCRs", "are", "Boolean", "conditions", "on", "the", "cooccurrence", "of", "categories", "in", "local", "trees", "which", "allow", "the", "statement", "of", "generalizations", "which", "can", "not", "be", "captured", "in", "other", "current", "syntax", "formalisms", "."], ["The", "use", "of", "CCRs", "leads", "to", "syntactic", "descriptions", "formulated", "entirely", "with", "restrictive", "statements", "."], ["The", "paper", "shows", "how", "conventional", "algorithms", "for", "the", "analysis", "of", "context", "free", "languages", "can", "be", "adapted", "to", "the", "CCR", "formalism", "."], ["Special", "attention", "is", "given", "to", "the", "part", "of", "the", "parser", "that", "checks", "the", "fulfillment", "of", "logical", "well-formedness", "conditions", "on", "trees", "."]], "ner": [[[4, 11, "Task"], [6, 11, "OtherScientificTerm"], [15, 16, "Method"], [19, 19, "Generic"]], [[21, 21, "OtherScientificTerm"], [23, 24, "OtherScientificTerm"], [31, 32, "OtherScientificTerm"], [36, 38, "OtherScientificTerm"], [47, 48, "Method"]], [[53, 53, "OtherScientificTerm"], [56, 57, "OtherScientificTerm"], [61, 62, "OtherScientificTerm"]], [[69, 69, "Generic"], [74, 76, "Material"], [82, 83, "Task"]], [[94, 94, "Method"], [100, 102, "OtherScientificTerm"], [104, 104, "OtherScientificTerm"]]], "relations": [[[15, 16, 19, 19, "USED-FOR"]], [], [[61, 62, 56, 57, "FEATURE-OF"]], [[69, 69, 82, 83, "USED-FOR"], [74, 76, 69, 69, "USED-FOR"]], [[100, 102, 104, 104, "FEATURE-OF"]]], "doc_key": "C88-1066"}
 5 | {"clusters": [[[34, 36], [99, 101]], [[3, 5], [27, 27], [48, 48], [93, 93], [106, 106]]], "sentences": [["We", "present", "a", "text", "mining", "method", "for", "finding", "synonymous", "expressions", "based", "on", "the", "distributional", "hypothesis", "in", "a", "set", "of", "coherent", "corpora", "."], ["This", "paper", "proposes", "a", "new", "methodology", "to", "improve", "the", "accuracy", "of", "a", "term", "aggregation", "system", "using", "each", "author", "'s", "text", "as", "a", "coherent", "corpus", "."], ["Our", "approach", "is", "based", "on", "the", "idea", "that", "one", "person", "tends", "to", "use", "one", "expression", "for", "one", "meaning", "."], ["According", "to", "our", "assumption", ",", "most", "of", "the", "words", "with", "similar", "context", "features", "in", "each", "author", "'s", "corpus", "tend", "not", "to", "be", "synonymous", "expressions", "."], ["Our", "proposed", "method", "improves", "the", "accuracy", "of", "our", "term", "aggregation", "system", ",", "showing", "that", "our", "approach", "is", "successful", "."]], "ner": [[[3, 5, "Method"], [8, 9, "OtherScientificTerm"], [13, 14, "OtherScientificTerm"]], [[27, 27, "Generic"], [31, 31, "Metric"], [34, 36, "Method"]], [[48, 48, "Generic"]], [[76, 78, "OtherScientificTerm"], [88, 89, "OtherScientificTerm"]], [[93, 93, "Generic"], [96, 96, "Metric"], [99, 101, "Method"], [106, 106, "Generic"]]], "relations": [[[3, 5, 8, 9, "USED-FOR"], [13, 14, 3, 5, "USED-FOR"]], [[31, 31, 34, 36, "EVALUATE-FOR"], [34, 36, 27, 27, "EVALUATE-FOR"]], [], [], [[96, 96, 99, 101, "EVALUATE-FOR"], [99, 101, 93, 93, "EVALUATE-FOR"]]], "doc_key": "C04-1116"}
 6 | {"clusters": [[[28, 31], [68, 70], [96, 96], [123, 123]], [[78, 79], [108, 108]], [[42, 44], [48, 48]]], "sentences": [["In", "this", "work", ",", "we", "present", "a", "technique", "for", "robust", "estimation", ",", "which", "by", "explicitly", "incorporating", "the", "inherent", "uncertainty", "of", "the", "estimation", "procedure", ",", "results", "in", "a", "more", "efficient", "robust", "estimation", "algorithm", "."], ["In", "addition", ",", "we", "build", "on", "recent", "work", "in", "randomized", "model", "verification", ",", "and", "use", "this", "to", "characterize", "the", "`", "non-randomness", "'", "of", "a", "solution", "."], ["The", "combination", "of", "these", "two", "strategies", "results", "in", "a", "robust", "estimation", "procedure", "that", "provides", "a", "significant", "speed-up", "over", "existing", "RANSAC", "techniques", ",", "while", "requiring", "no", "prior", "information", "to", "guide", "the", "sampling", "process", "."], ["In", "particular", ",", "our", "algorithm", "requires", ",", "on", "average", ",", "3-10", "times", "fewer", "samples", "than", "standard", "RANSAC", ",", "which", "is", "in", "close", "agreement", "with", "theoretical", "predictions", "."], ["The", "efficiency", "of", "the", "algorithm", "is", "demonstrated", "on", "a", "selection", "of", "geometric", "estimation", "problems", "."]], "ner": [[[7, 7, "Generic"], [9, 10, "Task"], [17, 22, "OtherScientificTerm"], [28, 31, "Method"]], [[42, 44, "Task"], [48, 48, "Generic"]], [[64, 64, "Generic"], [68, 70, "Method"], [78, 79, "Method"], [84, 85, "OtherScientificTerm"], [89, 90, "OtherScientificTerm"]], [[96, 96, "Generic"], [108, 108, "Method"], [116, 117, "OtherScientificTerm"]], [[123, 123, "Generic"], [130, 132, "Task"]]], "relations": [[[7, 7, 9, 10, "USED-FOR"], [7, 7, 28, 31, "USED-FOR"], [17, 22, 7, 7, "USED-FOR"]], [], [[64, 64, 68, 70, "USED-FOR"], [78, 79, 68, 70, "COMPARE"]], [[96, 96, 108, 108, "COMPARE"]], [[130, 132, 123, 123, "EVALUATE-FOR"]]], "doc_key": "ICCV_2009_47_abs"}
 7 | {"clusters": [[[58, 59], [71, 72], [94, 95]], [[8, 10], [22, 22], [33, 33], [50, 50], [63, 63], [83, 83]], [[40, 41], [67, 68]]], "sentences": [["An", "attempt", "has", "been", "made", "to", "use", "an", "Augmented", "Transition", "Network", "as", "a", "procedural", "dialog", "model", "."], ["The", "development", "of", "such", "a", "model", "appears", "to", "be", "important", "in", "several", "respects", ":", "as", "a", "device", "to", "represent", "and", "to", "use", "different", "dialog", "schemata", "proposed", "in", "empirical", "conversation", "analysis", ";", "as", "a", "device", "to", "represent", "and", "to", "use", "models", "of", "verbal", "interaction", ";", "as", "a", "device", "combining", "knowledge", "about", "dialog", "schemata", "and", "about", "verbal", "interaction", "with", "knowledge", "about", "task-oriented", "and", "goal-directed", "dialogs", "."], ["A", "standard", "ATN", "should", "be", "further", "developed", "in", "order", "to", "account", "for", "the", "verbal", "interactions", "of", "task-oriented", "dialogs", "."]], "ner": [[[8, 10, "Method"], [14, 15, "Method"]], [[22, 22, "Generic"], [33, 33, "Generic"], [40, 41, "OtherScientificTerm"], [45, 46, "Method"], [50, 50, "Generic"], [56, 56, "Generic"], [58, 59, "OtherScientificTerm"], [63, 63, "Generic"], [67, 68, "OtherScientificTerm"], [71, 72, "OtherScientificTerm"], [76, 79, "Material"]], [[83, 83, "Method"], [94, 95, "OtherScientificTerm"], [97, 98, "Material"]]], "relations": [[[8, 10, 14, 15, "HYPONYM-OF"]], [[40, 41, 33, 33, "USED-FOR"], [40, 41, 45, 46, "USED-FOR"], [56, 56, 50, 50, "USED-FOR"], [56, 56, 58, 59, "USED-FOR"], [67, 68, 71, 72, "CONJUNCTION"]], [[83, 83, 94, 95, "USED-FOR"], [94, 95, 97, 98, "FEATURE-OF"]]], "doc_key": "C80-1073"}
 8 | {"clusters": [[[4, 6], [25, 25], [65, 65], [70, 70], [88, 88]], [[20, 22], [91, 92]], [[15, 17], [77, 77]]], "sentences": [["We", "present", "a", "practically", "unsupervised", "learning", "method", "to", "produce", "single-snippet", "answers", "to", "definition", "questions", "in", "question", "answering", "systems", "that", "supplement", "Web", "search", "engines", "."], ["The", "method", "exploits", "on-line", "encyclopedias", "and", "dictionaries", "to", "generate", "automatically", "an", "arbitrarily", "large", "number", "of", "positive", "and", "negative", "definition", "examples", ",", "which", "are", "then", "used", "to", "train", "an", "svm", "to", "separate", "the", "two", "classes", "."], ["We", "show", "experimentally", "that", "the", "proposed", "method", "is", "viable", ",", "that", "it", "outperforms", "the", "alternative", "of", "training", "the", "system", "on", "questions", "and", "news", "articles", "from", "trec", ",", "and", "that", "it", "helps", "the", "search", "engine", "handle", "definition", "questions", "significantly", "better", "."]], "ner": [[[4, 6, "Method"], [9, 10, "OtherScientificTerm"], [15, 17, "Method"], [20, 22, "Method"]], [[25, 25, "Generic"], [27, 30, "Material"], [39, 43, "Material"], [52, 52, "Method"]], [[65, 65, "Generic"], [70, 70, "Generic"], [73, 73, "Generic"], [77, 77, "Generic"], [81, 82, "Material"], [84, 84, "Material"], [88, 88, "Generic"], [91, 92, "Method"]]], "relations": [[[4, 6, 9, 10, "USED-FOR"], [15, 17, 20, 22, "USED-FOR"]], [[25, 25, 27, 30, "USED-FOR"], [27, 30, 39, 43, "USED-FOR"], [39, 43, 52, 52, "USED-FOR"]], [[70, 70, 73, 73, "COMPARE"], [81, 82, 77, 77, "USED-FOR"], [81, 82, 84, 84, "PART-OF"], [88, 88, 91, 92, "USED-FOR"]]], "doc_key": "H05-1041"}
 9 | {"clusters": [], "sentences": [["We", "revisit", "the", "classical", "decision-theoretic", "problem", "of", "weighted", "expert", "voting", "from", "a", "statistical", "learning", "perspective", "."], ["In", "particular", ",", "we", "examine", "the", "consistency", "-LRB-", "both", "asymptotic", "and", "finitary", "-RRB-", "of", "the", "optimal", "Nitzan-Paroush", "weighted", "majority", "and", "related", "rules", "."], ["In", "the", "case", "of", "known", "expert", "competence", "levels", ",", "we", "give", "sharp", "error", "estimates", "for", "the", "optimal", "rule", "."], ["When", "the", "competence", "levels", "are", "unknown", ",", "they", "must", "be", "empirically", "estimated", "."], ["We", "provide", "frequentist", "and", "Bayesian", "analyses", "for", "this", "situation", "."], ["Some", "of", "our", "proof", "techniques", "are", "non-standard", "and", "may", "be", "of", "independent", "interest", "."], ["The", "bounds", "we", "derive", "are", "nearly", "optimal", ",", "and", "several", "challenging", "open", "problems", "are", "posed", "."], ["Experimental", "results", "are", "provided", "to", "illustrate", "the", "theory", "."]], "ner": [[[3, 9, "Task"], [12, 14, "Method"]], [[32, 34, "OtherScientificTerm"]], [[44, 46, "OtherScientificTerm"], [50, 52, "Method"], [55, 56, "OtherScientificTerm"]], [[60, 61, "OtherScientificTerm"]], [[75, 76, "Method"]], [], [], []], "relations": [[[12, 14, 3, 9, "USED-FOR"]], [], [[50, 52, 55, 56, "USED-FOR"]], [], [], [], [], []], "doc_key": "NIPS_2014_18_abs"}
10 | {"clusters": [[[51, 57], [74, 74]], [[7, 8], [70, 71]], [[3, 8], [136, 138]]], "sentences": [["We", "analyze", "a", "reweighted", "version", "of", "the", "Kikuchi", "approximation", "for", "estimating", "the", "log", "partition", "function", "of", "a", "product", "distribution", "defined", "over", "a", "region", "graph", "."], ["We", "establish", "sufficient", "conditions", "for", "the", "concavity", "of", "our", "reweighted", "objective", "function", "in", "terms", "of", "weight", "assignments", "in", "the", "Kikuchi", "expansion", ",", "and", "show", "that", "a", "reweighted", "version", "of", "the", "sum", "product", "algorithm", "applied", "to", "the", "Kikuchi", "region", "graph", "will", "produce", "global", "optima", "of", "the", "Kikuchi", "approximation", "whenever", "the", "algorithm", "converges", "."], ["When", "the", "region", "graph", "has", "two", "layers", ",", "corresponding", "to", "a", "Bethe", "approximation", ",", "we", "show", "that", "our", "sufficient", "conditions", "for", "concavity", "are", "also", "necessary", "."], ["Finally", ",", "we", "provide", "an", "explicit", "characterization", "of", "the", "polytope", "of", "concavity", "in", "terms", "of", "the", "cycle", "structure", "of", "the", "region", "graph", "."], ["We", "conclude", "with", "simulations", "that", "demonstrate", "the", "advantages", "of", "the", "reweighted", "Kikuchi", "approach", "."]], "ner": [[[3, 8, "Method"], [7, 8, "Method"], [12, 18, "Task"], [22, 23, "OtherScientificTerm"]], [[31, 31, "OtherScientificTerm"], [34, 36, "OtherScientificTerm"], [40, 41, "OtherScientificTerm"], [44, 45, "OtherScientificTerm"], [51, 57, "Method"], [61, 63, "OtherScientificTerm"], [66, 67, "OtherScientificTerm"], [70, 71, "Method"], [74, 74, "Generic"]], [[79, 80, "OtherScientificTerm"], [88, 89, "Method"], [98, 98, "OtherScientificTerm"]], [[114, 114, "OtherScientificTerm"], [119, 120, "OtherScientificTerm"], [123, 124, "OtherScientificTerm"]], [[136, 138, "Method"]]], "relations": [[[3, 8, 12, 18, "USED-FOR"], [12, 18, 22, 23, "FEATURE-OF"]], [[31, 31, 34, 36, "FEATURE-OF"], [51, 57, 61, 63, "USED-FOR"], [66, 67, 70, 71, "FEATURE-OF"]], [], [[119, 120, 123, 124, "FEATURE-OF"]], []], "doc_key": "NIPS_2014_10_abs"}
11 | 


--------------------------------------------------------------------------------
/dygie/tests/fixtures/dygie_test.jsonnet:
--------------------------------------------------------------------------------
  1 | // Quick test that doesn't load in any data.
  2 | 
  3 | // Primary prediction target. Watch metrics associated with this target.
  4 | local target = "rel";
  5 | 
  6 | // Specifies the token-level features that will be created.
  7 | local use_glove = true;
  8 | local use_char = true;
  9 | local use_elmo = false;
 10 | local use_attentive_span_extractor = true;
 11 | 
 12 | // Specifies the model parameters.
 13 | local lstm_hidden_size = 200;
 14 | local lstm_n_layers = 1;
 15 | local feature_size = 10;
 16 | local feedforward_layers = 2;
 17 | local char_n_filters = 50;
 18 | local feedforward_dim = 150;
 19 | local max_span_width = 8;
 20 | local feedforward_dropout = 0.2;
 21 | local lexical_dropout = 0.5;
 22 | local lstm_dropout = 0.4;
 23 | local loss_weights = {
 24 |   "ner": 1.0,
 25 |   "relation": 1.0,
 26 |   "coref": 1.0
 27 | };
 28 | 
 29 | // Coref settings.
 30 | local coref_spans_per_word = 0.4;
 31 | local coref_max_antecedents = 100;
 32 | 
 33 | // Relation settings.
 34 | local relation_spans_per_word = 0.4;
 35 | local relation_positive_label_weight = 1.0;
 36 | 
 37 | // Model training
 38 | local num_epochs = 250;
 39 | local patience = 25;
 40 | local learning_rate_scheduler = {
 41 |   "type": "reduce_on_plateau",
 42 |   "factor": 0.5,
 43 |   "mode": "max",
 44 |   "patience": 5
 45 | };
 46 | local learning_rate = 0.001;
 47 | 
 48 | 
 49 | ////////////////////////////////////////////////////////////////////////////////
 50 | 
 51 | // Nothing below this line needs to change.
 52 | 
 53 | 
 54 | // Storing constants.
 55 | 
 56 | local validation_metrics = {
 57 |   "ner": "+ner_f1",
 58 |   "rel": "+rel_f1",
 59 |   "coref": "+coref_f1"
 60 | };
 61 | 
 62 | local display_metrics = {
 63 |   "ner": ["ner_precision", "ner_recall", "ner_f1"],
 64 |   "rel": ["rel_precision", "rel_recall", "rel_f1", "rel_span_recall"],
 65 |   "coref": ["coref_precision", "coref_recall", "coref_f1", "coref_mention_recall"]
 66 | };
 67 | 
 68 | local glove_dim = 300;
 69 | local elmo_dim = 1024;
 70 | 
 71 | local module_initializer = [
 72 |   [".*linear_layers.*weight", {"type": "xavier_normal"}],
 73 |   [".*scorer._module.weight", {"type": "xavier_normal"}],
 74 |   ["_distance_embedding.weight", {"type": "xavier_normal"}]];
 75 | 
 76 | local dygie_initializer = [
 77 |   ["_span_width_embedding.weight", {"type": "xavier_normal"}],
 78 |   ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
 79 |   ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
 80 | ];
 81 | 
 82 | 
 83 | ////////////////////////////////////////////////////////////////////////////////
 84 | 
 85 | // Calculating dimensions.
 86 | 
 87 | local token_embedding_dim = ((if use_glove then glove_dim else 0) +
 88 |   (if use_char then char_n_filters else 0) +
 89 |   (if use_elmo then elmo_dim else 0));
 90 | local endpoint_span_emb_dim = 4 * lstm_hidden_size + feature_size;
 91 | local attended_span_emb_dim = if use_attentive_span_extractor then token_embedding_dim else 0;
 92 | local span_emb_dim = endpoint_span_emb_dim + attended_span_emb_dim;
 93 | local pair_emb_dim = 3 * span_emb_dim;
 94 | local relation_scorer_dim = pair_emb_dim;
 95 | local coref_scorer_dim = pair_emb_dim + feature_size;
 96 | 
 97 | ////////////////////////////////////////////////////////////////////////////////
 98 | 
 99 | // Function definitions
100 | 
101 | local make_feedforward(input_dim) = {
102 |   "input_dim": input_dim,
103 |   "num_layers": feedforward_layers,
104 |   "hidden_dims": feedforward_dim,
105 |   "activations": "relu",
106 |   "dropout": feedforward_dropout
107 | };
108 | 
109 | // Model components
110 | 
111 | local token_indexers = {
112 |   [if use_glove then "tokens"]: {
113 |     "type": "single_id",
114 |     "lowercase_tokens": false
115 |   },
116 |   [if use_char then "token_characters"]: {
117 |     "type": "characters",
118 |     "min_padding_length": 5
119 |   },
120 |   [if use_elmo then "elmo"]: {
121 |     "type": "elmo_characters"
122 |   }
123 | };
124 | 
125 | local text_field_embedder = {
126 |   "token_embedders": {
127 |     [if use_glove then "tokens"]: {
128 |       "type": "embedding",
129 |       // "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.300d.txt.gz",
130 |       "embedding_dim": 300,
131 |       "trainable": false
132 |     },
133 |     [if use_char then "token_characters"]: {
134 |       "type": "character_encoding",
135 |       "embedding": {
136 |         "num_embeddings": 262,
137 |         "embedding_dim": 16
138 |       },
139 |       "encoder": {
140 |         "type": "cnn",
141 |         "embedding_dim": 16,
142 |         "num_filters": char_n_filters,
143 |         "ngram_filter_sizes": [5]
144 |       }
145 |     },
146 |     [if use_elmo then "elmo"]: {
147 |       "type": "elmo_token_embedder",
148 |       "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
149 |       "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
150 |       "do_layer_norm": false,
151 |       "dropout": 0.5
152 |     }
153 |   }
154 | };
155 | 
156 | 
157 | ////////////////////////////////////////////////////////////////////////////////
158 | 
159 | // The model
160 | 
161 | {
162 |   "dataset_reader": {
163 |     "type": "ie_json",
164 |     "token_indexers": token_indexers,
165 |     "max_span_width": max_span_width
166 |   },
167 |   "train_data_path": "tests/fixtures/scierc_article.json",
168 |   "validation_data_path": "tests/fixtures/scierc_article.json",
169 |   "model": {
170 |     "type": "dygie",
171 |     "text_field_embedder": text_field_embedder,
172 |     "initializer": dygie_initializer,
173 |     "loss_weights": loss_weights,
174 |     "lexical_dropout": lexical_dropout,
175 |     "feature_size": feature_size,
176 |     "use_attentive_span_extractor": use_attentive_span_extractor,
177 |     "max_span_width": max_span_width,
178 |     "display_metrics": display_metrics[target],
179 |     "context_layer": {
180 |       "type": "lstm",
181 |       "bidirectional": true,
182 |       "input_size": token_embedding_dim,
183 |       "hidden_size": lstm_hidden_size,
184 |       "num_layers": lstm_n_layers,
185 |       "dropout": lstm_dropout
186 |     },
187 |     "modules": {
188 |       "coref": {
189 |         "spans_per_word": coref_spans_per_word,
190 |         "max_antecedents": coref_max_antecedents,
191 |         "mention_feedforward": make_feedforward(span_emb_dim),
192 |         "antecedent_feedforward": make_feedforward(coref_scorer_dim),
193 |         "initializer": module_initializer
194 |       },
195 |       "ner": {
196 |         "mention_feedforward": make_feedforward(span_emb_dim),
197 |         "initializer": module_initializer
198 |       },
199 |       "relation": {
200 |         "spans_per_word": relation_spans_per_word,
201 |         "positive_label_weight": relation_positive_label_weight,
202 |         "mention_feedforward": make_feedforward(span_emb_dim),
203 |         "relation_feedforward": make_feedforward(relation_scorer_dim),
204 |         "initializer": module_initializer,
205 |       },
206 |     },
207 |   },
208 |   "iterator": {
209 |     "type": "ie_batch",
210 |     "batch_size": 10
211 |   },
212 |   "validation_iterator": {
213 |     "type": "ie_document",
214 |   },
215 |   "trainer": {
216 |     "num_epochs": num_epochs,
217 |     "grad_norm": 5.0,
218 |     "patience" : patience,
219 |     "cuda_device" : -1,
220 |     "validation_metric": validation_metrics[target],
221 |     "learning_rate_scheduler": learning_rate_scheduler,
222 |     "optimizer": {
223 |       "type": "adam",
224 |       "lr": learning_rate,
225 |     },
226 |   }
227 | }
228 | 


--------------------------------------------------------------------------------
/dygie/tests/fixtures/dygie_test_full.jsonnet:
--------------------------------------------------------------------------------
  1 | // Full end-to-end test, with all components turned on.
  2 | 
  3 | // Primary prediction target. Watch metrics associated with this target.
  4 | local target = "rel";
  5 | 
  6 | // Specifies the token-level features that will be created.
  7 | local use_glove = true;
  8 | local use_char = true;
  9 | local use_elmo = true;
 10 | local use_attentive_span_extractor = true;
 11 | 
 12 | // Specifies the model parameters.
 13 | local lstm_hidden_size = 200;
 14 | local lstm_n_layers = 1;
 15 | local feature_size = 10;
 16 | local feedforward_layers = 2;
 17 | local char_n_filters = 50;
 18 | local feedforward_dim = 150;
 19 | local max_span_width = 8;
 20 | local feedforward_dropout = 0.2;
 21 | local lexical_dropout = 0.5;
 22 | local lstm_dropout = 0.4;
 23 | local loss_weights = {
 24 |   "ner": 1.0,
 25 |   "relation": 1.0,
 26 |   "coref": 1.0
 27 | };
 28 | 
 29 | // Coref settings.
 30 | local coref_spans_per_word = 0.4;
 31 | local coref_max_antecedents = 100;
 32 | 
 33 | // Relation settings.
 34 | local relation_spans_per_word = 0.4;
 35 | local relation_positive_label_weight = 1.0;
 36 | 
 37 | // Model training
 38 | local num_epochs = 250;
 39 | local patience = 25;
 40 | local learning_rate_scheduler = {
 41 |   "type": "reduce_on_plateau",
 42 |   "factor": 0.5,
 43 |   "mode": "max",
 44 |   "patience": 5
 45 | };
 46 | local learning_rate = 0.001;
 47 | 
 48 | 
 49 | ////////////////////////////////////////////////////////////////////////////////
 50 | 
 51 | // Nothing below this line needs to change.
 52 | 
 53 | 
 54 | // Storing constants.
 55 | 
 56 | local validation_metrics = {
 57 |   "ner": "+ner_f1",
 58 |   "rel": "+rel_f1",
 59 |   "coref": "+coref_f1"
 60 | };
 61 | 
 62 | local display_metrics = {
 63 |   "ner": ["ner_precision", "ner_recall", "ner_f1"],
 64 |   "rel": ["rel_precision", "rel_recall", "rel_f1", "rel_span_recall"],
 65 |   "coref": ["coref_precision", "coref_recall", "coref_f1", "coref_mention_recall"]
 66 | };
 67 | 
 68 | local glove_dim = 300;
 69 | local elmo_dim = 1024;
 70 | 
 71 | local module_initializer = [
 72 |   [".*linear_layers.*weight", {"type": "xavier_normal"}],
 73 |   [".*scorer._module.weight", {"type": "xavier_normal"}],
 74 |   ["_distance_embedding.weight", {"type": "xavier_normal"}]];
 75 | 
 76 | local dygie_initializer = [
 77 |   ["_span_width_embedding.weight", {"type": "xavier_normal"}],
 78 |   ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}],
 79 |   ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}]
 80 | ];
 81 | 
 82 | 
 83 | ////////////////////////////////////////////////////////////////////////////////
 84 | 
 85 | // Calculating dimensions.
 86 | 
 87 | local token_embedding_dim = ((if use_glove then glove_dim else 0) +
 88 |   (if use_char then char_n_filters else 0) +
 89 |   (if use_elmo then elmo_dim else 0));
 90 | local endpoint_span_emb_dim = 4 * lstm_hidden_size + feature_size;
 91 | local attended_span_emb_dim = if use_attentive_span_extractor then token_embedding_dim else 0;
 92 | local span_emb_dim = endpoint_span_emb_dim + attended_span_emb_dim;
 93 | local pair_emb_dim = 3 * span_emb_dim;
 94 | local relation_scorer_dim = pair_emb_dim;
 95 | local coref_scorer_dim = pair_emb_dim + feature_size;
 96 | 
 97 | ////////////////////////////////////////////////////////////////////////////////
 98 | 
 99 | // Function definitions
100 | 
101 | local make_feedforward(input_dim) = {
102 |   "input_dim": input_dim,
103 |   "num_layers": feedforward_layers,
104 |   "hidden_dims": feedforward_dim,
105 |   "activations": "relu",
106 |   "dropout": feedforward_dropout
107 | };
108 | 
109 | // Model components
110 | 
111 | local token_indexers = {
112 |   [if use_glove then "tokens"]: {
113 |     "type": "single_id",
114 |     "lowercase_tokens": false
115 |   },
116 |   [if use_char then "token_characters"]: {
117 |     "type": "characters",
118 |     "min_padding_length": 5
119 |   },
120 |   [if use_elmo then "elmo"]: {
121 |     "type": "elmo_characters"
122 |   }
123 | };
124 | 
125 | local text_field_embedder = {
126 |   "token_embedders": {
127 |     [if use_glove then "tokens"]: {
128 |       "type": "embedding",
129 |       "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.300d.txt.gz",
130 |       "embedding_dim": 300,
131 |       "trainable": false
132 |     },
133 |     [if use_char then "token_characters"]: {
134 |       "type": "character_encoding",
135 |       "embedding": {
136 |         "num_embeddings": 262,
137 |         "embedding_dim": 16
138 |       },
139 |       "encoder": {
140 |         "type": "cnn",
141 |         "embedding_dim": 16,
142 |         "num_filters": char_n_filters,
143 |         "ngram_filter_sizes": [5]
144 |       }
145 |     },
146 |     [if use_elmo then "elmo"]: {
147 |       "type": "elmo_token_embedder",
148 |       "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
149 |       "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
150 |       "do_layer_norm": false,
151 |       "dropout": 0.5
152 |     }
153 |   }
154 | };
155 | 
156 | 
157 | ////////////////////////////////////////////////////////////////////////////////
158 | 
159 | // The model
160 | 
161 | {
162 |   "dataset_reader": {
163 |     "type": "ie_json",
164 |     "token_indexers": token_indexers,
165 |     "max_span_width": max_span_width
166 |   },
167 |   "train_data_path": "tests/fixtures/scierc_article.json",
168 |   "validation_data_path": "tests/fixtures/scierc_article.json",
169 |   "model": {
170 |     "type": "dygie",
171 |     "text_field_embedder": text_field_embedder,
172 |     "initializer": dygie_initializer,
173 |     "loss_weights": loss_weights,
174 |     "lexical_dropout": lexical_dropout,
175 |     "feature_size": feature_size,
176 |     "use_attentive_span_extractor": use_attentive_span_extractor,
177 |     "max_span_width": max_span_width,
178 |     "display_metrics": display_metrics[target],
179 |     "context_layer": {
180 |       "type": "lstm",
181 |       "bidirectional": true,
182 |       "input_size": token_embedding_dim,
183 |       "hidden_size": lstm_hidden_size,
184 |       "num_layers": lstm_n_layers,
185 |       "dropout": lstm_dropout
186 |     },
187 |     "modules": {
188 |       "coref": {
189 |         "spans_per_word": coref_spans_per_word,
190 |         "max_antecedents": coref_max_antecedents,
191 |         "mention_feedforward": make_feedforward(span_emb_dim),
192 |         "antecedent_feedforward": make_feedforward(coref_scorer_dim),
193 |         "initializer": module_initializer
194 |       },
195 |       "ner": {
196 |         "mention_feedforward": make_feedforward(span_emb_dim),
197 |         "initializer": module_initializer
198 |       },
199 |       "relation": {
200 |         "spans_per_word": relation_spans_per_word,
201 |         "positive_label_weight": relation_positive_label_weight,
202 |         "mention_feedforward": make_feedforward(span_emb_dim),
203 |         "relation_feedforward": make_feedforward(relation_scorer_dim),
204 |         "initializer": module_initializer,
205 |       },
206 |     },
207 |   },
208 |   "iterator": {
209 |     "type": "ie_batch",
210 |     "batch_size": 10
211 |   },
212 |   "validation_iterator": {
213 |     "type": "ie_document",
214 |   },
215 |   "trainer": {
216 |     "num_epochs": num_epochs,
217 |     "grad_norm": 5.0,
218 |     "patience" : patience,
219 |     "cuda_device" : -1,
220 |     "validation_metric": validation_metrics[target],
221 |     "learning_rate_scheduler": learning_rate_scheduler,
222 |     "optimizer": {
223 |       "type": "adam",
224 |       "lr": learning_rate,
225 |     },
226 |   }
227 | }
228 | 


--------------------------------------------------------------------------------
/dygie/tests/fixtures/multi_dataset/dev.jsonl:
--------------------------------------------------------------------------------
1 | {"doc_key": 58, "dataset": "ace-event", "sentences": [["For", "an", "organization", "that", "is", "incorporated", "in", "Maryland", "and", "has", "its", "headquarters", "in", "Maryland", ",", "the", "laws", "of", "Maryland", "apply", "without", "regard", "to", "the", "states", "in", "which", "the", "directors", "live", ".", "Remember", ",", "a", "corporation", "is", "legally", "a", "\"", "person", "\"", ",", "distinct", "from", "the", "corporeal", "persons", "who", "govern", ",", "manage", ",", "and", "operate", "the", "corporation", ".", "Thus", ",", "the", "laws", "applying", "to", "the", "corporate", "person", "are", "the", "laws", "of", "the", "state", "where", "that", "person", "\"", "lives", "\"", ".", "--"], ["the", "board", "is", "who", "decides", "that", "and", "board", "is", "so", "often", "chock", "full", "of", "the", "ceos", "golfing", "buddies", ",", "they", "serve", "on", "each", "other", "'s", "boards", "and", "problem", "here", "is", "gap", "between", "what", "the", "average", "worker", "makes", "and", "ceo", "is", "making", "is", "increasing", "and", "everyone", "agrees", "that", "is", "not", "a", "good", "thing", "for", "this", "country", ",", "plus", "the", "fact", ",", "when", "the", "guys", "get", "the", "big", "packageses", ",", "so", "often", "you", "see", "the", "companies", "going", "down", "the", "zblubs", "thank", "you", "for", "updating", "us", "."], ["in", "a", "strange", "way", "and", "this", "may", "sound", "unusual", ",", "there", "'s", "a", "little", "sense", "of", "relief", "because", "these", "soldiers", "were", "set", "to", "leave", "in", "january", "and", "now", "they", "got", "second", "deployment", "orders", ",", "the", "delay", ",", "of", "course", ",", "the", "inability", "to", "get", "access", "to", "turkey", ",", "but", "along", "the", "way", ",", "the", "4th", "infantry", "division", "is", "one", "seeped", "in", "tradition", "and", "there", "'s", "a", "real", "sense", "of", "pride", "among", "the", "soldiers", ",", "250", "of", "which", "left", "yesterday", ",", "over", "the", "course", "of", "the", "week", "12,000", "will", "be", "leaving", "from", "here", "."], ["Another", "argument", ",", "which", "is", "better", "but", "still", "disturbing", ",", "is", "that", ",", "yes", ",", "this", "is", "an", "ethics", "violation", ",", "and", "maybe", "worse", ",", "but", "it", "would", "only", "hurt", "the", "USCF", "to", "talk", "about", "it", ",", "and", "that", "only", "troublemakers", "like", "Sam", "Sloan", ",", "Larry", "Parr", ",", "and", ",", "I", "suppose", ",", "me", ",", "would", "talk", "about", "it", ",", "since", "doing", "so", "will", "obstruct", "the", "federation", "'s", "plans", ",", "cause", "us", "to", "pay", "legal", "expenses", ",", "run", "the", "risk", "of", "our", "being", "stuck", "without", "any", "office", "space", "at", "all", ",", "cause", "people", "not", "to", "loan", "us", "money", ",", "and", "so", "forth", "."]], "ner": [[[2, 2, "ORG"], [7, 7, "GPE"], [11, 11, "FAC"], [13, 13, "GPE"], [18, 18, "GPE"], [24, 24, "GPE"], [28, 28, "PER"], [34, 34, "ORG"], [39, 39, "ORG"], [46, 46, "PER"], [55, 55, "ORG"], [65, 65, "ORG"], [71, 71, "GPE"], [74, 74, "ORG"]], [[81, 81, "PER"], [87, 87, "PER"], [95, 95, "PER"], [97, 97, "PER"], [105, 105, "PER"], [115, 115, "PER"], [118, 118, "PER"], [134, 134, "GPE"], [142, 142, "PER"], [153, 153, "ORG"]], [[183, 183, "PER"], [210, 210, "GPE"], [218, 220, "ORG"], [236, 236, "PER"]], [[288, 288, "ORG"], [297, 297, "PER"], [299, 300, "PER"], [302, 303, "PER"], [323, 323, "ORG"], [343, 343, "FAC"], [344, 344, "FAC"], [349, 349, "PER"]]], "relations": [[[11, 11, 13, 13, "PART-WHOLE.Geographical"]], [[95, 95, 97, 97, "PER-SOC.Lasting-Personal"]], [], []], "events": [[], [], [[[187, "Movement.Transport"], [183, 183, "Artifact"]], [[241, "Movement.Transport"]], [[253, "Movement.Transport"]]], [[[352, "Transaction.Transfer-Money"], [349, 349, "Giver"]]]], "_orig_doc_key": ["soc.org.nonprofit_20050218.1902", "CNN_ENG_20030424_070008.15", "CNN_ENG_20030328_150609.10", "rec.games.chess.politics_20041217.2111"], "_orig_sent_ix": [35, 16, 7, 38]}
2 | {"doc_key": 76, "dataset": "ace05", "sentences": [["been", "relatively", "quiet", "in", "northern", "ireland", "over", "the", "past", "few", "years", "."], ["our", "national", "correspondent", "frank", "buckley", "is", "on", "the", "scene", "for", "us", "."], ["[", "translator", "speaking", "]", "and", "i", "thank", "you", "for", "that", "help", "."], ["the", "pentagon", "says", "the", "convoy", "was", "taliban", "and", "al", "qaeda", "troops", "."], ["He", "was", "sentenced", "to", "four", "months", "in", "prison", ",", "but", "appealed", "."], ["Toefting", "transferred", "to", "Bolton", "in", "February", "2002", "from", "German", "club", "Hamburg", "."], ["Toefting", "joined", "the", "Danish", "squad", "in", "1993", "and", "has", "41", "caps", "."], ["sanctions", "targeting", "Iraq", "civilians", ",", "an", "important", "step", "toward", "the", "U.S", "."], ["He", "also", "envisioned", "U.S", ".", "and", "U.N", ".", "inspectors", "working", "together", "."], ["role", "in", "Iraq", "in", "the", "``", "not", "too", "distant", "future", ".", "''"], ["He", "did", "not", "provide", "the", "baby", "'s", "name", "or", "other", "details", "."], ["MCI", "to", "pay", "huge", "fine", "to", "SEC", "for", "accounting", "fraud", ":", "report"], ["Suicide", "bombing", "at", "Israeli", "shopping", "mall", ";", "fifth", "attack", "in", "two", "days"], ["The", "explosion", "killed", "the", "attacker", "and", "four", "shoppers", ",", "police", "said", "."], ["It", "was", "not", "known", "whether", "he", "was", "hurt", "at", "the", "time", "."], ["Just", "as", "long", "as", "it", "'s", "there", ",", "they", "feel", "safe", "."]], "ner": [[[5, 5, "LOC"]], [[15, 16, "PER"], [14, 14, "PER"], [12, 12, "ORG"], [22, 22, "ORG"]], [[31, 31, "GPE"], [25, 25, "PER"], [29, 29, "PER"]], [[46, 46, "PER"], [40, 40, "PER"], [37, 37, "ORG"], [42, 42, "ORG"], [44, 45, "ORG"]], [[48, 48, "PER"], [55, 55, "FAC"]], [[63, 63, "ORG"], [70, 70, "ORG"], [69, 69, "ORG"], [68, 68, "GPE"], [60, 60, "PER"]], [[76, 76, "ORG"], [75, 75, "GPE"], [72, 72, "PER"]], [[86, 86, "GPE"], [94, 94, "GPE"], [87, 87, "PER"]], [[96, 96, "PER"], [102, 102, "ORG"], [99, 99, "GPE"], [104, 104, "PER"]], [[110, 110, "GPE"]], [[120, 120, "PER"], [125, 125, "PER"]], [], [], [[165, 165, "ORG"], [160, 160, "PER"], [163, 163, "PER"]], [[173, 173, "PER"]], [[188, 188, "PER"]]], "relations": [[], [[14, 14, 12, 12, "ORG-AFF"]], [], [[46, 46, 42, 42, "ORG-AFF"], [46, 46, 44, 45, "ORG-AFF"]], [], [[60, 60, 63, 63, "ORG-AFF"], [60, 60, 70, 70, "ORG-AFF"], [69, 69, 68, 68, "GEN-AFF"]], [[72, 72, 76, 76, "ORG-AFF"], [76, 76, 75, 75, "GEN-AFF"]], [[87, 87, 86, 86, "GEN-AFF"]], [[104, 104, 102, 102, "ORG-AFF"], [104, 104, 99, 99, "ORG-AFF"]], [], [], [], [], [], [], []], "_orig_doc_key": ["CNN_ENG_20030407_130604.10", "CNN_ENG_20030620_170011.14", "CNN_ENG_20030507_170539.0", "CNN_ENG_20030526_183538.3", "APW_ENG_20030331.0410", "APW_ENG_20030331.0410", "APW_ENG_20030331.0410", "APW_ENG_20030422.0469", "APW_ENG_20030422.0469", "APW_ENG_20030422.0469", "APW_ENG_20030419.0358", "AFP_ENG_20030519.0049", "APW_ENG_20030519.0367", "APW_ENG_20030519.0367", "APW_ENG_20030519.0367", "AGGRESSIVEVOICEDAILY_20041101.1806"], "_orig_sent_ix": [9, 5, 11, 10, 7, 14, 18, 6, 35, 48, 7, 3, 3, 12, 52, 18]}
3 | {"doc_key": "93324366_dev", "dataset": "genia", "sentences": [["Human", "T", "cell", "transcription", "factor", "GATA", "-", "3", "stimulates", "HIV", "-", "1", "expression", "."], ["A", "family", "of", "transcriptional", "activating", "proteins", ",", "the", "GATA", "factors", ",", "has", "been", "shown", "to", "bind", "to", "a", "consensus", "motif", "through", "a", "highly", "conserved", "C4", "zinc", "finger", "DNA", "binding", "domain", "."], ["One", "member", "of", "this", "multigene", "family", ",", "GATA", "-", "3", ",", "is", "most", "abundantly", "expressed", "in", "T", "lymphocytes", ",", "a", "cellular", "target", "for", "human", "immunodeficiency", "virus", "type", "1", "(", "HIV", "-", "1", ")", "infection", "and", "replication", "."], ["In", "vitro", "DNase", "I", "footprinting", "analysis", "revealed", "six", "hGATA", "-", "3", "binding", "sites", "in", "the", "U3", "region", "(", "the", "transcriptional", "regulatory", "domain", ")", "of", "the", "HIV", "-", "1", "LTR", "."], ["Cotransfection", "of", "an", "hGATA", "-", "3", "expression", "plasmid", "with", "a", "reporter", "plasmid", "whose", "transcription", "is", "directed", "by", "the", "HIV", "-", "1", "LTR", "resulted", "in", "6", "-", "to", "10", "-", "fold", "stimulation", "of", "LTR", "-", "mediated", "transcription", ",", "whereas", "site", "specific", "mutation", "of", "these", "GATA", "sites", "resulted", "in", "virtual", "abrogation", "of", "the", "activation", "by", "hGATA", "-", "3", "."], ["Further", ",", "deletion", "of", "the", "hGATA", "-", "3", "transcriptional", "activation", "domain", "abolished", "GATA", "-", "dependent", "HIV", "-", "1", "trans", "-", "activation", ",", "showing", "that", "the", "stimulation", "of", "viral", "transcription", "observed", "is", "a", "direct", "effect", "of", "cotransfected", "hGATA", "-", "3", "."], ["Introduction", "of", "the", "HIV", "-", "1", "plasmids", "in", "which", "the", "GATA", "sites", "have", "been", "mutated", "into", "human", "T", "lymphocytes", "also", "caused", "a", "significant", "reduction", "in", "LTR", "-", "mediated", "transcription", "at", "both", "the", "basal", "level", "and", "in", "(", "PHA", "-", "plus", "PMA", "-", ")", "stimulated", "T", "cells", "."], ["These", "observations", "suggest", "that", "in", "addition", "to", "its", "normal", "role", "in", "T", "lymphocyte", "gene", "regulation", ",", "hGATA", "-", "3", "may", "also", "play", "a", "significant", "role", "in", "HIV", "-", "1", "transcriptional", "activation", "."]], "ner": [[[0, 4, "protein"], [5, 7, "protein"]], [[22, 23, "protein"], [32, 33, "DNA"], [38, 40, "protein"], [36, 43, "protein"]], [[52, 54, "protein"], [61, 62, "cell_type"]], [[84, 85, "protein"], [90, 94, "DNA"], [97, 98, "DNA"], [101, 103, "DNA"], [107, 110, "DNA"]], [[115, 119, "DNA"], [122, 123, "DNA"], [130, 133, "DNA"], [144, 144, "DNA"], [155, 156, "DNA"], [165, 167, "protein"]], [[174, 176, "protein"], [174, 179, "DNA"], [204, 207, "protein"]], [[212, 215, "DNA"], [219, 220, "DNA"], [226, 227, "cell_type"], [234, 234, "DNA"], [252, 254, "cell_line"]], [[267, 268, "cell_type"], [272, 274, "protein"]]], "relations": [[], [], [], [], [], [], [], []], "clusters": [[[0, 7], [45, 50]], [[14, 19], [21, 23], [45, 54], [48, 50]], [[61, 62], [64, 80], [225, 227]], [[89, 110], [154, 156], [218, 220]], [[106, 110], [129, 133]], [[121, 123], [124, 124]], [[144, 147], [234, 237]], [[165, 167], [263, 263], [272, 274]], [[211, 215], [217, 217]]]}
4 | {"doc_key": "W03-0406", "dataset": "scierc", "sentences": [["In", "this", "paper", ",", "we", "improve", "an", "unsupervised", "learning", "method", "using", "the", "Expectation-Maximization", "-LRB-", "EM", "-RRB-", "algorithm", "proposed", "by", "Nigam", "et", "al.", "for", "text", "classification", "problems", "in", "order", "to", "apply", "it", "to", "word", "sense", "disambiguation", "-LRB-", "WSD", "-RRB-", "problems", "."], ["The", "improved", "method", "stops", "the", "EM", "algorithm", "at", "the", "optimum", "iteration", "number", "."], ["To", "estimate", "that", "number", ",", "we", "propose", "two", "methods", "."], ["In", "experiments", ",", "we", "solved", "50", "noun", "WSD", "problems", "in", "the", "Japanese", "Dictionary", "Task", "in", "SENSEVAL2", "."], ["The", "score", "of", "our", "method", "is", "a", "match", "for", "the", "best", "public", "score", "of", "this", "task", "."], ["Furthermore", ",", "our", "methods", "were", "confirmed", "to", "be", "effective", "also", "for", "verb", "WSD", "problems", "."]], "ner": [[[7, 9, "Method"], [12, 16, "Method"], [23, 25, "Task"], [30, 30, "Generic"], [32, 38, "Task"]], [[42, 42, "Generic"], [45, 46, "Method"], [49, 51, "OtherScientificTerm"]], [[56, 56, "Generic"]], [[69, 71, "Task"], [74, 76, "Task"], [78, 78, "Material"]], [[84, 84, "Generic"], [95, 95, "Generic"]], [[100, 100, "Generic"], [108, 110, "Task"]]], "relations": [[[12, 16, 7, 9, "USED-FOR"], [12, 16, 23, 25, "USED-FOR"], [30, 30, 32, 38, "USED-FOR"]], [], [], [[74, 76, 78, 78, "FEATURE-OF"]], [], [[100, 100, 108, 110, "USED-FOR"]]], "clusters": [[[49, 51], [56, 56]], [[7, 9], [30, 30], [42, 42], [84, 84], [100, 100]], [[74, 76], [95, 95]]]}
5 | 


--------------------------------------------------------------------------------
/dygie/tests/fixtures/multi_dataset/test.jsonl:
--------------------------------------------------------------------------------
1 | {"doc_key": 15, "dataset": "ace-event", "sentences": [["He", "also", "owns", "a", "television", "and", "a", "radio", "station", "and", "a", "newspaper", "."], ["\"", "That", "does", "n't", "shock", "us", ",", "we", "have", "been", "saying", "it", "."], ["He", "has", "made", "no", "public", "comments", "so", "far", "on", "the", "Beijing", "talks", "."], ["The", "official", "did", "not", "disclose", "what", "North", "Korea", "'s", "bold", "plan", "was", "."], ["\"", "The", "United", "States", "is", "our", "major", "thrust", ",", "\"", "he", "said", "."], ["Head", "of", "Iran", "'s", "largest", "daily", "quits", "after", "conservatives", "win", "Tehran", "city", "hall"], ["He", "lost", "an", "appeal", "case", "on", "his", "sodomy", "sentence", "on", "April", "18", "."], ["\"", "Both", "said", "they", "look", "forward", "to", "seeing", "each", "other", "in", "Evian", "."], ["We", "just", "disagree", "on", "one", "item", ",", "\"", "the", "Canadian", "leader", "said", "."], ["Bush", ",", "Putin", "pal", "up", "after", "Iraq", "spat", ",", "but", "Iran", "row", "simmers"], ["``", "We", "are", "proceeding", "with", "all", "the", "plans", "for", "the", "vote", ".", "''"], ["Saddam", "has", "long", "claimed", "Iraq", "destroyed", "all", "its", "weapons", "of", "mass", "destruction", "."], ["Report", ":", "Hong", "Kong", "Jockey", "Club", "in", "talks", "to", "acquire", "its", "Macau", "rival"], ["Ships", "carrying", "equipment", "for", "U.S.", "troops", "are", "already", "waiting", "off", "the", "Turkish", "coast"], ["The", "Justice", "party", "changed", "the", "constitution", "after", "taking", "power", "in", "the", "elections", "."], ["at", "members", "of", "Erdogan", "'s", "party", ",", "the", "Anatolia", "news", "agency", "reported", "."]], "ner": [[[8, 8, "ORG"], [11, 11, "ORG"]], [], [[36, 36, "GPE"]], [[40, 40, "PER"], [45, 46, "GPE"]], [[54, 55, "GPE"]], [], [], [[102, 102, "GPE"]], [[113, 113, "GPE"], [114, 114, "PER"]], [[117, 117, "PER"], [119, 119, "PER"], [123, 123, "GPE"], [127, 127, "GPE"]], [], [[143, 143, "PER"], [147, 147, "GPE"], [151, 151, "WEA"]], [], [[169, 169, "VEH"], [173, 173, "GPE"], [174, 174, "PER"], [180, 180, "GPE"], [181, 181, "LOC"]], [[183, 184, "ORG"]], [[196, 196, "PER"], [198, 198, "PER"], [200, 200, "ORG"], [203, 203, "ORG"], [205, 205, "ORG"]]], "relations": [[], [], [], [], [], [], [], [], [[114, 114, 113, 113, "ORG-AFF.Employment"]], [], [], [], [], [[174, 174, 173, 173, "ORG-AFF.Employment"], [181, 181, 180, 180, "PART-WHOLE.Geographical"]], [], [[196, 196, 200, 200, "ORG-AFF.Membership"], [198, 198, 200, 200, "ORG-AFF.Employment"]]], "events": [[], [], [], [], [], [], [[[81, "Justice.Appeal"]], [[86, "Justice.Sentence"]]], [], [], [], [], [], [], [], [], []], "_orig_doc_key": ["AFP_ENG_20030415.0734", "AFP_ENG_20030425.0408", "AFP_ENG_20030425.0408", "AFP_ENG_20030425.0408", "AFP_ENG_20030430.0075", "AFP_ENG_20030504.0248", "AFP_ENG_20030508.0357", "AFP_ENG_20030527.0616", "AFP_ENG_20030527.0616", "AFP_ENG_20030601.0262", "APW_ENG_20030304.0555", "APW_ENG_20030308.0314", "APW_ENG_20030310.0719", "APW_ENG_20030311.0775", "APW_ENG_20030311.0775", "APW_ENG_20030311.0775"], "_orig_sent_ix": [13, 11, 24, 30, 20, 3, 13, 7, 12, 3, 24, 20, 3, 12, 31, 38]}
2 | {"doc_key": 72, "dataset": "ace05", "sentences": [["A", "total", "over", "the", "last", "two", "weeks", "now", "of", "close", "to", "24,000", "sorties", "."], ["That", "air", "base", "that", "coalition", "forces", "have", "now", "taken", "control", "of", "near", "Nasiriya", "."], ["Tales", "of", "how", "some", "in", "Congress", "want", "to", "honor", "the", "British", "prime", "minister", "."], ["Columns", "of", "coalition", "forces", "keep", "pressing", "north", "and", "tightening", "the", "noose", "on", "Baghdad", "."], ["These", "pictures", "were", "taken", "on", "the", "runway", "just", "moments", "after", "the", "coalition", "attack", "."], ["Outside", "the", "airport", ",", "the", "fighting", "was", "fierce", ",", "but", "the", "battle", "unequal", "."], ["This", "is", "fairly", "central", "in", "Baghdad", "on", "the", "edge", "of", "a", "government", "area", "."], ["That", "'s", "CNN", "'s", "Karl", "Penhaul", "with", "the", "very", "latest", "from", "the", "battlefront", "."], ["Let", "'", "s", "get", "an", "update", "now", "on", "casualties", "on", "Operation", "Iraqi", "Freedom", "."], ["i", "think", "because", "the", "day", "after", "came", "so", "fast", "and", "so", "suddenly", "in", "way", "."], ["in", "the", "short-term", ",", "that", "'s", "going", "to", "have", "to", "mean", "the", "american", "military", "."], ["it", "was", "the", "first", "american", "prisoner", "of", "war", "ever", "rescued", "since", "world", "war", "ii", "."], ["rehab", "taha", ",", "aka", "dr.", "germ", ",", "who", "directed", "iraq", "'s", "biological", "weapons", "program", "."], ["cnn", "has", "exclusively", "obtained", "the", "united", "nations", "english", "translations", "of", "her", "arabic", "work", "papers", "."], ["reporter", ":", "a", "perfectionism", "reflected", "in", "even", "the", "smallest", "details", "of", "stewart", "'s", "life", "."], ["this", "is", "a", "stock", "down", "more", "than", "70", "%", "over", "the", "last", "three", "years", "."]], "ner": [[], [[16, 16, "FAC"], [17, 17, "FAC"], [18, 18, "GPE"], [26, 26, "GPE"], [19, 19, "PER"]], [[40, 40, "PER"], [31, 31, "PER"], [33, 33, "ORG"], [38, 38, "GPE"]], [[44, 44, "GPE"], [54, 54, "GPE"], [42, 42, "PER"], [45, 45, "PER"]], [[67, 67, "GPE"], [62, 62, "FAC"]], [[72, 72, "FAC"]], [[84, 84, "FAC"], [96, 96, "LOC"], [92, 92, "LOC"], [95, 95, "ORG"], [89, 89, "GPE"]], [[100, 100, "ORG"], [102, 103, "PER"]], [[114, 114, "ORG"], [123, 123, "GPE"]], [[126, 126, "PER"]], [[154, 154, "ORG"], [153, 153, "GPE"]], [[160, 160, "GPE"], [161, 161, "PER"]], [[171, 172, "PER"], [178, 178, "PER"], [175, 176, "PER"], [183, 183, "WEA"], [180, 180, "GPE"], [184, 184, "ORG"]], [[196, 196, "PER"], [191, 192, "ORG"], [186, 186, "ORG"]], [[212, 212, "PER"]], []], "relations": [[], [[16, 16, 26, 26, "PHYS"], [19, 19, 17, 17, "ART"], [19, 19, 18, 18, "ORG-AFF"]], [[31, 31, 33, 33, "ORG-AFF"], [40, 40, 38, 38, "ORG-AFF"]], [[45, 45, 44, 44, "ORG-AFF"]], [], [], [[92, 92, 96, 96, "PART-WHOLE"], [84, 84, 89, 89, "PART-WHOLE"]], [[102, 103, 100, 100, "ORG-AFF"]], [], [], [[154, 154, 153, 153, "GEN-AFF"]], [[161, 161, 160, 160, "GEN-AFF"]], [[178, 178, 184, 184, "ORG-AFF"], [184, 184, 180, 180, "PART-WHOLE"]], [], [], []], "_orig_doc_key": ["CNN_IP_20030402.1600.02-2", "CNN_IP_20030402.1600.02-2", "CNN_IP_20030402.1600.02-2", "CNN_IP_20030404.1600.00-1", "CNN_IP_20030404.1600.00-1", "CNN_IP_20030404.1600.00-1", "CNN_IP_20030405.1600.00-2", "CNN_IP_20030402.1600.00-2", "CNN_IP_20030402.1600.00-2", "CNN_ENG_20030411_070039.21", "CNN_ENG_20030411_070039.21", "CNN_ENG_20030403_080032.9", "CNN_ENG_20030416_180808.15", "CNN_ENG_20030416_180808.15", "CNN_ENG_20030607_170312.6", "CNN_ENG_20030516_123543.8"], "_orig_sent_ix": [53, 66, 78, 11, 20, 32, 46, 5, 6, 13, 15, 23, 10, 11, 47, 13]}
3 | {"doc_key": "91041706_test", "dataset": "genia", "sentences": [["Interferon", "-", "gamma", "and", "the", "sexual", "dimorphism", "of", "autoimmunity", "."], ["The", "sexual", "difference", "in", "the", "incidence", "of", "autoimmune", "diseases", "has", "remained", "an", "enigma", "for", "many", "years", "."], ["In", "the", "examination", "of", "the", "induction", "of", "autoimmunity", "in", "transgenic", "mice", ",", "evidence", "has", "been", "obtained", "further", "implicating", "the", "lymphokine", "interferon", "-", "gamma", "in", "the", "etiology", "of", "autoimmunity", "."], ["Sex", "steroid", "regulation", "of", "the", "production", "of", "this", "molecule", ",", "as", "well", "as", "other", "cytokines", ",", "may", "help", "explain", "the", "gender", "-", "specific", "differences", "in", "the", "immune", "system", ",", "including", "autoimmunity", "."]], "ner": [[[0, 2, "protein"]], [], [[46, 46, "protein"], [47, 49, "protein"]], [[70, 70, "protein"]]], "relations": [[], [], [], []], "clusters": [[[8, 8], [34, 34], [54, 54], [86, 86]]]}
4 | {"doc_key": "CVPR_2004_30_abs", "dataset": "scierc", "sentences": [["Background", "modeling", "is", "an", "important", "component", "of", "many", "vision", "systems", "."], ["Existing", "work", "in", "the", "area", "has", "mostly", "addressed", "scenes", "that", "consist", "of", "static", "or", "quasi-static", "structures", "."], ["When", "the", "scene", "exhibits", "a", "persistent", "dynamic", "behavior", "in", "time", ",", "such", "an", "assumption", "is", "violated", "and", "detection", "performance", "deteriorates", "."], ["In", "this", "paper", ",", "we", "propose", "a", "new", "method", "for", "the", "modeling", "and", "subtraction", "of", "such", "scenes", "."], ["Towards", "the", "modeling", "of", "the", "dynamic", "characteristics", ",", "optical", "flow", "is", "computed", "and", "utilized", "as", "a", "feature", "in", "a", "higher", "dimensional", "space", "."], ["Inherent", "ambiguities", "in", "the", "computation", "of", "features", "are", "addressed", "by", "using", "a", "data-dependent", "bandwidth", "for", "density", "estimation", "using", "kernels", "."], ["Extensive", "experiments", "demonstrate", "the", "utility", "and", "performance", "of", "the", "proposed", "approach", "."]], "ner": [[[0, 1, "Task"], [8, 9, "Task"]], [[23, 26, "OtherScientificTerm"]], [[30, 30, "Generic"], [33, 35, "OtherScientificTerm"], [45, 45, "Task"]], [[57, 57, "Generic"], [60, 65, "Task"], [65, 65, "Generic"]], [[69, 73, "Task"], [75, 76, "OtherScientificTerm"], [83, 83, "OtherScientificTerm"], [86, 88, "OtherScientificTerm"]], [[91, 91, "OtherScientificTerm"], [94, 96, "Task"], [102, 103, "OtherScientificTerm"], [105, 106, "Task"], [108, 108, "Method"]], [[120, 120, "Generic"]]], "relations": [[[0, 1, 8, 9, "PART-OF"]], [], [[33, 35, 30, 30, "FEATURE-OF"]], [[57, 57, 60, 65, "USED-FOR"]], [[75, 76, 69, 73, "USED-FOR"], [75, 76, 83, 83, "USED-FOR"], [83, 83, 69, 73, "USED-FOR"], [86, 88, 83, 83, "FEATURE-OF"]], [[91, 91, 94, 96, "FEATURE-OF"], [102, 103, 91, 91, "USED-FOR"], [102, 103, 105, 106, "USED-FOR"], [108, 108, 105, 106, "USED-FOR"]], []], "clusters": [[[57, 57], [120, 120]], [[30, 30], [65, 65]]]}
5 | 


--------------------------------------------------------------------------------
/dygie/tests/fixtures/multi_dataset/train.jsonl:
--------------------------------------------------------------------------------
1 | {"doc_key": 758, "dataset": "ace-event", "sentences": [["ozzy", "'s", "not", "going", "to", "like", "me", "saying", "this", ",", "but", "he", "'s", "a", "very", "tender", ",", "gentle", "man", "."], ["the", "second", "attack", "occurred", "after", "some", "rocket", "firings", "aimed", ",", "apparently", ",", "toward", "the", "israelis", ",", "apparently", "in", "retaliation", "."], ["that", "'s", "just", "a", "fraction", "of", "the", "killings", "and", "rapes", "and", "torture", "that", "have", "grippedded", "the", "country", "since", "1998", "."], ["its", "basic", "message", ",", "the", "palestinian", "authority", "is", "corrupt", ",", "it", "fails", "to", "deliver", ",", "hamas", "is", "not", "corrupt", "."], ["reporter", ":", "settlers", "call", "the", "just", "concluded", "israeli", "-", "american-", "palestinian", "summit", "a", "surrender", ",", "and", "were", "not", "impressed", "."], ["and", "that", "the", "survivors", ",", "they", "had", "to", "go", "into", "the", "water", ",", "pick", "them", "up", "from", "the", "water", "."], ["when", "you", "first", "heard", "about", "sars", ",", "i", "mean", ",", "did", "you", "suddenly", "think", ",", "this", "may", "not", "happen", "?"], ["the", "flip", "side", "between", "stars", ",", "though", ",", "is", "once", "the", "fans", "grow", "up", "their", "star", "power", "may", "dim", "."], ["it", "would", "be", "deadly", "to", "let", "iran", "let", "their", "hands", "on", "nuclear", "weapons", "employ", "i", "would", "support", "any", "action", "."], ["fidelity", "is", "going", "to", "get", "rid", "of", "the", "3", "%", "upfront", "sales", "charge", "on", "five", "of", "its", "largest", "funds", "."], ["we", "'re", "told", "the", "russian", "president", "vladimir", "putin", "was", "greeted", "by", "prince", "charles", "as", "he", "arrived", "in", "london", "today", "."], ["that", "experience", "should", "continue", "to", "inspire", "us", ",", "as", "we", "seek", "to", "build", "a", "more", "peaceful", "and", "secure", "world", "."], ["*", "his", "dandruff", "shampoo", "was", "n't", "tough", "enough", "for", "black", ",", "so", "i", "bought", "him", "maximum", "strength", "selsun", "blue", "."], ["he", "is", "one", "of", "the", "biggest", "if", "not", "the", "biggest", "names", "in", "the", "world", "of", "hollywood", "fund", "-", "raisers", "."], ["sean", "patrick", "o'malley", ",", "the", "bishop", "of", "palm", "beach", ",", "florida", ",", "is", "expected", "to", "replace", "cardinal", "bernard", "law", "."], ["And", "uh", "so", "I", "says", "vote", "for", "him", ",", "vote", "for", "him", ",", "vote", "for", "him", ",", "you", "know", "."]], "ner": [[[0, 0, "PER"], [18, 18, "PER"]], [[26, 26, "WEA"], [34, 34, "GPE"]], [[56, 56, "GPE"]], [[65, 66, "GPE"], [75, 75, "ORG"]], [[80, 80, "PER"], [82, 82, "PER"], [87, 87, "GPE"], [89, 89, "GPE"], [90, 90, "GPE"]], [[103, 103, "PER"]], [], [[144, 144, "PER"], [151, 151, "PER"]], [[166, 166, "GPE"], [172, 172, "WEA"]], [[180, 180, "ORG"]], [[204, 204, "GPE"], [205, 205, "PER"], [206, 207, "PER"], [211, 211, "PER"], [212, 212, "PER"], [217, 217, "GPE"]], [[238, 238, "LOC"]], [], [[270, 270, "PER"], [275, 275, "ORG"], [276, 278, "PER"]], [[280, 282, "PER"], [285, 285, "PER"], [287, 288, "GPE"], [290, 290, "GPE"], [296, 296, "PER"], [297, 298, "PER"]], []], "relations": [[], [], [], [], [], [], [], [], [], [], [[205, 205, 204, 204, "ORG-AFF.Employment"]], [], [], [[276, 278, 275, 275, "ORG-AFF.Employment"]], [[285, 285, 287, 288, "GEN-AFF.Citizen-Resident-Religion-Ethnicity"], [287, 288, 290, 290, "PART-WHOLE.Geographical"]], []], "events": [[], [[[22, "Conflict.Attack"], [26, 26, "Instrument"]], [[27, "Conflict.Attack"], [26, 26, "Instrument"]]], [[[47, "Life.Die"], [56, 56, "Place"]], [[49, "Conflict.Attack"], [56, 56, "Place"]], [[51, "Conflict.Attack"], [56, 56, "Place"]]], [], [[[91, "Contact.Meet"], [87, 87, "Entity"], [89, 89, "Entity"], [90, 90, "Entity"]]], [], [], [], [], [], [[[215, "Movement.Transport"], [206, 207, "Artifact"], [217, 217, "Destination"]]], [], [], [], [[[295, "Personnel.Start-Position"], [280, 282, "Person"]]], []], "_orig_doc_key": ["CNN_ENG_20030607_173310.4", "CNN_ENG_20030610_130042.17", "CNN_ENG_20030612_173004.10", "CNN_ENG_20030614_173123.4", "CNN_ENG_20030614_173123.4", "CNN_ENG_20030617_112838.4", "CNN_ENG_20030617_193116.10", "CNN_ENG_20030618_193127.17", "CNN_ENG_20030624_082841.12", "CNN_ENG_20030624_140104.22", "CNN_ENG_20030624_153103.16", "CNN_ENG_20030624_153103.17", "CNN_ENG_20030625_210122.0", "CNN_ENG_20030626_193133.8", "CNN_ENG_20030630_075848.7", "fsh_29105"], "_orig_sent_ix": [46, 8, 6, 15, 26, 21, 81, 18, 16, 5, 3, 7, 4, 24, 3, 77]}
2 | {"doc_key": 474, "dataset": "ace05", "sentences": [["defense", "attorneys", "argue", "that", "an", "individual", "can", "not", "be", "charged", "under", "federal", "law", "and", "then", "state", "law", "for", "the", "same", "act", "."], ["the", "british", "museum", "is", "home", "to", "the", "largest", "mesopotamia", "collection", "outside", "iraq", ",", "including", "some", "of", "the", "earliest", "forms", "of", "writing", "."], ["what", "i", "should", "tell", "you", "is", "there", "are", "about", "15", "possible", "routes", "to", "get", "you", "to", "the", "top", "of", "mt.", "everest", "."], ["the", "other", "mission", "that", "it", "took", "part", "in", "was", "the", "operations", "before", "t", "fall", "of", "the", "saddam", "hussein", "and", "his", "regime", "."], ["now", ",", "also", ",", "this", "ship", "played", "an", "important", "part", "within", "the", "war", "and", "that", "'s", "because", "of", "the", "radar", "system", "."], ["during", "a", "live", "broadcast", "geraldo", "drew", "a", "map", "in", "the", "sand", "showing", "the", "location", "of", "the", "unit", "in", "relation", "to", "baghdad", "."], ["he", "was", "to", "his", "chagrin", ",", "could", "be", "the", "most", "toughest", "and", "important", "battle", "of", "them", "all", ",", "winning", "the", "peace", "."], ["i", "pulled", "out", "the", "youngest", "guy", "first", "and", "then", "i", "wanted", "to", "pull", "out", "the", "captain", ",", "but", "the", "captain", "said", "take"], ["as", "soon", "as", "the", "police", "officers", "pulled", "the", "gun", "out", "and", "held", "it", "up", ",", "i", "knew", "it", "was", "my", "rifle", "."], ["the", "president", "greeting", "a", "number", "of", "dignitaries", "as", "he", "gets", "ready", "here", "to", "board", "air", "force", "one", "on", "his", "way", "home", "."], ["a", "little", "earlier", "this", "morning", ",", "i", "spoke", "with", "an", "iraqi", "dissident", "who", "teaches", "at", "writes", "extensively", "on", "the", "middle", "east", "."], ["straun", "with", "smashed", "cars", ",", "buses", "and", "other", "debris", "representing", "the", "hypothetical", "damage", "from", "a", "fan", "that", "sized", "radioactive", "dirty", "bomb", "."], ["remember", "ali", "abbas", "who", "lost", "both", "his", "arms", ",", "his", "home", ",", "many", "of", "his", "relatives", "in", "a", "bombing", "on", "baghdad", "?"], ["no", "telling", "how", "long", "the", "osbourne", "phenomenon", "may", "last", "but", "the", "family", "'s", "loyal", "following", "will", "always", "be", "screaming", "for", "more", "."], ["considering", "how", "much", "publicity", "was", "circulating", "about", "this", "book", ",", "there", "is", "injunctions", "about", "publishing", "anything", "or", "talking", "about", "the", "plot", "."], ["on", "behal", "of", "republican", "candidates", "and", "i", "tend", "to", "do", "a", "lot", "of", "campaigning", "in", "the", "next", "year", "for", "the", "president", "."]], "ner": [[[5, 5, "PER"], [15, 15, "GPE"], [1, 1, "PER"]], [[23, 24, "ORG"], [26, 26, "ORG"], [30, 30, "LOC"], [33, 33, "GPE"]], [[63, 64, "LOC"], [58, 58, "PER"], [61, 61, "LOC"], [45, 45, "PER"], [48, 48, "PER"]], [[70, 70, "VEH"], [82, 83, "PER"], [85, 85, "PER"], [86, 86, "ORG"]], [[93, 93, "VEH"]], [[114, 114, "PER"], [126, 126, "PER"], [130, 130, "GPE"]], [[132, 132, "PER"], [135, 135, "PER"]], [[154, 154, "PER"], [163, 163, "PER"], [159, 159, "PER"], [169, 169, "PER"], [173, 173, "PER"]], [[195, 195, "PER"], [191, 191, "PER"], [181, 181, "PER"], [180, 180, "ORG"], [184, 184, "WEA"], [188, 188, "WEA"], [193, 193, "WEA"], [196, 196, "WEA"]], [[218, 218, "GPE"], [204, 204, "PER"], [209, 209, "LOC"], [199, 199, "PER"], [206, 206, "PER"], [216, 216, "PER"], [212, 214, "VEH"]], [[230, 230, "GPE"], [226, 226, "PER"], [231, 231, "PER"], [232, 232, "PER"], [239, 240, "GPE"]], [[245, 245, "VEH"], [247, 247, "VEH"], [262, 262, "WEA"]], [[265, 266, "PER"], [267, 267, "PER"], [270, 270, "PER"], [273, 273, "PER"], [278, 278, "PER"], [279, 279, "PER"], [274, 274, "FAC"], [276, 276, "PER"], [284, 284, "GPE"]], [[300, 300, "PER"], [291, 291, "PER"], [297, 297, "PER"]], [], [[350, 350, "PER"], [334, 334, "PER"], [333, 333, "ORG"], [336, 336, "PER"]]], "relations": [[], [], [[61, 61, 63, 64, "PART-WHOLE"], [58, 58, 61, 61, "PHYS"]], [], [], [[126, 126, 130, 130, "PHYS"]], [], [], [[181, 181, 180, 180, "ORG-AFF"], [195, 195, 196, 196, "ART"]], [[199, 199, 212, 214, "ART"], [216, 216, 218, 218, "GEN-AFF"]], [[231, 231, 230, 230, "GEN-AFF"]], [], [[265, 266, 284, 284, "PHYS"], [273, 273, 274, 274, "ART"], [278, 278, 279, 279, "PER-SOC"]], [], [], []], "_orig_doc_key": ["CNN_ENG_20030602_102826.13", "CNN_ENG_20030418_163834.14", "CNN_ENG_20030529_130011.6", "CNN_ENG_20030426_160621.0", "CNN_ENG_20030426_160621.0", "CNN_ENG_20030331_193655.14", "CNN_ENG_20030422_213527.4", "CNN_ENG_20030617_173115.22", "CNN_ENG_20030508_210555.5", "CNN_ENG_20030408_083034.11", "CNN_ENG_20030408_083034.11", "CNN_ENG_20030512_170454.13", "CNN_ENG_20030429_143706.14", "CNN_ENG_20030607_173310.4", "CNN_ENG_20030617_105836.4", "CNN_ENG_20030624_065843.24"], "_orig_sent_ix": [13, 15, 16, 7, 9, 6, 6, 17, 10, 20, 21, 4, 4, 53, 14, 17]}
3 | {"doc_key": "99221887_train", "dataset": "genia", "sentences": [["Suppressive", "effects", "of", "anti", "-", "inflammatory", "agents", "on", "human", "endothelial", "cell", "activation", "and", "induction", "of", "heat", "shock", "proteins", "."], ["BACKGROUND", ":", "Studies", "from", "our", "laboratory", "have", "shown", "that", "the", "earliest", "stages", "of", "atherosclerosis", "may", "be", "mediated", "by", "an", "autoimmune", "reaction", "against", "heat", "shock", "protein", "60", "(", "Hsp60", ")", "."], ["The", "interactions", "of", "Hsp60", "-", "specific", "T", "cells", "with", "arterial", "endothelial", "cells", "(", "EC", ")", "require", "expression", "of", "both", "Hsp60", "and", "certain", "adhesion", "molecules", "shown", "to", "be", "induced", "simultaneously", "in", "EC", "by", "mechanical", "and", "other", "types", "of", "stress", "."], ["Recently", ",", "it", "was", "shown", "that", "suppression", "of", "T", "cell", "-", "mediated", "immune", "responses", "by", "cyclosporin", "A", "(", "CyA", ")", "enhanced", "atherosclerotic", "lesion", "formation", "in", "mice", "."], ["In", "contrast", ",", "aspirin", "was", "found", "to", "lower", "the", "risk", "of", "myocardial", "infarction", "in", "men", "."], ["These", "conflicting", "observations", "may", "be", "due", "to", "different", "effects", "of", "anti", "-", "inflammatory", "agents", "on", "adhesion", "molecule", "and", "Hsp", "expression", "in", "EC", ",", "respectively", "."], ["MATERIAL", "AND", "METHODS", ":", "In", "the", "present", "study", ",", "we", "analyzed", "the", "effects", "of", "CyA", ",", "aspirin", ",", "and", "indomethacin", "on", "T", "cell", "proliferation", "using", "a", "proliferation", "assay", "."], ["To", "explore", "the", "expression", "of", "adhesion", "molecules", ",", "monocyte", "chemoattractant", "protein", "-", "1", "(", "MCP", "-", "1", ")", ",", "and", "Hsp60", "in", "human", "umbilical", "vein", "endothelial", "cells", "(", "HUVECs", ")", ",", "Northern", "blot", "analyses", "were", "used", "."], ["To", "examine", "the", "activation", "status", "of", "the", "transcription", "factors", "nuclear", "factor", "kappaB", "(", "NF", "-", "kappaB", ")", "and", "heat", "shock", "factor", "-", "1", "(", "HSF", "-", "1", ")", ",", "electrophoretic", "mobility", "shift", "assays", "were", "performed", "."], ["RESULTS", ":", "With", "the", "exception", "of", "indomethacin", ",", "the", "used", "immunosuppressive", "and", "anti", "-", "inflammatory", "agents", "significantly", "inhibited", "T", "cell", "proliferation", "in", "response", "to", "influenza", "virus", "antigen", "in", "a", "dose", "-", "dependent", "manner", "."], ["Interestingly", ",", "CyA", "and", "indomethacin", "did", "not", "suppress", "tumor", "necrosis", "factor", "-", "alpha", "(", "TNF", "-", "alpha", ")", "-", "induced", "adhesion", "molecule", "expression", "on", "HUVECs", ",", "whereas", "aspirin", "had", "an", "inhibitory", "effect", "."], ["These", "observations", "correlated", "with", "the", "modulation", "of", "NF", "-", "kappaB", "activity", "in", "EC", "."], ["All", "agents", "tested", "induced", "expression", "of", "Hsp60", "6", "hr", "after", "application", "."], ["In", "addition", ",", "aspirin", "and", "indomethacin", ",", "but", "not", "CyA", ",", "induced", "Hsp70", "expression", "in", "HUVECs", "that", "correlated", "with", "induction", "of", "HSF", "-", "1", "activity", "."], ["CONCLUSION", ":", "Our", "results", "show", "that", "the", "tested", "agents", "(", "except", "indomethacin", ")", "are", "inhibitors", "of", "the", "T", "cell", "-", "mediated", "immune", "response", ",", "as", "expected", ",", "that", "aspirin", "is", "an", "effective", "suppressor", "of", "adhesion", "molecule", "expression", ",", "and", "that", "all", "three", "agents", "can", "induce", "Hsp60", "in", "HUVECs", "."], ["These", "data", "provide", "the", "molecular", "basis", "for", "the", "notion", "that", "(", "1", ")", "part", "of", "the", "anti", "-", "atherogenic", "effect", "of", "aspirin", "may", "be", "due", "to", "the", "prevention", "of", "the", "adhesion", "of", "sensitized", "T", "cells", "to", "stressed", "EC", ";", "(", "2", ")", "that", "part", "of", "the", "atherosclerosis", "-", "promoting", "effect", "of", "CyA", "may", "be", "due", "to", "its", "potential", "as", "an", "inducer", "of", "Hsp60", "expression", "and", "its", "inability", "to", "down", "-", "regulate", "adhesion", "molecule", "expression", "on", "EC", ";", "and", "(", "3", ")", "that", "down", "-", "regulation", "of", "MCP", "-", "1", "expression", "by", "aspirin", "may", "result", "in", "decreased", "recruitment", "of", "monocytes", "into", "the", "arterial", "intima", "beneath", "stressed", "EC", "."]], "ner": [[[8, 10, "cell_type"], [15, 17, "protein"]], [[41, 44, "protein"], [46, 46, "protein"]], [[52, 52, "protein"], [52, 56, "cell_line"], [59, 60, "cell_type"], [62, 62, "cell_type"], [68, 68, "protein"], [71, 72, "protein"], [79, 79, "cell_type"]], [], [], [[152, 152, "cell_type"]], [], [[190, 191, "protein"], [193, 197, "protein"], [199, 201, "protein"], [205, 205, "protein"], [210, 211, "cell_type"], [207, 211, "cell_type"], [213, 213, "cell_type"]], [[229, 230, "protein"], [231, 233, "protein"], [235, 237, "protein"], [240, 244, "protein"], [246, 248, "protein"]], [[282, 284, "protein"]], [[300, 304, "protein"], [306, 308, "protein"], [316, 316, "cell_type"]], [[332, 334, "protein"], [337, 337, "cell_type"]], [[345, 345, "protein"]], [[363, 363, "protein"], [366, 366, "cell_type"], [372, 374, "protein"]], [[411, 412, "protein"], [422, 422, "protein"], [424, 424, "cell_type"]], [[458, 460, "cell_type"], [463, 463, "cell_type"], [488, 488, "protein"], [497, 498, "protein"], [501, 501, "cell_type"], [512, 514, "protein"], [524, 524, "cell_type"], [531, 531, "cell_type"]]], "relations": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "clusters": [[[3, 6], [141, 144]], [[41, 47], [345, 345]], [[103, 107], [170, 170], [294, 294], [360, 360], [477, 477], [482, 482], [491, 491]], [[118, 118], [172, 172], [319, 319], [354, 354], [405, 405], [447, 447], [517, 517]], [[170, 175], [339, 341], [417, 419]], [[175, 175], [264, 264], [296, 296], [356, 356]], [[177, 179], [276, 278]], [[190, 191], [193, 214]], [[205, 214], [422, 424]], [[207, 214], [316, 316], [366, 366], [424, 424]], [[343, 345], [488, 489]], [[363, 366], [367, 367]], [[433, 434], [435, 435]]]}
4 | {"doc_key": "INTERSPEECH_2008_21_abs", "dataset": "scierc", "sentences": [["This", "paper", "presents", "a", "research", "on", "the", "Czech", "talking", "head", "system", "."], ["It", "gives", "an", "overview", "of", "methods", "used", "for", "visual", "speech", "animation", ",", "parameterization", "of", "a", "human", "face", "and", "a", "tongue", ",", "necessary", "data", "sources", "and", "a", "synthesis", "method", "."], ["A", "3D", "animation", "model", "is", "used", "for", "a", "pseudo-muscular", "animation", "schema", "to", "create", "such", "animation", "of", "visual", "speech", "which", "is", "usable", "for", "a", "lipreading", "."], ["An", "extension", "of", "animation", "schema", "is", "presented", "to", "reach", "more", "precise", "deformations", "mainly", "in", "a", "lip", "area", "."], ["Furthermore", ",", "a", "problem", "of", "forming", "articulatory", "trajectories", "is", "formulated", "to", "solve", "labial", "coarticulation", "effects", "."], ["It", "is", "used", "for", "the", "synthesis", "method", "based", "on", "a", "selection", "of", "articulatory", "targets", "and", "interpolation", "technique", "."]], "ner": [[[7, 10, "Task"]], [[17, 17, "Generic"], [20, 22, "Task"], [38, 39, "Method"]], [[42, 44, "Method"], [49, 51, "Method"], [55, 58, "Task"], [64, 64, "Task"]], [[69, 70, "Method"]], [[89, 91, "Task"], [96, 98, "OtherScientificTerm"]], [[100, 100, "Generic"], [105, 106, "Method"], [110, 113, "OtherScientificTerm"], [115, 116, "Method"]]], "relations": [[], [[17, 17, 20, 22, "USED-FOR"]], [[42, 44, 49, 51, "USED-FOR"], [49, 51, 55, 58, "USED-FOR"], [55, 58, 64, 64, "USED-FOR"]], [], [[89, 91, 96, 98, "USED-FOR"]], [[100, 100, 105, 106, "USED-FOR"], [110, 113, 100, 100, "USED-FOR"], [110, 113, 115, 116, "CONJUNCTION"], [115, 116, 100, 100, "USED-FOR"]]], "clusters": [[[20, 22], [55, 58]], [[89, 91], [100, 100]]]}
5 | 


--------------------------------------------------------------------------------
/dygie/tests/fixtures/scierc_article.json:
--------------------------------------------------------------------------------
1 | {"clusters": [[[62, 64], [90, 91], [96, 98], [112, 114]], [[6, 6], [170, 170]], [[81, 82], [126, 127]], [[129, 131], [142, 142]]], "sentences": [["In", "this", "paper", ",", "a", "novel", "method", "to", "learn", "the", "intrinsic", "object", "structure", "for", "robust", "visual", "tracking", "is", "proposed", "."], ["The", "basic", "assumption", "is", "that", "the", "parameterized", "object", "state", "lies", "on", "a", "low", "dimensional", "manifold", "and", "can", "be", "learned", "from", "training", "data", "."], ["Based", "on", "this", "assumption", ",", "firstly", "we", "derived", "the", "dimensionality", "reduction", "and", "density", "estimation", "algorithm", "for", "unsupervised", "learning", "of", "object", "intrinsic", "representation", ",", "the", "obtained", "non-rigid", "part", "of", "object", "state", "reduces", "even", "to", "2", "dimensions", "."], ["Secondly", "the", "dynamical", "model", "is", "derived", "and", "trained", "based", "on", "this", "intrinsic", "representation", "."], ["Thirdly", "the", "learned", "intrinsic", "object", "structure", "is", "integrated", "into", "a", "particle-filter", "style", "tracker", "."], ["We", "will", "show", "that", "this", "intrinsic", "object", "representation", "has", "some", "interesting", "properties", "and", "based", "on", "which", "the", "newly", "derived", "dynamical", "model", "makes", "particle-filter", "style", "tracker", "more", "robust", "and", "reliable", "."], ["Experiments", "show", "that", "the", "learned", "tracker", "performs", "much", "better", "than", "existing", "trackers", "on", "the", "tracking", "of", "complex", "non-rigid", "motions", "such", "as", "fish", "twisting", "with", "self-occlusion", "and", "large", "inter-frame", "lip", "motion", "."], ["The", "proposed", "method", "also", "has", "the", "potential", "to", "solve", "other", "type", "of", "tracking", "problems", "."]], "ner": [[[6, 6, "Method"], [10, 12, "OtherScientificTerm"], [14, 16, "Task"]], [[26, 28, "OtherScientificTerm"], [32, 34, "OtherScientificTerm"]], [[52, 57, "Method"], [59, 64, "Task"], [62, 64, "Method"], [68, 72, "OtherScientificTerm"]], [[81, 82, "Method"], [90, 91, "Method"]], [[96, 98, "OtherScientificTerm"], [103, 105, "Method"]], [[112, 114, "Method"], [126, 127, "Method"], [129, 131, "Method"]], [[142, 142, "Generic"], [148, 148, "Generic"], [151, 155, "Task"], [153, 155, "OtherScientificTerm"], [158, 159, "OtherScientificTerm"], [161, 161, "OtherScientificTerm"], [164, 166, "OtherScientificTerm"]], [[170, 170, "Generic"], [180, 181, "Task"]]], "relations": [[[6, 6, 10, 12, "USED-FOR"], [10, 12, 14, 16, "USED-FOR"]], [[32, 34, 26, 28, "FEATURE-OF"]], [[52, 57, 59, 64, "USED-FOR"]], [[90, 91, 81, 82, "USED-FOR"]], [[96, 98, 103, 105, "PART-OF"]], [[126, 127, 129, 131, "USED-FOR"]], [[142, 142, 148, 148, "COMPARE"], [142, 142, 151, 155, "USED-FOR"], [148, 148, 151, 155, "USED-FOR"], [158, 159, 153, 155, "HYPONYM-OF"], [161, 161, 158, 159, "FEATURE-OF"], [161, 161, 164, 166, "CONJUNCTION"], [164, 166, 158, 159, "FEATURE-OF"]], [[170, 170, 180, 181, "USED-FOR"]]], "doc_key": "CVPR_2003_18_abs", "dataset": "scierc"}
2 | 


--------------------------------------------------------------------------------
/dygie/tests/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/dygie/tests/models/__init__.py


--------------------------------------------------------------------------------
/dygie/tests/models/coref_test.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Unit tests for the coref module.
 3 | """
 4 | 
 5 | import json
 6 | 
 7 | from allennlp.common.testing import ModelTestCase
 8 | from allennlp.nn import util
 9 | 
10 | from dygie.models import DyGIE
11 | from dygie.data import IEJsonReader
12 | 
13 | 
14 | class TestCoref(ModelTestCase):
15 |     def setUp(self):
16 |         # TODO(dwadden) create smaller model for testing.
17 |         super(TestCoref, self).setUp()
18 |         self.config_file = "tests/fixtures/dygie_test.jsonnet"
19 |         self.data_file = "tests/fixtures/scierc_article.json"
20 |         self.set_up_model(self.config_file, self.data_file)
21 | 
22 |     def get_raw_data(self):
23 |         lines = []
24 |         with open(self.data_file, "r") as f:
25 |             for line in f:
26 |                 lines.append(json.loads(line))
27 |         return lines
28 | 
29 |     def test_coref_make_evaluation_metadata(self):
30 |         """
31 |         To compute coreference evaluation metrics, the evaluator needs access to the list of
32 |         coreference clusters, given in the same form as the original input. I check to make sure
33 |         that the clusters I pass in are indeed equivalent to the original input.
34 |         """
35 |         # Pull together the relevant training data.
36 |         data = self.dataset.as_tensor_dict()
37 |         metadata = data["metadata"]
38 |         text_mask = util.get_text_field_mask(data["text"]).float()
39 |         sentence_lengths = text_mask.sum(dim=1).long()
40 |         # Make sure the sentence lengths from the text mask are the same as the number of tokens.
41 |         assert sentence_lengths.tolist() == [len(entry["sentence"]) for entry in metadata]
42 | 
43 |         # Convert metadata back to form used for coref evaluation
44 |         evaluation_metadata = self.model._coref._make_evaluation_metadata(metadata, sentence_lengths)
45 |         clusters_metadata = evaluation_metadata[0]["clusters"]
46 |         # Convert from tuples to list to facilitate comparison.
47 |         clusters_metadata = [[list(span) for span in cluster] for cluster in clusters_metadata]
48 | 
49 |         # Get the raw data, and sort to match the metadata.
50 |         clusters_raw = self.get_raw_data()[0]["clusters"]
51 |         clusters_raw = sorted(clusters_raw, key=lambda entry: entry[0][0])
52 | 
53 |         # Compare the raw data to the converted metadata I have.
54 |         assert clusters_metadata == clusters_raw
55 | 


--------------------------------------------------------------------------------
/dygie/tests/models/dygie_test.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Unit tests for the dygie.
 3 | """
 4 | 
 5 | from allennlp.common.testing import ModelTestCase
 6 | 
 7 | # TODO(dwadden) Figure out why tests break on CUDA.
 8 | 
 9 | 
10 | class TestDyGIE(ModelTestCase):
11 |     def setUp(self):
12 |         # TODO(dwadden) create smaller model for testing.
13 |         super(TestDyGIE, self).setUp()
14 |         self.config_file = "tests/fixtures/dygie_test_full.jsonnet"
15 |         self.data_file = "tests/fixtures/scierc_article.json"
16 |         self.set_up_model(self.config_file, self.data_file)
17 | 
18 |     def test_dygie_model_can_train_save_and_load(self):
19 |         self.ensure_model_can_train_save_and_load(self.param_file)
20 | 


--------------------------------------------------------------------------------
/dygie/tests/models/multi_dataset_test.sh:
--------------------------------------------------------------------------------
 1 | # This isn't a formal test; just a training script to make sure that
 2 | # multi-dataset training works for a single epoch without breaking.
 3 | 
 4 | # Usage (from root of project):
 5 | # bash dygie/tests/models/multi_dataset_test.sh
 6 | 
 7 | tmpdir=dygie/tests/tmp
 8 | 
 9 | if [[ -d $tmpdir ]]
10 | then
11 |     rm -r $tmpdir
12 | fi
13 | 
14 | mkdir -p $tmpdir
15 | 
16 | allennlp train "training_config/multi_dataset.jsonnet" \
17 |     --serialization-dir $tmpdir \
18 |     --include-package dygie
19 | 
20 | 
21 | # Remove tmpdir once training has finished.
22 | rm -r $tmpdir
23 | 


--------------------------------------------------------------------------------
/dygie/tests/models/relation_test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Unit tests for the relation module.
  3 | 
  4 | This module wasn't matching TensorFlow performance so it's tested pretty heavily.
  5 | """
  6 | 
  7 | import torch
  8 | 
  9 | from allennlp.common.testing import ModelTestCase
 10 | 
 11 | # Needed to get the test framework to see the dataset readers and models.
 12 | from dygie import models
 13 | from dygie import data
 14 | 
 15 | 
 16 | class TestRelation(ModelTestCase):
 17 |     def setUp(self):
 18 |         super(TestRelation, self).setUp()
 19 |         self.config_file = "tests/fixtures/dygie_test.jsonnet"
 20 |         self.data_file = "tests/fixtures/scierc_article.json"
 21 |         self.set_up_model(self.config_file, self.data_file)
 22 | 
 23 |     def test_decode(self):
 24 |         def convert(x):
 25 |             return self.model.vocab.get_token_from_index(x, namespace="relation_labels")
 26 | 
 27 |         top_spans = torch.tensor([[[0, 2], [1, 3], [1, 3]],
 28 |                                   [[1, 6], [2, 4], [3, 8]],
 29 |                                   [[0, 1], [0, 1], [0, 1]]])
 30 |         predicted_relations = torch.tensor([[[-1, -1, 1],
 31 |                                              [1, -1, -1],
 32 |                                              [-1, 0, -1]],
 33 |                                             [[-1, -1, -1],
 34 |                                              [1, -1, 2],
 35 |                                              [-1, -1, 4]],
 36 |                                             [[1, 1, 2],
 37 |                                              [1, 3, 2],
 38 |                                              [-1, 2, 1]]])
 39 |         num_spans_to_keep = torch.tensor([2, 3, 0])
 40 |         predict_dict = {"top_spans": top_spans,
 41 |                         "predicted_relations": predicted_relations,
 42 |                         "num_spans_to_keep": num_spans_to_keep}
 43 |         decoded = self.model._relation.decode(predict_dict)
 44 |         expected = [{((1, 3), (0, 2)): convert(1)},
 45 |                     {((2, 4), (1, 6)): convert(1),
 46 |                      ((2, 4), (3, 8)): convert(2),
 47 |                      ((3, 8), (3, 8)): convert(4)},
 48 |                     {}]
 49 |         assert expected == decoded["decoded_relations_dict"]
 50 | 
 51 |     def test_compute_span_pair_embeddings(self):
 52 |         top_span_embeddings = torch.randn([3, 51, 1160])  # Make up random embeddings.
 53 | 
 54 |         embeddings = self.model._relation._compute_span_pair_embeddings(top_span_embeddings)
 55 | 
 56 |         batch_ix = 1
 57 |         ix1 = 22
 58 |         ix2 = 43
 59 |         emb1 = top_span_embeddings[batch_ix, ix1]
 60 |         emb2 = top_span_embeddings[batch_ix, ix2]
 61 |         emb_prod = emb1 * emb2
 62 |         emb = torch.cat([emb1, emb2, emb_prod])
 63 | 
 64 |         assert torch.allclose(emb, embeddings[batch_ix, ix1, ix2])
 65 | 
 66 |     def test_compute_relation_scores(self):
 67 |         self.model.eval()       # Need eval on in order to reproduce.
 68 |         relation = self.model._relation
 69 |         pairwise_embeddings = torch.randn(3, 46, 46, 3480, requires_grad=True)
 70 |         top_span_mention_scores = torch.randn(3, 46, 1, requires_grad=True)
 71 | 
 72 |         scores = relation._compute_relation_scores(pairwise_embeddings, top_span_mention_scores)
 73 | 
 74 |         batch_ix = 0
 75 |         ix1 = 31
 76 |         ix2 = 4
 77 | 
 78 |         score = relation._relation_scorer(
 79 |             relation._relation_feedforward(pairwise_embeddings[batch_ix, ix1, ix2].unsqueeze(0)))
 80 |         score += top_span_mention_scores[batch_ix, ix1] + top_span_mention_scores[batch_ix, ix2]
 81 |         score = torch.cat([torch.tensor([0.0]), score.squeeze()])
 82 | 
 83 |         assert torch.allclose(scores[batch_ix, ix1, ix2], score)
 84 | 
 85 |     def test_get_pruned_gold_relations(self):
 86 |         # Getting the pruned gold labels should add one to the input relation labels, then set all
 87 |         # the masked entries to -1.
 88 |         relation_labels = torch.tensor([[[-1, -1, 2, 3],
 89 |                                          [1, -1, -1, 0],
 90 |                                          [-1, 3, -1, 1],
 91 |                                          [0, -1, -1, -1]],
 92 |                                         [[0, 2, 1, 2],
 93 |                                          [-1, -1, -1, -1],
 94 |                                          [3, 0, -1, -1],
 95 |                                          [-1, 0, 1, -1]]])
 96 |         top_span_indices = torch.tensor([[0, 1, 3],
 97 |                                          [0, 2, 2]])
 98 |         top_span_masks = torch.tensor([[1, 1, 1],
 99 |                                        [1, 1, 0]]).unsqueeze(-1)
100 | 
101 |         labels = self.model._relation._get_pruned_gold_relations(
102 |             relation_labels, top_span_indices, top_span_masks)
103 | 
104 |         expected_labels = torch.tensor([[[0, 0, 4],
105 |                                          [2, 0, 1],
106 |                                          [1, 0, 0]],
107 |                                         [[1, 2, -1],
108 |                                          [4, 0, -1],
109 |                                          [-1, -1, -1]]])
110 | 
111 |         assert torch.equal(labels, expected_labels)
112 | 
113 |     def test_cross_entropy_ignore_index(self):
114 |         # Make sure that the cross entropy loss is ignoring entries whose gold label is -1, which
115 |         # corresponds, to masked-out entries.
116 |         relation_scores = torch.randn(2, 3, 3, self.model._relation._n_labels + 1)
117 |         gold_relations = torch.tensor([[[0, 0, 4],
118 |                                         [2, 0, 1],
119 |                                         [1, 0, 0]],
120 |                                        [[1, 2, -1],
121 |                                         [4, 0, -1],
122 |                                         [-1, -1, -1]]])
123 | 
124 |         # Calculate the loss with a loop over entries.
125 |         total_loss = torch.tensor([0.0])
126 |         for fold in [0, 1]:
127 |             for i in range(3):
128 |                 for j in range(3):
129 |                     scores_entry = relation_scores[fold, i, j].unsqueeze(0)
130 |                     gold_entry = gold_relations[fold, i, j].unsqueeze(0)
131 |                     if gold_entry >= 0:
132 |                         loss_entry = self.model._relation._loss(scores_entry, gold_entry)
133 |                         total_loss += loss_entry
134 | 
135 |         model_loss = self.model._relation._get_cross_entropy_loss(relation_scores, gold_relations)
136 |         assert torch.allclose(total_loss, model_loss)
137 | 


--------------------------------------------------------------------------------
/dygie/training/event_metrics.py:
--------------------------------------------------------------------------------
  1 | from overrides import overrides
  2 | from collections import Counter
  3 | 
  4 | from allennlp.training.metrics.metric import Metric
  5 | 
  6 | from dygie.training.f1 import compute_f1
  7 | 
  8 | 
  9 | def _invert_arguments(arguments, triggers):
 10 |     """
 11 |     For scoring the argument, we don't need the trigger spans to match exactly. We just need the
 12 |     trigger label corresponding to the predicted trigger span to be correct.
 13 |     """
 14 |     # Can't use a dict because multiple triggers could share the same argument.
 15 |     inverted = set()
 16 |     for k, v in arguments.items():
 17 |         if k[0] in triggers:  # If it's not, the trigger this arg points to is null. TODO(dwadden) check.
 18 |             trigger_label = triggers[k[0]]
 19 |             to_append = (k[1], trigger_label, v)
 20 |             inverted.add(to_append)
 21 | 
 22 |     return inverted
 23 | 
 24 | 
 25 | # TODO(dwadden) Clean this up.
 26 | class EventMetrics(Metric):
 27 |     """
 28 |     Computes precision, recall, and micro-averaged F1 for triggers and arguments.
 29 |     """
 30 |     def __init__(self):
 31 |         self.reset()
 32 | 
 33 |     @overrides
 34 |     def __call__(self, predicted_events_list, metadata_list):
 35 |         for predicted_events, metadata in zip(predicted_events_list, metadata_list):
 36 |             # Trigger scoring.
 37 |             predicted_triggers = predicted_events["trigger_dict"]
 38 |             gold_triggers = metadata.events.trigger_dict
 39 |             self._score_triggers(predicted_triggers, gold_triggers)
 40 | 
 41 |             # Argument scoring.
 42 |             predicted_arguments = predicted_events["argument_dict"]
 43 |             gold_arguments = metadata.events.argument_dict
 44 |             self._score_arguments(
 45 |                 predicted_triggers, gold_triggers, predicted_arguments, gold_arguments)
 46 | 
 47 |     def _score_triggers(self, predicted_triggers, gold_triggers):
 48 |         self._gold_triggers += len(gold_triggers)
 49 |         self._predicted_triggers += len(predicted_triggers)
 50 |         for token_ix, pred in predicted_triggers.items():
 51 |             label = pred[0]
 52 |             # Check whether the offsets match, and whether the labels match.
 53 |             if token_ix in gold_triggers:
 54 |                 self._matched_trigger_ids += 1
 55 |                 if gold_triggers[token_ix] == label:
 56 |                     self._matched_trigger_classes += 1
 57 | 
 58 |     def _score_arguments(self, predicted_triggers, gold_triggers, predicted_arguments, gold_arguments):
 59 |         # Note that the index of the trigger doesn't actually need to be correct to get full credit;
 60 |         # the event type and event role need to be correct (see Sec. 3 of paper).
 61 |         def format(arg_dict, trigger_dict, prediction=False):
 62 |             # Make it a list of [index, event_type, arg_label].
 63 |             res = []
 64 |             for (trigger_ix, arg_ix), label in arg_dict.items():
 65 |                 # If it doesn't match a trigger, don't predict it (enforced in decoding).
 66 |                 if trigger_ix not in trigger_dict:
 67 |                     continue
 68 |                 event_type = trigger_dict[trigger_ix]
 69 |                 # TODO(dwadden) This is clunky; it's because predictions have confidence scores.
 70 |                 if prediction:
 71 |                     event_type = event_type[0]
 72 |                     label = label[0]
 73 |                 res.append((arg_ix, event_type, label))
 74 |             return res
 75 | 
 76 |         formatted_gold_arguments = format(gold_arguments, gold_triggers, prediction=False)
 77 |         formatted_predicted_arguments = format(predicted_arguments, predicted_triggers, prediction=True)
 78 | 
 79 |         self._gold_arguments += len(formatted_gold_arguments)
 80 |         self._predicted_arguments += len(formatted_predicted_arguments)
 81 | 
 82 |         # Go through each predicted arg and look for a match.
 83 |         for entry in formatted_predicted_arguments:
 84 |             # No credit if not associated with a predicted trigger.
 85 |             class_match = int(any([entry == gold for gold in formatted_gold_arguments]))
 86 |             id_match = int(any([entry[:2] == gold[:2] for gold in formatted_gold_arguments]))
 87 | 
 88 |             self._matched_argument_classes += class_match
 89 |             self._matched_argument_ids += id_match
 90 | 
 91 | 
 92 |     @overrides
 93 |     def get_metric(self, reset=False):
 94 |         res = {}
 95 | 
 96 |         # Triggers
 97 |         res["trig_id_precision"], res["trig_id_recall"], res["trig_id_f1"] = compute_f1(
 98 |             self._predicted_triggers, self._gold_triggers, self._matched_trigger_ids)
 99 |         res["trig_class_precision"], res["trig_class_recall"], res["trig_class_f1"] = compute_f1(
100 |             self._predicted_triggers, self._gold_triggers, self._matched_trigger_classes)
101 | 
102 |         # Arguments
103 |         res["arg_id_precision"], res["arg_id_recall"], res["arg_id_f1"] = compute_f1(
104 |             self._predicted_arguments, self._gold_arguments, self._matched_argument_ids)
105 |         res["arg_class_precision"], res["arg_class_recall"], res["arg_class_f1"] = compute_f1(
106 |             self._predicted_arguments, self._gold_arguments, self._matched_argument_classes)
107 | 
108 |         # Reset counts if at end of epoch.
109 |         if reset:
110 |             self.reset()
111 | 
112 |         return res
113 | 
114 |     @overrides
115 |     def reset(self):
116 |         self._gold_triggers = 0
117 |         self._predicted_triggers = 0
118 |         self._matched_trigger_ids = 0
119 |         self._matched_trigger_classes = 0
120 |         self._gold_arguments = 0
121 |         self._predicted_arguments = 0
122 |         self._matched_argument_ids = 0
123 |         self._matched_argument_classes = 0
124 | 
125 | 
126 | class ArgumentStats(Metric):
127 |     """
128 |     Compute the fraction of predicted event arguments that are associated with multiple triggers.
129 |     """
130 |     def __init__(self):
131 |         self.reset()
132 | 
133 |     @overrides
134 |     def __call__(self, predicted_events_list):
135 |         for predicted_events in predicted_events_list:
136 |             predicted_arguments = _invert_arguments(predicted_events["argument_dict"],
137 |                                                     predicted_events["trigger_dict"])
138 |             # Count how many times each span appears as an argument.
139 |             span_counts = Counter()
140 |             for prediction in predicted_arguments:
141 |                 span_counts[prediction[0]] += 1
142 |             # Count how many spans appear more than once.
143 |             repeated = {k: v for k, v in span_counts.items() if v > 1}
144 |             self._total_arguments += len(span_counts)
145 |             self._repeated_arguments += len(repeated)
146 | 
147 |     @overrides
148 |     def get_metric(self, reset=False):
149 |         # Fraction of event arguments associated with multiple triggers.
150 |         args_multiple = (self._repeated_arguments / self._total_arguments
151 |                          if self._total_arguments
152 |                          else 0)
153 | 
154 |         if reset:
155 |             self.reset()
156 | 
157 |         res = dict(args_multiple=args_multiple)
158 |         return res
159 | 
160 |     @overrides
161 |     def reset(self):
162 |         self._total_arguments = 0
163 |         self._repeated_arguments = 0
164 | 


--------------------------------------------------------------------------------
/dygie/training/f1.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Function to compute F1 scores.
 3 | """
 4 | 
 5 | 
 6 | def safe_div(num, denom):
 7 |     if denom > 0:
 8 |         return num / denom
 9 |     else:
10 |         return 0
11 | 
12 | 
13 | def compute_f1(predicted, gold, matched):
14 |     precision = safe_div(matched, predicted)
15 |     recall = safe_div(matched, gold)
16 |     f1 = safe_div(2 * precision * recall, precision + recall)
17 |     return precision, recall, f1
18 | 


--------------------------------------------------------------------------------
/dygie/training/ner_metrics.py:
--------------------------------------------------------------------------------
 1 | from overrides import overrides
 2 | from typing import Optional
 3 | 
 4 | import torch
 5 | 
 6 | from allennlp.training.metrics.metric import Metric
 7 | 
 8 | from dygie.training.f1 import compute_f1
 9 | 
10 | # TODO(dwadden) Need to use the decoded predictions so that we catch the gold examples longer than
11 | # the span boundary.
12 | 
13 | class NERMetrics(Metric):
14 |     """
15 |     Computes precision, recall, and micro-averaged F1 from a list of predicted and gold labels.
16 |     """
17 |     def __init__(self, number_of_classes: int, none_label: int=0):
18 |         self.number_of_classes = number_of_classes
19 |         self.none_label = none_label
20 |         self.reset()
21 | 
22 |     @overrides
23 |     def __call__(self,
24 |                  predictions: torch.Tensor,
25 |                  gold_labels: torch.Tensor,
26 |                  mask: Optional[torch.Tensor] = None):
27 |         predictions = predictions.cpu()
28 |         gold_labels = gold_labels.cpu()
29 |         mask = mask.cpu()
30 |         for i in range(self.number_of_classes):
31 |             if i == self.none_label:
32 |                 continue
33 |             self._true_positives += ((predictions==i)*(gold_labels==i)*mask.bool()).sum().item()
34 |             self._false_positives += ((predictions==i)*(gold_labels!=i)*mask.bool()).sum().item()
35 |             self._true_negatives += ((predictions!=i)*(gold_labels!=i)*mask.bool()).sum().item()
36 |             self._false_negatives += ((predictions!=i)*(gold_labels==i)*mask.bool()).sum().item()
37 | 
38 |     @overrides
39 |     def get_metric(self, reset=False):
40 |         """
41 |         Returns
42 |         -------
43 |         A tuple of the following metrics based on the accumulated count statistics:
44 |         precision : float
45 |         recall : float
46 |         f1-measure : float
47 |         """
48 |         predicted = self._true_positives + self._false_positives
49 |         gold = self._true_positives + self._false_negatives
50 |         matched = self._true_positives
51 |         precision, recall, f1_measure = compute_f1(predicted, gold, matched)
52 | 
53 |         # Reset counts if at end of epoch.
54 |         if reset:
55 |             self.reset()
56 | 
57 |         return precision, recall, f1_measure
58 | 
59 |     @overrides
60 |     def reset(self):
61 |         self._true_positives = 0
62 |         self._false_positives = 0
63 |         self._true_negatives = 0
64 |         self._false_negatives = 0
65 | 


--------------------------------------------------------------------------------
/dygie/training/relation_metrics.py:
--------------------------------------------------------------------------------
 1 | from overrides import overrides
 2 | 
 3 | from allennlp.training.metrics.metric import Metric
 4 | 
 5 | from dygie.training.f1 import compute_f1
 6 | 
 7 | 
 8 | class RelationMetrics(Metric):
 9 |     """
10 |     Computes precision, recall, and micro-averaged F1 from a list of predicted and gold spans.
11 |     """
12 |     def __init__(self):
13 |         self.reset()
14 | 
15 |     # TODO(dwadden) This requires decoding because the dataset reader gets rid of gold spans wider
16 |     # than the span width. So, I can't just compare the tensor of gold labels to the tensor of
17 |     # predicted labels.
18 |     @overrides
19 |     def __call__(self, predicted_relation_list, metadata_list):
20 |         for predicted_relations, metadata in zip(predicted_relation_list, metadata_list):
21 |             gold_relations = metadata.relation_dict
22 |             self._total_gold += len(gold_relations)
23 |             self._total_predicted += len(predicted_relations)
24 |             for (span_1, span_2), label in predicted_relations.items():
25 |                 ix = (span_1, span_2)
26 |                 if ix in gold_relations and gold_relations[ix] == label:
27 |                     self._total_matched += 1
28 | 
29 |     @overrides
30 |     def get_metric(self, reset=False):
31 |         precision, recall, f1 = compute_f1(self._total_predicted, self._total_gold, self._total_matched)
32 | 
33 |         # Reset counts if at end of epoch.
34 |         if reset:
35 |             self.reset()
36 | 
37 |         return precision, recall, f1
38 | 
39 |     @overrides
40 |     def reset(self):
41 |         self._total_gold = 0
42 |         self._total_predicted = 0
43 |         self._total_matched = 0
44 | 


--------------------------------------------------------------------------------
/models/README.txt:
--------------------------------------------------------------------------------
 1 | Instructions for using the checkpoint for inference:
 2 | 
 3 | Basic Setup (One time activity)
 4 | 
 5 | 1. Clone the DYGIE++ repository from: https://github.com/dwadden/dygiepp. This repositiory is managed by Wadden et al., authors of the paper Entity, Relation, and Event Extraction with Contextualized Span Representations (https://www.aclweb.org/anthology/D19-1585.pdf).
 6 | 
 7 | git clone https://github.com/dwadden/dygiepp.git
 8 | 
 9 | 2. Navigate to the root of repo in your system and use the following commands to setup the conda environment:
10 | 
11 | conda create --name dygiepp python=3.7
12 | pip install -r requirements.txt
13 | conda develop .   # Adds DyGIE to your PYTHONPATH
14 | 
15 | Running Inference on Radiology Reports
16 | 
17 | 3. Activate the conda environment: 
18 | 
19 | conda activate dygiepp
20 | 
21 | 3. Copy the inference.py file to the root of the cloned repo where you have the dygie folder
22 | 
23 | 4. Run the inference.py file using the command:
24 | 
25 | python3 inference.py --model_path <path to file in model_checkpoint ending in tar.gz> --data_path <path to folder with reports> --out_path <path to file where to save result ending in .json> --cuda_device <optional id>
26 | 


--------------------------------------------------------------------------------
/models/inference.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import glob
  3 | import json 
  4 | import re
  5 | import argparse
  6 | 
  7 | def get_file_list(path):
  8 |     
  9 |     """Gets path to all the reports (.txt format files) in the specified folder, and
 10 |     saves it in a temporary json file
 11 |     
 12 |         Args:
 13 |             path: Path to the folder containing the reports
 14 |     """
 15 |     
 16 |     file_list = [item for item in glob.glob(f"{path}/*.txt")]
 17 |     
 18 |     # Number of files for inference at once depends on the memory available.
 19 |     ## Recemmended to use no more than batches of 25,000 files
 20 |     
 21 |     with open('./temp_file_list.json', 'w') as f:
 22 |         json.dump(file_list, f)
 23 | 
 24 | def preprocess_reports():
 25 |     
 26 |     """ Load up the files mentioned in the temporary json file, and
 27 |     processes them in format that the dygie model can take as input.
 28 |     Also save the processed file in a temporary file.
 29 |     """
 30 |     
 31 |     file_list = json.load(open("./temp_file_list.json"))
 32 |     final_list = []
 33 |     for idx, file in enumerate(file_list):
 34 | 
 35 |         temp_file = open(file).read()
 36 |         sen = re.sub('(?<! )(?=[/,-,:,.,!?()])|(?<=[/,-,:,.,!?()])(?! )', r' ',temp_file).split()
 37 |         temp_dict = {}
 38 | 
 39 |         temp_dict["doc_key"] = file
 40 |         
 41 |         ## Current way of inference takes in the whole report as 1 sentence
 42 |         temp_dict["sentences"] = [sen]
 43 | 
 44 |         final_list.append(temp_dict)
 45 | 
 46 |         if(idx % 1000 == 0):
 47 |             print(f"{idx+1} reports done")
 48 |     
 49 |     print(f"{idx+1} reports done")
 50 |     
 51 |     with open("./temp_dygie_input.json",'w') as outfile:
 52 |         for item in final_list:
 53 |             json.dump(item, outfile)
 54 |             outfile.write("\n")
 55 | 
 56 | def run_inference(model_path, cuda):
 57 |     
 58 |     """ Runs the inference on the processed input files. Saves the result in a
 59 |     temporary output file
 60 |     
 61 |     Args:
 62 |         model_path: Path to the model checkpoint
 63 |         cuda: GPU id
 64 |     
 65 |     
 66 |     """
 67 |     out_path = "./temp_dygie_output.json"
 68 |     data_path = "./temp_dygie_input.json"
 69 |     
 70 |     os.system(f"allennlp predict {model_path} {data_path} \
 71 |             --predictor dygie --include-package dygie \
 72 |             --use-dataset-reader \
 73 |             --output-file {out_path} \
 74 |             --cuda-device {cuda} \
 75 |             --silent")
 76 | 
 77 | def postprocess_reports():
 78 |     
 79 |     """Post processes all the reports and saves the result in train.json format
 80 |     """
 81 |     final_dict = {}
 82 | 
 83 |     file_name = f"./temp_dygie_output.json"
 84 |     data = []
 85 | 
 86 |     with open(file_name,'r') as f:
 87 |         for line in f:
 88 |             data.append(json.loads(line))
 89 | 
 90 |     for file in data:
 91 |         postprocess_individual_report(file, final_dict)
 92 |     
 93 |     return final_dict
 94 | 
 95 | def postprocess_individual_report(file, final_dict, data_source=None):
 96 |     
 97 |     """Postprocesses individual report
 98 |     
 99 |     Args:
100 |         file: output dict for individual reports
101 |         final_dict: Dict for storing all the reports
102 |     """
103 |     
104 |     try:
105 |         temp_dict = {}
106 | 
107 |         temp_dict['text'] = " ".join(file['sentences'][0])
108 |         n = file['predicted_ner'][0]
109 |         r = file['predicted_relations'][0]
110 |         s = file['sentences'][0]
111 |         temp_dict["entities"] = get_entity(n,r,s)
112 |         temp_dict["data_source"] = data_source
113 |         temp_dict["data_split"] = "inference"
114 | 
115 |         final_dict[file['doc_key']] = temp_dict
116 |     
117 |     except:
118 |         print(f"Error in doc key: {file['doc_key']}. Skipping inference on this file")
119 |         
120 | def get_entity(n,r,s):
121 |     
122 |     """Gets the entities for individual reports
123 |     
124 |     Args:
125 |         n: list of entities in the report
126 |         r: list of relations in the report
127 |         s: list containing tokens of the sentence
128 |         
129 |     Returns:
130 |         dict_entity: Dictionary containing the entites in the format similar to train.json 
131 |     
132 |     """
133 | 
134 |     dict_entity = {}
135 |     rel_list = [item[0:2] for item in r]
136 |     ner_list = [item[0:2] for item in n]
137 |     for idx, item in enumerate(n):
138 |         temp_dict = {}
139 |         start_idx, end_idx, label = item[0], item[1], item[2]
140 |         temp_dict['tokens'] = " ".join(s[start_idx:end_idx+1])
141 |         temp_dict['label'] = label
142 |         temp_dict['start_ix'] = start_idx
143 |         temp_dict['end_ix'] = end_idx
144 |         rel = []
145 |         relation_idx = [i for i,val in enumerate(rel_list) if val== [start_idx, end_idx]]
146 |         for i,val in enumerate(relation_idx):
147 |             obj = r[val][2:4]
148 |             lab = r[val][4]
149 |             try:
150 |                 object_idx = ner_list.index(obj) + 1
151 |             except:
152 |                 continue
153 |             rel.append([lab,str(object_idx)])
154 |         temp_dict['relations'] = rel
155 |         dict_entity[str(idx+1)] = temp_dict
156 |     
157 |     return dict_entity
158 | 
159 | def cleanup():
160 |     """Removes all the temporary files created during the inference process
161 |     
162 |     """
163 |     # os.system("rm temp_file_list.json")
164 |     # os.system("rm temp_dygie_input.json")
165 |     # os.system("rm temp_dygie_output.json")
166 | 
167 | def run(model_path, data_path, out_path, cuda):
168 |     
169 |     print("Getting paths to all the reports...")
170 |     get_file_list(data_path)
171 |     print(f"Got all the paths.")
172 |     
173 |     print("Preprocessing all the reports...")
174 |     preprocess_reports()
175 |     print("Done with preprocessing.")
176 |     
177 |     print("Running the inference now... This can take a bit of time")
178 |     run_inference(model_path, cuda)
179 |     print("Inference completed.")
180 |     
181 |     print("Postprocessing output file...")
182 |     final_dict = postprocess_reports()
183 |     print("Done postprocessing.")
184 | 
185 |     print("Saving results and performing final cleanup...")
186 |     cleanup()
187 | 
188 |     with open(out_path,'w') as outfile:
189 |         json.dump(final_dict, outfile)
190 | 
191 | if __name__ == '__main__':
192 |     parser = argparse.ArgumentParser()
193 |     
194 |     parser.add_argument('--model_path', type=str, nargs='?', required=True,
195 |                         help='path to model checkpoint')
196 |     
197 |     parser.add_argument('--data_path', type=str, nargs='?', required=True,
198 |                         help='path to folder containing reports')
199 |     
200 |     parser.add_argument('--out_path', type=str, nargs='?', required=True,
201 |                         help='path to file to write results')
202 |     
203 |     parser.add_argument('--cuda_device', type=int, nargs='?', required=False,
204 |                         default = -1, help='id of GPU, if to use')
205 | 
206 |     
207 |     args = parser.parse_args()
208 |     
209 |     run(args.model_path, args.data_path, args.out_path, args.cuda_device)
210 | 


--------------------------------------------------------------------------------
/temp_dygie_input.json:
--------------------------------------------------------------------------------
1 | {"doc_key": "data/s56075423.txt", "sentences": [["FINAL", "REPORT", "HISTORY", ":", "Intubated", "for", "overdose", ".", "COMPARISON", ":", "None", ".", "TECHNIQUE", ":", "Supine", "AP", "view", "of", "the", "chest", ".", "FINDINGS", ":", "Endotracheal", "tube", "terminates", "approximately", "4", ".", "4", "cm", "from", "the", "carina", ",", "in", "standard", "position", ".", "Nasogastric", "tube", "tip", "is", "within", "the", "stomach", ",", "as", "is", "the", "side", "port", ".", "Cardiac", ",", "mediastinal", "and", "hilar", "contours", "are", "normal", ".", "Lungs", "are", "clear", "and", "the", "pulmonary", "vascularity", "is", "normal", ".", "No", "pleural", "effusion", "or", "pneumothorax", "is", "present", ".", "No", "acute", "osseous", "abnormalities", "are", "seen", ".", "IMPRESSION", ":", "Standard", "positioning", "of", "the", "endotracheal", "tube", "and", "nasogastric", "tube", ".", "No", "acute", "cardiopulmonary", "process", "."]]}
2 | {"doc_key": "data/s59358936.txt", "sentences": [["FINAL", "REPORT", "EXAMINATION", ":", "CHEST", "(", "AP", "AND", "LAT", ")", "INDICATION", ":", "History", ":", "___M", "with", "subacute", "CVA", "seen", "on", "MRI", "TECHNIQUE", ":", "Upright", "AP", "and", "lateral", "views", "of", "the", "chest", "COMPARISON", ":", "None", ".", "FINDINGS", ":", "Heart", "size", "is", "normal", ".", "The", "aorta", "is", "tortuous", ".", "The", "pulmonary", "vasculature", "and", "hilar", "contours", "are", "normal", ".", "Lungs", "are", "hyperinflated", "but", "clear", ".", "No", "focal", "consolidation", ",", "pleural", "effusion", "or", "pneumothorax", "is", "present", ".", "No", "acute", "osseous", "abnormality", "is", "identified", ".", "IMPRESSION", ":", "No", "acute", "cardiopulmonary", "abnormality", "."]]}
3 | {"doc_key": "data/s58951365.txt", "sentences": [["FINAL", "REPORT", "EXAMINATION", ":", "Chest", ":", "Frontal", "and", "lateral", "views", "INDICATION", ":", "History", ":", "___F", "with", "cough", "/", "/", "Pneumonia", "TECHNIQUE", ":", "Chest", ":", "Frontal", "and", "Lateral", "COMPARISON", ":", "None", ".", "FINDINGS", ":", "The", "lungs", "are", "clear", "without", "focal", "consolidation", ".", "No", "pleural", "effusion", "or", "pneumothorax", "is", "seen", ".", "The", "cardiac", "and", "mediastinal", "silhouettes", "are", "unremarkable", ".", "IMPRESSION", ":", "No", "acute", "cardiopulmonary", "process", "."]]}
4 | 


--------------------------------------------------------------------------------
/temp_dygie_output.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/temp_dygie_output.json


--------------------------------------------------------------------------------
/temp_file_list.json:
--------------------------------------------------------------------------------
1 | ["data/s56075423.txt", "data/s59358936.txt", "data/s58951365.txt"]


--------------------------------------------------------------------------------