├── README.md ├── data ├── result.json ├── s56075423.txt ├── s58951365.txt └── s59358936.txt ├── dygie ├── __init__.py ├── data │ ├── __init__.py │ ├── dataset_readers │ │ ├── document.py │ │ └── dygie.py │ └── fields │ │ └── adjacency_field_assym.py ├── models │ ├── __init__.py │ ├── coref.py │ ├── dygie.py │ ├── entity_beam_pruner.py │ ├── events.py │ ├── ner.py │ ├── relation.py │ └── shared.py ├── predictors │ ├── __init__.py │ └── dygie.py ├── pytest.ini ├── spacy_interface │ ├── __init__.py │ └── spacy_interface.py ├── tests │ ├── data │ │ ├── __init__.py │ │ ├── annotated_doc_test.py │ │ ├── collate_test.py │ │ ├── document_test.py │ │ ├── dygie_test.py │ │ └── spacy_interface_test.py │ ├── fixtures │ │ ├── ace_event_article.json │ │ ├── ace_event_coref_article.json │ │ ├── collate │ │ │ ├── ace-event │ │ │ │ ├── dev.json │ │ │ │ ├── test.json │ │ │ │ └── train.json │ │ │ └── scierc │ │ │ │ ├── dev.json │ │ │ │ ├── test.json │ │ │ │ └── train.json │ │ ├── dygie_test.jsonnet │ │ ├── dygie_test_full.jsonnet │ │ ├── multi_dataset │ │ │ ├── dev.jsonl │ │ │ ├── test.jsonl │ │ │ └── train.jsonl │ │ └── scierc_article.json │ └── models │ │ ├── __init__.py │ │ ├── coref_test.py │ │ ├── dygie_test.py │ │ ├── multi_dataset_test.sh │ │ └── relation_test.py └── training │ ├── event_metrics.py │ ├── f1.py │ ├── ner_metrics.py │ └── relation_metrics.py ├── models ├── README.txt └── inference.py ├── temp_dygie_input.json ├── temp_dygie_output.json └── temp_file_list.json /README.md: -------------------------------------------------------------------------------- 1 | # RadGraph 2 | RadGraph: Extracting Clinical Entities and Relations from Radiology Reports 3 | -------------------------------------------------------------------------------- /data/result.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/data/result.json 
-------------------------------------------------------------------------------- /data/s56075423.txt: -------------------------------------------------------------------------------- 1 | FINAL REPORT 2 | HISTORY: Intubated for overdose. 3 | 4 | COMPARISON: None. 5 | 6 | TECHNIQUE: Supine AP view of the chest. 7 | 8 | FINDINGS: 9 | 10 | Endotracheal tube terminates approximately 4.4 cm from the carina, in standard 11 | position. Nasogastric tube tip is within the stomach, as is the side port. 12 | Cardiac, mediastinal and hilar contours are normal. Lungs are clear and the 13 | pulmonary vascularity is normal. No pleural effusion or pneumothorax is 14 | present. No acute osseous abnormalities are seen. 15 | 16 | IMPRESSION: 17 | 18 | Standard positioning of the endotracheal tube and nasogastric tube. No acute 19 | cardiopulmonary process. 20 | -------------------------------------------------------------------------------- /data/s58951365.txt: -------------------------------------------------------------------------------- 1 | FINAL REPORT 2 | EXAMINATION: 3 | Chest: Frontal and lateral views 4 | 5 | INDICATION: History: ___F with cough // Pneumonia 6 | 7 | TECHNIQUE: Chest: Frontal and Lateral 8 | 9 | COMPARISON: None. 10 | 11 | FINDINGS: 12 | 13 | The lungs are clear without focal consolidation. No pleural effusion or 14 | pneumothorax is seen. The cardiac and mediastinal silhouettes are 15 | unremarkable. 16 | 17 | IMPRESSION: 18 | 19 | No acute cardiopulmonary process. 20 | -------------------------------------------------------------------------------- /data/s59358936.txt: -------------------------------------------------------------------------------- 1 | FINAL REPORT 2 | EXAMINATION: CHEST (AP AND LAT) 3 | 4 | INDICATION: History: ___M with subacute CVA seen on MRI 5 | 6 | TECHNIQUE: Upright AP and lateral views of the chest 7 | 8 | COMPARISON: None. 9 | 10 | FINDINGS: 11 | 12 | Heart size is normal. The aorta is tortuous. 
The pulmonary vasculature and 13 | hilar contours are normal. Lungs are hyperinflated but clear. No focal 14 | consolidation, pleural effusion or pneumothorax is present. No acute osseous 15 | abnormality is identified. 16 | 17 | IMPRESSION: 18 | 19 | No acute cardiopulmonary abnormality. 20 | -------------------------------------------------------------------------------- /dygie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/dygie/__init__.py -------------------------------------------------------------------------------- /dygie/data/__init__.py: -------------------------------------------------------------------------------- 1 | from dygie.data.dataset_readers.dygie import DyGIEReader 2 | from dygie.data.dataset_readers.document import Document 3 | -------------------------------------------------------------------------------- /dygie/data/dataset_readers/dygie.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Dict, List, Optional, Tuple, DefaultDict, Set, Union 3 | import json 4 | import pickle as pkl 5 | import warnings 6 | 7 | from overrides import overrides 8 | 9 | from allennlp.common.file_utils import cached_path 10 | from allennlp.data.dataset_readers.dataset_reader import DatasetReader 11 | from allennlp.data.fields import (ListField, TextField, SpanField, MetadataField, 12 | SequenceLabelField, AdjacencyField, LabelField) 13 | from allennlp.data.instance import Instance 14 | from allennlp.data.tokenizers import Token 15 | from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer 16 | from allennlp.data.dataset_readers.dataset_utils import enumerate_spans 17 | 18 | from dygie.data.fields.adjacency_field_assym import AdjacencyFieldAssym 19 | from dygie.data.dataset_readers.document import Document, Sentence 20 | 21 | 
logger = logging.getLogger(__name__) # pylint: disable=invalid-name 22 | 23 | 24 | class DyGIEDataException(Exception): 25 | pass 26 | 27 | 28 | @DatasetReader.register("dygie") 29 | class DyGIEReader(DatasetReader): 30 | """ 31 | Reads a single JSON-formatted file. This is the same file format as used in the 32 | scierc, but is preprocessed 33 | """ 34 | def __init__(self, 35 | max_span_width: int, 36 | token_indexers: Dict[str, TokenIndexer] = None, 37 | **kwargs) -> None: 38 | super().__init__(**kwargs) 39 | self._max_span_width = max_span_width 40 | self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()} 41 | 42 | @overrides 43 | def _read(self, file_path: str): 44 | # if `file_path` is a URL, redirect to the cache 45 | file_path = cached_path(file_path) 46 | 47 | with open(file_path, "r") as f: 48 | lines = f.readlines() 49 | 50 | for line in lines: 51 | # Loop over the documents. 52 | doc_text = json.loads(line) 53 | instance = self.text_to_instance(doc_text) 54 | yield instance 55 | 56 | def _too_long(self, span): 57 | return span[1] - span[0] + 1 > self._max_span_width 58 | 59 | def _process_ner(self, span_tuples, sent): 60 | ner_labels = [""] * len(span_tuples) 61 | 62 | for span, label in sent.ner_dict.items(): 63 | if self._too_long(span): 64 | continue 65 | ix = span_tuples.index(span) 66 | ner_labels[ix] = label 67 | 68 | return ner_labels 69 | 70 | def _process_coref(self, span_tuples, sent): 71 | coref_labels = [-1] * len(span_tuples) 72 | 73 | for span, label in sent.cluster_dict.items(): 74 | if self._too_long(span): 75 | continue 76 | ix = span_tuples.index(span) 77 | coref_labels[ix] = label 78 | return coref_labels 79 | 80 | def _process_relations(self, span_tuples, sent): 81 | relations = [] 82 | relation_indices = [] 83 | 84 | # Loop over the gold spans. Look up their indices in the list of span tuples and store 85 | # values. 
86 | for (span1, span2), label in sent.relation_dict.items(): 87 | # If either span is beyond the max span width, skip it. 88 | if self._too_long(span1) or self._too_long(span2): 89 | continue 90 | ix1 = span_tuples.index(span1) 91 | ix2 = span_tuples.index(span2) 92 | relation_indices.append((ix1, ix2)) 93 | relations.append(label) 94 | 95 | return relations, relation_indices 96 | 97 | def _process_events(self, span_tuples, sent): 98 | n_tokens = len(sent.text) 99 | 100 | trigger_labels = [""] * n_tokens 101 | for tok_ix, trig_label in sent.events.trigger_dict.items(): 102 | trigger_labels[tok_ix] = trig_label 103 | 104 | arguments = [] 105 | argument_indices = [] 106 | 107 | for (trig_ix, arg_span), arg_label in sent.events.argument_dict.items(): 108 | if self._too_long(arg_span): 109 | continue 110 | arg_span_ix = span_tuples.index(arg_span) 111 | argument_indices.append((trig_ix, arg_span_ix)) 112 | arguments.append(arg_label) 113 | 114 | return trigger_labels, arguments, argument_indices 115 | 116 | def _process_sentence(self, sent: Sentence, dataset: str): 117 | # Get the sentence text and define the `text_field`. 118 | sentence_text = [self._normalize_word(word) for word in sent.text] 119 | text_field = TextField([Token(word) for word in sentence_text], self._token_indexers) 120 | 121 | # Enumerate spans. 122 | spans = [] 123 | for start, end in enumerate_spans(sentence_text, max_span_width=self._max_span_width): 124 | spans.append(SpanField(start, end, text_field)) 125 | span_field = ListField(spans) 126 | span_tuples = [(span.span_start, span.span_end) for span in spans] 127 | 128 | # Convert data to fields. 129 | # NOTE: The `ner_labels` and `coref_labels` would ideally have type 130 | # `ListField[SequenceLabelField]`, where the sequence labels are over the `SpanField` of 131 | # `spans`. But calling `as_tensor_dict()` fails on this specific data type. 
Matt G 132 | # recognized that this is an AllenNLP API issue and suggested that represent these as 133 | # `ListField[ListField[LabelField]]` instead. 134 | fields = {} 135 | fields["text"] = text_field 136 | fields["spans"] = span_field 137 | if sent.ner is not None: 138 | ner_labels = self._process_ner(span_tuples, sent) 139 | fields["ner_labels"] = ListField( 140 | [LabelField(entry, label_namespace=f"{dataset}__ner_labels") 141 | for entry in ner_labels]) 142 | if sent.cluster_dict is not None: 143 | # Skip indexing for coref labels, which are ints. 144 | coref_labels = self._process_coref(span_tuples, sent) 145 | fields["coref_labels"] = ListField( 146 | [LabelField(entry, label_namespace="coref_labels", skip_indexing=True) 147 | for entry in coref_labels]) 148 | if sent.relations is not None: 149 | relation_labels, relation_indices = self._process_relations(span_tuples, sent) 150 | fields["relation_labels"] = AdjacencyField( 151 | indices=relation_indices, sequence_field=span_field, labels=relation_labels, 152 | label_namespace=f"{dataset}__relation_labels") 153 | if sent.events is not None: 154 | trigger_labels, argument_labels, argument_indices = self._process_events(span_tuples, sent) 155 | fields["trigger_labels"] = SequenceLabelField( 156 | trigger_labels, text_field, label_namespace=f"{dataset}__trigger_labels") 157 | fields["argument_labels"] = AdjacencyFieldAssym( 158 | indices=argument_indices, row_field=text_field, col_field=span_field, 159 | labels=argument_labels, label_namespace=f"{dataset}__argument_labels") 160 | 161 | return fields 162 | 163 | def _process_sentence_fields(self, doc: Document): 164 | # Process each sentence. 165 | sentence_fields = [self._process_sentence(sent, doc.dataset) for sent in doc.sentences] 166 | 167 | # Make sure that all sentences have the same set of keys. 
168 | first_keys = set(sentence_fields[0].keys()) 169 | for entry in sentence_fields: 170 | if set(entry.keys()) != first_keys: 171 | raise DyGIEDataException( 172 | f"Keys do not match across sentences for document {doc.doc_key}.") 173 | 174 | # For each field, store the data from all sentences together in a ListField. 175 | fields = {} 176 | keys = sentence_fields[0].keys() 177 | for key in keys: 178 | this_field = ListField([sent[key] for sent in sentence_fields]) 179 | fields[key] = this_field 180 | 181 | return fields 182 | 183 | @overrides 184 | def text_to_instance(self, doc_text: Dict[str, Any]): 185 | """ 186 | Convert a Document object into an instance. 187 | """ 188 | doc = Document.from_json(doc_text) 189 | 190 | # Make sure there are no single-token sentences; these break things. 191 | sent_lengths = [len(x) for x in doc.sentences] 192 | if min(sent_lengths) < 2: 193 | msg = (f"Document {doc.doc_key} has a sentence with a single token or no tokens. " 194 | "This may break the modeling code.") 195 | warnings.warn(msg) 196 | 197 | fields = self._process_sentence_fields(doc) 198 | fields["metadata"] = MetadataField(doc) 199 | 200 | return Instance(fields) 201 | 202 | @overrides 203 | def _instances_from_cache_file(self, cache_filename): 204 | with open(cache_filename, "rb") as f: 205 | for entry in pkl.load(f): 206 | yield entry 207 | 208 | @overrides 209 | def _instances_to_cache_file(self, cache_filename, instances): 210 | with open(cache_filename, "wb") as f: 211 | pkl.dump(instances, f, protocol=pkl.HIGHEST_PROTOCOL) 212 | 213 | @staticmethod 214 | def _normalize_word(word): 215 | if word == "/." 
or word == "/?": 216 | return word[1:] 217 | else: 218 | return word 219 | -------------------------------------------------------------------------------- /dygie/data/fields/adjacency_field_assym.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Set, Tuple 2 | import logging 3 | import textwrap 4 | 5 | from overrides import overrides 6 | import torch 7 | 8 | from allennlp.common.checks import ConfigurationError 9 | from allennlp.data.fields.field import Field 10 | from allennlp.data.fields.sequence_field import SequenceField 11 | from allennlp.data.vocabulary import Vocabulary 12 | 13 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 14 | 15 | 16 | class AdjacencyFieldAssym(Field[torch.Tensor]): 17 | """ 18 | There are cases where we need to express adjacency relations between elements in two different 19 | fields - for instance a TextField and a SpanField. This implements an "asymmetric" adjacency field. 20 | 21 | Parameters 22 | ---------- 23 | indices : ``List[Tuple[int, int]]`` 24 | row_field : ``SequenceField`` 25 | The field with the sequence that the rows of `indices` index into. 26 | col_field : ``SequenceField`` 27 | The field with the sequence that the columns of `indices` index into. 28 | labels : ``List[str]``, optional, default = None 29 | Optional labels for the edges of the adjacency matrix. 30 | label_namespace : ``str``, optional (default='labels') 31 | The namespace to use for converting tag strings into integers. We convert tag strings to 32 | integers for you, and this parameter tells the ``Vocabulary`` object which mapping from 33 | strings to integers to use (so that "O" as a tag doesn't get the same id as "O" as a word). 34 | padding_value : ``int``, (optional, default = -1) 35 | The value to use as padding. 36 | """ 37 | # It is possible that users want to use this field with a namespace which uses OOV/PAD tokens. 
38 | # This warning will be repeated for every instantiation of this class (i.e for every data 39 | # instance), spewing a lot of warnings so this class variable is used to only log a single 40 | # warning per namespace. 41 | _already_warned_namespaces: Set[str] = set() 42 | 43 | def __init__(self, 44 | indices: List[Tuple[int, int]], 45 | row_field: SequenceField, 46 | col_field: SequenceField, 47 | labels: List[str] = None, 48 | label_namespace: str = 'labels', 49 | padding_value: int = -1) -> None: 50 | self.indices = indices 51 | self.labels = labels 52 | self.row_field = row_field 53 | self.col_field = col_field 54 | self._label_namespace = label_namespace 55 | self._padding_value = padding_value 56 | self._indexed_labels: List[int] = None 57 | 58 | self._maybe_warn_for_namespace(label_namespace) 59 | row_length = row_field.sequence_length() 60 | col_length = col_field.sequence_length() 61 | 62 | if len(set(indices)) != len(indices): 63 | raise ConfigurationError(f"Indices must be unique, but found {indices}") 64 | 65 | if not all([0 <= index[1] < col_length and 0 <= index[0] < row_length for index in indices]): 66 | raise ConfigurationError(f"Label indices and sequence length " 67 | f"are incompatible: {indices} and {row_length} or {col_length}") 68 | 69 | if labels is not None and len(indices) != len(labels): 70 | raise ConfigurationError(f"Labelled indices were passed, but their lengths do not match: " 71 | f" {labels}, {indices}") 72 | 73 | def _maybe_warn_for_namespace(self, label_namespace: str) -> None: 74 | if not (self._label_namespace.endswith("labels") or self._label_namespace.endswith("tags")): 75 | if label_namespace not in self._already_warned_namespaces: 76 | logger.warning("Your label namespace was '%s'. We recommend you use a namespace " 77 | "ending with 'labels' or 'tags', so we don't add UNK and PAD tokens by " 78 | "default to your vocabulary. 
See documentation for " 79 | "`non_padded_namespaces` parameter in Vocabulary.", 80 | self._label_namespace) 81 | self._already_warned_namespaces.add(label_namespace) 82 | 83 | @overrides 84 | def count_vocab_items(self, counter: Dict[str, Dict[str, int]]): 85 | if self._indexed_labels is None and self.labels is not None: 86 | for label in self.labels: 87 | counter[self._label_namespace][label] += 1 # type: ignore 88 | 89 | @overrides 90 | def index(self, vocab: Vocabulary): 91 | if self._indexed_labels is None and self.labels is not None: 92 | self._indexed_labels = [vocab.get_token_index(label, self._label_namespace) 93 | for label in self.labels] 94 | 95 | @overrides 96 | def get_padding_lengths(self) -> Dict[str, int]: 97 | return {'num_rows': self.row_field.sequence_length(), 98 | 'num_cols': self.col_field.sequence_length()} 99 | 100 | @overrides 101 | def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor: 102 | desired_num_rows = padding_lengths['num_rows'] 103 | desired_num_cols = padding_lengths['num_cols'] 104 | tensor = torch.ones(desired_num_rows, desired_num_cols) * self._padding_value 105 | labels = self._indexed_labels or [1 for _ in range(len(self.indices))] 106 | 107 | for index, label in zip(self.indices, labels): 108 | tensor[index] = label 109 | return tensor 110 | 111 | @overrides 112 | def empty_field(self) -> 'AdjacencyFieldAssym': 113 | # pylint: disable=protected-access 114 | # The empty_list here is needed for mypy 115 | empty_list: List[Tuple[int, int]] = [] 116 | adjacency_field = AdjacencyFieldAssym(empty_list, 117 | self.row_field.empty_field(), 118 | self.col_field.empty_field(), 119 | padding_value=self._padding_value) 120 | return adjacency_field 121 | 122 | def __str__(self) -> str: 123 | row_length = self.row_field.sequence_length() 124 | col_length = self.col_field.sequence_length() 125 | formatted_labels = "".join(["\t\t" + labels + "\n" 126 | for labels in textwrap.wrap(repr(self.labels), 100)]) 127 | 
formatted_indices = "".join(["\t\t" + index + "\n" 128 | for index in textwrap.wrap(repr(self.indices), 100)]) 129 | return f"AdjacencyFieldAssym of row length {row_length} and col length {col_length}\n" \ 130 | f"\t\twith indices:\n {formatted_indices}\n" \ 131 | f"\t\tand labels:\n {formatted_labels} \t\tin namespace: '{self._label_namespace}'." 132 | -------------------------------------------------------------------------------- /dygie/models/__init__.py: -------------------------------------------------------------------------------- 1 | from dygie.models.dygie import DyGIE 2 | -------------------------------------------------------------------------------- /dygie/models/entity_beam_pruner.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is basically a copy of AllenNLP's Pruner module, but with support for entity beams. 3 | """ 4 | 5 | from typing import Tuple, Union 6 | 7 | from overrides import overrides 8 | import torch 9 | 10 | from allennlp.nn import util 11 | from allennlp.modules import TimeDistributed 12 | 13 | 14 | def make_pruner(scorer, entity_beam=False, gold_beam=False): 15 | """ 16 | Create a pruner that either takes outputs of other scorers (i.e. entity beam), or uses its own 17 | scorer (the `default_scorer`). 18 | """ 19 | item_scorer = torch.nn.Sequential( 20 | TimeDistributed(scorer), 21 | TimeDistributed(torch.nn.Linear(scorer.get_output_dim(), 1))) 22 | min_score_to_keep = 1e-10 if entity_beam else None 23 | 24 | return Pruner(item_scorer, entity_beam, gold_beam, min_score_to_keep) 25 | 26 | 27 | class Pruner(torch.nn.Module): 28 | """ 29 | This module scores and prunes items in a list using a parameterised scoring function and a 30 | threshold. 31 | 32 | Parameters 33 | ---------- 34 | scorer : ``torch.nn.Module``, required. 
35 | A module which, given a tensor of shape (batch_size, num_items, embedding_size), 36 | produces a tensor of shape (batch_size, num_items, 1), representing a scalar score 37 | per item in the tensor. 38 | entity_beam: bool, optional. 39 | If True, use class scores output from another module instead of using own scorer. 40 | gold_beam: bool, optional. 41 | If True, use gold arguments. 42 | min_score_to_keep : float, optional. 43 | If given, only keep items that score at least this high. 44 | """ 45 | def __init__(self, scorer: torch.nn.Module, entity_beam: bool = False, gold_beam: bool = False, 46 | min_score_to_keep: float = None) -> None: 47 | super().__init__() 48 | # If gold beam is on, then entity beam must be off and min_score_to_keep must be None. 49 | assert not (gold_beam and ((min_score_to_keep is not None) or entity_beam)) 50 | self._scorer = scorer 51 | self._entity_beam = entity_beam 52 | self._gold_beam = gold_beam 53 | self._min_score_to_keep = min_score_to_keep 54 | 55 | @overrides 56 | def forward(self, # pylint: disable=arguments-differ 57 | embeddings: torch.FloatTensor, 58 | mask: torch.LongTensor, 59 | num_items_to_keep: Union[int, torch.LongTensor], 60 | class_scores: torch.FloatTensor = None, 61 | gold_labels: torch.long = None) -> Tuple[torch.FloatTensor, torch.LongTensor, 62 | torch.LongTensor, torch.FloatTensor]: 63 | """ 64 | Extracts the top-k scoring items with respect to the scorer. We additionally return 65 | the indices of the top-k in their original order, not ordered by score, so that downstream 66 | components can rely on the original ordering (e.g., for knowing what spans are valid 67 | antecedents in a coreference resolution model). May use the same k for all sentences in 68 | minibatch, or different k for each. 69 | 70 | Parameters 71 | ---------- 72 | embeddings : ``torch.FloatTensor``, required. 
73 | A tensor of shape (batch_size, num_items, embedding_size), containing an embedding for 74 | each item in the list that we want to prune. 75 | mask : ``torch.LongTensor``, required. 76 | A tensor of shape (batch_size, num_items), denoting unpadded elements of 77 | ``embeddings``. 78 | num_items_to_keep : ``Union[int, torch.LongTensor]``, required. 79 | If a tensor of shape (batch_size), specifies the number of items to keep for each 80 | individual sentence in minibatch. 81 | If an int, keep the same number of items for all sentences. 82 | class_scores: 83 | Class scores to be used with entity beam. 84 | candidate_labels: If in debugging mode, use gold labels to get beam. 85 | 86 | Returns 87 | ------- 88 | top_embeddings : ``torch.FloatTensor`` 89 | The representations of the top-k scoring items. 90 | Has shape (batch_size, max_num_items_to_keep, embedding_size). 91 | top_mask : ``torch.LongTensor`` 92 | The corresponding mask for ``top_embeddings``. 93 | Has shape (batch_size, max_num_items_to_keep). 94 | top_indices : ``torch.IntTensor`` 95 | The indices of the top-k scoring items into the original ``embeddings`` 96 | tensor. This is returned because it can be useful to retain pointers to 97 | the original items, if each item is being scored by multiple distinct 98 | scorers, for instance. Has shape (batch_size, max_num_items_to_keep). 99 | top_item_scores : ``torch.FloatTensor`` 100 | The values of the top-k scoring items. 101 | Has shape (batch_size, max_num_items_to_keep, 1). 102 | num_items_kept 103 | """ 104 | # If an int was given for number of items to keep, construct tensor by repeating the value. 105 | if isinstance(num_items_to_keep, int): 106 | batch_size = mask.size(0) 107 | # Put the tensor on same device as the mask. 
108 | num_items_to_keep = num_items_to_keep * torch.ones([batch_size], dtype=torch.long, 109 | device=mask.device) 110 | 111 | mask = mask.unsqueeze(-1) 112 | num_items = embeddings.size(1) 113 | 114 | # Shape: (batch_size, num_items, 1) 115 | # If entity beam is one, use the class scores. Else ignore them and use the scorer. 116 | if self._entity_beam: 117 | scores, _ = class_scores.max(dim=-1) 118 | scores = scores.unsqueeze(-1) 119 | # If gold beam is one, give a score of 0 wherever the gold label is non-zero (indicating a 120 | # non-null label), otherwise give a large negative number. 121 | elif self._gold_beam: 122 | scores = torch.where(gold_labels > 0, 123 | torch.zeros_like(gold_labels, dtype=torch.float), 124 | -1e20 * torch.ones_like(gold_labels, dtype=torch.float)) 125 | scores = scores.unsqueeze(-1) 126 | else: 127 | scores = self._scorer(embeddings) 128 | 129 | # If we're only keeping items that score above a given threshold, change the number of kept 130 | # items here. 131 | if self._min_score_to_keep is not None: 132 | num_good_items = torch.sum(scores > self._min_score_to_keep, dim=1).squeeze() 133 | num_items_to_keep = torch.min(num_items_to_keep, num_good_items) 134 | # If gold beam is on, keep the gold items. 135 | if self._gold_beam: 136 | num_items_to_keep = torch.sum(gold_labels > 0, dim=1) 137 | 138 | # Always keep at least one item to avoid edge case with empty matrix. 139 | max_items_to_keep = max(num_items_to_keep.max().item(), 1) 140 | 141 | if scores.size(-1) != 1 or scores.dim() != 3: 142 | raise ValueError(f"The scorer passed to Pruner must produce a tensor of shape" 143 | f"(batch_size, num_items, 1), but found shape {scores.size()}") 144 | # Make sure that we don't select any masked items by setting their scores to be very 145 | # negative. These are logits, typically, so -1e20 should be plenty negative. 146 | # NOTE(`mask` needs to be a byte tensor now.) 
147 | scores = util.replace_masked_values(scores, mask.bool(), -1e20) 148 | 149 | # Shape: (batch_size, max_num_items_to_keep, 1) 150 | _, top_indices = scores.topk(max_items_to_keep, 1) 151 | 152 | # Mask based on number of items to keep for each sentence. 153 | # Shape: (batch_size, max_num_items_to_keep) 154 | top_indices_mask = util.get_mask_from_sequence_lengths(num_items_to_keep, max_items_to_keep) 155 | top_indices_mask = top_indices_mask.bool() 156 | 157 | # Shape: (batch_size, max_num_items_to_keep) 158 | top_indices = top_indices.squeeze(-1) 159 | 160 | # Fill all masked indices with largest "top" index for that sentence, so that all masked 161 | # indices will be sorted to the end. 162 | # Shape: (batch_size, 1) 163 | fill_value, _ = top_indices.max(dim=1) 164 | fill_value = fill_value.unsqueeze(-1) 165 | # Shape: (batch_size, max_num_items_to_keep) 166 | top_indices = torch.where(top_indices_mask, top_indices, fill_value) 167 | 168 | # Now we order the selected indices in increasing order with 169 | # respect to their indices (and hence, with respect to the 170 | # order they originally appeared in the ``embeddings`` tensor). 171 | top_indices, _ = torch.sort(top_indices, 1) 172 | 173 | # Shape: (batch_size * max_num_items_to_keep) 174 | # torch.index_select only accepts 1D indices, but here 175 | # we need to select items for each element in the batch. 176 | flat_top_indices = util.flatten_and_batch_shift_indices(top_indices, num_items) 177 | 178 | # Shape: (batch_size, max_num_items_to_keep, embedding_size) 179 | top_embeddings = util.batched_index_select(embeddings, top_indices, flat_top_indices) 180 | 181 | # Combine the masks on spans that are out-of-bounds, and the mask on spans that are outside 182 | # the top k for each sentence. 
183 | # Shape: (batch_size, max_num_items_to_keep) 184 | sequence_mask = util.batched_index_select(mask, top_indices, flat_top_indices) 185 | sequence_mask = sequence_mask.squeeze(-1).bool() 186 | top_mask = top_indices_mask & sequence_mask 187 | top_mask = top_mask.long() 188 | 189 | # Shape: (batch_size, max_num_items_to_keep, 1) 190 | top_scores = util.batched_index_select(scores, top_indices, flat_top_indices) 191 | 192 | return top_embeddings, top_mask, top_indices, top_scores, num_items_to_keep 193 | -------------------------------------------------------------------------------- /dygie/models/ner.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Dict, List, Optional, Callable 3 | 4 | import torch 5 | from torch.nn import functional as F 6 | from overrides import overrides 7 | 8 | from allennlp.data import Vocabulary 9 | from allennlp.models.model import Model 10 | from allennlp.modules import TimeDistributed 11 | from allennlp.nn import util, InitializerApplicator, RegularizerApplicator 12 | 13 | from dygie.training.ner_metrics import NERMetrics 14 | from dygie.data.dataset_readers import document 15 | 16 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 17 | 18 | 19 | class NERTagger(Model): 20 | """ 21 | Named entity recognition module of DyGIE model. 22 | 23 | Parameters 24 | ---------- 25 | mention_feedforward : ``FeedForward`` 26 | This feedforward network is applied to the span representations which is then scored 27 | by a linear layer. 28 | feature_size: ``int`` 29 | The embedding size for all the embedded features, such as distances or span widths. 30 | lexical_dropout: ``int`` 31 | The probability of dropping out dimensions of the embedded text. 32 | regularizer : ``RegularizerApplicator``, optional (default=``None``) 33 | If provided, will be used to calculate the regularization penalty during training. 
34 | """ 35 | 36 | def __init__(self, 37 | vocab: Vocabulary, 38 | make_feedforward: Callable, 39 | span_emb_dim: int, 40 | regularizer: Optional[RegularizerApplicator] = None) -> None: 41 | super(NERTagger, self).__init__(vocab, regularizer) 42 | 43 | self._namespaces = [entry for entry in vocab.get_namespaces() if "ner_labels" in entry] 44 | 45 | # Number of classes determine the output dimension of the final layer 46 | self._n_labels = {name: vocab.get_vocab_size(name) for name in self._namespaces} 47 | 48 | # Null label is needed to keep track of when calculating the metrics 49 | for namespace in self._namespaces: 50 | null_label = vocab.get_token_index("", namespace) 51 | assert null_label == 0 # If not, the dummy class won't correspond to the null label. 52 | 53 | # The output dim is 1 less than the number of labels because we don't score the null label; 54 | # we just give it a score of 0 by default. 55 | 56 | # Create a separate scorer and metric for each dataset we're dealing with. 57 | self._ner_scorers = torch.nn.ModuleDict() 58 | self._ner_metrics = {} 59 | 60 | for namespace in self._namespaces: 61 | mention_feedforward = make_feedforward(input_dim=span_emb_dim) 62 | self._ner_scorers[namespace] = torch.nn.Sequential( 63 | TimeDistributed(mention_feedforward), 64 | TimeDistributed(torch.nn.Linear( 65 | mention_feedforward.get_output_dim(), 66 | self._n_labels[namespace] - 1))) 67 | 68 | self._ner_metrics[namespace] = NERMetrics(self._n_labels[namespace], null_label) 69 | 70 | self._active_namespace = None 71 | 72 | self._loss = torch.nn.CrossEntropyLoss(reduction="sum") 73 | 74 | @overrides 75 | def forward(self, # type: ignore 76 | spans: torch.IntTensor, 77 | span_mask: torch.IntTensor, 78 | span_embeddings: torch.IntTensor, 79 | sentence_lengths: torch.Tensor, 80 | ner_labels: torch.IntTensor = None, 81 | metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: 82 | """ 83 | TODO(dwadden) Write documentation. 
84 | """ 85 | 86 | # Shape: (Batch size, Number of Spans, Span Embedding Size) 87 | # span_embeddings 88 | 89 | self._active_namespace = f"{metadata.dataset}__ner_labels" 90 | if self._active_namespace not in self._ner_scorers: 91 | return {"loss": 0} 92 | 93 | scorer = self._ner_scorers[self._active_namespace] 94 | 95 | ner_scores = scorer(span_embeddings) 96 | # Give large negative scores to masked-out elements. 97 | mask = span_mask.unsqueeze(-1) 98 | ner_scores = util.replace_masked_values(ner_scores, mask.bool(), -1e20) 99 | # The dummy_scores are the score for the null label. 100 | dummy_dims = [ner_scores.size(0), ner_scores.size(1), 1] 101 | dummy_scores = ner_scores.new_zeros(*dummy_dims) 102 | ner_scores = torch.cat((dummy_scores, ner_scores), -1) 103 | 104 | _, predicted_ner = ner_scores.max(2) 105 | 106 | predictions = self.predict(ner_scores.detach().cpu(), 107 | spans.detach().cpu(), 108 | span_mask.detach().cpu(), 109 | metadata) 110 | output_dict = {"predictions": predictions} 111 | 112 | if ner_labels is not None: 113 | metrics = self._ner_metrics[self._active_namespace] 114 | metrics(predicted_ner, ner_labels, span_mask) 115 | ner_scores_flat = ner_scores.view(-1, self._n_labels[self._active_namespace]) 116 | ner_labels_flat = ner_labels.view(-1) 117 | mask_flat = span_mask.view(-1).bool() 118 | 119 | loss = self._loss(ner_scores_flat[mask_flat], ner_labels_flat[mask_flat]) 120 | 121 | output_dict["loss"] = loss 122 | 123 | return output_dict 124 | 125 | def predict(self, ner_scores, spans, span_mask, metadata): 126 | # TODO(dwadden) Make sure the iteration works in documents with a single sentence. 127 | # Zipping up and iterating iterates over the zeroth dimension of each tensor; this 128 | # corresponds to iterating over sentences. 
129 | predictions = [] 130 | zipped = zip(ner_scores, spans, span_mask, metadata) 131 | for ner_scores_sent, spans_sent, span_mask_sent, sentence in zipped: 132 | predicted_scores_raw, predicted_labels = ner_scores_sent.max(dim=1) 133 | softmax_scores = F.softmax(ner_scores_sent, dim=1) 134 | predicted_scores_softmax, _ = softmax_scores.max(dim=1) 135 | ix = (predicted_labels != 0) & span_mask_sent.bool() 136 | 137 | predictions_sent = [] 138 | zip_pred = zip(predicted_labels[ix], predicted_scores_raw[ix], 139 | predicted_scores_softmax[ix], spans_sent[ix]) 140 | for label, label_score_raw, label_score_softmax, label_span in zip_pred: 141 | label_str = self.vocab.get_token_from_index(label.item(), self._active_namespace) 142 | span_start, span_end = label_span.tolist() 143 | ner = [span_start, span_end, label_str, label_score_raw.item(), 144 | label_score_softmax.item()] 145 | prediction = document.PredictedNER(ner, sentence, sentence_offsets=True) 146 | predictions_sent.append(prediction) 147 | 148 | predictions.append(predictions_sent) 149 | 150 | return predictions 151 | 152 | # TODO(dwadden) This code is repeated elsewhere. Refactor. 153 | @overrides 154 | def get_metrics(self, reset: bool = False) -> Dict[str, float]: 155 | "Loop over the metrics for all namespaces, and return as dict." 
156 | res = {} 157 | for namespace, metrics in self._ner_metrics.items(): 158 | precision, recall, f1 = metrics.get_metric(reset) 159 | prefix = namespace.replace("_labels", "") 160 | to_update = {f"{prefix}_precision": precision, 161 | f"{prefix}_recall": recall, 162 | f"{prefix}_f1": f1} 163 | res.update(to_update) 164 | 165 | res_avg = {} 166 | for name in ["precision", "recall", "f1"]: 167 | values = [res[key] for key in res if name in key] 168 | res_avg[f"MEAN__ner_{name}"] = sum(values) / len(values) if values else 0 169 | res.update(res_avg) 170 | 171 | return res 172 | -------------------------------------------------------------------------------- /dygie/models/relation.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any, Dict, List, Optional, Callable 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | from overrides import overrides 7 | 8 | from allennlp.data import Vocabulary 9 | from allennlp.models.model import Model 10 | from allennlp.nn import util, RegularizerApplicator 11 | from allennlp.modules import TimeDistributed 12 | 13 | from dygie.training.relation_metrics import RelationMetrics 14 | from dygie.models.entity_beam_pruner import Pruner 15 | from dygie.data.dataset_readers import document 16 | 17 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 18 | 19 | 20 | # TODO(dwadden) add tensor dimension comments. 21 | # TODO(dwadden) Different sentences should have different number of relation candidates depending on 22 | # length. 23 | class RelationExtractor(Model): 24 | """ 25 | Relation extraction module of DyGIE model. 26 | """ 27 | # TODO(dwadden) add option to make `mention_feedforward` be the NER tagger. 
28 | 29 | def __init__(self, 30 | vocab: Vocabulary, 31 | make_feedforward: Callable, 32 | span_emb_dim: int, 33 | feature_size: int, 34 | spans_per_word: float, 35 | positive_label_weight: float = 1.0, 36 | regularizer: Optional[RegularizerApplicator] = None) -> None: 37 | super().__init__(vocab, regularizer) 38 | 39 | self._namespaces = [entry for entry in vocab.get_namespaces() if "relation_labels" in entry] 40 | self._n_labels = {name: vocab.get_vocab_size(name) for name in self._namespaces} 41 | 42 | self._mention_pruners = torch.nn.ModuleDict() 43 | self._relation_feedforwards = torch.nn.ModuleDict() 44 | self._relation_scorers = torch.nn.ModuleDict() 45 | self._relation_metrics = {} 46 | 47 | for namespace in self._namespaces: 48 | mention_feedforward = make_feedforward(input_dim=span_emb_dim) 49 | feedforward_scorer = torch.nn.Sequential( 50 | TimeDistributed(mention_feedforward), 51 | TimeDistributed(torch.nn.Linear(mention_feedforward.get_output_dim(), 1))) 52 | self._mention_pruners[namespace] = Pruner(feedforward_scorer) 53 | 54 | relation_scorer_dim = 3 * span_emb_dim 55 | relation_feedforward = make_feedforward(input_dim=relation_scorer_dim) 56 | self._relation_feedforwards[namespace] = relation_feedforward 57 | relation_scorer = torch.nn.Linear( 58 | relation_feedforward.get_output_dim(), self._n_labels[namespace]) 59 | self._relation_scorers[namespace] = relation_scorer 60 | 61 | self._relation_metrics[namespace] = RelationMetrics() 62 | 63 | self._spans_per_word = spans_per_word 64 | self._active_namespace = None 65 | 66 | self._loss = torch.nn.CrossEntropyLoss(reduction="sum", ignore_index=-1) 67 | 68 | @overrides 69 | def forward(self, # type: ignore 70 | spans: torch.IntTensor, 71 | span_mask, 72 | span_embeddings, # TODO(dwadden) add type. 73 | sentence_lengths, 74 | relation_labels: torch.IntTensor = None, 75 | metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: 76 | """ 77 | TODO(dwadden) Write documentation. 
78 | """ 79 | self._active_namespace = f"{metadata.dataset}__relation_labels" 80 | 81 | if self._active_namespace not in self._relation_scorers: 82 | return {"loss": 0} 83 | 84 | (top_span_embeddings, top_span_mention_scores, 85 | num_spans_to_keep, top_span_mask, 86 | top_span_indices, top_spans) = self._prune_spans( 87 | spans, span_mask, span_embeddings, sentence_lengths) 88 | 89 | relation_scores = self._compute_relation_scores( 90 | self._compute_span_pair_embeddings(top_span_embeddings), top_span_mention_scores) 91 | 92 | prediction_dict, predictions = self.predict(top_spans.detach().cpu(), 93 | relation_scores.detach().cpu(), 94 | num_spans_to_keep.detach().cpu(), 95 | metadata) 96 | 97 | output_dict = {"predictions": predictions} 98 | 99 | # Evaluate loss and F1 if labels were provided. 100 | if relation_labels is not None: 101 | # Compute cross-entropy loss. 102 | gold_relations = self._get_pruned_gold_relations( 103 | relation_labels, top_span_indices, top_span_mask) 104 | 105 | cross_entropy = self._get_cross_entropy_loss(relation_scores, gold_relations) 106 | 107 | # Compute F1. 108 | assert len(prediction_dict) == len(metadata) # Make sure length of predictions is right. 109 | relation_metrics = self._relation_metrics[self._active_namespace] 110 | relation_metrics(prediction_dict, metadata) 111 | 112 | output_dict["loss"] = cross_entropy 113 | return output_dict 114 | 115 | def _prune_spans(self, spans, span_mask, span_embeddings, sentence_lengths): 116 | # Prune 117 | num_spans = spans.size(1) # Max number of spans for the minibatch. 118 | 119 | # Keep different number of spans for each minibatch entry. 
120 | num_spans_to_keep = torch.ceil(sentence_lengths.float() * self._spans_per_word).long() 121 | 122 | pruner = self._mention_pruners[self._active_namespace] 123 | (top_span_embeddings, top_span_mask, 124 | top_span_indices, top_span_mention_scores, num_spans_kept) = pruner( 125 | span_embeddings, span_mask, num_spans_to_keep) 126 | 127 | top_span_mask = top_span_mask.unsqueeze(-1) 128 | 129 | flat_top_span_indices = util.flatten_and_batch_shift_indices(top_span_indices, num_spans) 130 | top_spans = util.batched_index_select(spans, 131 | top_span_indices, 132 | flat_top_span_indices) 133 | 134 | return top_span_embeddings, top_span_mention_scores, num_spans_to_keep, top_span_mask, top_span_indices, top_spans 135 | 136 | def predict(self, top_spans, relation_scores, num_spans_to_keep, metadata): 137 | preds_dict = [] 138 | predictions = [] 139 | zipped = zip(top_spans, relation_scores, num_spans_to_keep, metadata) 140 | 141 | for top_spans_sent, relation_scores_sent, num_spans_sent, sentence in zipped: 142 | pred_dict_sent, predictions_sent = self._predict_sentence( 143 | top_spans_sent, relation_scores_sent, num_spans_sent, sentence) 144 | preds_dict.append(pred_dict_sent) 145 | predictions.append(predictions_sent) 146 | 147 | return preds_dict, predictions 148 | 149 | def _predict_sentence(self, top_spans, relation_scores, num_spans_to_keep, sentence): 150 | keep = num_spans_to_keep.item() 151 | top_spans = [tuple(x) for x in top_spans.tolist()] 152 | 153 | # Iterate over all span pairs and labels. Record the span if the label isn't null. 154 | predicted_scores_raw, predicted_labels = relation_scores.max(dim=-1) 155 | softmax_scores = F.softmax(relation_scores, dim=-1) 156 | predicted_scores_softmax, _ = softmax_scores.max(dim=-1) 157 | predicted_labels -= 1 # Subtract 1 so that null labels get -1. 
158 | 159 | keep_mask = torch.zeros(len(top_spans)) 160 | keep_mask[:keep] = 1 161 | keep_mask = keep_mask.bool() 162 | 163 | ix = (predicted_labels >= 0) & keep_mask 164 | 165 | res_dict = {} 166 | predictions = [] 167 | 168 | for i, j in ix.nonzero(as_tuple=False): 169 | span_1 = top_spans[i] 170 | span_2 = top_spans[j] 171 | label = predicted_labels[i, j].item() 172 | raw_score = predicted_scores_raw[i, j].item() 173 | softmax_score = predicted_scores_softmax[i, j].item() 174 | 175 | label_name = self.vocab.get_token_from_index(label, namespace=self._active_namespace) 176 | res_dict[(span_1, span_2)] = label_name 177 | list_entry = (span_1[0], span_1[1], span_2[0], span_2[1], label_name, raw_score, softmax_score) 178 | predictions.append(document.PredictedRelation(list_entry, sentence, sentence_offsets=True)) 179 | 180 | return res_dict, predictions 181 | 182 | # TODO(dwadden) This code is repeated elsewhere. Refactor. 183 | @overrides 184 | def get_metrics(self, reset: bool = False) -> Dict[str, float]: 185 | "Loop over the metrics for all namespaces, and return as dict." 186 | res = {} 187 | for namespace, metrics in self._relation_metrics.items(): 188 | precision, recall, f1 = metrics.get_metric(reset) 189 | prefix = namespace.replace("_labels", "") 190 | to_update = {f"{prefix}_precision": precision, 191 | f"{prefix}_recall": recall, 192 | f"{prefix}_f1": f1} 193 | res.update(to_update) 194 | 195 | res_avg = {} 196 | for name in ["precision", "recall", "f1"]: 197 | values = [res[key] for key in res if name in key] 198 | res_avg[f"MEAN__relation_{name}"] = sum(values) / len(values) if values else 0 199 | res.update(res_avg) 200 | 201 | return res 202 | 203 | @staticmethod 204 | def _compute_span_pair_embeddings(top_span_embeddings: torch.FloatTensor): 205 | """ 206 | TODO(dwadden) document me and add comments. 
207 | """ 208 | # Shape: (batch_size, num_spans_to_keep, num_spans_to_keep, embedding_size) 209 | num_candidates = top_span_embeddings.size(1) 210 | 211 | embeddings_1_expanded = top_span_embeddings.unsqueeze(2) 212 | embeddings_1_tiled = embeddings_1_expanded.repeat(1, 1, num_candidates, 1) 213 | 214 | embeddings_2_expanded = top_span_embeddings.unsqueeze(1) 215 | embeddings_2_tiled = embeddings_2_expanded.repeat(1, num_candidates, 1, 1) 216 | 217 | similarity_embeddings = embeddings_1_expanded * embeddings_2_expanded 218 | 219 | pair_embeddings_list = [embeddings_1_tiled, embeddings_2_tiled, similarity_embeddings] 220 | pair_embeddings = torch.cat(pair_embeddings_list, dim=3) 221 | 222 | return pair_embeddings 223 | 224 | def _compute_relation_scores(self, pairwise_embeddings, top_span_mention_scores): 225 | relation_feedforward = self._relation_feedforwards[self._active_namespace] 226 | relation_scorer = self._relation_scorers[self._active_namespace] 227 | 228 | batch_size = pairwise_embeddings.size(0) 229 | max_num_spans = pairwise_embeddings.size(1) 230 | feature_dim = relation_feedforward.input_dim 231 | 232 | embeddings_flat = pairwise_embeddings.view(-1, feature_dim) 233 | 234 | relation_projected_flat = relation_feedforward(embeddings_flat) 235 | relation_scores_flat = relation_scorer(relation_projected_flat) 236 | 237 | relation_scores = relation_scores_flat.view(batch_size, max_num_spans, max_num_spans, -1) 238 | 239 | # Add the mention scores for each of the candidates. 
240 | 241 | relation_scores += (top_span_mention_scores.unsqueeze(-1) + 242 | top_span_mention_scores.transpose(1, 2).unsqueeze(-1)) 243 | 244 | shape = [relation_scores.size(0), relation_scores.size(1), relation_scores.size(2), 1] 245 | dummy_scores = relation_scores.new_zeros(*shape) 246 | 247 | relation_scores = torch.cat([dummy_scores, relation_scores], -1) 248 | return relation_scores 249 | 250 | @staticmethod 251 | def _get_pruned_gold_relations(relation_labels, top_span_indices, top_span_masks): 252 | """ 253 | Loop over each slice and get the labels for the spans from that slice. 254 | All labels are offset by 1 so that the "null" label gets class zero. This is the desired 255 | behavior for the softmax. Labels corresponding to masked relations keep the label -1, which 256 | the softmax loss ignores. 257 | """ 258 | # TODO(dwadden) Test and possibly optimize. 259 | relations = [] 260 | 261 | zipped = zip(relation_labels, top_span_indices, top_span_masks.bool()) 262 | for sliced, ixs, top_span_mask in zipped: 263 | entry = sliced[ixs][:, ixs].unsqueeze(0) 264 | mask_entry = top_span_mask & top_span_mask.transpose(0, 1).unsqueeze(0) 265 | entry[mask_entry] += 1 266 | entry[~mask_entry] = -1 267 | relations.append(entry) 268 | 269 | return torch.cat(relations, dim=0) 270 | 271 | def _get_cross_entropy_loss(self, relation_scores, relation_labels): 272 | """ 273 | Compute cross-entropy loss on relation labels. Ignore diagonal entries and entries giving 274 | relations between masked out spans. 275 | """ 276 | # Need to add one for the null class. 277 | n_labels = self._n_labels[self._active_namespace] + 1 278 | scores_flat = relation_scores.view(-1, n_labels) 279 | # Need to add 1 so that the null label is 0, to line up with indices into prediction matrix. 280 | labels_flat = relation_labels.view(-1) 281 | # Compute cross-entropy loss. 
282 | loss = self._loss(scores_flat, labels_flat) 283 | return loss 284 | -------------------------------------------------------------------------------- /dygie/models/shared.py: -------------------------------------------------------------------------------- 1 | """ 2 | Short utility functions. 3 | """ 4 | 5 | import torch 6 | 7 | 8 | def cumsum_shifted(xs): 9 | """ 10 | Assumes `xs` is a 1-d array. 11 | The usual cumsum has elements [x[1], x[1] + x[2], ...]. This one has elements 12 | [0, x[1], x[1] + x[2], ...]. Useful for calculating sentence offsets. 13 | """ 14 | cs = xs.cumsum(dim=0) 15 | shift = torch.zeros(1, dtype=torch.long, device=cs.device) # Put on correct device. 16 | return torch.cat([shift, cs[:-1]], dim=0) 17 | 18 | 19 | def batch_identity(batch_size, matrix_size, *args, **kwargs): 20 | """ 21 | Tile the identity matrix along axis 0, `batch_size` times. 22 | """ 23 | ident = torch.eye(matrix_size, *args, **kwargs).unsqueeze(0) 24 | res = ident.repeat(batch_size, 1, 1) 25 | return res 26 | 27 | 28 | def fields_to_batches(d, keys_to_ignore=[]): 29 | """ 30 | The input is a dict whose items are batched tensors. The output is a list of dictionaries - one 31 | per entry in the batch - with the slices of the tensors for that entry. Here's an example. 32 | Input: 33 | d = {"a": [[1, 2], [3,4]], "b": [1, 2]} 34 | Output: 35 | res = [{"a": [1, 2], "b": 1}, {"a": [3, 4], "b": 2}]. 36 | """ 37 | keys = [key for key in d.keys() if key not in keys_to_ignore] 38 | 39 | # Make sure all input dicts have same length. If they don't, there's a problem. 40 | lengths = {k: len(d[k]) for k in keys} 41 | if len(set(lengths.values())) != 1: 42 | msg = f"fields have different lengths: {lengths}." 43 | # If there's a doc key, add it to specify where the error is. 
44 | if "doc_key" in d: 45 | msg = f"For document {d['doc_key']}, " + msg 46 | raise ValueError(msg) 47 | 48 | length = list(lengths.values())[0] 49 | res = [{k: d[k][i] for k in keys} for i in range(length)] 50 | return res 51 | 52 | 53 | def batches_to_fields(batches): 54 | """ 55 | The inverse of `fields_to_batches`. 56 | """ 57 | # Make sure all the keys match. 58 | first_keys = batches[0].keys() 59 | for entry in batches[1:]: 60 | if set(entry.keys()) != set(first_keys): 61 | raise ValueError("Keys to not match on all entries.") 62 | 63 | res = {k: [] for k in first_keys} 64 | for batch in batches: 65 | for k, v in batch.items(): 66 | res[k].append(v) 67 | 68 | return res 69 | -------------------------------------------------------------------------------- /dygie/predictors/__init__.py: -------------------------------------------------------------------------------- 1 | from dygie.predictors.dygie import DyGIEPredictor 2 | -------------------------------------------------------------------------------- /dygie/predictors/dygie.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import numpy as np 3 | import warnings 4 | 5 | from overrides import overrides 6 | import numpy 7 | import json 8 | 9 | from allennlp.common.util import JsonDict 10 | from allennlp.nn import util 11 | from allennlp.data import Batch 12 | from allennlp.data import DatasetReader 13 | from allennlp.models import Model 14 | from allennlp.predictors.predictor import Predictor 15 | 16 | 17 | @Predictor.register("dygie") 18 | class DyGIEPredictor(Predictor): 19 | """ 20 | Predictor for DyGIE model. 21 | 22 | If model was trained on coref, prediction is done on a whole document at 23 | once. This risks overflowing memory on large documents. 24 | If the model was trained without coref, prediction is done by sentence. 
25 | """ 26 | def __init__( 27 | self, model: Model, dataset_reader: DatasetReader) -> None: 28 | super().__init__(model, dataset_reader) 29 | 30 | def predict(self, document): 31 | return self.predict_json({"document": document}) 32 | 33 | def predict_tokenized(self, tokenized_document: List[str]) -> JsonDict: 34 | instance = self._words_list_to_instance(tokenized_document) 35 | return self.predict_instance(instance) 36 | 37 | @overrides 38 | def dump_line(self, outputs): 39 | # Need to override to tell Python how to deal with Numpy ints. 40 | return json.dumps(outputs, default=int) + "\n" 41 | 42 | # TODO(dwadden) Can this be implemented in `forward_on_instance` instead? 43 | @overrides 44 | def predict_instance(self, instance): 45 | """ 46 | An instance is an entire document, represented as a list of sentences. 47 | """ 48 | model = self._model 49 | cuda_device = model._get_prediction_device() 50 | 51 | # Try to predict this batch. 52 | try: 53 | dataset = Batch([instance]) 54 | dataset.index_instances(model.vocab) 55 | model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device) 56 | prediction = model.make_output_human_readable(model(**model_input)).to_json() 57 | # If we run out of GPU memory, warn user and indicate that this document failed. 58 | # This way, prediction doesn't grind to a halt every time we run out of GPU. 59 | except RuntimeError as err: 60 | # doc_key, dataset, sentences, message 61 | metadata = instance["metadata"].metadata 62 | doc_key = metadata.doc_key 63 | msg = (f"Encountered a RunTimeError on document {doc_key}. Skipping this example." 
64 | f" Error message:\n{err.args[0]}.") 65 | warnings.warn(msg) 66 | prediction = metadata.to_json() 67 | prediction["_FAILED_PREDICTION"] = True 68 | 69 | return prediction 70 | -------------------------------------------------------------------------------- /dygie/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests/ 3 | python_paths = ./ 4 | addopts = -p no:warnings -------------------------------------------------------------------------------- /dygie/spacy_interface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/dygie/spacy_interface/__init__.py -------------------------------------------------------------------------------- /dygie/spacy_interface/spacy_interface.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | from allennlp.data import Batch 3 | from dygie.models.dygie import DyGIE 4 | from dygie.data.dataset_readers.dygie import DyGIEReader 5 | from allennlp.models.archival import load_archive 6 | from allennlp.nn import util 7 | from spacy.language import Language 8 | from spacy.tokens import Span 9 | from spacy.tokens.doc import Doc 10 | from spacy.tokens.span import Span 11 | 12 | Doc.set_extension("rels", default=[], force=True) 13 | Span.set_extension("rels", default=[], force=True) 14 | Doc.set_extension("span_ents", default=[], force=True) 15 | Span.set_extension("label_", default=[], force=True) 16 | Doc.set_extension("events", default=[], force=True) 17 | Span.set_extension("events", default=[], force=True) 18 | 19 | 20 | def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc: 21 | doc_rels = [] 22 | doc_evs = [] 23 | # store events as relations. 
include confidence scores in the relation tuple (TODO: add relation property) 24 | for evs, ds in zip(prediction.get("predicted_events", []), doc.sents): 25 | sent_evs = [] 26 | for ev in evs: 27 | if len(ev)>=3: 28 | trig = [r for r in ev if r[1]=="TRIGGER"] 29 | arg0s = [r for r in ev if r[2]=="ARG0"] 30 | #example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0], [45, 45, 'ARG0', 11.3498, 1.0]] 31 | arg1s = [r for r in ev if r[2]=="ARG1"] 32 | e_trig = doc[trig[0][0]:trig[0][0]+1] 33 | for arg0 in arg0s: 34 | e_arg0 = doc[arg0[0] : arg0[1] + 1] 35 | for arg1 in arg1s: 36 | e_arg1 = doc[arg1[0] : arg1[1] + 1] 37 | #here confidence is set as the minimum among {trigger,args}, as a conservative measure. 38 | sent_evs.append({"ARG0":e_arg0,"ARG1":e_arg1,"RELATION_TRIGGER":e_trig,"CONF":min([arg0[4],arg1[4],trig[0][3]])}) 39 | 40 | doc_evs.append(sent_evs) 41 | ds._.events = sent_evs 42 | doc._.events = doc_evs 43 | #TODO add doc._.span_ents too. 44 | 45 | for rels, ds in zip(prediction.get("predicted_relations", []), doc.sents): 46 | sent_rels = [] 47 | for rel in rels: 48 | e1 = doc[rel[0] : rel[1] + 1] 49 | e2 = doc[rel[2] : rel[3] + 1] 50 | tag = rel[4] 51 | sent_rels.append((e1, e2, tag)) 52 | doc_rels.append(sent_rels) 53 | ds._.rels = sent_rels 54 | doc._.rels = doc_rels 55 | if "predicted_ner" not in prediction: 56 | return doc 57 | preds = [p for r in prediction.get("predicted_ner", []) for p in r] 58 | # storing all span based entitis to doc._.span_ents 59 | span_ents = [] 60 | for sent in prediction["predicted_ner"]: 61 | ent_sent = [] 62 | for ent in sent: 63 | d = doc[ent[0] : ent[1] + 1] 64 | d._.label_ = ent[2] 65 | ent_sent.append(d) 66 | span_ents.append(ent_sent) 67 | doc._.span_ents = span_ents 68 | # store entities to doc.ents of spacy 69 | # because spacy can't support the overlapped entities we have to merge overlapped entities 70 | # to the longest ones. 
71 | dist_ents = [] 72 | prc = [] 73 | for i, p1 in enumerate(preds): 74 | t = [p1] 75 | if i in prc: 76 | continue 77 | for j, p2 in enumerate(preds[i + 1 :]): 78 | if p2[0] <= p1[1]: 79 | t.append(p1) 80 | prc.append(j + i + 1) 81 | dist_ents.append(t) 82 | res = [] 83 | for t in dist_ents: 84 | if len(t) == 1: 85 | res.append(t[0]) 86 | elif len(t) > 1: 87 | mn = t[0][0] 88 | mx = t[0][1] 89 | for p in t[1:]: 90 | if p[0] < mn: 91 | mn = p[0] 92 | if p[1] > mx: 93 | mx = p[1] 94 | res.append([mn, mx, t[0][2], t[0][3], t[0][4]]) 95 | sel_ents = [] 96 | for ent in res: 97 | try: 98 | d = doc[ent[0] : ent[1] + 1] 99 | s = doc.char_span(d.start_char, d.end_char, label=ent[2]) 100 | if s: 101 | sel_ents.append(s) 102 | except Exception as e: 103 | print("error in spacy span", e) 104 | raise e 105 | doc.ents = sel_ents 106 | return doc 107 | 108 | 109 | class DygieppPipe: 110 | name = "dygiepp" 111 | 112 | def __init__( 113 | self, 114 | nlp: Language, 115 | pretrained_filepath: str = "./pretrained/scierc-lightweight.tar.gz", 116 | dataset_name: str = "scierc", 117 | ) -> None: 118 | """spacy factory class for adding information to spacy document. For now just entities and relations. 119 | It adds entities to doc.ents and relations to doc._.rels: List[List[Token,Token,str]] which is a list of relations 120 | as entity1, entity2, relation name 121 | 122 | Args: 123 | nlp (Language): Spacy Language instance 124 | name (str, optional): Pipe name. Defaults to "dygiepp". 125 | pretrained_filepath (str, optional): Address of pre-trained model to extract information. Defaults to "./pretrained/scierc-lightweight.tar.gz". 126 | dataset_name (str, optional): Dataset name used for model. Defaults to "scierc". 127 | """ 128 | # TODO add events and cluster information to spacy doc too 129 | archive = load_archive(pretrained_filepath) 130 | self._model = archive.model 131 | self._model.eval() 132 | archive.config["dataset_reader"].pop("type") # it's stupid but was necessary! 
133 | self._dataset_reader = DyGIEReader.from_params(archive.config["dataset_reader"]) 134 | self.dataset_name = dataset_name 135 | 136 | def __call__(self, doc: Doc) -> Doc: 137 | cuda_device = self._model._get_prediction_device() 138 | sentences = [[tok.text for tok in sent] for sent in doc.sents] 139 | ins = self._dataset_reader.text_to_instance( 140 | {"sentences": sentences, "doc_key": "test", "dataset": self.dataset_name} 141 | ) 142 | dataset = Batch([ins]) 143 | dataset.index_instances(self._model.vocab) 144 | model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device) 145 | prediction = self._model.make_output_human_readable( 146 | self._model(**model_input) 147 | ).to_json() 148 | # prepare and store ent/relation information to spacy Doc 149 | return prepare_spacy_doc(doc, prediction) 150 | -------------------------------------------------------------------------------- /dygie/tests/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hlk-1135/RadGraph/84a4574595435f84f939c66eefadcc8b67697e1c/dygie/tests/data/__init__.py -------------------------------------------------------------------------------- /dygie/tests/data/annotated_doc_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Spot checks for the classes defined in annotated_doc.py. 3 | 4 | Uses the example provided in data.md, with index modifications to account for 5 | the fact that spacy tokenizes contracted words into two tokens. 6 | 7 | Author: Serena G. 
Lotreck 8 | """ 9 | import unittest 10 | import os 11 | import shutil 12 | import sys 13 | 14 | sys.path.append('../../../scripts/new-dataset') 15 | 16 | import annotated_doc as ad 17 | import spacy 18 | 19 | 20 | class TestEnt(unittest.TestCase): 21 | def setUp(self): 22 | 23 | # Set up tempdir 24 | self.tmpdir = "tmp" 25 | os.makedirs(self.tmpdir, exist_ok=True) 26 | 27 | # Set up document text 28 | nlp = spacy.load("en_core_web_sm") 29 | dataset = 'scierc' 30 | text = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. " 31 | "She was elected in 2017.") 32 | text_path = f'{self.tmpdir}/myfile.txt' 33 | with open(text_path, 'w') as f: 34 | f.write(text) 35 | ann = ("T1\tCity 0 7\tSeattle\n" 36 | "T2\tPerson 25 37\tJenny Durkan\n" 37 | "T3\tCity 41 51\tthe city's\n" 38 | "T4\tPerson 59 62\tShe\n" 39 | "T5\tPersonnel.Election 67 74\telected\n" 40 | "T6\tYear 78 82\t2017\n" 41 | "R1\tMayor-Of Arg1:T2 Arg2:T3\n" 42 | "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n" 43 | "*\tEQUIV T1 T3\n" 44 | "*\tEQUIV T2 T4\n") 45 | ann_path = f'{self.tmpdir}/myfile.ann' 46 | with open(ann_path, 'w') as f: 47 | f.write(ann) 48 | self.sent_idx_tups = [(0, 6), (6, 14), (14, 19)] 49 | # NOTE: spacy tokenizes words with apostrophes into separate words. 
50 | 51 | # Set up annotated_doc object 52 | self.annotated_doc = ad.AnnotatedDoc.parse_ann(text_path, 53 | ann_path, 54 | nlp, 55 | dataset, 56 | coref=True) 57 | self.annotated_doc.char_to_token() 58 | 59 | # Right answer 60 | self.ner = [[[0, 0, "City"]], [[6, 7, "Person"], [9, 11, "City"]], 61 | [[14, 14, "Person"], [16, 16, "Personnel.Election"], 62 | [18, 18, "Year"]]] 63 | 64 | def tearDown(self): 65 | 66 | shutil.rmtree(self.tmpdir) 67 | 68 | def test_format_ner_dygiepp(self): 69 | 70 | ner = ad.Ent.format_ner_dygiepp(self.annotated_doc.ents, 71 | self.sent_idx_tups) 72 | 73 | self.assertEqual(ner, self.ner) 74 | 75 | 76 | class TestBinRel(unittest.TestCase): 77 | def setUp(self): 78 | 79 | # Set up tempdir 80 | self.tmpdir = "tmp" 81 | os.makedirs(self.tmpdir, exist_ok=True) 82 | 83 | # Set up document text 84 | nlp = spacy.load("en_core_web_sm") 85 | dataset = 'scierc' 86 | text = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. " 87 | "She was elected in 2017.") 88 | text_path = f'{self.tmpdir}/myfile.txt' 89 | with open(text_path, 'w') as f: 90 | f.write(text) 91 | ann = ("T1\tCity 0 7\tSeattle\n" 92 | "T2\tPerson 25 37\tJenny Durkan\n" 93 | "T3\tCity 41 51\tthe city's\n" 94 | "T4\tPerson 59 62\tShe\n" 95 | "T5\tPersonnel.Election 67 74\telected\n" 96 | "T6\tYear 78 82\t2017\n" 97 | "R1\tMayor-Of Arg1:T2 Arg2:T3\n" 98 | "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n" 99 | "*\tEQUIV T1 T3\n" 100 | "*\tEQUIV T2 T4\n") 101 | ann_path = f'{self.tmpdir}/myfile.ann' 102 | with open(ann_path, 'w') as f: 103 | f.write(ann) 104 | self.sent_idx_tups = [(0, 6), (6, 14), (14, 19)] 105 | # NOTE: spacy tokenizes words with apostrophes into separate words. 
106 | 107 | # Set up annotated_doc object 108 | self.annotated_doc = ad.AnnotatedDoc.parse_ann(text_path, 109 | ann_path, 110 | nlp, 111 | dataset, 112 | coref=True) 113 | self.annotated_doc.char_to_token() 114 | 115 | # Set up relation 116 | self.rel1 = ad.BinRel("R1\tMayor-Of Arg1:T2 Arg2:T3".split()) 117 | 118 | # Right answer 119 | self.relations = [[], [[6, 7, 9, 11, "Mayor-Of"]], []] 120 | 121 | def tearDown(self): 122 | 123 | shutil.rmtree(self.tmpdir) 124 | 125 | def test_set_arg_objects(self): 126 | 127 | self.rel1.set_arg_objects(self.annotated_doc.ents) 128 | 129 | self.assertEqual(self.rel1.arg1, self.annotated_doc.ents[1]) 130 | self.assertEqual(self.rel1.arg2, self.annotated_doc.ents[2]) 131 | 132 | def test_format_bin_rels_dygiepp(self): 133 | 134 | self.rel1.set_arg_objects(self.annotated_doc.ents) 135 | relations = ad.BinRel.format_bin_rels_dygiepp([self.rel1], 136 | self.sent_idx_tups) 137 | 138 | self.assertEqual(relations, self.relations) 139 | 140 | 141 | class TestEvent(unittest.TestCase): 142 | def setUp(self): 143 | 144 | # Set up tempdir 145 | self.tmpdir = "tmp" 146 | os.makedirs(self.tmpdir, exist_ok=True) 147 | 148 | # Set up document text 149 | nlp = spacy.load("en_core_web_sm") 150 | dataset = 'scierc' 151 | text = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. 
" 152 | "She was elected in 2017.") 153 | text_path = f'{self.tmpdir}/myfile.txt' 154 | with open(text_path, 'w') as f: 155 | f.write(text) 156 | ann = ("T1\tCity 0 7\tSeattle\n" 157 | "T2\tPerson 25 37\tJenny Durkan\n" 158 | "T3\tCity 41 51\tthe city's\n" 159 | "T4\tPerson 59 62\tShe\n" 160 | "T5\tPersonnel.Election 67 74\telected\n" 161 | "T6\tYear 78 82\t2017\n" 162 | "R1\tMayor-Of Arg1:T2 Arg2:T3\n" 163 | "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n" 164 | "*\tEQUIV T1 T3\n" 165 | "*\tEQUIV T2 T4\n") 166 | ann_path = f'{self.tmpdir}/myfile.ann' 167 | with open(ann_path, 'w') as f: 168 | f.write(ann) 169 | self.sent_idx_tups = [(0, 6), (6, 14), (14, 19)] 170 | # NOTE: spacy tokenizes words with apostrophes into separate words. 171 | 172 | # Set up annotated_doc object 173 | self.annotated_doc = ad.AnnotatedDoc.parse_ann(text_path, 174 | ann_path, 175 | nlp, 176 | dataset, 177 | coref=True) 178 | self.annotated_doc.char_to_token() 179 | 180 | # Set up events 181 | self.event1 = ad.Event( 182 | "E1\tPersonnel.Election:T5 Person:T4 Year:T6".split()) 183 | 184 | # Right answer 185 | self.events = [[], [], 186 | [[[16, "Personnel.Election"], [14, 14, "Person"], 187 | [18, 18, "Year"]]]] 188 | 189 | def tearDown(self): 190 | 191 | shutil.rmtree(self.tmpdir) 192 | 193 | def test_set_arg_objects(self): 194 | 195 | self.event1.set_arg_objects(self.annotated_doc.ents) 196 | 197 | self.assertEqual(self.event1.trigger, self.annotated_doc.ents[4]) 198 | self.assertEqual( 199 | self.event1.args, 200 | [self.annotated_doc.ents[3], self.annotated_doc.ents[5]]) 201 | 202 | def test_format_events_dygiepp(self): 203 | 204 | self.event1.set_arg_objects(self.annotated_doc.ents) 205 | events = ad.Event.format_events_dygiepp([self.event1], 206 | self.sent_idx_tups) 207 | 208 | self.assertEqual(events, self.events) 209 | 210 | 211 | class TestEquivRel(unittest.TestCase): 212 | def setUp(self): 213 | 214 | # Set up tempdir 215 | self.tmpdir = "tmp" 216 | os.makedirs(self.tmpdir, 
exist_ok=True) 217 | 218 | # Set up document text 219 | nlp = spacy.load("en_core_web_sm") 220 | dataset = 'scierc' 221 | text = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. " 222 | "She was elected in 2017.") 223 | text_path = f'{self.tmpdir}/myfile.txt' 224 | with open(text_path, 'w') as f: 225 | f.write(text) 226 | ann = ("T1\tCity 0 7\tSeattle\n" 227 | "T2\tPerson 25 37\tJenny Durkan\n" 228 | "T3\tCity 41 51\tthe city's\n" 229 | "T4\tPerson 59 62\tShe\n" 230 | "T5\tPersonnel.Election 67 74\telected\n" 231 | "T6\tYear 78 82\t2017\n" 232 | "R1\tMayor-Of Arg1:T2 Arg2:T3\n" 233 | "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n" 234 | "*\tEQUIV T1 T3\n" 235 | "*\tEQUIV T2 T4\n") 236 | ann_path = f'{self.tmpdir}/myfile.ann' 237 | with open(ann_path, 'w') as f: 238 | f.write(ann) 239 | 240 | # Set up annotated_doc object 241 | self.annotated_doc = ad.AnnotatedDoc.parse_ann(text_path, 242 | ann_path, 243 | nlp, 244 | dataset, 245 | coref=True) 246 | self.annotated_doc.char_to_token() 247 | 248 | # Set up equivalence relations 249 | self.equivrel1 = ad.EquivRel("*\tEQUIV T1 T3".split()) 250 | self.equivrel2 = ad.EquivRel("*\tEQUIV T2 T4".split()) 251 | 252 | # The dygiepp-formatted correct answer 253 | self.corefs = [[[0, 0], [9, 11]], [[6, 7], [14, 14]]] 254 | 255 | def tearDown(self): 256 | 257 | shutil.rmtree(self.tmpdir) 258 | 259 | def test_set_arg_objects(self): 260 | 261 | self.equivrel1.set_arg_objects(self.annotated_doc.ents) 262 | self.equivrel2.set_arg_objects(self.annotated_doc.ents) 263 | 264 | self.assertEqual( 265 | self.equivrel1.args, 266 | [self.annotated_doc.ents[0], self.annotated_doc.ents[2]]) 267 | self.assertEqual( 268 | self.equivrel2.args, 269 | [self.annotated_doc.ents[1], self.annotated_doc.ents[3]]) 270 | 271 | def test_format_corefs_dygiepp(self): 272 | 273 | self.equivrel1.set_arg_objects(self.annotated_doc.ents) 274 | self.equivrel2.set_arg_objects(self.annotated_doc.ents) 275 | corefs = ad.EquivRel.format_corefs_dygiepp( 
276 | [self.equivrel1, self.equivrel2]) 277 | 278 | self.assertEqual(corefs, self.corefs) 279 | 280 | 281 | class TestAnnotatedDoc(unittest.TestCase): 282 | """ 283 | Tests the functionality of char_to_token and format_dygiepp. 284 | """ 285 | def setUp(self): 286 | 287 | # Set up temp dir and test docs 288 | self.tmpdir = "tmp" 289 | os.makedirs(self.tmpdir, exist_ok=True) 290 | 291 | txt = ("Seattle is a rainy city. Jenny Durkan is the city's mayor. " 292 | "She was elected in 2017.") 293 | 294 | self.txt = f'{self.tmpdir}/myfile.txt' 295 | with open(self.txt, 'w') as f: 296 | f.write(txt) 297 | 298 | ann = ("T1\tCity 0 7\tSeattle\n" 299 | "T2\tPerson 25 37\tJenny Durkan\n" 300 | "T3\tCity 41 51\tthe city's\n" 301 | "T4\tPerson 59 62\tShe\n" 302 | "T5\tPersonnel.Election 67 74\telected\n" 303 | "T6\tYear 78 82\t2017\n" 304 | "R1\tMayor-Of Arg1:T2 Arg2:T3\n" 305 | "E1\tPersonnel.Election:T5 Person:T4 Year:T6\n" 306 | "*\tEQUIV T1 T3\n" 307 | "*\tEQUIV T2 T4\n") 308 | 309 | self.ann = f'{self.tmpdir}/myfile.ann' 310 | with open(self.ann, 'w') as f: 311 | f.write(ann) 312 | 313 | # Define other attributes 314 | self.nlp = spacy.load("en_core_web_sm") 315 | self.dataset = 'scierc' 316 | 317 | # Define right answer 318 | self.dygiepp_dict = { 319 | "doc_key": 320 | "myfile", 321 | "dataset": 322 | self.dataset, 323 | "sentences": 324 | [[tok.text for tok in sent] for sent in self.nlp(txt).sents], 325 | "ner": [[[0, 0, "City"]], [[6, 7, "Person"], [9, 11, "City"]], 326 | [[14, 14, "Person"], [16, 16, "Personnel.Election"], 327 | [18, 18, "Year"]]], 328 | "relations": [[], [[6, 7, 9, 11, "Mayor-Of"]], []], 329 | "clusters": [[[0, 0], [9, 11]], [[6, 7], [14, 14]]], 330 | "events": [[], [], 331 | [[[16, "Personnel.Election"], [14, 14, "Person"], 332 | [18, 18, "Year"]]]] 333 | } 334 | 335 | def tearDown(self): 336 | 337 | shutil.rmtree(self.tmpdir) 338 | 339 | def test_char_to_token(self): 340 | 341 | annotated_doc = ad.AnnotatedDoc.parse_ann(self.txt, 342 | self.ann, 343 
| self.nlp, 344 | self.dataset, 345 | coref=True) 346 | annotated_doc.char_to_token() 347 | 348 | self.assertEqual(annotated_doc.ents[0].tok_start, 0) 349 | self.assertEqual(annotated_doc.ents[1].tok_start, 6) 350 | self.assertEqual(annotated_doc.ents[2].tok_start, 9) 351 | 352 | self.assertEqual(annotated_doc.ents[0].tok_end, 0) 353 | self.assertEqual(annotated_doc.ents[1].tok_end, 7) 354 | self.assertEqual(annotated_doc.ents[2].tok_end, 11) 355 | 356 | def test_format_dygiepp(self): 357 | 358 | annotated_doc = ad.AnnotatedDoc.parse_ann(self.txt, 359 | self.ann, 360 | self.nlp, 361 | self.dataset, 362 | coref=True) 363 | annotated_doc.char_to_token() 364 | res = annotated_doc.format_dygiepp() 365 | 366 | self.assertEqual(res, self.dygiepp_dict) 367 | 368 | 369 | if __name__ == "__main__": 370 | unittest.main() 371 | -------------------------------------------------------------------------------- /dygie/tests/data/collate_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test that a dataset doesn't change when it's collated and then de-collated. 3 | """ 4 | 5 | import unittest 6 | import json 7 | import os 8 | import shutil 9 | import sys 10 | from pathlib import Path 11 | 12 | 13 | # Since the collating code isn't inside the `dygie` package, I need to do a little work to import 14 | # it. 15 | current_dir = Path(os.path.dirname(os.path.realpath(__file__))) 16 | common_root = current_dir.parent.parent.parent 17 | collate_dir = f"{common_root}/scripts/data/shared" 18 | sys.path.append(collate_dir) 19 | 20 | # Now import the code 21 | import collate 22 | import uncollate 23 | 24 | 25 | # Utility function. 26 | def load_jsonl(fname): 27 | with open(fname) as f: 28 | return [json.loads(x) for x in f] 29 | 30 | 31 | # The actual tests. 
32 | class TestCollate(unittest.TestCase): 33 | def setUp(self): 34 | self.collated_dir = "tmp/collated" 35 | self.uncollated_dir = "tmp/uncollated" 36 | os.makedirs(self.collated_dir, exist_ok=True) 37 | os.makedirs(self.uncollated_dir, exist_ok=True) 38 | 39 | def tearDown(self): 40 | shutil.rmtree("tmp") 41 | 42 | @staticmethod 43 | def is_same(x1, x2): 44 | "Compare the fields in two dicts loaded from json." 45 | # Check if keys are same. 46 | if sorted(x1.keys()) != sorted(x2.keys()): 47 | return False 48 | 49 | # Loop over all fields. If not same, return False. 50 | for key in x1: 51 | if x1[key] != x2[key]: 52 | return False 53 | 54 | # If we get to the end, they're the same. 55 | return True 56 | 57 | def files_same(self, f1, f2): 58 | "Check that contents of two files are the same." 59 | data1 = load_jsonl(f1) 60 | data2 = load_jsonl(f2) 61 | 62 | # Ignore these in the comparison; `dataset` gets added, while `sentence_start` and 63 | # `clusters` get removed. 64 | fields_to_ignore = ["dataset", "sentence_start", "clusters"] 65 | for data in [data1, data2]: 66 | for entry in data: 67 | # Since the input data doesn't have a `dataset` field, we don't want to compare on 68 | # this. 69 | for field_to_ignore in fields_to_ignore: 70 | if field_to_ignore in entry: 71 | del entry[field_to_ignore] 72 | 73 | if len(data1) != len(data2): 74 | return False 75 | 76 | for entry1, entry2 in zip(data1, data2): 77 | if not self.is_same(entry1, entry2): 78 | return False 79 | 80 | return True 81 | 82 | def check_collate(self, dirname): 83 | input_dir = f"fixtures/collate/{dirname}" 84 | 85 | # Make the collator. 86 | collator_args = collate.get_args([input_dir, self.collated_dir, "--file_extension=json", 87 | f"--dataset={dirname}"]) 88 | collator_runner = collate.CollateRunner(**vars(collator_args)) 89 | 90 | # Make the uncollator. 
91 | uncollator_args = uncollate.get_args( 92 | [self.collated_dir, self.uncollated_dir, f"--order_like_directory={input_dir}", 93 | "--file_extension=json"]) 94 | uncollator_runner = uncollate.UnCollateRunner(**vars(uncollator_args)) 95 | 96 | # Run both. 97 | collator_runner.run() 98 | uncollator_runner.run() 99 | 100 | for name in ["train", "dev", "test"]: 101 | assert self.files_same(f"{input_dir}/{name}.json", f"{self.uncollated_dir}/{name}.json") 102 | 103 | def test_collate(self): 104 | "Make sure that our Document class can read and write data without changing it." 105 | for dirname in ["ace-event", "scierc"]: 106 | self.check_collate(dirname) 107 | 108 | 109 | if __name__ == "__main__": 110 | unittest.main() 111 | -------------------------------------------------------------------------------- /dygie/tests/data/document_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Spot-checks for the Document class. 3 | """ 4 | 5 | import unittest 6 | import json 7 | import os 8 | import shutil 9 | 10 | from dygie.data import Document 11 | 12 | 13 | class TestDocument(unittest.TestCase): 14 | def setUp(self): 15 | self.tmpdir = "tmp" 16 | os.makedirs(self.tmpdir, exist_ok=True) 17 | 18 | def tearDown(self): 19 | shutil.rmtree(self.tmpdir) 20 | 21 | @staticmethod 22 | def is_same(x1, x2): 23 | "Compare the fields in two dicts loaded from json." 24 | # Check if keys are same. 25 | if x1.keys() != x2.keys(): 26 | return False 27 | 28 | # Loop over all fields. If not same, return False. 29 | for key in x1: 30 | if x1[key] != x2[key]: 31 | return False 32 | 33 | # If we get to the end, they're the same. 34 | return True 35 | 36 | def check_document(self, document_name): 37 | # Load the original file. 38 | with open(f"fixtures/{document_name}.json") as f: 39 | js = json.load(f) 40 | doc = Document.from_json(js) 41 | 42 | # Dump to file. 
43 | tmpfile = f"{self.tmpdir}/{document_name}.json" 44 | dumped = doc.to_json() 45 | with open(tmpfile, "w") as f: 46 | json.dump(dumped, f) 47 | 48 | # Reload and compare. 49 | with open(tmpfile) as f: 50 | reloaded = json.load(f) 51 | assert self.is_same(js, reloaded) 52 | 53 | def test_document(self): 54 | "Make sure that our Document class can read and write data without changing it." 55 | for document_name in ["ace_event_article", "scierc_article", "ace_event_coref_article"]: 56 | self.check_document(document_name) 57 | 58 | 59 | if __name__ == "__main__": 60 | unittest.main() 61 | -------------------------------------------------------------------------------- /dygie/tests/data/dygie_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Short unit tests to make sure our dataset readers are behaving correctly. 3 | Checks a sample from the scierc data 4 | """ 5 | 6 | import unittest 7 | from allennlp.data.vocabulary import Vocabulary 8 | 9 | from dygie.data import DyGIEReader 10 | 11 | 12 | class TestDygieReader(unittest.TestCase): 13 | 14 | def setUp(self): 15 | # scierc 16 | # Sentence lengths: [20, 23, 36, 14, 14, 30, 31, 15]. 17 | # Cumulative sentence lengths: [20, 43, 79, 93, 107, 137, 168, 183]. 
18 | self.reader = DyGIEReader(max_span_width=5) 19 | self.dataset = self.reader.read("dygie/tests/fixtures/scierc_article.json") 20 | 21 | def tearDown(self): 22 | pass 23 | 24 | def test_tokens_correct_scierc(self): 25 | # instances are now entire documents instead of sentences 26 | instance = self.dataset.instances[0] 27 | tokens = instance["text"][4][0:] 28 | assert len(tokens) == 14 29 | text = [token.text for token in tokens] 30 | assert text[:6] == ["Thirdly", "the", "learned", "intrinsic", "object", "structure"] 31 | 32 | def test_ner_correct_scierc(self): 33 | instance = self.dataset.instances[0] 34 | ner_field = instance["ner_labels"][3] 35 | spans = instance["spans"][3] 36 | 37 | for label, span in zip(ner_field, spans): 38 | start, end = span.span_start, span.span_end 39 | if start == 2 and end == 3: 40 | assert label.label == "Method" 41 | elif start == 11 and end == 12: 42 | assert label.label == "Method" 43 | else: 44 | assert label.label == "" 45 | 46 | def test_relation_correct_scierc(self): 47 | instance = self.dataset.instances[0] 48 | relation_field = instance["relation_labels"][5] 49 | span_list = relation_field.sequence_field 50 | # There should be one relation in this sentence, 51 | indices = relation_field.indices 52 | labels = relation_field.labels 53 | assert len(indices) == len(labels) == 1 54 | ix = indices[0] 55 | label = labels[0] 56 | # Check that the relation refers to the correct spans 57 | span1 = span_list[ix[0]] 58 | span2 = span_list[ix[1]] 59 | assert ((span1.span_start == 19 and span1.span_end == 20 and 60 | span2.span_start == 22 and span2.span_end == 24)) 61 | # Check that the label's correct. 62 | assert label == "USED-FOR" 63 | 64 | def test_coref_correct_scierc(self): 65 | instance = self.dataset.instances[0] 66 | coref_field = instance["coref_labels"] 67 | spans = instance["spans"] 68 | # A list, one entry per sentence. For each sentence, a dict mapping spans to cluster id's. 
69 | cluster_mappings = [{(6, 6): 1}, 70 | {}, 71 | {(19, 21): 0}, 72 | {(11, 12): 0, (2, 3): 2}, 73 | {(3, 5): 0}, 74 | {(5, 7): 0, (19, 20): 2, (22, 24): 3}, 75 | {(5, 5): 3}, 76 | {(2, 2): 1}] 77 | for instance, cluster_mapping, span in zip(coref_field, cluster_mappings, spans): 78 | curr_coref_field = instance 79 | curr_span = span 80 | for label, span in zip(curr_coref_field, curr_span): 81 | start, end = span.span_start, span.span_end 82 | if (start, end) in cluster_mapping: 83 | # print(start, end) 84 | # print(label.label) 85 | assert cluster_mapping[(start, end)] == label.label 86 | else: 87 | assert label.label == -1 88 | 89 | def test_vocab_size_correct_scierc(self): 90 | vocab = Vocabulary.from_instances(self.dataset.instances) 91 | # There are 4 unique NER labels and 6 relation labels in the text fixture doc. For the ner 92 | # labels, there is an extra category for the null label. For the relation labels, there 93 | # isn't. This is due to the way their respective `Field`s represent labels. 94 | assert vocab.get_vocab_size("None__ner_labels") == 5 95 | assert vocab.get_vocab_size("None__relation_labels") == 6 96 | # For numeric labels, vocab size is 0. 
97 | assert vocab.get_vocab_size("coref_labels") == 0 98 | 99 | 100 | if __name__ == "__main__": 101 | unittest.main() 102 | -------------------------------------------------------------------------------- /dygie/tests/data/spacy_interface_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from dygie.spacy_interface.spacy_interface import prepare_spacy_doc 3 | import spacy 4 | 5 | class TestSpacyInterface(unittest.TestCase): 6 | 7 | def setUp(self) -> None: 8 | nlp = spacy.load('en_core_web_sm') 9 | text = "Title: VocGAN: A High-Fidelity Real-time Vocoder with a Hierarchically-nested Adversarial Network\nSection:" 10 | doc = nlp(text) 11 | sentences = [[tok.text for tok in sent] for sent in doc.sents] 12 | self.prediction = {'doc_key': 'test', 13 | 'dataset': 'scierc', 14 | 'sentences': sentences, 15 | 'predicted_ner': [[[2, 2, 'Method', 15.5283, 1.0], 16 | [5, 11, 'Method', 3.0847, 0.9563], 17 | [6, 11, 'Method', 3.8185, 0.9672], 18 | [14, 18, 'Method', 3.4321, 0.9686], 19 | [15, 18, 'Method', 11.8431, 1.0], 20 | [19, 19, 'Generic', 4.7359, 0.7531]]], 21 | 'predicted_relations': [[[2, 2, 6, 11, 'HYPONYM-OF', 2.0108, 0.8819], 22 | [19, 19, 19, 19, 'USED-FOR', 0.8034, 0.2309]]]} 23 | self.doc = doc 24 | return super().setUp() 25 | 26 | def test_relation(self): 27 | doc = prepare_spacy_doc(self.doc, self.prediction) 28 | # number of sentences 29 | self.assertEqual(len(doc._.rels),1) 30 | # number of relations 31 | self.assertEqual(len(doc._.rels[0]),2) 32 | # type of relations 33 | self.assertEqual(doc._.rels[0][0][2], 'HYPONYM-OF') 34 | self.assertEqual(doc._.rels[0][1][2], 'USED-FOR') 35 | 36 | 37 | def test_span_based_entity(self): 38 | doc = prepare_spacy_doc(self.doc, self.prediction) 39 | # number of sentences 40 | self.assertEqual(len(doc._.span_ents),1) 41 | # number of span based entities 42 | self.assertEqual(len(doc._.span_ents[0]),6) 43 | 44 | def test_spacy_entity(self): 45 | doc = 
prepare_spacy_doc(self.doc, self.prediction) 46 | # number of proned merged entities 47 | self.assertEqual(len(doc.ents),4) 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | if __name__ == '__main__': 56 | unittest.main() -------------------------------------------------------------------------------- /dygie/tests/fixtures/ace_event_article.json: -------------------------------------------------------------------------------- 1 | {"events": [[], [[[22, "conflict_attack"]]], [], [], [[[99, "movement_transport"], [97, 98, "vehicle"], [97, 97, "artifact"], [101, 102, "destination"]]], [[[119, "movement_transport"], [114, 114, "vehicle"], [118, 118, "artifact"], [121, 124, "destination"]]], [[[137, "conflict_attack"]]], [], [], [], [], [], [], [], [], [], [[[235, "conflict_attack"], [232, 236, "attacker"]]], [], [], [], [[[273, "conflict_attack"], [270, 271, "target"]]], [], [[[291, "movement_transport"], [288, 289, "artifact"], [293, 293, "destination"]]], [], [], [], [], [], [], [], [], [[[412, "conflict_demonstrate"], [408, 408, "entity"], [409, 410, "time"]], [[415, "conflict_attack"]]], [[[430, "conflict_demonstrate"]]], [], [[[468, "conflict_demonstrate"], [454, 455, "place"], [457, 464, "entity"], [471, 471, "time"]]], [], [[[499, "conflict_demonstrate"], [491, 492, "place"], [496, 496, "time"]]], [], [], [[[559, "conflict_demonstrate"], [563, 567, "time"]]], [[[579, "conflict_demonstrate"], [573, 585, "place"]], [[585, "conflict_attack"], [584, 584, "place"]]], [], [], [], [], [], [], [], [[[693, "conflict_demonstrate"]]], [[[705, "conflict_demonstrate"], [716, 717, "place"]], [[721, "conflict_attack"], [720, 720, "place"]]], [], [], [], [], [], [], [[[834, "life_injure"], [833, 837, "victim"]], [[837, "conflict_attack"], [833, 837, "target"]]], [[[855, "conflict_attack"], [857, 860, "attacker"], [876, 876, "place"], [844, 845, "target"], [848, 852, "target"]]], [], [], [], [], [], [], [], [[[1007, "conflict_attack"]]], [[[1012, "conflict_attack"], [1013, 1015, 
"target"]]], [], [], [], [[[1063, "conflict_demonstrate"]]], [], [], [], [[[1088, "conflict_demonstrate"]]], [], []], "ner": [[[0, 0, "PER"], [1, 1, "PER"], [3, 3, "PER"]], [[10, 10, "PER"], [28, 31, "PER"], [29, 29, "PER"], [34, 34, "PER"], [37, 37, "PER"]], [[47, 48, "LOC"], [56, 65, "PER"], [63, 65, "VEH"], [64, 64, "VEH"]], [[74, 76, "PER"], [74, 74, "PER"], [81, 85, "GPE"], [82, 82, "GPE"], [85, 85, "GPE"]], [[91, 92, "PER"], [94, 95, "PER"], [94, 94, "ORG"], [97, 97, "PER"], [97, 98, "VEH"], [101, 102, "LOC"], [106, 108, "PER"], [108, 108, "LOC"]], [[114, 114, "VEH"], [118, 118, "PER"], [121, 124, "GPE"], [124, 124, "GPE"]], [[126, 127, "PER"]], [[140, 140, "time"], [141, 141, "PER"]], [[147, 155, "PER"], [148, 148, "GPE"], [151, 151, "PER"], [159, 160, "PER"]], [[168, 170, "ORG"]], [[179, 180, "PER"]], [[183, 186, "PER"], [183, 186, "PER"]], [[193, 194, "PER"], [196, 197, "PER"]], [[208, 210, "GPE"], [209, 209, "GPE"]], [[218, 219, "PER"], [220, 220, "GPE"], [222, 222, "GPE"]], [[224, 224, "GPE"], [226, 226, "GPE"], [228, 228, "GPE"]], [[230, 230, "GPE"], [232, 236, "GPE"], [236, 236, "GPE"]], [[238, 238, "PER"], [241, 241, "PER"], [244, 246, "ORG"], [244, 244, "PER"]], [[248, 249, "PER"], [250, 250, "PER"], [252, 254, "PER"]], [[256, 256, "PER"]], [[270, 271, "PER"]], [[275, 281, "VEH"], [277, 281, "VEH"], [280, 280, "PER"], [280, 281, "VEH"]], [[288, 289, "PER"], [293, 293, "GPE"]], [[295, 295, "PER"]], [], [[321, 321, "GPE"], [324, 327, "LOC"], [327, 327, "GPE"], [337, 339, "PER"], [339, 339, "LOC"], [341, 341, "LOC"], [344, 349, "LOC"]], [[351, 351, "time"], [352, 352, "LOC"], [353, 354, "PER"]], [[358, 359, "PER"]], [[364, 365, "PER"], [367, 368, "ORG"], [370, 372, "GPE"], [371, 372, "LOC"]], [[378, 378, "PER"], [383, 383, "ORG"], [388, 389, "GPE"]], [[393, 393, "PER"], [396, 396, "PER"], [400, 400, "GPE"], [402, 402, "LOC"]], [[405, 405, "ORG"], [408, 408, "PER"], [409, 410, "time"], [420, 421, "PER"]], [[423, 423, "ORG"]], [[432, 432, "GPE"], [434, 
436, "GPE"], [437, 438, "time"], [441, 441, "PER"], [443, 444, "time"], [446, 447, "PER"], [450, 451, "FAC"]], [[454, 455, "GPE"], [457, 464, "ORG"], [471, 471, "time"]], [[473, 473, "PER"], [473, 475, "PER"], [478, 481, "PER"]], [[489, 489, "GPE"], [491, 492, "GPE"], [496, 496, "time"], [506, 513, "PER"], [507, 507, "GPE"], [510, 513, "LOC"], [511, 512, "LOC"]], [[515, 515, "ORG"], [515, 518, "PER"], [521, 521, "PER"], [523, 524, "GPE"]], [[526, 526, "PER"], [531, 531, "PER"], [535, 535, "GPE"]], [[537, 538, "PER"], [540, 540, "ORG"], [540, 541, "PER"], [543, 543, "PER"], [545, 545, "PER"], [547, 567, "PER"], [551, 551, "PER"], [557, 558, "PER"], [563, 567, "time"]], [[571, 571, "GPE"], [573, 585, "GPE"], [584, 584, "GPE"], [589, 595, "PER"]], [[597, 597, "PER"], [603, 604, "PER"]], [[606, 606, "PER"], [611, 611, "PER"]], [[617, 618, "PER"], [624, 626, "PER"]], [[628, 628, "PER"], [632, 640, "PER"], [634, 640, "PER"], [634, 638, "ORG"], [634, 635, "GPE"], [637, 637, "ORG"]], [[642, 642, "PER"], [645, 645, "PER"], [647, 647, "PER"]], [[651, 651, "PER"], [653, 653, "PER"], [658, 658, "PER"]], [[667, 667, "PER"], [669, 670, "ORG"], [678, 678, "ORG"]], [], [[696, 696, "PER"], [699, 700, "PER"], [703, 703, "PER"], [709, 709, "PER"], [716, 717, "GPE"], [720, 720, "GPE"]], [], [[730, 730, "PER"], [741, 748, "PER"], [743, 748, "PER"], [750, 751, "PER"], [753, 755, "PER"]], [[757, 757, "PER"], [760, 763, "ORG"]], [[767, 767, "PER"], [771, 772, "PER"]], [[776, 779, "ORG"]], [[782, 787, "PER"], [785, 787, "ORG"], [789, 791, "ORG"], [799, 799, "ORG"], [805, 805, "PER"], [809, 809, "ORG"], [822, 822, "ORG"]], [[824, 824, "PER"], [827, 827, "PER"], [833, 837, "PER"]], [[839, 839, "PER"], [840, 840, "ORG"], [844, 845, "PER"], [848, 852, "PER"], [857, 860, "ORG"], [858, 858, "GPE"], [860, 860, "PER"], [862, 862, "PER"], [866, 867, "GPE"], [872, 873, "GPE"], [876, 876, "GPE"]], [[878, 878, "PER"], [891, 893, "LOC"], [892, 892, "PER"], [895, 897, "GPE"], [896, 896, "GPE"]], [[899, 
899, "PER"], [901, 901, "PER"], [903, 903, "PER"], [907, 910, "PER"], [907, 907, "PER"], [910, 910, "GPE"], [914, 914, "PER"], [920, 921, "PER"]], [[923, 923, "PER"], [926, 926, "PER"], [928, 930, "GPE"], [929, 929, "GPE"], [933, 939, "GPE"]], [[942, 942, "PER"], [942, 943, "PER"], [945, 945, "PER"]], [[952, 952, "PER"], [954, 954, "PER"], [957, 958, "FAC"]], [[960, 960, "PER"], [969, 969, "PER"], [979, 980, "FAC"]], [[982, 982, "PER"], [986, 986, "PER"], [989, 989, "PER"], [992, 992, "PER"], [994, 994, "GPE"]], [], [[1013, 1015, "ORG"], [1014, 1014, "GPE"]], [[1017, 1017, "GPE"]], [[1026, 1026, "PER"], [1028, 1028, "PER"], [1030, 1034, "PER"], [1032, 1034, "PER"], [1039, 1040, "PER"]], [[1043, 1043, "PER"], [1045, 1045, "PER"], [1048, 1048, "PER"], [1051, 1051, "PER"]], [], [[1065, 1065, "PER"]], [[1069, 1069, "PER"]], [[1073, 1073, "PER"], [1075, 1075, "PER"]], [[1077, 1077, "PER"], [1078, 1079, "PER"]], [[1094, 1095, "time"], [1105, 1106, "PER"], [1114, 1115, "time"]], [[1117, 1117, "PER"]]], "sentences": [["WOODRUFF", "I", "hope", "they", "get", "a", "little", "rest", "."], ["When", "we", "come", "back", ",", "one", "of", "the", "many", "sad", "aspects", "of", "this", "war", ",", "humanitarian", "aid", "rushed", "to", "people", "who", "need", "it", "but", "not", "everybody", "gets", "what", "they", "came", "for", "."], ["Slowly", "humanitarian", "aid", "is", "rolling", "into", "southern", "Iraq", "but", "dramatic", "scenes", "like", "this", "one", "of", "a", "crowd", "tearing", "into", "the", "supplies", "on", "a", "convoy", "truck", "only", "underscores", "how", "desperate", "the", "need", "is", "."], ["Correspondent", "Martin", "Geissler", "files", "this", "report", "from", "the", "Iraqi", "town", "of", "Safwan", "."], ["-LRB-", "BEGIN", "VIDEOTAPE", "-RRB-", "MARTIN", "GEISSLER", ",", "CNN", "CORRESPONDENT", "As", "our", "convoy", "rolled", "through", "southern", "Iraq", "the", "desperation", "of", "the", "people", "here", "soon", "became", "evident", "."], 
["In", "trucks", "and", "on", "foot", "they", "came", "to", "the", "town", "of", "Safwan", "."], ["These", "people", "have", "been", "without", "food", "or", "water", "supplies", "since", "the", "war", "began", "."], ["Now", "they", "are", "desperate", "."], ["Within", "seconds", "the", "Kuwaiti", "aid", "workers", "who", "had", "organized", "this", "trip", "were", "overpowered", "by", "the", "mob", "."], ["These", "desperate", "scenes", "are", "exactly", "what", "the", "aide", "agencies", "wanted", "to", "avoid", "."], ["This", "is", "survival", "of", "the", "fittest", "."], ["Only", "the", "healthy", "and", "strong", "can", "get", "to", "the", "food", "."], ["The", "weak", "and", "the", "ill", "are", "left", "with", "nothing", "."], ["Despite", "this", "effort", "to", "help", "the", "Iraqi", "people", ",", "resentment", "is", "never", "far", "away", "."], ["UNIDENTIFIED", "MALE", "We", "hate", "U.S.", "."], ["We", "hate", "British", ",", "England", "."], ["We", "hate", "any", "state", "in", "war", "here", "."], ["GEISSLER", "What", "do", "you", "think", "about", "Saddam", "'s", "regime", "?"], ["UNIDENTIFIED", "MALE", "Saddam", "'s", "very", "good", "man", "."], ["GEISSLER", "As", "the", "supplies", "ran", "out", "the", "mood", "swung", "from", "frantic", "to", "ugly", "."], ["Delivery", "drivers", "were", "threatened", "."], ["One", "of", "the", "buses", "in", "our", "convoy", "was", "held", "up", "at", "knifepoint", "."], ["The", "troops", "have", "moved", "into", "Safwan", "."], ["We", ",", "as", "a", "consequence", ",", "have", "had", "to", "move", "out", "."], ["It", "'s", "simply", "too", "dangerous", "."], ["This", "is", "a", "clear", "indication", "that", "despite", "the", "coalition", "reassurances", "that", "this", "part", "of", "Iraq", "is", "safe", "and", "despite", "the", "aid", "being", "brought", "into", "the", "people", "here", ",", "it", "is", "still", "a", "very", ",", "very", "volatile", "area", "."], ["Tonight", "here", "the", "strong", "are", 
"eating", "."], ["The", "weak", "still", "go", "hungry", "."], ["Martin", "Geissler", ",", "ITV", "News", ",", "Safwan", "southern", "Iraq", "."], ["-LRB-", "END", "VIDEOTAPE", "-RRB-", "WOODRUFF", "So", "many", "different", "pictures", "we", "are", "getting", "from", "across", "that", "country", "."], ["Well", ",", "they", "are", "making", "their", "voices", "heard", "at", "home", "and", "abroad", "."], ["When", "we", "return", ",", "Americans", "this", "weekend", "are", "marching", "against", "the", "war", "and", "in", "support", "of", "the", "troops", "."], ["We", "'ll", "check", "out", "some", "of", "the", "demonstrations", "."], ["Here", "in", "the", "United", "States", "this", "weekend", "just", "as", "they", "did", "last", "weekend", ",", "anti-war", "protesters", "taking", "to", "the", "street", "."], ["In", "Los", "Angeles", ",", "the", "International", "Black", "Coalition", "for", "Peace", "and", "Justice", "is", "sponsoring", "a", "rally", "for", "peace", "today", "."], ["Congresswoman", "Maxine", "Waters", "was", "among", "those", "scheduled", "to", "attend", "."], ["In", "the", "meantime", "further", "north", "in", "California", ",", "San", "Francisco", "is", "the", "setting", "today", "for", "a", "rally", "aimed", "at", "boosting", "the", "moral", "of", "the", "American", "troops", "in", "the", "Persian", "Gulf", "region", "."], ["CNN", "'s", "Rusty", "Dornin", "is", "with", "us", "from", "San", "Francisco", "."], ["Rusty", ",", "what", "sort", "of", "crowd", "is", "showing", "up", "there", "?"], ["RUSTY", "DORNIN", ",", "CNN", "CORRESPONDENT", "Well", "Judy", ",", "this", "is", "the", "largest", "group", "really", "we", "'ve", "seen", "of", "the", "support", "the", "troops", "rallies", "that", "have", "been", "over", "the", "last", "few", "weeks", "."], ["Of", "course", "this", "is", "the", "home", "of", "the", "anti-", "war", "demonstrations", "stemming", "back", "to", "the", "Vietnam", "War", "but", "there", "are", "close", "to", "between", "500", 
"and", "1,000", "people", "."], ["I", "'m", "getting", "various", "estimates", "on", "the", "crowd", "."], ["We", "did", "want", "to", "show", "you", "an", "interesting", "thing", "here", "."], ["Some", "folks", "are", "showing", "some", "solidarity", "with", "the", "speakers", "here", "."], ["You", "'re", "looking", "at", "some", "of", "San", "Francisco", "'s", "police", "department", "'s", "officers", "."], ["I", "did", "ask", "them", "why", "they", "did", "that", "."], ["They", "said", "they", "were", "wearing", "skullcaps", "so", "they", "decided", "to", "adopt", "this", "to", "show", "solidarity", "."], ["They", "said", "the", "department", "has", "not", "made", "any", "statement", "about", "whether", "they", "think", "that", "'s", "all", "right", "or", "not", "."], ["This", "has", "been", "a", "very", "peaceful", "demonstration", "."], ["As", "I", "said", ",", "the", "officers", "did", "tell", "me", "that", "this", "is", "the", "largest", "pro-troops", "demonstration", "that", "has", "ever", "been", "in", "San", "Francisco", "since", "the", "Vietnam", "War", "."], ["So", "far", ",", "very", "peaceful", "."], ["As", "I", "said", ",", "there", "have", "been", "a", "few", "verbal", "exchanges", "but", "one", "of", "the", "most", "enthusiastically", "received", "speakers", "here", "was", "Bessam", "Al-Husaini", ",", "an", "Iraqi", "American", "."], ["He", "'s", "with", "the", "Iraqi", "American", "Council", "."], ["How", "do", "you", "feel", "about", "supporting", "the", "troops", "?"], ["How", "does", "the", "Iraqi", "American", "Council", "feel", "?"], ["BESSAM", "AL-HUSAINI", ",", "IRAQI", "AMERICAN", "COUNCIL", "Well", "the", "Iraqi", "American", "have", "been", "waiting", "for", "this", "liberation", "and", "they", "want", "to", "get", "rid", "of", "Saddam", "so", "bad", "and", "they", "will", "have", "to", "take", "it", "the", "way", "it", "'s", "been", "offered", "to", "us", "."], ["DORNIN", "How", "do", "you", "feel", "though", "seeing", "pictures", "of", 
"civilians", "injured", "in", "the", "bombings", "?"], ["AL-HUSAINI", "We", "get", "reports", "that", "these", "civilians", "especially", "in", "the", "-LRB-", "INAUDIBLE", "-RRB-", "population", "have", "been", "attacked", "by", "the", "Iraqi", "regime", "themselves", "so", "they", "ca", "n't", "blame", "the", "American", "and", "said", "look", "what", "the", "American", "doing", "to", "us", "."], ["He", "would", "love", "to", "see", "this", "and", "would", "draw", "on", "this", "emotion", "from", "the", "Arab", "world", "and", "the", "European", "country", "."], ["DORNIN", "Now", "you", "said", "you", "did", "talk", "to", "your", "family", "in", "Baghdad", "as", "well", "and", "they", "also", "are", "somewhat", "suspicious", "of", "the", "Americans", "."], ["AL-HUSAINI", "Well", ",", "I", "mean", "the", "Iraqi", "people", "have", "been", "the", "primary", "victim", "from", "the", "whole", "thing", "."], ["Yeah", "my", "family", ",", "you", "know", ",", "still", "in", "fear", "."], ["You", "know", "they", "stay", "in", "one", "room", "."], ["They", "boarded", "all", "the", "-", "all", "the", "windows", "and", "they", "built", "with", "a", "brick", "one", "of", "the", "windows", "facing", "the", "street", "."], ["I", "mean", "yes", ",", "you", "know", ",", "my", "prayer", "to", "them", "but", "we", "can", "only", "do", "so", "much", "."], ["Hopefully", "it", "will", "be", "a", "short", "war", "."], ["It", "will", "be", "attack", "the", "Iraqi", "regime", "."], ["We", "can", "live", "in", "liberty", "and", "freedom", "soon", "."], ["DORNIN", "Now", "I", "understand", "some", "of", "the", "people", "here", "have", "been", "threatened", "by", "anti-war", "protesters", "."], ["Have", "you", "had", "anyone", "either", "threatening", "you", "or", "...", "AL-HUSAINI", "No", ",", "no", "problem", "."], ["This", "has", "been", "a", "very", "peaceful", "demonstration", "."], ["DORNIN", "OK", "."], ["Thank", "you", "very", "much", "."], ["AL-HUSAINI", "Thank", "you", "."], 
["DORNIN", "Bessam", "Al-Husaini", "here", "and", "it", "has", "been", "a", "very", "peaceful", "demonstration", "."], ["It", "is", "wrapping", "up", "this", "afternoon", "and", "it", "looks", "like", "the", "civil", "disobedience", "acts", "by", "the", "anti-protesters", "are", "scheduled", "to", "get", "underway", "once", "again", "next", "week", "."], ["Judy", "..."]], "clusters": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "relations": [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []], "doc_key": "CNN_IP_20030329.1600.01-3", "dataset": "ace05_event"} 2 | -------------------------------------------------------------------------------- /dygie/tests/fixtures/collate/scierc/dev.json: -------------------------------------------------------------------------------- 1 | {"clusters": [[[6, 17], [32, 32]], [[4, 4], [55, 55], [91, 91]], [[58, 62], [64, 64], [79, 79]]], "sentences": [["This", "paper", "presents", "an", "algorithm", "for", "computing", "optical", "flow", ",", "shape", ",", "motion", ",", "lighting", ",", "and", "albedo", "from", "an", "image", "sequence", "of", "a", "rigidly-moving", "Lambertian", "object", "under", "distant", "illumination", "."], ["The", "problem", "is", "formulated", "in", "a", "manner", "that", "subsumes", "structure", "from", "motion", ",", "multi-view", "stereo", ",", "and", "photo-metric", "stereo", "as", "special", "cases", "."], ["The", "algorithm", "utilizes", "both", "spatial", "and", "temporal", "intensity", 
"variation", "as", "cues", ":", "the", "former", "constrains", "flow", "and", "the", "latter", "constrains", "surface", "orientation", ";", "combining", "both", "cues", "enables", "dense", "reconstruction", "of", "both", "textured", "and", "texture-less", "surfaces", "."], ["The", "algorithm", "works", "by", "iteratively", "estimating", "affine", "camera", "parameters", ",", "illumination", ",", "shape", ",", "and", "albedo", "in", "an", "alternating", "fashion", "."], ["Results", "are", "demonstrated", "on", "videos", "of", "hand-held", "objects", "moving", "in", "front", "of", "a", "fixed", "light", "and", "camera", "."]], "ner": [[[4, 4, "Generic"], [6, 17, "Task"], [20, 21, "Material"], [24, 26, "Material"], [28, 29, "OtherScientificTerm"]], [[32, 32, "Generic"], [42, 42, "Material"], [44, 45, "Material"], [48, 49, "Material"]], [[55, 55, "Generic"], [58, 62, "OtherScientificTerm"], [64, 64, "Generic"], [67, 67, "Generic"], [69, 69, "OtherScientificTerm"], [72, 72, "Generic"], [74, 75, "OtherScientificTerm"], [79, 79, "Generic"], [81, 88, "Task"]], [[91, 91, "Generic"], [95, 105, "Method"]], [[115, 118, "Material"]]], "relations": [[[4, 4, 6, 17, "USED-FOR"], [20, 21, 4, 4, "USED-FOR"], [24, 26, 20, 21, "FEATURE-OF"], [28, 29, 24, 26, "FEATURE-OF"]], [[42, 42, 44, 45, "CONJUNCTION"], [44, 45, 48, 49, "CONJUNCTION"]], [[58, 62, 55, 55, "USED-FOR"], [67, 67, 64, 64, "HYPONYM-OF"], [67, 67, 69, 69, "USED-FOR"], [67, 67, 72, 72, "CONJUNCTION"], [72, 72, 64, 64, "HYPONYM-OF"], [72, 72, 74, 75, "USED-FOR"], [79, 79, 81, 88, "USED-FOR"]], [[95, 105, 91, 91, "USED-FOR"]], []], "doc_key": "ICCV_2003_158_abs"} 2 | {"clusters": [[[90, 91], [107, 107]]], "sentences": [["Past", "work", "of", "generating", "referring", "expressions", "mainly", "utilized", "attributes", "of", "objects", "and", "binary", "relations", "between", "objects", "."], ["However", ",", "such", "an", "approach", "does", "not", "work", "well", "when", "there", "is", "no", "distinctive", "attribute", 
"among", "objects", "."], ["To", "overcome", "this", "limitation", ",", "this", "paper", "proposes", "a", "method", "utilizing", "the", "perceptual", "groups", "of", "objects", "and", "n-ary", "relations", "among", "them", "."], ["The", "key", "is", "to", "identify", "groups", "of", "objects", "that", "are", "naturally", "recognized", "by", "humans", "."], ["We", "conducted", "psychological", "experiments", "with", "42", "subjects", "to", "collect", "referring", "expressions", "in", "such", "situations", ",", "and", "built", "a", "generation", "algorithm", "based", "on", "the", "results", "."], ["The", "evaluation", "using", "another", "23", "subjects", "showed", "that", "the", "proposed", "method", "could", "effectively", "generate", "proper", "referring", "expressions", "."]], "ner": [[[4, 5, "OtherScientificTerm"], [12, 13, "OtherScientificTerm"]], [], [[52, 53, "OtherScientificTerm"]], [], [[81, 82, "OtherScientificTerm"], [90, 91, "Method"]], [[107, 107, "Generic"], [112, 113, "OtherScientificTerm"]]], "relations": [[], [], [], [], [], []], "doc_key": "C04-1096"} 3 | {"clusters": [[[32, 32], [44, 44]], [[1, 2], [11, 11]], [[103, 104], [125, 126]], [[95, 96], [108, 109], [121, 122]]], "sentences": [["An", "entity-oriented", "approach", "to", "restricted-domain", "parsing", "is", "proposed", "."], ["In", "this", "approach", ",", "the", "definitions", "of", "the", "structure", "and", "surface", "representation", "of", "domain", "entities", "are", "grouped", "together", "."], ["Like", "semantic", "grammar", ",", "this", "allows", "easy", "exploitation", "of", "limited", "domain", "semantics", "."], ["In", "addition", ",", "it", "facilitates", "fragmentary", "recognition", "and", "the", "use", "of", "multiple", "parsing", "strategies", ",", "and", "so", "is", "particularly", "useful", "for", "robust", "recognition", "of", "extra-grammatical", "input", "."], ["Several", "advantages", "from", "the", "point", "of", "view", "of", "language", "definition", "are", 
"also", "noted", "."], ["Representative", "samples", "from", "an", "entity-oriented", "language", "definition", "are", "presented", ",", "along", "with", "a", "control", "structure", "for", "an", "entity-oriented", "parser", ",", "some", "parsing", "strategies", "that", "use", "the", "control", "structure", ",", "and", "worked", "examples", "of", "parses", "."], ["A", "parser", "incorporating", "the", "control", "structure", "and", "the", "parsing", "strategies", "is", "currently", "under", "implementation", "."]], "ner": [[[1, 2, "Method"], [4, 5, "Task"]], [[11, 11, "Generic"], [17, 23, "OtherScientificTerm"]], [[29, 30, "Method"], [32, 32, "Generic"], [37, 39, "OtherScientificTerm"]], [[44, 44, "Generic"], [46, 47, "Task"], [52, 54, "Method"], [63, 66, "OtherScientificTerm"]], [], [[86, 88, "OtherScientificTerm"], [95, 96, "OtherScientificTerm"], [99, 100, "Method"], [103, 104, "Method"], [108, 109, "OtherScientificTerm"]], [[118, 118, "Method"], [121, 122, "OtherScientificTerm"], [125, 126, "Method"]]], "relations": [[[1, 2, 4, 5, "USED-FOR"]], [], [[32, 32, 37, 39, "USED-FOR"]], [[44, 44, 46, 47, "USED-FOR"], [44, 44, 52, 54, "USED-FOR"], [52, 54, 63, 66, "USED-FOR"]], [], [[95, 96, 99, 100, "USED-FOR"], [108, 109, 103, 104, "USED-FOR"]], [[121, 122, 118, 118, "PART-OF"]]], "doc_key": "P84-1047"} 4 | {"clusters": [[[6, 11], [21, 21], [53, 53]], [[15, 16], [69, 69], [94, 94]], [[4, 11], [82, 83]]], "sentences": [["This", "paper", "summarizes", "the", "formalism", "of", "Category", "Cooccurrence", "Restrictions", "-LRB-", "CCRs", "-RRB-", "and", "describes", "two", "parsing", "algorithms", "that", "interpret", "it", "."], ["CCRs", "are", "Boolean", "conditions", "on", "the", "cooccurrence", "of", "categories", "in", "local", "trees", "which", "allow", "the", "statement", "of", "generalizations", "which", "can", "not", "be", "captured", "in", "other", "current", "syntax", "formalisms", "."], ["The", "use", "of", "CCRs", "leads", "to", "syntactic", "descriptions", 
"formulated", "entirely", "with", "restrictive", "statements", "."], ["The", "paper", "shows", "how", "conventional", "algorithms", "for", "the", "analysis", "of", "context", "free", "languages", "can", "be", "adapted", "to", "the", "CCR", "formalism", "."], ["Special", "attention", "is", "given", "to", "the", "part", "of", "the", "parser", "that", "checks", "the", "fulfillment", "of", "logical", "well-formedness", "conditions", "on", "trees", "."]], "ner": [[[4, 11, "Task"], [6, 11, "OtherScientificTerm"], [15, 16, "Method"], [19, 19, "Generic"]], [[21, 21, "OtherScientificTerm"], [23, 24, "OtherScientificTerm"], [31, 32, "OtherScientificTerm"], [36, 38, "OtherScientificTerm"], [47, 48, "Method"]], [[53, 53, "OtherScientificTerm"], [56, 57, "OtherScientificTerm"], [61, 62, "OtherScientificTerm"]], [[69, 69, "Generic"], [74, 76, "Material"], [82, 83, "Task"]], [[94, 94, "Method"], [100, 102, "OtherScientificTerm"], [104, 104, "OtherScientificTerm"]]], "relations": [[[15, 16, 19, 19, "USED-FOR"]], [], [[61, 62, 56, 57, "FEATURE-OF"]], [[69, 69, 82, 83, "USED-FOR"], [74, 76, 69, 69, "USED-FOR"]], [[100, 102, 104, 104, "FEATURE-OF"]]], "doc_key": "C88-1066"} 5 | {"clusters": [[[34, 36], [99, 101]], [[3, 5], [27, 27], [48, 48], [93, 93], [106, 106]]], "sentences": [["We", "present", "a", "text", "mining", "method", "for", "finding", "synonymous", "expressions", "based", "on", "the", "distributional", "hypothesis", "in", "a", "set", "of", "coherent", "corpora", "."], ["This", "paper", "proposes", "a", "new", "methodology", "to", "improve", "the", "accuracy", "of", "a", "term", "aggregation", "system", "using", "each", "author", "'s", "text", "as", "a", "coherent", "corpus", "."], ["Our", "approach", "is", "based", "on", "the", "idea", "that", "one", "person", "tends", "to", "use", "one", "expression", "for", "one", "meaning", "."], ["According", "to", "our", "assumption", ",", "most", "of", "the", "words", "with", "similar", "context", "features", "in", "each", 
"author", "'s", "corpus", "tend", "not", "to", "be", "synonymous", "expressions", "."], ["Our", "proposed", "method", "improves", "the", "accuracy", "of", "our", "term", "aggregation", "system", ",", "showing", "that", "our", "approach", "is", "successful", "."]], "ner": [[[3, 5, "Method"], [8, 9, "OtherScientificTerm"], [13, 14, "OtherScientificTerm"]], [[27, 27, "Generic"], [31, 31, "Metric"], [34, 36, "Method"]], [[48, 48, "Generic"]], [[76, 78, "OtherScientificTerm"], [88, 89, "OtherScientificTerm"]], [[93, 93, "Generic"], [96, 96, "Metric"], [99, 101, "Method"], [106, 106, "Generic"]]], "relations": [[[3, 5, 8, 9, "USED-FOR"], [13, 14, 3, 5, "USED-FOR"]], [[31, 31, 34, 36, "EVALUATE-FOR"], [34, 36, 27, 27, "EVALUATE-FOR"]], [], [], [[96, 96, 99, 101, "EVALUATE-FOR"], [99, 101, 93, 93, "EVALUATE-FOR"]]], "doc_key": "C04-1116"} 6 | {"clusters": [[[28, 31], [68, 70], [96, 96], [123, 123]], [[78, 79], [108, 108]], [[42, 44], [48, 48]]], "sentences": [["In", "this", "work", ",", "we", "present", "a", "technique", "for", "robust", "estimation", ",", "which", "by", "explicitly", "incorporating", "the", "inherent", "uncertainty", "of", "the", "estimation", "procedure", ",", "results", "in", "a", "more", "efficient", "robust", "estimation", "algorithm", "."], ["In", "addition", ",", "we", "build", "on", "recent", "work", "in", "randomized", "model", "verification", ",", "and", "use", "this", "to", "characterize", "the", "`", "non-randomness", "'", "of", "a", "solution", "."], ["The", "combination", "of", "these", "two", "strategies", "results", "in", "a", "robust", "estimation", "procedure", "that", "provides", "a", "significant", "speed-up", "over", "existing", "RANSAC", "techniques", ",", "while", "requiring", "no", "prior", "information", "to", "guide", "the", "sampling", "process", "."], ["In", "particular", ",", "our", "algorithm", "requires", ",", "on", "average", ",", "3-10", "times", "fewer", "samples", "than", "standard", "RANSAC", ",", "which", "is", "in", 
"close", "agreement", "with", "theoretical", "predictions", "."], ["The", "efficiency", "of", "the", "algorithm", "is", "demonstrated", "on", "a", "selection", "of", "geometric", "estimation", "problems", "."]], "ner": [[[7, 7, "Generic"], [9, 10, "Task"], [17, 22, "OtherScientificTerm"], [28, 31, "Method"]], [[42, 44, "Task"], [48, 48, "Generic"]], [[64, 64, "Generic"], [68, 70, "Method"], [78, 79, "Method"], [84, 85, "OtherScientificTerm"], [89, 90, "OtherScientificTerm"]], [[96, 96, "Generic"], [108, 108, "Method"], [116, 117, "OtherScientificTerm"]], [[123, 123, "Generic"], [130, 132, "Task"]]], "relations": [[[7, 7, 9, 10, "USED-FOR"], [7, 7, 28, 31, "USED-FOR"], [17, 22, 7, 7, "USED-FOR"]], [], [[64, 64, 68, 70, "USED-FOR"], [78, 79, 68, 70, "COMPARE"]], [[96, 96, 108, 108, "COMPARE"]], [[130, 132, 123, 123, "EVALUATE-FOR"]]], "doc_key": "ICCV_2009_47_abs"} 7 | {"clusters": [[[58, 59], [71, 72], [94, 95]], [[8, 10], [22, 22], [33, 33], [50, 50], [63, 63], [83, 83]], [[40, 41], [67, 68]]], "sentences": [["An", "attempt", "has", "been", "made", "to", "use", "an", "Augmented", "Transition", "Network", "as", "a", "procedural", "dialog", "model", "."], ["The", "development", "of", "such", "a", "model", "appears", "to", "be", "important", "in", "several", "respects", ":", "as", "a", "device", "to", "represent", "and", "to", "use", "different", "dialog", "schemata", "proposed", "in", "empirical", "conversation", "analysis", ";", "as", "a", "device", "to", "represent", "and", "to", "use", "models", "of", "verbal", "interaction", ";", "as", "a", "device", "combining", "knowledge", "about", "dialog", "schemata", "and", "about", "verbal", "interaction", "with", "knowledge", "about", "task-oriented", "and", "goal-directed", "dialogs", "."], ["A", "standard", "ATN", "should", "be", "further", "developed", "in", "order", "to", "account", "for", "the", "verbal", "interactions", "of", "task-oriented", "dialogs", "."]], "ner": [[[8, 10, "Method"], [14, 15, "Method"]], [[22, 
22, "Generic"], [33, 33, "Generic"], [40, 41, "OtherScientificTerm"], [45, 46, "Method"], [50, 50, "Generic"], [56, 56, "Generic"], [58, 59, "OtherScientificTerm"], [63, 63, "Generic"], [67, 68, "OtherScientificTerm"], [71, 72, "OtherScientificTerm"], [76, 79, "Material"]], [[83, 83, "Method"], [94, 95, "OtherScientificTerm"], [97, 98, "Material"]]], "relations": [[[8, 10, 14, 15, "HYPONYM-OF"]], [[40, 41, 33, 33, "USED-FOR"], [40, 41, 45, 46, "USED-FOR"], [56, 56, 50, 50, "USED-FOR"], [56, 56, 58, 59, "USED-FOR"], [67, 68, 71, 72, "CONJUNCTION"]], [[83, 83, 94, 95, "USED-FOR"], [94, 95, 97, 98, "FEATURE-OF"]]], "doc_key": "C80-1073"} 8 | {"clusters": [[[4, 6], [25, 25], [65, 65], [70, 70], [88, 88]], [[20, 22], [91, 92]], [[15, 17], [77, 77]]], "sentences": [["We", "present", "a", "practically", "unsupervised", "learning", "method", "to", "produce", "single-snippet", "answers", "to", "definition", "questions", "in", "question", "answering", "systems", "that", "supplement", "Web", "search", "engines", "."], ["The", "method", "exploits", "on-line", "encyclopedias", "and", "dictionaries", "to", "generate", "automatically", "an", "arbitrarily", "large", "number", "of", "positive", "and", "negative", "definition", "examples", ",", "which", "are", "then", "used", "to", "train", "an", "svm", "to", "separate", "the", "two", "classes", "."], ["We", "show", "experimentally", "that", "the", "proposed", "method", "is", "viable", ",", "that", "it", "outperforms", "the", "alternative", "of", "training", "the", "system", "on", "questions", "and", "news", "articles", "from", "trec", ",", "and", "that", "it", "helps", "the", "search", "engine", "handle", "definition", "questions", "significantly", "better", "."]], "ner": [[[4, 6, "Method"], [9, 10, "OtherScientificTerm"], [15, 17, "Method"], [20, 22, "Method"]], [[25, 25, "Generic"], [27, 30, "Material"], [39, 43, "Material"], [52, 52, "Method"]], [[65, 65, "Generic"], [70, 70, "Generic"], [73, 73, "Generic"], [77, 77, "Generic"], 
[81, 82, "Material"], [84, 84, "Material"], [88, 88, "Generic"], [91, 92, "Method"]]], "relations": [[[4, 6, 9, 10, "USED-FOR"], [15, 17, 20, 22, "USED-FOR"]], [[25, 25, 27, 30, "USED-FOR"], [27, 30, 39, 43, "USED-FOR"], [39, 43, 52, 52, "USED-FOR"]], [[70, 70, 73, 73, "COMPARE"], [81, 82, 77, 77, "USED-FOR"], [81, 82, 84, 84, "PART-OF"], [88, 88, 91, 92, "USED-FOR"]]], "doc_key": "H05-1041"} 9 | {"clusters": [], "sentences": [["We", "revisit", "the", "classical", "decision-theoretic", "problem", "of", "weighted", "expert", "voting", "from", "a", "statistical", "learning", "perspective", "."], ["In", "particular", ",", "we", "examine", "the", "consistency", "-LRB-", "both", "asymptotic", "and", "finitary", "-RRB-", "of", "the", "optimal", "Nitzan-Paroush", "weighted", "majority", "and", "related", "rules", "."], ["In", "the", "case", "of", "known", "expert", "competence", "levels", ",", "we", "give", "sharp", "error", "estimates", "for", "the", "optimal", "rule", "."], ["When", "the", "competence", "levels", "are", "unknown", ",", "they", "must", "be", "empirically", "estimated", "."], ["We", "provide", "frequentist", "and", "Bayesian", "analyses", "for", "this", "situation", "."], ["Some", "of", "our", "proof", "techniques", "are", "non-standard", "and", "may", "be", "of", "independent", "interest", "."], ["The", "bounds", "we", "derive", "are", "nearly", "optimal", ",", "and", "several", "challenging", "open", "problems", "are", "posed", "."], ["Experimental", "results", "are", "provided", "to", "illustrate", "the", "theory", "."]], "ner": [[[3, 9, "Task"], [12, 14, "Method"]], [[32, 34, "OtherScientificTerm"]], [[44, 46, "OtherScientificTerm"], [50, 52, "Method"], [55, 56, "OtherScientificTerm"]], [[60, 61, "OtherScientificTerm"]], [[75, 76, "Method"]], [], [], []], "relations": [[[12, 14, 3, 9, "USED-FOR"]], [], [[50, 52, 55, 56, "USED-FOR"]], [], [], [], [], []], "doc_key": "NIPS_2014_18_abs"} 10 | {"clusters": [[[51, 57], [74, 74]], [[7, 8], [70, 71]], [[3, 
8], [136, 138]]], "sentences": [["We", "analyze", "a", "reweighted", "version", "of", "the", "Kikuchi", "approximation", "for", "estimating", "the", "log", "partition", "function", "of", "a", "product", "distribution", "defined", "over", "a", "region", "graph", "."], ["We", "establish", "sufficient", "conditions", "for", "the", "concavity", "of", "our", "reweighted", "objective", "function", "in", "terms", "of", "weight", "assignments", "in", "the", "Kikuchi", "expansion", ",", "and", "show", "that", "a", "reweighted", "version", "of", "the", "sum", "product", "algorithm", "applied", "to", "the", "Kikuchi", "region", "graph", "will", "produce", "global", "optima", "of", "the", "Kikuchi", "approximation", "whenever", "the", "algorithm", "converges", "."], ["When", "the", "region", "graph", "has", "two", "layers", ",", "corresponding", "to", "a", "Bethe", "approximation", ",", "we", "show", "that", "our", "sufficient", "conditions", "for", "concavity", "are", "also", "necessary", "."], ["Finally", ",", "we", "provide", "an", "explicit", "characterization", "of", "the", "polytope", "of", "concavity", "in", "terms", "of", "the", "cycle", "structure", "of", "the", "region", "graph", "."], ["We", "conclude", "with", "simulations", "that", "demonstrate", "the", "advantages", "of", "the", "reweighted", "Kikuchi", "approach", "."]], "ner": [[[3, 8, "Method"], [7, 8, "Method"], [12, 18, "Task"], [22, 23, "OtherScientificTerm"]], [[31, 31, "OtherScientificTerm"], [34, 36, "OtherScientificTerm"], [40, 41, "OtherScientificTerm"], [44, 45, "OtherScientificTerm"], [51, 57, "Method"], [61, 63, "OtherScientificTerm"], [66, 67, "OtherScientificTerm"], [70, 71, "Method"], [74, 74, "Generic"]], [[79, 80, "OtherScientificTerm"], [88, 89, "Method"], [98, 98, "OtherScientificTerm"]], [[114, 114, "OtherScientificTerm"], [119, 120, "OtherScientificTerm"], [123, 124, "OtherScientificTerm"]], [[136, 138, "Method"]]], "relations": [[[3, 8, 12, 18, "USED-FOR"], [12, 18, 22, 23, "FEATURE-OF"]], 
[[31, 31, 34, 36, "FEATURE-OF"], [51, 57, 61, 63, "USED-FOR"], [66, 67, 70, 71, "FEATURE-OF"]], [], [[119, 120, 123, 124, "FEATURE-OF"]], []], "doc_key": "NIPS_2014_10_abs"} 11 | -------------------------------------------------------------------------------- /dygie/tests/fixtures/dygie_test.jsonnet: -------------------------------------------------------------------------------- 1 | // Quick test that doesn't load in any data. 2 | 3 | // Primary prediction target. Watch metrics associated with this target. 4 | local target = "rel"; 5 | 6 | // Specifies the token-level features that will be created. 7 | local use_glove = true; 8 | local use_char = true; 9 | local use_elmo = false; 10 | local use_attentive_span_extractor = true; 11 | 12 | // Specifies the model parameters. 13 | local lstm_hidden_size = 200; 14 | local lstm_n_layers = 1; 15 | local feature_size = 10; 16 | local feedforward_layers = 2; 17 | local char_n_filters = 50; 18 | local feedforward_dim = 150; 19 | local max_span_width = 8; 20 | local feedforward_dropout = 0.2; 21 | local lexical_dropout = 0.5; 22 | local lstm_dropout = 0.4; 23 | local loss_weights = { 24 | "ner": 1.0, 25 | "relation": 1.0, 26 | "coref": 1.0 27 | }; 28 | 29 | // Coref settings. 30 | local coref_spans_per_word = 0.4; 31 | local coref_max_antecedents = 100; 32 | 33 | // Relation settings. 34 | local relation_spans_per_word = 0.4; 35 | local relation_positive_label_weight = 1.0; 36 | 37 | // Model training 38 | local num_epochs = 250; 39 | local patience = 25; 40 | local learning_rate_scheduler = { 41 | "type": "reduce_on_plateau", 42 | "factor": 0.5, 43 | "mode": "max", 44 | "patience": 5 45 | }; 46 | local learning_rate = 0.001; 47 | 48 | 49 | //////////////////////////////////////////////////////////////////////////////// 50 | 51 | // Nothing below this line needs to change. 52 | 53 | 54 | // Storing constants. 
55 | 56 | local validation_metrics = { 57 | "ner": "+ner_f1", 58 | "rel": "+rel_f1", 59 | "coref": "+coref_f1" 60 | }; 61 | 62 | local display_metrics = { 63 | "ner": ["ner_precision", "ner_recall", "ner_f1"], 64 | "rel": ["rel_precision", "rel_recall", "rel_f1", "rel_span_recall"], 65 | "coref": ["coref_precision", "coref_recall", "coref_f1", "coref_mention_recall"] 66 | }; 67 | 68 | local glove_dim = 300; 69 | local elmo_dim = 1024; 70 | 71 | local module_initializer = [ 72 | [".*linear_layers.*weight", {"type": "xavier_normal"}], 73 | [".*scorer._module.weight", {"type": "xavier_normal"}], 74 | ["_distance_embedding.weight", {"type": "xavier_normal"}]]; 75 | 76 | local dygie_initializer = [ 77 | ["_span_width_embedding.weight", {"type": "xavier_normal"}], 78 | ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}], 79 | ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}] 80 | ]; 81 | 82 | 83 | //////////////////////////////////////////////////////////////////////////////// 84 | 85 | // Calculating dimensions. 
86 | 87 | local token_embedding_dim = ((if use_glove then glove_dim else 0) + 88 | (if use_char then char_n_filters else 0) + 89 | (if use_elmo then elmo_dim else 0)); 90 | local endpoint_span_emb_dim = 4 * lstm_hidden_size + feature_size; 91 | local attended_span_emb_dim = if use_attentive_span_extractor then token_embedding_dim else 0; 92 | local span_emb_dim = endpoint_span_emb_dim + attended_span_emb_dim; 93 | local pair_emb_dim = 3 * span_emb_dim; 94 | local relation_scorer_dim = pair_emb_dim; 95 | local coref_scorer_dim = pair_emb_dim + feature_size; 96 | 97 | //////////////////////////////////////////////////////////////////////////////// 98 | 99 | // Function definitions 100 | 101 | local make_feedforward(input_dim) = { 102 | "input_dim": input_dim, 103 | "num_layers": feedforward_layers, 104 | "hidden_dims": feedforward_dim, 105 | "activations": "relu", 106 | "dropout": feedforward_dropout 107 | }; 108 | 109 | // Model components 110 | 111 | local token_indexers = { 112 | [if use_glove then "tokens"]: { 113 | "type": "single_id", 114 | "lowercase_tokens": false 115 | }, 116 | [if use_char then "token_characters"]: { 117 | "type": "characters", 118 | "min_padding_length": 5 119 | }, 120 | [if use_elmo then "elmo"]: { 121 | "type": "elmo_characters" 122 | } 123 | }; 124 | 125 | local text_field_embedder = { 126 | "token_embedders": { 127 | [if use_glove then "tokens"]: { 128 | "type": "embedding", 129 | // "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.300d.txt.gz", 130 | "embedding_dim": 300, 131 | "trainable": false 132 | }, 133 | [if use_char then "token_characters"]: { 134 | "type": "character_encoding", 135 | "embedding": { 136 | "num_embeddings": 262, 137 | "embedding_dim": 16 138 | }, 139 | "encoder": { 140 | "type": "cnn", 141 | "embedding_dim": 16, 142 | "num_filters": char_n_filters, 143 | "ngram_filter_sizes": [5] 144 | } 145 | }, 146 | [if use_elmo then "elmo"]: { 147 | "type": "elmo_token_embedder", 148 | 
"options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json", 149 | "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5", 150 | "do_layer_norm": false, 151 | "dropout": 0.5 152 | } 153 | } 154 | }; 155 | 156 | 157 | //////////////////////////////////////////////////////////////////////////////// 158 | 159 | // The model 160 | 161 | { 162 | "dataset_reader": { 163 | "type": "ie_json", 164 | "token_indexers": token_indexers, 165 | "max_span_width": max_span_width 166 | }, 167 | "train_data_path": "tests/fixtures/scierc_article.json", 168 | "validation_data_path": "tests/fixtures/scierc_article.json", 169 | "model": { 170 | "type": "dygie", 171 | "text_field_embedder": text_field_embedder, 172 | "initializer": dygie_initializer, 173 | "loss_weights": loss_weights, 174 | "lexical_dropout": lexical_dropout, 175 | "feature_size": feature_size, 176 | "use_attentive_span_extractor": use_attentive_span_extractor, 177 | "max_span_width": max_span_width, 178 | "display_metrics": display_metrics[target], 179 | "context_layer": { 180 | "type": "lstm", 181 | "bidirectional": true, 182 | "input_size": token_embedding_dim, 183 | "hidden_size": lstm_hidden_size, 184 | "num_layers": lstm_n_layers, 185 | "dropout": lstm_dropout 186 | }, 187 | "modules": { 188 | "coref": { 189 | "spans_per_word": coref_spans_per_word, 190 | "max_antecedents": coref_max_antecedents, 191 | "mention_feedforward": make_feedforward(span_emb_dim), 192 | "antecedent_feedforward": make_feedforward(coref_scorer_dim), 193 | "initializer": module_initializer 194 | }, 195 | "ner": { 196 | "mention_feedforward": make_feedforward(span_emb_dim), 197 | "initializer": module_initializer 198 | }, 199 | "relation": { 200 | "spans_per_word": relation_spans_per_word, 201 | "positive_label_weight": relation_positive_label_weight, 202 | 
"mention_feedforward": make_feedforward(span_emb_dim), 203 | "relation_feedforward": make_feedforward(relation_scorer_dim), 204 | "initializer": module_initializer, 205 | }, 206 | }, 207 | }, 208 | "iterator": { 209 | "type": "ie_batch", 210 | "batch_size": 10 211 | }, 212 | "validation_iterator": { 213 | "type": "ie_document", 214 | }, 215 | "trainer": { 216 | "num_epochs": num_epochs, 217 | "grad_norm": 5.0, 218 | "patience" : patience, 219 | "cuda_device" : -1, 220 | "validation_metric": validation_metrics[target], 221 | "learning_rate_scheduler": learning_rate_scheduler, 222 | "optimizer": { 223 | "type": "adam", 224 | "lr": learning_rate, 225 | }, 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /dygie/tests/fixtures/dygie_test_full.jsonnet: -------------------------------------------------------------------------------- 1 | // Full end-to-end test, with all components turned on. 2 | 3 | // Primary prediction target. Watch metrics associated with this target. 4 | local target = "rel"; 5 | 6 | // Specifies the token-level features that will be created. 7 | local use_glove = true; 8 | local use_char = true; 9 | local use_elmo = true; 10 | local use_attentive_span_extractor = true; 11 | 12 | // Specifies the model parameters. 13 | local lstm_hidden_size = 200; 14 | local lstm_n_layers = 1; 15 | local feature_size = 10; 16 | local feedforward_layers = 2; 17 | local char_n_filters = 50; 18 | local feedforward_dim = 150; 19 | local max_span_width = 8; 20 | local feedforward_dropout = 0.2; 21 | local lexical_dropout = 0.5; 22 | local lstm_dropout = 0.4; 23 | local loss_weights = { 24 | "ner": 1.0, 25 | "relation": 1.0, 26 | "coref": 1.0 27 | }; 28 | 29 | // Coref settings. 30 | local coref_spans_per_word = 0.4; 31 | local coref_max_antecedents = 100; 32 | 33 | // Relation settings. 
34 | local relation_spans_per_word = 0.4; 35 | local relation_positive_label_weight = 1.0; 36 | 37 | // Model training 38 | local num_epochs = 250; 39 | local patience = 25; 40 | local learning_rate_scheduler = { 41 | "type": "reduce_on_plateau", 42 | "factor": 0.5, 43 | "mode": "max", 44 | "patience": 5 45 | }; 46 | local learning_rate = 0.001; 47 | 48 | 49 | //////////////////////////////////////////////////////////////////////////////// 50 | 51 | // Nothing below this line needs to change. 52 | 53 | 54 | // Storing constants. 55 | 56 | local validation_metrics = { 57 | "ner": "+ner_f1", 58 | "rel": "+rel_f1", 59 | "coref": "+coref_f1" 60 | }; 61 | 62 | local display_metrics = { 63 | "ner": ["ner_precision", "ner_recall", "ner_f1"], 64 | "rel": ["rel_precision", "rel_recall", "rel_f1", "rel_span_recall"], 65 | "coref": ["coref_precision", "coref_recall", "coref_f1", "coref_mention_recall"] 66 | }; 67 | 68 | local glove_dim = 300; 69 | local elmo_dim = 1024; 70 | 71 | local module_initializer = [ 72 | [".*linear_layers.*weight", {"type": "xavier_normal"}], 73 | [".*scorer._module.weight", {"type": "xavier_normal"}], 74 | ["_distance_embedding.weight", {"type": "xavier_normal"}]]; 75 | 76 | local dygie_initializer = [ 77 | ["_span_width_embedding.weight", {"type": "xavier_normal"}], 78 | ["_context_layer._module.weight_ih.*", {"type": "xavier_normal"}], 79 | ["_context_layer._module.weight_hh.*", {"type": "orthogonal"}] 80 | ]; 81 | 82 | 83 | //////////////////////////////////////////////////////////////////////////////// 84 | 85 | // Calculating dimensions. 
86 | 87 | local token_embedding_dim = ((if use_glove then glove_dim else 0) + 88 | (if use_char then char_n_filters else 0) + 89 | (if use_elmo then elmo_dim else 0)); 90 | local endpoint_span_emb_dim = 4 * lstm_hidden_size + feature_size; 91 | local attended_span_emb_dim = if use_attentive_span_extractor then token_embedding_dim else 0; 92 | local span_emb_dim = endpoint_span_emb_dim + attended_span_emb_dim; 93 | local pair_emb_dim = 3 * span_emb_dim; 94 | local relation_scorer_dim = pair_emb_dim; 95 | local coref_scorer_dim = pair_emb_dim + feature_size; 96 | 97 | //////////////////////////////////////////////////////////////////////////////// 98 | 99 | // Function definitions 100 | 101 | local make_feedforward(input_dim) = { 102 | "input_dim": input_dim, 103 | "num_layers": feedforward_layers, 104 | "hidden_dims": feedforward_dim, 105 | "activations": "relu", 106 | "dropout": feedforward_dropout 107 | }; 108 | 109 | // Model components 110 | 111 | local token_indexers = { 112 | [if use_glove then "tokens"]: { 113 | "type": "single_id", 114 | "lowercase_tokens": false 115 | }, 116 | [if use_char then "token_characters"]: { 117 | "type": "characters", 118 | "min_padding_length": 5 119 | }, 120 | [if use_elmo then "elmo"]: { 121 | "type": "elmo_characters" 122 | } 123 | }; 124 | 125 | local text_field_embedder = { 126 | "token_embedders": { 127 | [if use_glove then "tokens"]: { 128 | "type": "embedding", 129 | "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.300d.txt.gz", 130 | "embedding_dim": 300, 131 | "trainable": false 132 | }, 133 | [if use_char then "token_characters"]: { 134 | "type": "character_encoding", 135 | "embedding": { 136 | "num_embeddings": 262, 137 | "embedding_dim": 16 138 | }, 139 | "encoder": { 140 | "type": "cnn", 141 | "embedding_dim": 16, 142 | "num_filters": char_n_filters, 143 | "ngram_filter_sizes": [5] 144 | } 145 | }, 146 | [if use_elmo then "elmo"]: { 147 | "type": "elmo_token_embedder", 148 | 
"options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json", 149 | "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5", 150 | "do_layer_norm": false, 151 | "dropout": 0.5 152 | } 153 | } 154 | }; 155 | 156 | 157 | //////////////////////////////////////////////////////////////////////////////// 158 | 159 | // The model 160 | 161 | { 162 | "dataset_reader": { 163 | "type": "ie_json", 164 | "token_indexers": token_indexers, 165 | "max_span_width": max_span_width 166 | }, 167 | "train_data_path": "tests/fixtures/scierc_article.json", 168 | "validation_data_path": "tests/fixtures/scierc_article.json", 169 | "model": { 170 | "type": "dygie", 171 | "text_field_embedder": text_field_embedder, 172 | "initializer": dygie_initializer, 173 | "loss_weights": loss_weights, 174 | "lexical_dropout": lexical_dropout, 175 | "feature_size": feature_size, 176 | "use_attentive_span_extractor": use_attentive_span_extractor, 177 | "max_span_width": max_span_width, 178 | "display_metrics": display_metrics[target], 179 | "context_layer": { 180 | "type": "lstm", 181 | "bidirectional": true, 182 | "input_size": token_embedding_dim, 183 | "hidden_size": lstm_hidden_size, 184 | "num_layers": lstm_n_layers, 185 | "dropout": lstm_dropout 186 | }, 187 | "modules": { 188 | "coref": { 189 | "spans_per_word": coref_spans_per_word, 190 | "max_antecedents": coref_max_antecedents, 191 | "mention_feedforward": make_feedforward(span_emb_dim), 192 | "antecedent_feedforward": make_feedforward(coref_scorer_dim), 193 | "initializer": module_initializer 194 | }, 195 | "ner": { 196 | "mention_feedforward": make_feedforward(span_emb_dim), 197 | "initializer": module_initializer 198 | }, 199 | "relation": { 200 | "spans_per_word": relation_spans_per_word, 201 | "positive_label_weight": relation_positive_label_weight, 202 | 
"mention_feedforward": make_feedforward(span_emb_dim), 203 | "relation_feedforward": make_feedforward(relation_scorer_dim), 204 | "initializer": module_initializer, 205 | }, 206 | }, 207 | }, 208 | "iterator": { 209 | "type": "ie_batch", 210 | "batch_size": 10 211 | }, 212 | "validation_iterator": { 213 | "type": "ie_document", 214 | }, 215 | "trainer": { 216 | "num_epochs": num_epochs, 217 | "grad_norm": 5.0, 218 | "patience" : patience, 219 | "cuda_device" : -1, 220 | "validation_metric": validation_metrics[target], 221 | "learning_rate_scheduler": learning_rate_scheduler, 222 | "optimizer": { 223 | "type": "adam", 224 | "lr": learning_rate, 225 | }, 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /dygie/tests/fixtures/multi_dataset/dev.jsonl: -------------------------------------------------------------------------------- 1 | {"doc_key": 58, "dataset": "ace-event", "sentences": [["For", "an", "organization", "that", "is", "incorporated", "in", "Maryland", "and", "has", "its", "headquarters", "in", "Maryland", ",", "the", "laws", "of", "Maryland", "apply", "without", "regard", "to", "the", "states", "in", "which", "the", "directors", "live", ".", "Remember", ",", "a", "corporation", "is", "legally", "a", "\"", "person", "\"", ",", "distinct", "from", "the", "corporeal", "persons", "who", "govern", ",", "manage", ",", "and", "operate", "the", "corporation", ".", "Thus", ",", "the", "laws", "applying", "to", "the", "corporate", "person", "are", "the", "laws", "of", "the", "state", "where", "that", "person", "\"", "lives", "\"", ".", "--"], ["the", "board", "is", "who", "decides", "that", "and", "board", "is", "so", "often", "chock", "full", "of", "the", "ceos", "golfing", "buddies", ",", "they", "serve", "on", "each", "other", "'s", "boards", "and", "problem", "here", "is", "gap", "between", "what", "the", "average", "worker", "makes", "and", "ceo", "is", "making", "is", "increasing", "and", "everyone", 
"agrees", "that", "is", "not", "a", "good", "thing", "for", "this", "country", ",", "plus", "the", "fact", ",", "when", "the", "guys", "get", "the", "big", "packageses", ",", "so", "often", "you", "see", "the", "companies", "going", "down", "the", "zblubs", "thank", "you", "for", "updating", "us", "."], ["in", "a", "strange", "way", "and", "this", "may", "sound", "unusual", ",", "there", "'s", "a", "little", "sense", "of", "relief", "because", "these", "soldiers", "were", "set", "to", "leave", "in", "january", "and", "now", "they", "got", "second", "deployment", "orders", ",", "the", "delay", ",", "of", "course", ",", "the", "inability", "to", "get", "access", "to", "turkey", ",", "but", "along", "the", "way", ",", "the", "4th", "infantry", "division", "is", "one", "seeped", "in", "tradition", "and", "there", "'s", "a", "real", "sense", "of", "pride", "among", "the", "soldiers", ",", "250", "of", "which", "left", "yesterday", ",", "over", "the", "course", "of", "the", "week", "12,000", "will", "be", "leaving", "from", "here", "."], ["Another", "argument", ",", "which", "is", "better", "but", "still", "disturbing", ",", "is", "that", ",", "yes", ",", "this", "is", "an", "ethics", "violation", ",", "and", "maybe", "worse", ",", "but", "it", "would", "only", "hurt", "the", "USCF", "to", "talk", "about", "it", ",", "and", "that", "only", "troublemakers", "like", "Sam", "Sloan", ",", "Larry", "Parr", ",", "and", ",", "I", "suppose", ",", "me", ",", "would", "talk", "about", "it", ",", "since", "doing", "so", "will", "obstruct", "the", "federation", "'s", "plans", ",", "cause", "us", "to", "pay", "legal", "expenses", ",", "run", "the", "risk", "of", "our", "being", "stuck", "without", "any", "office", "space", "at", "all", ",", "cause", "people", "not", "to", "loan", "us", "money", ",", "and", "so", "forth", "."]], "ner": [[[2, 2, "ORG"], [7, 7, "GPE"], [11, 11, "FAC"], [13, 13, "GPE"], [18, 18, "GPE"], [24, 24, "GPE"], [28, 28, "PER"], [34, 34, "ORG"], [39, 39, "ORG"], 
[46, 46, "PER"], [55, 55, "ORG"], [65, 65, "ORG"], [71, 71, "GPE"], [74, 74, "ORG"]], [[81, 81, "PER"], [87, 87, "PER"], [95, 95, "PER"], [97, 97, "PER"], [105, 105, "PER"], [115, 115, "PER"], [118, 118, "PER"], [134, 134, "GPE"], [142, 142, "PER"], [153, 153, "ORG"]], [[183, 183, "PER"], [210, 210, "GPE"], [218, 220, "ORG"], [236, 236, "PER"]], [[288, 288, "ORG"], [297, 297, "PER"], [299, 300, "PER"], [302, 303, "PER"], [323, 323, "ORG"], [343, 343, "FAC"], [344, 344, "FAC"], [349, 349, "PER"]]], "relations": [[[11, 11, 13, 13, "PART-WHOLE.Geographical"]], [[95, 95, 97, 97, "PER-SOC.Lasting-Personal"]], [], []], "events": [[], [], [[[187, "Movement.Transport"], [183, 183, "Artifact"]], [[241, "Movement.Transport"]], [[253, "Movement.Transport"]]], [[[352, "Transaction.Transfer-Money"], [349, 349, "Giver"]]]], "_orig_doc_key": ["soc.org.nonprofit_20050218.1902", "CNN_ENG_20030424_070008.15", "CNN_ENG_20030328_150609.10", "rec.games.chess.politics_20041217.2111"], "_orig_sent_ix": [35, 16, 7, 38]} 2 | {"doc_key": 76, "dataset": "ace05", "sentences": [["been", "relatively", "quiet", "in", "northern", "ireland", "over", "the", "past", "few", "years", "."], ["our", "national", "correspondent", "frank", "buckley", "is", "on", "the", "scene", "for", "us", "."], ["[", "translator", "speaking", "]", "and", "i", "thank", "you", "for", "that", "help", "."], ["the", "pentagon", "says", "the", "convoy", "was", "taliban", "and", "al", "qaeda", "troops", "."], ["He", "was", "sentenced", "to", "four", "months", "in", "prison", ",", "but", "appealed", "."], ["Toefting", "transferred", "to", "Bolton", "in", "February", "2002", "from", "German", "club", "Hamburg", "."], ["Toefting", "joined", "the", "Danish", "squad", "in", "1993", "and", "has", "41", "caps", "."], ["sanctions", "targeting", "Iraq", "civilians", ",", "an", "important", "step", "toward", "the", "U.S", "."], ["He", "also", "envisioned", "U.S", ".", "and", "U.N", ".", "inspectors", "working", "together", "."], ["role", 
"in", "Iraq", "in", "the", "``", "not", "too", "distant", "future", ".", "''"], ["He", "did", "not", "provide", "the", "baby", "'s", "name", "or", "other", "details", "."], ["MCI", "to", "pay", "huge", "fine", "to", "SEC", "for", "accounting", "fraud", ":", "report"], ["Suicide", "bombing", "at", "Israeli", "shopping", "mall", ";", "fifth", "attack", "in", "two", "days"], ["The", "explosion", "killed", "the", "attacker", "and", "four", "shoppers", ",", "police", "said", "."], ["It", "was", "not", "known", "whether", "he", "was", "hurt", "at", "the", "time", "."], ["Just", "as", "long", "as", "it", "'s", "there", ",", "they", "feel", "safe", "."]], "ner": [[[5, 5, "LOC"]], [[15, 16, "PER"], [14, 14, "PER"], [12, 12, "ORG"], [22, 22, "ORG"]], [[31, 31, "GPE"], [25, 25, "PER"], [29, 29, "PER"]], [[46, 46, "PER"], [40, 40, "PER"], [37, 37, "ORG"], [42, 42, "ORG"], [44, 45, "ORG"]], [[48, 48, "PER"], [55, 55, "FAC"]], [[63, 63, "ORG"], [70, 70, "ORG"], [69, 69, "ORG"], [68, 68, "GPE"], [60, 60, "PER"]], [[76, 76, "ORG"], [75, 75, "GPE"], [72, 72, "PER"]], [[86, 86, "GPE"], [94, 94, "GPE"], [87, 87, "PER"]], [[96, 96, "PER"], [102, 102, "ORG"], [99, 99, "GPE"], [104, 104, "PER"]], [[110, 110, "GPE"]], [[120, 120, "PER"], [125, 125, "PER"]], [], [], [[165, 165, "ORG"], [160, 160, "PER"], [163, 163, "PER"]], [[173, 173, "PER"]], [[188, 188, "PER"]]], "relations": [[], [[14, 14, 12, 12, "ORG-AFF"]], [], [[46, 46, 42, 42, "ORG-AFF"], [46, 46, 44, 45, "ORG-AFF"]], [], [[60, 60, 63, 63, "ORG-AFF"], [60, 60, 70, 70, "ORG-AFF"], [69, 69, 68, 68, "GEN-AFF"]], [[72, 72, 76, 76, "ORG-AFF"], [76, 76, 75, 75, "GEN-AFF"]], [[87, 87, 86, 86, "GEN-AFF"]], [[104, 104, 102, 102, "ORG-AFF"], [104, 104, 99, 99, "ORG-AFF"]], [], [], [], [], [], [], []], "_orig_doc_key": ["CNN_ENG_20030407_130604.10", "CNN_ENG_20030620_170011.14", "CNN_ENG_20030507_170539.0", "CNN_ENG_20030526_183538.3", "APW_ENG_20030331.0410", "APW_ENG_20030331.0410", "APW_ENG_20030331.0410", "APW_ENG_20030422.0469", 
"APW_ENG_20030422.0469", "APW_ENG_20030422.0469", "APW_ENG_20030419.0358", "AFP_ENG_20030519.0049", "APW_ENG_20030519.0367", "APW_ENG_20030519.0367", "APW_ENG_20030519.0367", "AGGRESSIVEVOICEDAILY_20041101.1806"], "_orig_sent_ix": [9, 5, 11, 10, 7, 14, 18, 6, 35, 48, 7, 3, 3, 12, 52, 18]} 3 | {"doc_key": "93324366_dev", "dataset": "genia", "sentences": [["Human", "T", "cell", "transcription", "factor", "GATA", "-", "3", "stimulates", "HIV", "-", "1", "expression", "."], ["A", "family", "of", "transcriptional", "activating", "proteins", ",", "the", "GATA", "factors", ",", "has", "been", "shown", "to", "bind", "to", "a", "consensus", "motif", "through", "a", "highly", "conserved", "C4", "zinc", "finger", "DNA", "binding", "domain", "."], ["One", "member", "of", "this", "multigene", "family", ",", "GATA", "-", "3", ",", "is", "most", "abundantly", "expressed", "in", "T", "lymphocytes", ",", "a", "cellular", "target", "for", "human", "immunodeficiency", "virus", "type", "1", "(", "HIV", "-", "1", ")", "infection", "and", "replication", "."], ["In", "vitro", "DNase", "I", "footprinting", "analysis", "revealed", "six", "hGATA", "-", "3", "binding", "sites", "in", "the", "U3", "region", "(", "the", "transcriptional", "regulatory", "domain", ")", "of", "the", "HIV", "-", "1", "LTR", "."], ["Cotransfection", "of", "an", "hGATA", "-", "3", "expression", "plasmid", "with", "a", "reporter", "plasmid", "whose", "transcription", "is", "directed", "by", "the", "HIV", "-", "1", "LTR", "resulted", "in", "6", "-", "to", "10", "-", "fold", "stimulation", "of", "LTR", "-", "mediated", "transcription", ",", "whereas", "site", "specific", "mutation", "of", "these", "GATA", "sites", "resulted", "in", "virtual", "abrogation", "of", "the", "activation", "by", "hGATA", "-", "3", "."], ["Further", ",", "deletion", "of", "the", "hGATA", "-", "3", "transcriptional", "activation", "domain", "abolished", "GATA", "-", "dependent", "HIV", "-", "1", "trans", "-", "activation", ",", "showing", 
"that", "the", "stimulation", "of", "viral", "transcription", "observed", "is", "a", "direct", "effect", "of", "cotransfected", "hGATA", "-", "3", "."], ["Introduction", "of", "the", "HIV", "-", "1", "plasmids", "in", "which", "the", "GATA", "sites", "have", "been", "mutated", "into", "human", "T", "lymphocytes", "also", "caused", "a", "significant", "reduction", "in", "LTR", "-", "mediated", "transcription", "at", "both", "the", "basal", "level", "and", "in", "(", "PHA", "-", "plus", "PMA", "-", ")", "stimulated", "T", "cells", "."], ["These", "observations", "suggest", "that", "in", "addition", "to", "its", "normal", "role", "in", "T", "lymphocyte", "gene", "regulation", ",", "hGATA", "-", "3", "may", "also", "play", "a", "significant", "role", "in", "HIV", "-", "1", "transcriptional", "activation", "."]], "ner": [[[0, 4, "protein"], [5, 7, "protein"]], [[22, 23, "protein"], [32, 33, "DNA"], [38, 40, "protein"], [36, 43, "protein"]], [[52, 54, "protein"], [61, 62, "cell_type"]], [[84, 85, "protein"], [90, 94, "DNA"], [97, 98, "DNA"], [101, 103, "DNA"], [107, 110, "DNA"]], [[115, 119, "DNA"], [122, 123, "DNA"], [130, 133, "DNA"], [144, 144, "DNA"], [155, 156, "DNA"], [165, 167, "protein"]], [[174, 176, "protein"], [174, 179, "DNA"], [204, 207, "protein"]], [[212, 215, "DNA"], [219, 220, "DNA"], [226, 227, "cell_type"], [234, 234, "DNA"], [252, 254, "cell_line"]], [[267, 268, "cell_type"], [272, 274, "protein"]]], "relations": [[], [], [], [], [], [], [], []], "clusters": [[[0, 7], [45, 50]], [[14, 19], [21, 23], [45, 54], [48, 50]], [[61, 62], [64, 80], [225, 227]], [[89, 110], [154, 156], [218, 220]], [[106, 110], [129, 133]], [[121, 123], [124, 124]], [[144, 147], [234, 237]], [[165, 167], [263, 263], [272, 274]], [[211, 215], [217, 217]]]} 4 | {"doc_key": "W03-0406", "dataset": "scierc", "sentences": [["In", "this", "paper", ",", "we", "improve", "an", "unsupervised", "learning", "method", "using", "the", "Expectation-Maximization", "-LRB-", "EM", "-RRB-", 
"algorithm", "proposed", "by", "Nigam", "et", "al.", "for", "text", "classification", "problems", "in", "order", "to", "apply", "it", "to", "word", "sense", "disambiguation", "-LRB-", "WSD", "-RRB-", "problems", "."], ["The", "improved", "method", "stops", "the", "EM", "algorithm", "at", "the", "optimum", "iteration", "number", "."], ["To", "estimate", "that", "number", ",", "we", "propose", "two", "methods", "."], ["In", "experiments", ",", "we", "solved", "50", "noun", "WSD", "problems", "in", "the", "Japanese", "Dictionary", "Task", "in", "SENSEVAL2", "."], ["The", "score", "of", "our", "method", "is", "a", "match", "for", "the", "best", "public", "score", "of", "this", "task", "."], ["Furthermore", ",", "our", "methods", "were", "confirmed", "to", "be", "effective", "also", "for", "verb", "WSD", "problems", "."]], "ner": [[[7, 9, "Method"], [12, 16, "Method"], [23, 25, "Task"], [30, 30, "Generic"], [32, 38, "Task"]], [[42, 42, "Generic"], [45, 46, "Method"], [49, 51, "OtherScientificTerm"]], [[56, 56, "Generic"]], [[69, 71, "Task"], [74, 76, "Task"], [78, 78, "Material"]], [[84, 84, "Generic"], [95, 95, "Generic"]], [[100, 100, "Generic"], [108, 110, "Task"]]], "relations": [[[12, 16, 7, 9, "USED-FOR"], [12, 16, 23, 25, "USED-FOR"], [30, 30, 32, 38, "USED-FOR"]], [], [], [[74, 76, 78, 78, "FEATURE-OF"]], [], [[100, 100, 108, 110, "USED-FOR"]]], "clusters": [[[49, 51], [56, 56]], [[7, 9], [30, 30], [42, 42], [84, 84], [100, 100]], [[74, 76], [95, 95]]]} 5 | -------------------------------------------------------------------------------- /dygie/tests/fixtures/multi_dataset/test.jsonl: -------------------------------------------------------------------------------- 1 | {"doc_key": 15, "dataset": "ace-event", "sentences": [["He", "also", "owns", "a", "television", "and", "a", "radio", "station", "and", "a", "newspaper", "."], ["\"", "That", "does", "n't", "shock", "us", ",", "we", "have", "been", "saying", "it", "."], ["He", "has", "made", "no", "public", 
"comments", "so", "far", "on", "the", "Beijing", "talks", "."], ["The", "official", "did", "not", "disclose", "what", "North", "Korea", "'s", "bold", "plan", "was", "."], ["\"", "The", "United", "States", "is", "our", "major", "thrust", ",", "\"", "he", "said", "."], ["Head", "of", "Iran", "'s", "largest", "daily", "quits", "after", "conservatives", "win", "Tehran", "city", "hall"], ["He", "lost", "an", "appeal", "case", "on", "his", "sodomy", "sentence", "on", "April", "18", "."], ["\"", "Both", "said", "they", "look", "forward", "to", "seeing", "each", "other", "in", "Evian", "."], ["We", "just", "disagree", "on", "one", "item", ",", "\"", "the", "Canadian", "leader", "said", "."], ["Bush", ",", "Putin", "pal", "up", "after", "Iraq", "spat", ",", "but", "Iran", "row", "simmers"], ["``", "We", "are", "proceeding", "with", "all", "the", "plans", "for", "the", "vote", ".", "''"], ["Saddam", "has", "long", "claimed", "Iraq", "destroyed", "all", "its", "weapons", "of", "mass", "destruction", "."], ["Report", ":", "Hong", "Kong", "Jockey", "Club", "in", "talks", "to", "acquire", "its", "Macau", "rival"], ["Ships", "carrying", "equipment", "for", "U.S.", "troops", "are", "already", "waiting", "off", "the", "Turkish", "coast"], ["The", "Justice", "party", "changed", "the", "constitution", "after", "taking", "power", "in", "the", "elections", "."], ["at", "members", "of", "Erdogan", "'s", "party", ",", "the", "Anatolia", "news", "agency", "reported", "."]], "ner": [[[8, 8, "ORG"], [11, 11, "ORG"]], [], [[36, 36, "GPE"]], [[40, 40, "PER"], [45, 46, "GPE"]], [[54, 55, "GPE"]], [], [], [[102, 102, "GPE"]], [[113, 113, "GPE"], [114, 114, "PER"]], [[117, 117, "PER"], [119, 119, "PER"], [123, 123, "GPE"], [127, 127, "GPE"]], [], [[143, 143, "PER"], [147, 147, "GPE"], [151, 151, "WEA"]], [], [[169, 169, "VEH"], [173, 173, "GPE"], [174, 174, "PER"], [180, 180, "GPE"], [181, 181, "LOC"]], [[183, 184, "ORG"]], [[196, 196, "PER"], [198, 198, "PER"], [200, 200, "ORG"], [203, 203, 
"ORG"], [205, 205, "ORG"]]], "relations": [[], [], [], [], [], [], [], [], [[114, 114, 113, 113, "ORG-AFF.Employment"]], [], [], [], [], [[174, 174, 173, 173, "ORG-AFF.Employment"], [181, 181, 180, 180, "PART-WHOLE.Geographical"]], [], [[196, 196, 200, 200, "ORG-AFF.Membership"], [198, 198, 200, 200, "ORG-AFF.Employment"]]], "events": [[], [], [], [], [], [], [[[81, "Justice.Appeal"]], [[86, "Justice.Sentence"]]], [], [], [], [], [], [], [], [], []], "_orig_doc_key": ["AFP_ENG_20030415.0734", "AFP_ENG_20030425.0408", "AFP_ENG_20030425.0408", "AFP_ENG_20030425.0408", "AFP_ENG_20030430.0075", "AFP_ENG_20030504.0248", "AFP_ENG_20030508.0357", "AFP_ENG_20030527.0616", "AFP_ENG_20030527.0616", "AFP_ENG_20030601.0262", "APW_ENG_20030304.0555", "APW_ENG_20030308.0314", "APW_ENG_20030310.0719", "APW_ENG_20030311.0775", "APW_ENG_20030311.0775", "APW_ENG_20030311.0775"], "_orig_sent_ix": [13, 11, 24, 30, 20, 3, 13, 7, 12, 3, 24, 20, 3, 12, 31, 38]} 2 | {"doc_key": 72, "dataset": "ace05", "sentences": [["A", "total", "over", "the", "last", "two", "weeks", "now", "of", "close", "to", "24,000", "sorties", "."], ["That", "air", "base", "that", "coalition", "forces", "have", "now", "taken", "control", "of", "near", "Nasiriya", "."], ["Tales", "of", "how", "some", "in", "Congress", "want", "to", "honor", "the", "British", "prime", "minister", "."], ["Columns", "of", "coalition", "forces", "keep", "pressing", "north", "and", "tightening", "the", "noose", "on", "Baghdad", "."], ["These", "pictures", "were", "taken", "on", "the", "runway", "just", "moments", "after", "the", "coalition", "attack", "."], ["Outside", "the", "airport", ",", "the", "fighting", "was", "fierce", ",", "but", "the", "battle", "unequal", "."], ["This", "is", "fairly", "central", "in", "Baghdad", "on", "the", "edge", "of", "a", "government", "area", "."], ["That", "'s", "CNN", "'s", "Karl", "Penhaul", "with", "the", "very", "latest", "from", "the", "battlefront", "."], ["Let", "'", "s", "get", "an", "update", 
"now", "on", "casualties", "on", "Operation", "Iraqi", "Freedom", "."], ["i", "think", "because", "the", "day", "after", "came", "so", "fast", "and", "so", "suddenly", "in", "way", "."], ["in", "the", "short-term", ",", "that", "'s", "going", "to", "have", "to", "mean", "the", "american", "military", "."], ["it", "was", "the", "first", "american", "prisoner", "of", "war", "ever", "rescued", "since", "world", "war", "ii", "."], ["rehab", "taha", ",", "aka", "dr.", "germ", ",", "who", "directed", "iraq", "'s", "biological", "weapons", "program", "."], ["cnn", "has", "exclusively", "obtained", "the", "united", "nations", "english", "translations", "of", "her", "arabic", "work", "papers", "."], ["reporter", ":", "a", "perfectionism", "reflected", "in", "even", "the", "smallest", "details", "of", "stewart", "'s", "life", "."], ["this", "is", "a", "stock", "down", "more", "than", "70", "%", "over", "the", "last", "three", "years", "."]], "ner": [[], [[16, 16, "FAC"], [17, 17, "FAC"], [18, 18, "GPE"], [26, 26, "GPE"], [19, 19, "PER"]], [[40, 40, "PER"], [31, 31, "PER"], [33, 33, "ORG"], [38, 38, "GPE"]], [[44, 44, "GPE"], [54, 54, "GPE"], [42, 42, "PER"], [45, 45, "PER"]], [[67, 67, "GPE"], [62, 62, "FAC"]], [[72, 72, "FAC"]], [[84, 84, "FAC"], [96, 96, "LOC"], [92, 92, "LOC"], [95, 95, "ORG"], [89, 89, "GPE"]], [[100, 100, "ORG"], [102, 103, "PER"]], [[114, 114, "ORG"], [123, 123, "GPE"]], [[126, 126, "PER"]], [[154, 154, "ORG"], [153, 153, "GPE"]], [[160, 160, "GPE"], [161, 161, "PER"]], [[171, 172, "PER"], [178, 178, "PER"], [175, 176, "PER"], [183, 183, "WEA"], [180, 180, "GPE"], [184, 184, "ORG"]], [[196, 196, "PER"], [191, 192, "ORG"], [186, 186, "ORG"]], [[212, 212, "PER"]], []], "relations": [[], [[16, 16, 26, 26, "PHYS"], [19, 19, 17, 17, "ART"], [19, 19, 18, 18, "ORG-AFF"]], [[31, 31, 33, 33, "ORG-AFF"], [40, 40, 38, 38, "ORG-AFF"]], [[45, 45, 44, 44, "ORG-AFF"]], [], [], [[92, 92, 96, 96, "PART-WHOLE"], [84, 84, 89, 89, "PART-WHOLE"]], [[102, 103, 100, 100, 
"ORG-AFF"]], [], [], [[154, 154, 153, 153, "GEN-AFF"]], [[161, 161, 160, 160, "GEN-AFF"]], [[178, 178, 184, 184, "ORG-AFF"], [184, 184, 180, 180, "PART-WHOLE"]], [], [], []], "_orig_doc_key": ["CNN_IP_20030402.1600.02-2", "CNN_IP_20030402.1600.02-2", "CNN_IP_20030402.1600.02-2", "CNN_IP_20030404.1600.00-1", "CNN_IP_20030404.1600.00-1", "CNN_IP_20030404.1600.00-1", "CNN_IP_20030405.1600.00-2", "CNN_IP_20030402.1600.00-2", "CNN_IP_20030402.1600.00-2", "CNN_ENG_20030411_070039.21", "CNN_ENG_20030411_070039.21", "CNN_ENG_20030403_080032.9", "CNN_ENG_20030416_180808.15", "CNN_ENG_20030416_180808.15", "CNN_ENG_20030607_170312.6", "CNN_ENG_20030516_123543.8"], "_orig_sent_ix": [53, 66, 78, 11, 20, 32, 46, 5, 6, 13, 15, 23, 10, 11, 47, 13]} 3 | {"doc_key": "91041706_test", "dataset": "genia", "sentences": [["Interferon", "-", "gamma", "and", "the", "sexual", "dimorphism", "of", "autoimmunity", "."], ["The", "sexual", "difference", "in", "the", "incidence", "of", "autoimmune", "diseases", "has", "remained", "an", "enigma", "for", "many", "years", "."], ["In", "the", "examination", "of", "the", "induction", "of", "autoimmunity", "in", "transgenic", "mice", ",", "evidence", "has", "been", "obtained", "further", "implicating", "the", "lymphokine", "interferon", "-", "gamma", "in", "the", "etiology", "of", "autoimmunity", "."], ["Sex", "steroid", "regulation", "of", "the", "production", "of", "this", "molecule", ",", "as", "well", "as", "other", "cytokines", ",", "may", "help", "explain", "the", "gender", "-", "specific", "differences", "in", "the", "immune", "system", ",", "including", "autoimmunity", "."]], "ner": [[[0, 2, "protein"]], [], [[46, 46, "protein"], [47, 49, "protein"]], [[70, 70, "protein"]]], "relations": [[], [], [], []], "clusters": [[[8, 8], [34, 34], [54, 54], [86, 86]]]} 4 | {"doc_key": "CVPR_2004_30_abs", "dataset": "scierc", "sentences": [["Background", "modeling", "is", "an", "important", "component", "of", "many", "vision", "systems", "."], ["Existing", 
"work", "in", "the", "area", "has", "mostly", "addressed", "scenes", "that", "consist", "of", "static", "or", "quasi-static", "structures", "."], ["When", "the", "scene", "exhibits", "a", "persistent", "dynamic", "behavior", "in", "time", ",", "such", "an", "assumption", "is", "violated", "and", "detection", "performance", "deteriorates", "."], ["In", "this", "paper", ",", "we", "propose", "a", "new", "method", "for", "the", "modeling", "and", "subtraction", "of", "such", "scenes", "."], ["Towards", "the", "modeling", "of", "the", "dynamic", "characteristics", ",", "optical", "flow", "is", "computed", "and", "utilized", "as", "a", "feature", "in", "a", "higher", "dimensional", "space", "."], ["Inherent", "ambiguities", "in", "the", "computation", "of", "features", "are", "addressed", "by", "using", "a", "data-dependent", "bandwidth", "for", "density", "estimation", "using", "kernels", "."], ["Extensive", "experiments", "demonstrate", "the", "utility", "and", "performance", "of", "the", "proposed", "approach", "."]], "ner": [[[0, 1, "Task"], [8, 9, "Task"]], [[23, 26, "OtherScientificTerm"]], [[30, 30, "Generic"], [33, 35, "OtherScientificTerm"], [45, 45, "Task"]], [[57, 57, "Generic"], [60, 65, "Task"], [65, 65, "Generic"]], [[69, 73, "Task"], [75, 76, "OtherScientificTerm"], [83, 83, "OtherScientificTerm"], [86, 88, "OtherScientificTerm"]], [[91, 91, "OtherScientificTerm"], [94, 96, "Task"], [102, 103, "OtherScientificTerm"], [105, 106, "Task"], [108, 108, "Method"]], [[120, 120, "Generic"]]], "relations": [[[0, 1, 8, 9, "PART-OF"]], [], [[33, 35, 30, 30, "FEATURE-OF"]], [[57, 57, 60, 65, "USED-FOR"]], [[75, 76, 69, 73, "USED-FOR"], [75, 76, 83, 83, "USED-FOR"], [83, 83, 69, 73, "USED-FOR"], [86, 88, 83, 83, "FEATURE-OF"]], [[91, 91, 94, 96, "FEATURE-OF"], [102, 103, 91, 91, "USED-FOR"], [102, 103, 105, 106, "USED-FOR"], [108, 108, 105, 106, "USED-FOR"]], []], "clusters": [[[57, 57], [120, 120]], [[30, 30], [65, 65]]]} 5 | 
-------------------------------------------------------------------------------- /dygie/tests/fixtures/multi_dataset/train.jsonl: -------------------------------------------------------------------------------- 1 | {"doc_key": 758, "dataset": "ace-event", "sentences": [["ozzy", "'s", "not", "going", "to", "like", "me", "saying", "this", ",", "but", "he", "'s", "a", "very", "tender", ",", "gentle", "man", "."], ["the", "second", "attack", "occurred", "after", "some", "rocket", "firings", "aimed", ",", "apparently", ",", "toward", "the", "israelis", ",", "apparently", "in", "retaliation", "."], ["that", "'s", "just", "a", "fraction", "of", "the", "killings", "and", "rapes", "and", "torture", "that", "have", "grippedded", "the", "country", "since", "1998", "."], ["its", "basic", "message", ",", "the", "palestinian", "authority", "is", "corrupt", ",", "it", "fails", "to", "deliver", ",", "hamas", "is", "not", "corrupt", "."], ["reporter", ":", "settlers", "call", "the", "just", "concluded", "israeli", "-", "american-", "palestinian", "summit", "a", "surrender", ",", "and", "were", "not", "impressed", "."], ["and", "that", "the", "survivors", ",", "they", "had", "to", "go", "into", "the", "water", ",", "pick", "them", "up", "from", "the", "water", "."], ["when", "you", "first", "heard", "about", "sars", ",", "i", "mean", ",", "did", "you", "suddenly", "think", ",", "this", "may", "not", "happen", "?"], ["the", "flip", "side", "between", "stars", ",", "though", ",", "is", "once", "the", "fans", "grow", "up", "their", "star", "power", "may", "dim", "."], ["it", "would", "be", "deadly", "to", "let", "iran", "let", "their", "hands", "on", "nuclear", "weapons", "employ", "i", "would", "support", "any", "action", "."], ["fidelity", "is", "going", "to", "get", "rid", "of", "the", "3", "%", "upfront", "sales", "charge", "on", "five", "of", "its", "largest", "funds", "."], ["we", "'re", "told", "the", "russian", "president", "vladimir", "putin", "was", "greeted", "by", "prince", 
"charles", "as", "he", "arrived", "in", "london", "today", "."], ["that", "experience", "should", "continue", "to", "inspire", "us", ",", "as", "we", "seek", "to", "build", "a", "more", "peaceful", "and", "secure", "world", "."], ["*", "his", "dandruff", "shampoo", "was", "n't", "tough", "enough", "for", "black", ",", "so", "i", "bought", "him", "maximum", "strength", "selsun", "blue", "."], ["he", "is", "one", "of", "the", "biggest", "if", "not", "the", "biggest", "names", "in", "the", "world", "of", "hollywood", "fund", "-", "raisers", "."], ["sean", "patrick", "o'malley", ",", "the", "bishop", "of", "palm", "beach", ",", "florida", ",", "is", "expected", "to", "replace", "cardinal", "bernard", "law", "."], ["And", "uh", "so", "I", "says", "vote", "for", "him", ",", "vote", "for", "him", ",", "vote", "for", "him", ",", "you", "know", "."]], "ner": [[[0, 0, "PER"], [18, 18, "PER"]], [[26, 26, "WEA"], [34, 34, "GPE"]], [[56, 56, "GPE"]], [[65, 66, "GPE"], [75, 75, "ORG"]], [[80, 80, "PER"], [82, 82, "PER"], [87, 87, "GPE"], [89, 89, "GPE"], [90, 90, "GPE"]], [[103, 103, "PER"]], [], [[144, 144, "PER"], [151, 151, "PER"]], [[166, 166, "GPE"], [172, 172, "WEA"]], [[180, 180, "ORG"]], [[204, 204, "GPE"], [205, 205, "PER"], [206, 207, "PER"], [211, 211, "PER"], [212, 212, "PER"], [217, 217, "GPE"]], [[238, 238, "LOC"]], [], [[270, 270, "PER"], [275, 275, "ORG"], [276, 278, "PER"]], [[280, 282, "PER"], [285, 285, "PER"], [287, 288, "GPE"], [290, 290, "GPE"], [296, 296, "PER"], [297, 298, "PER"]], []], "relations": [[], [], [], [], [], [], [], [], [], [], [[205, 205, 204, 204, "ORG-AFF.Employment"]], [], [], [[276, 278, 275, 275, "ORG-AFF.Employment"]], [[285, 285, 287, 288, "GEN-AFF.Citizen-Resident-Religion-Ethnicity"], [287, 288, 290, 290, "PART-WHOLE.Geographical"]], []], "events": [[], [[[22, "Conflict.Attack"], [26, 26, "Instrument"]], [[27, "Conflict.Attack"], [26, 26, "Instrument"]]], [[[47, "Life.Die"], [56, 56, "Place"]], [[49, "Conflict.Attack"], [56, 56, 
"Place"]], [[51, "Conflict.Attack"], [56, 56, "Place"]]], [], [[[91, "Contact.Meet"], [87, 87, "Entity"], [89, 89, "Entity"], [90, 90, "Entity"]]], [], [], [], [], [], [[[215, "Movement.Transport"], [206, 207, "Artifact"], [217, 217, "Destination"]]], [], [], [], [[[295, "Personnel.Start-Position"], [280, 282, "Person"]]], []], "_orig_doc_key": ["CNN_ENG_20030607_173310.4", "CNN_ENG_20030610_130042.17", "CNN_ENG_20030612_173004.10", "CNN_ENG_20030614_173123.4", "CNN_ENG_20030614_173123.4", "CNN_ENG_20030617_112838.4", "CNN_ENG_20030617_193116.10", "CNN_ENG_20030618_193127.17", "CNN_ENG_20030624_082841.12", "CNN_ENG_20030624_140104.22", "CNN_ENG_20030624_153103.16", "CNN_ENG_20030624_153103.17", "CNN_ENG_20030625_210122.0", "CNN_ENG_20030626_193133.8", "CNN_ENG_20030630_075848.7", "fsh_29105"], "_orig_sent_ix": [46, 8, 6, 15, 26, 21, 81, 18, 16, 5, 3, 7, 4, 24, 3, 77]} 2 | {"doc_key": 474, "dataset": "ace05", "sentences": [["defense", "attorneys", "argue", "that", "an", "individual", "can", "not", "be", "charged", "under", "federal", "law", "and", "then", "state", "law", "for", "the", "same", "act", "."], ["the", "british", "museum", "is", "home", "to", "the", "largest", "mesopotamia", "collection", "outside", "iraq", ",", "including", "some", "of", "the", "earliest", "forms", "of", "writing", "."], ["what", "i", "should", "tell", "you", "is", "there", "are", "about", "15", "possible", "routes", "to", "get", "you", "to", "the", "top", "of", "mt.", "everest", "."], ["the", "other", "mission", "that", "it", "took", "part", "in", "was", "the", "operations", "before", "t", "fall", "of", "the", "saddam", "hussein", "and", "his", "regime", "."], ["now", ",", "also", ",", "this", "ship", "played", "an", "important", "part", "within", "the", "war", "and", "that", "'s", "because", "of", "the", "radar", "system", "."], ["during", "a", "live", "broadcast", "geraldo", "drew", "a", "map", "in", "the", "sand", "showing", "the", "location", "of", "the", "unit", "in", "relation", 
"to", "baghdad", "."], ["he", "was", "to", "his", "chagrin", ",", "could", "be", "the", "most", "toughest", "and", "important", "battle", "of", "them", "all", ",", "winning", "the", "peace", "."], ["i", "pulled", "out", "the", "youngest", "guy", "first", "and", "then", "i", "wanted", "to", "pull", "out", "the", "captain", ",", "but", "the", "captain", "said", "take"], ["as", "soon", "as", "the", "police", "officers", "pulled", "the", "gun", "out", "and", "held", "it", "up", ",", "i", "knew", "it", "was", "my", "rifle", "."], ["the", "president", "greeting", "a", "number", "of", "dignitaries", "as", "he", "gets", "ready", "here", "to", "board", "air", "force", "one", "on", "his", "way", "home", "."], ["a", "little", "earlier", "this", "morning", ",", "i", "spoke", "with", "an", "iraqi", "dissident", "who", "teaches", "at", "writes", "extensively", "on", "the", "middle", "east", "."], ["straun", "with", "smashed", "cars", ",", "buses", "and", "other", "debris", "representing", "the", "hypothetical", "damage", "from", "a", "fan", "that", "sized", "radioactive", "dirty", "bomb", "."], ["remember", "ali", "abbas", "who", "lost", "both", "his", "arms", ",", "his", "home", ",", "many", "of", "his", "relatives", "in", "a", "bombing", "on", "baghdad", "?"], ["no", "telling", "how", "long", "the", "osbourne", "phenomenon", "may", "last", "but", "the", "family", "'s", "loyal", "following", "will", "always", "be", "screaming", "for", "more", "."], ["considering", "how", "much", "publicity", "was", "circulating", "about", "this", "book", ",", "there", "is", "injunctions", "about", "publishing", "anything", "or", "talking", "about", "the", "plot", "."], ["on", "behal", "of", "republican", "candidates", "and", "i", "tend", "to", "do", "a", "lot", "of", "campaigning", "in", "the", "next", "year", "for", "the", "president", "."]], "ner": [[[5, 5, "PER"], [15, 15, "GPE"], [1, 1, "PER"]], [[23, 24, "ORG"], [26, 26, "ORG"], [30, 30, "LOC"], [33, 33, "GPE"]], [[63, 64, "LOC"], [58, 58, 
"PER"], [61, 61, "LOC"], [45, 45, "PER"], [48, 48, "PER"]], [[70, 70, "VEH"], [82, 83, "PER"], [85, 85, "PER"], [86, 86, "ORG"]], [[93, 93, "VEH"]], [[114, 114, "PER"], [126, 126, "PER"], [130, 130, "GPE"]], [[132, 132, "PER"], [135, 135, "PER"]], [[154, 154, "PER"], [163, 163, "PER"], [159, 159, "PER"], [169, 169, "PER"], [173, 173, "PER"]], [[195, 195, "PER"], [191, 191, "PER"], [181, 181, "PER"], [180, 180, "ORG"], [184, 184, "WEA"], [188, 188, "WEA"], [193, 193, "WEA"], [196, 196, "WEA"]], [[218, 218, "GPE"], [204, 204, "PER"], [209, 209, "LOC"], [199, 199, "PER"], [206, 206, "PER"], [216, 216, "PER"], [212, 214, "VEH"]], [[230, 230, "GPE"], [226, 226, "PER"], [231, 231, "PER"], [232, 232, "PER"], [239, 240, "GPE"]], [[245, 245, "VEH"], [247, 247, "VEH"], [262, 262, "WEA"]], [[265, 266, "PER"], [267, 267, "PER"], [270, 270, "PER"], [273, 273, "PER"], [278, 278, "PER"], [279, 279, "PER"], [274, 274, "FAC"], [276, 276, "PER"], [284, 284, "GPE"]], [[300, 300, "PER"], [291, 291, "PER"], [297, 297, "PER"]], [], [[350, 350, "PER"], [334, 334, "PER"], [333, 333, "ORG"], [336, 336, "PER"]]], "relations": [[], [], [[61, 61, 63, 64, "PART-WHOLE"], [58, 58, 61, 61, "PHYS"]], [], [], [[126, 126, 130, 130, "PHYS"]], [], [], [[181, 181, 180, 180, "ORG-AFF"], [195, 195, 196, 196, "ART"]], [[199, 199, 212, 214, "ART"], [216, 216, 218, 218, "GEN-AFF"]], [[231, 231, 230, 230, "GEN-AFF"]], [], [[265, 266, 284, 284, "PHYS"], [273, 273, 274, 274, "ART"], [278, 278, 279, 279, "PER-SOC"]], [], [], []], "_orig_doc_key": ["CNN_ENG_20030602_102826.13", "CNN_ENG_20030418_163834.14", "CNN_ENG_20030529_130011.6", "CNN_ENG_20030426_160621.0", "CNN_ENG_20030426_160621.0", "CNN_ENG_20030331_193655.14", "CNN_ENG_20030422_213527.4", "CNN_ENG_20030617_173115.22", "CNN_ENG_20030508_210555.5", "CNN_ENG_20030408_083034.11", "CNN_ENG_20030408_083034.11", "CNN_ENG_20030512_170454.13", "CNN_ENG_20030429_143706.14", "CNN_ENG_20030607_173310.4", "CNN_ENG_20030617_105836.4", 
"CNN_ENG_20030624_065843.24"], "_orig_sent_ix": [13, 15, 16, 7, 9, 6, 6, 17, 10, 20, 21, 4, 4, 53, 14, 17]} 3 | {"doc_key": "99221887_train", "dataset": "genia", "sentences": [["Suppressive", "effects", "of", "anti", "-", "inflammatory", "agents", "on", "human", "endothelial", "cell", "activation", "and", "induction", "of", "heat", "shock", "proteins", "."], ["BACKGROUND", ":", "Studies", "from", "our", "laboratory", "have", "shown", "that", "the", "earliest", "stages", "of", "atherosclerosis", "may", "be", "mediated", "by", "an", "autoimmune", "reaction", "against", "heat", "shock", "protein", "60", "(", "Hsp60", ")", "."], ["The", "interactions", "of", "Hsp60", "-", "specific", "T", "cells", "with", "arterial", "endothelial", "cells", "(", "EC", ")", "require", "expression", "of", "both", "Hsp60", "and", "certain", "adhesion", "molecules", "shown", "to", "be", "induced", "simultaneously", "in", "EC", "by", "mechanical", "and", "other", "types", "of", "stress", "."], ["Recently", ",", "it", "was", "shown", "that", "suppression", "of", "T", "cell", "-", "mediated", "immune", "responses", "by", "cyclosporin", "A", "(", "CyA", ")", "enhanced", "atherosclerotic", "lesion", "formation", "in", "mice", "."], ["In", "contrast", ",", "aspirin", "was", "found", "to", "lower", "the", "risk", "of", "myocardial", "infarction", "in", "men", "."], ["These", "conflicting", "observations", "may", "be", "due", "to", "different", "effects", "of", "anti", "-", "inflammatory", "agents", "on", "adhesion", "molecule", "and", "Hsp", "expression", "in", "EC", ",", "respectively", "."], ["MATERIAL", "AND", "METHODS", ":", "In", "the", "present", "study", ",", "we", "analyzed", "the", "effects", "of", "CyA", ",", "aspirin", ",", "and", "indomethacin", "on", "T", "cell", "proliferation", "using", "a", "proliferation", "assay", "."], ["To", "explore", "the", "expression", "of", "adhesion", "molecules", ",", "monocyte", "chemoattractant", "protein", "-", "1", "(", "MCP", "-", "1", ")", ",", 
"and", "Hsp60", "in", "human", "umbilical", "vein", "endothelial", "cells", "(", "HUVECs", ")", ",", "Northern", "blot", "analyses", "were", "used", "."], ["To", "examine", "the", "activation", "status", "of", "the", "transcription", "factors", "nuclear", "factor", "kappaB", "(", "NF", "-", "kappaB", ")", "and", "heat", "shock", "factor", "-", "1", "(", "HSF", "-", "1", ")", ",", "electrophoretic", "mobility", "shift", "assays", "were", "performed", "."], ["RESULTS", ":", "With", "the", "exception", "of", "indomethacin", ",", "the", "used", "immunosuppressive", "and", "anti", "-", "inflammatory", "agents", "significantly", "inhibited", "T", "cell", "proliferation", "in", "response", "to", "influenza", "virus", "antigen", "in", "a", "dose", "-", "dependent", "manner", "."], ["Interestingly", ",", "CyA", "and", "indomethacin", "did", "not", "suppress", "tumor", "necrosis", "factor", "-", "alpha", "(", "TNF", "-", "alpha", ")", "-", "induced", "adhesion", "molecule", "expression", "on", "HUVECs", ",", "whereas", "aspirin", "had", "an", "inhibitory", "effect", "."], ["These", "observations", "correlated", "with", "the", "modulation", "of", "NF", "-", "kappaB", "activity", "in", "EC", "."], ["All", "agents", "tested", "induced", "expression", "of", "Hsp60", "6", "hr", "after", "application", "."], ["In", "addition", ",", "aspirin", "and", "indomethacin", ",", "but", "not", "CyA", ",", "induced", "Hsp70", "expression", "in", "HUVECs", "that", "correlated", "with", "induction", "of", "HSF", "-", "1", "activity", "."], ["CONCLUSION", ":", "Our", "results", "show", "that", "the", "tested", "agents", "(", "except", "indomethacin", ")", "are", "inhibitors", "of", "the", "T", "cell", "-", "mediated", "immune", "response", ",", "as", "expected", ",", "that", "aspirin", "is", "an", "effective", "suppressor", "of", "adhesion", "molecule", "expression", ",", "and", "that", "all", "three", "agents", "can", "induce", "Hsp60", "in", "HUVECs", "."], ["These", "data", "provide", "the", 
"molecular", "basis", "for", "the", "notion", "that", "(", "1", ")", "part", "of", "the", "anti", "-", "atherogenic", "effect", "of", "aspirin", "may", "be", "due", "to", "the", "prevention", "of", "the", "adhesion", "of", "sensitized", "T", "cells", "to", "stressed", "EC", ";", "(", "2", ")", "that", "part", "of", "the", "atherosclerosis", "-", "promoting", "effect", "of", "CyA", "may", "be", "due", "to", "its", "potential", "as", "an", "inducer", "of", "Hsp60", "expression", "and", "its", "inability", "to", "down", "-", "regulate", "adhesion", "molecule", "expression", "on", "EC", ";", "and", "(", "3", ")", "that", "down", "-", "regulation", "of", "MCP", "-", "1", "expression", "by", "aspirin", "may", "result", "in", "decreased", "recruitment", "of", "monocytes", "into", "the", "arterial", "intima", "beneath", "stressed", "EC", "."]], "ner": [[[8, 10, "cell_type"], [15, 17, "protein"]], [[41, 44, "protein"], [46, 46, "protein"]], [[52, 52, "protein"], [52, 56, "cell_line"], [59, 60, "cell_type"], [62, 62, "cell_type"], [68, 68, "protein"], [71, 72, "protein"], [79, 79, "cell_type"]], [], [], [[152, 152, "cell_type"]], [], [[190, 191, "protein"], [193, 197, "protein"], [199, 201, "protein"], [205, 205, "protein"], [210, 211, "cell_type"], [207, 211, "cell_type"], [213, 213, "cell_type"]], [[229, 230, "protein"], [231, 233, "protein"], [235, 237, "protein"], [240, 244, "protein"], [246, 248, "protein"]], [[282, 284, "protein"]], [[300, 304, "protein"], [306, 308, "protein"], [316, 316, "cell_type"]], [[332, 334, "protein"], [337, 337, "cell_type"]], [[345, 345, "protein"]], [[363, 363, "protein"], [366, 366, "cell_type"], [372, 374, "protein"]], [[411, 412, "protein"], [422, 422, "protein"], [424, 424, "cell_type"]], [[458, 460, "cell_type"], [463, 463, "cell_type"], [488, 488, "protein"], [497, 498, "protein"], [501, 501, "cell_type"], [512, 514, "protein"], [524, 524, "cell_type"], [531, 531, "cell_type"]]], "relations": [[], [], [], [], [], [], [], [], [], [], 
[], [], [], [], [], []], "clusters": [[[3, 6], [141, 144]], [[41, 47], [345, 345]], [[103, 107], [170, 170], [294, 294], [360, 360], [477, 477], [482, 482], [491, 491]], [[118, 118], [172, 172], [319, 319], [354, 354], [405, 405], [447, 447], [517, 517]], [[170, 175], [339, 341], [417, 419]], [[175, 175], [264, 264], [296, 296], [356, 356]], [[177, 179], [276, 278]], [[190, 191], [193, 214]], [[205, 214], [422, 424]], [[207, 214], [316, 316], [366, 366], [424, 424]], [[343, 345], [488, 489]], [[363, 366], [367, 367]], [[433, 434], [435, 435]]]} 4 | {"doc_key": "INTERSPEECH_2008_21_abs", "dataset": "scierc", "sentences": [["This", "paper", "presents", "a", "research", "on", "the", "Czech", "talking", "head", "system", "."], ["It", "gives", "an", "overview", "of", "methods", "used", "for", "visual", "speech", "animation", ",", "parameterization", "of", "a", "human", "face", "and", "a", "tongue", ",", "necessary", "data", "sources", "and", "a", "synthesis", "method", "."], ["A", "3D", "animation", "model", "is", "used", "for", "a", "pseudo-muscular", "animation", "schema", "to", "create", "such", "animation", "of", "visual", "speech", "which", "is", "usable", "for", "a", "lipreading", "."], ["An", "extension", "of", "animation", "schema", "is", "presented", "to", "reach", "more", "precise", "deformations", "mainly", "in", "a", "lip", "area", "."], ["Furthermore", ",", "a", "problem", "of", "forming", "articulatory", "trajectories", "is", "formulated", "to", "solve", "labial", "coarticulation", "effects", "."], ["It", "is", "used", "for", "the", "synthesis", "method", "based", "on", "a", "selection", "of", "articulatory", "targets", "and", "interpolation", "technique", "."]], "ner": [[[7, 10, "Task"]], [[17, 17, "Generic"], [20, 22, "Task"], [38, 39, "Method"]], [[42, 44, "Method"], [49, 51, "Method"], [55, 58, "Task"], [64, 64, "Task"]], [[69, 70, "Method"]], [[89, 91, "Task"], [96, 98, "OtherScientificTerm"]], [[100, 100, "Generic"], [105, 106, "Method"], [110, 113, 
"OtherScientificTerm"], [115, 116, "Method"]]], "relations": [[], [[17, 17, 20, 22, "USED-FOR"]], [[42, 44, 49, 51, "USED-FOR"], [49, 51, 55, 58, "USED-FOR"], [55, 58, 64, 64, "USED-FOR"]], [], [[89, 91, 96, 98, "USED-FOR"]], [[100, 100, 105, 106, "USED-FOR"], [110, 113, 100, 100, "USED-FOR"], [110, 113, 115, 116, "CONJUNCTION"], [115, 116, 100, 100, "USED-FOR"]]], "clusters": [[[20, 22], [55, 58]], [[89, 91], [100, 100]]]} 5 | -------------------------------------------------------------------------------- /dygie/tests/fixtures/scierc_article.json: -------------------------------------------------------------------------------- 1 | {"clusters": [[[62, 64], [90, 91], [96, 98], [112, 114]], [[6, 6], [170, 170]], [[81, 82], [126, 127]], [[129, 131], [142, 142]]], "sentences": [["In", "this", "paper", ",", "a", "novel", "method", "to", "learn", "the", "intrinsic", "object", "structure", "for", "robust", "visual", "tracking", "is", "proposed", "."], ["The", "basic", "assumption", "is", "that", "the", "parameterized", "object", "state", "lies", "on", "a", "low", "dimensional", "manifold", "and", "can", "be", "learned", "from", "training", "data", "."], ["Based", "on", "this", "assumption", ",", "firstly", "we", "derived", "the", "dimensionality", "reduction", "and", "density", "estimation", "algorithm", "for", "unsupervised", "learning", "of", "object", "intrinsic", "representation", ",", "the", "obtained", "non-rigid", "part", "of", "object", "state", "reduces", "even", "to", "2", "dimensions", "."], ["Secondly", "the", "dynamical", "model", "is", "derived", "and", "trained", "based", "on", "this", "intrinsic", "representation", "."], ["Thirdly", "the", "learned", "intrinsic", "object", "structure", "is", "integrated", "into", "a", "particle-filter", "style", "tracker", "."], ["We", "will", "show", "that", "this", "intrinsic", "object", "representation", "has", "some", "interesting", "properties", "and", "based", "on", "which", "the", "newly", "derived", "dynamical", 
"model", "makes", "particle-filter", "style", "tracker", "more", "robust", "and", "reliable", "."], ["Experiments", "show", "that", "the", "learned", "tracker", "performs", "much", "better", "than", "existing", "trackers", "on", "the", "tracking", "of", "complex", "non-rigid", "motions", "such", "as", "fish", "twisting", "with", "self-occlusion", "and", "large", "inter-frame", "lip", "motion", "."], ["The", "proposed", "method", "also", "has", "the", "potential", "to", "solve", "other", "type", "of", "tracking", "problems", "."]], "ner": [[[6, 6, "Method"], [10, 12, "OtherScientificTerm"], [14, 16, "Task"]], [[26, 28, "OtherScientificTerm"], [32, 34, "OtherScientificTerm"]], [[52, 57, "Method"], [59, 64, "Task"], [62, 64, "Method"], [68, 72, "OtherScientificTerm"]], [[81, 82, "Method"], [90, 91, "Method"]], [[96, 98, "OtherScientificTerm"], [103, 105, "Method"]], [[112, 114, "Method"], [126, 127, "Method"], [129, 131, "Method"]], [[142, 142, "Generic"], [148, 148, "Generic"], [151, 155, "Task"], [153, 155, "OtherScientificTerm"], [158, 159, "OtherScientificTerm"], [161, 161, "OtherScientificTerm"], [164, 166, "OtherScientificTerm"]], [[170, 170, "Generic"], [180, 181, "Task"]]], "relations": [[[6, 6, 10, 12, "USED-FOR"], [10, 12, 14, 16, "USED-FOR"]], [[32, 34, 26, 28, "FEATURE-OF"]], [[52, 57, 59, 64, "USED-FOR"]], [[90, 91, 81, 82, "USED-FOR"]], [[96, 98, 103, 105, "PART-OF"]], [[126, 127, 129, 131, "USED-FOR"]], [[142, 142, 148, 148, "COMPARE"], [142, 142, 151, 155, "USED-FOR"], [148, 148, 151, 155, "USED-FOR"], [158, 159, 153, 155, "HYPONYM-OF"], [161, 161, 158, 159, "FEATURE-OF"], [161, 161, 164, 166, "CONJUNCTION"], [164, 166, 158, 159, "FEATURE-OF"]], [[170, 170, 180, 181, "USED-FOR"]]], "doc_key": "CVPR_2003_18_abs", "dataset": "scierc"} 2 | -------------------------------------------------------------------------------- /dygie/tests/models/__init__.py: -------------------------------------------------------------------------------- 
"""
Unit tests for the coref module.
"""

import json

from allennlp.common.testing import ModelTestCase
from allennlp.nn import util

from dygie.models import DyGIE
from dygie.data import IEJsonReader


class TestCoref(ModelTestCase):
    """Checks that coref evaluation metadata round-trips through the model unchanged."""

    def setUp(self):
        # TODO(dwadden) create smaller model for testing.
        super().setUp()
        self.config_file = "tests/fixtures/dygie_test.jsonnet"
        self.data_file = "tests/fixtures/scierc_article.json"
        self.set_up_model(self.config_file, self.data_file)

    def get_raw_data(self):
        """Parse the JSON-lines data file and return one dict per line."""
        with open(self.data_file, "r") as f:
            return [json.loads(line) for line in f]

    def test_coref_make_evaluation_metadata(self):
        """
        To compute coreference evaluation metrics, the evaluator needs access to the list of
        coreference clusters, given in the same form as the original input. I check to make sure
        that the clusters I pass in are indeed equivalent to the original input.
        """
        # Pull together the relevant training data.
        tensor_dict = self.dataset.as_tensor_dict()
        metadata = tensor_dict["metadata"]
        text_mask = util.get_text_field_mask(tensor_dict["text"]).float()
        sentence_lengths = text_mask.sum(dim=1).long()
        # The mask-derived lengths must agree with the token counts in the metadata.
        assert sentence_lengths.tolist() == [len(entry["sentence"]) for entry in metadata]

        # Convert metadata back to the form used for coref evaluation, and pull out the
        # clusters as plain lists to facilitate comparison.
        evaluation_metadata = self.model._coref._make_evaluation_metadata(metadata, sentence_lengths)
        predicted_clusters = [[list(span) for span in cluster]
                              for cluster in evaluation_metadata[0]["clusters"]]

        # Sort the raw clusters the same way the metadata is sorted, then compare.
        gold_clusters = sorted(self.get_raw_data()[0]["clusters"],
                               key=lambda entry: entry[0][0])

        assert predicted_clusters == gold_clusters


"""
Unit tests for the dygie.
"""

from allennlp.common.testing import ModelTestCase

# TODO(dwadden) Figure out why tests break on CUDA.


class TestDyGIE(ModelTestCase):
    """End-to-end smoke test: the full model trains, saves, and reloads."""

    def setUp(self):
        # TODO(dwadden) create smaller model for testing.
        super().setUp()
        self.config_file = "tests/fixtures/dygie_test_full.jsonnet"
        self.data_file = "tests/fixtures/scierc_article.json"
        self.set_up_model(self.config_file, self.data_file)

    def test_dygie_model_can_train_save_and_load(self):
        # Round-trip training through AllenNLP's standard harness.
        self.ensure_model_can_train_save_and_load(self.param_file)
"""
Unit tests for the relation module.

This module wasn't matching TensorFlow performance so it's tested pretty heavily.
"""

import torch

from allennlp.common.testing import ModelTestCase

# Needed to get the test framework to see the dataset readers and models.
from dygie import models
from dygie import data


class TestRelation(ModelTestCase):
    def setUp(self):
        # Build a small DyGIE model over the SciERC fixture article.
        super(TestRelation, self).setUp()
        self.config_file = "tests/fixtures/dygie_test.jsonnet"
        self.data_file = "tests/fixtures/scierc_article.json"
        self.set_up_model(self.config_file, self.data_file)

    def test_decode(self):
        """Decoding keeps only the top-pruned spans and skips null (-1) relation labels."""
        def convert(x):
            # Map an integer label index back to its string label.
            return self.model.vocab.get_token_from_index(x, namespace="relation_labels")

        top_spans = torch.tensor([[[0, 2], [1, 3], [1, 3]],
                                  [[1, 6], [2, 4], [3, 8]],
                                  [[0, 1], [0, 1], [0, 1]]])
        predicted_relations = torch.tensor([[[-1, -1, 1],
                                             [1, -1, -1],
                                             [-1, 0, -1]],
                                            [[-1, -1, -1],
                                             [1, -1, 2],
                                             [-1, -1, 4]],
                                            [[1, 1, 2],
                                             [1, 3, 2],
                                             [-1, 2, 1]]])
        num_spans_to_keep = torch.tensor([2, 3, 0])
        predict_dict = {"top_spans": top_spans,
                        "predicted_relations": predicted_relations,
                        "num_spans_to_keep": num_spans_to_keep}
        decoded = self.model._relation.decode(predict_dict)
        # Entries outside the kept spans, or labeled -1, must not appear in the output.
        expected = [{((1, 3), (0, 2)): convert(1)},
                    {((2, 4), (1, 6)): convert(1),
                     ((2, 4), (3, 8)): convert(2),
                     ((3, 8), (3, 8)): convert(4)},
                    {}]
        assert expected == decoded["decoded_relations_dict"]

    def test_compute_span_pair_embeddings(self):
        """Pair embedding (i, j) should be [emb_i; emb_j; emb_i * emb_j]."""
        top_span_embeddings = torch.randn([3, 51, 1160])  # Make up random embeddings.

        embeddings = self.model._relation._compute_span_pair_embeddings(top_span_embeddings)

        # Spot-check a single (batch, span_1, span_2) entry against a hand computation.
        batch_ix = 1
        ix1 = 22
        ix2 = 43
        emb1 = top_span_embeddings[batch_ix, ix1]
        emb2 = top_span_embeddings[batch_ix, ix2]
        emb_prod = emb1 * emb2
        emb = torch.cat([emb1, emb2, emb_prod])

        assert torch.allclose(emb, embeddings[batch_ix, ix1, ix2])

    def test_compute_relation_scores(self):
        """A single relation score equals feedforward score plus both mention scores."""
        self.model.eval()  # Need eval on in order to reproduce.
        relation = self.model._relation
        pairwise_embeddings = torch.randn(3, 46, 46, 3480, requires_grad=True)
        top_span_mention_scores = torch.randn(3, 46, 1, requires_grad=True)

        scores = relation._compute_relation_scores(pairwise_embeddings, top_span_mention_scores)

        batch_ix = 0
        ix1 = 31
        ix2 = 4

        # Recompute one entry by hand; a zero is prepended for the null label.
        score = relation._relation_scorer(
            relation._relation_feedforward(pairwise_embeddings[batch_ix, ix1, ix2].unsqueeze(0)))
        score += top_span_mention_scores[batch_ix, ix1] + top_span_mention_scores[batch_ix, ix2]
        score = torch.cat([torch.tensor([0.0]), score.squeeze()])

        assert torch.allclose(scores[batch_ix, ix1, ix2], score)

    def test_get_pruned_gold_relations(self):
        # Getting the pruned gold labels should add one to the input relation labels, then set all
        # the masked entries to -1.
        relation_labels = torch.tensor([[[-1, -1, 2, 3],
                                         [1, -1, -1, 0],
                                         [-1, 3, -1, 1],
                                         [0, -1, -1, -1]],
                                        [[0, 2, 1, 2],
                                         [-1, -1, -1, -1],
                                         [3, 0, -1, -1],
                                         [-1, 0, 1, -1]]])
        top_span_indices = torch.tensor([[0, 1, 3],
                                         [0, 2, 2]])
        top_span_masks = torch.tensor([[1, 1, 1],
                                       [1, 1, 0]]).unsqueeze(-1)

        labels = self.model._relation._get_pruned_gold_relations(
            relation_labels, top_span_indices, top_span_masks)

        expected_labels = torch.tensor([[[0, 0, 4],
                                         [2, 0, 1],
                                         [1, 0, 0]],
                                        [[1, 2, -1],
                                         [4, 0, -1],
                                         [-1, -1, -1]]])

        assert torch.equal(labels, expected_labels)

    def test_cross_entropy_ignore_index(self):
        # Make sure that the cross entropy loss is ignoring entries whose gold label is -1, which
        # corresponds, to masked-out entries.
        relation_scores = torch.randn(2, 3, 3, self.model._relation._n_labels + 1)
        gold_relations = torch.tensor([[[0, 0, 4],
                                        [2, 0, 1],
                                        [1, 0, 0]],
                                       [[1, 2, -1],
                                        [4, 0, -1],
                                        [-1, -1, -1]]])

        # Calculate the loss with a loop over entries.
        total_loss = torch.tensor([0.0])
        for fold in [0, 1]:
            for i in range(3):
                for j in range(3):
                    scores_entry = relation_scores[fold, i, j].unsqueeze(0)
                    gold_entry = gold_relations[fold, i, j].unsqueeze(0)
                    # Entries with gold label -1 are skipped here; the vectorized loss
                    # must skip them too for the totals to agree.
                    if gold_entry >= 0:
                        loss_entry = self.model._relation._loss(scores_entry, gold_entry)
                        total_loss += loss_entry

        model_loss = self.model._relation._get_cross_entropy_loss(relation_scores, gold_relations)
        assert torch.allclose(total_loss, model_loss)


from overrides import overrides
from collections import Counter

from allennlp.training.metrics.metric import Metric

from dygie.training.f1 import compute_f1


def _invert_arguments(arguments, triggers):
    """
    For scoring the argument, we don't need the trigger spans to match exactly. We just need the
    trigger label corresponding to the predicted trigger span to be correct.
    """
    # Can't use a dict because multiple triggers could share the same argument.
    inverted = set()
    for k, v in arguments.items():
        if k[0] in triggers:  # If it's not, the trigger this arg points to is null. TODO(dwadden) check.
            trigger_label = triggers[k[0]]
            to_append = (k[1], trigger_label, v)
            inverted.add(to_append)

    return inverted


# TODO(dwadden) Clean this up.
class EventMetrics(Metric):
    """
    Computes precision, recall, and micro-averaged F1 for triggers and arguments.
    """
    def __init__(self):
        self.reset()
37 | predicted_triggers = predicted_events["trigger_dict"] 38 | gold_triggers = metadata.events.trigger_dict 39 | self._score_triggers(predicted_triggers, gold_triggers) 40 | 41 | # Argument scoring. 42 | predicted_arguments = predicted_events["argument_dict"] 43 | gold_arguments = metadata.events.argument_dict 44 | self._score_arguments( 45 | predicted_triggers, gold_triggers, predicted_arguments, gold_arguments) 46 | 47 | def _score_triggers(self, predicted_triggers, gold_triggers): 48 | self._gold_triggers += len(gold_triggers) 49 | self._predicted_triggers += len(predicted_triggers) 50 | for token_ix, pred in predicted_triggers.items(): 51 | label = pred[0] 52 | # Check whether the offsets match, and whether the labels match. 53 | if token_ix in gold_triggers: 54 | self._matched_trigger_ids += 1 55 | if gold_triggers[token_ix] == label: 56 | self._matched_trigger_classes += 1 57 | 58 | def _score_arguments(self, predicted_triggers, gold_triggers, predicted_arguments, gold_arguments): 59 | # Note that the index of the trigger doesn't actually need to be correct to get full credit; 60 | # the event type and event role need to be correct (see Sec. 3 of paper). 61 | def format(arg_dict, trigger_dict, prediction=False): 62 | # Make it a list of [index, event_type, arg_label]. 63 | res = [] 64 | for (trigger_ix, arg_ix), label in arg_dict.items(): 65 | # If it doesn't match a trigger, don't predict it (enforced in decoding). 66 | if trigger_ix not in trigger_dict: 67 | continue 68 | event_type = trigger_dict[trigger_ix] 69 | # TODO(dwadden) This is clunky; it's because predictions have confidence scores. 
70 | if prediction: 71 | event_type = event_type[0] 72 | label = label[0] 73 | res.append((arg_ix, event_type, label)) 74 | return res 75 | 76 | formatted_gold_arguments = format(gold_arguments, gold_triggers, prediction=False) 77 | formatted_predicted_arguments = format(predicted_arguments, predicted_triggers, prediction=True) 78 | 79 | self._gold_arguments += len(formatted_gold_arguments) 80 | self._predicted_arguments += len(formatted_predicted_arguments) 81 | 82 | # Go through each predicted arg and look for a match. 83 | for entry in formatted_predicted_arguments: 84 | # No credit if not associated with a predicted trigger. 85 | class_match = int(any([entry == gold for gold in formatted_gold_arguments])) 86 | id_match = int(any([entry[:2] == gold[:2] for gold in formatted_gold_arguments])) 87 | 88 | self._matched_argument_classes += class_match 89 | self._matched_argument_ids += id_match 90 | 91 | 92 | @overrides 93 | def get_metric(self, reset=False): 94 | res = {} 95 | 96 | # Triggers 97 | res["trig_id_precision"], res["trig_id_recall"], res["trig_id_f1"] = compute_f1( 98 | self._predicted_triggers, self._gold_triggers, self._matched_trigger_ids) 99 | res["trig_class_precision"], res["trig_class_recall"], res["trig_class_f1"] = compute_f1( 100 | self._predicted_triggers, self._gold_triggers, self._matched_trigger_classes) 101 | 102 | # Arguments 103 | res["arg_id_precision"], res["arg_id_recall"], res["arg_id_f1"] = compute_f1( 104 | self._predicted_arguments, self._gold_arguments, self._matched_argument_ids) 105 | res["arg_class_precision"], res["arg_class_recall"], res["arg_class_f1"] = compute_f1( 106 | self._predicted_arguments, self._gold_arguments, self._matched_argument_classes) 107 | 108 | # Reset counts if at end of epoch. 
109 | if reset: 110 | self.reset() 111 | 112 | return res 113 | 114 | @overrides 115 | def reset(self): 116 | self._gold_triggers = 0 117 | self._predicted_triggers = 0 118 | self._matched_trigger_ids = 0 119 | self._matched_trigger_classes = 0 120 | self._gold_arguments = 0 121 | self._predicted_arguments = 0 122 | self._matched_argument_ids = 0 123 | self._matched_argument_classes = 0 124 | 125 | 126 | class ArgumentStats(Metric): 127 | """ 128 | Compute the fraction of predicted event arguments that are associated with multiple triggers. 129 | """ 130 | def __init__(self): 131 | self.reset() 132 | 133 | @overrides 134 | def __call__(self, predicted_events_list): 135 | for predicted_events in predicted_events_list: 136 | predicted_arguments = _invert_arguments(predicted_events["argument_dict"], 137 | predicted_events["trigger_dict"]) 138 | # Count how many times each span appears as an argument. 139 | span_counts = Counter() 140 | for prediction in predicted_arguments: 141 | span_counts[prediction[0]] += 1 142 | # Count how many spans appear more than once. 143 | repeated = {k: v for k, v in span_counts.items() if v > 1} 144 | self._total_arguments += len(span_counts) 145 | self._repeated_arguments += len(repeated) 146 | 147 | @overrides 148 | def get_metric(self, reset=False): 149 | # Fraction of event arguments associated with multiple triggers. 150 | args_multiple = (self._repeated_arguments / self._total_arguments 151 | if self._total_arguments 152 | else 0) 153 | 154 | if reset: 155 | self.reset() 156 | 157 | res = dict(args_multiple=args_multiple) 158 | return res 159 | 160 | @overrides 161 | def reset(self): 162 | self._total_arguments = 0 163 | self._repeated_arguments = 0 164 | -------------------------------------------------------------------------------- /dygie/training/f1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Function to compute F1 scores. 
3 | """ 4 | 5 | 6 | def safe_div(num, denom): 7 | if denom > 0: 8 | return num / denom 9 | else: 10 | return 0 11 | 12 | 13 | def compute_f1(predicted, gold, matched): 14 | precision = safe_div(matched, predicted) 15 | recall = safe_div(matched, gold) 16 | f1 = safe_div(2 * precision * recall, precision + recall) 17 | return precision, recall, f1 18 | -------------------------------------------------------------------------------- /dygie/training/ner_metrics.py: -------------------------------------------------------------------------------- 1 | from overrides import overrides 2 | from typing import Optional 3 | 4 | import torch 5 | 6 | from allennlp.training.metrics.metric import Metric 7 | 8 | from dygie.training.f1 import compute_f1 9 | 10 | # TODO(dwadden) Need to use the decoded predictions so that we catch the gold examples longer than 11 | # the span boundary. 12 | 13 | class NERMetrics(Metric): 14 | """ 15 | Computes precision, recall, and micro-averaged F1 from a list of predicted and gold labels. 
16 | """ 17 | def __init__(self, number_of_classes: int, none_label: int=0): 18 | self.number_of_classes = number_of_classes 19 | self.none_label = none_label 20 | self.reset() 21 | 22 | @overrides 23 | def __call__(self, 24 | predictions: torch.Tensor, 25 | gold_labels: torch.Tensor, 26 | mask: Optional[torch.Tensor] = None): 27 | predictions = predictions.cpu() 28 | gold_labels = gold_labels.cpu() 29 | mask = mask.cpu() 30 | for i in range(self.number_of_classes): 31 | if i == self.none_label: 32 | continue 33 | self._true_positives += ((predictions==i)*(gold_labels==i)*mask.bool()).sum().item() 34 | self._false_positives += ((predictions==i)*(gold_labels!=i)*mask.bool()).sum().item() 35 | self._true_negatives += ((predictions!=i)*(gold_labels!=i)*mask.bool()).sum().item() 36 | self._false_negatives += ((predictions!=i)*(gold_labels==i)*mask.bool()).sum().item() 37 | 38 | @overrides 39 | def get_metric(self, reset=False): 40 | """ 41 | Returns 42 | ------- 43 | A tuple of the following metrics based on the accumulated count statistics: 44 | precision : float 45 | recall : float 46 | f1-measure : float 47 | """ 48 | predicted = self._true_positives + self._false_positives 49 | gold = self._true_positives + self._false_negatives 50 | matched = self._true_positives 51 | precision, recall, f1_measure = compute_f1(predicted, gold, matched) 52 | 53 | # Reset counts if at end of epoch. 
class RelationMetrics(Metric):
    """
    Computes precision, recall, and micro-averaged F1 from a list of predicted and gold spans.
    """
    def __init__(self):
        self.reset()

    # NOTE: this works on decoded relations rather than label tensors because
    # the dataset reader drops gold spans wider than the maximum span width,
    # so the gold and predicted label tensors are not directly comparable.
    @overrides
    def __call__(self, predicted_relation_list, metadata_list):
        for predicted, meta in zip(predicted_relation_list, metadata_list):
            gold = meta.relation_dict
            self._total_gold += len(gold)
            self._total_predicted += len(predicted)
            # A prediction counts as matched only when the exact span pair
            # exists in the gold dict with the same label.
            self._total_matched += sum(
                1
                for pair, label in predicted.items()
                if pair in gold and gold[pair] == label
            )

    @overrides
    def get_metric(self, reset=False):
        """Return ``(precision, recall, f1)`` from the accumulated counts."""
        precision, recall, f1 = compute_f1(
            self._total_predicted, self._total_gold, self._total_matched
        )

        # Reset counts if at end of epoch.
        if reset:
            self.reset()

        return precision, recall, f1

    @overrides
    def reset(self):
        """Zero all accumulated count statistics."""
        self._total_gold = 0
        self._total_predicted = 0
        self._total_matched = 0
34 | if reset: 35 | self.reset() 36 | 37 | return precision, recall, f1 38 | 39 | @overrides 40 | def reset(self): 41 | self._total_gold = 0 42 | self._total_predicted = 0 43 | self._total_matched = 0 44 | -------------------------------------------------------------------------------- /models/README.txt: -------------------------------------------------------------------------------- 1 | Instructions for using the checkpoint for inference: 2 | 3 | Basic Setup (One time activity) 4 | 5 | 1. Clone the DYGIE++ repository from: https://github.com/dwadden/dygiepp. This repositiory is managed by Wadden et al., authors of the paper Entity, Relation, and Event Extraction with Contextualized Span Representations (https://www.aclweb.org/anthology/D19-1585.pdf). 6 | 7 | git clone https://github.com/dwadden/dygiepp.git 8 | 9 | 2. Navigate to the root of repo in your system and use the following commands to setup the conda environment: 10 | 11 | conda create --name dygiepp python=3.7 12 | pip install -r requirements.txt 13 | conda develop . # Adds DyGIE to your PYTHONPATH 14 | 15 | Running Inference on Radiology Reports 16 | 17 | 3. Activate the conda environment: 18 | 19 | conda activate dygiepp 20 | 21 | 3. Copy the inference.py file to the root of the cloned repo where you have the dygie folder 22 | 23 | 4. 
def get_file_list(path):

    """Gets path to all the reports (.txt format files) in the specified folder, and
    saves it in a temporary json file

    Args:
        path: Path to the folder containing the reports
    """

    # glob's ordering is filesystem-dependent; sort so inference batches are
    # deterministic across runs. (The original also built a pointless list
    # comprehension copy of glob's list.)
    file_list = sorted(glob.glob(f"{path}/*.txt"))

    # Number of files for inference at once depends on the memory available.
    ## Recommended to use no more than batches of 25,000 files

    with open('./temp_file_list.json', 'w') as f:
        json.dump(file_list, f)