├── ner_evaluation ├── __init__.py ├── data │ ├── classes-selective.txt │ ├── classes-total.txt │ └── FirstHAREM-selective-dev.json ├── requirements.txt ├── results_writer.py ├── utils.py ├── dataset.py ├── tokenization.py ├── tag_encoder.py ├── run_bert_harem.py ├── postprocessing.py ├── run_inference.py ├── eval_tools.py ├── README.md ├── preprocessing.py ├── model.py └── trainer.py ├── qualifying_exam-portuguese_named_entity_recognition_using_bert_crf.pdf ├── LICENSE └── README.md /ner_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ner_evaluation/data/classes-selective.txt: -------------------------------------------------------------------------------- 1 | PESSOA 2 | ORGANIZACAO 3 | LOCAL 4 | TEMPO 5 | VALOR -------------------------------------------------------------------------------- /ner_evaluation/data/classes-total.txt: -------------------------------------------------------------------------------- 1 | PESSOA 2 | ORGANIZACAO 3 | LOCAL 4 | TEMPO 5 | VALOR 6 | ABSTRACCAO 7 | ACONTECIMENTO 8 | COISA 9 | OBRA 10 | OUTRO -------------------------------------------------------------------------------- /qualifying_exam-portuguese_named_entity_recognition_using_bert_crf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuralmind-ai/portuguese-bert/HEAD/qualifying_exam-portuguese_named_entity_recognition_using_bert_crf.pdf -------------------------------------------------------------------------------- /ner_evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-transformers==1.1.0 2 | git+https://github.com/kmkurn/pytorch-crf.git@4cd79bc8af55fb0f34a2a39b2e38f0e71c208fd4#egg=pytorch_crf 3 | seqeval==0.0.12 4 | jsonlines==1.2.0 5 | scikit-learn==0.21.2 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, NeuralMind (Fabio Capuano de Souza, Rodrigo Nogueira, 4 | Roberto de Alencar Lotufo) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /ner_evaluation/results_writer.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | from argparse import Namespace 3 | from datetime import datetime 4 | from typing import Any 5 | 6 | from eval_tools import SequenceMetrics 7 | 8 | 9 | def to_float(value): 10 | if isinstance(value, list): 11 | return [float(val) for val in value] 12 | else: 13 | return float(value) 14 | 15 | 16 | def compile_results(args: Namespace, 17 | train_metrics: SequenceMetrics, 18 | valid_metrics: SequenceMetrics, 19 | best_epoch_metric: str = 'f1_score', 20 | **extra_values: Any): 21 | results = { 22 | 'timestamp': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), 23 | } 24 | attrs_args = [ 25 | ('num_train_epochs', 'epochs'), 26 | 'learning_rate', 27 | 'train_batch_size', 28 | 'gradient_accumulation_steps', 29 | 'train_file', 30 | 'valid_file', 31 | 'pooler', 32 | 'freeze_bert', 33 | 'output_dir', 34 | 'labels_file', 35 | 'classifier_lr', 36 | 'no_crf', 37 | 'seed', 38 | 'labels_file', 39 | 'lstm_hidden_size', 40 | 'lstm_layers', 41 | ] 42 | 43 | for attr in attrs_args: 44 | if len(attr) == 2: 45 | source, dest = attr 46 | else: 47 | source = dest = attr 48 | results[dest] = getattr(args, source, None) 49 | 50 | best_epoch = valid_metrics.get_best_epoch(best_epoch_metric) 51 | results['best_epoch'] = best_epoch 52 | 53 | attrs_metrics = [ 54 | 'f1_score', 55 | 'precision', 56 | 'recall', 57 | ] 58 | 59 | for prefix, metrics in [('train', train_metrics), 60 | ('valid', valid_metrics)]: 61 | for attr in attrs_metrics: 62 | key = f'{prefix}_{attr}' 63 | values = metrics.history.get(attr) 64 | if values: 65 | results[key] = to_float(values) 66 | results[f'best_{key}'] = to_float(max(values)) 67 | 68 | results['classification_report'] = valid_metrics.get_value( 69 | 'classification_report', best_epoch) 70 | 71 | for name, value in extra_values.items(): 72 | results[name] = to_float(value) 73 | 74 | return results 75 | 76 | 77 | def write_jsonl_results(results, path): 78 | """Append a line to a jsonlines file.""" 79 | assert path.endswith('.jsonl') 80 | with jsonlines.open(path, 'a') as writer: 81 | writer.write(results) 82 | -------------------------------------------------------------------------------- /ner_evaluation/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import Namespace 4 | from typing import Type, Union 5 | 6 | import torch 7 | from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 8 | 9 | from model import get_model_and_kwargs_for_args 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def save_model(model: Type[torch.nn.Module], args: Namespace) -> None: 16 | """Save a trained model and the associated configuration to output dir.""" 17 | model.save_pretrained(args.output_dir) 18 | torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) 19 | 20 | 21 | def load_model(args: Namespace, 22 | model_path: str, 23 | training: bool = True, 24 | ) -> torch.nn.Module: 25 | """Instantiates a pretrained model from parsed argument values. 26 | 27 | Args: 28 | args: parsed arguments from argv. 29 | model_path: name of model checkpoint or path to a checkpoint directory. 30 | training: if True, loads a model with training-specific parameters. 
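
    Returns:
        The instantiated pretrained model (a torch.nn.Module subclass).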
31 | """ 32 | 33 | model_class, model_kwargs = get_model_and_kwargs_for_args( 34 | args, training=training) 35 | logger.info('model: {}, kwargs: {}'.format( 36 | model_class.__name__, model_kwargs)) 37 | 38 | cache_dir = os.path.join( 39 | PYTORCH_PRETRAINED_BERT_CACHE, 40 | 'distributed_{}'.format(args.local_rank)) 41 | model = model_class.from_pretrained( 42 | model_path, 43 | num_labels=args.num_labels, 44 | cache_dir=cache_dir, 45 | output_hidden_states=True, # Ensure all hidden states are returned 46 | **model_kwargs) 47 | 48 | return model 49 | 50 | 51 | class ExponentialAccumulator: 52 | """Exponential moving average train loss tracker.""" 53 | 54 | def __init__(self, beta: float = 0.99): 55 | self._accum = None 56 | self.beta = beta 57 | 58 | def insert_value(self, value: float) -> float: 59 | if self._accum is None: 60 | self._accum = value 61 | else: 62 | self._accum = self.beta * self._accum + (1 - self.beta) * value 63 | 64 | return self._accum 65 | 66 | 67 | class RunningAccumulator: 68 | """Loss value running accumulator.""" 69 | 70 | def __init__(self): 71 | self.total = 0 72 | self.num_values = 0 73 | 74 | def accumulate(self, value: Union[torch.Tensor, float]): 75 | if torch.is_tensor(value): 76 | with torch.no_grad(): 77 | self.total += value.item() 78 | else: 79 | self.total += value 80 | 81 | self.num_values += 1 82 | 83 | def mean(self) -> float: 84 | return self.total / self.num_values 85 | -------------------------------------------------------------------------------- /ner_evaluation/dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | from typing import List, Tuple 5 | 6 | import torch 7 | from torch.utils.data import ( 8 | Dataset, 9 | DataLoader, 10 | TensorDataset, 11 | ) 12 | from tqdm import tqdm 13 | 14 | from model import BertForNERClassification 15 | from preprocessing import InputSpan 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def get_dataset(features: List[InputSpan]) -> TensorDataset: 22 | """Generate a TensorDataset from lists of tensors.""" 23 | all_input_ids = torch.tensor( 24 | [f.input_ids for f in features], dtype=torch.long) 25 | all_input_mask = torch.tensor( 26 | [f.input_mask for f in features], dtype=torch.long) 27 | all_segment_ids = torch.tensor( 28 | [f.segment_ids for f in features], dtype=torch.long) 29 | all_label_ids = torch.tensor( 30 | [f.label_ids for f in features], dtype=torch.long) 31 | all_prediction_mask = torch.tensor( 32 | [f.prediction_mask for f in features], dtype=torch.uint8) 33 | all_example_index = torch.tensor( 34 | [f.example_index for f in features], dtype=torch.long) 35 | all_doc_span_index = torch.tensor( 36 | [f.doc_span_index for f in features], dtype=torch.long) 37 | 38 | return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, 39 | all_label_ids, all_prediction_mask, 40 | all_example_index, all_doc_span_index) 41 | 42 | 43 | def get_bert_encoded_features(model: BertForNERClassification, 44 | dataset: Dataset, 45 | batch_size: int, 46 | device: torch.device, 47 | ) -> Tuple[torch.Tensor, ...]: 48 | """Returns a BERT encoded tensors of the dataset, to be used to speed up 49 | the training of the classifier model with frozen BERT.""" 50 | model.eval() 51 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False) 52 | 53 | all_encoded_inputs = [] 54 | 55 | with torch.no_grad(): 56 | for batch in tqdm(dataloader, desc="Getting frozen BERT features"): 57 | batch = 
tuple(t.to(device) for t in batch) 58 | input_ids, input_mask, segment_ids, *_ = batch 59 | 60 | encoded_batch = model.bert_encode( 61 | input_ids, segment_ids, input_mask) 62 | encoded_batch = encoded_batch.cpu() 63 | all_encoded_inputs.append(encoded_batch) 64 | 65 | all_encoded_inputs = torch.cat(all_encoded_inputs, dim=0) 66 | 67 | return (all_encoded_inputs, 68 | *dataset.tensors[1:]) 69 | 70 | 71 | def get_bert_encoded_dataset(model: BertForNERClassification, 72 | dataset: Dataset, 73 | batch_size: int, 74 | device: torch.device, 75 | ) -> TensorDataset: 76 | """Returns a BERT encoded version of the dataset, to be used to speed up 77 | the training of the classifier model with frozen BERT.""" 78 | encoded_data = get_bert_encoded_features( 79 | model, dataset, batch_size, device) 80 | 81 | return TensorDataset(*encoded_data) 82 | -------------------------------------------------------------------------------- /ner_evaluation/tokenization.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from pytorch_transformers.tokenization_bert import ( 4 | _is_punctuation as is_punctuation, 5 | _is_whitespace as is_whitespace, 6 | ) 7 | 8 | 9 | class Token(object): 10 | """Info about a single token.""" 11 | 12 | def __init__(self, 13 | text: str, 14 | offset: int, 15 | index: int, 16 | tail: str = '', 17 | tag: str = None): 18 | 19 | if not isinstance(text, str) or not text: 20 | raise TypeError('text should be a non-empty string.') 21 | if not isinstance(offset, int) or offset < 0: 22 | raise TypeError('offset should be an int >= 0.') 23 | if not isinstance(index, int) or index < 0: 24 | raise TypeError('index should be an int >= 0.') 25 | 26 | self.text = text 27 | self.offset = offset 28 | self.tail = tail 29 | self.tag = tag 30 | self._example = None 31 | self._index = index 32 | 33 | def __str__(self): 34 | return '{}{}'.format(self.text, self.tail) 35 | 36 | def __repr__(self): 37 | return 'Token(text=%r, offset=%r, index=%r, tail=%r, tag=%r)' % \ 38 | (self.text, self.offset, self.index, self.tail, self.tag) 39 | 40 | def __len__(self): 41 | return len(self.text) + len(self.tail) 42 | 43 | def __add__(self, char): 44 | self.text += char 45 | return self 46 | 47 | @property 48 | def example(self): 49 | return self._example 50 | 51 | @property 52 | def index(self): 53 | return self._index 54 | 55 | @property 56 | def is_punct(self): 57 | return is_punctuation(self.text) 58 | 59 | def has_tail(self): 60 | return bool(self.tail) 61 | 62 | @property 63 | def nbor(self): 64 | """Returns the neighboring token, e.g., 65 | self._example.doc_tokens[self.index + 1].""" 66 | if self.index is None: 67 | return None 68 | try: 69 | return self._example.doc_tokens[self.index + 1] 70 | except IndexError: 71 | return None 72 | 73 | 74 | def reconstruct_text_from_tokens(tokens: List[Token], 75 | include_last_tail: bool = False, 76 | ) -> str: 77 | """Concatenates the text of a sequence of tokens.""" 78 | def text_generator(tokens): 79 | for i, token in enumerate(tokens): 80 | yield token.text 81 | if i < len(tokens) - 1 or include_last_tail: 82 | yield token.tail 83 | 84 | return ''.join(piece for piece in text_generator(tokens)) 85 | 86 | 87 | class TokenizerWithAlignment: 88 | """Tokenizer that performs basic tokenization keeping string alignment.""" 89 | 90 | def __init__(self): 91 | pass 92 | 93 | @staticmethod 94 | def _begin_new_token(doc_tokens, text, offset): 95 | token = Token(text=text, offset=offset, index=len(doc_tokens)) 
96 | doc_tokens.append(token) 97 | 98 | return token 99 | 100 | def tokenize(self, text: str) -> Tuple[List[Token], List[int]]: 101 | doc_tokens = [] 102 | char_to_word_offset = [] 103 | 104 | new_word = True 105 | curr_token = None 106 | 107 | for offset, c in enumerate(text): 108 | if is_whitespace(c): 109 | new_word = True 110 | if curr_token: 111 | curr_token.tail += c 112 | else: 113 | if is_punctuation(c): 114 | curr_token = self._begin_new_token(doc_tokens, c, offset) 115 | new_word = True 116 | else: 117 | if new_word: 118 | curr_token = self._begin_new_token( 119 | doc_tokens, c, offset) 120 | else: 121 | curr_token += c 122 | new_word = False 123 | 124 | # OBS: Whitespaces that appear before any tokens will have offset -1 125 | # char_to_word_offset.append(len(doc_tokens) - 1) 126 | char_to_word_offset.append(max(0, len(doc_tokens) - 1)) 127 | 128 | return doc_tokens, char_to_word_offset 129 | 130 | def __call__(self, text: str) -> Tuple[List[Token], List[int]]: 131 | return self.tokenize(text) 132 | -------------------------------------------------------------------------------- /ner_evaluation/tag_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines NER tag encoder for predefined coding schemes. 3 | """ 4 | from typing import List 5 | 6 | BIO = 'BIO' 7 | BILUO = 'BILUO' 8 | 9 | SCHEMES = { 10 | BIO: ['B', 'I'], 11 | BILUO: ['B', 'I', 'L', 'U'], 12 | } 13 | 14 | VALID_TRANSITIONS = { 15 | BIO: { 16 | 'B': ['B', 'I', 'O'], 17 | 'I': ['B', 'I', 'O'], 18 | 'O': ['B', 'O'], 19 | }, 20 | BILUO: { 21 | 'B': ['I', 'L'], 22 | 'I': ['I', 'L'], 23 | 'L': ['B', 'U', 'O'], 24 | 'U': ['B', 'U', 'O'], 25 | 'O': ['B', 'U', 'O'], 26 | }, 27 | } 28 | 29 | 30 | class NERTagsEncoder(object): 31 | """Handles creation of NER tags for a list of named entity classes and 32 | conversion of tags to ids and vice versa.""" 33 | 34 | def __init__(self, 35 | classes: List[str], 36 | scheme: str = BIO, 37 | ignore_index: int = -100): 38 | 39 | if not len(set(classes)) == len(classes): 40 | raise ValueError("`classes` have duplicate entries.") 41 | if "O" in classes or "X" in classes: 42 | raise ValueError("`classes` should not have tag O nor X.") 43 | if ignore_index >= 0 or not isinstance(ignore_index, int): 44 | raise ValueError("`ignore_index` should be a negative int.") 45 | if scheme not in SCHEMES: 46 | raise ValueError("`scheme` should be one of {}".format( 47 | tuple(SCHEMES.keys()))) 48 | 49 | self.classes = tuple(classes) 50 | self.tags = ["O"] 51 | self.ignore_index = ignore_index 52 | self.tag_to_id = {"X": ignore_index} 53 | self.scheme = scheme 54 | 55 | for clss in classes: 56 | for subtag in SCHEMES[scheme]: 57 | self.tags.append(f"{subtag}-{clss}") 58 | 59 | for i, tag in enumerate(self.tags): 60 | self.tag_to_id[tag] = i 61 | 62 | def __repr__(self): 63 | return ('{class_}(classes={classes!r}, scheme={scheme!r})') \ 64 | .format(class_=self.__class__.__name__, 65 | classes=self.classes, 66 | scheme=self.scheme) 67 | 68 | @classmethod 69 | def from_labels_file(cls, filepath: str, *args, **kwargs): 70 | """Creates encoder from a file with NER label classes (one class per 71 | line) and a given scheme.""" 72 | with open(filepath, 'r') as fd: 73 | ner_classes = [clss for clss in fd.read().splitlines() if clss] 74 | 75 | return cls(ner_classes, *args, **kwargs) 76 | 77 | @property 78 | def num_labels(self) -> int: 79 | return len(self.tags) 80 | 81 | def convert_tags_to_ids(self, tags: List[str]) -> List[int]: 82 | """Converts a list of 
tag strings to a list of tag ids.""" 83 | return [self.tag_to_id[tag] for tag in tags] 84 | 85 | def convert_ids_to_tags(self, tag_ids: List[int]) -> List[str]: 86 | """Returns a list of tag strings from a list of tag ids.""" 87 | return [self.tags[tag_id] for tag_id in tag_ids] 88 | 89 | def decode_valid(self, tag_sequence: List[str]) -> List[str]: 90 | """Processes a list of tag strings to remove invalid predictions given 91 | the valid transitions of the tag scheme, such as "I" tags coming after 92 | "O" tags.""" 93 | if self.scheme == BILUO: 94 | import warnings 95 | warnings.warn(f"Valid decoding for BILUO scheme is not implemented. Returning input sequence.") 96 | return tag_sequence 97 | 98 | prev_tag = 'O' 99 | prev_type = 'O' 100 | 101 | final = [] 102 | for tag_and_cls in tag_sequence: 103 | tag = tag_and_cls[0] 104 | type_ = tag_and_cls.split('-')[-1] 105 | valid_transitions = VALID_TRANSITIONS[self.scheme][prev_tag] 106 | 107 | valid_tag = False 108 | if tag in valid_transitions: 109 | if tag in ('B', 'O'): 110 | valid_tag = True 111 | elif tag == 'I' and type_ == prev_type: 112 | valid_tag = True 113 | 114 | if valid_tag: 115 | prev_tag = tag 116 | prev_type = type_ 117 | final.append(tag_and_cls) 118 | else: 119 | prev_tag = 'O' 120 | prev_type = 'O' 121 | final.append('O') 122 | 123 | return final -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # BERTimbau - Portuguese BERT 3 | 4 | This repository contains pre-trained [BERT](https://github.com/google-research/bert) models trained on the Portuguese language. BERT-Base and BERT-Large Cased variants were trained on the [BrWaC (Brazilian Web as Corpus)](https://www.researchgate.net/publication/326303825_The_brWaC_Corpus_A_New_Open_Resource_for_Brazilian_Portuguese), a large Portuguese corpus, for 1,000,000 steps, using whole-word mask. Model artifacts for TensorFlow and PyTorch can be found below. 5 | 6 | The models are a result of an ongoing Master's Program. The [text submission for Qualifying Exam](qualifying_exam-portuguese_named_entity_recognition_using_bert_crf.pdf) is also included in the repository in PDF format, which contains more details about the pre-training procedure, vocabulary generation and downstream usage in the task of Named Entity Recognition. 7 | 8 | ## Download 9 | 10 | The base and large models are available at [Hugging Face](https://huggingface.co/neuralmind) 11 | 12 | 13 | ## Evaluation benchmarks 14 | 15 | The models were benchmarked on three tasks (Sentence Textual Similarity, Recognizing Textual Entailment and Named Entity Recognition) and compared to previous published results and [Multilingual BERT](https://github.com/google-research/bert/blob/master/multilingual.md). Metrics are: Pearson's correlation for STS and F1-score for RTE and NER. 
16 | 17 | | Task | Test Dataset | BERTimbau-Large | BERTimbau-Base | mBERT | Previous SOTA | 18 | |:----:|:----------------------:|:---------------:|:-------------: | :-----:| :--------------------:| 19 | | STS | ASSIN2 | **0.852** | 0.836 | 0.809 | 0.83 [[1]](#References) | 20 | | RTE | ASSIN2 | **90.0** | 89.2 | 86.8 | 88.3 [[1]](#References) | 21 | | NER | MiniHAREM (5 classes) | **83.7** | 83.1 | 79.2 | 82.3 [[2]](#References) | 22 | | NER | MiniHAREM (10 classes) | **78.5** | 77.6 | 73.1 | 74.6 [[2]](#References) | 23 | 24 | ### NER experiments code 25 | 26 | Code and instructions to reproduce the Named Entity Recognition experiments are in [`ner_evaluation/`](ner_evaluation/) directory. 27 | 28 | 29 | ## PyTorch usage example 30 | 31 | Our PyTorch artifacts are compatible with the [🤗Huggingface Transformers](https://github.com/huggingface/transformers) library and are also available on the [Community models](https://huggingface.co/models): 32 | 33 | - [BERTimbau Base model card](https://huggingface.co/neuralmind/bert-base-portuguese-cased) 34 | - [BERTimbau Large model card](https://huggingface.co/neuralmind/bert-large-portuguese-cased) 35 | 36 | ```python 37 | from transformers import AutoModel, AutoTokenizer 38 | 39 | # Using the community model 40 | # BERT Base 41 | tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased') 42 | model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased') 43 | 44 | # BERT Large 45 | tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased') 46 | model = AutoModel.from_pretrained('neuralmind/bert-large-portuguese-cased') 47 | 48 | # or, using BertModel and BertTokenizer directly 49 | from transformers import BertModel, BertTokenizer 50 | 51 | tokenizer = BertTokenizer.from_pretrained('path/to/vocab.txt', do_lower_case=False) 52 | model = BertModel.from_pretrained('path/to/bert_dir') # Or other BERT model class 53 | ``` 54 | 55 | ## Acknowledgement 56 | 57 | We would like to thank Google for Cloud credits under a research grant that allowed us to train these models. 58 | 59 | ## References 60 | 61 | [1] [Multilingual Transformer Ensembles for Portuguese Natural Language Task](https://www.researchgate.net/publication/340236502_Multilingual_Transformer_Ensembles_for_Portuguese_Natural_Language_Tasks) 62 | 63 | [2] [Assessing the Impact of Contextual Embeddings for Portuguese Named Entity Recognition](https://github.com/jneto04/ner-pt) 64 | 65 | 66 | ## How to cite this work 67 | 68 | @InProceedings{souza2020bertimbau, 69 | author="Souza, F{\'a}bio and Nogueira, Rodrigo and Lotufo, Roberto", 70 | editor="Cerri, Ricardo and Prati, Ronaldo C.", 71 | title="BERTimbau: Pretrained BERT Models for Brazilian Portuguese", 72 | booktitle="Intelligent Systems", 73 | year="2020", 74 | publisher="Springer International Publishing", 75 | address="Cham", 76 | pages="403--417", 77 | isbn="978-3-030-61377-8" 78 | } 79 | 80 | 81 | 82 | @article{souza2019portuguese, 83 | title={Portuguese Named Entity Recognition using BERT-CRF}, 84 | author={Souza, F{\'a}bio and Nogueira, Rodrigo and Lotufo, Roberto}, 85 | journal={arXiv preprint arXiv:1909.10649}, 86 | url={http://arxiv.org/abs/1909.10649}, 87 | year={2019} 88 | } 89 | -------------------------------------------------------------------------------- /ner_evaluation/run_bert_harem.py: -------------------------------------------------------------------------------- 1 | """Training and evaluation entry point for HAREM experiments. 
2 | 3 | This file simply defines a function that loads input data into Example 4 | instances for training/evaluation and defines evaluation metrics for each 5 | dataset split set. 6 | 7 | Since `load_and_cache_examples` function below uses 8 | `preprocessing.read_examples` to read the JSON dataset files. See its docstring 9 | for a description of the JSON structure. 10 | """ 11 | 12 | import logging 13 | from argparse import Namespace 14 | from typing import List, Tuple 15 | 16 | import torch 17 | from pytorch_transformers import BertTokenizer 18 | from seqeval.metrics import (classification_report, 19 | f1_score, 20 | precision_score, 21 | recall_score) 22 | from torch.utils.data import Dataset 23 | 24 | from dataset import get_dataset 25 | from eval_tools import confusion_matrix_nested, filtered, SequenceMetrics 26 | from preprocessing import (Example, InputSpan, get_features_from_examples, 27 | read_examples) 28 | from tag_encoder import NERTagsEncoder 29 | from trainer import main 30 | 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | 35 | def load_and_cache_examples( 36 | args: Namespace, 37 | tokenizer: BertTokenizer, 38 | tag_encoder: NERTagsEncoder, 39 | mode: str, 40 | ) -> Tuple[Dataset, List[Example], List[InputSpan]]: 41 | """Preprocesses an input JSON file with raw training/evaluation 42 | examples and to BERT format according to the provided args (tokenizer, 43 | tag_encoder/scheme, max sequence length, doc stride, etc).""" 44 | if args.local_rank not in [-1, 0]: 45 | # Make sure only the first process in distributed training process 46 | # the dataset, and the others will use the cache. 47 | # TODO: Verify if this is working as expected. 48 | torch.distributed.barrier() 49 | 50 | if mode == 'train': 51 | input_file = args.train_file 52 | elif mode == 'valid': 53 | input_file = args.valid_file 54 | else: 55 | assert mode == 'eval', f"Invalid mode: {mode}" 56 | input_file = args.eval_file 57 | 58 | # HAREM dataset specific sanity checks 59 | # Assert all files use the same scenario (selective or total). 60 | scenario = 'selective' if 'selective' in input_file else 'total' 61 | assert scenario in args.labels_file 62 | 63 | examples = read_examples( 64 | input_file=input_file, 65 | is_training=True, 66 | classes=tag_encoder.classes, 67 | scheme=args.scheme) 68 | features = get_features_from_examples( 69 | examples, 70 | tag_encoder, 71 | tokenizer, 72 | args, 73 | mode=mode, 74 | unique_id_start=1000000000, 75 | verbose=args.verbose_logging) 76 | 77 | if mode != 'eval': 78 | if args.few_samples != -1: 79 | logger.info('Limiting dataset to %d examples.', 80 | args.few_samples) 81 | examples = examples[:args.few_samples] 82 | features = list(filter( 83 | lambda f: f.example_index < args.few_samples, features)) 84 | logger.info('Final features: %d', len(features)) 85 | 86 | if args.local_rank == 0: 87 | # Make sure only the first process in distributed training process 88 | # the dataset, and the others will use the cache 89 | # TODO: Verify if this is working as expected. 90 | torch.distributed.barrier() 91 | 92 | dataset = get_dataset(features) 93 | 94 | return dataset, examples, features 95 | 96 | 97 | def get_train_metrics_fn(tag_encoder) -> SequenceMetrics: 98 | """Get SequenceMetrics instance for evaluating on the train data.""" 99 | metrics = [ 100 | ('f1_score', f1_score) 101 | ] 102 | return SequenceMetrics(metrics) 103 | 104 | 105 | def get_eval_metrics_fn(tag_encoder) -> SequenceMetrics: 106 | """Get SequenceMetrics instance for evaluating on the evaluation data. 
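
    The F1, precision, recall and classification report metrics are wrapped
    with `filtered`, which removes invalid tag transitions from the
    predictions before scoring.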
107 | """ 108 | metrics = [ 109 | ('f1_score', filtered(f1_score, tag_encoder)), 110 | ('precision', filtered( 111 | precision_score, tag_encoder)), 112 | ('recall', filtered( 113 | recall_score, tag_encoder)), 114 | ('classification_report', 115 | filtered(classification_report, tag_encoder, digits=4)), 116 | ('confusion_matrix', confusion_matrix_nested), 117 | ] 118 | 119 | return SequenceMetrics(metrics) 120 | 121 | 122 | if __name__ == "__main__": 123 | 124 | logging.basicConfig( 125 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 126 | datefmt='%m/%d/%Y %H:%M:%S', 127 | level=logging.INFO) 128 | 129 | main(load_and_cache_examples, 130 | get_train_metrics_fn=get_train_metrics_fn, 131 | get_valid_metrics_fn=get_eval_metrics_fn, # same as evaluation 132 | get_eval_metrics_fn=get_eval_metrics_fn, 133 | ) 134 | -------------------------------------------------------------------------------- /ner_evaluation/postprocessing.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def select_max_context_tokens(y_pred, prediction_mask, token_is_max_context): 8 | """Selects y_pred elements masked by prediction_mask & 9 | token_is_max_context. 10 | `y_pred` can be the output of any BERT model, and hence does not have a 11 | fixed expected length nor type. 12 | 13 | Shapes: 14 | ------- 15 | y_pred: [seq_length] or [sum(prediction_mask)]. Shape depends on whether the 16 | BERT model has a CRF layer. 17 | prediction_mask: [seq_length] 18 | token_is_max_context: Variable length. Ranges from [doc_stride] up to 19 | [seg_length - 1]. 20 | """ 21 | # Remove [CLS] token from prediction_mask 22 | prediction_mask = np.asarray(prediction_mask[1:], dtype=np.bool) 23 | max_context_mask = np.asarray(token_is_max_context, dtype=np.bool) 24 | 25 | if len(max_context_mask) < len(prediction_mask): 26 | # Right pad max_context with zeros to the size of prediction_mask 27 | right_pad = len(prediction_mask) - len(max_context_mask) 28 | max_context_mask = np.pad(max_context_mask, (0, right_pad), 29 | mode='constant', constant_values=(0, 0)) 30 | 31 | # 1st case: y_pred is output of CRF layer 32 | if isinstance(y_pred, list): 33 | # y_pred is output of CRF layer (already masked by prediction_mask) 34 | # So we have to index max_context_mask by prediction_mask 35 | assert len(y_pred) == sum(prediction_mask) 36 | out_mask = max_context_mask[prediction_mask] 37 | 38 | else: 39 | y_pred = y_pred[1:] # Remove [CLS] token 40 | 41 | if len(y_pred) == len(prediction_mask): 42 | # 2nd case: output of BERT model 43 | out_mask = prediction_mask & max_context_mask 44 | 45 | else: 46 | # y_pred is output of BERT-LSTM, that outputs arrays of variable 47 | # length (same size as non-masked input, i.e. sum(input_mask). 48 | # We just need to adjust the masks to have the same length as the 49 | # output. 
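            # Sanity check: the truncated tail must contain no prediction or
            # max-context tokens.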
50 | assert prediction_mask[len(y_pred):].sum() == 0 51 | assert max_context_mask[len(y_pred):].sum() == 0 52 | prediction_mask = prediction_mask[:len(y_pred)] 53 | max_context_mask = max_context_mask[:len(y_pred)] 54 | 55 | out_mask = prediction_mask & max_context_mask 56 | 57 | return np.asarray(y_pred)[out_mask] 58 | 59 | 60 | def concatenate(list_tensors): 61 | """Concatenates a list of arrays/tensors/list.""" 62 | 63 | if isinstance(list_tensors[0], np.ndarray): 64 | return np.concatenate(list_tensors) 65 | 66 | if isinstance(list_tensors[0], torch.Tensor): 67 | return torch.cat(list_tensors) 68 | 69 | if isinstance(list_tensors[0], list): 70 | output = [] 71 | for tensor in list_tensors: 72 | output.extend(tensor) 73 | return output 74 | 75 | raise TypeError(f"Received invalid type: {type(list_tensors[0])}") 76 | 77 | 78 | class MissingPartialOutputError(Exception): 79 | pass 80 | 81 | 82 | class OutputComposer: 83 | """Combines the output of split examples using the max context tokens of 84 | each span.""" 85 | 86 | def __init__(self, examples, features, output_transform_fn=None): 87 | self.examples = examples 88 | self.features = features 89 | self.ix2feature = defaultdict(dict) 90 | for feat in features: 91 | self.ix2feature[feat.example_index][feat.doc_span_index] = feat 92 | 93 | self.output_transform_fn = output_transform_fn 94 | self.reset() 95 | 96 | def reset(self): 97 | """Clear all partial outputs.""" 98 | self.partial_outputs = {i: {} for i in range(len(self.examples))} 99 | 100 | def insert_partial_output(self, example_ix, doc_span_ix, output): 101 | """Selects max context tokens from partial output.""" 102 | feature = self.ix2feature[example_ix][doc_span_ix] 103 | output = select_max_context_tokens(output, 104 | feature.prediction_mask, 105 | feature.token_is_max_context) 106 | self.partial_outputs[example_ix][doc_span_ix] = output 107 | 108 | def insert_batch(self, example_ixs, doc_span_ixs, batch_output): 109 | """Insert a batch of partial predictions.""" 110 | for output, example_ix, doc_span_ix in zip(batch_output, 111 | example_ixs, 112 | doc_span_ixs): 113 | self.insert_partial_output( 114 | example_ix.item(), doc_span_ix.item(), output) 115 | 116 | def get_example_output(self, example_ix): 117 | """Returns the final output of an example.""" 118 | N_spans = len(self.ix2feature[example_ix]) 119 | try: 120 | example_partial_outputs = [ 121 | self.partial_outputs[example_ix].get(j, []) for j in range(N_spans) 122 | ] 123 | except KeyError as err: 124 | span_ix = err.args[0] 125 | msg = (f"Missing partial output for example {example_ix}, span " 126 | f"{span_ix}.") 127 | raise MissingPartialOutputError(msg) from None 128 | 129 | complete_output = concatenate(example_partial_outputs) 130 | assert len(complete_output) == len( 131 | self.examples[example_ix].doc_tokens) 132 | 133 | if self.output_transform_fn is not None: 134 | transformed_output = self.output_transform_fn(complete_output) 135 | return transformed_output 136 | 137 | return complete_output 138 | 139 | def get_outputs(self): 140 | """Returns a list of max-context-combined outputs of all examples.""" 141 | outputs = [] 142 | for example_ix in range(len(self.examples)): 143 | example_output = self.get_example_output(example_ix) 144 | outputs.append(example_output) 145 | 146 | return outputs 147 | -------------------------------------------------------------------------------- /ner_evaluation/run_inference.py: -------------------------------------------------------------------------------- 1 | """This script 
is an example on how to perform NER inference on plain texts. 2 | 3 | Input file must be either a JSON file (that can have multiple documents) or a 4 | txt file with a single document. 5 | """ 6 | import json 7 | import logging 8 | import os 9 | import tempfile 10 | from argparse import ArgumentParser, Namespace 11 | from typing import List, Tuple 12 | 13 | import torch 14 | from pytorch_transformers.tokenization_bert import BertTokenizer 15 | from torch.utils.data import DataLoader, Dataset 16 | from tqdm import tqdm 17 | 18 | from dataset import get_dataset 19 | from eval_tools import (SequenceMetrics, write_conll_prediction_file, 20 | write_outputs_to_json) 21 | from postprocessing import OutputComposer 22 | from preprocessing import (Example, InputSpan, get_features_from_examples, 23 | read_examples) 24 | from tag_encoder import NERTagsEncoder 25 | from trainer import evaluate 26 | from utils import load_model 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def convert_txt_to_tmp_json_file(txt_file: str) -> str: 32 | """Converts a txt file with inference content to a JSON file with schema 33 | expected by read_examples. Returns a filename to the temp JSON file.""" 34 | with open(txt_file) as fd: 35 | text = fd.read() 36 | 37 | tmp_file = tempfile.NamedTemporaryFile(mode='w', delete=False) 38 | json_data = [{"doc_id": 0, "doc_text": text}] 39 | 40 | tmp_file.write(json.dumps(json_data)) 41 | tmp_file.close() 42 | 43 | return tmp_file.name 44 | 45 | 46 | def load_and_cache_examples( 47 | input_file: str, 48 | args: Namespace, 49 | tokenizer: BertTokenizer, 50 | tag_encoder: NERTagsEncoder, 51 | mode: str, 52 | ) -> Tuple[Dataset, List[Example], List[InputSpan]]: 53 | """Preprocesses an input JSON file to generate inference examples and 54 | convert to BERT format according to the provided args (tokenizer, 55 | tag_encoder/scheme, max sequence length, doc stride, etc).""" 56 | 57 | examples = read_examples( 58 | input_file=input_file, 59 | is_training=False, 60 | classes=tag_encoder.classes, 61 | scheme=args.scheme) 62 | features = get_features_from_examples( 63 | examples, 64 | tag_encoder, 65 | tokenizer, 66 | args, 67 | mode=mode, 68 | unique_id_start=0, 69 | verbose=args.verbose_logging) 70 | 71 | dataset = get_dataset(features) 72 | 73 | return dataset, examples, features 74 | 75 | 76 | if __name__ == "__main__": 77 | 78 | parser = ArgumentParser("NER inference CLI") 79 | 80 | # Model and hyperparameters 81 | parser.add_argument("--input_file", 82 | required=True, 83 | help="File to load examples for inference (JSON or " 84 | "txt).") 85 | parser.add_argument("--output_file", 86 | default='-', 87 | help="File to save prediction results. Defaults to " 88 | "stdout.") 89 | parser.add_argument("--output_format", 90 | choices=("json", "conll"), 91 | default="json", 92 | help="Format to save the predictions (json or conll). " 93 | "Defaults to json.") 94 | 95 | parser.add_argument("--bert_model", default=None, type=str, required=True, 96 | help="Bert pre-trained model name or path to a " 97 | "checkpoint directory.") 98 | parser.add_argument("--tokenizer_model", default=None, type=str, 99 | required=False, 100 | help="Path to tokenizer files. If empty, defaults to " 101 | "--bert_model.") 102 | parser.add_argument("--do_lower_case", 103 | action='store_true', 104 | help="Whether to lower case the input text. 
True for " 105 | "uncased models, False for cased models.") 106 | parser.add_argument("--max_seq_length", default=512, type=int, 107 | help="The maximum total input sequence length after " 108 | "WordPiece tokenization. Sequences longer than this " 109 | "will be split into multiple spans, and sequences " 110 | "shorter than this will be padded.") 111 | parser.add_argument("--doc_stride", default=128, type=int, 112 | help="When splitting up a long document into chunks, " 113 | "how much stride to take between chunks.") 114 | parser.add_argument('--labels_file', 115 | required=True, 116 | help="File with all NER classes to be considered, one " 117 | "per line.") 118 | parser.add_argument('--scheme', 119 | default='bio', help='NER tagging scheme (BIO|BILUO).') 120 | parser.add_argument('--no_crf', 121 | action='store_true', 122 | help='Remove the CRF layer (use plain BERT or ' 123 | 'BERT-LSTM).') 124 | parser.add_argument('--pooler', 125 | default='last', 126 | help='Pooling strategy for extracting BERT encoded ' 127 | 'features from last BERT layers. ' 128 | 'One of "last", "sum" or "concat".') 129 | parser.add_argument('--freeze_bert', 130 | action='store_true', 131 | help="Freeze BERT layers' parameters. If True, uses " 132 | "either a BERT-LSTM or BERT-LSTM-CRF model.") 133 | parser.add_argument('--lstm_hidden_size', 134 | type=int, 135 | default=100, 136 | help=('Hidden dimension of the LSTM (only used when ' 137 | 'the BERT model is frozen.')) 138 | parser.add_argument('--lstm_layers', 139 | type=int, 140 | default=1, 141 | help=('Number of LSTM layers (only used when the BERT ' 142 | 'model is frozen.')) 143 | parser.add_argument('--no_cuda', action='store_true', 144 | help='Disables CUDA devices for inference.') 145 | parser.add_argument('--batch_size', type=int, 146 | default=1, help='Batch size.') 147 | parser.add_argument('--verbose_logging', action='store_true') 148 | 149 | args = parser.parse_args() 150 | args.local_rank = -1 151 | 152 | logging.basicConfig() 153 | 154 | if torch.cuda.is_available and not args.no_cuda: 155 | args.device = torch.device("cuda") 156 | args.n_gpu = 1 157 | else: 158 | args.device = torch.device("cpu") 159 | args.n_gpu = 0 160 | 161 | tokenizer_path = args.tokenizer_model or args.bert_model 162 | tokenizer = BertTokenizer.from_pretrained( 163 | tokenizer_path, do_lower_case=args.do_lower_case) 164 | 165 | # Instantiate NER Tag encoder 166 | tag_encoder = NERTagsEncoder.from_labels_file( 167 | args.labels_file, scheme=args.scheme.upper()) 168 | 169 | args.num_labels = tag_encoder.num_labels 170 | args.override_cache = True 171 | 172 | # Load a pretrained model 173 | model = load_model(args, args.bert_model, training=False) 174 | model.to(args.device) 175 | 176 | if args.input_file.endswith('.txt'): 177 | args.inference_file = convert_txt_to_tmp_json_file(args.input_file) 178 | else: 179 | args.inference_file = args.input_file 180 | 181 | args.override_cache = True 182 | 183 | dataset, examples, features = load_and_cache_examples( 184 | args.inference_file, 185 | args=args, 186 | tokenizer=tokenizer, 187 | tag_encoder=tag_encoder, 188 | mode='inference', 189 | ) 190 | 191 | output_composer = OutputComposer( 192 | examples, 193 | features, 194 | output_transform_fn=tag_encoder.convert_ids_to_tags) 195 | 196 | logger.info("***** Running predictions *****") 197 | logger.info(" Num orig examples = %d", len(examples)) 198 | logger.info(" Num split examples = %d", len(features)) 199 | logger.info(" Batch size = %d", args.batch_size) 200 | 201 | # Run 
prediction for full data 202 | dataloader = DataLoader(dataset, 203 | batch_size=args.batch_size, 204 | num_workers=os.cpu_count()) 205 | 206 | model.frozen_bert = False 207 | 208 | metrics = evaluate( 209 | args, 210 | model, 211 | tqdm(dataloader, desc="Prediction"), 212 | output_composer=output_composer, 213 | sequence_metrics=SequenceMetrics([]), # Empty metrics 214 | reset=True, 215 | ) 216 | 217 | # Get predictions for all examples 218 | all_y_pred_raw = output_composer.get_outputs() 219 | # Filter invalid predictions 220 | all_y_pred = [tag_encoder.decode_valid(y_pred) 221 | for y_pred in all_y_pred_raw] 222 | 223 | # Write predictions to output file 224 | if args.output_format == 'conll': 225 | write_conll_prediction_file(args.output_file, examples, all_y_pred) 226 | 227 | elif args.output_format == 'json': 228 | write_outputs_to_json(args.output_file, examples, all_y_pred) 229 | -------------------------------------------------------------------------------- /ner_evaluation/eval_tools.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import json 3 | import sys 4 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import numpy as np 7 | from seqeval.metrics.sequence_labeling import get_entities 8 | 9 | from preprocessing import Example, InputSpan 10 | 11 | 12 | TAG_SEQUENCE = Union[List[int], List[str]] 13 | METRIC_FN = Callable[[List[TAG_SEQUENCE], List[TAG_SEQUENCE]], Any] 14 | 15 | 16 | def flatten(list_: List[Any]) -> List[Any]: 17 | """Flattens a nested list of tag predictions.""" 18 | result = [] 19 | 20 | for sub in list_: 21 | if sub and isinstance(sub, list) and isinstance(sub[0], list): 22 | result.extend(flatten(sub)) 23 | elif isinstance(sub, list): 24 | result.extend(sub) 25 | else: 26 | result.append(sub) 27 | 28 | return result 29 | 30 | 31 | def confusion_matrix_nested(y_true: List[TAG_SEQUENCE], 32 | y_pred: List[TAG_SEQUENCE]) -> str: 33 | """Shortcut to Sklearn Confusion Matrix accepting nested lists of 34 | gold labels and predictions instead of flats lists.""" 35 | from sklearn.metrics import confusion_matrix 36 | return confusion_matrix(flatten(y_true), flatten(y_pred)) 37 | 38 | 39 | def filtered(metric_fn: METRIC_FN, 40 | ner_tags, 41 | **kwargs: Any, 42 | ) -> METRIC_FN: 43 | """Wraps a metric function with invalid tag decoding filtering (removal of 44 | invalid tag predictions for the tag scheme). 45 | 46 | Args: 47 | metric_fn: a metric function. 48 | ner_tags: a NERLabelEncoder instance. Used to perform valid tag 49 | decoding. 50 | kwargs: extra arguments to be passed to `metric_fn`. 51 | """ 52 | def metric(y_true: List[TAG_SEQUENCE], y_pred: List[TAG_SEQUENCE]) -> Any: 53 | y_pred = [ner_tags.decode_valid(y) for y in y_pred] 54 | return metric_fn(y_true, y_pred, **kwargs) 55 | return metric 56 | 57 | 58 | def pad_max_context_array(max_context_mask, max_length=512): 59 | """Right pad max_context with zeros to the size of prediction_mask""" 60 | right_pad = max_length - len(max_context_mask) 61 | max_context_mask = np.pad(max_context_mask, (0, right_pad), 62 | mode='constant', constant_values=(0, 0)) 63 | 64 | return max_context_mask.astype(np.bool) 65 | 66 | 67 | def postprocess_span_output(y_pred: TAG_SEQUENCE, span_features: InputSpan): 68 | """Postprocess the span output to consider only tokens of max context and 69 | not masked. 70 | 71 | The problem: 72 | The network is spitting span outputs. 
An example almost always have 73 | more than one span, and we have to combine all the spans to get the 74 | final output. 75 | 76 | Args: 77 | y_pred(List[int]): predicted class ids for one example span. 78 | span_features(InputFeatures): features of the span input. 79 | """ 80 | 81 | out_cls_ids = [] 82 | last_token_ix = -1 83 | 84 | # Get output classes skipping subtokens, the first [CLS] and masked tokens 85 | for tok_ix, cls_id in enumerate(y_pred[1:], start=1): 86 | 87 | is_considered = span_features.input_mask[tok_ix] 88 | pred_mask = span_features.prediction_mask[tok_ix] 89 | if is_considered and pred_mask: 90 | orig_token_ix = span_features.token_to_orig_map[tok_ix] 91 | is_max_context = span_features.token_is_max_context[tok_ix - 1] 92 | 93 | if orig_token_ix > last_token_ix: 94 | last_token_ix = orig_token_ix 95 | 96 | if is_max_context: 97 | out_cls_ids.append(cls_id) 98 | 99 | return out_cls_ids 100 | 101 | 102 | class SequentialSpanPostProcessor(object): 103 | """BERT (without CRF) Span post-processing class. 104 | This class handles network postprocessing after each batch. 105 | This class expects that the example order is NOT randomized, i.e., the 106 | DataLoader uses a SequentialSampler. 107 | """ 108 | 109 | def __init__(self, features: List[InputSpan]): 110 | self.features = features 111 | self._index = 0 112 | 113 | def reset(self) -> None: 114 | self._index = 0 115 | 116 | def __call__(self, 117 | y_true: TAG_SEQUENCE, 118 | y_pred: TAG_SEQUENCE, 119 | ) -> Tuple[int, TAG_SEQUENCE, TAG_SEQUENCE]: 120 | """Performs max-context token selection for a single span.""" 121 | 122 | span_features = self.features[self._index] 123 | y_true = postprocess_span_output(y_true, span_features) 124 | y_pred = postprocess_span_output(y_pred, span_features) 125 | self._index += 1 126 | 127 | return span_features.example_index, y_true, y_pred 128 | 129 | 130 | class CRFSpanPostProcessor(object): 131 | """Post-processes the output of the BERT-CRF network. 132 | 133 | The CRF layer outputs a list of lists of label ids of variable size. 134 | Each sequence has a variable length, defined by the feature output mask. 135 | Besides the prediction mask, we must select only the max context tokens of 136 | each document span to reconstruct the example text. 137 | """ 138 | 139 | def __init__(self, features: List[InputSpan]): 140 | self.features = features 141 | # _index is the example index. 142 | self._index = 0 143 | 144 | def reset(self) -> None: 145 | self._index = 0 146 | 147 | def __call__(self, y_true: TAG_SEQUENCE, y_pred: TAG_SEQUENCE): 148 | span_features = self.features[self._index] 149 | 150 | max_context_mask = pad_max_context_array( 151 | span_features.token_is_max_context, 152 | len(span_features.input_ids)) 153 | 154 | output_mask = np.asarray(span_features.prediction_mask, dtype=np.uint) 155 | partial_example_mask = max_context_mask[output_mask] 156 | 157 | y_true = [y for y, mask in zip(y_true, partial_example_mask) if mask] 158 | y_pred = [y for y, mask in zip(y_pred, partial_example_mask) if mask] 159 | 160 | assert len(y_true) == len(y_pred), \ 161 | "y_true and y_pred should be of same length" 162 | 163 | self._index += 1 164 | 165 | return span_features.example_index, y_true, y_pred 166 | 167 | 168 | class SequenceMetrics(object): 169 | """Calculates sequence metrics and keeps history of metric values. 170 | 171 | NOTE: Methods `get_best` and `get_best_epoch` assumes a **higher value** 172 | is better. 
173 | """ 174 | 175 | def __init__(self, metrics: List[Tuple[str, METRIC_FN]]): 176 | self.metrics = {} 177 | self.history = {} 178 | 179 | for metric_name, metric_fn in metrics: 180 | self.add_metric(metric_name, metric_fn) 181 | 182 | def add_metric(self, metric_name: str, metric_fn: METRIC_FN) -> None: 183 | self.metrics[metric_name] = metric_fn 184 | self.history[metric_name] = [] 185 | 186 | def clear_history(self) -> None: 187 | self.history = { 188 | k: [] for k in self.history.keys() 189 | } 190 | 191 | def get_best(self, metric_name: str) -> Any: 192 | """Returns the maximum value of the given metric by name.""" 193 | return max(self.history[metric_name]) 194 | 195 | def get_best_epoch(self, metric_name: str) -> int: 196 | """Returns the epoch number for which the metric has its highest 197 | value.""" 198 | return int(np.argmax(self.history[metric_name]) + 1) 199 | 200 | def get_value(self, metric_name: str, epoch: Optional[int] = None) -> Any: 201 | """Returns the value of a metric at a given epoch (defaults to last 202 | epoch).""" 203 | if epoch is None: 204 | epoch = -1 205 | else: 206 | epoch = epoch - 1 207 | return self.history[metric_name][epoch] 208 | 209 | def calculate_metrics(self, 210 | y_true: List[TAG_SEQUENCE], 211 | y_pred: List[TAG_SEQUENCE], 212 | ) -> Dict[str, Any]: 213 | """Calculates all registered metrics for the gold and predicted tag 214 | sequences. 215 | 216 | Args: 217 | y_true: a list of gold tag sequences. 218 | y_pred: a list of predicted tag sequences. 219 | 220 | Returns: 221 | A dict of metric names to calculated metric values. 222 | """ 223 | values = {} 224 | 225 | for name, metric_fn in self.metrics.items(): 226 | metric_value = metric_fn(y_true, y_pred) 227 | values[name] = metric_value 228 | self.history[name].append(metric_value) 229 | 230 | return values 231 | 232 | 233 | @contextlib.contextmanager 234 | def smart_open(filename=None): 235 | if filename and filename != '-': 236 | fh = open(filename, 'w') 237 | else: 238 | fh = sys.stdout 239 | 240 | try: 241 | yield fh 242 | finally: 243 | if fh is not sys.stdout: 244 | fh.close() 245 | 246 | 247 | def write_conll_prediction_file( 248 | out_file: str, 249 | examples: List[Example], 250 | y_preds: List[TAG_SEQUENCE]) -> None: 251 | """Writes a text output with predictions for a collection of Examples in 252 | CoNLL evaluation format, one token per line: 253 | 254 | TOKEN GOLD-TAG PRED-TAG 255 | 256 | Distinct example outputs are separated by a blank line. 257 | 258 | Args: 259 | out_file: the path of the output CoNLL prediction file. 260 | examples: list of Example instances with associated tokens and gold 261 | tag labels. 262 | y_preds: list of predicted tag sequences for each example. 263 | 264 | Raises: 265 | AssertionError: if (a) the lengths of y_preds and examples are not 266 | equal, or (b) there is a mismatch in length of tokens, labels or 267 | predicted tags for any example. 
268 | """ 269 | assert len(y_preds) == len(examples) 270 | 271 | with smart_open(out_file) as fd: 272 | for example, pred_tag in zip(examples, y_preds): 273 | 274 | tokens = example.doc_tokens 275 | labels = example.labels 276 | 277 | assert len(tokens) == len(labels) 278 | assert len(labels) == len(pred_tag) 279 | 280 | for token, label, pred in zip(tokens, labels, pred_tag): 281 | fd.write('{} {} {}\n'.format(str(token.text), label, pred)) 282 | 283 | # Separate examples by line break 284 | fd.write('\n') 285 | 286 | 287 | def write_outputs_to_json(out_file: str, 288 | examples: List[Example], 289 | y_preds: List[TAG_SEQUENCE]) -> None: 290 | """Writes a JSON with prediction outputs. 291 | 292 | Args: 293 | out_file: path to an output file or '-' to use stdout. 294 | examples: list of Example instances with associated tokens. 295 | y_preds: list of predicted tag sequences for each example. 296 | """ 297 | output = [] 298 | for example, y_pred in zip(examples, y_preds): 299 | predicted_entities = [] 300 | 301 | for entity in get_entities(y_pred): 302 | entity_class, start_token_ix, end_token_ix = entity 303 | start_char = example.doc_tokens[start_token_ix].offset 304 | end_token = example.doc_tokens[end_token_ix] 305 | end_char = end_token.offset + len(end_token) 306 | 307 | predicted_entities.append({ 308 | 'class': entity_class, 309 | 'start_char': start_char, 310 | 'end_char': end_char, 311 | 'text': example.orig_text[start_char:end_char], 312 | }) 313 | output.append({ 314 | 'doc_id': example.doc_id, 315 | 'text': example.orig_text, 316 | 'entities': predicted_entities, 317 | }) 318 | 319 | with smart_open(out_file) as fd: 320 | json.dump(output, fd) 321 | -------------------------------------------------------------------------------- /ner_evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Code for Named Entity Recognition task 2 | 3 | This directory has code to train and evaluate BERT based models on NER task using the HAREM datasets. This package implements 4 architectures divided in two approaches: 4 | 5 | **Fine-tuning**: 6 | 7 | - BERT-CRF 8 | - BERT 9 | 10 | **Feature-based (BERT embeddings)**: 11 | 12 | - BERT-LSTM-CRF 13 | - BERT-LSTM 14 | 15 | The training and evaluation entry point script is `run_bert_harem.py`. `run_inference.py` can be used to run inference on new data. All other files are modules. Commands to train and evaluate our BERT models on HAREM datasets are below for each distinct setup: Total and Selective scenarios, feature-based and Fine-tuning approaches, with and without CRF. 16 | 17 | ## Environment Setup 18 | 19 | The code uses a Python 3.6 environment and a GPU is desirable. The following steps use Conda to create a Python virtual environment. Please install Conda before 20 | continuing or create an virtual environment using other tools and skip to step 3. 21 | 22 | 1 - Create a Python 3.6 virtual environment. With conda: 23 | 24 | $ conda create -n bert_crf python=3.6 25 | 26 | 2- Activate the environment: 27 | 28 | $ conda activate bert_crf 29 | # or, for older versions of Conda, 30 | $ source activate bert_crf 31 | 32 | 3- Install **PyTorch 1.1.0** using pip or conda (instructions at [PyTorch Get Started guide](https://pytorch.org/get-started/previous-versions/#v110)). Other PyTorch versions were not tested. 
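
    # For example, with pip (check the linked guide for the command matching
    # your platform and CUDA version):
    $ pip install torch==1.1.0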
33 | 34 | 4- Install other requirements 35 | 36 | $ pip install -r requirements.txt 37 | 38 | ## Trained models 39 | 40 | Here are two checkpoints of trained NER models on First HAREM dataset. 41 | 42 | [BERTimbau Base - BERT-CRF (selective scenario, 5 classes)](https://drive.google.com/file/d/125AMFLsAf33isxTumujUAYBVkoxE7zeT/view?usp=sharing) 43 | 44 | [BERTimbau Base - BERT-CRF (total scenario, 10 classes)](https://drive.google.com/file/d/12PE1ypJ949rpatseSV37NXnHwB2Y8jZ5/view?usp=sharing) 45 | 46 | ## Running inference 47 | 48 | The script `run_inference.py` can be used to get predictions for new text using a trained NER model. 49 | Instructions: 50 | 51 | 1. Download and extract a trained model checkpoint from above or train your own (instructions below). 52 | 53 | 2. Save the inference data in a txt file (for a single input document) or in a JSON file (for multiple documents): 54 | 55 | # inference_text.txt 56 | Pink Floyd foi uma banda britânica de rock formada em Londres em 1965. 57 | O grupo foi fundado pelos estudantes Syd Barrett (guitarra, vocal), Nick Mason (bateria), Roger Waters (baixo, voz) e Richard Wright (teclados, voz). Sob a liderança de Barrett, eles lançaram dois singles e um álbum de estreia de sucesso, The Piper at the Gates of Dawn (1967). 58 | 59 | # inference_data.json 60 | [{"doc_id": 0, "doc_text": "Text of the 1st document"}, {"doc_id": 1, "doc_text": "Text of the 2nd document"}] 61 | 62 | 3. Run inference command. This command assumes a downloaded checkpoint that was extracted in `bertimbau-base_bert-crf_total/`. Use the `--help` flag to display extra information about `--output_format` and `--output_file`. 63 | 64 | python run_inference.py \ 65 | --bert_model bertimbau-base_bert-crf_total/ \ 66 | --labels_file bertimbau-base_bert-crf_total/classes.txt \ 67 | --input_file inference_text.txt \ 68 | --output_format json \ 69 | --output_file - 70 | 71 | By default, predictions will be printed in JSON format to stdout: 72 | 73 | [{"doc_id": 0, "text": "Pink Floyd foi uma banda brit\u00e2nica de rock formada em Londres em 1965.\nO grupo foi fundado pelos estudantes Syd Barrett (guitarra, vocal), Nick Mason (bateria), Roger Waters (baixo, voz) e Richard Wright (teclados, voz). Sob a lideran\u00e7a de Barrett, eles lan\u00e7aram dois singles e um \u00e1lbum de estreia de sucesso, The Piper at the Gates of Dawn (1967).", "entities": [{"class": "PESSOA", "start_char": 0, "end_char": 11, "text": "Pink Floyd "}, {"class": "LOCAL", "start_char": 54, "end_char": 62, "text": "Londres "}, {"class": "TEMPO", "start_char": 65, "end_char": 69, "text": "1965"}, {"class": "PESSOA", "start_char": 108, "end_char": 120, "text": "Syd Barrett "}, {"class": "PESSOA", "start_char": 139, "end_char": 150, "text": "Nick Mason "}, {"class": "PESSOA", "start_char": 161, "end_char": 174, "text": "Roger Waters "}, {"class": "PESSOA", "start_char": 189, "end_char": 204, "text": "Richard Wright "}, {"class": "PESSOA", "start_char": 240, "end_char": 247, "text": "Barrett"}, {"class": "OBRA", "start_char": 310, "end_char": 341, "text": "The Piper at the Gates of Dawn "}, {"class": "TEMPO", "start_char": 342, "end_char": 346, "text": "1967"}]}] 74 | 75 | ## Running NER trainings and evaluations 76 | 77 | In all commands below, `{pretrained_bert_model_path}` has to be changed by either a path to BERTimbau Base or Large checkpoint (downloaded from this repository), or the string `bert-base-multilingual-cased` to use mBERT. 
78 | 79 | In each training run (`--do_train`), the model is trained for `--num_train_epochs` epochs using data from `--train_file`, and validation is performed using data from `--valid_file`. The checkpoint of the best epoch is saved in the output directory `--output_dir`. When `--do_eval` is set, a txt file with the 80 | predictions for the test set (`--eval_file` argument) in CoNLL format will also be saved. See the next section 81 | for the commands to calculate the CoNLL metrics. 82 | 83 | When the training ends, some metrics are displayed on the terminal for the validation and test sets: 84 | 85 | - Micro F1-score 86 | - Precision 87 | - Recall 88 | - Classification Report: metrics per class. The micro avg line displays CoNLL equivalent metrics. 89 | 90 | #### Datasets 91 | 92 | The `data` directory contains the preprocessed HAREM datasets for both Selective and Total scenarios converted to JSON format. First HAREM is split into train/dev sets and Mini HAREM is used as the test set. These JSON files are produced from the original HAREM XML files using [this script](https://github.com/fabiocapsouza/harem_preprocessing). The train/dev split is done separately. 93 | 94 | #### Important: Multi-GPU and FP16 95 | 96 | Running this script in a multi-GPU setup **is not recommended**. If the machine has multiple GPUs, limit GPU visibility by setting the `CUDA_VISIBLE_DEVICES` environment variable. Example: 97 | 98 | # Only GPU 0 will be visible 99 | CUDA_VISIBLE_DEVICES=0 python run_bert_harem.py [...] 100 | 101 | FP16 training was not tested and is also not recommended. 102 | 103 | #### Batch size 104 | 105 | The commands below set the effective batch size to 16, assuming a BERT Base model and an 8 GB GPU. The parameters `per_gpu_train_batch_size` and `gradient_accumulation_steps` can be adjusted to use less or more GPU memory while producing the same results, as long as `per_gpu_train_batch_size * gradient_accumulation_steps == 16`, as illustrated below.
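For example, on a GPU with more memory, either of the following combinations (illustrative values, not the settings used in our experiments) keeps the same effective batch size of 16:

    --per_gpu_train_batch_size 4 --gradient_accumulation_steps 4
    --per_gpu_train_batch_size 8 --gradient_accumulation_steps 2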
106 | 107 | ### Fine-tuning experiments 108 | 109 | #### BERT-CRF model 110 | 111 | # Total scenario 112 | python run_bert_harem.py \ 113 | --bert_model {pretrained_bert_model_path} \ 114 | --labels_file data/classes-total.txt \ 115 | --do_train \ 116 | --train_file data/FirstHAREM-total-train.json \ 117 | --valid_file data/FirstHAREM-total-dev.json \ 118 | --num_train_epochs 15 \ 119 | --per_gpu_train_batch_size 2 \ 120 | --gradient_accumulation_steps 8 \ 121 | --do_eval \ 122 | --eval_file data/MiniHAREM-total.json \ 123 | --output_dir output_bert-crf_total 124 | 125 | # Selective scenario 126 | python run_bert_harem.py \ 127 | --bert_model {pretrained_bert_model_path} \ 128 | --labels_file data/classes-selective.txt \ 129 | --do_train \ 130 | --train_file data/FirstHAREM-selective-train.json \ 131 | --valid_file data/FirstHAREM-selective-dev.json \ 132 | --num_train_epochs 15 \ 133 | --per_gpu_train_batch_size 2 \ 134 | --gradient_accumulation_steps 8 \ 135 | --do_eval \ 136 | --eval_file data/MiniHAREM-selective.json \ 137 | --output_dir output_bert-crf_selective 138 | 139 | --- 140 | 141 | #### BERT model 142 | 143 | # Total scenario 144 | python run_bert_harem.py \ 145 | --bert_model {pretrained_bert_model_path} \ 146 | --labels_file data/classes-total.txt \ 147 | --do_train \ 148 | --train_file data/FirstHAREM-total-train.json \ 149 | --valid_file data/FirstHAREM-total-dev.json \ 150 | --no_crf \ 151 | --num_train_epochs 50 \ 152 | --per_gpu_train_batch_size 2 \ 153 | --gradient_accumulation_steps 8 \ 154 | --do_eval \ 155 | --eval_file data/MiniHAREM-total.json \ 156 | --output_dir output_bert_total 157 | 158 | # Selective scenario 159 | python run_bert_harem.py \ 160 | --bert_model {pretrained_bert_model_path} \ 161 | --labels_file data/classes-selective.txt \ 162 | --do_train \ 163 | --train_file data/FirstHAREM-selective-train.json \ 164 | --valid_file data/FirstHAREM-selective-dev.json \ 165 | --no_crf \ 166 | --num_train_epochs 50 \ 167 | --per_gpu_train_batch_size 2 \ 168 | --gradient_accumulation_steps 8 \ 169 | --do_eval \ 170 | --eval_file data/MiniHAREM-selective.json \ 171 | --output_dir output_bert_selective 172 | 173 | --- 174 | 175 | ### Feature-based experiments 176 | 177 | These experiments use the `--freeze_bert` flag to freeze all BERT's parameters and train a LSTM-CRF or LSTM model using BERT embeddings. `--pooler sum` indicates that BERT embeddings will be produced by summing the last 4 layers of BERT instead of using only the last layer. 
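As a minimal sketch of what `--pooler sum` computes (the actual implementation is `sum_last_4_layers` in `model.py`; the variable name below is only illustrative):

    import torch

    # all_hidden_states: tuple of hidden-state tensors of shape
    # (batch, seq_len, hidden_size) returned by BERT when
    # output_hidden_states=True is set in BertConfig
    embeddings = torch.stack(all_hidden_states[-4:], dim=0).sum(dim=0)

Besides `sum`, `model.py` also implements a `last` pooler (last layer only, the default) and a `concat` pooler (concatenation of the last 4 layers).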
178 | 179 | #### BERT-LSTM-CRF model 180 | 181 | # Total scenario 182 | python run_bert_harem.py \ 183 | --bert_model {pretrained_bert_model_path} \ 184 | --labels_file data/classes-total.txt \ 185 | --do_train \ 186 | --train_file data/FirstHAREM-total-train.json \ 187 | --valid_file data/FirstHAREM-total-dev.json \ 188 | --freeze_bert \ 189 | --pooler sum \ 190 | --num_train_epochs 50 \ 191 | --per_gpu_train_batch_size 2 \ 192 | --gradient_accumulation_steps 8 \ 193 | --do_eval \ 194 | --eval_file data/MiniHAREM-total.json \ 195 | --output_dir output_bert-lstm-crf_total 196 | 197 | # Selective scenario 198 | python run_bert_harem.py \ 199 | --bert_model {pretrained_bert_model_path} \ 200 | --labels_file data/classes-selective.txt \ 201 | --do_train \ 202 | --train_file data/FirstHAREM-selective-train.json \ 203 | --valid_file data/FirstHAREM-selective-dev.json \ 204 | --freeze_bert \ 205 | --pooler sum \ 206 | --num_train_epochs 50 \ 207 | --per_gpu_train_batch_size 2 \ 208 | --gradient_accumulation_steps 8 \ 209 | --do_eval \ 210 | --eval_file data/MiniHAREM-selective.json \ 211 | --output_dir output_bert-lstm-crf_selective 212 | 213 | --- 214 | 215 | #### BERT-LSTM model 216 | 217 | # Total scenario 218 | python run_bert_harem.py \ 219 | --bert_model {pretrained_bert_model_path} \ 220 | --labels_file data/classes-total.txt \ 221 | --do_train \ 222 | --train_file data/FirstHAREM-total-train.json \ 223 | --valid_file data/FirstHAREM-total-dev.json \ 224 | --freeze_bert \ 225 | --pooler sum \ 226 | --no_crf \ 227 | --num_train_epochs 100 \ 228 | --per_gpu_train_batch_size 2 \ 229 | --gradient_accumulation_steps 8 \ 230 | --do_eval \ 231 | --eval_file data/MiniHAREM-total.json \ 232 | --output_dir output_bert-lstm_total 233 | 234 | # Selective 235 | python run_bert_harem.py \ 236 | --bert_model {pretrained_bert_model_path} \ 237 | --labels_file data/classes-selective.txt \ 238 | --do_train \ 239 | --train_file data/FirstHAREM-selective-train.json \ 240 | --valid_file data/FirstHAREM-selective-dev.json \ 241 | --freeze_bert \ 242 | --pooler sum \ 243 | --no_crf \ 244 | --num_train_epochs 100 \ 245 | --per_gpu_train_batch_size 2 \ 246 | --gradient_accumulation_steps 8 \ 247 | --do_eval \ 248 | --eval_file data/MiniHAREM-selective.json \ 249 | --output_dir output_bert-lstm_selective 250 | 251 | --- 252 | 253 | ### Computing CoNLL metrics 254 | 255 | The [conlleval](https://www.clips.uantwerpen.be/conll2000/chunking/conlleval.txt) script should be used to compute the evaluation metrics using the `predictions_conll.txt` file that is output in the evaluation procedure, as explained below. However, 256 | the package uses the [seqeval library](https://github.com/chakki-works/seqeval) to compute CoNLL equivalent metrics which are printed in the console. 257 | 258 | #### Using conlleval 259 | 260 | Download the script and make it executable. 261 | 262 | $ chmod +x conlleval.txt 263 | 264 | Then, run the command below inputing the corresponding `output_dir` of the trained model 265 | 266 | $ conlleval.txt < {output_dir}/predictions_conll.txt 267 | 268 | For example, for BERTimbau-Large-CRF on Total scenario: 269 | 270 | $ ./conlleval.txt < output_bertimbau-large_BERT-CRF_total/predictions_conll.txt 271 | processed 64853 tokens with 3642 phrases; found: 3523 phrases; correct: 2828. 
272 | accuracy: 96.80%; precision: 80.27%; recall: 77.65%; FB1: 78.94 273 | ABSTRACCAO: precision: 59.33%; recall: 59.05%; FB1: 59.19 209 274 | ACONTECIMENTO: precision: 36.51%; recall: 40.35%; FB1: 38.33 63 275 | COISA: precision: 61.26%; recall: 40.00%; FB1: 48.40 111 276 | LOCAL: precision: 89.71%; recall: 84.30%; FB1: 86.92 826 277 | OBRA: precision: 64.62%; recall: 65.97%; FB1: 65.28 195 278 | ORGANIZACAO: precision: 71.11%; recall: 75.50%; FB1: 73.24 637 279 | OUTRO: precision: 50.00%; recall: 14.29%; FB1: 22.22 4 280 | PESSOA: precision: 86.92%; recall: 83.79%; FB1: 85.33 803 281 | TEMPO: precision: 94.52%; recall: 90.61%; FB1: 92.52 347 282 | VALOR: precision: 80.79%; recall: 81.29%; FB1: 81.04 328 283 | 284 | ### Available hyperparameters 285 | 286 | Run `python run_bert_harem.py --help` to display the available hyperparameters. The default values are set to the ones used in our experiments. 287 | -------------------------------------------------------------------------------- /ner_evaluation/preprocessing.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import logging 4 | import os 5 | 6 | from argparse import Namespace 7 | from typing import (Dict, List, Optional) 8 | import torch 9 | 10 | from tag_encoder import NERTagsEncoder, SCHEMES 11 | from tokenization import ( 12 | Token, 13 | TokenizerWithAlignment, 14 | reconstruct_text_from_tokens, 15 | ) 16 | 17 | from pytorch_transformers.tokenization_bert import BertTokenizer 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | 21 | 22 | NETag = collections.namedtuple("NETag", ['doc_id', 23 | 'entity_id', 24 | 'text', 25 | 'type', 26 | 'start_position', 27 | 'end_position']) 28 | 29 | 30 | class Example(object): 31 | """ 32 | A single training/test example for NER training. 33 | """ 34 | 35 | def __init__(self, 36 | doc_id: int, 37 | orig_text: str, 38 | doc_tokens: List[Token], 39 | tags: List[NETag], 40 | labels: List[str], 41 | ): 42 | self.doc_id = doc_id 43 | self.orig_text = orig_text 44 | self.doc_tokens = doc_tokens 45 | self.tags = tags 46 | self.labels = labels 47 | 48 | for token in doc_tokens: 49 | token._example = self 50 | 51 | def __str__(self): 52 | return repr(self) 53 | 54 | def __repr__(self): 55 | s = ('doc_id: {}\n' 56 | 'orig_text:{}\n' 57 | 'doc_tokens: {}\n' 58 | 'labels: {}\n' 59 | 'tags: {}\n').format(self.doc_id, self.orig_text, self.doc_tokens, 60 | self.labels, self.tags) 61 | return s 62 | 63 | 64 | def read_examples(input_file: str, 65 | is_training: bool, 66 | classes: List[str] = None, 67 | scheme: str = 'BIO', 68 | ) -> List[Example]: 69 | """Read a JSON file into a list of Examples. 70 | 71 | The JSON file should contain a list of dictionaries, one dict per input 72 | document. Each dict should have the following entries: 73 | 74 | doc_id: an example unique identifier (for debugging). 75 | doc_text: the document text. 76 | entities: a list of dicts of named entities contained in `doc_text`. 77 | Each entity dict should have the following entries: 78 | 79 | entity_id: an identifier for the entity (debugging purposes). 80 | label: the named entity gold label. 81 | start_offset: start char offset of the entity in `doc_text`. 82 | end_offset: **exclusive** end char offset of the entity in 83 | `doc_text`. 84 | text: the named entity text. It should be equal to the slice of the 85 | document text using `start_offset` and `end_offset`, e.g., 86 | `doc_text[start_offset:end_offset]`. 
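    A minimal input example with made-up values (hypothetical document, not
    taken from the HAREM data):

        [{"doc_id": 0,
          "doc_text": "Maria mora em Lisboa.",
          "entities": [{"entity_id": 0, "label": "PESSOA",
                        "start_offset": 0, "end_offset": 5, "text": "Maria"},
                       {"entity_id": 1, "label": "LOCAL",
                        "start_offset": 14, "end_offset": 20, "text": "Lisboa"}]}]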
87 | """ 88 | scheme = scheme.upper() 89 | if scheme not in SCHEMES: 90 | raise ValueError("Invalid tagging scheme `{}`.".format(scheme)) 91 | 92 | with open(input_file, "r", encoding='utf-8') as reader: 93 | input_data = json.load(reader) 94 | 95 | examples = [] 96 | tokenizer_with_alignment = TokenizerWithAlignment() 97 | 98 | for document in input_data: 99 | doc_text = document["doc_text"] 100 | doc_id = document["doc_id"] 101 | 102 | # Perform whitespace and punctuation tokenization keeping track of char 103 | # alignment (char_to_word_offset) 104 | doc_tokens, char_to_word_offset = tokenizer_with_alignment(doc_text) 105 | labels = ["O"] * len(doc_tokens) 106 | tags = [] 107 | 108 | def set_label(index, tag): 109 | if labels[index] != 'O': 110 | LOGGER.warning('Overwriting tag %s at position %s to %s', 111 | labels[index], index, tag) 112 | labels[index] = tag 113 | 114 | if is_training: 115 | for entity in document["entities"]: 116 | entity_id = entity["entity_id"] 117 | entity_text = entity["text"] 118 | entity_type = entity["label"] 119 | start_token = None 120 | end_token = None 121 | 122 | entity_start_offset = entity["start_offset"] 123 | entity_end_offset = entity["end_offset"] 124 | start_token = char_to_word_offset[entity_start_offset] 125 | # end_offset is NOT inclusive to the text, e.g., 126 | # entity_text == doc_text[start_offset:end_offset] 127 | end_token = char_to_word_offset[entity_end_offset - 1] 128 | 129 | assert start_token <= end_token, \ 130 | "End token cannot come before start token." 131 | reconstructed_text = reconstruct_text_from_tokens( 132 | doc_tokens[start_token:(end_token + 1)]) 133 | assert entity_text.strip() == reconstructed_text, \ 134 | "Entity text and reconstructed text are not equal: %s != %s" % ( 135 | entity_text, reconstructed_text) 136 | 137 | if scheme == 'BILUO': 138 | # BILUO scheme 139 | if start_token == end_token: 140 | tag = 'U-' + entity_type 141 | set_label(start_token, tag) 142 | else: 143 | for token_index in range(start_token, end_token + 1): 144 | if token_index == start_token: 145 | tag = 'B-' + entity_type 146 | elif token_index == end_token: 147 | tag = 'L-' + entity_type 148 | else: 149 | tag = 'I-' + entity_type 150 | 151 | set_label(token_index, tag) 152 | 153 | elif scheme == 'BIO': 154 | # BIO scheme 155 | for token_index in range(start_token, end_token + 1): 156 | if token_index == start_token: 157 | tag = 'B-' + entity_type 158 | else: 159 | tag = 'I-' + entity_type 160 | set_label(token_index, tag) 161 | 162 | entity = NETag( 163 | doc_id, 164 | entity_id, 165 | entity_text, 166 | entity_type, 167 | start_token, 168 | end_token, 169 | ) 170 | tags.append(entity) 171 | 172 | example = Example( 173 | doc_id=doc_id, 174 | orig_text=doc_text, 175 | doc_tokens=doc_tokens, 176 | tags=tags, 177 | labels=labels) 178 | examples.append(example) 179 | 180 | return examples 181 | 182 | 183 | class InputSpan(object): 184 | """A single set of features of data.""" 185 | 186 | def __init__(self, 187 | unique_id: int, 188 | example_index: int, 189 | doc_span_index: int, 190 | tokens: List[Token], 191 | token_to_orig_map: Dict[int, int], 192 | token_is_max_context: List[bool], 193 | input_ids: List[int], 194 | input_mask: List[int], 195 | segment_ids: List[int], 196 | prediction_mask: List[bool], 197 | labels: Optional[List[str]] = (), 198 | label_ids: Optional[List[int]] = (), 199 | ): 200 | self.unique_id = unique_id 201 | self.example_index = example_index 202 | self.doc_span_index = doc_span_index 203 | self.tokens = tokens 204 | 
self.token_to_orig_map = token_to_orig_map 205 | self.token_is_max_context = token_is_max_context 206 | self.input_ids = input_ids 207 | self.input_mask = input_mask 208 | self.segment_ids = segment_ids 209 | self.labels = labels or [] 210 | self.label_ids = label_ids or [] 211 | self.prediction_mask = prediction_mask 212 | 213 | def __repr__(self): 214 | return "InputSpan(example_index={}, doc_span_index={})".format( 215 | self.example_index, self.doc_span_index, 216 | ) 217 | 218 | def __str__(self): 219 | return self.__repr__() 220 | 221 | def __len__(self): 222 | return len(self.tokens) 223 | 224 | 225 | def _check_is_max_context(doc_spans: List[InputSpan], 226 | cur_span_index: int, 227 | position: int, 228 | ) -> bool: 229 | """Check if this is the 'max context' doc span for the token.""" 230 | 231 | # Because of the sliding window approach taken to scoring documents, a 232 | # single token can appear in multiple documents. E.g. 233 | # Doc: the man went to the store and bought a gallon of milk 234 | # Span A: the man went to the 235 | # Span B: to the store and bought 236 | # Span C: and bought a gallon of 237 | # ... 238 | # 239 | # Now the word 'bought' will have two scores from spans B and C. We only 240 | # want to consider the score with "maximum context", which we define as 241 | # the *minimum* of its left and right context (the *sum* of left and 242 | # right context will always be the same, of course). 243 | # 244 | # In the example the maximum context for 'bought' would be span C since 245 | # it has 1 left context and 3 right context, while span B has 4 left context 246 | # and 0 right context. 247 | best_score = None 248 | best_span_index = None 249 | for (span_index, doc_span) in enumerate(doc_spans): 250 | end = doc_span.start + doc_span.length - 1 251 | if position < doc_span.start: 252 | continue 253 | if position > end: 254 | continue 255 | num_left_context = position - doc_span.start 256 | num_right_context = end - position 257 | score = min(num_left_context, num_right_context) + \ 258 | 0.01 * doc_span.length 259 | if best_score is None or score > best_score: 260 | best_score = score 261 | best_span_index = span_index 262 | 263 | return cur_span_index == best_span_index 264 | 265 | 266 | def convert_examples_to_spans(examples: List[Example], 267 | ner_tags_converter: NERTagsEncoder, 268 | tokenizer: BertTokenizer, 269 | max_seq_length: int, 270 | doc_stride: int, 271 | is_training: bool, 272 | unique_id_start: Optional[int] = None, 273 | verbose: bool = True, 274 | ) -> List[InputSpan]: 275 | """Converts examples to BERT input-ready data tensor-like structures, 276 | splitting large documents into spans of `max_seq_length` using a stride of 277 | `doc_stride` tokens.""" 278 | 279 | unique_id = unique_id_start or 1000000000 280 | 281 | features = [] 282 | for (example_index, example) in enumerate(examples): 283 | 284 | doc_tokens = example.doc_tokens 285 | doc_labels = example.labels 286 | 287 | tok_to_orig_index = [] 288 | orig_to_tok_index = [] 289 | all_doc_tokens = [] 290 | all_doc_labels = [] 291 | all_prediction_mask = [] 292 | 293 | for i, token in enumerate(doc_tokens): 294 | orig_to_tok_index.append(len(all_doc_tokens)) 295 | sub_tokens = tokenizer.tokenize(token.text) 296 | for j, sub_token in enumerate(sub_tokens): 297 | # Create mapping from subtokens to original token 298 | tok_to_orig_index.append(i) 299 | all_doc_tokens.append(sub_token) 300 | # Mask all subtokens (j > 0) 301 | all_prediction_mask.append(j == 0) 302 | 303 | if j == 0: 304 | label = doc_labels[i] 305 |
all_doc_labels.append(label) 306 | else: 307 | all_doc_labels.append('X') 308 | 309 | assert len(all_doc_tokens) == len(all_prediction_mask) 310 | if is_training: 311 | assert len(all_doc_tokens) == len(all_doc_labels) 312 | 313 | # The -1 accounts for [CLS]. For NER we have only one sentence, so no 314 | # [SEP] tokens. 315 | max_tokens_for_doc = max_seq_length - 1 316 | 317 | # We can have documents that are longer than the maximum sequence length. 318 | # To deal with this we do a sliding window approach, where we take chunks 319 | # of the up to our max length with a stride of `doc_stride`. 320 | _DocSpan = collections.namedtuple( # pylint: disable=invalid-name 321 | "DocSpan", ["start", "length"]) 322 | doc_spans = [] 323 | start_offset = 0 324 | while start_offset < len(all_doc_tokens): 325 | length = len(all_doc_tokens) - start_offset 326 | if length > max_tokens_for_doc: 327 | length = max_tokens_for_doc 328 | doc_spans.append(_DocSpan(start=start_offset, length=length)) 329 | if start_offset + length == len(all_doc_tokens): 330 | break 331 | start_offset += min(length, doc_stride) 332 | 333 | for (doc_span_index, doc_span) in enumerate(doc_spans): 334 | tokens = [] 335 | token_to_orig_map = {} 336 | token_is_max_context = [] 337 | segment_ids = [] 338 | labels = None 339 | label_ids = None 340 | prediction_mask = [] 341 | # Include [CLS] token 342 | tokens.append("[CLS]") 343 | segment_ids.append(0) 344 | prediction_mask.append(False) 345 | 346 | # Ignore [CLS] label 347 | if is_training: 348 | labels = ['X'] 349 | 350 | for i in range(doc_span.length): 351 | # Each doc span will have a dict that indicates if it is the 352 | # *max_context span* for the tokens inside it 353 | split_token_index = doc_span.start + i 354 | token_to_orig_map[len( 355 | tokens)] = tok_to_orig_index[split_token_index] 356 | 357 | is_max_context = _check_is_max_context(doc_spans, 358 | doc_span_index, 359 | split_token_index) 360 | token_is_max_context.append(is_max_context) 361 | tokens.append(all_doc_tokens[split_token_index]) 362 | segment_ids.append(0) 363 | if is_training: 364 | labels.append(all_doc_labels[split_token_index]) 365 | prediction_mask.append( 366 | all_prediction_mask[split_token_index]) 367 | 368 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 369 | if is_training: 370 | label_ids = ner_tags_converter.convert_tags_to_ids(labels) 371 | 372 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 373 | # tokens are attended to. 374 | input_mask = [1] * len(input_ids) 375 | 376 | # Zero-pad up to the sequence length. 
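            # Padded positions are excluded from the prediction mask and, when
            # training, receive the tag encoder's ignore_index as label id.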
377 | while len(input_ids) < max_seq_length: 378 | input_ids.append(0) 379 | input_mask.append(0) 380 | segment_ids.append(0) 381 | if is_training: 382 | label_ids.append(ner_tags_converter.ignore_index) 383 | prediction_mask.append(False) 384 | 385 | # If not training, use placeholder labels 386 | if not is_training: 387 | labels = ['O'] * len(input_ids) 388 | label_ids = [ner_tags_converter.ignore_index] * len(input_ids) 389 | 390 | assert len(input_ids) == max_seq_length 391 | assert len(input_mask) == max_seq_length 392 | assert len(segment_ids) == max_seq_length 393 | assert len(prediction_mask) == max_seq_length 394 | if is_training: 395 | assert len(label_ids) == max_seq_length 396 | 397 | if verbose and example_index < 20: 398 | LOGGER.info("*** Example ***") 399 | LOGGER.info("unique_id: %s" % (unique_id)) 400 | LOGGER.info("example_index: %s" % (example_index)) 401 | LOGGER.info("doc_span_index: %s" % (doc_span_index)) 402 | LOGGER.info("tokens: %s" % " ".join(tokens)) 403 | LOGGER.info("token_to_orig_map: %s" % " ".join([ 404 | "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) 405 | LOGGER.info("token_is_max_context: %s", token_is_max_context) 406 | LOGGER.info("input_ids: %s" % 407 | " ".join([str(x) for x in input_ids])) 408 | LOGGER.info( 409 | "input_mask: %s" % " ".join([str(x) for x in input_mask])) 410 | LOGGER.info( 411 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 412 | LOGGER.info("prediction_mask: %s" % " ".join([ 413 | str(x) for x in prediction_mask 414 | ])) 415 | if is_training: 416 | LOGGER.info( 417 | "label_ids: %s" % " ".join([str(x) for x in label_ids])) 418 | 419 | LOGGER.info("tags:") 420 | inside_label = False 421 | for tok, lab, lab_id in zip(tokens, labels, label_ids): 422 | if lab[0] == "O": 423 | if inside_label and tok.startswith("##"): 424 | LOGGER.info(f'{tok}\tX') 425 | else: 426 | inside_label = False 427 | else: 428 | if lab[0] in ("B", "I", "L", "U") or inside_label: 429 | if lab[0] in ("B", "U"): 430 | # new entity 431 | LOGGER.info('') 432 | inside_label = True 433 | LOGGER.info(f'{tok}\t{lab}\t{lab_id}') 434 | 435 | features.append( 436 | InputSpan( 437 | unique_id=unique_id, 438 | example_index=example_index, 439 | doc_span_index=doc_span_index, 440 | tokens=tokens, 441 | token_to_orig_map=token_to_orig_map, 442 | token_is_max_context=token_is_max_context, 443 | input_ids=input_ids, 444 | input_mask=input_mask, 445 | segment_ids=segment_ids, 446 | labels=labels, 447 | label_ids=label_ids, 448 | prediction_mask=prediction_mask, 449 | )) 450 | unique_id += 1 451 | 452 | return features 453 | 454 | 455 | def get_features_from_examples(examples: List[Example], 456 | ner_tags_converter: NERTagsEncoder, 457 | tokenizer: BertTokenizer, 458 | args: Namespace, # args from ArgumentParser 459 | mode: str, 460 | unique_id_start: int = None, 461 | verbose: bool = True, 462 | ) -> List[InputSpan]: 463 | """Convert examples to input spans. Read from cache if possible.""" 464 | 465 | assert mode in ('train', 'valid', 'eval', 'inference'), "Invalid mode." 
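    # The cache file name combines the input file name with the BERT model
    # name, the maximum sequence length and the doc stride.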
466 | examples_file = getattr(args, mode + '_file') or mode 467 | 468 | cached_features_file = examples_file + '_{0}_{1}_{2}'.format( 469 | list(filter(None, args.bert_model.split('/'))).pop(), 470 | str(args.max_seq_length), 471 | str(args.doc_stride)) 472 | 473 | spans = None 474 | loaded_from_cache = False 475 | 476 | if os.path.isfile(cached_features_file) and not args.override_cache: 477 | # Read from cache 478 | LOGGER.info('Reading cached features from {}' 479 | .format(cached_features_file)) 480 | spans = torch.load(cached_features_file) 481 | loaded_from_cache = True 482 | else: # noqa: E772 483 | LOGGER.info('Converting examples to features.') 484 | is_training = True if mode in ('train', 'valid', 'eval') else False 485 | spans = convert_examples_to_spans( 486 | examples=examples, 487 | ner_tags_converter=ner_tags_converter, 488 | tokenizer=tokenizer, 489 | max_seq_length=args.max_seq_length, 490 | doc_stride=args.doc_stride, 491 | is_training=is_training, 492 | unique_id_start=unique_id_start, 493 | verbose=verbose) 494 | 495 | if args.local_rank == -1 or torch.distributed.get_rank() == 0: 496 | if not loaded_from_cache or args.override_cache: 497 | LOGGER.info( 498 | " Saving %s features into cached file %s", 499 | mode, cached_features_file) 500 | torch.save(spans, cached_features_file) 501 | 502 | return spans 503 | -------------------------------------------------------------------------------- /ner_evaluation/model.py: -------------------------------------------------------------------------------- 1 | """Implementations of BERT, BERT-CRF, BERT-LSTM and BERT-LSTM-CRF models.""" 2 | 3 | import logging 4 | from argparse import Namespace 5 | from typing import Any, Dict, Optional, Tuple, Type 6 | 7 | import torch 8 | from pytorch_transformers.modeling_bert import (BertConfig, 9 | BertForTokenClassification) 10 | from torchcrf import CRF 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | def sum_last_4_layers(sequence_outputs: Tuple[torch.Tensor]) -> torch.Tensor: 16 | """Sums the last 4 hidden representations of a sequence output of BERT. 17 | Args: 18 | ----- 19 | sequence_output: Tuple of tensors of shape (batch, seq_length, hidden_size). 20 | For BERT base, the Tuple has length 13. 21 | 22 | Returns: 23 | -------- 24 | summed_layers: Tensor of shape (batch, seq_length, hidden_size) 25 | """ 26 | last_layers = sequence_outputs[-4:] 27 | return torch.stack(last_layers, dim=0).sum(dim=0) 28 | 29 | 30 | def get_last_layer(sequence_outputs: Tuple[torch.Tensor]) -> torch.Tensor: 31 | """Returns the last tensor of a list of tensors.""" 32 | return sequence_outputs[-1] 33 | 34 | 35 | def concat_last_4_layers(sequence_outputs: Tuple[torch.Tensor]) -> torch.Tensor: 36 | """Concatenate the last 4 tensors of a tuple of tensors.""" 37 | last_layers = sequence_outputs[-4:] 38 | return torch.cat(last_layers, dim=-1) 39 | 40 | 41 | POOLERS = { 42 | 'sum': sum_last_4_layers, 43 | 'last': get_last_layer, 44 | 'concat': concat_last_4_layers, 45 | } 46 | 47 | 48 | def get_model_and_kwargs_for_args( 49 | args: Namespace, 50 | training: bool = True, 51 | ) -> Tuple[Type[torch.nn.Module], Dict[str, Any]]: 52 | """Given the parsed arguments, returns the correct model class and model 53 | args. 54 | 55 | Args: 56 | args: a Namespace object (from parsed argv command). 57 | training: if True, sets a high initialization value for classifier bias 58 | parameter after model initialization. 
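    In summary (mirroring the selection logic below): with `freeze_bert`,
    `no_crf` selects BertLSTM and otherwise BertLSTMCRF; without
    `freeze_bert`, `no_crf` selects BertForNERClassification and otherwise
    BertCRF.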
59 | """ 60 | bias_O = 6 if training else None 61 | model_args = { 62 | 'pooler': args.pooler, 63 | 'bias_O': bias_O, 64 | } 65 | 66 | if args.freeze_bert: 67 | # Possible models: BERT-LSTM or BERT-LSTM-CRF 68 | model_args['lstm_layers'] = args.lstm_layers 69 | model_args['lstm_hidden_size'] = args.lstm_hidden_size 70 | if args.no_crf: 71 | model_class = BertLSTM 72 | else: 73 | model_class = BertLSTMCRF 74 | 75 | else: 76 | # Possible models: BertForNERClassification or BertCRF 77 | if args.no_crf: 78 | model_class = BertForNERClassification 79 | else: 80 | model_class = BertCRF 81 | 82 | return model_class, model_args 83 | 84 | 85 | class BertForNERClassification(BertForTokenClassification): 86 | """BERT model for NER task. 87 | 88 | The number of NER tags should be defined in the `BertConfig.num_labels` 89 | attribute. 90 | 91 | Args: 92 | config: BertConfig instance to build BERT model. 93 | weight_O: loss weight value for "O" tags in CrossEntropyLoss. 94 | bias_O: optional value to initiate the classifier's bias value for "O" 95 | tag. 96 | pooler: which pooler configuration to use to pass BERT features to the 97 | classifier. 98 | """ 99 | 100 | def __init__(self, 101 | config: BertConfig, 102 | weight_O: float = 0.01, 103 | bias_O: Optional[float] = None, 104 | pooler='last'): 105 | super().__init__(config) 106 | del self.classifier # Deletes classifier of BertForTokenClassification 107 | 108 | num_labels = config.num_labels 109 | 110 | if pooler not in POOLERS: 111 | message = ("Invalid pooler: %s. Pooler must be one of %s." 112 | % (pooler, list(POOLERS.keys()))) 113 | raise ValueError(message) 114 | 115 | self._build_classifier(config, pooler) 116 | if bias_O is not None: 117 | self.set_bias_tag_O(bias_O) 118 | 119 | assert isinstance(weight_O, float) and 0 < weight_O < 1 120 | weights = [1.] * num_labels 121 | weights[0] = weight_O 122 | weights = torch.tensor(weights) 123 | self.loss_fct = torch.nn.CrossEntropyLoss(weight=weights) 124 | 125 | self.frozen_bert = False 126 | self.pooler = POOLERS.get(pooler) 127 | 128 | def _build_classifier(self, config, pooler): 129 | """Build tag classifier.""" 130 | if pooler in ('last', 'sum'): 131 | self.classifier = torch.nn.Linear(config.hidden_size, 132 | config.num_labels) 133 | else: 134 | assert pooler == 'concat' 135 | self.classifier = torch.nn.Linear(4 * config.hidden_size, 136 | config.num_labels) 137 | 138 | def set_bias_tag_O(self, bias_O: Optional[float] = None): 139 | """Increase tag "O" bias to produce high probabilities early on and 140 | reduce instability in early training.""" 141 | if bias_O is not None: 142 | LOGGER.info('Setting bias of OUT token to %s.', bias_O) 143 | self.classifier.bias.data[0] = bias_O 144 | 145 | def freeze_bert(self): 146 | """Freeze all BERT parameters. Only the classifier weights will be 147 | updated.""" 148 | for p in self.bert.parameters(): 149 | p.requires_grad = False 150 | self.frozen_bert = True 151 | 152 | def bert_encode(self, input_ids, token_type_ids=None, attention_mask=None): 153 | """Gets encoded sequence from BERT model and pools the layers accordingly. 154 | BertModel outputs a tuple whose elements are: 155 | 1- Last encoder layer output. Tensor of shape (B, S, H) 156 | 2- Pooled output of the [CLS] token. Tensor of shape (B, H) 157 | 3- Encoder inputs (embeddings) + all Encoder layers' outputs. This 158 | requires the flag `output_hidden_states=True` on BertConfig. Returns 159 | List of tensors of shapes (B, S, H). 
160 | 4- Attention results, if `output_attentions=True` in BertConfig. 161 | 162 | This method uses just the 3rd output and pools the layers. 163 | """ 164 | _, _, all_layers_sequence_outputs, *_ = self.bert( 165 | input_ids, 166 | token_type_ids=token_type_ids, 167 | attention_mask=attention_mask) 168 | 169 | # Use the defined pooler to pool the hidden representation layers 170 | sequence_output = self.pooler(all_layers_sequence_outputs) 171 | 172 | return sequence_output 173 | 174 | def predict_logits(self, input_ids, token_type_ids=None, 175 | attention_mask=None): 176 | """Returns the logits prediction from BERT + classifier.""" 177 | if self.frozen_bert: 178 | sequence_output = input_ids 179 | else: 180 | sequence_output = self.bert_encode( 181 | input_ids, token_type_ids, attention_mask) 182 | 183 | sequence_output = self.dropout(sequence_output) 184 | logits = self.classifier(sequence_output) # (batch, seq, tags) 185 | 186 | return logits 187 | 188 | def forward(self, 189 | input_ids, 190 | token_type_ids=None, 191 | attention_mask=None, 192 | labels=None, 193 | prediction_mask=None, 194 | ) -> Dict[str, torch.Tensor]: 195 | """Performs the forward pass of the network. 196 | 197 | If `labels` are not None, it will calculate and return the loss. 198 | Otherwise, it will return the logits and predicted tags tensors. 199 | 200 | Args: 201 | input_ids: tensor of input token ids. 202 | token_type_ids: tensor of input sentence type id (0 or 1). Should be 203 | all zeros for NER. Can be safely set to `None`. 204 | attention_mask: mask tensor that should have value 0 for [PAD] 205 | tokens and 1 for other tokens. 206 | labels: tensor of gold NER tag label ids. Values should be ints in 207 | the range [0, config.num_labels - 1]. 208 | prediction_mask: mask tensor should have value 0 for tokens that do 209 | not have an associated prediction, such as [CLS] and WordPìece 210 | subtoken continuations (that start with ##). 211 | 212 | Returns a dict with calculated tensors: 213 | - "logits" 214 | - "y_pred" 215 | - "loss" (if `labels` is not `None`) 216 | """ 217 | outputs = {} 218 | 219 | logits = self.predict_logits(input_ids=input_ids, 220 | token_type_ids=token_type_ids, 221 | attention_mask=attention_mask) 222 | _, y_pred = torch.max(logits, dim=-1) 223 | y_pred = y_pred.cpu().numpy() 224 | outputs['logits'] = logits 225 | outputs['y_pred'] = y_pred 226 | 227 | if labels is not None: 228 | # Only keep active parts of the loss 229 | mask = prediction_mask 230 | if mask is not None: 231 | mask = mask.view(-1) 232 | active_logits = logits.view(-1, self.num_labels)[mask] 233 | active_labels = labels.view(-1)[mask] 234 | loss = self.loss_fct(active_logits, active_labels) 235 | else: 236 | loss = self.loss_fct( 237 | logits.view(-1, self.num_labels), labels.view(-1)) 238 | outputs['loss'] = loss 239 | 240 | return outputs 241 | 242 | 243 | class BertCRF(BertForNERClassification): 244 | """BERT-CRF model. 245 | 246 | Args: 247 | config: BertConfig instance to build BERT model. 248 | kwargs: arguments to be passed to superclass. 249 | """ 250 | 251 | def __init__(self, config: BertConfig, **kwargs: Any): 252 | super().__init__(config, **kwargs) 253 | del self.loss_fct # Delete unused CrossEntropyLoss 254 | self.crf = CRF(num_tags=config.num_labels, batch_first=True) 255 | 256 | def forward(self, 257 | input_ids, 258 | token_type_ids=None, 259 | attention_mask=None, 260 | labels=None, 261 | prediction_mask=None, 262 | ) -> Dict[str, torch.Tensor]: 263 | """Performs the forward pass of the network. 
264 | 265 | If `labels` is not `None`, it will calculate and return the the loss, 266 | that is the negative log-likelihood of the batch. 267 | Otherwise, it will calculate the most probable sequence outputs using 268 | Viterbi decoding and return a list of sequences (List[List[int]]) of 269 | variable lengths. 270 | 271 | Args: 272 | input_ids: tensor of input token ids. 273 | token_type_ids: tensor of input sentence type id (0 or 1). Should be 274 | all zeros for NER. Can be safely set to `None`. 275 | attention_mask: mask tensor that should have value 0 for [PAD] 276 | tokens and 1 for other tokens. 277 | labels: tensor of gold NER tag label ids. Values should be ints in 278 | the range [0, config.num_labels - 1]. 279 | prediction_mask: mask tensor should have value 0 for tokens that do 280 | not have an associated prediction, such as [CLS] and WordPìece 281 | subtoken continuations (that start with ##). 282 | 283 | Returns a dict with calculated tensors: 284 | - "logits" 285 | - "loss" (if `labels` is not `None`) 286 | - "y_pred" (if `labels` is `None`) 287 | """ 288 | outputs = {} 289 | 290 | logits = self.predict_logits(input_ids=input_ids, 291 | token_type_ids=token_type_ids, 292 | attention_mask=attention_mask) 293 | outputs['logits'] = logits 294 | 295 | # mask: mask padded sequence and also subtokens, because they must 296 | # not be used in CRF. 297 | mask = prediction_mask 298 | batch_size = logits.shape[0] 299 | 300 | if labels is not None: 301 | # Negative of the log likelihood. 302 | # Loop through the batch here because of 2 reasons: 303 | # 1- the CRF package assumes the mask tensor cannot have interleaved 304 | # zeros and ones. In other words, the mask should start with True 305 | # values, transition to False at some moment and never transition 306 | # back to True. That can only happen for simple padded sequences. 307 | # 2- The first column of mask tensor should be all True, and we 308 | # cannot guarantee that because we have to mask all non-first 309 | # subtokens of the WordPiece tokenization. 310 | loss = 0 311 | for seq_logits, seq_labels, seq_mask in zip(logits, labels, mask): 312 | # Index logits and labels using prediction mask to pass only the 313 | # first subtoken of each word to CRF. 314 | seq_logits = seq_logits[seq_mask].unsqueeze(0) 315 | seq_labels = seq_labels[seq_mask].unsqueeze(0) 316 | loss -= self.crf(seq_logits, seq_labels, 317 | reduction='token_mean') 318 | 319 | loss /= batch_size 320 | outputs['loss'] = loss 321 | 322 | else: 323 | # Same reasons for iterating 324 | output_tags = [] 325 | for seq_logits, seq_mask in zip(logits, mask): 326 | seq_logits = seq_logits[seq_mask].unsqueeze(0) 327 | tags = self.crf.decode(seq_logits) 328 | # Unpack "batch" results 329 | output_tags.append(tags[0]) 330 | 331 | outputs['y_pred'] = output_tags 332 | 333 | return outputs 334 | 335 | 336 | class BertLSTM(BertForNERClassification): 337 | """BERT model with an LSTM model as classifier. This model is meant to be 338 | used with frozen BERT schemes (feature-based). 339 | 340 | Args: 341 | config: BertConfig instance to build BERT model. 342 | lstm_hidden_size: hidden size of LSTM layers. Defaults to 100. 343 | lstm_layers: number of LSTM layers. Defaults to 1. 344 | kwargs: arguments to be passed to superclass. 
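    Note: when BERT is frozen via `freeze_bert()` (see `frozen_bert` in the
    superclass), the `input_ids` argument of `forward` is expected to already
    contain the precomputed BERT feature tensors rather than token ids.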
345 | """ 346 | 347 | def __init__(self, 348 | config: BertConfig, 349 | lstm_hidden_size: int = 100, 350 | lstm_layers: int = 1, 351 | **kwargs: Any): 352 | 353 | lstm_dropout = 0.2 if lstm_layers > 1 else 0 354 | self.lstm_hidden_size = lstm_hidden_size 355 | self.lstm_layers = lstm_layers 356 | pooler = kwargs.get('pooler', 'last') 357 | 358 | super().__init__(config, **kwargs) 359 | 360 | if pooler in ('last', 'sum'): 361 | lstm_input_size = config.hidden_size 362 | else: 363 | assert pooler == 'concat' 364 | lstm_input_size = 4 * config.hidden_size 365 | 366 | self.lstm = torch.nn.LSTM(input_size=lstm_input_size, 367 | hidden_size=lstm_hidden_size, 368 | num_layers=lstm_layers, 369 | dropout=lstm_dropout, 370 | batch_first=True, 371 | bidirectional=True) 372 | 373 | def _build_classifier(self, config, pooler): 374 | """Build label classifier.""" 375 | self.classifier = torch.nn.Linear(2 * self.lstm_hidden_size, 376 | config.num_labels) 377 | 378 | def _pack_bert_encoded_sequence(self, encoded_sequence, attention_mask): 379 | """Returns a PackedSequence to be used by LSTM. 380 | 381 | The encoded_sequence is the output of BERT, of shape (B, S, H). 382 | This method sorts the tensor by sequence length using the 383 | attention_mask along the batch dimension. Then it packs the sorted 384 | tensor. 385 | 386 | Args: 387 | ----- 388 | encoded_sequence (tensor): output of BERT. Shape: (B, S, H) 389 | attention_mask (tensor): Shape: (B, S) 390 | 391 | Returns: 392 | -------- 393 | sorted_encoded_sequence (tensor): sorted `encoded_sequence`. 394 | sorted_ixs (tensor): tensor of indices returned by `torch.sort` when 395 | performing the sort operation. These indices can be used to unsort 396 | the output of the LSTM. 397 | """ 398 | seq_lengths = attention_mask.sum(dim=1) # Shape: (B,) 399 | sorted_lengths, sort_ixs = torch.sort(seq_lengths, descending=True) 400 | 401 | sorted_encoded_sequence = encoded_sequence[sort_ixs, :, :] 402 | 403 | packed_sequence = torch.nn.utils.rnn.pack_padded_sequence( 404 | sorted_encoded_sequence, 405 | sorted_lengths, 406 | batch_first=True) 407 | 408 | return packed_sequence, sort_ixs 409 | 410 | def _unpack_lstm_output(self, packed_sequence, sort_ixs): 411 | """Unpacks and unsorts a sorted PackedSequence that is output by LSTM. 412 | 413 | Args: 414 | packed_sequence (PackedSequence): output of LSTM. Shape: (B, S, Hl) 415 | sort_ixs (tensor): the indexes of be used for unsorting. Shape: (B,) 416 | 417 | Returns: 418 | The unsorted sequence. 
419 | """ 420 | B = len(sort_ixs) 421 | 422 | # Unpack 423 | unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_sequence, 424 | batch_first=True) 425 | 426 | assert unpacked.shape <= (B, 512, 2 * self.lstm.hidden_size) 427 | 428 | # Prepare indices for unsort 429 | sort_ixs = sort_ixs.unsqueeze(1).unsqueeze(1) # (B, 1, 1) 430 | # (B, S, Hl) 431 | sort_ixs = sort_ixs.expand(-1, unpacked.shape[1], unpacked.shape[2]) 432 | # Unsort 433 | unsorted_sequence = (torch.zeros_like(unpacked) 434 | .scatter_(0, sort_ixs, unpacked)) 435 | 436 | return unsorted_sequence 437 | 438 | def forward_lstm(self, bert_encoded_sequence, attention_mask): 439 | packed_sequence, sorted_ixs = self._pack_bert_encoded_sequence( 440 | bert_encoded_sequence, attention_mask) 441 | 442 | packed_lstm_out, _ = self.lstm(packed_sequence) 443 | lstm_out = self._unpack_lstm_output(packed_lstm_out, sorted_ixs) 444 | 445 | return lstm_out 446 | 447 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, 448 | labels=None, prediction_mask=None): 449 | """Performs the forward pass of the network. 450 | 451 | Computes the logits, predicted tags and if `labels` is not None, it will 452 | it will calculate and return the the loss, that is, the negative 453 | log-likelihood of the batch. 454 | 455 | Args: 456 | input_ids: tensor of input token ids. 457 | token_type_ids: tensor of input sentence type id (0 or 1). Should be 458 | all zeros for NER. Can be safely set to `None`. 459 | attention_mask: mask tensor that should have value 0 for [PAD] 460 | tokens and 1 for other tokens. 461 | labels: tensor of gold NER tag label ids. Values should be ints in 462 | the range [0, config.num_labels - 1]. 463 | prediction_mask: mask tensor should have value 0 for tokens that do 464 | not have an associated prediction, such as [CLS] and WordPìece 465 | subtoken continuations (that start with ##). 466 | 467 | Returns: 468 | A dict with calculated tensors: 469 | - "logits" 470 | - "y_pred" 471 | - "loss" (if `labels` is not `None`) 472 | """ 473 | outputs = {} 474 | 475 | if self.frozen_bert: 476 | sequence_output = input_ids 477 | else: 478 | sequence_output = self.bert_encode( 479 | input_ids, token_type_ids, attention_mask) 480 | 481 | sequence_output = self.dropout(sequence_output) # (batch, seq, H) 482 | 483 | lstm_out = self.forward_lstm( 484 | sequence_output, attention_mask) # (batch, seq, Hl) 485 | sequence_output = self.dropout(lstm_out) 486 | 487 | logits = self.classifier(sequence_output) 488 | _, y_pred = torch.max(logits, dim=-1) 489 | y_pred = y_pred.cpu().numpy() 490 | outputs['logits'] = logits 491 | outputs['y_pred'] = y_pred 492 | 493 | if labels is not None: 494 | # Only keep active parts of the loss 495 | mask = prediction_mask 496 | if mask is not None: 497 | # Adjust mask and labels to have the same length as logits 498 | mask = mask[:, :logits.size(1)].contiguous() 499 | labels = labels[:, :logits.size(1)].contiguous() 500 | 501 | mask = mask.view(-1) 502 | active_logits = logits.view(-1, self.num_labels)[mask] 503 | active_labels = labels.view(-1)[mask] 504 | loss = self.loss_fct(active_logits, active_labels) 505 | else: 506 | loss = self.loss_fct( 507 | logits.view(-1, self.num_labels), labels.view(-1)) 508 | 509 | outputs['loss'] = loss 510 | 511 | return outputs 512 | 513 | 514 | class BertLSTMCRF(BertLSTM): 515 | """BERT model with an LSTM-CRF as classifier. This model is meant to be 516 | used with frozen BERT schemes (feature-based). 
517 | 518 | Args: 519 | config: BertConfig instance to build BERT model. 520 | kwargs: arguments to be passed to superclass (see BertLSTM). 521 | """ 522 | 523 | def __init__(self, config: BertConfig, **kwargs: Any): 524 | super().__init__(config, **kwargs) 525 | self.crf = CRF(num_tags=config.num_labels, batch_first=True) 526 | 527 | def forward(self, 528 | input_ids, 529 | token_type_ids=None, 530 | attention_mask=None, 531 | labels=None, 532 | prediction_mask=None, 533 | ) -> Dict[str, torch.Tensor]: 534 | """Performs the forward pass of the network. 535 | 536 | If `labels` are not None, it will calculate and return the the loss, 537 | that is the negative log-likelihood of the batch. 538 | Otherwise, it will calculate the most probable sequence outputs using 539 | Viterbi decoding and return a list of sequences (List[List[int]]) of 540 | variable lengths. 541 | 542 | Args: 543 | input_ids: tensor of input token ids. 544 | token_type_ids: tensor of input sentence type id (0 or 1). Should be 545 | all zeros for NER. Can be safely set to `None`. 546 | attention_mask: mask tensor that should have value 0 for [PAD] 547 | tokens and 1 for other tokens. 548 | labels: tensor of gold NER tag label ids. Values should be ints in 549 | the range [0, config.num_labels - 1]. 550 | prediction_mask: mask tensor should have value 0 for tokens that do 551 | not have an associated prediction, such as [CLS] and WordPìece 552 | subtoken continuations (that start with ##). 553 | 554 | Returns: 555 | A dict with calculated tensors: 556 | 557 | - "logits" 558 | - "loss" (if `labels` is not `None`) 559 | - "y_pred" (if `labels` is `None`) 560 | """ 561 | outputs = {} 562 | 563 | if self.frozen_bert: 564 | sequence_output = input_ids 565 | else: 566 | sequence_output = self.bert_encode( 567 | input_ids, token_type_ids, attention_mask) 568 | 569 | sequence_output = self.dropout(sequence_output) # (batch, seq, H) 570 | 571 | lstm_out = self.forward_lstm( 572 | sequence_output, attention_mask) # (batch, seq, Hl) 573 | sequence_output = self.dropout(lstm_out) 574 | logits = self.classifier(sequence_output) 575 | outputs['logits'] = logits 576 | 577 | mask = prediction_mask # (B, S) 578 | # Logits sequence length depends on the inputs: logits.shape <= (B, S) 579 | # We have to make the mask and labels the same size. 580 | mask = mask[:, :logits.size(1)].contiguous() 581 | 582 | if labels is not None: 583 | # Negative of the log likelihood. 584 | # Loop through the batch here because of 2 reasons: 585 | # 1- the CRF package assumes the mask tensor cannot have interleaved 586 | # zeros and ones. In other words, the mask should start with True 587 | # values, transition to False at some moment and never transition 588 | # back to True. That can only happen for simple padded sequences. 589 | # 2- The first column of mask tensor should be all True, and we 590 | # cannot guarantee that because we have to mask all non-first 591 | # subtokens of the WordPiece tokenization. 592 | labels = labels[:, :logits.size(1)].contiguous() 593 | batch_size = input_ids.size(0) 594 | loss = 0 595 | for seq_logits, seq_labels, seq_mask in zip(logits, labels, mask): 596 | # Index logits and labels using prediction mask to pass only the 597 | # first subtoken of each word to CRF. 
598 | seq_logits = seq_logits[seq_mask].unsqueeze(0) 599 | seq_labels = seq_labels[seq_mask].unsqueeze(0) 600 | loss -= self.crf(seq_logits, seq_labels, 601 | reduction='token_mean') 602 | 603 | loss /= batch_size 604 | outputs['loss'] = loss 605 | 606 | else: 607 | # Same reasons for iterating 608 | output_tags = [] 609 | for seq_logits, seq_mask in zip(logits, mask): 610 | seq_logits = seq_logits[seq_mask].unsqueeze(0) 611 | tags = self.crf.decode(seq_logits) 612 | # Unpack "batch" results 613 | output_tags.append(tags[0]) 614 | 615 | outputs['y_pred'] = output_tags 616 | 617 | return outputs 618 | -------------------------------------------------------------------------------- /ner_evaluation/trainer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """This file defines the `main` function that handles BERT, BERT-CRF, 17 | BERT-LSTM and BERT-LSTM-CRF training and evaluation on NER task. 18 | 19 | The `main` function should be imported and called by another script that passes 20 | functions to 1) load and preprocess input data and 2) define metrics evaluate 21 | the model during training or testing phases. 22 | 23 | For further information, see `main` function docstring and the ArgumentParser 24 | arguments. 25 | 26 | The code was inspired by Huggingface Tranformers' script for training and 27 | evaluating BERT on SQuAD dataset. 
28 | """ 29 | 30 | from __future__ import absolute_import, division, print_function 31 | 32 | import argparse 33 | import logging 34 | import os 35 | import random 36 | import sys 37 | from argparse import Namespace 38 | from typing import Any, Callable, Dict, List, Optional, Tuple 39 | 40 | import numpy as np 41 | import torch 42 | from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule 43 | from pytorch_transformers.tokenization_bert import BertTokenizer 44 | from torch import nn 45 | from torch.nn.utils import clip_grad_norm_ 46 | from torch.utils.data import (DataLoader, Dataset, RandomSampler, 47 | SequentialSampler) 48 | from torch.utils.data.distributed import DistributedSampler 49 | from tqdm.autonotebook import tqdm, trange 50 | 51 | from dataset import get_bert_encoded_dataset 52 | from eval_tools import SequenceMetrics, write_conll_prediction_file 53 | from postprocessing import OutputComposer 54 | from preprocessing import Example, InputSpan 55 | from results_writer import compile_results, write_jsonl_results 56 | from tag_encoder import NERTagsEncoder 57 | from utils import RunningAccumulator, load_model, save_model 58 | 59 | logger = logging.getLogger(__name__) 60 | 61 | 62 | def set_seed(seed: int) -> None: 63 | random.seed(seed) 64 | np.random.seed(seed) 65 | torch.manual_seed(seed) 66 | if torch.cuda.device_count() > 0: 67 | torch.cuda.manual_seed_all(seed) 68 | 69 | 70 | def prepare_dataloaders( 71 | args: Namespace, 72 | train_dataset: Dataset, 73 | valid_dataset: Optional[Dataset] = None, 74 | ) -> Tuple[DataLoader, DataLoader, Optional[DataLoader]]: 75 | """Instantiates the train, train evaluation and validation dataloaders (if 76 | needed).""" 77 | # Instantiate Dataloader 78 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 79 | if args.local_rank == -1: 80 | train_sampler = RandomSampler(train_dataset) 81 | else: 82 | train_sampler = DistributedSampler(train_dataset) 83 | train_dataloader = DataLoader( 84 | train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 85 | 86 | train_eval_sampler = SequentialSampler(train_dataset) 87 | train_eval_dataloader = DataLoader( 88 | train_dataset, 89 | sampler=train_eval_sampler, 90 | batch_size=args.train_batch_size) 91 | 92 | valid_dataloader = None 93 | if valid_dataset: 94 | valid_sampler = SequentialSampler(valid_dataset) 95 | valid_dataloader = DataLoader( 96 | valid_dataset, 97 | sampler=valid_sampler, 98 | batch_size=args.train_batch_size) 99 | 100 | # Logs 101 | logger.info(" Num examples = %d", len(train_dataset)) 102 | if valid_dataset: 103 | logger.info(" Num valid examples = %d", len(valid_dataset)) 104 | logger.info(" Num Epochs = %d", args.num_train_epochs) 105 | logger.info(" Instantaneous batch size per GPU = %d", 106 | args.per_gpu_train_batch_size) 107 | logger.info( 108 | " Total train batch size (w. parallel, distributed & accumulation) = %d", 109 | (args.train_batch_size * args.gradient_accumulation_steps * 110 | (torch.distributed.get_world_size() 111 | if args.local_rank != -1 else 1))) 112 | logger.info(" Gradient Accumulation steps = %d", 113 | args.gradient_accumulation_steps) 114 | 115 | return train_dataloader, train_eval_dataloader, valid_dataloader 116 | 117 | 118 | def prepare_optimizer_and_scheduler(args: Namespace, 119 | model: nn.Module, 120 | num_batches: int, 121 | ) -> Tuple[AdamW, WarmupLinearSchedule]: 122 | """Configures BERT's AdamW optimizer and WarmupLinearSchedule learning rate 123 | scheduler. 
Divides parameters into two learning rate groups, with higher 124 | learning rate for non-BERT parameters (classifier model).""" 125 | t_total = (num_batches // args.gradient_accumulation_steps * 126 | args.num_train_epochs) 127 | 128 | if args.local_rank != -1: 129 | t_total = t_total // torch.distributed.get_world_size() 130 | 131 | logger.info(" Total optimization steps = %d", t_total) 132 | 133 | # Prepare optimizer 134 | param_optimizer = list( 135 | filter(lambda p: p[1].requires_grad, model.named_parameters())) 136 | 137 | no_decay = ['bias', 'LayerNorm.weight'] 138 | higher_lr = ['classifier', 'crf', 'lstm'] 139 | 140 | def is_classifier_param(param_name: str) -> bool: 141 | return any(hl in param_name for hl in higher_lr) 142 | 143 | def ignore_in_weight_decay(param_name: str) -> bool: 144 | return any(nd in param_name for nd in no_decay) 145 | 146 | optimizer_grouped_parameters = [ 147 | {'params': [p for name, p in param_optimizer 148 | if not ignore_in_weight_decay(name) 149 | and not is_classifier_param(name)], 150 | 'weight_decay': 0.01}, 151 | {'params': [p for name, p in param_optimizer 152 | if not ignore_in_weight_decay(name) 153 | and is_classifier_param(name)], 154 | 'weight_decay': 0.01, 155 | 'lr': args.classifier_lr}, 156 | {'params': [p for name, p in param_optimizer 157 | if ignore_in_weight_decay(name) 158 | and not is_classifier_param(name)], 159 | 'weight_decay': 0.0}, 160 | ] 161 | 162 | # To reproduce BertAdam specific behavior set correct_bias=False 163 | optimizer = AdamW(optimizer_grouped_parameters, 164 | lr=args.learning_rate, 165 | correct_bias=False) 166 | num_warmup_steps = t_total * args.warmup_proportion 167 | scheduler = WarmupLinearSchedule(optimizer, 168 | warmup_steps=num_warmup_steps, 169 | t_total=t_total) 170 | 171 | return optimizer, scheduler 172 | 173 | 174 | def train(args: Namespace, 175 | model: torch.nn.Module, 176 | train_dataset: Dataset, 177 | train_metrics: SequenceMetrics, 178 | train_output_composer: OutputComposer, 179 | valid_dataset: Optional[Dataset] = None, 180 | valid_metrics: Optional[SequenceMetrics] = None, 181 | valid_output_composer: Optional[OutputComposer] = None) -> None: 182 | """Train routine.""" 183 | 184 | logger.info("***** Running training *****") 185 | 186 | train_dl, train_eval_dl, valid_dl = prepare_dataloaders( 187 | args, train_dataset, valid_dataset) 188 | 189 | optimizer, scheduler = prepare_optimizer_and_scheduler( 190 | args, model, num_batches=len(train_dl)) 191 | 192 | # Multi-gpu, distributed and fp16 setup 193 | if args.fp16: 194 | try: 195 | from apex import amp 196 | except ImportError: 197 | msg = ("Please install apex from " 198 | "https://www.github.com/nvidia/apex to use fp16 training.") 199 | raise ImportError(msg) 200 | model, optimizer = amp.initialize( 201 | model, optimizer, opt_level=args.fp16_opt_level) 202 | 203 | # multi-gpu training (should be after apex fp16 initialization) 204 | if args.n_gpu > 1: 205 | model = torch.nn.DataParallel(model) 206 | 207 | # Distributed training (should be after apex fp16 initialization) 208 | if args.local_rank != -1: 209 | model = torch.nn.parallel.DistributedDataParallel( 210 | model, 211 | device_ids=[args.local_rank], 212 | output_device=args.local_rank, 213 | find_unused_parameters=True) 214 | 215 | global_step = 0 216 | train_losses = [] 217 | if valid_dataset: 218 | min_val_loss = float('inf') 219 | 220 | # Training loop 221 | try: 222 | epoch_tqdm = trange(int(args.num_train_epochs), desc="Epoch") 223 | loss_accum = RunningAccumulator() 224 | 
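        # Epoch loop: optimize on the training data, compute train metrics
        # every 5 epochs (and on the last epoch), evaluate on the validation
        # set when one is given, and save the model whenever the validation
        # F1 score reaches a new best.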
for epoch in epoch_tqdm: 225 | model.train() 226 | stats = {} 227 | 228 | train_tqdm = tqdm(train_dl, desc="Iter") 229 | for step, batch in enumerate(train_tqdm): 230 | if args.n_gpu == 1: 231 | # multi-gpu does scattering it-self 232 | batch = tuple(t.to(args.device) for t in batch) 233 | # Unpack batch 234 | input_ids = batch[0] 235 | input_mask = batch[1] 236 | segment_ids = batch[2] 237 | label_ids = batch[3] 238 | prediction_mask = batch[4] 239 | # example_ixs = batch[5] 240 | # doc_span_ixs = batch[6] 241 | 242 | outs = model(input_ids, segment_ids, 243 | input_mask, label_ids, prediction_mask) 244 | loss = outs['loss'] 245 | if args.n_gpu > 1: 246 | loss = loss.mean() # mean() to average on multi-gpu. 247 | if args.gradient_accumulation_steps > 1: 248 | loss = loss / args.gradient_accumulation_steps 249 | 250 | loss_accum.accumulate(loss.item()) 251 | running_mean_loss = loss_accum.mean() 252 | train_tqdm.set_postfix({'loss': running_mean_loss}) 253 | 254 | if args.fp16: 255 | with amp.scale_loss(loss, optimizer) as scaled_loss: 256 | scaled_loss.backward() 257 | clip_grad_norm_(amp.master_params( 258 | optimizer), args.max_grad_norm) 259 | else: 260 | loss.backward() 261 | clip_grad_norm_(model.parameters(), args.max_grad_norm) 262 | 263 | if (step + 1) % args.gradient_accumulation_steps == 0: 264 | 265 | # Perform gradient clipping 266 | for group in optimizer.param_groups: 267 | for p in group['params']: 268 | if p.grad is None: 269 | continue 270 | clip_grad_norm_(p, 1) 271 | 272 | scheduler.step() 273 | optimizer.step() 274 | optimizer.zero_grad() 275 | 276 | global_step += 1 277 | 278 | train_losses.append(loss_accum.mean()) 279 | 280 | stats['loss'] = format_tqdm_metric(train_losses[-1], 281 | float(min(train_losses)), 282 | fmt='{:.3e}') 283 | 284 | # Evaluate train set 285 | if epoch % 5 == 0 or epoch == args.num_train_epochs - 1: 286 | trn_epoch_metrics = evaluate( 287 | args, 288 | model, 289 | tqdm(train_eval_dl, desc="Train metrics"), 290 | train_output_composer, 291 | train_metrics, 292 | ) 293 | 294 | stats['trn_f1'] = format_tqdm_metric( 295 | trn_epoch_metrics['f1_score'], 296 | train_metrics.get_best('f1_score'), 297 | fmt='{:.2%}') 298 | 299 | epoch_tqdm.set_postfix(stats) 300 | epoch_tqdm.refresh() 301 | 302 | if valid_dataset: 303 | # Evaluate validation set 304 | val_epoch_metrics = evaluate( 305 | args, 306 | model, 307 | tqdm(valid_dl, desc="Validation"), 308 | valid_output_composer, 309 | valid_metrics, 310 | ) 311 | 312 | # Show metrics on tqdm 313 | if 'loss' in val_epoch_metrics: 314 | epoch_val_loss = val_epoch_metrics['loss'] 315 | min_val_loss = min(min_val_loss, epoch_val_loss) 316 | stats['val_loss'] = format_tqdm_metric( 317 | epoch_val_loss, min_val_loss, fmt='{:.3e}') 318 | 319 | stats['val_f1'] = format_tqdm_metric( 320 | val_epoch_metrics['f1_score'], 321 | valid_metrics.get_best('f1_score'), 322 | fmt='{:.2%}') 323 | 324 | best_epoch = valid_metrics.get_best_epoch('f1_score') 325 | stats['best_epoch'] = best_epoch 326 | 327 | # Save model if best epoch 328 | if best_epoch == epoch + 1: 329 | tqdm.write('Best epoch. 
Saving model.') 330 | save_model(model, args) 331 | 332 | epoch_tqdm.set_postfix(stats) 333 | epoch_tqdm.refresh() 334 | 335 | # End of training 336 | if args.valid_file: 337 | logger.info(" Validation F1 scores: %s", 338 | valid_metrics.history['f1_score']) 339 | best_epoch = valid_metrics.get_best_epoch('f1_score') 340 | logger.info(" Validation confusion matrix:") 341 | logger.info(" Epoch %d", best_epoch) 342 | conf_mat = valid_metrics.get_value("confusion_matrix", best_epoch) 343 | logger.info("\n" + str(conf_mat)) 344 | logger.info(" Validation classification report:") 345 | classif_report = valid_metrics.get_value( 346 | "classification_report", best_epoch) 347 | logger.info("\n" + str(classif_report)) 348 | 349 | except KeyboardInterrupt: 350 | action = '' 351 | while action.lower() not in ('y', 'n'): 352 | action = input( 353 | '\nInterrupted. Continue execution to save model ' 354 | 'weights? [Y/n]') 355 | if action == 'n': 356 | sys.exit() 357 | 358 | if not valid_dataset: 359 | # If not using valid dataset, save model of last epoch 360 | logger.info('Saving model from last epoch.') 361 | save_model(model, args) 362 | 363 | if args.results_file: 364 | # Append this run results 365 | write_jsonl_results( 366 | compile_results(args, train_metrics, 367 | valid_metrics, train_losses=train_losses), 368 | args.results_file, 369 | ) 370 | 371 | 372 | def evaluate(args: Namespace, 373 | model: nn.Module, 374 | dataloader: DataLoader, 375 | output_composer: OutputComposer, 376 | sequence_metrics: SequenceMetrics, 377 | reset: bool = True, 378 | ) -> Dict[str, Any]: 379 | """Runs a model forward pass on the entire dataloader to compute predictions 380 | for all examples. Final predictions are gathered in `output_composer`, 381 | combining the max-context tokens of each forward pass. Returns the 382 | metrics dict computed by `sequence_metrics.calculate_metrics()`.""" 383 | # Evaluate 384 | model.eval() 385 | 386 | losses = [] 387 | for step, batch in enumerate(dataloader): 388 | if args.n_gpu == 1: 389 | batch = tuple(t.to(args.device) for t in batch) 390 | # Unpack batch 391 | input_ids = batch[0] 392 | input_mask = batch[1] 393 | segment_ids = batch[2] 394 | label_ids = batch[3] 395 | prediction_mask = batch[4] 396 | example_ixs = batch[5] 397 | doc_span_ixs = batch[6] 398 | 399 | with torch.no_grad(): 400 | if args.no_crf: 401 | # BERT or BERT-LSTM 402 | outs = model( 403 | input_ids, 404 | segment_ids, 405 | input_mask, 406 | labels=label_ids, 407 | prediction_mask=prediction_mask) 408 | else: 409 | # BERT-CRF or BERT-LSTM-CRF. 410 | # We do not pass `labels` otherwise y_pred is not calculated. 
411 | outs = model( 412 | input_ids, 413 | segment_ids, 414 | input_mask, 415 | prediction_mask=prediction_mask) 416 | 417 | y_pred = outs['y_pred'] 418 | 419 | output_composer.insert_batch(example_ixs, doc_span_ixs, y_pred) 420 | 421 | loss = outs.get('loss') 422 | if loss is not None: 423 | loss = loss.item() 424 | losses.append(loss) 425 | 426 | y_true = [example.labels for example in output_composer.examples] 427 | y_pred = output_composer.get_outputs() 428 | metrics = sequence_metrics.calculate_metrics(y_true, y_pred) 429 | 430 | if losses: 431 | metrics['loss'] = float(np.mean(losses)) 432 | 433 | return metrics 434 | 435 | 436 | def format_tqdm_metric(value: float, best_value: float, fmt: str) -> str: 437 | """Formats a value to display in tqdm.""" 438 | if value == best_value: 439 | return (fmt + '*').format(value) 440 | 441 | return (fmt + ' (' + fmt + '*)').format(value, best_value) 442 | 443 | 444 | def main( 445 | load_and_cache_examples_fn: Callable[ 446 | [Namespace, BertTokenizer, NERTagsEncoder, str], 447 | Tuple[Dataset, List[Example], List[InputSpan]]], 448 | get_train_metrics_fn: Callable[[NERTagsEncoder], SequenceMetrics], 449 | get_valid_metrics_fn: Callable[[NERTagsEncoder], SequenceMetrics], 450 | get_eval_metrics_fn: Callable[[NERTagsEncoder], SequenceMetrics] 451 | ): 452 | """Script entry-point. Performs training and/or evaluation routines. 453 | 454 | This function handles model training and evaluation. All arguments are 455 | functions that handle 1) training and evaluation data loading and 456 | preprocessing or 2) defining evaluation metrics. By modifying these 457 | functions, one can adapt this script to other NER datasets in distinct 458 | formats. 459 | 460 | Args: 461 | load_and_cache_examples_fn: a function that handles dataset loading and 462 | preprocessing. The data should be loaded and converted into 463 | `preprocessing.Example` instances, that can then be used to 464 | generate InputSpans and a BERT-ready Dataset. 465 | 466 | This function receives the following inputs: 467 | 468 | args: a Namespace object of parsed CLI arguments with model 469 | hyperparameters and dataset input files. 470 | bert_tokenizer: a loaded instance of BertTokenizer. 471 | tag_encoder: a NERTagsEncoder instance created from the tasks NER 472 | classes. 473 | mode: a mode string (train|valid|eval) to select which input file 474 | to read (args.train_file, args.valid_file or args.eval_file). 475 | 476 | get_train_metrics_fn: a function that receives a NERTagsEncoder and 477 | returns a SequenceMetrics object to evaluate the model on train 478 | data during training (`--do_train`). 479 | get_valid_metrics_fn: a function that receives a NERTagsEncoder and 480 | returns a SequenceMetrics object to evaluate the model on 481 | validation data during training (`--do_train`). 482 | get_eval_metrics_fn: a function that receives a NERTagsEncoder and 483 | returns a SequenceMetrics object to evaluate the model on test data 484 | during evaluation (`--do_eval`). 485 | """ 486 | parser = argparse.ArgumentParser() 487 | 488 | # Model and hyperparameters 489 | parser.add_argument("--bert_model", default=None, type=str, required=True, 490 | help="Bert pre-trained model name or path to a " 491 | "checkpoint directory.") 492 | parser.add_argument("--tokenizer_model", default=None, type=str, 493 | required=False, 494 | help="Path to tokenizer files. 
If empty, defaults to " 495 | "--bert_model.") 496 | parser.add_argument("--do_lower_case", 497 | action='store_true', 498 | help="Whether to lower case the input text. True for " 499 | "uncased models, False for cased models.") 500 | parser.add_argument("--max_seq_length", default=512, type=int, 501 | help="The maximum total input sequence length after " 502 | "WordPiece tokenization. Sequences longer than this " 503 | "will be split into multiple spans, and sequences " 504 | "shorter than this will be padded.") 505 | parser.add_argument("--doc_stride", default=128, type=int, 506 | help="When splitting up a long document into chunks, " 507 | "how much stride to take between chunks.") 508 | parser.add_argument('--labels_file', 509 | required=True, 510 | help="File with all NER classes to be considered, one " 511 | "per line.") 512 | parser.add_argument('--scheme', 513 | default='bio', help='NER tagging scheme (BIO|BILUO).') 514 | parser.add_argument('--no_crf', 515 | action='store_true', 516 | help='Remove the CRF layer (use plain BERT or ' 517 | 'BERT-LSTM).') 518 | parser.add_argument('--pooler', 519 | default='last', 520 | help='Pooling strategy for extracting BERT encoded ' 521 | 'features from last BERT layers. ' 522 | 'One of "last", "sum" or "concat".') 523 | parser.add_argument('--freeze_bert', 524 | action='store_true', 525 | help="Freeze BERT layers' parameters. If True, uses " 526 | "either a BERT-LSTM or BERT-LSTM-CRF model.") 527 | parser.add_argument('--lstm_hidden_size', 528 | type=int, 529 | default=100, 530 | help=('Hidden dimension of the LSTM (only used when ' 531 | 'the BERT model is frozen).')) 532 | parser.add_argument('--lstm_layers', 533 | type=int, 534 | default=1, 535 | help=('Number of LSTM layers (only used when the BERT ' 536 | 'model is frozen).')) 537 | # General 538 | parser.add_argument("--output_dir", default=None, type=str, required=True, 539 | help="The output directory where the model checkpoints" 540 | " and predictions will be written.") 541 | parser.add_argument("--no_cuda", 542 | action='store_true', 543 | help="Whether not to use CUDA when available") 544 | parser.add_argument("--verbose_logging", action='store_true', 545 | help="If true, all of the warnings related to data " 546 | "processing will be printed.") 547 | parser.add_argument('--override_cache', 548 | action='store_true', 549 | help='Override feature caches of input files.') 550 | 551 | # Training related 552 | parser.add_argument("--do_train", action='store_true', 553 | help="Whether to run training.") 554 | parser.add_argument("--train_file", default=None, 555 | type=str, help="JSON for training.") 556 | parser.add_argument("--valid_file", default=None, type=str, 557 | help="JSON for validating during training.") 558 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, 559 | help="Batch size per GPU/CPU for training.") 560 | parser.add_argument("--learning_rate", default=5e-5, 561 | type=float, help="The initial learning rate for Adam.") 562 | parser.add_argument('--classifier_lr', 563 | type=float, 564 | default=1e-3, 565 | help='Learning rate of the classifier and CRF layers.') 566 | parser.add_argument("--num_train_epochs", default=3.0, type=float, 567 | help="Total number of training epochs to perform.") 568 | parser.add_argument("--warmup_proportion", default=0.1, type=float, 569 | help="Proportion of training to perform linear " 570 | "learning rate warmup for. 
E.g., 0.1 = 10%% " 571 | "of training.") 572 | parser.add_argument('--seed', 573 | type=int, 574 | default=42, 575 | help="random seed for initialization") 576 | parser.add_argument('--gradient_accumulation_steps', 577 | type=int, 578 | default=1, 579 | help="Number of update steps to accumulate before " 580 | "performing a backward/update pass.") 581 | parser.add_argument('--max_grad_norm', 582 | type=float, 583 | default=1., 584 | help="Maximum value of gradient norm on update.") 585 | parser.add_argument("--local_rank", 586 | type=int, 587 | default=-1, 588 | help="local_rank for distributed training on gpus") 589 | parser.add_argument('--fp16', 590 | action='store_true', 591 | help="Whether to use 16-bit float precision instead of" 592 | " 32-bit") 593 | parser.add_argument('--loss_scale', 594 | type=float, default=0, 595 | help="Loss scaling to improve fp16 numeric stability. " 596 | "Only used when fp16 set to True.\n" 597 | "0 (default value): dynamic loss scaling.\n" 598 | "Positive power of 2: static loss scaling " 599 | "value.\n") 600 | parser.add_argument('--few_samples', 601 | type=int, default=-1, 602 | help="Turn on few samples for training.") 603 | parser.add_argument('--results_file', 604 | default=None, 605 | required=False, 606 | help='Optional JSONlines file to log train runs.') 607 | 608 | # Evaluation related 609 | parser.add_argument("--do_eval", action='store_true', 610 | help="Whether to run eval on the test set.") 611 | parser.add_argument("--eval_file", default=None, type=str, 612 | help="JSON for evaluating the model.") 613 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, 614 | help="Batch size per GPU/CPU for evaluation.") 615 | 616 | args = parser.parse_args() 617 | 618 | # Setup CUDA, GPU & distributed training 619 | if args.local_rank == -1 or args.no_cuda: 620 | device = torch.device( 621 | "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 622 | args.n_gpu = torch.cuda.device_count() 623 | else: 624 | # Initializes the distributed backend which will take care of 625 | # synchronizing nodes/GPUs 626 | torch.cuda.set_device(args.local_rank) 627 | device = torch.device("cuda", args.local_rank) 628 | torch.distributed.init_process_group(backend='nccl') 629 | args.n_gpu = 1 630 | args.device = device 631 | 632 | logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits " 633 | "training: {}".format( 634 | device, args.n_gpu, bool(args.local_rank != -1), args.fp16)) 635 | logger.info("seed: {}, output_dir: {}".format(args.seed, args.output_dir)) 636 | 637 | if args.gradient_accumulation_steps < 1: 638 | message = ("Invalid gradient_accumulation_steps parameter: {}, should " 639 | "be >= 1".format(args.gradient_accumulation_steps)) 640 | raise ValueError(message) 641 | 642 | set_seed(args.seed) 643 | 644 | if not args.do_train and not args.do_eval: 645 | raise ValueError( 646 | "At least one of `do_train` or `do_eval` must be " 647 | "True.") 648 | 649 | if args.do_train: 650 | if not args.train_file: 651 | raise ValueError( 652 | "If `do_train` is True, then `train_file` must be specified.") 653 | if args.do_eval: 654 | if not args.eval_file: 655 | raise ValueError( 656 | "If `do_eval` is True, then `eval_file` must be " 657 | "specified.") 658 | 659 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \ 660 | and args.do_train: 661 | raise ValueError( 662 | "Output directory ({}) already exists and is not empty.".format(args.output_dir)) 663 | if not os.path.exists(args.output_dir): 664 | os.makedirs(args.output_dir) 665
| 666 | tokenizer_path = args.tokenizer_model or args.bert_model 667 | tokenizer = BertTokenizer.from_pretrained( 668 | tokenizer_path, do_lower_case=args.do_lower_case) 669 | 670 | # Instantiate NER Tag encoder 671 | tag_encoder = NERTagsEncoder.from_labels_file( 672 | args.labels_file, scheme=args.scheme.upper()) 673 | 674 | args.num_labels = tag_encoder.num_labels 675 | 676 | # Load a pretrained model 677 | model = load_model(args, args.bert_model, training=args.do_train) 678 | model.to(device) 679 | 680 | train_examples = None 681 | valid_dataset, valid_examples, valid_features = None, None, None 682 | valid_metrics, valid_output_comp = None, None 683 | # Train 684 | if args.do_train: 685 | # Read examples and get features and dataset 686 | train_dataset, train_examples, train_features = load_and_cache_examples_fn( 687 | args, 688 | tokenizer, 689 | tag_encoder, 690 | mode='train', 691 | ) 692 | 693 | # Instantiate OutputComposer to post-process train examples 694 | train_output_comp = OutputComposer( 695 | train_examples, 696 | train_features, 697 | output_transform_fn=tag_encoder.convert_ids_to_tags) 698 | 699 | if args.valid_file: 700 | logger.info("Reading validation examples.") 701 | 702 | valid_dataset, valid_examples, valid_features = load_and_cache_examples_fn( 703 | args, 704 | tokenizer, 705 | tag_encoder, 706 | mode='valid', 707 | ) 708 | # Instantiate OutputComposer to post-process valid examples 709 | valid_output_comp = OutputComposer( 710 | valid_examples, 711 | valid_features, 712 | output_transform_fn=tag_encoder.convert_ids_to_tags) 713 | 714 | if args.freeze_bert: 715 | # Freeze BERT layers 716 | logger.info("Freezing BERT layers.") 717 | model.freeze_bert() 718 | assert model.frozen_bert 719 | 720 | logger.info("Creating BERT encoded datasets...") 721 | 722 | train_dataset = get_bert_encoded_dataset( 723 | model, train_dataset, args.per_gpu_train_batch_size, 724 | args.device) 725 | if valid_dataset: 726 | valid_dataset = get_bert_encoded_dataset( 727 | model, valid_dataset, args.per_gpu_train_batch_size, 728 | args.device) 729 | 730 | # Initialize Metrics tracker 731 | train_metrics = get_train_metrics_fn(tag_encoder) 732 | 733 | if args.valid_file: 734 | valid_metrics = get_valid_metrics_fn(tag_encoder) 735 | 736 | # Training loop 737 | train( 738 | args, 739 | model, 740 | train_dataset, 741 | train_metrics=train_metrics, 742 | train_output_composer=train_output_comp, 743 | valid_dataset=valid_dataset, 744 | valid_metrics=valid_metrics, 745 | valid_output_composer=valid_output_comp, 746 | ) 747 | 748 | # Save tokenizer 749 | tokenizer.save_pretrained(args.output_dir) 750 | 751 | # Load a trained model and config that you have fine-tuned 752 | logger.info('Loading best model') 753 | model = load_model(args, model_path=args.output_dir, training=False) 754 | model.to(device) 755 | 756 | if args.do_eval and ( 757 | args.local_rank == -1 or torch.distributed.get_rank() == 0): 758 | 759 | logger.info("Reading evaluation examples.") 760 | eval_dataset, eval_examples, eval_features = load_and_cache_examples_fn( 761 | args, 762 | tokenizer, 763 | tag_encoder, 764 | mode='eval', 765 | ) 766 | # Instantiate OutputComposer to post-process eval examples 767 | eval_output_comp = OutputComposer( 768 | eval_examples, 769 | eval_features, 770 | output_transform_fn=tag_encoder.convert_ids_to_tags) 771 | 772 | logger.info("***** Running evaluation predictions *****") 773 | logger.info(" Num orig examples = %d", len(eval_examples)) 774 | logger.info(" Num split examples = %d", len(eval_features)) 775 | logger.info(" Batch size 
= %d", args.per_gpu_eval_batch_size) 776 | 777 | # Run prediction for full data 778 | eval_sampler = SequentialSampler(eval_dataset) 779 | eval_dataloader = DataLoader(eval_dataset, 780 | sampler=eval_sampler, 781 | batch_size=args.per_gpu_eval_batch_size, 782 | num_workers=os.cpu_count()) 783 | 784 | # Define SequenceMetrics that handle the postprocessing 785 | eval_metrics = get_eval_metrics_fn(tag_encoder) 786 | 787 | model.frozen_bert = False 788 | 789 | metrics = evaluate( 790 | args, 791 | model, 792 | tqdm(eval_dataloader, desc="Evaluation"), 793 | eval_output_comp, 794 | eval_metrics, 795 | reset=False, 796 | ) 797 | 798 | # Display and save test metrics 799 | metrics_values = [] 800 | for metric_name in ('f1_score', 'precision', 'recall'): 801 | metric_value = metrics[metric_name] 802 | metrics_values.append(metric_value) 803 | logger.info("%s: %s", metric_name, metric_value) 804 | 805 | with open(os.path.join(args.output_dir, 'metrics.txt'), 'w') as fd: 806 | fd.write(','.join(map(str, metrics_values))) 807 | 808 | logger.info('Classification report:') 809 | logger.info('\n%s', metrics['classification_report']) 810 | 811 | conll_file = os.path.join(args.output_dir, 'predictions_conll.txt') 812 | logger.info('Writing CoNLL style prediction file to %s.', conll_file) 813 | 814 | # Get predictions for all examples 815 | y_pred = eval_output_comp.get_outputs() 816 | # Filter invalid predictions 817 | y_pred_filt = [tag_encoder.decode_valid(preds) for preds in y_pred] 818 | 819 | # Write CoNLL file 820 | write_conll_prediction_file(conll_file, eval_examples, y_pred_filt) 821 | -------------------------------------------------------------------------------- /ner_evaluation/data/FirstHAREM-selective-dev.json: -------------------------------------------------------------------------------- 1 | [{"doc_id": "HAREM-361-02413", "doc_text": "\nFernando Ferreira\n[click for a page in english]\nCMAF- Universidade de Lisboa Gabinete A2-31 Avenida Professor Gama Pinto, 2 Telefone do Gabinete: 217904893 P-1649-003 Lisboa Extens\u00e3o interna: 4293 Portugal Email: ferferr@cii.fc.ul.pt Departamento de Matem\u00e1tica | Faculdade de Ci\u00eancias | Universidade de Lisboa | CMAF\nApresenta\u00e7\u00e3o\nBem vindos \u00e0minha p\u00e1gina pessoal. Sou Professor Associado do Departamento de Matem\u00e1tica da Universidade de Lisboa e membro do Centro de Matem\u00e1tica e Aplica\u00e7\u00f5es Fundamentais - CMAF. Clique aqui para obter o meu CV.\nInteresses Acad\u00e9micos\nL\u00f3gica Matem\u00e1tica, em especial teorias fracas da aritm\u00e9tica e da an\u00e1lise. Filosofia e Fundamentos de Matem\u00e1tica . Tenho um interesse amador (no sentido latino da palavra) por alguns problemas da Filosofia Antiga , particularmente no problema da falsidade em Parm\u00e9nides e Plat\u00e3o. Tamb\u00e9m escrevi alguns ensaios exposit\u00f3rios sobre temas da l\u00f3gica: clique aqui para os ver.\nEnsino\nNo presente semestre dou aulas te\u00f3rico-pr\u00e1ticas de \u00c1lgebra 2, cadeira do segundo ano das licenciaturas em Matem\u00e1tica. O Professor Jos\u00e9 Perdig\u00e3o Dias da Silva \u00e9o regente da cadeira.\nNo semestre passado fui respons\u00e1vel pelas cadeiras de Topologia e Introdu\u00e7\u00e3o \u00e0An\u00e1lise Funcional, do terceiro ano das licenciaturas em Matem\u00e1tica, e de Teoria da Demonstra\u00e7\u00e3o, do \nMestrado em Matem\u00e1tica.\nNo ano passado ensinei a cadeira de L\u00f3gica Matem\u00e1tica aos finalistas de Matem\u00e1tica e licenciaturas relacionadas. 
Clique aqui para\nver a p\u00e1gina desta cadeira. Tamb\u00e9m dei a cadeira L\u00f3gica de Primeira-Ordem ao primeiro ano das licenciaturas em Inform\u00e1tica e Engenharia da Linguagem e do Conhecimento. A p\u00e1gina web desta cadeira ainda se encontra dispon\u00edvel on-line em html://www.alf1.cii.fc.ul.pt/~ferferr/lpo.html.\nTamb\u00e9m colaboro no Mestrado em Filosofia da Linguagem e da Consci\u00eancia da Faculdade de Letras.\nEventos\nDe 25 a 28 de Junho decorrer\u00e1 em Lisboa, no CMAF, a School on Real Algebraic and Analytic Geometry and O-minimal Structures .\n\u00c0s quintas-feiras decorre o Semin\u00e1rio de L\u00f3gica Matem\u00e1tica (SLM), organizado por mim e pelo Professor Narciso Garcia do Instituto Superior T\u00e9cnico. Se quiser ter not\u00edcias regulares sobre o SLM, por favor contacte-me.\nV\u00e1ria\nSou co-editor da Disputatio , uma revista de Filosofia Anal\u00edtica.\n", "entities": [{"entity_id": "23", "text": "Fernando Ferreira", "label": "PESSOA", "start_offset": 1, "end_offset": 18}, {"entity_id": "24", "text": "CMAF", "label": "ORGANIZACAO", "start_offset": 49, "end_offset": 53}, {"entity_id": "25", "text": "Universidade de Lisboa", "label": "ORGANIZACAO", "start_offset": 55, "end_offset": 77}, {"entity_id": "26", "text": "Gabinete A2-31 Avenida Professor Gama Pinto, 2", "label": "LOCAL", "start_offset": 78, "end_offset": 124}, {"entity_id": "28", "text": "Lisboa", "label": "LOCAL", "start_offset": 168, "end_offset": 174}, {"entity_id": "30", "text": "Portugal", "label": "LOCAL", "start_offset": 198, "end_offset": 206}, {"entity_id": "32", "text": "Departamento de Matem\u00e1tica", "label": "ORGANIZACAO", "start_offset": 235, "end_offset": 261}, {"entity_id": "33", "text": "Faculdade de Ci\u00eancias", "label": "ORGANIZACAO", "start_offset": 264, "end_offset": 285}, {"entity_id": "34", "text": "Universidade de Lisboa", "label": "ORGANIZACAO", "start_offset": 288, "end_offset": 310}, {"entity_id": "35", "text": "CMAF", "label": "ORGANIZACAO", "start_offset": 313, "end_offset": 317}, {"entity_id": "36", "text": "Professor Associado", "label": "PESSOA", "start_offset": 369, "end_offset": 388}, {"entity_id": "37", "text": "Departamento de Matem\u00e1tica da Universidade de Lisboa", "label": "ORGANIZACAO", "start_offset": 392, "end_offset": 444}, {"entity_id": "38", "text": "Centro de Matem\u00e1tica e Aplica\u00e7\u00f5es Fundamentais", "label": "ORGANIZACAO", "start_offset": 457, "end_offset": 503}, {"entity_id": "39", "text": "CMAF", "label": "ORGANIZACAO", "start_offset": 506, "end_offset": 510}, {"entity_id": "48", "text": "Professor Jos\u00e9 Perdig\u00e3o Dias da Silva", "label": "PESSOA", "start_offset": 1065, "end_offset": 1102}, {"entity_id": "61", "text": "Faculdade de Letras", "label": "ORGANIZACAO", "start_offset": 1816, "end_offset": 1835}, {"entity_id": "62", "text": "25", "label": "TEMPO", "start_offset": 1848, "end_offset": 1850}, {"entity_id": "63", "text": "28 de Junho", "label": "TEMPO", "start_offset": 1853, "end_offset": 1864}, {"entity_id": "64", "text": "Lisboa", "label": "LOCAL", "start_offset": 1878, "end_offset": 1884}, {"entity_id": "65", "text": "CMAF", "label": "LOCAL", "start_offset": 1889, "end_offset": 1893}, {"entity_id": "69", "text": "Professor Narciso Garcia", "label": "PESSOA", "start_offset": 2063, "end_offset": 2087}, {"entity_id": "70", "text": "Instituto Superior T\u00e9cnico", "label": "ORGANIZACAO", "start_offset": 2091, "end_offset": 2117}]}, {"doc_id": "HAREM-281-01176", "doc_text": "\nBOMBEIROS VOLUNT\u00c1RIOS DE 
VILA NOVA DE OLIVEIRINHA\nClique aqui para ENTRAR NO MENU\n(Fotografia do Quartel Constru\u00eddo em 1935)\nCLIQUE AQUI para enviar uma mensagem\nVisitas desde 13/05/2001\nMensagem do Presidente da Direc\u00e7\u00e3o\nCaros amigos dos Bombeiros:\nA nossa p\u00e1gina na Internet j\u00e1 est\u00e1 activa desde o dia 13 de Maio de 2001, data em que se comemorou mais uma Festa dos Carolos (2001) .\nA Festa dos Carolos \u00e9 uma tradi\u00e7\u00e3o desta terra que os Bombeiros querem manter viva neste come\u00e7o do novo mil\u00e9nio.\nE como nesta nova era as solicita\u00e7\u00f5es s\u00e3o diversas, os Bombeiros Volunt\u00e1rios de Vila Nova de Oliveirinha t\u00eam bem presente os novos desafios.\nAssim, paralelamente \u00e0 constru\u00e7\u00e3o do Novo Quartel dos Bombeiros --temos dado passos bastante importantes!--, vamos continuar a melhorar esta p\u00e1gina na Internet.\nContinuamos a receber conte\u00fados para dotarmos esta p\u00e1gina com bastante informa\u00e7\u00e3o, pelo que a vossa ajuda pode ser determinante. Para tal,\npodem escrever-nos, enviar um fax ou uma mensagem via correio electr\u00f3nico. Para isso, visite a p\u00e1gina de CONTACTOS .\nVamos todos ajudar os Bombeiros.\nVamos todos divulgar aquilo que \u00e9 este corpo, o Corpo dos Bombeiros Volunt\u00e1rios de V. Nova de Oliveirinha.\nCaros amigos, fiquem pois atentos \u00e0s actualiza\u00e7\u00f5es desta p\u00e1gina.\nCom os melhores cumprimentos.\nEduardo Pereira\n(Presidente da Direc\u00e7\u00e3o)\nNOTA:\nClique aqui ou na imagem do Quartel para continuar a navegar!\n", "entities": [{"entity_id": "74", "text": "BOMBEIROS VOLUNT\u00c1RIOS DE VILA NOVA DE OLIVEIRINHA", "label": "ORGANIZACAO", "start_offset": 1, "end_offset": 50}, {"entity_id": "75", "text": "1935", "label": "TEMPO", "start_offset": 120, "end_offset": 124}, {"entity_id": "76", "text": "13/05/2001", "label": "TEMPO", "start_offset": 177, "end_offset": 187}, {"entity_id": "77", "text": "Presidente da Direc\u00e7\u00e3o", "label": "PESSOA", "start_offset": 200, "end_offset": 222}, {"entity_id": "78", "text": "Bombeiros", "label": "PESSOA", "start_offset": 240, "end_offset": 249}, {"entity_id": "79", "text": "Internet", "label": "LOCAL", "start_offset": 269, "end_offset": 277}, {"entity_id": "80", "text": "13 de Maio de 2001", "label": "TEMPO", "start_offset": 305, "end_offset": 323}, {"entity_id": "82", "text": "2001", "label": "TEMPO", "start_offset": 378, "end_offset": 382}, {"entity_id": "84", "text": "Bombeiros", "label": "PESSOA", "start_offset": 440, "end_offset": 449}, {"entity_id": "85", "text": "Bombeiros Volunt\u00e1rios de Vila Nova de Oliveirinha", "label": "ORGANIZACAO", "start_offset": 554, "end_offset": 603}, {"entity_id": "86", "text": "Internet", "label": "LOCAL", "start_offset": 791, "end_offset": 799}, {"entity_id": "87", "text": "Bombeiros", "label": "PESSOA", "start_offset": 1079, "end_offset": 1088}, {"entity_id": "88", "text": "Corpo dos Bombeiros Volunt\u00e1rios de V. 
Nova de Oliveirinha", "label": "ORGANIZACAO", "start_offset": 1138, "end_offset": 1195}, {"entity_id": "89", "text": "Eduardo Pereira", "label": "PESSOA", "start_offset": 1292, "end_offset": 1307}, {"entity_id": "90", "text": "Presidente da Direc\u00e7\u00e3o", "label": "PESSOA", "start_offset": 1309, "end_offset": 1331}]}, {"doc_id": "HAREM-284-04226", "doc_text": "\nSunab autua empresas por alta abusiva dos pre\u00e7os \nDa Sucursal de Bras\u00edlia e da Reportagem Local\nA Sunab (Superintend\u00eancia Nacional de Abastecimento) autuou 62 estabelecimentos comerciais em 16 Estados entre 27 de junho e 8 de julho \u00faltimo. \nO motivo da autua\u00e7\u00e3o foi a pr\u00e1tica de aumento abusivo de pre\u00e7os acima da varia\u00e7\u00e3o dos custos de acordo com a nova Lei Antitruste (n\u00ba 8.884/94). \nA fiscaliza\u00e7\u00e3o tamb\u00e9m foi motivada pelo descumprimento de normas de comercializa\u00e7\u00e3o. \nEntre os autuados, est\u00e3o seis supermercados e oito ind\u00fastrias. \nA Sunab tamb\u00e9m constatou a pr\u00e1tica de aumento abusivo de pre\u00e7os em outros 23 estabelecimentos comerciais. \nSupermercados \nA Procuradoria do Estado de S\u00e3o Paulo deve finalizar os pareceres sobre os sete supermercados autuados pelo Procon dentro de uma semana . \nOs autuados foram: O Barateiro, Carrefour, P\u00e3o de A\u00e7\u00facar, C\u00e2ndia, Extra, Eldorado e Paes Mendon\u00e7a. \nEles teriam vendido em mar\u00e7o acima da m\u00e9dia dos \u00faltimos quatro meses de 93. \nOs supermercadistas apresentaram defesa. \nAverigua\u00e7\u00e3o \nAs empresas de vale-refei\u00e7\u00e3o dever\u00e3o ser alvo de um processo de averigua\u00e7\u00e3o preliminar feito pelo governo. \nEm representa\u00e7\u00e3o entregue ontem ao Minist\u00e9rio da Justi\u00e7a, elas foram acusadas de terem formado cartel para aumentar em at\u00e9 200% a taxa cobrada pelos seus servi\u00e7os. \nA representa\u00e7\u00e3o foi encaminhada pelo comerciante paulista Ronaldo Cheguri de Almeida, em nome de cerca de 300 donos de bares e restaurantes de S\u00e3o Paulo. 
\n", "entities": [{"entity_id": "370", "text": "Sunab", "label": "ORGANIZACAO", "start_offset": 1, "end_offset": 6}, {"entity_id": "371", "text": "Sucursal de Bras\u00edlia", "label": "ORGANIZACAO", "start_offset": 54, "end_offset": 74}, {"entity_id": "372", "text": "Reportagem Local", "label": "ORGANIZACAO", "start_offset": 80, "end_offset": 96}, {"entity_id": "373", "text": "Sunab", "label": "ORGANIZACAO", "start_offset": 99, "end_offset": 104}, {"entity_id": "374", "text": "Superintend\u00eancia Nacional de Abastecimento", "label": "ORGANIZACAO", "start_offset": 106, "end_offset": 148}, {"entity_id": "375", "text": "62", "label": "VALOR", "start_offset": 157, "end_offset": 159}, {"entity_id": "376", "text": "16", "label": "VALOR", "start_offset": 191, "end_offset": 193}, {"entity_id": "377", "text": "27 de junho", "label": "TEMPO", "start_offset": 208, "end_offset": 219}, {"entity_id": "378", "text": "8 de julho", "label": "TEMPO", "start_offset": 222, "end_offset": 232}, {"entity_id": "380", "text": "Sunab", "label": "ORGANIZACAO", "start_offset": 539, "end_offset": 544}, {"entity_id": "381", "text": "23", "label": "VALOR", "start_offset": 611, "end_offset": 613}, {"entity_id": "382", "text": "Procuradoria do Estado de S\u00e3o Paulo", "label": "ORGANIZACAO", "start_offset": 661, "end_offset": 696}, {"entity_id": "383", "text": "Procon", "label": "ORGANIZACAO", "start_offset": 767, "end_offset": 773}, {"entity_id": "384", "text": "Barateiro", "label": "ORGANIZACAO", "start_offset": 819, "end_offset": 828}, {"entity_id": "385", "text": "Carrefour", "label": "ORGANIZACAO", "start_offset": 830, "end_offset": 839}, {"entity_id": "386", "text": "P\u00e3o de A\u00e7\u00facar", "label": "ORGANIZACAO", "start_offset": 841, "end_offset": 854}, {"entity_id": "387", "text": "C\u00e2ndia", "label": "ORGANIZACAO", "start_offset": 856, "end_offset": 862}, {"entity_id": "388", "text": "Extra", "label": "ORGANIZACAO", "start_offset": 864, "end_offset": 869}, {"entity_id": "389", "text": "Eldorado", "label": "ORGANIZACAO", "start_offset": 871, "end_offset": 879}, {"entity_id": "390", "text": "Paes Mendon\u00e7a", "label": "ORGANIZACAO", "start_offset": 882, "end_offset": 895}, {"entity_id": "391", "text": "mar\u00e7o", "label": "TEMPO", "start_offset": 921, "end_offset": 926}, {"entity_id": "392", "text": "93", "label": "TEMPO", "start_offset": 970, "end_offset": 972}, {"entity_id": "393", "text": "Minist\u00e9rio da Justi\u00e7a", "label": "ORGANIZACAO", "start_offset": 1173, "end_offset": 1194}, {"entity_id": "394", "text": "200%", "label": "VALOR", "start_offset": 1261, "end_offset": 1265}, {"entity_id": "395", "text": "Ronaldo Cheguri de Almeida", "label": "PESSOA", "start_offset": 1361, "end_offset": 1387}, {"entity_id": "396", "text": "cerca de 300", "label": "VALOR", "start_offset": 1400, "end_offset": 1412}, {"entity_id": "397", "text": "S\u00e3o Paulo", "label": "LOCAL", "start_offset": 1446, "end_offset": 1455}]}, {"doc_id": "HAREM-367-06201", "doc_text": "\n A REVISTA S\u00c3O PAULO EM PERSPECTIVA, da Fundacao Seade, Estado de Sao Paulo, acaba de lancar seu ultimo numero (v+12 ,n 4) dedicado \u00e0 Comunicacai e informacao. 
\n Nas palavras de seu editor Miguel Chaia \"Neste n\u00famero, S\u00e3o Paulo em Perspectiva traz artigos que discutem e refletem a natureza da comunica\u00e7\u00e3o e, particularmente da informa\u00e7\u00e3o, numa situa\u00e7\u00e3o na qual avan\u00e7am rapidamente as conquistas tecnol\u00f3gicas da inform\u00e1tica e acentuam-se os efeitos dos meios de comunica\u00e7\u00e3o de massa. \n Simultaneamente, continuam a funcionar de forma significativa institui\u00e7\u00f5es acad\u00eamicas, de pesquisa ou t\u00e9cnicas que buscam produzir e disseminar conhecimento voltado ao desenvolvimento das ci\u00eancias sociais, \u00e0 continuidade de pesquisas e ao subs\u00eddio a debates e programas p\u00fablicos, propiciando maior racionaliza\u00e7\u00e3o \u00e0s interven\u00e7\u00f5es na realidade social. \n Considerando estas duas tend\u00eancias, os textos apresentados analisam as caracter\u00edsticas de uma sociedade globalizada que se fundamenta na m\u00eddia eletr\u00f4nica, na velocidade da comunica\u00e7\u00e3o e na heterogeneidade da produ\u00e7\u00e3o, troca e consumo da informa\u00e7\u00e3o. \n Tal processo torna-se cada vez mais sofisticado, exigindo avan\u00e7ados servi\u00e7os e aparelhagens tecnol\u00f3gicas, novas rela\u00e7\u00f5es entre emiss\u00e3o e recep\u00e7\u00e3o de mensagens e, tamb\u00e9m, novas formas de produ\u00e7\u00e3o de conhecimento. \n Nesta situa\u00e7\u00e3o, os sujeitos devem estar preparados para a inser\u00e7\u00e3o em in\u00e9ditos processos cognitivos, tanto aqueles que s\u00e3o profissionais da \u00e1rea da comunica\u00e7\u00e3o, quanto os usu\u00e1rios dos servi\u00e7os oferecidos. \n\n O Conteudo da Revista pode ser oservado a partir do seu sumario: \n\n SUM\u00c1RIO \n\n COMUNICA\u00c7\u00c3O & INFORMA\u00c7\u00c3O: \n\nO Rumor do Conhecimento\nAldo de Albuquerque Barreto \nGest\u00e3o e Tratamento da Informa\u00e7\u00e3o na Sociedade Tecnol\u00f3gica Othon Jambeiro \n Comunica\u00e7\u00e3o,M\u00eddiaeCultura \nNorval Baitello Junior \n Muito Al\u00e9m da lnforma\u00e7\u00e3o: m\u00eddia ,cidadania e o dilema democr\u00e1tico Mauro P+ Porto \n Sociedade da Informa\u00e7\u00e3o, Comunica\u00e7\u00f5es e Democracia Ven\u00edcio A+ de Linia \n O Mal-Estar Brasileiro na Sociedade de Informa\u00e7\u00e3o \nAna Malin\n Desmidiatizar o Pensamento: economia das representa\u00e7\u00f5es e subdesenvolvimento informacional \nMargaretihe Born Steinberger\n O Imagin\u00e1rio da Cibercultura \nAndr\u00e9 Lemos\n Fontes Eletr\u00f4nicas de Informa\u00e7\u00e3o: novas formas de comunica\u00e7\u00e3o e de produ\u00e7\u00e3o do conhecimento \nSolange Puntel Mostafa / Marisa Terra\n Comunica\u00e7\u00e3o da Ci\u00eancia\nIsaac Epstein\n Informa\u00e7\u00e3o e Sociedade: novos par\u00e2metros te\u00f3rico-pr\u00e1ticos de gest\u00e3o e transfer\u00eancia informacional \nRegina Maria Marteleto\n Sociedade Civil, Estado e Terceiro Setor \nMaria do Carmo Brant de Carvalho\n A Coordena\u00e7\u00e3o, a Argumenta\u00e7\u00e3o e a Comunica\u00e7\u00e3o das Estat\u00edsticas: v\u00e9rtices de um mesmo tri\u00e2ngulo \n Nelson de Castro Senra \n A Arquitetura de Sistemas de Informa\u00e7\u00f5es Estat\u00edsticas na Internet Marilda Lopes Ginez de Lara \n As Novas e Velhas Demandas por Informa\u00e7\u00e3o Estat\u00edstica \nPaulo de Martino Jannuzzi\n O Sistema Banc\u00e1rio e o Aparecimento da Moeda Eletr\u00f4nica Maria Cristina Penido de Freitas\n A Revista pode ser obtida atraves da Internet no site da Funda\u00e7\u00e3o SEADE: ou pelo email : com Cleide \n ou Tania, 
Tel.011-2241654.\n\n O artigo que tenho na Revista eh fruto de pesquisa em dase de finaliza\u00e7\u00e3o, financiada pelo CNPq e que trata de: \n\n Informacao e conhecimento, pois a informa\u00e7\u00e3o modificou o seu status na academia quando o seu destino se vinculou ao conhecimento, como fato cognitivo do sujeito e ao desenvolvimento como decorr\u00eancia social natural da acumula\u00e7\u00e3o deste conhecimento. \n A ess\u00eancia do fen\u00f4meno da informa\u00e7\u00e3o passou a ser esta condi\u00e7\u00e3o de intencionalidade em gerar conhecimento no indiv\u00edduo e em sua realidade. \n As modifica\u00e7\u00f5es na esfera de influ\u00eancia da informa\u00e7\u00e3o n\u00e3o foram acompanhadas de uma explana\u00e7\u00e3o te\u00f3rica em que, poss\u00edveis evid\u00eancias do processo de transforma\u00e7\u00e3o: informacao-conhecimento, fossem esclarecidos. \n Esta e outras condi\u00e7\u00f5es espec\u00edficas da manifesta\u00e7\u00e3o da informa\u00e7\u00e3o como participante deste processo s\u00e3o estudadas neste artigo. \n Assim, dividimos o artigo em duas partes: a primeira procura mostrar as poss\u00edveis evid\u00eancias conceituais da exist\u00eancia da rela\u00e7\u00e3o informa\u00e7\u00e3o e conhecimento; e a segunda pretende apresentar os resultados iniciais de pesquisa ainda em andamento, onde se procura qualificar os mecanismos de elabora\u00e7\u00e3o do pensamento nesta rela\u00e7\u00e3o de transforma\u00e7\u00e3o, com dados emp\u00edricos paratr\u00eas \u00e1reas do conhecimento ou comunidades ling\u00fc\u00edsticas ou grupos informacionais diferenciados: a comunica\u00e7\u00e3o, a fisica e a ciencia da informa\u00e7\u00e3o.\n", "entities": [{"entity_id": "473", "text": "Fundacao Seade", "label": "ORGANIZACAO", "start_offset": 41, "end_offset": 55}, {"entity_id": "474", "text": "Estado de Sao Paulo", "label": "LOCAL", "start_offset": 57, "end_offset": 76}, {"entity_id": "476", "text": "Miguel Chaia", "label": "PESSOA", "start_offset": 190, "end_offset": 202}, {"entity_id": "479", "text": "Aldo de Albuquerque Barreto", "label": "PESSOA", "start_offset": 1647, "end_offset": 1674}, {"entity_id": "481", "text": "Othon Jambeiro ", "label": "PESSOA", "start_offset": 1735, "end_offset": 1750}, {"entity_id": "483", "text": "Norval Baitello Junior", "label": "PESSOA", "start_offset": 1779, "end_offset": 1801}, {"entity_id": "485", "text": "Mauro P+ Porto", "label": "PESSOA", "start_offset": 1870, "end_offset": 1884}, {"entity_id": "487", "text": "Ven\u00edcio A+ de Linia", "label": "PESSOA", "start_offset": 1938, "end_offset": 1957}, {"entity_id": "489", "text": "Ana Malin", "label": "PESSOA", "start_offset": 2011, "end_offset": 2020}, {"entity_id": "491", "text": "Margaretihe Born Steinberger", "label": "PESSOA", "start_offset": 2114, "end_offset": 2142}, {"entity_id": "493", "text": "Andr\u00e9 Lemos", "label": "PESSOA", "start_offset": 2174, "end_offset": 2185}, {"entity_id": "495", "text": "Solange Puntel Mostafa", "label": "PESSOA", "start_offset": 2280, "end_offset": 2302}, {"entity_id": "496", "text": "Marisa Terra", "label": "PESSOA", "start_offset": 2305, "end_offset": 2317}, {"entity_id": "498", "text": "Isaac Epstein", "label": "PESSOA", "start_offset": 2342, "end_offset": 2355}, {"entity_id": "500", "text": "Regina Maria Marteleto", "label": "PESSOA", "start_offset": 2456, "end_offset": 2478}, {"entity_id": "502", "text": "Maria do Carmo Brant de Carvalho", "label": "PESSOA", "start_offset": 2522, "end_offset": 2554}, {"entity_id": "504", "text": "Nelson de Castro 
Senra", "label": "PESSOA", "start_offset": 2653, "end_offset": 2675}, {"entity_id": "506", "text": "Marilda Lopes Ginez de Lara ", "label": "PESSOA", "start_offset": 2744, "end_offset": 2772}, {"entity_id": "508", "text": "Paulo de Martino Jannuzzi", "label": "PESSOA", "start_offset": 2829, "end_offset": 2854}, {"entity_id": "510", "text": "Maria Cristina Penido de Freitas", "label": "PESSOA", "start_offset": 2912, "end_offset": 2944}, {"entity_id": "511", "text": "Internet", "label": "LOCAL", "start_offset": 2983, "end_offset": 2991}, {"entity_id": "512", "text": "Funda\u00e7\u00e3o SEADE", "label": "ORGANIZACAO", "start_offset": 3003, "end_offset": 3017}, {"entity_id": "513", "text": "Cleide", "label": "PESSOA", "start_offset": 3039, "end_offset": 3045}, {"entity_id": "514", "text": "Tania", "label": "PESSOA", "start_offset": 3051, "end_offset": 3056}, {"entity_id": "516", "text": "CNPq", "label": "ORGANIZACAO", "start_offset": 3168, "end_offset": 3172}]}, {"doc_id": "HAREM-862-03412", "doc_text": "\nConcurso Para Auditor Fiscal do INSS \n J\u00e1 est\u00e1 pronta a minuta do edital do concurso para auditor fiscal do INSS, que oferecer\u00e1 150 vagas, prometidas pelo governo federal, conforme revelou o chefe de Divis\u00e3o na Coordena\u00e7\u00e3o Geral do INSS, Maur\u00edlio Gon\u00e7alves Dias. \nO INSS aguarda apenas a autoriza\u00e7\u00e3o oficial, para dar in\u00edcio ao processo seletivo. \nNo dia 30 de junho, a Comiss\u00e3o de Controle e Gest\u00e3o Fiscal, do Minist\u00e9rio da Fazenda, publicou, no Di\u00e1rio Oficial, uma recomenda\u00e7\u00e3o ao Minist\u00e9rio de Or\u00e7amento e Gest\u00e3o, pela autoriza\u00e7\u00e3o para a abertura de concursos. \nNo total, a oferta ser\u00e1 de 3.728 vagas, em carreiras de n\u00edvel superior, conforme promessa feita no dia 30 de maio. \nUma fonte do INSS disse que \u00e9 grande a possibilidade da Universidade de Bras\u00edlia (UnB) organizar o seu concurso. \nA institui\u00e7\u00e3o \u00e9 a mesma que coordenou o \u00faltimo processo seletivo do INSS, realizado em 1998. \nPara participar do concurso \u00e9 necess\u00e1rio ter conclu\u00eddo curso superior em qualquer \u00e1rea. \nA remunera\u00e7\u00e3o \u00e9 de R$2.409,66, podendo chegar a R$3.613, com a Gratifica\u00e7\u00e3o de Desempenho por Atividade Tribut\u00e1ria (GDAT), obtida em fun\u00e7\u00e3o do alcance das metas de arrecada\u00e7\u00e3o e dos resultados obtidos com a fiscaliza\u00e7\u00e3o. \nO diretor de Arrecada\u00e7\u00e3o Fiscal do INSS, Luiz Alberto Lazinho, acredita que o conte\u00fado program\u00e1tico das provas seguir\u00e1 o modelo do \u00faltimo processo seletivo, realizado em 1998. \n\"Os candidatos devem dar especial aten\u00e7\u00e3o a Contabilidade, Direito Tribut\u00e1rio e Legisla\u00e7\u00e3o Previdenci\u00e1ria\", sugeriu Luiz Alberto Lazinho. \nPara aqueles que v\u00e3o participar do processo seletivo, o professor de Direito Previdenci\u00e1rio F\u00e1bio Zambite d\u00e1 uma dica importante: os candidatos devem estudar com bastante aten\u00e7\u00e3o o Decreto 3.048/99, que aprova o Regulamento da Previd\u00eancia Social. \n\"A Legisla\u00e7\u00e3o \u00e9 muito extensa. \nAo inv\u00e9s de estudar as Leis de Custeio e de Benef\u00edcios, al\u00e9m do Regulamento da Previd\u00eancia Social sugiro que o candidato estude diretamente o Decreto 3.048/99, que reproduz o que dizem essas leis. \nAssim, o candidato ganha tempo na hora de estudar\", orienta. 
\nOutra sugest\u00e3o do professor \u00e9 que os concorrentes analisem com especial aten\u00e7\u00e3o a Lei 9.876/99, que introduz altera\u00e7\u00f5es na Previd\u00eancia Social. \n\"Uma dessas altera\u00e7\u00f5es diz respeito \u00e0 mudan\u00e7a no c\u00e1lculo das aposentadorias. \nEssa lei tamb\u00e9m cria o fator previdenci\u00e1rio, que certamente ser\u00e1 uma das quest\u00f5es da prova\", disse. \n", "entities": [{"entity_id": "908", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 33, "end_offset": 37}, {"entity_id": "909", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 122, "end_offset": 126}, {"entity_id": "910", "text": "150", "label": "VALOR", "start_offset": 142, "end_offset": 145}, {"entity_id": "911", "text": "Divis\u00e3o na Coordena\u00e7\u00e3o Geral do INSS", "label": "ORGANIZACAO", "start_offset": 214, "end_offset": 250}, {"entity_id": "912", "text": "Maur\u00edlio Gon\u00e7alves Dias", "label": "PESSOA", "start_offset": 252, "end_offset": 275}, {"entity_id": "913", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 293, "end_offset": 297}, {"entity_id": "914", "text": "30 de junho", "label": "TEMPO", "start_offset": 395, "end_offset": 406}, {"entity_id": "915", "text": "Comiss\u00e3o de Controle e Gest\u00e3o Fiscal", "label": "ORGANIZACAO", "start_offset": 410, "end_offset": 446}, {"entity_id": "916", "text": "Minist\u00e9rio da Fazenda", "label": "ORGANIZACAO", "start_offset": 451, "end_offset": 472}, {"entity_id": "917", "text": "Di\u00e1rio Oficial", "label": "LOCAL", "start_offset": 487, "end_offset": 501}, {"entity_id": "918", "text": "Minist\u00e9rio de Or\u00e7amento e Gest\u00e3o", "label": "ORGANIZACAO", "start_offset": 523, "end_offset": 555}, {"entity_id": "919", "text": "3.728", "label": "VALOR", "start_offset": 645, "end_offset": 650}, {"entity_id": "920", "text": "30 de maio", "label": "TEMPO", "start_offset": 721, "end_offset": 731}, {"entity_id": "921", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 760, "end_offset": 764}, {"entity_id": "922", "text": "Universidade de Bras\u00edlia", "label": "ORGANIZACAO", "start_offset": 803, "end_offset": 827}, {"entity_id": "923", "text": "UnB", "label": "ORGANIZACAO", "start_offset": 829, "end_offset": 832}, {"entity_id": "924", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 942, "end_offset": 946}, {"entity_id": "925", "text": "1998", "label": "TEMPO", "start_offset": 961, "end_offset": 965}, {"entity_id": "926", "text": "R$2.409,66", "label": "VALOR", "start_offset": 1102, "end_offset": 1112}, {"entity_id": "927", "text": "R$3.613", "label": "VALOR", "start_offset": 1131, "end_offset": 1138}, {"entity_id": "930", "text": "Arrecada\u00e7\u00e3o Fiscal do INSS", "label": "ORGANIZACAO", "start_offset": 1331, "end_offset": 1357}, {"entity_id": "931", "text": "Luiz Alberto Lazinho", "label": "PESSOA", "start_offset": 1359, "end_offset": 1379}, {"entity_id": "932", "text": "1998", "label": "TEMPO", "start_offset": 1488, "end_offset": 1492}, {"entity_id": "936", "text": "Luiz Alberto Lazinho", "label": "PESSOA", "start_offset": 1624, "end_offset": 1644}, {"entity_id": "938", "text": "F\u00e1bio Zambite", "label": "PESSOA", "start_offset": 1752, "end_offset": 1765}]}, {"doc_id": "HAREM-071-00386", "doc_text": "\nUma vila no Interior\nUm simbolo de grande beleza\nAs terras de S. 
Martinho foram povoadas desde remotas eras, gra\u00e7as \u00e0 fertilidade do rio Bestan\u00e7a e \u00e0 facilidade de defesas naturais e pontos estrat\u00e9gicos como a Pena ou Pedra Sobreposta, em Paus, e a Mogueira, perto de S. Martinho.\nNo morro da Mogueira h\u00e1 vest\u00edgios evidentes da presen\u00e7a dos Celtas, e dos Romanos.\nTrata-se de um castro romanizado.\nA estes povos seguiram-se os Suevos, os Visigodos, depois os Mouros que lhe deram o nome, e por fim, os povoadores crist\u00e3os da Reconquista.Atendendo ao nome \u00abS. Martinho\u00bb, pode concluir-se que deve ter sido par\u00f3quia desde os prim\u00f3rdios da cristianiza\u00e7\u00e3o destas paragens.\nPaus, S. Jo\u00e3o e Gosende eram, nessa \u00e9poca, simples povoados desta freguesia.\nDada a sua fertilidade, os Mouros com todas as suas for\u00e7as a reconquista crist\u00e3, motivo pelo qual, sendo j\u00e1 crist\u00e3o todo o noroeste(de Resende ao Porto) e estando ainda S. Martinho nas m\u00e3os dos Mouros, os crist\u00e3os de Resende, falando de S. Martinho, lhe chamavam de \u00abMouros\u00bb.\nAp\u00f3s a reconquista, em 1058, tentou-se o repovoamento com a doa\u00e7\u00e3o de terras a senhores da nobreza, concretamente com as Honras de Cardoso, de Cantim, de Fonseca, de Paredes e de Temonde.\nS. Martinho foi concelho desde tempos anteriores \u00e0 nacionalidade, pois recebeu foral de Fernando Magno, confirmado por D. Teresa em 1 de Mar\u00e7o de 1121, e novo foral do rei D. Manuel em 20 de Outubro de 1513.\nFoi tamb\u00e9m julgado medieval, abrangendo uma longa faixa de territ\u00f3rio, desde o Douro \u00e0 cruz do Ross\u00e3o no montemuro, e desde a serra das Meadas a terras do concelho de Aregos e da honra de Resende.\nO julgado foi suprimido por decreto de 28 de Dezembro de 1840 e incorporado na comarca de Lamego e o concelho foi extinto em 24 de Outubro de 1855, data em que, tanto o concelho como o julgado passaram a fazer parte do concelho e da comarca de Resende.\n", "entities": [{"entity_id": "1289", "text": "Interior", "label": "LOCAL", "start_offset": 13, "end_offset": 21}, {"entity_id": "1290", "text": "S. Martinho", "label": "LOCAL", "start_offset": 63, "end_offset": 74}, {"entity_id": "1291", "text": "Bestan\u00e7a", "label": "LOCAL", "start_offset": 138, "end_offset": 146}, {"entity_id": "1292", "text": "Pena", "label": "LOCAL", "start_offset": 211, "end_offset": 215}, {"entity_id": "1293", "text": "Pedra Sobreposta", "label": "LOCAL", "start_offset": 219, "end_offset": 235}, {"entity_id": "1294", "text": "Paus", "label": "LOCAL", "start_offset": 240, "end_offset": 244}, {"entity_id": "1295", "text": "Mogueira", "label": "LOCAL", "start_offset": 250, "end_offset": 258}, {"entity_id": "1296", "text": "S. 
Martinho", "label": "LOCAL", "start_offset": 269, "end_offset": 280}, {"entity_id": "1297", "text": "Mogueira", "label": "LOCAL", "start_offset": 294, "end_offset": 302}, {"entity_id": "1298", "text": "Celtas", "label": "PESSOA", "start_offset": 342, "end_offset": 348}, {"entity_id": "1299", "text": "Romanos", "label": "PESSOA", "start_offset": 356, "end_offset": 363}, {"entity_id": "1300", "text": "Suevos", "label": "PESSOA", "start_offset": 428, "end_offset": 434}, {"entity_id": "1301", "text": "Visigodos", "label": "PESSOA", "start_offset": 439, "end_offset": 448}, {"entity_id": "1302", "text": "Mouros", "label": "PESSOA", "start_offset": 460, "end_offset": 466}, {"entity_id": "1303", "text": "Reconquista", "label": "TEMPO", "start_offset": 526, "end_offset": 537}, {"entity_id": "1305", "text": "Paus", "label": "LOCAL", "start_offset": 670, "end_offset": 674}, {"entity_id": "1306", "text": "S. Jo\u00e3o", "label": "LOCAL", "start_offset": 676, "end_offset": 683}, {"entity_id": "1307", "text": "Gosende", "label": "LOCAL", "start_offset": 686, "end_offset": 693}, {"entity_id": "1308", "text": "Mouros", "label": "PESSOA", "start_offset": 774, "end_offset": 780}, {"entity_id": "1309", "text": "Resende", "label": "LOCAL", "start_offset": 882, "end_offset": 889}, {"entity_id": "1310", "text": "Porto", "label": "LOCAL", "start_offset": 893, "end_offset": 898}, {"entity_id": "1311", "text": "S. Martinho", "label": "LOCAL", "start_offset": 916, "end_offset": 927}, {"entity_id": "1312", "text": "Mouros", "label": "PESSOA", "start_offset": 941, "end_offset": 947}, {"entity_id": "1313", "text": "Resende", "label": "LOCAL", "start_offset": 964, "end_offset": 971}, {"entity_id": "1314", "text": "S. Martinho", "label": "LOCAL", "start_offset": 984, "end_offset": 995}, {"entity_id": "1316", "text": "1058", "label": "TEMPO", "start_offset": 1046, "end_offset": 1050}, {"entity_id": "1318", "text": "Cardoso", "label": "LOCAL", "start_offset": 1154, "end_offset": 1161}, {"entity_id": "1319", "text": "Cantim", "label": "LOCAL", "start_offset": 1166, "end_offset": 1172}, {"entity_id": "1320", "text": "Fonseca", "label": "LOCAL", "start_offset": 1177, "end_offset": 1184}, {"entity_id": "1321", "text": "Paredes", "label": "LOCAL", "start_offset": 1189, "end_offset": 1196}, {"entity_id": "1322", "text": "Temonde", "label": "LOCAL", "start_offset": 1202, "end_offset": 1209}, {"entity_id": "1324", "text": "S. Martinho", "label": "LOCAL", "start_offset": 1211, "end_offset": 1222}, {"entity_id": "1325", "text": "Fernando Magno", "label": "PESSOA", "start_offset": 1299, "end_offset": 1313}, {"entity_id": "1326", "text": "D. Teresa", "label": "PESSOA", "start_offset": 1330, "end_offset": 1339}, {"entity_id": "1327", "text": "1 de Mar\u00e7o de 1121", "label": "TEMPO", "start_offset": 1343, "end_offset": 1361}, {"entity_id": "1328", "text": "D. 
Manuel", "label": "PESSOA", "start_offset": 1383, "end_offset": 1392}, {"entity_id": "1329", "text": "20 de Outubro de 1513", "label": "TEMPO", "start_offset": 1396, "end_offset": 1417}, {"entity_id": "1330", "text": "Douro", "label": "LOCAL", "start_offset": 1498, "end_offset": 1503}, {"entity_id": "1331", "text": "Ross\u00e3o", "label": "LOCAL", "start_offset": 1514, "end_offset": 1520}, {"entity_id": "1332", "text": "Meadas", "label": "LOCAL", "start_offset": 1555, "end_offset": 1561}, {"entity_id": "1333", "text": "Aregos", "label": "LOCAL", "start_offset": 1586, "end_offset": 1592}, {"entity_id": "1334", "text": "Resende", "label": "LOCAL", "start_offset": 1607, "end_offset": 1614}, {"entity_id": "1335", "text": "28 de Dezembro de 1840", "label": "TEMPO", "start_offset": 1655, "end_offset": 1677}, {"entity_id": "1336", "text": "Lamego", "label": "LOCAL", "start_offset": 1706, "end_offset": 1712}, {"entity_id": "1337", "text": "24 de Outubro de 1855", "label": "TEMPO", "start_offset": 1741, "end_offset": 1762}, {"entity_id": "1338", "text": "Resende", "label": "LOCAL", "start_offset": 1860, "end_offset": 1867}]}, {"doc_id": "HAREM-19H-01369", "doc_text": "\nNorte-americanos disparam contra posi\u00e7\u00f5es iraquianas\nAparelhos norte-americanos dispararam m\u00edsseis e lan\u00e7aram bombas sobre posi\u00e7\u00f5es iraquianas no norte do pa\u00eds \u00abem resposta a disparos de artilharia anti-a\u00e9rea\u00bb do Iraque, anunciou o Pent\u00e1gono em comunicado.\nSegundo o departamento da defesa, avi\u00f5es F-15 \u00abque efectuavam voos de rotina na zona de exclus\u00e3o a\u00e9rea no norte do Iraque\u00bb dispararam primeiro tr\u00eas m\u00edsseis e lan\u00e7aram bombas guiadas por laser contra um centro de comando militar e uma esta\u00e7\u00e3o de r\u00e1dio.\nUm pouco mais tarde, outros aparelhos F-15 lan\u00e7aram cinco bombas contra objectivos n\u00e3o identificados perto de Mossul.\nOs aparelhos regressaram \u00e0 sua base na Turquia, acrescenta o comunicado do Pent\u00e1gono.\n", "entities": [{"entity_id": "2057", "text": "Iraque", "label": "ORGANIZACAO", "start_offset": 214, "end_offset": 220}, {"entity_id": "2058", "text": "Pent\u00e1gono", "label": "ORGANIZACAO", "start_offset": 233, "end_offset": 242}, {"entity_id": "2060", "text": "Iraque", "label": "LOCAL", "start_offset": 373, "end_offset": 379}, {"entity_id": "2062", "text": "Mossul", "label": "LOCAL", "start_offset": 620, "end_offset": 626}, {"entity_id": "2063", "text": "Turquia", "label": "LOCAL", "start_offset": 667, "end_offset": 674}, {"entity_id": "2064", "text": "Pent\u00e1gono", "label": "ORGANIZACAO", "start_offset": 703, "end_offset": 712}]}, {"doc_id": "HAREM-276-04861", "doc_text": "\nORGANIZA\u00c7\u00c3O ESTRUTURAL DA MEMBRANA \nA membrana celular \u00e9 uma camada com apenas 7,5 a 10 nm de espessura, constitu\u00edda por l\u00edpidos intercalados com prote\u00ednas que define os limites de cada c\u00e9lula. \nFunciona como uma barreira de permeabilidade que permite \u00e0 c\u00e9lula manter um meio qu\u00edmico apropriado para os seus processos metab\u00f3licos, regular o volume citoplasm\u00e1tico e transferir informa\u00e7\u00e3o sob a forma de sinais qu\u00edmicos e el\u00e9ctricos. \nAs membranas que revestem os v\u00e1rios organelos (n\u00facleo, mitoc\u00f4ndria, ret\u00edculo endoplasm\u00e1tico, lisossomas e aparelho de Golgi) permitem a compartimentaliza\u00e7\u00e3o funcional da c\u00e9lula, com possibilidade de limitar processos bioqu\u00edmicas a certos locais. 
\nApesar das particularidades individuais, todas as membranas biol\u00f3gicas s\u00e3o formadas por uma dupla camada fosfol\u00edpidica e por prote\u00ednas unidas por liga\u00e7\u00f5es covalentes e que se comportam segundo o Modelo Mosaico Flu\u00eddo. \nA maioria dos l\u00edpidos e das prote\u00ednas movem-se livremente no plano da membrana. \nEm alguns casos , h\u00e1 restri\u00e7\u00e3o deste movimento de forma a permitir \u00e0 c\u00e9lula a realiza\u00e7\u00e3o de algumas fun\u00e7\u00f5es em partes selectivas da sua membrana. \n\u00c9 o caso da sequestra\u00e7\u00e3o de receptores de acetilcolina ao n\u00edvel da placa motora das c\u00e9lulas musculares esquel\u00e9ticas. \nOs principais l\u00edpidos presentes na membrana celular s\u00e3o os fosfol\u00edpidos, o colesterol e os glicol\u00edpidos. \nA sua distribui\u00e7\u00e3o pelas duas camadas \u00e9 assim\u00e9trica, o que pode reflectir as diferentes fun\u00e7\u00f5es das duas superf\u00edcies da membrana. \nOs fosfol\u00edpidos s\u00e3o mol\u00e9culas antip\u00e1ticas e disp\u00f5em-se em bicamada com a por\u00e7\u00e3o hidr\u00f3foba n\u00e3o polar ( caudas de \u00e1cidos gordos) dirigida para o centro da membrana e com a por\u00e7\u00e3o hidrof\u00edlica polar (cabe\u00e7a com terminal fosfato) direccionada para o exterior ou interior da c\u00e9lula. \nOs fosfol\u00edpidos mais abundantes s\u00e3o os fosfol\u00edpidos ligados \u00e0 colina (fosfatidilcolina e esfingomielina) e os aminofosfol\u00edpidos (fosfatidilserina e fosfatidiletanolamina). \nO fosfatidilglicerol, o fosfatidilinositol e a cardiolipina s\u00e3o tamb\u00e9m importantes mas est\u00e3o presentes em menores quantidades.\nAs Dimens\u00f5es da via de difus\u00e3o incluem a \u00e1rea de sec\u00e7\u00e3o e a dist\u00e2ncia. \nQuanto maior a \u00e1rea de sec\u00e7\u00e3o e menor a dist\u00e2ncia a percorrer, maior o fluxo. \nNo pulm\u00e3o e no intestino, onde a difus\u00e3o \u00e9 importante para a troca de subst\u00e2ncias entre os meios interno e externo, a \u00e1rea de difus\u00e3o \u00e9 grande e a dist\u00e2ncia a percorrer pequena. \n", "entities": [{"entity_id": "3519", "text": "7,5 a 10 nm", "label": "VALOR", "start_offset": 80, "end_offset": 91}]}] --------------------------------------------------------------------------------
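The dev file above stores one JSON object per document: a `doc_id`, the raw `doc_text`, and a list of `entities`, each with an `entity_id`, its surface `text`, a `label`, and character-level `start_offset`/`end_offset` into `doc_text`. The snippet below is a minimal inspection sketch, not part of the repository; the relative path assumes it is run from inside the ner_evaluation/ directory.

import json
from collections import Counter

# Load the HAREM dev split shown above (a list of document dicts).
with open('data/FirstHAREM-selective-dev.json', encoding='utf-8') as fd:
    documents = json.load(fd)

label_counts = Counter()
for doc in documents:
    text = doc['doc_text']
    for entity in doc['entities']:
        label_counts[entity['label']] += 1
        # Offsets are character indices into doc_text; flag any span that
        # does not reproduce the annotated surface text.
        span = text[entity['start_offset']:entity['end_offset']]
        if span != entity['text']:
            print('Offset mismatch:', doc['doc_id'], 'entity', entity['entity_id'])

print(len(documents), 'documents,', sum(label_counts.values()), 'entities')
print(label_counts)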