├── ner_evaluation ├── __init__.py ├── data │ ├── classes-selective.txt │ ├── classes-total.txt │ └── FirstHAREM-selective-dev.json ├── requirements.txt ├── results_writer.py ├── utils.py ├── dataset.py ├── tokenization.py ├── tag_encoder.py ├── run_bert_harem.py ├── postprocessing.py ├── run_inference.py ├── eval_tools.py ├── README.md ├── preprocessing.py ├── model.py └── trainer.py ├── qualifying_exam-portuguese_named_entity_recognition_using_bert_crf.pdf ├── LICENSE └── README.md /ner_evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ner_evaluation/data/classes-selective.txt: -------------------------------------------------------------------------------- 1 | PESSOA 2 | ORGANIZACAO 3 | LOCAL 4 | TEMPO 5 | VALOR -------------------------------------------------------------------------------- /ner_evaluation/data/classes-total.txt: -------------------------------------------------------------------------------- 1 | PESSOA 2 | ORGANIZACAO 3 | LOCAL 4 | TEMPO 5 | VALOR 6 | ABSTRACCAO 7 | ACONTECIMENTO 8 | COISA 9 | OBRA 10 | OUTRO -------------------------------------------------------------------------------- /qualifying_exam-portuguese_named_entity_recognition_using_bert_crf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neuralmind-ai/portuguese-bert/HEAD/qualifying_exam-portuguese_named_entity_recognition_using_bert_crf.pdf -------------------------------------------------------------------------------- /ner_evaluation/requirements.txt: -------------------------------------------------------------------------------- 1 | pytorch-transformers==1.1.0 2 | git+https://github.com/kmkurn/pytorch-crf.git@4cd79bc8af55fb0f34a2a39b2e38f0e71c208fd4#egg=pytorch_crf 3 | seqeval==0.0.12 4 | jsonlines==1.2.0 5 | scikit-learn==0.21.2 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, NeuralMind (Fabio Capuano de Souza, Rodrigo Nogueira, 4 | Roberto de Alencar Lotufo) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /ner_evaluation/results_writer.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | from argparse import Namespace 3 | from datetime import datetime 4 | from typing import Any 5 | 6 | from eval_tools import SequenceMetrics 7 | 8 | 9 | def to_float(value): 10 | if isinstance(value, list): 11 | return [float(val) for val in value] 12 | else: 13 | return float(value) 14 | 15 | 16 | def compile_results(args: Namespace, 17 | train_metrics: SequenceMetrics, 18 | valid_metrics: SequenceMetrics, 19 | best_epoch_metric: str = 'f1_score', 20 | **extra_values: Any): 21 | results = { 22 | 'timestamp': datetime.now().strftime("%Y-%m-%dT%H:%M:%S"), 23 | } 24 | attrs_args = [ 25 | ('num_train_epochs', 'epochs'), 26 | 'learning_rate', 27 | 'train_batch_size', 28 | 'gradient_accumulation_steps', 29 | 'train_file', 30 | 'valid_file', 31 | 'pooler', 32 | 'freeze_bert', 33 | 'output_dir', 34 | 'labels_file', 35 | 'classifier_lr', 36 | 'no_crf', 37 | 'seed', 38 | 'labels_file', 39 | 'lstm_hidden_size', 40 | 'lstm_layers', 41 | ] 42 | 43 | for attr in attrs_args: 44 | if len(attr) == 2: 45 | source, dest = attr 46 | else: 47 | source = dest = attr 48 | results[dest] = getattr(args, source, None) 49 | 50 | best_epoch = valid_metrics.get_best_epoch(best_epoch_metric) 51 | results['best_epoch'] = best_epoch 52 | 53 | attrs_metrics = [ 54 | 'f1_score', 55 | 'precision', 56 | 'recall', 57 | ] 58 | 59 | for prefix, metrics in [('train', train_metrics), 60 | ('valid', valid_metrics)]: 61 | for attr in attrs_metrics: 62 | key = f'{prefix}_{attr}' 63 | values = metrics.history.get(attr) 64 | if values: 65 | results[key] = to_float(values) 66 | results[f'best_{key}'] = to_float(max(values)) 67 | 68 | results['classification_report'] = valid_metrics.get_value( 69 | 'classification_report', best_epoch) 70 | 71 | for name, value in extra_values.items(): 72 | results[name] = to_float(value) 73 | 74 | return results 75 | 76 | 77 | def write_jsonl_results(results, path): 78 | """Append a line to a jsonlines file.""" 79 | assert path.endswith('.jsonl') 80 | with jsonlines.open(path, 'a') as writer: 81 | writer.write(results) 82 | -------------------------------------------------------------------------------- /ner_evaluation/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from argparse import Namespace 4 | from typing import Type, Union 5 | 6 | import torch 7 | from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE 8 | 9 | from model import get_model_and_kwargs_for_args 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def save_model(model: Type[torch.nn.Module], args: Namespace) -> None: 16 | """Save a trained model and the associated configuration to output dir.""" 17 | model.save_pretrained(args.output_dir) 18 | torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) 19 | 20 | 21 | def load_model(args: Namespace, 22 | model_path: str, 23 | training: bool = True, 24 | ) -> torch.nn.Module: 25 | """Instantiates a pretrained model from parsed argument values. 26 | 27 | Args: 28 | args: parsed arguments from argv. 29 | model_path: name of model checkpoint or path to a checkpoint directory. 30 | training: if True, loads a model with training-specific parameters. 
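
    Returns:
        The instantiated pretrained model (a torch.nn.Module subclass).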
31 | """ 32 | 33 | model_class, model_kwargs = get_model_and_kwargs_for_args( 34 | args, training=training) 35 | logger.info('model: {}, kwargs: {}'.format( 36 | model_class.__name__, model_kwargs)) 37 | 38 | cache_dir = os.path.join( 39 | PYTORCH_PRETRAINED_BERT_CACHE, 40 | 'distributed_{}'.format(args.local_rank)) 41 | model = model_class.from_pretrained( 42 | model_path, 43 | num_labels=args.num_labels, 44 | cache_dir=cache_dir, 45 | output_hidden_states=True, # Ensure all hidden states are returned 46 | **model_kwargs) 47 | 48 | return model 49 | 50 | 51 | class ExponentialAccumulator: 52 | """Exponential moving average train loss tracker.""" 53 | 54 | def __init__(self, beta: float = 0.99): 55 | self._accum = None 56 | self.beta = beta 57 | 58 | def insert_value(self, value: float) -> float: 59 | if self._accum is None: 60 | self._accum = value 61 | else: 62 | self._accum = self.beta * self._accum + (1 - self.beta) * value 63 | 64 | return self._accum 65 | 66 | 67 | class RunningAccumulator: 68 | """Loss value running accumulator.""" 69 | 70 | def __init__(self): 71 | self.total = 0 72 | self.num_values = 0 73 | 74 | def accumulate(self, value: Union[torch.Tensor, float]): 75 | if torch.is_tensor(value): 76 | with torch.no_grad(): 77 | self.total += value.item() 78 | else: 79 | self.total += value 80 | 81 | self.num_values += 1 82 | 83 | def mean(self) -> float: 84 | return self.total / self.num_values 85 | -------------------------------------------------------------------------------- /ner_evaluation/dataset.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import pickle 4 | from typing import List, Tuple 5 | 6 | import torch 7 | from torch.utils.data import ( 8 | Dataset, 9 | DataLoader, 10 | TensorDataset, 11 | ) 12 | from tqdm import tqdm 13 | 14 | from model import BertForNERClassification 15 | from preprocessing import InputSpan 16 | 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def get_dataset(features: List[InputSpan]) -> TensorDataset: 22 | """Generate a TensorDataset from lists of tensors.""" 23 | all_input_ids = torch.tensor( 24 | [f.input_ids for f in features], dtype=torch.long) 25 | all_input_mask = torch.tensor( 26 | [f.input_mask for f in features], dtype=torch.long) 27 | all_segment_ids = torch.tensor( 28 | [f.segment_ids for f in features], dtype=torch.long) 29 | all_label_ids = torch.tensor( 30 | [f.label_ids for f in features], dtype=torch.long) 31 | all_prediction_mask = torch.tensor( 32 | [f.prediction_mask for f in features], dtype=torch.uint8) 33 | all_example_index = torch.tensor( 34 | [f.example_index for f in features], dtype=torch.long) 35 | all_doc_span_index = torch.tensor( 36 | [f.doc_span_index for f in features], dtype=torch.long) 37 | 38 | return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, 39 | all_label_ids, all_prediction_mask, 40 | all_example_index, all_doc_span_index) 41 | 42 | 43 | def get_bert_encoded_features(model: BertForNERClassification, 44 | dataset: Dataset, 45 | batch_size: int, 46 | device: torch.device, 47 | ) -> Tuple[torch.Tensor, ...]: 48 | """Returns a BERT encoded tensors of the dataset, to be used to speed up 49 | the training of the classifier model with frozen BERT.""" 50 | model.eval() 51 | dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False) 52 | 53 | all_encoded_inputs = [] 54 | 55 | with torch.no_grad(): 56 | for batch in tqdm(dataloader, desc="Getting frozen BERT features"): 57 | batch = 
tuple(t.to(device) for t in batch) 58 | input_ids, input_mask, segment_ids, *_ = batch 59 | 60 | encoded_batch = model.bert_encode( 61 | input_ids, segment_ids, input_mask) 62 | encoded_batch = encoded_batch.cpu() 63 | all_encoded_inputs.append(encoded_batch) 64 | 65 | all_encoded_inputs = torch.cat(all_encoded_inputs, dim=0) 66 | 67 | return (all_encoded_inputs, 68 | *dataset.tensors[1:]) 69 | 70 | 71 | def get_bert_encoded_dataset(model: BertForNERClassification, 72 | dataset: Dataset, 73 | batch_size: int, 74 | device: torch.device, 75 | ) -> TensorDataset: 76 | """Returns a BERT encoded version of the dataset, to be used to speed up 77 | the training of the classifier model with frozen BERT.""" 78 | encoded_data = get_bert_encoded_features( 79 | model, dataset, batch_size, device) 80 | 81 | return TensorDataset(*encoded_data) 82 | -------------------------------------------------------------------------------- /ner_evaluation/tokenization.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from pytorch_transformers.tokenization_bert import ( 4 | _is_punctuation as is_punctuation, 5 | _is_whitespace as is_whitespace, 6 | ) 7 | 8 | 9 | class Token(object): 10 | """Info about a single token.""" 11 | 12 | def __init__(self, 13 | text: str, 14 | offset: int, 15 | index: int, 16 | tail: str = '', 17 | tag: str = None): 18 | 19 | if not isinstance(text, str) or not text: 20 | raise TypeError('text should be a non-empty string.') 21 | if not isinstance(offset, int) or offset < 0: 22 | raise TypeError('offset should be an int >= 0.') 23 | if not isinstance(index, int) or index < 0: 24 | raise TypeError('index should be an int >= 0.') 25 | 26 | self.text = text 27 | self.offset = offset 28 | self.tail = tail 29 | self.tag = tag 30 | self._example = None 31 | self._index = index 32 | 33 | def __str__(self): 34 | return '{}{}'.format(self.text, self.tail) 35 | 36 | def __repr__(self): 37 | return 'Token(text=%r, offset=%r, index=%r, tail=%r, tag=%r)' % \ 38 | (self.text, self.offset, self.index, self.tail, self.tag) 39 | 40 | def __len__(self): 41 | return len(self.text) + len(self.tail) 42 | 43 | def __add__(self, char): 44 | self.text += char 45 | return self 46 | 47 | @property 48 | def example(self): 49 | return self._example 50 | 51 | @property 52 | def index(self): 53 | return self._index 54 | 55 | @property 56 | def is_punct(self): 57 | return is_punctuation(self.text) 58 | 59 | def has_tail(self): 60 | return bool(self.tail) 61 | 62 | @property 63 | def nbor(self): 64 | """Returns the neighboring token, e.g., 65 | self._example.doc_tokens[self.index + 1].""" 66 | if self.index is None: 67 | return None 68 | try: 69 | return self._example.doc_tokens[self.index + 1] 70 | except IndexError: 71 | return None 72 | 73 | 74 | def reconstruct_text_from_tokens(tokens: List[Token], 75 | include_last_tail: bool = False, 76 | ) -> str: 77 | """Concatenates the text of a sequence of tokens.""" 78 | def text_generator(tokens): 79 | for i, token in enumerate(tokens): 80 | yield token.text 81 | if i < len(tokens) - 1 or include_last_tail: 82 | yield token.tail 83 | 84 | return ''.join(piece for piece in text_generator(tokens)) 85 | 86 | 87 | class TokenizerWithAlignment: 88 | """Tokenizer that performs basic tokenization keeping string alignment.""" 89 | 90 | def __init__(self): 91 | pass 92 | 93 | @staticmethod 94 | def _begin_new_token(doc_tokens, text, offset): 95 | token = Token(text=text, offset=offset, index=len(doc_tokens)) 
96 | doc_tokens.append(token) 97 | 98 | return token 99 | 100 | def tokenize(self, text: str) -> Tuple[List[Token], List[int]]: 101 | doc_tokens = [] 102 | char_to_word_offset = [] 103 | 104 | new_word = True 105 | curr_token = None 106 | 107 | for offset, c in enumerate(text): 108 | if is_whitespace(c): 109 | new_word = True 110 | if curr_token: 111 | curr_token.tail += c 112 | else: 113 | if is_punctuation(c): 114 | curr_token = self._begin_new_token(doc_tokens, c, offset) 115 | new_word = True 116 | else: 117 | if new_word: 118 | curr_token = self._begin_new_token( 119 | doc_tokens, c, offset) 120 | else: 121 | curr_token += c 122 | new_word = False 123 | 124 | # OBS: Whitespaces that appear before any tokens will have offset -1 125 | # char_to_word_offset.append(len(doc_tokens) - 1) 126 | char_to_word_offset.append(max(0, len(doc_tokens) - 1)) 127 | 128 | return doc_tokens, char_to_word_offset 129 | 130 | def __call__(self, text: str) -> Tuple[List[Token], List[int]]: 131 | return self.tokenize(text) 132 | -------------------------------------------------------------------------------- /ner_evaluation/tag_encoder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines NER tag encoder for predefined coding schemes. 3 | """ 4 | from typing import List 5 | 6 | BIO = 'BIO' 7 | BILUO = 'BILUO' 8 | 9 | SCHEMES = { 10 | BIO: ['B', 'I'], 11 | BILUO: ['B', 'I', 'L', 'U'], 12 | } 13 | 14 | VALID_TRANSITIONS = { 15 | BIO: { 16 | 'B': ['B', 'I', 'O'], 17 | 'I': ['B', 'I', 'O'], 18 | 'O': ['B', 'O'], 19 | }, 20 | BILUO: { 21 | 'B': ['I', 'L'], 22 | 'I': ['I', 'L'], 23 | 'L': ['B', 'U', 'O'], 24 | 'U': ['B', 'U', 'O'], 25 | 'O': ['B', 'U', 'O'], 26 | }, 27 | } 28 | 29 | 30 | class NERTagsEncoder(object): 31 | """Handles creation of NER tags for a list of named entity classes and 32 | conversion of tags to ids and vice versa.""" 33 | 34 | def __init__(self, 35 | classes: List[str], 36 | scheme: str = BIO, 37 | ignore_index: int = -100): 38 | 39 | if not len(set(classes)) == len(classes): 40 | raise ValueError("`classes` have duplicate entries.") 41 | if "O" in classes or "X" in classes: 42 | raise ValueError("`classes` should not have tag O nor X.") 43 | if ignore_index >= 0 or not isinstance(ignore_index, int): 44 | raise ValueError("`ignore_index` should be a negative int.") 45 | if scheme not in SCHEMES: 46 | raise ValueError("`scheme` should be one of {}".format( 47 | tuple(SCHEMES.keys()))) 48 | 49 | self.classes = tuple(classes) 50 | self.tags = ["O"] 51 | self.ignore_index = ignore_index 52 | self.tag_to_id = {"X": ignore_index} 53 | self.scheme = scheme 54 | 55 | for clss in classes: 56 | for subtag in SCHEMES[scheme]: 57 | self.tags.append(f"{subtag}-{clss}") 58 | 59 | for i, tag in enumerate(self.tags): 60 | self.tag_to_id[tag] = i 61 | 62 | def __repr__(self): 63 | return ('{class_}(classes={classes!r}, scheme={scheme!r})') \ 64 | .format(class_=self.__class__.__name__, 65 | classes=self.classes, 66 | scheme=self.scheme) 67 | 68 | @classmethod 69 | def from_labels_file(cls, filepath: str, *args, **kwargs): 70 | """Creates encoder from a file with NER label classes (one class per 71 | line) and a given scheme.""" 72 | with open(filepath, 'r') as fd: 73 | ner_classes = [clss for clss in fd.read().splitlines() if clss] 74 | 75 | return cls(ner_classes, *args, **kwargs) 76 | 77 | @property 78 | def num_labels(self) -> int: 79 | return len(self.tags) 80 | 81 | def convert_tags_to_ids(self, tags: List[str]) -> List[int]: 82 | """Converts a list of 
tag strings to a list of tag ids.""" 83 | return [self.tag_to_id[tag] for tag in tags] 84 | 85 | def convert_ids_to_tags(self, tag_ids: List[int]) -> List[str]: 86 | """Returns a list of tag strings from a list of tag ids.""" 87 | return [self.tags[tag_id] for tag_id in tag_ids] 88 | 89 | def decode_valid(self, tag_sequence: List[str]) -> List[str]: 90 | """Processes a list of tag strings to remove invalid predictions given 91 | the valid transitions of the tag scheme, such as "I" tags coming after 92 | "O" tags.""" 93 | if self.scheme == BILUO: 94 | import warnings 95 | warnings.warn(f"Valid decoding for BILUO scheme is not implemented. Returning input sequence.") 96 | return tag_sequence 97 | 98 | prev_tag = 'O' 99 | prev_type = 'O' 100 | 101 | final = [] 102 | for tag_and_cls in tag_sequence: 103 | tag = tag_and_cls[0] 104 | type_ = tag_and_cls.split('-')[-1] 105 | valid_transitions = VALID_TRANSITIONS[self.scheme][prev_tag] 106 | 107 | valid_tag = False 108 | if tag in valid_transitions: 109 | if tag in ('B', 'O'): 110 | valid_tag = True 111 | elif tag == 'I' and type_ == prev_type: 112 | valid_tag = True 113 | 114 | if valid_tag: 115 | prev_tag = tag 116 | prev_type = type_ 117 | final.append(tag_and_cls) 118 | else: 119 | prev_tag = 'O' 120 | prev_type = 'O' 121 | final.append('O') 122 | 123 | return final -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # BERTimbau - Portuguese BERT 3 | 4 | This repository contains pre-trained [BERT](https://github.com/google-research/bert) models trained on the Portuguese language. BERT-Base and BERT-Large Cased variants were trained on the [BrWaC (Brazilian Web as Corpus)](https://www.researchgate.net/publication/326303825_The_brWaC_Corpus_A_New_Open_Resource_for_Brazilian_Portuguese), a large Portuguese corpus, for 1,000,000 steps, using whole-word mask. Model artifacts for TensorFlow and PyTorch can be found below. 5 | 6 | The models are a result of an ongoing Master's Program. The [text submission for Qualifying Exam](qualifying_exam-portuguese_named_entity_recognition_using_bert_crf.pdf) is also included in the repository in PDF format, which contains more details about the pre-training procedure, vocabulary generation and downstream usage in the task of Named Entity Recognition. 7 | 8 | ## Download 9 | 10 | The base and large models are available at [Hugging Face](https://huggingface.co/neuralmind) 11 | 12 | 13 | ## Evaluation benchmarks 14 | 15 | The models were benchmarked on three tasks (Sentence Textual Similarity, Recognizing Textual Entailment and Named Entity Recognition) and compared to previous published results and [Multilingual BERT](https://github.com/google-research/bert/blob/master/multilingual.md). Metrics are: Pearson's correlation for STS and F1-score for RTE and NER. 
16 | 17 | | Task | Test Dataset | BERTimbau-Large | BERTimbau-Base | mBERT | Previous SOTA | 18 | |:----:|:----------------------:|:---------------:|:-------------: | :-----:| :--------------------:| 19 | | STS | ASSIN2 | **0.852** | 0.836 | 0.809 | 0.83 [[1]](#References) | 20 | | RTE | ASSIN2 | **90.0** | 89.2 | 86.8 | 88.3 [[1]](#References) | 21 | | NER | MiniHAREM (5 classes) | **83.7** | 83.1 | 79.2 | 82.3 [[2]](#References) | 22 | | NER | MiniHAREM (10 classes) | **78.5** | 77.6 | 73.1 | 74.6 [[2]](#References) | 23 | 24 | ### NER experiments code 25 | 26 | Code and instructions to reproduce the Named Entity Recognition experiments are in [`ner_evaluation/`](ner_evaluation/) directory. 27 | 28 | 29 | ## PyTorch usage example 30 | 31 | Our PyTorch artifacts are compatible with the [🤗Huggingface Transformers](https://github.com/huggingface/transformers) library and are also available on the [Community models](https://huggingface.co/models): 32 | 33 | - [BERTimbau Base model card](https://huggingface.co/neuralmind/bert-base-portuguese-cased) 34 | - [BERTimbau Large model card](https://huggingface.co/neuralmind/bert-large-portuguese-cased) 35 | 36 | ```python 37 | from transformers import AutoModel, AutoTokenizer 38 | 39 | # Using the community model 40 | # BERT Base 41 | tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased') 42 | model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased') 43 | 44 | # BERT Large 45 | tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased') 46 | model = AutoModel.from_pretrained('neuralmind/bert-large-portuguese-cased') 47 | 48 | # or, using BertModel and BertTokenizer directly 49 | from transformers import BertModel, BertTokenizer 50 | 51 | tokenizer = BertTokenizer.from_pretrained('path/to/vocab.txt', do_lower_case=False) 52 | model = BertModel.from_pretrained('path/to/bert_dir') # Or other BERT model class 53 | ``` 54 | 55 | ## Acknowledgement 56 | 57 | We would like to thank Google for Cloud credits under a research grant that allowed us to train these models. 58 | 59 | ## References 60 | 61 | [1] [Multilingual Transformer Ensembles for Portuguese Natural Language Task](https://www.researchgate.net/publication/340236502_Multilingual_Transformer_Ensembles_for_Portuguese_Natural_Language_Tasks) 62 | 63 | [2] [Assessing the Impact of Contextual Embeddings for Portuguese Named Entity Recognition](https://github.com/jneto04/ner-pt) 64 | 65 | 66 | ## How to cite this work 67 | 68 | @InProceedings{souza2020bertimbau, 69 | author="Souza, F{\'a}bio and Nogueira, Rodrigo and Lotufo, Roberto", 70 | editor="Cerri, Ricardo and Prati, Ronaldo C.", 71 | title="BERTimbau: Pretrained BERT Models for Brazilian Portuguese", 72 | booktitle="Intelligent Systems", 73 | year="2020", 74 | publisher="Springer International Publishing", 75 | address="Cham", 76 | pages="403--417", 77 | isbn="978-3-030-61377-8" 78 | } 79 | 80 | 81 | 82 | @article{souza2019portuguese, 83 | title={Portuguese Named Entity Recognition using BERT-CRF}, 84 | author={Souza, F{\'a}bio and Nogueira, Rodrigo and Lotufo, Roberto}, 85 | journal={arXiv preprint arXiv:1909.10649}, 86 | url={http://arxiv.org/abs/1909.10649}, 87 | year={2019} 88 | } 89 | -------------------------------------------------------------------------------- /ner_evaluation/run_bert_harem.py: -------------------------------------------------------------------------------- 1 | """Training and evaluation entry point for HAREM experiments. 
2 | 3 | This file simply defines a function that loads input data into Example 4 | instances for training/evaluation and defines evaluation metrics for each 5 | dataset split set. 6 | 7 | Since `load_and_cache_examples` function below uses 8 | `preprocessing.read_examples` to read the JSON dataset files. See its docstring 9 | for a description of the JSON structure. 10 | """ 11 | 12 | import logging 13 | from argparse import Namespace 14 | from typing import List, Tuple 15 | 16 | import torch 17 | from pytorch_transformers import BertTokenizer 18 | from seqeval.metrics import (classification_report, 19 | f1_score, 20 | precision_score, 21 | recall_score) 22 | from torch.utils.data import Dataset 23 | 24 | from dataset import get_dataset 25 | from eval_tools import confusion_matrix_nested, filtered, SequenceMetrics 26 | from preprocessing import (Example, InputSpan, get_features_from_examples, 27 | read_examples) 28 | from tag_encoder import NERTagsEncoder 29 | from trainer import main 30 | 31 | 32 | logger = logging.getLogger(__name__) 33 | 34 | 35 | def load_and_cache_examples( 36 | args: Namespace, 37 | tokenizer: BertTokenizer, 38 | tag_encoder: NERTagsEncoder, 39 | mode: str, 40 | ) -> Tuple[Dataset, List[Example], List[InputSpan]]: 41 | """Preprocesses an input JSON file with raw training/evaluation 42 | examples and to BERT format according to the provided args (tokenizer, 43 | tag_encoder/scheme, max sequence length, doc stride, etc).""" 44 | if args.local_rank not in [-1, 0]: 45 | # Make sure only the first process in distributed training process 46 | # the dataset, and the others will use the cache. 47 | # TODO: Verify if this is working as expected. 48 | torch.distributed.barrier() 49 | 50 | if mode == 'train': 51 | input_file = args.train_file 52 | elif mode == 'valid': 53 | input_file = args.valid_file 54 | else: 55 | assert mode == 'eval', f"Invalid mode: {mode}" 56 | input_file = args.eval_file 57 | 58 | # HAREM dataset specific sanity checks 59 | # Assert all files use the same scenario (selective or total). 60 | scenario = 'selective' if 'selective' in input_file else 'total' 61 | assert scenario in args.labels_file 62 | 63 | examples = read_examples( 64 | input_file=input_file, 65 | is_training=True, 66 | classes=tag_encoder.classes, 67 | scheme=args.scheme) 68 | features = get_features_from_examples( 69 | examples, 70 | tag_encoder, 71 | tokenizer, 72 | args, 73 | mode=mode, 74 | unique_id_start=1000000000, 75 | verbose=args.verbose_logging) 76 | 77 | if mode != 'eval': 78 | if args.few_samples != -1: 79 | logger.info('Limiting dataset to %d examples.', 80 | args.few_samples) 81 | examples = examples[:args.few_samples] 82 | features = list(filter( 83 | lambda f: f.example_index < args.few_samples, features)) 84 | logger.info('Final features: %d', len(features)) 85 | 86 | if args.local_rank == 0: 87 | # Make sure only the first process in distributed training process 88 | # the dataset, and the others will use the cache 89 | # TODO: Verify if this is working as expected. 90 | torch.distributed.barrier() 91 | 92 | dataset = get_dataset(features) 93 | 94 | return dataset, examples, features 95 | 96 | 97 | def get_train_metrics_fn(tag_encoder) -> SequenceMetrics: 98 | """Get SequenceMetrics instance for evaluating on the train data.""" 99 | metrics = [ 100 | ('f1_score', f1_score) 101 | ] 102 | return SequenceMetrics(metrics) 103 | 104 | 105 | def get_eval_metrics_fn(tag_encoder) -> SequenceMetrics: 106 | """Get SequenceMetrics instance for evaluating on the evaluation data. 
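
    The F1, precision, recall and classification report metrics are wrapped
    with `filtered`, which removes invalid tag transitions from the
    predictions before scoring.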
107 | """ 108 | metrics = [ 109 | ('f1_score', filtered(f1_score, tag_encoder)), 110 | ('precision', filtered( 111 | precision_score, tag_encoder)), 112 | ('recall', filtered( 113 | recall_score, tag_encoder)), 114 | ('classification_report', 115 | filtered(classification_report, tag_encoder, digits=4)), 116 | ('confusion_matrix', confusion_matrix_nested), 117 | ] 118 | 119 | return SequenceMetrics(metrics) 120 | 121 | 122 | if __name__ == "__main__": 123 | 124 | logging.basicConfig( 125 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 126 | datefmt='%m/%d/%Y %H:%M:%S', 127 | level=logging.INFO) 128 | 129 | main(load_and_cache_examples, 130 | get_train_metrics_fn=get_train_metrics_fn, 131 | get_valid_metrics_fn=get_eval_metrics_fn, # same as evaluation 132 | get_eval_metrics_fn=get_eval_metrics_fn, 133 | ) 134 | -------------------------------------------------------------------------------- /ner_evaluation/postprocessing.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | def select_max_context_tokens(y_pred, prediction_mask, token_is_max_context): 8 | """Selects y_pred elements masked by prediction_mask & 9 | token_is_max_context. 10 | `y_pred` can be the output of any BERT model, and hence does not have a 11 | fixed expected length nor type. 12 | 13 | Shapes: 14 | ------- 15 | y_pred: [seq_length] or [sum(prediction_mask)]. Shape depends on whether the 16 | BERT model has a CRF layer. 17 | prediction_mask: [seq_length] 18 | token_is_max_context: Variable length. Ranges from [doc_stride] up to 19 | [seg_length - 1]. 20 | """ 21 | # Remove [CLS] token from prediction_mask 22 | prediction_mask = np.asarray(prediction_mask[1:], dtype=np.bool) 23 | max_context_mask = np.asarray(token_is_max_context, dtype=np.bool) 24 | 25 | if len(max_context_mask) < len(prediction_mask): 26 | # Right pad max_context with zeros to the size of prediction_mask 27 | right_pad = len(prediction_mask) - len(max_context_mask) 28 | max_context_mask = np.pad(max_context_mask, (0, right_pad), 29 | mode='constant', constant_values=(0, 0)) 30 | 31 | # 1st case: y_pred is output of CRF layer 32 | if isinstance(y_pred, list): 33 | # y_pred is output of CRF layer (already masked by prediction_mask) 34 | # So we have to index max_context_mask by prediction_mask 35 | assert len(y_pred) == sum(prediction_mask) 36 | out_mask = max_context_mask[prediction_mask] 37 | 38 | else: 39 | y_pred = y_pred[1:] # Remove [CLS] token 40 | 41 | if len(y_pred) == len(prediction_mask): 42 | # 2nd case: output of BERT model 43 | out_mask = prediction_mask & max_context_mask 44 | 45 | else: 46 | # y_pred is output of BERT-LSTM, that outputs arrays of variable 47 | # length (same size as non-masked input, i.e. sum(input_mask). 48 | # We just need to adjust the masks to have the same length as the 49 | # output. 
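            # Sanity check: the truncated tail must contain no prediction or
            # max-context tokens.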
50 | assert prediction_mask[len(y_pred):].sum() == 0 51 | assert max_context_mask[len(y_pred):].sum() == 0 52 | prediction_mask = prediction_mask[:len(y_pred)] 53 | max_context_mask = max_context_mask[:len(y_pred)] 54 | 55 | out_mask = prediction_mask & max_context_mask 56 | 57 | return np.asarray(y_pred)[out_mask] 58 | 59 | 60 | def concatenate(list_tensors): 61 | """Concatenates a list of arrays/tensors/list.""" 62 | 63 | if isinstance(list_tensors[0], np.ndarray): 64 | return np.concatenate(list_tensors) 65 | 66 | if isinstance(list_tensors[0], torch.Tensor): 67 | return torch.cat(list_tensors) 68 | 69 | if isinstance(list_tensors[0], list): 70 | output = [] 71 | for tensor in list_tensors: 72 | output.extend(tensor) 73 | return output 74 | 75 | raise TypeError(f"Received invalid type: {type(list_tensors[0])}") 76 | 77 | 78 | class MissingPartialOutputError(Exception): 79 | pass 80 | 81 | 82 | class OutputComposer: 83 | """Combines the output of split examples using the max context tokens of 84 | each span.""" 85 | 86 | def __init__(self, examples, features, output_transform_fn=None): 87 | self.examples = examples 88 | self.features = features 89 | self.ix2feature = defaultdict(dict) 90 | for feat in features: 91 | self.ix2feature[feat.example_index][feat.doc_span_index] = feat 92 | 93 | self.output_transform_fn = output_transform_fn 94 | self.reset() 95 | 96 | def reset(self): 97 | """Clear all partial outputs.""" 98 | self.partial_outputs = {i: {} for i in range(len(self.examples))} 99 | 100 | def insert_partial_output(self, example_ix, doc_span_ix, output): 101 | """Selects max context tokens from partial output.""" 102 | feature = self.ix2feature[example_ix][doc_span_ix] 103 | output = select_max_context_tokens(output, 104 | feature.prediction_mask, 105 | feature.token_is_max_context) 106 | self.partial_outputs[example_ix][doc_span_ix] = output 107 | 108 | def insert_batch(self, example_ixs, doc_span_ixs, batch_output): 109 | """Insert a batch of partial predictions.""" 110 | for output, example_ix, doc_span_ix in zip(batch_output, 111 | example_ixs, 112 | doc_span_ixs): 113 | self.insert_partial_output( 114 | example_ix.item(), doc_span_ix.item(), output) 115 | 116 | def get_example_output(self, example_ix): 117 | """Returns the final output of an example.""" 118 | N_spans = len(self.ix2feature[example_ix]) 119 | try: 120 | example_partial_outputs = [ 121 | self.partial_outputs[example_ix].get(j, []) for j in range(N_spans) 122 | ] 123 | except KeyError as err: 124 | span_ix = err.args[0] 125 | msg = (f"Missing partial output for example {example_ix}, span " 126 | f"{span_ix}.") 127 | raise MissingPartialOutputError(msg) from None 128 | 129 | complete_output = concatenate(example_partial_outputs) 130 | assert len(complete_output) == len( 131 | self.examples[example_ix].doc_tokens) 132 | 133 | if self.output_transform_fn is not None: 134 | transformed_output = self.output_transform_fn(complete_output) 135 | return transformed_output 136 | 137 | return complete_output 138 | 139 | def get_outputs(self): 140 | """Returns a list of max-context-combined outputs of all examples.""" 141 | outputs = [] 142 | for example_ix in range(len(self.examples)): 143 | example_output = self.get_example_output(example_ix) 144 | outputs.append(example_output) 145 | 146 | return outputs 147 | -------------------------------------------------------------------------------- /ner_evaluation/run_inference.py: -------------------------------------------------------------------------------- 1 | """This script 
is an example on how to perform NER inference on plain texts. 2 | 3 | Input file must be either a JSON file (that can have multiple documents) or a 4 | txt file with a single document. 5 | """ 6 | import json 7 | import logging 8 | import os 9 | import tempfile 10 | from argparse import ArgumentParser, Namespace 11 | from typing import List, Tuple 12 | 13 | import torch 14 | from pytorch_transformers.tokenization_bert import BertTokenizer 15 | from torch.utils.data import DataLoader, Dataset 16 | from tqdm import tqdm 17 | 18 | from dataset import get_dataset 19 | from eval_tools import (SequenceMetrics, write_conll_prediction_file, 20 | write_outputs_to_json) 21 | from postprocessing import OutputComposer 22 | from preprocessing import (Example, InputSpan, get_features_from_examples, 23 | read_examples) 24 | from tag_encoder import NERTagsEncoder 25 | from trainer import evaluate 26 | from utils import load_model 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def convert_txt_to_tmp_json_file(txt_file: str) -> str: 32 | """Converts a txt file with inference content to a JSON file with schema 33 | expected by read_examples. Returns a filename to the temp JSON file.""" 34 | with open(txt_file) as fd: 35 | text = fd.read() 36 | 37 | tmp_file = tempfile.NamedTemporaryFile(mode='w', delete=False) 38 | json_data = [{"doc_id": 0, "doc_text": text}] 39 | 40 | tmp_file.write(json.dumps(json_data)) 41 | tmp_file.close() 42 | 43 | return tmp_file.name 44 | 45 | 46 | def load_and_cache_examples( 47 | input_file: str, 48 | args: Namespace, 49 | tokenizer: BertTokenizer, 50 | tag_encoder: NERTagsEncoder, 51 | mode: str, 52 | ) -> Tuple[Dataset, List[Example], List[InputSpan]]: 53 | """Preprocesses an input JSON file to generate inference examples and 54 | convert to BERT format according to the provided args (tokenizer, 55 | tag_encoder/scheme, max sequence length, doc stride, etc).""" 56 | 57 | examples = read_examples( 58 | input_file=input_file, 59 | is_training=False, 60 | classes=tag_encoder.classes, 61 | scheme=args.scheme) 62 | features = get_features_from_examples( 63 | examples, 64 | tag_encoder, 65 | tokenizer, 66 | args, 67 | mode=mode, 68 | unique_id_start=0, 69 | verbose=args.verbose_logging) 70 | 71 | dataset = get_dataset(features) 72 | 73 | return dataset, examples, features 74 | 75 | 76 | if __name__ == "__main__": 77 | 78 | parser = ArgumentParser("NER inference CLI") 79 | 80 | # Model and hyperparameters 81 | parser.add_argument("--input_file", 82 | required=True, 83 | help="File to load examples for inference (JSON or " 84 | "txt).") 85 | parser.add_argument("--output_file", 86 | default='-', 87 | help="File to save prediction results. Defaults to " 88 | "stdout.") 89 | parser.add_argument("--output_format", 90 | choices=("json", "conll"), 91 | default="json", 92 | help="Format to save the predictions (json or conll). " 93 | "Defaults to json.") 94 | 95 | parser.add_argument("--bert_model", default=None, type=str, required=True, 96 | help="Bert pre-trained model name or path to a " 97 | "checkpoint directory.") 98 | parser.add_argument("--tokenizer_model", default=None, type=str, 99 | required=False, 100 | help="Path to tokenizer files. If empty, defaults to " 101 | "--bert_model.") 102 | parser.add_argument("--do_lower_case", 103 | action='store_true', 104 | help="Whether to lower case the input text. 
True for " 105 | "uncased models, False for cased models.") 106 | parser.add_argument("--max_seq_length", default=512, type=int, 107 | help="The maximum total input sequence length after " 108 | "WordPiece tokenization. Sequences longer than this " 109 | "will be split into multiple spans, and sequences " 110 | "shorter than this will be padded.") 111 | parser.add_argument("--doc_stride", default=128, type=int, 112 | help="When splitting up a long document into chunks, " 113 | "how much stride to take between chunks.") 114 | parser.add_argument('--labels_file', 115 | required=True, 116 | help="File with all NER classes to be considered, one " 117 | "per line.") 118 | parser.add_argument('--scheme', 119 | default='bio', help='NER tagging scheme (BIO|BILUO).') 120 | parser.add_argument('--no_crf', 121 | action='store_true', 122 | help='Remove the CRF layer (use plain BERT or ' 123 | 'BERT-LSTM).') 124 | parser.add_argument('--pooler', 125 | default='last', 126 | help='Pooling strategy for extracting BERT encoded ' 127 | 'features from last BERT layers. ' 128 | 'One of "last", "sum" or "concat".') 129 | parser.add_argument('--freeze_bert', 130 | action='store_true', 131 | help="Freeze BERT layers' parameters. If True, uses " 132 | "either a BERT-LSTM or BERT-LSTM-CRF model.") 133 | parser.add_argument('--lstm_hidden_size', 134 | type=int, 135 | default=100, 136 | help=('Hidden dimension of the LSTM (only used when ' 137 | 'the BERT model is frozen.')) 138 | parser.add_argument('--lstm_layers', 139 | type=int, 140 | default=1, 141 | help=('Number of LSTM layers (only used when the BERT ' 142 | 'model is frozen.')) 143 | parser.add_argument('--no_cuda', action='store_true', 144 | help='Disables CUDA devices for inference.') 145 | parser.add_argument('--batch_size', type=int, 146 | default=1, help='Batch size.') 147 | parser.add_argument('--verbose_logging', action='store_true') 148 | 149 | args = parser.parse_args() 150 | args.local_rank = -1 151 | 152 | logging.basicConfig() 153 | 154 | if torch.cuda.is_available and not args.no_cuda: 155 | args.device = torch.device("cuda") 156 | args.n_gpu = 1 157 | else: 158 | args.device = torch.device("cpu") 159 | args.n_gpu = 0 160 | 161 | tokenizer_path = args.tokenizer_model or args.bert_model 162 | tokenizer = BertTokenizer.from_pretrained( 163 | tokenizer_path, do_lower_case=args.do_lower_case) 164 | 165 | # Instantiate NER Tag encoder 166 | tag_encoder = NERTagsEncoder.from_labels_file( 167 | args.labels_file, scheme=args.scheme.upper()) 168 | 169 | args.num_labels = tag_encoder.num_labels 170 | args.override_cache = True 171 | 172 | # Load a pretrained model 173 | model = load_model(args, args.bert_model, training=False) 174 | model.to(args.device) 175 | 176 | if args.input_file.endswith('.txt'): 177 | args.inference_file = convert_txt_to_tmp_json_file(args.input_file) 178 | else: 179 | args.inference_file = args.input_file 180 | 181 | args.override_cache = True 182 | 183 | dataset, examples, features = load_and_cache_examples( 184 | args.inference_file, 185 | args=args, 186 | tokenizer=tokenizer, 187 | tag_encoder=tag_encoder, 188 | mode='inference', 189 | ) 190 | 191 | output_composer = OutputComposer( 192 | examples, 193 | features, 194 | output_transform_fn=tag_encoder.convert_ids_to_tags) 195 | 196 | logger.info("***** Running predictions *****") 197 | logger.info(" Num orig examples = %d", len(examples)) 198 | logger.info(" Num split examples = %d", len(features)) 199 | logger.info(" Batch size = %d", args.batch_size) 200 | 201 | # Run 
prediction for full data 202 | dataloader = DataLoader(dataset, 203 | batch_size=args.batch_size, 204 | num_workers=os.cpu_count()) 205 | 206 | model.frozen_bert = False 207 | 208 | metrics = evaluate( 209 | args, 210 | model, 211 | tqdm(dataloader, desc="Prediction"), 212 | output_composer=output_composer, 213 | sequence_metrics=SequenceMetrics([]), # Empty metrics 214 | reset=True, 215 | ) 216 | 217 | # Get predictions for all examples 218 | all_y_pred_raw = output_composer.get_outputs() 219 | # Filter invalid predictions 220 | all_y_pred = [tag_encoder.decode_valid(y_pred) 221 | for y_pred in all_y_pred_raw] 222 | 223 | # Write predictions to output file 224 | if args.output_format == 'conll': 225 | write_conll_prediction_file(args.output_file, examples, all_y_pred) 226 | 227 | elif args.output_format == 'json': 228 | write_outputs_to_json(args.output_file, examples, all_y_pred) 229 | -------------------------------------------------------------------------------- /ner_evaluation/eval_tools.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import json 3 | import sys 4 | from typing import Any, Callable, Dict, List, Optional, Tuple, Union 5 | 6 | import numpy as np 7 | from seqeval.metrics.sequence_labeling import get_entities 8 | 9 | from preprocessing import Example, InputSpan 10 | 11 | 12 | TAG_SEQUENCE = Union[List[int], List[str]] 13 | METRIC_FN = Callable[[List[TAG_SEQUENCE], List[TAG_SEQUENCE]], Any] 14 | 15 | 16 | def flatten(list_: List[Any]) -> List[Any]: 17 | """Flattens a nested list of tag predictions.""" 18 | result = [] 19 | 20 | for sub in list_: 21 | if sub and isinstance(sub, list) and isinstance(sub[0], list): 22 | result.extend(flatten(sub)) 23 | elif isinstance(sub, list): 24 | result.extend(sub) 25 | else: 26 | result.append(sub) 27 | 28 | return result 29 | 30 | 31 | def confusion_matrix_nested(y_true: List[TAG_SEQUENCE], 32 | y_pred: List[TAG_SEQUENCE]) -> str: 33 | """Shortcut to Sklearn Confusion Matrix accepting nested lists of 34 | gold labels and predictions instead of flats lists.""" 35 | from sklearn.metrics import confusion_matrix 36 | return confusion_matrix(flatten(y_true), flatten(y_pred)) 37 | 38 | 39 | def filtered(metric_fn: METRIC_FN, 40 | ner_tags, 41 | **kwargs: Any, 42 | ) -> METRIC_FN: 43 | """Wraps a metric function with invalid tag decoding filtering (removal of 44 | invalid tag predictions for the tag scheme). 45 | 46 | Args: 47 | metric_fn: a metric function. 48 | ner_tags: a NERLabelEncoder instance. Used to perform valid tag 49 | decoding. 50 | kwargs: extra arguments to be passed to `metric_fn`. 51 | """ 52 | def metric(y_true: List[TAG_SEQUENCE], y_pred: List[TAG_SEQUENCE]) -> Any: 53 | y_pred = [ner_tags.decode_valid(y) for y in y_pred] 54 | return metric_fn(y_true, y_pred, **kwargs) 55 | return metric 56 | 57 | 58 | def pad_max_context_array(max_context_mask, max_length=512): 59 | """Right pad max_context with zeros to the size of prediction_mask""" 60 | right_pad = max_length - len(max_context_mask) 61 | max_context_mask = np.pad(max_context_mask, (0, right_pad), 62 | mode='constant', constant_values=(0, 0)) 63 | 64 | return max_context_mask.astype(np.bool) 65 | 66 | 67 | def postprocess_span_output(y_pred: TAG_SEQUENCE, span_features: InputSpan): 68 | """Postprocess the span output to consider only tokens of max context and 69 | not masked. 70 | 71 | The problem: 72 | The network is spitting span outputs. 
An example almost always have 73 | more than one span, and we have to combine all the spans to get the 74 | final output. 75 | 76 | Args: 77 | y_pred(List[int]): predicted class ids for one example span. 78 | span_features(InputFeatures): features of the span input. 79 | """ 80 | 81 | out_cls_ids = [] 82 | last_token_ix = -1 83 | 84 | # Get output classes skipping subtokens, the first [CLS] and masked tokens 85 | for tok_ix, cls_id in enumerate(y_pred[1:], start=1): 86 | 87 | is_considered = span_features.input_mask[tok_ix] 88 | pred_mask = span_features.prediction_mask[tok_ix] 89 | if is_considered and pred_mask: 90 | orig_token_ix = span_features.token_to_orig_map[tok_ix] 91 | is_max_context = span_features.token_is_max_context[tok_ix - 1] 92 | 93 | if orig_token_ix > last_token_ix: 94 | last_token_ix = orig_token_ix 95 | 96 | if is_max_context: 97 | out_cls_ids.append(cls_id) 98 | 99 | return out_cls_ids 100 | 101 | 102 | class SequentialSpanPostProcessor(object): 103 | """BERT (without CRF) Span post-processing class. 104 | This class handles network postprocessing after each batch. 105 | This class expects that the example order is NOT randomized, i.e., the 106 | DataLoader uses a SequentialSampler. 107 | """ 108 | 109 | def __init__(self, features: List[InputSpan]): 110 | self.features = features 111 | self._index = 0 112 | 113 | def reset(self) -> None: 114 | self._index = 0 115 | 116 | def __call__(self, 117 | y_true: TAG_SEQUENCE, 118 | y_pred: TAG_SEQUENCE, 119 | ) -> Tuple[int, TAG_SEQUENCE, TAG_SEQUENCE]: 120 | """Performs max-context token selection for a single span.""" 121 | 122 | span_features = self.features[self._index] 123 | y_true = postprocess_span_output(y_true, span_features) 124 | y_pred = postprocess_span_output(y_pred, span_features) 125 | self._index += 1 126 | 127 | return span_features.example_index, y_true, y_pred 128 | 129 | 130 | class CRFSpanPostProcessor(object): 131 | """Post-processes the output of the BERT-CRF network. 132 | 133 | The CRF layer outputs a list of lists of label ids of variable size. 134 | Each sequence has a variable length, defined by the feature output mask. 135 | Besides the prediction mask, we must select only the max context tokens of 136 | each document span to reconstruct the example text. 137 | """ 138 | 139 | def __init__(self, features: List[InputSpan]): 140 | self.features = features 141 | # _index is the example index. 142 | self._index = 0 143 | 144 | def reset(self) -> None: 145 | self._index = 0 146 | 147 | def __call__(self, y_true: TAG_SEQUENCE, y_pred: TAG_SEQUENCE): 148 | span_features = self.features[self._index] 149 | 150 | max_context_mask = pad_max_context_array( 151 | span_features.token_is_max_context, 152 | len(span_features.input_ids)) 153 | 154 | output_mask = np.asarray(span_features.prediction_mask, dtype=np.uint) 155 | partial_example_mask = max_context_mask[output_mask] 156 | 157 | y_true = [y for y, mask in zip(y_true, partial_example_mask) if mask] 158 | y_pred = [y for y, mask in zip(y_pred, partial_example_mask) if mask] 159 | 160 | assert len(y_true) == len(y_pred), \ 161 | "y_true and y_pred should be of same length" 162 | 163 | self._index += 1 164 | 165 | return span_features.example_index, y_true, y_pred 166 | 167 | 168 | class SequenceMetrics(object): 169 | """Calculates sequence metrics and keeps history of metric values. 170 | 171 | NOTE: Methods `get_best` and `get_best_epoch` assumes a **higher value** 172 | is better. 
173 | """ 174 | 175 | def __init__(self, metrics: List[Tuple[str, METRIC_FN]]): 176 | self.metrics = {} 177 | self.history = {} 178 | 179 | for metric_name, metric_fn in metrics: 180 | self.add_metric(metric_name, metric_fn) 181 | 182 | def add_metric(self, metric_name: str, metric_fn: METRIC_FN) -> None: 183 | self.metrics[metric_name] = metric_fn 184 | self.history[metric_name] = [] 185 | 186 | def clear_history(self) -> None: 187 | self.history = { 188 | k: [] for k in self.history.keys() 189 | } 190 | 191 | def get_best(self, metric_name: str) -> Any: 192 | """Returns the maximum value of the given metric by name.""" 193 | return max(self.history[metric_name]) 194 | 195 | def get_best_epoch(self, metric_name: str) -> int: 196 | """Returns the epoch number for which the metric has its highest 197 | value.""" 198 | return int(np.argmax(self.history[metric_name]) + 1) 199 | 200 | def get_value(self, metric_name: str, epoch: Optional[int] = None) -> Any: 201 | """Returns the value of a metric at a given epoch (defaults to last 202 | epoch).""" 203 | if epoch is None: 204 | epoch = -1 205 | else: 206 | epoch = epoch - 1 207 | return self.history[metric_name][epoch] 208 | 209 | def calculate_metrics(self, 210 | y_true: List[TAG_SEQUENCE], 211 | y_pred: List[TAG_SEQUENCE], 212 | ) -> Dict[str, Any]: 213 | """Calculates all registered metrics for the gold and predicted tag 214 | sequences. 215 | 216 | Args: 217 | y_true: a list of gold tag sequences. 218 | y_pred: a list of predicted tag sequences. 219 | 220 | Returns: 221 | A dict of metric names to calculated metric values. 222 | """ 223 | values = {} 224 | 225 | for name, metric_fn in self.metrics.items(): 226 | metric_value = metric_fn(y_true, y_pred) 227 | values[name] = metric_value 228 | self.history[name].append(metric_value) 229 | 230 | return values 231 | 232 | 233 | @contextlib.contextmanager 234 | def smart_open(filename=None): 235 | if filename and filename != '-': 236 | fh = open(filename, 'w') 237 | else: 238 | fh = sys.stdout 239 | 240 | try: 241 | yield fh 242 | finally: 243 | if fh is not sys.stdout: 244 | fh.close() 245 | 246 | 247 | def write_conll_prediction_file( 248 | out_file: str, 249 | examples: List[Example], 250 | y_preds: List[TAG_SEQUENCE]) -> None: 251 | """Writes a text output with predictions for a collection of Examples in 252 | CoNLL evaluation format, one token per line: 253 | 254 | TOKEN GOLD-TAG PRED-TAG 255 | 256 | Distinct example outputs are separated by a blank line. 257 | 258 | Args: 259 | out_file: the path of the output CoNLL prediction file. 260 | examples: list of Example instances with associated tokens and gold 261 | tag labels. 262 | y_preds: list of predicted tag sequences for each example. 263 | 264 | Raises: 265 | AssertionError: if (a) the lengths of y_preds and examples are not 266 | equal, or (b) there is a mismatch in length of tokens, labels or 267 | predicted tags for any example. 
268 | """ 269 | assert len(y_preds) == len(examples) 270 | 271 | with smart_open(out_file) as fd: 272 | for example, pred_tag in zip(examples, y_preds): 273 | 274 | tokens = example.doc_tokens 275 | labels = example.labels 276 | 277 | assert len(tokens) == len(labels) 278 | assert len(labels) == len(pred_tag) 279 | 280 | for token, label, pred in zip(tokens, labels, pred_tag): 281 | fd.write('{} {} {}\n'.format(str(token.text), label, pred)) 282 | 283 | # Separate examples by line break 284 | fd.write('\n') 285 | 286 | 287 | def write_outputs_to_json(out_file: str, 288 | examples: List[Example], 289 | y_preds: List[TAG_SEQUENCE]) -> None: 290 | """Writes a JSON with prediction outputs. 291 | 292 | Args: 293 | out_file: path to an output file or '-' to use stdout. 294 | examples: list of Example instances with associated tokens. 295 | y_preds: list of predicted tag sequences for each example. 296 | """ 297 | output = [] 298 | for example, y_pred in zip(examples, y_preds): 299 | predicted_entities = [] 300 | 301 | for entity in get_entities(y_pred): 302 | entity_class, start_token_ix, end_token_ix = entity 303 | start_char = example.doc_tokens[start_token_ix].offset 304 | end_token = example.doc_tokens[end_token_ix] 305 | end_char = end_token.offset + len(end_token) 306 | 307 | predicted_entities.append({ 308 | 'class': entity_class, 309 | 'start_char': start_char, 310 | 'end_char': end_char, 311 | 'text': example.orig_text[start_char:end_char], 312 | }) 313 | output.append({ 314 | 'doc_id': example.doc_id, 315 | 'text': example.orig_text, 316 | 'entities': predicted_entities, 317 | }) 318 | 319 | with smart_open(out_file) as fd: 320 | json.dump(output, fd) 321 | -------------------------------------------------------------------------------- /ner_evaluation/README.md: -------------------------------------------------------------------------------- 1 | # Code for Named Entity Recognition task 2 | 3 | This directory has code to train and evaluate BERT based models on NER task using the HAREM datasets. This package implements 4 architectures divided in two approaches: 4 | 5 | **Fine-tuning**: 6 | 7 | - BERT-CRF 8 | - BERT 9 | 10 | **Feature-based (BERT embeddings)**: 11 | 12 | - BERT-LSTM-CRF 13 | - BERT-LSTM 14 | 15 | The training and evaluation entry point script is `run_bert_harem.py`. `run_inference.py` can be used to run inference on new data. All other files are modules. Commands to train and evaluate our BERT models on HAREM datasets are below for each distinct setup: Total and Selective scenarios, feature-based and Fine-tuning approaches, with and without CRF. 16 | 17 | ## Environment Setup 18 | 19 | The code uses a Python 3.6 environment and a GPU is desirable. The following steps use Conda to create a Python virtual environment. Please install Conda before 20 | continuing or create an virtual environment using other tools and skip to step 3. 21 | 22 | 1 - Create a Python 3.6 virtual environment. With conda: 23 | 24 | $ conda create -n bert_crf python=3.6 25 | 26 | 2- Activate the environment: 27 | 28 | $ conda activate bert_crf 29 | # or, for older versions of Conda, 30 | $ source activate bert_crf 31 | 32 | 3- Install **PyTorch 1.1.0** using pip or conda (instructions at [PyTorch Get Started guide](https://pytorch.org/get-started/previous-versions/#v110)). Other PyTorch versions were not tested. 
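
    # For example, with pip (check the linked guide for the command matching
    # your platform and CUDA version):
    $ pip install torch==1.1.0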
33 | 34 | 4- Install other requirements 35 | 36 | $ pip install -r requirements.txt 37 | 38 | ## Trained models 39 | 40 | Here are two checkpoints of trained NER models on First HAREM dataset. 41 | 42 | [BERTimbau Base - BERT-CRF (selective scenario, 5 classes)](https://drive.google.com/file/d/125AMFLsAf33isxTumujUAYBVkoxE7zeT/view?usp=sharing) 43 | 44 | [BERTimbau Base - BERT-CRF (total scenario, 10 classes)](https://drive.google.com/file/d/12PE1ypJ949rpatseSV37NXnHwB2Y8jZ5/view?usp=sharing) 45 | 46 | ## Running inference 47 | 48 | The script `run_inference.py` can be used to get predictions for new text using a trained NER model. 49 | Instructions: 50 | 51 | 1. Download and extract a trained model checkpoint from above or train your own (instructions below). 52 | 53 | 2. Save the inference data in a txt file (for a single input document) or in a JSON file (for multiple documents): 54 | 55 | # inference_text.txt 56 | Pink Floyd foi uma banda britânica de rock formada em Londres em 1965. 57 | O grupo foi fundado pelos estudantes Syd Barrett (guitarra, vocal), Nick Mason (bateria), Roger Waters (baixo, voz) e Richard Wright (teclados, voz). Sob a liderança de Barrett, eles lançaram dois singles e um álbum de estreia de sucesso, The Piper at the Gates of Dawn (1967). 58 | 59 | # inference_data.json 60 | [{"doc_id": 0, "doc_text": "Text of the 1st document"}, {"doc_id": 1, "doc_text": "Text of the 2nd document"}] 61 | 62 | 3. Run inference command. This command assumes a downloaded checkpoint that was extracted in `bertimbau-base_bert-crf_total/`. Use the `--help` flag to display extra information about `--output_format` and `--output_file`. 63 | 64 | python run_inference.py \ 65 | --bert_model bertimbau-base_bert-crf_total/ \ 66 | --labels_file bertimbau-base_bert-crf_total/classes.txt \ 67 | --input_file inference_text.txt \ 68 | --output_format json \ 69 | --output_file - 70 | 71 | By default, predictions will be printed in JSON format to stdout: 72 | 73 | [{"doc_id": 0, "text": "Pink Floyd foi uma banda brit\u00e2nica de rock formada em Londres em 1965.\nO grupo foi fundado pelos estudantes Syd Barrett (guitarra, vocal), Nick Mason (bateria), Roger Waters (baixo, voz) e Richard Wright (teclados, voz). Sob a lideran\u00e7a de Barrett, eles lan\u00e7aram dois singles e um \u00e1lbum de estreia de sucesso, The Piper at the Gates of Dawn (1967).", "entities": [{"class": "PESSOA", "start_char": 0, "end_char": 11, "text": "Pink Floyd "}, {"class": "LOCAL", "start_char": 54, "end_char": 62, "text": "Londres "}, {"class": "TEMPO", "start_char": 65, "end_char": 69, "text": "1965"}, {"class": "PESSOA", "start_char": 108, "end_char": 120, "text": "Syd Barrett "}, {"class": "PESSOA", "start_char": 139, "end_char": 150, "text": "Nick Mason "}, {"class": "PESSOA", "start_char": 161, "end_char": 174, "text": "Roger Waters "}, {"class": "PESSOA", "start_char": 189, "end_char": 204, "text": "Richard Wright "}, {"class": "PESSOA", "start_char": 240, "end_char": 247, "text": "Barrett"}, {"class": "OBRA", "start_char": 310, "end_char": 341, "text": "The Piper at the Gates of Dawn "}, {"class": "TEMPO", "start_char": 342, "end_char": 346, "text": "1967"}]}] 74 | 75 | ## Running NER trainings and evaluations 76 | 77 | In all commands below, `{pretrained_bert_model_path}` has to be changed by either a path to BERTimbau Base or Large checkpoint (downloaded from this repository), or the string `bert-base-multilingual-cased` to use mBERT. 
78 | 79 | In each training run (`--do_train`), the model is trained for `--num_train_epochs` epochs using data from `--train_file`, and validation is performed using data from `--valid_file`. The checkpoint of the best epoch is saved in the output directory `--output_dir`. When `--do_eval` is set, a txt file with the 80 | predictions for the test set (`--eval_file` argument) in CoNLL format will also be saved. See the next section 81 | for the commands to calculate the CoNLL metrics. 82 | 83 | When the training ends, some metrics are displayed on the terminal for the validation and test sets: 84 | 85 | - Micro F1-score 86 | - Precision 87 | - Recall 88 | - Classification Report: metrics per class. The micro avg line displays CoNLL equivalent metrics. 89 | 90 | #### Datasets 91 | 92 | The `data` directory contains the preprocessed HAREM datasets for both Selective and Total scenarios converted to JSON format. First HAREM is split into train/dev sets and Mini HAREM is used as the test set. These JSON files are produced from the original HAREM XML files using [this script](https://github.com/fabiocapsouza/harem_preprocessing). The train/dev split is done separately. 93 | 94 | #### Important: Multi-GPU and FP16 95 | 96 | Running this script in a multi-GPU setup **is not recommended**. If the machine has multiple GPUs, limit GPU visibility by setting the `CUDA_VISIBLE_DEVICES` environment variable. Example: 97 | 98 | # Only GPU 0 will be visible 99 | CUDA_VISIBLE_DEVICES=0 python run_bert_harem.py [...] 100 | 101 | FP16 training was not tested and is also not recommended. 102 | 103 | #### Batch size 104 | 105 | The commands below set the effective batch size to 16, assuming a BERT Base model and an 8 GB GPU. The parameters `per_gpu_train_batch_size` and `gradient_accumulation_steps` can be adjusted to use less or more GPU memory while producing the same results, as long as `per_gpu_train_batch_size * gradient_accumulation_steps == 16`, as illustrated below.
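For example, on a GPU with more memory, either of the following combinations (illustrative values, not the settings used in our experiments) keeps the same effective batch size of 16:

    --per_gpu_train_batch_size 4 --gradient_accumulation_steps 4
    --per_gpu_train_batch_size 8 --gradient_accumulation_steps 2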
106 | 107 | ### Fine-tuning experiments 108 | 109 | #### BERT-CRF model 110 | 111 | # Total scenario 112 | python run_bert_harem.py \ 113 | --bert_model {pretrained_bert_model_path} \ 114 | --labels_file data/classes-total.txt \ 115 | --do_train \ 116 | --train_file data/FirstHAREM-total-train.json \ 117 | --valid_file data/FirstHAREM-total-dev.json \ 118 | --num_train_epochs 15 \ 119 | --per_gpu_train_batch_size 2 \ 120 | --gradient_accumulation_steps 8 \ 121 | --do_eval \ 122 | --eval_file data/MiniHAREM-total.json \ 123 | --output_dir output_bert-crf_total 124 | 125 | # Selective scenario 126 | python run_bert_harem.py \ 127 | --bert_model {pretrained_bert_model_path} \ 128 | --labels_file data/classes-selective.txt \ 129 | --do_train \ 130 | --train_file data/FirstHAREM-selective-train.json \ 131 | --valid_file data/FirstHAREM-selective-dev.json \ 132 | --num_train_epochs 15 \ 133 | --per_gpu_train_batch_size 2 \ 134 | --gradient_accumulation_steps 8 \ 135 | --do_eval \ 136 | --eval_file data/MiniHAREM-selective.json \ 137 | --output_dir output_bert-crf_selective 138 | 139 | --- 140 | 141 | #### BERT model 142 | 143 | # Total scenario 144 | python run_bert_harem.py \ 145 | --bert_model {pretrained_bert_model_path} \ 146 | --labels_file data/classes-total.txt \ 147 | --do_train \ 148 | --train_file data/FirstHAREM-total-train.json \ 149 | --valid_file data/FirstHAREM-total-dev.json \ 150 | --no_crf \ 151 | --num_train_epochs 50 \ 152 | --per_gpu_train_batch_size 2 \ 153 | --gradient_accumulation_steps 8 \ 154 | --do_eval \ 155 | --eval_file data/MiniHAREM-total.json \ 156 | --output_dir output_bert_total 157 | 158 | # Selective scenario 159 | python run_bert_harem.py \ 160 | --bert_model {pretrained_bert_model_path} \ 161 | --labels_file data/classes-selective.txt \ 162 | --do_train \ 163 | --train_file data/FirstHAREM-selective-train.json \ 164 | --valid_file data/FirstHAREM-selective-dev.json \ 165 | --no_crf \ 166 | --num_train_epochs 50 \ 167 | --per_gpu_train_batch_size 2 \ 168 | --gradient_accumulation_steps 8 \ 169 | --do_eval \ 170 | --eval_file data/MiniHAREM-selective.json \ 171 | --output_dir output_bert_selective 172 | 173 | --- 174 | 175 | ### Feature-based experiments 176 | 177 | These experiments use the `--freeze_bert` flag to freeze all BERT's parameters and train a LSTM-CRF or LSTM model using BERT embeddings. `--pooler sum` indicates that BERT embeddings will be produced by summing the last 4 layers of BERT instead of using only the last layer. 
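As a minimal sketch of what `--pooler sum` computes (the actual implementation is `sum_last_4_layers` in `model.py`; the variable name below is only illustrative):

    import torch

    # all_hidden_states: tuple of hidden-state tensors of shape
    # (batch, seq_len, hidden_size) returned by BERT when
    # output_hidden_states=True is set in BertConfig
    embeddings = torch.stack(all_hidden_states[-4:], dim=0).sum(dim=0)

Besides `sum`, `model.py` also implements a `last` pooler (last layer only, the default) and a `concat` pooler (concatenation of the last 4 layers).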
178 | 179 | #### BERT-LSTM-CRF model 180 | 181 | # Total scenario 182 | python run_bert_harem.py \ 183 | --bert_model {pretrained_bert_model_path} \ 184 | --labels_file data/classes-total.txt \ 185 | --do_train \ 186 | --train_file data/FirstHAREM-total-train.json \ 187 | --valid_file data/FirstHAREM-total-dev.json \ 188 | --freeze_bert \ 189 | --pooler sum \ 190 | --num_train_epochs 50 \ 191 | --per_gpu_train_batch_size 2 \ 192 | --gradient_accumulation_steps 8 \ 193 | --do_eval \ 194 | --eval_file data/MiniHAREM-total.json \ 195 | --output_dir output_bert-lstm-crf_total 196 | 197 | # Selective scenario 198 | python run_bert_harem.py \ 199 | --bert_model {pretrained_bert_model_path} \ 200 | --labels_file data/classes-selective.txt \ 201 | --do_train \ 202 | --train_file data/FirstHAREM-selective-train.json \ 203 | --valid_file data/FirstHAREM-selective-dev.json \ 204 | --freeze_bert \ 205 | --pooler sum \ 206 | --num_train_epochs 50 \ 207 | --per_gpu_train_batch_size 2 \ 208 | --gradient_accumulation_steps 8 \ 209 | --do_eval \ 210 | --eval_file data/MiniHAREM-selective.json \ 211 | --output_dir output_bert-lstm-crf_selective 212 | 213 | --- 214 | 215 | #### BERT-LSTM model 216 | 217 | # Total scenario 218 | python run_bert_harem.py \ 219 | --bert_model {pretrained_bert_model_path} \ 220 | --labels_file data/classes-total.txt \ 221 | --do_train \ 222 | --train_file data/FirstHAREM-total-train.json \ 223 | --valid_file data/FirstHAREM-total-dev.json \ 224 | --freeze_bert \ 225 | --pooler sum \ 226 | --no_crf \ 227 | --num_train_epochs 100 \ 228 | --per_gpu_train_batch_size 2 \ 229 | --gradient_accumulation_steps 8 \ 230 | --do_eval \ 231 | --eval_file data/MiniHAREM-total.json \ 232 | --output_dir output_bert-lstm_total 233 | 234 | # Selective 235 | python run_bert_harem.py \ 236 | --bert_model {pretrained_bert_model_path} \ 237 | --labels_file data/classes-selective.txt \ 238 | --do_train \ 239 | --train_file data/FirstHAREM-selective-train.json \ 240 | --valid_file data/FirstHAREM-selective-dev.json \ 241 | --freeze_bert \ 242 | --pooler sum \ 243 | --no_crf \ 244 | --num_train_epochs 100 \ 245 | --per_gpu_train_batch_size 2 \ 246 | --gradient_accumulation_steps 8 \ 247 | --do_eval \ 248 | --eval_file data/MiniHAREM-selective.json \ 249 | --output_dir output_bert-lstm_selective 250 | 251 | --- 252 | 253 | ### Computing CoNLL metrics 254 | 255 | The [conlleval](https://www.clips.uantwerpen.be/conll2000/chunking/conlleval.txt) script should be used to compute the evaluation metrics using the `predictions_conll.txt` file that is output in the evaluation procedure, as explained below. However, 256 | the package uses the [seqeval library](https://github.com/chakki-works/seqeval) to compute CoNLL equivalent metrics which are printed in the console. 257 | 258 | #### Using conlleval 259 | 260 | Download the script and make it executable. 261 | 262 | $ chmod +x conlleval.txt 263 | 264 | Then, run the command below inputing the corresponding `output_dir` of the trained model 265 | 266 | $ conlleval.txt < {output_dir}/predictions_conll.txt 267 | 268 | For example, for BERTimbau-Large-CRF on Total scenario: 269 | 270 | $ ./conlleval.txt < output_bertimbau-large_BERT-CRF_total/predictions_conll.txt 271 | processed 64853 tokens with 3642 phrases; found: 3523 phrases; correct: 2828. 
272 | accuracy: 96.80%; precision: 80.27%; recall: 77.65%; FB1: 78.94 273 | ABSTRACCAO: precision: 59.33%; recall: 59.05%; FB1: 59.19 209 274 | ACONTECIMENTO: precision: 36.51%; recall: 40.35%; FB1: 38.33 63 275 | COISA: precision: 61.26%; recall: 40.00%; FB1: 48.40 111 276 | LOCAL: precision: 89.71%; recall: 84.30%; FB1: 86.92 826 277 | OBRA: precision: 64.62%; recall: 65.97%; FB1: 65.28 195 278 | ORGANIZACAO: precision: 71.11%; recall: 75.50%; FB1: 73.24 637 279 | OUTRO: precision: 50.00%; recall: 14.29%; FB1: 22.22 4 280 | PESSOA: precision: 86.92%; recall: 83.79%; FB1: 85.33 803 281 | TEMPO: precision: 94.52%; recall: 90.61%; FB1: 92.52 347 282 | VALOR: precision: 80.79%; recall: 81.29%; FB1: 81.04 328 283 | 284 | ### Available hyperparameters 285 | 286 | Run `python run_bert_harem.py --help` to display the available hyperparameters. The default values are set to the ones used in our experiments. 287 | -------------------------------------------------------------------------------- /ner_evaluation/preprocessing.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import json 3 | import logging 4 | import os 5 | 6 | from argparse import Namespace 7 | from typing import (Dict, List, Optional) 8 | import torch 9 | 10 | from tag_encoder import NERTagsEncoder, SCHEMES 11 | from tokenization import ( 12 | Token, 13 | TokenizerWithAlignment, 14 | reconstruct_text_from_tokens, 15 | ) 16 | 17 | from pytorch_transformers.tokenization_bert import BertTokenizer 18 | 19 | LOGGER = logging.getLogger(__name__) 20 | 21 | 22 | NETag = collections.namedtuple("NETag", ['doc_id', 23 | 'entity_id', 24 | 'text', 25 | 'type', 26 | 'start_position', 27 | 'end_position']) 28 | 29 | 30 | class Example(object): 31 | """ 32 | A single training/test example for NER training. 33 | """ 34 | 35 | def __init__(self, 36 | doc_id: int, 37 | orig_text: str, 38 | doc_tokens: List[Token], 39 | tags: List[NETag], 40 | labels: List[str], 41 | ): 42 | self.doc_id = doc_id 43 | self.orig_text = orig_text 44 | self.doc_tokens = doc_tokens 45 | self.tags = tags 46 | self.labels = labels 47 | 48 | for token in doc_tokens: 49 | token._example = self 50 | 51 | def __str__(self): 52 | return repr(self) 53 | 54 | def __repr__(self): 55 | s = ('doc_id: {}\n' 56 | 'orig_text:{}\n' 57 | 'doc_tokens: {}\n' 58 | 'labels: {}\n' 59 | 'tags: {}\n').format(self.doc_id, self.orig_text, self.doc_tokens, 60 | self.labels, self.tags) 61 | return s 62 | 63 | 64 | def read_examples(input_file: str, 65 | is_training: bool, 66 | classes: List[str] = None, 67 | scheme: str = 'BIO', 68 | ) -> List[Example]: 69 | """Read a JSON file into a list of Examples. 70 | 71 | The JSON file should contain a list of dictionaries, one dict per input 72 | document. Each dict should have the following entries: 73 | 74 | doc_id: an example unique identifier (for debugging). 75 | doc_text: the document text. 76 | entities: a list of dicts of named entities contained in `doc_text`. 77 | Each entity dict should have the following entries: 78 | 79 | entity_id: an identifier for the entity (debugging purposes). 80 | label: the named entity gold label. 81 | start_offset: start char offset of the entity in `doc_text`. 82 | end_offset: **exclusive** end char offset of the entity in 83 | `doc_text`. 84 | text: the named entity text. It should be equal to the slice of the 85 | document text using `start_offset` and `end_offset`, e.g., 86 | `doc_text[start_offset:end_offset]`. 
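    A minimal input example with made-up values (hypothetical document, not
    taken from the HAREM data):

        [{"doc_id": 0,
          "doc_text": "Maria mora em Lisboa.",
          "entities": [{"entity_id": 0, "label": "PESSOA",
                        "start_offset": 0, "end_offset": 5, "text": "Maria"},
                       {"entity_id": 1, "label": "LOCAL",
                        "start_offset": 14, "end_offset": 20, "text": "Lisboa"}]}]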
87 | """ 88 | scheme = scheme.upper() 89 | if scheme not in SCHEMES: 90 | raise ValueError("Invalid tagging scheme `{}`.".format(scheme)) 91 | 92 | with open(input_file, "r", encoding='utf-8') as reader: 93 | input_data = json.load(reader) 94 | 95 | examples = [] 96 | tokenizer_with_alignment = TokenizerWithAlignment() 97 | 98 | for document in input_data: 99 | doc_text = document["doc_text"] 100 | doc_id = document["doc_id"] 101 | 102 | # Perform whitespace and punctuation tokenization keeping track of char 103 | # alignment (char_to_word_offset) 104 | doc_tokens, char_to_word_offset = tokenizer_with_alignment(doc_text) 105 | labels = ["O"] * len(doc_tokens) 106 | tags = [] 107 | 108 | def set_label(index, tag): 109 | if labels[index] != 'O': 110 | LOGGER.warning('Overwriting tag %s at position %s to %s', 111 | labels[index], index, tag) 112 | labels[index] = tag 113 | 114 | if is_training: 115 | for entity in document["entities"]: 116 | entity_id = entity["entity_id"] 117 | entity_text = entity["text"] 118 | entity_type = entity["label"] 119 | start_token = None 120 | end_token = None 121 | 122 | entity_start_offset = entity["start_offset"] 123 | entity_end_offset = entity["end_offset"] 124 | start_token = char_to_word_offset[entity_start_offset] 125 | # end_offset is NOT inclusive to the text, e.g., 126 | # entity_text == doc_text[start_offset:end_offset] 127 | end_token = char_to_word_offset[entity_end_offset - 1] 128 | 129 | assert start_token <= end_token, \ 130 | "End token cannot come before start token." 131 | reconstructed_text = reconstruct_text_from_tokens( 132 | doc_tokens[start_token:(end_token + 1)]) 133 | assert entity_text.strip() == reconstructed_text, \ 134 | "Entity text and reconstructed text are not equal: %s != %s" % ( 135 | entity_text, reconstructed_text) 136 | 137 | if scheme == 'BILUO': 138 | # BILUO scheme 139 | if start_token == end_token: 140 | tag = 'U-' + entity_type 141 | set_label(start_token, tag) 142 | else: 143 | for token_index in range(start_token, end_token + 1): 144 | if token_index == start_token: 145 | tag = 'B-' + entity_type 146 | elif token_index == end_token: 147 | tag = 'L-' + entity_type 148 | else: 149 | tag = 'I-' + entity_type 150 | 151 | set_label(token_index, tag) 152 | 153 | elif scheme == 'BIO': 154 | # BIO scheme 155 | for token_index in range(start_token, end_token + 1): 156 | if token_index == start_token: 157 | tag = 'B-' + entity_type 158 | else: 159 | tag = 'I-' + entity_type 160 | set_label(token_index, tag) 161 | 162 | entity = NETag( 163 | doc_id, 164 | entity_id, 165 | entity_text, 166 | entity_type, 167 | start_token, 168 | end_token, 169 | ) 170 | tags.append(entity) 171 | 172 | example = Example( 173 | doc_id=doc_id, 174 | orig_text=doc_text, 175 | doc_tokens=doc_tokens, 176 | tags=tags, 177 | labels=labels) 178 | examples.append(example) 179 | 180 | return examples 181 | 182 | 183 | class InputSpan(object): 184 | """A single set of features of data.""" 185 | 186 | def __init__(self, 187 | unique_id: int, 188 | example_index: int, 189 | doc_span_index: int, 190 | tokens: List[Token], 191 | token_to_orig_map: Dict[int, int], 192 | token_is_max_context: List[bool], 193 | input_ids: List[int], 194 | input_mask: List[int], 195 | segment_ids: List[int], 196 | prediction_mask: List[bool], 197 | labels: Optional[List[str]] = (), 198 | label_ids: Optional[List[int]] = (), 199 | ): 200 | self.unique_id = unique_id 201 | self.example_index = example_index 202 | self.doc_span_index = doc_span_index 203 | self.tokens = tokens 204 | 
self.token_to_orig_map = token_to_orig_map 205 | self.token_is_max_context = token_is_max_context 206 | self.input_ids = input_ids 207 | self.input_mask = input_mask 208 | self.segment_ids = segment_ids 209 | self.labels = labels or [] 210 | self.label_ids = label_ids or [] 211 | self.prediction_mask = prediction_mask 212 | 213 | def __repr__(self): 214 | return "InputSpan(example_index={}, doc_span_index={})".format( 215 | self.example_index, self.doc_span_index, 216 | ) 217 | 218 | def __str__(self): 219 | return self.__repr__() 220 | 221 | def __len__(self): 222 | return len(self.tokens) 223 | 224 | 225 | def _check_is_max_context(doc_spans: List[InputSpan], 226 | cur_span_index: int, 227 | position: int, 228 | ) -> bool: 229 | """Check if this is the 'max context' doc span for the token.""" 230 | 231 | # Because of the sliding window approach taken to scoring documents, a 232 | # single token can appear in multiple documents. E.g. 233 | # Doc: the man went to the store and bought a gallon of milk 234 | # Span A: the man went to the 235 | # Span B: to the store and bought 236 | # Span C: and bought a gallon of 237 | # ... 238 | # 239 | # Now the word 'bought' will have two scores from spans B and C. We only 240 | # want to consider the score with "maximum context", which we define as 241 | # the *minimum* of its left and right context (the *sum* of left and 242 | # right context will always be the same, of course). 243 | # 244 | # In the example the maximum context for 'bought' would be span C since 245 | # it has 1 left context and 3 right context, while span B has 4 left context 246 | # and 0 right context. 247 | best_score = None 248 | best_span_index = None 249 | for (span_index, doc_span) in enumerate(doc_spans): 250 | end = doc_span.start + doc_span.length - 1 251 | if position < doc_span.start: 252 | continue 253 | if position > end: 254 | continue 255 | num_left_context = position - doc_span.start 256 | num_right_context = end - position 257 | score = min(num_left_context, num_right_context) + \ 258 | 0.01 * doc_span.length 259 | if best_score is None or score > best_score: 260 | best_score = score 261 | best_span_index = span_index 262 | 263 | return cur_span_index == best_span_index 264 | 265 | 266 | def convert_examples_to_spans(examples: List[Example], 267 | ner_tags_converter: NERTagsEncoder, 268 | tokenizer: BertTokenizer, 269 | max_seq_length: int, 270 | doc_stride: int, 271 | is_training: bool, 272 | unique_id_start: Optional[int] = None, 273 | verbose: bool = True, 274 | ) -> List[InputSpan]: 275 | """Converts examples to BERT input-ready data tensor-like structures, 276 | splitting large documents into spans of `max_seq_length` using a stride of 277 | `doc_stride` tokens.""" 278 | 279 | unique_id = unique_id_start or 1000000000 280 | 281 | features = [] 282 | for (example_index, example) in enumerate(examples): 283 | 284 | doc_tokens = example.doc_tokens 285 | doc_labels = example.labels 286 | 287 | tok_to_orig_index = [] 288 | orig_to_tok_index = [] 289 | all_doc_tokens = [] 290 | all_doc_labels = [] 291 | all_prediction_mask = [] 292 | 293 | for i, token in enumerate(doc_tokens): 294 | orig_to_tok_index.append(len(all_doc_tokens)) 295 | sub_tokens = tokenizer.tokenize(token.text) 296 | for j, sub_token in enumerate(sub_tokens): 297 | # Create mapping from subtokens to original token 298 | tok_to_orig_index.append(i) 299 | all_doc_tokens.append(sub_token) 300 | # Mask all subtokens (j > 0) 301 | all_prediction_mask.append(j == 0) 302 | 303 | if j == 0: 304 | label = doc_labels[i] 305 |
all_doc_labels.append(label) 306 | else: 307 | all_doc_labels.append('X') 308 | 309 | assert len(all_doc_tokens) == len(all_prediction_mask) 310 | if is_training: 311 | assert len(all_doc_tokens) == len(all_doc_labels) 312 | 313 | # The -1 accounts for [CLS]. For NER we have only one sentence, so no 314 | # [SEP] tokens. 315 | max_tokens_for_doc = max_seq_length - 1 316 | 317 | # We can have documents that are longer than the maximum sequence length. 318 | # To deal with this we do a sliding window approach, where we take chunks 319 | # of the up to our max length with a stride of `doc_stride`. 320 | _DocSpan = collections.namedtuple( # pylint: disable=invalid-name 321 | "DocSpan", ["start", "length"]) 322 | doc_spans = [] 323 | start_offset = 0 324 | while start_offset < len(all_doc_tokens): 325 | length = len(all_doc_tokens) - start_offset 326 | if length > max_tokens_for_doc: 327 | length = max_tokens_for_doc 328 | doc_spans.append(_DocSpan(start=start_offset, length=length)) 329 | if start_offset + length == len(all_doc_tokens): 330 | break 331 | start_offset += min(length, doc_stride) 332 | 333 | for (doc_span_index, doc_span) in enumerate(doc_spans): 334 | tokens = [] 335 | token_to_orig_map = {} 336 | token_is_max_context = [] 337 | segment_ids = [] 338 | labels = None 339 | label_ids = None 340 | prediction_mask = [] 341 | # Include [CLS] token 342 | tokens.append("[CLS]") 343 | segment_ids.append(0) 344 | prediction_mask.append(False) 345 | 346 | # Ignore [CLS] label 347 | if is_training: 348 | labels = ['X'] 349 | 350 | for i in range(doc_span.length): 351 | # Each doc span will have a dict that indicates if it is the 352 | # *max_context span* for the tokens inside it 353 | split_token_index = doc_span.start + i 354 | token_to_orig_map[len( 355 | tokens)] = tok_to_orig_index[split_token_index] 356 | 357 | is_max_context = _check_is_max_context(doc_spans, 358 | doc_span_index, 359 | split_token_index) 360 | token_is_max_context.append(is_max_context) 361 | tokens.append(all_doc_tokens[split_token_index]) 362 | segment_ids.append(0) 363 | if is_training: 364 | labels.append(all_doc_labels[split_token_index]) 365 | prediction_mask.append( 366 | all_prediction_mask[split_token_index]) 367 | 368 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 369 | if is_training: 370 | label_ids = ner_tags_converter.convert_tags_to_ids(labels) 371 | 372 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 373 | # tokens are attended to. 374 | input_mask = [1] * len(input_ids) 375 | 376 | # Zero-pad up to the sequence length. 
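            # Padded positions are excluded from the prediction mask and, when
            # training, receive the tag encoder's ignore_index as label id.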
377 | while len(input_ids) < max_seq_length: 378 | input_ids.append(0) 379 | input_mask.append(0) 380 | segment_ids.append(0) 381 | if is_training: 382 | label_ids.append(ner_tags_converter.ignore_index) 383 | prediction_mask.append(False) 384 | 385 | # If not training, use placeholder labels 386 | if not is_training: 387 | labels = ['O'] * len(input_ids) 388 | label_ids = [ner_tags_converter.ignore_index] * len(input_ids) 389 | 390 | assert len(input_ids) == max_seq_length 391 | assert len(input_mask) == max_seq_length 392 | assert len(segment_ids) == max_seq_length 393 | assert len(prediction_mask) == max_seq_length 394 | if is_training: 395 | assert len(label_ids) == max_seq_length 396 | 397 | if verbose and example_index < 20: 398 | LOGGER.info("*** Example ***") 399 | LOGGER.info("unique_id: %s" % (unique_id)) 400 | LOGGER.info("example_index: %s" % (example_index)) 401 | LOGGER.info("doc_span_index: %s" % (doc_span_index)) 402 | LOGGER.info("tokens: %s" % " ".join(tokens)) 403 | LOGGER.info("token_to_orig_map: %s" % " ".join([ 404 | "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) 405 | LOGGER.info("token_is_max_context: %s", token_is_max_context) 406 | LOGGER.info("input_ids: %s" % 407 | " ".join([str(x) for x in input_ids])) 408 | LOGGER.info( 409 | "input_mask: %s" % " ".join([str(x) for x in input_mask])) 410 | LOGGER.info( 411 | "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 412 | LOGGER.info("prediction_mask: %s" % " ".join([ 413 | str(x) for x in prediction_mask 414 | ])) 415 | if is_training: 416 | LOGGER.info( 417 | "label_ids: %s" % " ".join([str(x) for x in label_ids])) 418 | 419 | LOGGER.info("tags:") 420 | inside_label = False 421 | for tok, lab, lab_id in zip(tokens, labels, label_ids): 422 | if lab[0] == "O": 423 | if inside_label and tok.startswith("##"): 424 | LOGGER.info(f'{tok}\tX') 425 | else: 426 | inside_label = False 427 | else: 428 | if lab[0] in ("B", "I", "L", "U") or inside_label: 429 | if lab[0] in ("B", "U"): 430 | # new entity 431 | LOGGER.info('') 432 | inside_label = True 433 | LOGGER.info(f'{tok}\t{lab}\t{lab_id}') 434 | 435 | features.append( 436 | InputSpan( 437 | unique_id=unique_id, 438 | example_index=example_index, 439 | doc_span_index=doc_span_index, 440 | tokens=tokens, 441 | token_to_orig_map=token_to_orig_map, 442 | token_is_max_context=token_is_max_context, 443 | input_ids=input_ids, 444 | input_mask=input_mask, 445 | segment_ids=segment_ids, 446 | labels=labels, 447 | label_ids=label_ids, 448 | prediction_mask=prediction_mask, 449 | )) 450 | unique_id += 1 451 | 452 | return features 453 | 454 | 455 | def get_features_from_examples(examples: List[Example], 456 | ner_tags_converter: NERTagsEncoder, 457 | tokenizer: BertTokenizer, 458 | args: Namespace, # args from ArgumentParser 459 | mode: str, 460 | unique_id_start: int = None, 461 | verbose: bool = True, 462 | ) -> List[InputSpan]: 463 | """Convert examples to input spans. Read from cache if possible.""" 464 | 465 | assert mode in ('train', 'valid', 'eval', 'inference'), "Invalid mode." 
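    # The cache file name combines the input file name with the BERT model
    # name, the maximum sequence length and the doc stride.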
466 | examples_file = getattr(args, mode + '_file') or mode 467 | 468 | cached_features_file = examples_file + '_{0}_{1}_{2}'.format( 469 | list(filter(None, args.bert_model.split('/'))).pop(), 470 | str(args.max_seq_length), 471 | str(args.doc_stride)) 472 | 473 | spans = None 474 | loaded_from_cache = False 475 | 476 | if os.path.isfile(cached_features_file) and not args.override_cache: 477 | # Read from cache 478 | LOGGER.info('Reading cached features from {}' 479 | .format(cached_features_file)) 480 | spans = torch.load(cached_features_file) 481 | loaded_from_cache = True 482 | else: # noqa: E772 483 | LOGGER.info('Converting examples to features.') 484 | is_training = True if mode in ('train', 'valid', 'eval') else False 485 | spans = convert_examples_to_spans( 486 | examples=examples, 487 | ner_tags_converter=ner_tags_converter, 488 | tokenizer=tokenizer, 489 | max_seq_length=args.max_seq_length, 490 | doc_stride=args.doc_stride, 491 | is_training=is_training, 492 | unique_id_start=unique_id_start, 493 | verbose=verbose) 494 | 495 | if args.local_rank == -1 or torch.distributed.get_rank() == 0: 496 | if not loaded_from_cache or args.override_cache: 497 | LOGGER.info( 498 | " Saving %s features into cached file %s", 499 | mode, cached_features_file) 500 | torch.save(spans, cached_features_file) 501 | 502 | return spans 503 | -------------------------------------------------------------------------------- /ner_evaluation/model.py: -------------------------------------------------------------------------------- 1 | """Implementations of BERT, BERT-CRF, BERT-LSTM and BERT-LSTM-CRF models.""" 2 | 3 | import logging 4 | from argparse import Namespace 5 | from typing import Any, Dict, Optional, Tuple, Type 6 | 7 | import torch 8 | from pytorch_transformers.modeling_bert import (BertConfig, 9 | BertForTokenClassification) 10 | from torchcrf import CRF 11 | 12 | LOGGER = logging.getLogger(__name__) 13 | 14 | 15 | def sum_last_4_layers(sequence_outputs: Tuple[torch.Tensor]) -> torch.Tensor: 16 | """Sums the last 4 hidden representations of a sequence output of BERT. 17 | Args: 18 | ----- 19 | sequence_output: Tuple of tensors of shape (batch, seq_length, hidden_size). 20 | For BERT base, the Tuple has length 13. 21 | 22 | Returns: 23 | -------- 24 | summed_layers: Tensor of shape (batch, seq_length, hidden_size) 25 | """ 26 | last_layers = sequence_outputs[-4:] 27 | return torch.stack(last_layers, dim=0).sum(dim=0) 28 | 29 | 30 | def get_last_layer(sequence_outputs: Tuple[torch.Tensor]) -> torch.Tensor: 31 | """Returns the last tensor of a list of tensors.""" 32 | return sequence_outputs[-1] 33 | 34 | 35 | def concat_last_4_layers(sequence_outputs: Tuple[torch.Tensor]) -> torch.Tensor: 36 | """Concatenate the last 4 tensors of a tuple of tensors.""" 37 | last_layers = sequence_outputs[-4:] 38 | return torch.cat(last_layers, dim=-1) 39 | 40 | 41 | POOLERS = { 42 | 'sum': sum_last_4_layers, 43 | 'last': get_last_layer, 44 | 'concat': concat_last_4_layers, 45 | } 46 | 47 | 48 | def get_model_and_kwargs_for_args( 49 | args: Namespace, 50 | training: bool = True, 51 | ) -> Tuple[Type[torch.nn.Module], Dict[str, Any]]: 52 | """Given the parsed arguments, returns the correct model class and model 53 | args. 54 | 55 | Args: 56 | args: a Namespace object (from parsed argv command). 57 | training: if True, sets a high initialization value for classifier bias 58 | parameter after model initialization. 
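    In summary (mirroring the selection logic below): with `freeze_bert`,
    `no_crf` selects BertLSTM and otherwise BertLSTMCRF; without
    `freeze_bert`, `no_crf` selects BertForNERClassification and otherwise
    BertCRF.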
59 | """ 60 | bias_O = 6 if training else None 61 | model_args = { 62 | 'pooler': args.pooler, 63 | 'bias_O': bias_O, 64 | } 65 | 66 | if args.freeze_bert: 67 | # Possible models: BERT-LSTM or BERT-LSTM-CRF 68 | model_args['lstm_layers'] = args.lstm_layers 69 | model_args['lstm_hidden_size'] = args.lstm_hidden_size 70 | if args.no_crf: 71 | model_class = BertLSTM 72 | else: 73 | model_class = BertLSTMCRF 74 | 75 | else: 76 | # Possible models: BertForNERClassification or BertCRF 77 | if args.no_crf: 78 | model_class = BertForNERClassification 79 | else: 80 | model_class = BertCRF 81 | 82 | return model_class, model_args 83 | 84 | 85 | class BertForNERClassification(BertForTokenClassification): 86 | """BERT model for NER task. 87 | 88 | The number of NER tags should be defined in the `BertConfig.num_labels` 89 | attribute. 90 | 91 | Args: 92 | config: BertConfig instance to build BERT model. 93 | weight_O: loss weight value for "O" tags in CrossEntropyLoss. 94 | bias_O: optional value to initiate the classifier's bias value for "O" 95 | tag. 96 | pooler: which pooler configuration to use to pass BERT features to the 97 | classifier. 98 | """ 99 | 100 | def __init__(self, 101 | config: BertConfig, 102 | weight_O: float = 0.01, 103 | bias_O: Optional[float] = None, 104 | pooler='last'): 105 | super().__init__(config) 106 | del self.classifier # Deletes classifier of BertForTokenClassification 107 | 108 | num_labels = config.num_labels 109 | 110 | if pooler not in POOLERS: 111 | message = ("Invalid pooler: %s. Pooler must be one of %s." 112 | % (pooler, list(POOLERS.keys()))) 113 | raise ValueError(message) 114 | 115 | self._build_classifier(config, pooler) 116 | if bias_O is not None: 117 | self.set_bias_tag_O(bias_O) 118 | 119 | assert isinstance(weight_O, float) and 0 < weight_O < 1 120 | weights = [1.] * num_labels 121 | weights[0] = weight_O 122 | weights = torch.tensor(weights) 123 | self.loss_fct = torch.nn.CrossEntropyLoss(weight=weights) 124 | 125 | self.frozen_bert = False 126 | self.pooler = POOLERS.get(pooler) 127 | 128 | def _build_classifier(self, config, pooler): 129 | """Build tag classifier.""" 130 | if pooler in ('last', 'sum'): 131 | self.classifier = torch.nn.Linear(config.hidden_size, 132 | config.num_labels) 133 | else: 134 | assert pooler == 'concat' 135 | self.classifier = torch.nn.Linear(4 * config.hidden_size, 136 | config.num_labels) 137 | 138 | def set_bias_tag_O(self, bias_O: Optional[float] = None): 139 | """Increase tag "O" bias to produce high probabilities early on and 140 | reduce instability in early training.""" 141 | if bias_O is not None: 142 | LOGGER.info('Setting bias of OUT token to %s.', bias_O) 143 | self.classifier.bias.data[0] = bias_O 144 | 145 | def freeze_bert(self): 146 | """Freeze all BERT parameters. Only the classifier weights will be 147 | updated.""" 148 | for p in self.bert.parameters(): 149 | p.requires_grad = False 150 | self.frozen_bert = True 151 | 152 | def bert_encode(self, input_ids, token_type_ids=None, attention_mask=None): 153 | """Gets encoded sequence from BERT model and pools the layers accordingly. 154 | BertModel outputs a tuple whose elements are: 155 | 1- Last encoder layer output. Tensor of shape (B, S, H) 156 | 2- Pooled output of the [CLS] token. Tensor of shape (B, H) 157 | 3- Encoder inputs (embeddings) + all Encoder layers' outputs. This 158 | requires the flag `output_hidden_states=True` on BertConfig. Returns 159 | List of tensors of shapes (B, S, H). 
160 | 4- Attention results, if `output_attentions=True` in BertConfig. 161 | 162 | This method uses just the 3rd output and pools the layers. 163 | """ 164 | _, _, all_layers_sequence_outputs, *_ = self.bert( 165 | input_ids, 166 | token_type_ids=token_type_ids, 167 | attention_mask=attention_mask) 168 | 169 | # Use the defined pooler to pool the hidden representation layers 170 | sequence_output = self.pooler(all_layers_sequence_outputs) 171 | 172 | return sequence_output 173 | 174 | def predict_logits(self, input_ids, token_type_ids=None, 175 | attention_mask=None): 176 | """Returns the logits prediction from BERT + classifier.""" 177 | if self.frozen_bert: 178 | sequence_output = input_ids 179 | else: 180 | sequence_output = self.bert_encode( 181 | input_ids, token_type_ids, attention_mask) 182 | 183 | sequence_output = self.dropout(sequence_output) 184 | logits = self.classifier(sequence_output) # (batch, seq, tags) 185 | 186 | return logits 187 | 188 | def forward(self, 189 | input_ids, 190 | token_type_ids=None, 191 | attention_mask=None, 192 | labels=None, 193 | prediction_mask=None, 194 | ) -> Dict[str, torch.Tensor]: 195 | """Performs the forward pass of the network. 196 | 197 | If `labels` are not None, it will calculate and return the loss. 198 | Otherwise, it will return the logits and predicted tags tensors. 199 | 200 | Args: 201 | input_ids: tensor of input token ids. 202 | token_type_ids: tensor of input sentence type id (0 or 1). Should be 203 | all zeros for NER. Can be safely set to `None`. 204 | attention_mask: mask tensor that should have value 0 for [PAD] 205 | tokens and 1 for other tokens. 206 | labels: tensor of gold NER tag label ids. Values should be ints in 207 | the range [0, config.num_labels - 1]. 208 | prediction_mask: mask tensor should have value 0 for tokens that do 209 | not have an associated prediction, such as [CLS] and WordPìece 210 | subtoken continuations (that start with ##). 211 | 212 | Returns a dict with calculated tensors: 213 | - "logits" 214 | - "y_pred" 215 | - "loss" (if `labels` is not `None`) 216 | """ 217 | outputs = {} 218 | 219 | logits = self.predict_logits(input_ids=input_ids, 220 | token_type_ids=token_type_ids, 221 | attention_mask=attention_mask) 222 | _, y_pred = torch.max(logits, dim=-1) 223 | y_pred = y_pred.cpu().numpy() 224 | outputs['logits'] = logits 225 | outputs['y_pred'] = y_pred 226 | 227 | if labels is not None: 228 | # Only keep active parts of the loss 229 | mask = prediction_mask 230 | if mask is not None: 231 | mask = mask.view(-1) 232 | active_logits = logits.view(-1, self.num_labels)[mask] 233 | active_labels = labels.view(-1)[mask] 234 | loss = self.loss_fct(active_logits, active_labels) 235 | else: 236 | loss = self.loss_fct( 237 | logits.view(-1, self.num_labels), labels.view(-1)) 238 | outputs['loss'] = loss 239 | 240 | return outputs 241 | 242 | 243 | class BertCRF(BertForNERClassification): 244 | """BERT-CRF model. 245 | 246 | Args: 247 | config: BertConfig instance to build BERT model. 248 | kwargs: arguments to be passed to superclass. 249 | """ 250 | 251 | def __init__(self, config: BertConfig, **kwargs: Any): 252 | super().__init__(config, **kwargs) 253 | del self.loss_fct # Delete unused CrossEntropyLoss 254 | self.crf = CRF(num_tags=config.num_labels, batch_first=True) 255 | 256 | def forward(self, 257 | input_ids, 258 | token_type_ids=None, 259 | attention_mask=None, 260 | labels=None, 261 | prediction_mask=None, 262 | ) -> Dict[str, torch.Tensor]: 263 | """Performs the forward pass of the network. 
264 | 265 | If `labels` is not `None`, it will calculate and return the the loss, 266 | that is the negative log-likelihood of the batch. 267 | Otherwise, it will calculate the most probable sequence outputs using 268 | Viterbi decoding and return a list of sequences (List[List[int]]) of 269 | variable lengths. 270 | 271 | Args: 272 | input_ids: tensor of input token ids. 273 | token_type_ids: tensor of input sentence type id (0 or 1). Should be 274 | all zeros for NER. Can be safely set to `None`. 275 | attention_mask: mask tensor that should have value 0 for [PAD] 276 | tokens and 1 for other tokens. 277 | labels: tensor of gold NER tag label ids. Values should be ints in 278 | the range [0, config.num_labels - 1]. 279 | prediction_mask: mask tensor should have value 0 for tokens that do 280 | not have an associated prediction, such as [CLS] and WordPìece 281 | subtoken continuations (that start with ##). 282 | 283 | Returns a dict with calculated tensors: 284 | - "logits" 285 | - "loss" (if `labels` is not `None`) 286 | - "y_pred" (if `labels` is `None`) 287 | """ 288 | outputs = {} 289 | 290 | logits = self.predict_logits(input_ids=input_ids, 291 | token_type_ids=token_type_ids, 292 | attention_mask=attention_mask) 293 | outputs['logits'] = logits 294 | 295 | # mask: mask padded sequence and also subtokens, because they must 296 | # not be used in CRF. 297 | mask = prediction_mask 298 | batch_size = logits.shape[0] 299 | 300 | if labels is not None: 301 | # Negative of the log likelihood. 302 | # Loop through the batch here because of 2 reasons: 303 | # 1- the CRF package assumes the mask tensor cannot have interleaved 304 | # zeros and ones. In other words, the mask should start with True 305 | # values, transition to False at some moment and never transition 306 | # back to True. That can only happen for simple padded sequences. 307 | # 2- The first column of mask tensor should be all True, and we 308 | # cannot guarantee that because we have to mask all non-first 309 | # subtokens of the WordPiece tokenization. 310 | loss = 0 311 | for seq_logits, seq_labels, seq_mask in zip(logits, labels, mask): 312 | # Index logits and labels using prediction mask to pass only the 313 | # first subtoken of each word to CRF. 314 | seq_logits = seq_logits[seq_mask].unsqueeze(0) 315 | seq_labels = seq_labels[seq_mask].unsqueeze(0) 316 | loss -= self.crf(seq_logits, seq_labels, 317 | reduction='token_mean') 318 | 319 | loss /= batch_size 320 | outputs['loss'] = loss 321 | 322 | else: 323 | # Same reasons for iterating 324 | output_tags = [] 325 | for seq_logits, seq_mask in zip(logits, mask): 326 | seq_logits = seq_logits[seq_mask].unsqueeze(0) 327 | tags = self.crf.decode(seq_logits) 328 | # Unpack "batch" results 329 | output_tags.append(tags[0]) 330 | 331 | outputs['y_pred'] = output_tags 332 | 333 | return outputs 334 | 335 | 336 | class BertLSTM(BertForNERClassification): 337 | """BERT model with an LSTM model as classifier. This model is meant to be 338 | used with frozen BERT schemes (feature-based). 339 | 340 | Args: 341 | config: BertConfig instance to build BERT model. 342 | lstm_hidden_size: hidden size of LSTM layers. Defaults to 100. 343 | lstm_layers: number of LSTM layers. Defaults to 1. 344 | kwargs: arguments to be passed to superclass. 
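    Note: when BERT is frozen via `freeze_bert()` (see `frozen_bert` in the
    superclass), the `input_ids` argument of `forward` is expected to already
    contain the precomputed BERT feature tensors rather than token ids.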
345 | """ 346 | 347 | def __init__(self, 348 | config: BertConfig, 349 | lstm_hidden_size: int = 100, 350 | lstm_layers: int = 1, 351 | **kwargs: Any): 352 | 353 | lstm_dropout = 0.2 if lstm_layers > 1 else 0 354 | self.lstm_hidden_size = lstm_hidden_size 355 | self.lstm_layers = lstm_layers 356 | pooler = kwargs.get('pooler', 'last') 357 | 358 | super().__init__(config, **kwargs) 359 | 360 | if pooler in ('last', 'sum'): 361 | lstm_input_size = config.hidden_size 362 | else: 363 | assert pooler == 'concat' 364 | lstm_input_size = 4 * config.hidden_size 365 | 366 | self.lstm = torch.nn.LSTM(input_size=lstm_input_size, 367 | hidden_size=lstm_hidden_size, 368 | num_layers=lstm_layers, 369 | dropout=lstm_dropout, 370 | batch_first=True, 371 | bidirectional=True) 372 | 373 | def _build_classifier(self, config, pooler): 374 | """Build label classifier.""" 375 | self.classifier = torch.nn.Linear(2 * self.lstm_hidden_size, 376 | config.num_labels) 377 | 378 | def _pack_bert_encoded_sequence(self, encoded_sequence, attention_mask): 379 | """Returns a PackedSequence to be used by LSTM. 380 | 381 | The encoded_sequence is the output of BERT, of shape (B, S, H). 382 | This method sorts the tensor by sequence length using the 383 | attention_mask along the batch dimension. Then it packs the sorted 384 | tensor. 385 | 386 | Args: 387 | ----- 388 | encoded_sequence (tensor): output of BERT. Shape: (B, S, H) 389 | attention_mask (tensor): Shape: (B, S) 390 | 391 | Returns: 392 | -------- 393 | sorted_encoded_sequence (tensor): sorted `encoded_sequence`. 394 | sorted_ixs (tensor): tensor of indices returned by `torch.sort` when 395 | performing the sort operation. These indices can be used to unsort 396 | the output of the LSTM. 397 | """ 398 | seq_lengths = attention_mask.sum(dim=1) # Shape: (B,) 399 | sorted_lengths, sort_ixs = torch.sort(seq_lengths, descending=True) 400 | 401 | sorted_encoded_sequence = encoded_sequence[sort_ixs, :, :] 402 | 403 | packed_sequence = torch.nn.utils.rnn.pack_padded_sequence( 404 | sorted_encoded_sequence, 405 | sorted_lengths, 406 | batch_first=True) 407 | 408 | return packed_sequence, sort_ixs 409 | 410 | def _unpack_lstm_output(self, packed_sequence, sort_ixs): 411 | """Unpacks and unsorts a sorted PackedSequence that is output by LSTM. 412 | 413 | Args: 414 | packed_sequence (PackedSequence): output of LSTM. Shape: (B, S, Hl) 415 | sort_ixs (tensor): the indexes of be used for unsorting. Shape: (B,) 416 | 417 | Returns: 418 | The unsorted sequence. 
419 | """ 420 | B = len(sort_ixs) 421 | 422 | # Unpack 423 | unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_sequence, 424 | batch_first=True) 425 | 426 | assert unpacked.shape <= (B, 512, 2 * self.lstm.hidden_size) 427 | 428 | # Prepare indices for unsort 429 | sort_ixs = sort_ixs.unsqueeze(1).unsqueeze(1) # (B, 1, 1) 430 | # (B, S, Hl) 431 | sort_ixs = sort_ixs.expand(-1, unpacked.shape[1], unpacked.shape[2]) 432 | # Unsort 433 | unsorted_sequence = (torch.zeros_like(unpacked) 434 | .scatter_(0, sort_ixs, unpacked)) 435 | 436 | return unsorted_sequence 437 | 438 | def forward_lstm(self, bert_encoded_sequence, attention_mask): 439 | packed_sequence, sorted_ixs = self._pack_bert_encoded_sequence( 440 | bert_encoded_sequence, attention_mask) 441 | 442 | packed_lstm_out, _ = self.lstm(packed_sequence) 443 | lstm_out = self._unpack_lstm_output(packed_lstm_out, sorted_ixs) 444 | 445 | return lstm_out 446 | 447 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, 448 | labels=None, prediction_mask=None): 449 | """Performs the forward pass of the network. 450 | 451 | Computes the logits, predicted tags and if `labels` is not None, it will 452 | it will calculate and return the the loss, that is, the negative 453 | log-likelihood of the batch. 454 | 455 | Args: 456 | input_ids: tensor of input token ids. 457 | token_type_ids: tensor of input sentence type id (0 or 1). Should be 458 | all zeros for NER. Can be safely set to `None`. 459 | attention_mask: mask tensor that should have value 0 for [PAD] 460 | tokens and 1 for other tokens. 461 | labels: tensor of gold NER tag label ids. Values should be ints in 462 | the range [0, config.num_labels - 1]. 463 | prediction_mask: mask tensor should have value 0 for tokens that do 464 | not have an associated prediction, such as [CLS] and WordPìece 465 | subtoken continuations (that start with ##). 466 | 467 | Returns: 468 | A dict with calculated tensors: 469 | - "logits" 470 | - "y_pred" 471 | - "loss" (if `labels` is not `None`) 472 | """ 473 | outputs = {} 474 | 475 | if self.frozen_bert: 476 | sequence_output = input_ids 477 | else: 478 | sequence_output = self.bert_encode( 479 | input_ids, token_type_ids, attention_mask) 480 | 481 | sequence_output = self.dropout(sequence_output) # (batch, seq, H) 482 | 483 | lstm_out = self.forward_lstm( 484 | sequence_output, attention_mask) # (batch, seq, Hl) 485 | sequence_output = self.dropout(lstm_out) 486 | 487 | logits = self.classifier(sequence_output) 488 | _, y_pred = torch.max(logits, dim=-1) 489 | y_pred = y_pred.cpu().numpy() 490 | outputs['logits'] = logits 491 | outputs['y_pred'] = y_pred 492 | 493 | if labels is not None: 494 | # Only keep active parts of the loss 495 | mask = prediction_mask 496 | if mask is not None: 497 | # Adjust mask and labels to have the same length as logits 498 | mask = mask[:, :logits.size(1)].contiguous() 499 | labels = labels[:, :logits.size(1)].contiguous() 500 | 501 | mask = mask.view(-1) 502 | active_logits = logits.view(-1, self.num_labels)[mask] 503 | active_labels = labels.view(-1)[mask] 504 | loss = self.loss_fct(active_logits, active_labels) 505 | else: 506 | loss = self.loss_fct( 507 | logits.view(-1, self.num_labels), labels.view(-1)) 508 | 509 | outputs['loss'] = loss 510 | 511 | return outputs 512 | 513 | 514 | class BertLSTMCRF(BertLSTM): 515 | """BERT model with an LSTM-CRF as classifier. This model is meant to be 516 | used with frozen BERT schemes (feature-based). 
517 | 518 | Args: 519 | config: BertConfig instance to build BERT model. 520 | kwargs: arguments to be passed to superclass (see BertLSTM). 521 | """ 522 | 523 | def __init__(self, config: BertConfig, **kwargs: Any): 524 | super().__init__(config, **kwargs) 525 | self.crf = CRF(num_tags=config.num_labels, batch_first=True) 526 | 527 | def forward(self, 528 | input_ids, 529 | token_type_ids=None, 530 | attention_mask=None, 531 | labels=None, 532 | prediction_mask=None, 533 | ) -> Dict[str, torch.Tensor]: 534 | """Performs the forward pass of the network. 535 | 536 | If `labels` are not None, it will calculate and return the the loss, 537 | that is the negative log-likelihood of the batch. 538 | Otherwise, it will calculate the most probable sequence outputs using 539 | Viterbi decoding and return a list of sequences (List[List[int]]) of 540 | variable lengths. 541 | 542 | Args: 543 | input_ids: tensor of input token ids. 544 | token_type_ids: tensor of input sentence type id (0 or 1). Should be 545 | all zeros for NER. Can be safely set to `None`. 546 | attention_mask: mask tensor that should have value 0 for [PAD] 547 | tokens and 1 for other tokens. 548 | labels: tensor of gold NER tag label ids. Values should be ints in 549 | the range [0, config.num_labels - 1]. 550 | prediction_mask: mask tensor should have value 0 for tokens that do 551 | not have an associated prediction, such as [CLS] and WordPìece 552 | subtoken continuations (that start with ##). 553 | 554 | Returns: 555 | A dict with calculated tensors: 556 | 557 | - "logits" 558 | - "loss" (if `labels` is not `None`) 559 | - "y_pred" (if `labels` is `None`) 560 | """ 561 | outputs = {} 562 | 563 | if self.frozen_bert: 564 | sequence_output = input_ids 565 | else: 566 | sequence_output = self.bert_encode( 567 | input_ids, token_type_ids, attention_mask) 568 | 569 | sequence_output = self.dropout(sequence_output) # (batch, seq, H) 570 | 571 | lstm_out = self.forward_lstm( 572 | sequence_output, attention_mask) # (batch, seq, Hl) 573 | sequence_output = self.dropout(lstm_out) 574 | logits = self.classifier(sequence_output) 575 | outputs['logits'] = logits 576 | 577 | mask = prediction_mask # (B, S) 578 | # Logits sequence length depends on the inputs: logits.shape <= (B, S) 579 | # We have to make the mask and labels the same size. 580 | mask = mask[:, :logits.size(1)].contiguous() 581 | 582 | if labels is not None: 583 | # Negative of the log likelihood. 584 | # Loop through the batch here because of 2 reasons: 585 | # 1- the CRF package assumes the mask tensor cannot have interleaved 586 | # zeros and ones. In other words, the mask should start with True 587 | # values, transition to False at some moment and never transition 588 | # back to True. That can only happen for simple padded sequences. 589 | # 2- The first column of mask tensor should be all True, and we 590 | # cannot guarantee that because we have to mask all non-first 591 | # subtokens of the WordPiece tokenization. 592 | labels = labels[:, :logits.size(1)].contiguous() 593 | batch_size = input_ids.size(0) 594 | loss = 0 595 | for seq_logits, seq_labels, seq_mask in zip(logits, labels, mask): 596 | # Index logits and labels using prediction mask to pass only the 597 | # first subtoken of each word to CRF. 
598 | seq_logits = seq_logits[seq_mask].unsqueeze(0) 599 | seq_labels = seq_labels[seq_mask].unsqueeze(0) 600 | loss -= self.crf(seq_logits, seq_labels, 601 | reduction='token_mean') 602 | 603 | loss /= batch_size 604 | outputs['loss'] = loss 605 | 606 | else: 607 | # Same reasons for iterating 608 | output_tags = [] 609 | for seq_logits, seq_mask in zip(logits, mask): 610 | seq_logits = seq_logits[seq_mask].unsqueeze(0) 611 | tags = self.crf.decode(seq_logits) 612 | # Unpack "batch" results 613 | output_tags.append(tags[0]) 614 | 615 | outputs['y_pred'] = output_tags 616 | 617 | return outputs 618 | -------------------------------------------------------------------------------- /ner_evaluation/trainer.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """This file defines the `main` function that handles BERT, BERT-CRF, 17 | BERT-LSTM and BERT-LSTM-CRF training and evaluation on NER task. 18 | 19 | The `main` function should be imported and called by another script that passes 20 | functions to 1) load and preprocess input data and 2) define metrics evaluate 21 | the model during training or testing phases. 22 | 23 | For further information, see `main` function docstring and the ArgumentParser 24 | arguments. 25 | 26 | The code was inspired by Huggingface Tranformers' script for training and 27 | evaluating BERT on SQuAD dataset. 
28 | """ 29 | 30 | from __future__ import absolute_import, division, print_function 31 | 32 | import argparse 33 | import logging 34 | import os 35 | import random 36 | import sys 37 | from argparse import Namespace 38 | from typing import Any, Callable, Dict, List, Optional, Tuple 39 | 40 | import numpy as np 41 | import torch 42 | from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule 43 | from pytorch_transformers.tokenization_bert import BertTokenizer 44 | from torch import nn 45 | from torch.nn.utils import clip_grad_norm_ 46 | from torch.utils.data import (DataLoader, Dataset, RandomSampler, 47 | SequentialSampler) 48 | from torch.utils.data.distributed import DistributedSampler 49 | from tqdm.autonotebook import tqdm, trange 50 | 51 | from dataset import get_bert_encoded_dataset 52 | from eval_tools import SequenceMetrics, write_conll_prediction_file 53 | from postprocessing import OutputComposer 54 | from preprocessing import Example, InputSpan 55 | from results_writer import compile_results, write_jsonl_results 56 | from tag_encoder import NERTagsEncoder 57 | from utils import RunningAccumulator, load_model, save_model 58 | 59 | logger = logging.getLogger(__name__) 60 | 61 | 62 | def set_seed(seed: int) -> None: 63 | random.seed(seed) 64 | np.random.seed(seed) 65 | torch.manual_seed(seed) 66 | if torch.cuda.device_count() > 0: 67 | torch.cuda.manual_seed_all(seed) 68 | 69 | 70 | def prepare_dataloaders( 71 | args: Namespace, 72 | train_dataset: Dataset, 73 | valid_dataset: Optional[Dataset] = None, 74 | ) -> Tuple[DataLoader, DataLoader, Optional[DataLoader]]: 75 | """Instantiates the train, train evaluation and validation dataloaders (if 76 | needed).""" 77 | # Instantiate Dataloader 78 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 79 | if args.local_rank == -1: 80 | train_sampler = RandomSampler(train_dataset) 81 | else: 82 | train_sampler = DistributedSampler(train_dataset) 83 | train_dataloader = DataLoader( 84 | train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) 85 | 86 | train_eval_sampler = SequentialSampler(train_dataset) 87 | train_eval_dataloader = DataLoader( 88 | train_dataset, 89 | sampler=train_eval_sampler, 90 | batch_size=args.train_batch_size) 91 | 92 | valid_dataloader = None 93 | if valid_dataset: 94 | valid_sampler = SequentialSampler(valid_dataset) 95 | valid_dataloader = DataLoader( 96 | valid_dataset, 97 | sampler=valid_sampler, 98 | batch_size=args.train_batch_size) 99 | 100 | # Logs 101 | logger.info(" Num examples = %d", len(train_dataset)) 102 | if valid_dataset: 103 | logger.info(" Num valid examples = %d", len(valid_dataset)) 104 | logger.info(" Num Epochs = %d", args.num_train_epochs) 105 | logger.info(" Instantaneous batch size per GPU = %d", 106 | args.per_gpu_train_batch_size) 107 | logger.info( 108 | " Total train batch size (w. parallel, distributed & accumulation) = %d", 109 | (args.train_batch_size * args.gradient_accumulation_steps * 110 | (torch.distributed.get_world_size() 111 | if args.local_rank != -1 else 1))) 112 | logger.info(" Gradient Accumulation steps = %d", 113 | args.gradient_accumulation_steps) 114 | 115 | return train_dataloader, train_eval_dataloader, valid_dataloader 116 | 117 | 118 | def prepare_optimizer_and_scheduler(args: Namespace, 119 | model: nn.Module, 120 | num_batches: int, 121 | ) -> Tuple[AdamW, WarmupLinearSchedule]: 122 | """Configures BERT's AdamW optimizer and WarmupLinearSchedule learning rate 123 | scheduler. 
Divides parameters into two learning rate groups, with higher 124 | learning rate for non-BERT parameters (classifier model).""" 125 | t_total = (num_batches // args.gradient_accumulation_steps * 126 | args.num_train_epochs) 127 | 128 | if args.local_rank != -1: 129 | t_total = t_total // torch.distributed.get_world_size() 130 | 131 | logger.info(" Total optimization steps = %d", t_total) 132 | 133 | # Prepare optimizer 134 | param_optimizer = list( 135 | filter(lambda p: p[1].requires_grad, model.named_parameters())) 136 | 137 | no_decay = ['bias', 'LayerNorm.weight'] 138 | higher_lr = ['classifier', 'crf', 'lstm'] 139 | 140 | def is_classifier_param(param_name: str) -> bool: 141 | return any(hl in param_name for hl in higher_lr) 142 | 143 | def ignore_in_weight_decay(param_name: str) -> bool: 144 | return any(nd in param_name for nd in no_decay) 145 | 146 | optimizer_grouped_parameters = [ 147 | {'params': [p for name, p in param_optimizer 148 | if not ignore_in_weight_decay(name) 149 | and not is_classifier_param(name)], 150 | 'weight_decay': 0.01}, 151 | {'params': [p for name, p in param_optimizer 152 | if not ignore_in_weight_decay(name) 153 | and is_classifier_param(name)], 154 | 'weight_decay': 0.01, 155 | 'lr': args.classifier_lr}, 156 | {'params': [p for name, p in param_optimizer 157 | if ignore_in_weight_decay(name) 158 | and not is_classifier_param(name)], 159 | 'weight_decay': 0.0}, 160 | ] 161 | 162 | # To reproduce BertAdam specific behavior set correct_bias=False 163 | optimizer = AdamW(optimizer_grouped_parameters, 164 | lr=args.learning_rate, 165 | correct_bias=False) 166 | num_warmup_steps = t_total * args.warmup_proportion 167 | scheduler = WarmupLinearSchedule(optimizer, 168 | warmup_steps=num_warmup_steps, 169 | t_total=t_total) 170 | 171 | return optimizer, scheduler 172 | 173 | 174 | def train(args: Namespace, 175 | model: torch.nn.Module, 176 | train_dataset: Dataset, 177 | train_metrics: SequenceMetrics, 178 | train_output_composer: OutputComposer, 179 | valid_dataset: Optional[Dataset] = None, 180 | valid_metrics: Optional[SequenceMetrics] = None, 181 | valid_output_composer: Optional[OutputComposer] = None) -> None: 182 | """Train routine.""" 183 | 184 | logger.info("***** Running training *****") 185 | 186 | train_dl, train_eval_dl, valid_dl = prepare_dataloaders( 187 | args, train_dataset, valid_dataset) 188 | 189 | optimizer, scheduler = prepare_optimizer_and_scheduler( 190 | args, model, num_batches=len(train_dl)) 191 | 192 | # Multi-gpu, distributed and fp16 setup 193 | if args.fp16: 194 | try: 195 | from apex import amp 196 | except ImportError: 197 | msg = ("Please install apex from " 198 | "https://www.github.com/nvidia/apex to use fp16 training.") 199 | raise ImportError(msg) 200 | model, optimizer = amp.initialize( 201 | model, optimizer, opt_level=args.fp16_opt_level) 202 | 203 | # multi-gpu training (should be after apex fp16 initialization) 204 | if args.n_gpu > 1: 205 | model = torch.nn.DataParallel(model) 206 | 207 | # Distributed training (should be after apex fp16 initialization) 208 | if args.local_rank != -1: 209 | model = torch.nn.parallel.DistributedDataParallel( 210 | model, 211 | device_ids=[args.local_rank], 212 | output_device=args.local_rank, 213 | find_unused_parameters=True) 214 | 215 | global_step = 0 216 | train_losses = [] 217 | if valid_dataset: 218 | min_val_loss = float('inf') 219 | 220 | # Training loop 221 | try: 222 | epoch_tqdm = trange(int(args.num_train_epochs), desc="Epoch") 223 | loss_accum = RunningAccumulator() 224 | 
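        # Epoch loop: optimize on the training data, compute train metrics
        # every 5 epochs (and on the last epoch), evaluate on the validation
        # set when one is given, and save the model whenever the validation
        # F1 score reaches a new best.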
for epoch in epoch_tqdm: 225 | model.train() 226 | stats = {} 227 | 228 | train_tqdm = tqdm(train_dl, desc="Iter") 229 | for step, batch in enumerate(train_tqdm): 230 | if args.n_gpu == 1: 231 | # multi-gpu does scattering it-self 232 | batch = tuple(t.to(args.device) for t in batch) 233 | # Unpack batch 234 | input_ids = batch[0] 235 | input_mask = batch[1] 236 | segment_ids = batch[2] 237 | label_ids = batch[3] 238 | prediction_mask = batch[4] 239 | # example_ixs = batch[5] 240 | # doc_span_ixs = batch[6] 241 | 242 | outs = model(input_ids, segment_ids, 243 | input_mask, label_ids, prediction_mask) 244 | loss = outs['loss'] 245 | if args.n_gpu > 1: 246 | loss = loss.mean() # mean() to average on multi-gpu. 247 | if args.gradient_accumulation_steps > 1: 248 | loss = loss / args.gradient_accumulation_steps 249 | 250 | loss_accum.accumulate(loss.item()) 251 | running_mean_loss = loss_accum.mean() 252 | train_tqdm.set_postfix({'loss': running_mean_loss}) 253 | 254 | if args.fp16: 255 | with amp.scale_loss(loss, optimizer) as scaled_loss: 256 | scaled_loss.backward() 257 | clip_grad_norm_(amp.master_params( 258 | optimizer), args.max_grad_norm) 259 | else: 260 | loss.backward() 261 | clip_grad_norm_(model.parameters(), args.max_grad_norm) 262 | 263 | if (step + 1) % args.gradient_accumulation_steps == 0: 264 | 265 | # Perform gradient clipping 266 | for group in optimizer.param_groups: 267 | for p in group['params']: 268 | if p.grad is None: 269 | continue 270 | clip_grad_norm_(p, 1) 271 | 272 | scheduler.step() 273 | optimizer.step() 274 | optimizer.zero_grad() 275 | 276 | global_step += 1 277 | 278 | train_losses.append(loss_accum.mean()) 279 | 280 | stats['loss'] = format_tqdm_metric(train_losses[-1], 281 | float(min(train_losses)), 282 | fmt='{:.3e}') 283 | 284 | # Evaluate train set 285 | if epoch % 5 == 0 or epoch == args.num_train_epochs - 1: 286 | trn_epoch_metrics = evaluate( 287 | args, 288 | model, 289 | tqdm(train_eval_dl, desc="Train metrics"), 290 | train_output_composer, 291 | train_metrics, 292 | ) 293 | 294 | stats['trn_f1'] = format_tqdm_metric( 295 | trn_epoch_metrics['f1_score'], 296 | train_metrics.get_best('f1_score'), 297 | fmt='{:.2%}') 298 | 299 | epoch_tqdm.set_postfix(stats) 300 | epoch_tqdm.refresh() 301 | 302 | if valid_dataset: 303 | # Evaluate validation set 304 | val_epoch_metrics = evaluate( 305 | args, 306 | model, 307 | tqdm(valid_dl, desc="Validation"), 308 | valid_output_composer, 309 | valid_metrics, 310 | ) 311 | 312 | # Show metrics on tqdm 313 | if 'loss' in val_epoch_metrics: 314 | epoch_val_loss = val_epoch_metrics['loss'] 315 | min_val_loss = min(min_val_loss, epoch_val_loss) 316 | stats['val_loss'] = format_tqdm_metric( 317 | epoch_val_loss, min_val_loss, fmt='{:.3e}') 318 | 319 | stats['val_f1'] = format_tqdm_metric( 320 | val_epoch_metrics['f1_score'], 321 | valid_metrics.get_best('f1_score'), 322 | fmt='{:.2%}') 323 | 324 | best_epoch = valid_metrics.get_best_epoch('f1_score') 325 | stats['best_epoch'] = best_epoch 326 | 327 | # Save model if best epoch 328 | if best_epoch == epoch + 1: 329 | tqdm.write('Best epoch. 
Saving model.') 330 | save_model(model, args) 331 | 332 | epoch_tqdm.set_postfix(stats) 333 | epoch_tqdm.refresh() 334 | 335 | # End of training 336 | if args.valid_file: 337 | logger.info(" Validation F1 scores: %s", 338 | valid_metrics.history['f1_score']) 339 | best_epoch = valid_metrics.get_best_epoch('f1_score') 340 | logger.info(" Validation confusion matrix:") 341 | logger.info(" Epoch %d", best_epoch) 342 | conf_mat = valid_metrics.get_value("confusion_matrix", best_epoch) 343 | logger.info("\n" + str(conf_mat)) 344 | logger.info(" Validation classification report:") 345 | classif_report = valid_metrics.get_value( 346 | "classification_report", best_epoch) 347 | logger.info("\n" + str(classif_report)) 348 | 349 | except KeyboardInterrupt: 350 | action = '' 351 | while action.lower() not in ('y', 'n'): 352 | action = input( 353 | '\nInterrupted. Continue execution to save model ' 354 | 'weights? [Y/n]') 355 | if action == 'n': 356 | sys.exit() 357 | 358 | if not valid_dataset: 359 | # If not using valid dataset, save model of last epoch 360 | logger.info('Saving model from last epoch.') 361 | save_model(model, args) 362 | 363 | if args.results_file: 364 | # Append this run results 365 | write_jsonl_results( 366 | compile_results(args, train_metrics, 367 | valid_metrics, train_losses=train_losses), 368 | args.results_file, 369 | ) 370 | 371 | 372 | def evaluate(args: Namespace, 373 | model: nn.Module, 374 | dataloader: DataLoader, 375 | output_composer: OutputComposer, 376 | sequence_metrics: SequenceMetrics, 377 | reset: bool = True, 378 | ) -> Dict[str, Any]: 379 | """Runs a model forward pass on the entire dataloader to compute predictions 380 | for all examples. Final predictions are gathered in `output_composer`, 381 | combining the max-context tokens of each forward pass. Returns the 382 | metrics dict computed by `sequence_metrics.calculate_metrics()`.""" 383 | # Evaluate 384 | model.eval() 385 | 386 | losses = [] 387 | for step, batch in enumerate(dataloader): 388 | if args.n_gpu == 1: 389 | batch = tuple(t.to(args.device) for t in batch) 390 | # Unpack batch 391 | input_ids = batch[0] 392 | input_mask = batch[1] 393 | segment_ids = batch[2] 394 | label_ids = batch[3] 395 | prediction_mask = batch[4] 396 | example_ixs = batch[5] 397 | doc_span_ixs = batch[6] 398 | 399 | with torch.no_grad(): 400 | if args.no_crf: 401 | # BERT or BERT-LSTM 402 | outs = model( 403 | input_ids, 404 | segment_ids, 405 | input_mask, 406 | labels=label_ids, 407 | prediction_mask=prediction_mask) 408 | else: 409 | # BERT-CRF or BERT-LSTM-CRF. 410 | # We do not pass `labels` otherwise y_pred is not calculated. 
411 | outs = model( 412 | input_ids, 413 | segment_ids, 414 | input_mask, 415 | prediction_mask=prediction_mask) 416 | 417 | y_pred = outs['y_pred'] 418 | 419 | output_composer.insert_batch(example_ixs, doc_span_ixs, y_pred) 420 | 421 | loss = outs.get('loss') 422 | if loss is not None: 423 | loss = loss.item() 424 | losses.append(loss) 425 | 426 | y_true = [example.labels for example in output_composer.examples] 427 | y_pred = output_composer.get_outputs() 428 | metrics = sequence_metrics.calculate_metrics(y_true, y_pred) 429 | 430 | if losses: 431 | metrics['loss'] = float(np.mean(losses)) 432 | 433 | return metrics 434 | 435 | 436 | def format_tqdm_metric(value: float, best_value: float, fmt: str) -> str: 437 | """Formats a value to display in tqdm.""" 438 | if value == best_value: 439 | return (fmt + '*').format(value) 440 | 441 | return (fmt + ' (' + fmt + '*)').format(value, best_value) 442 | 443 | 444 | def main( 445 | load_and_cache_examples_fn: Callable[ 446 | [Namespace, BertTokenizer, NERTagsEncoder, str], 447 | Tuple[Dataset, List[Example], List[InputSpan]]], 448 | get_train_metrics_fn: Callable[[NERTagsEncoder], SequenceMetrics], 449 | get_valid_metrics_fn: Callable[[NERTagsEncoder], SequenceMetrics], 450 | get_eval_metrics_fn: Callable[[NERTagsEncoder], SequenceMetrics] 451 | ): 452 | """Script entry-point. Performs training and/or evaluation routines. 453 | 454 | This function handles model training and evaluation. All arguments are 455 | functions that handle 1) training and evaluation data loading and 456 | preprocessing or 2) defining evaluation metrics. By modifying these 457 | functions, one can adapt this script to other NER datasets in distinct 458 | formats. 459 | 460 | Args: 461 | load_and_cache_examples_fn: a function that handles dataset loading and 462 | preprocessing. The data should be loaded and converted into 463 | `preprocessing.Example` instances, that can then be used to 464 | generate InputSpans and a BERT-ready Dataset. 465 | 466 | This function receives the following inputs: 467 | 468 | args: a Namespace object of parsed CLI arguments with model 469 | hyperparameters and dataset input files. 470 | bert_tokenizer: a loaded instance of BertTokenizer. 471 | tag_encoder: a NERTagsEncoder instance created from the tasks NER 472 | classes. 473 | mode: a mode string (train|valid|eval) to select which input file 474 | to read (args.train_file, args.valid_file or args.eval_file). 475 | 476 | get_train_metrics_fn: a function that receives a NERTagsEncoder and 477 | returns a SequenceMetrics object to evaluate the model on train 478 | data during training (`--do_train`). 479 | get_valid_metrics_fn: a function that receives a NERTagsEncoder and 480 | returns a SequenceMetrics object to evaluate the model on 481 | validation data during training (`--do_train`). 482 | get_eval_metrics_fn: a function that receives a NERTagsEncoder and 483 | returns a SequenceMetrics object to evaluate the model on test data 484 | during evaluation (`--do_eval`). 485 | """ 486 | parser = argparse.ArgumentParser() 487 | 488 | # Model and hyperparameters 489 | parser.add_argument("--bert_model", default=None, type=str, required=True, 490 | help="Bert pre-trained model name or path to a " 491 | "checkpoint directory.") 492 | parser.add_argument("--tokenizer_model", default=None, type=str, 493 | required=False, 494 | help="Path to tokenizer files. 
If empty, defaults to " 495 | "--bert_model.") 496 | parser.add_argument("--do_lower_case", 497 | action='store_true', 498 | help="Whether to lower case the input text. True for " 499 | "uncased models, False for cased models.") 500 | parser.add_argument("--max_seq_length", default=512, type=int, 501 | help="The maximum total input sequence length after " 502 | "WordPiece tokenization. Sequences longer than this " 503 | "will be split into multiple spans, and sequences " 504 | "shorter than this will be padded.") 505 | parser.add_argument("--doc_stride", default=128, type=int, 506 | help="When splitting up a long document into chunks, " 507 | "how much stride to take between chunks.") 508 | parser.add_argument('--labels_file', 509 | required=True, 510 | help="File with all NER classes to be considered, one " 511 | "per line.") 512 | parser.add_argument('--scheme', 513 | default='bio', help='NER tagging scheme (BIO|BILUO).') 514 | parser.add_argument('--no_crf', 515 | action='store_true', 516 | help='Remove the CRF layer (use plain BERT or ' 517 | 'BERT-LSTM).') 518 | parser.add_argument('--pooler', 519 | default='last', 520 | help='Pooling strategy for extracting BERT encoded ' 521 | 'features from last BERT layers. ' 522 | 'One of "last", "sum" or "concat".') 523 | parser.add_argument('--freeze_bert', 524 | action='store_true', 525 | help="Freeze BERT layers' parameters. If True, uses " 526 | "either a BERT-LSTM or BERT-LSTM-CRF model.") 527 | parser.add_argument('--lstm_hidden_size', 528 | type=int, 529 | default=100, 530 | help=('Hidden dimension of the LSTM (only used when ' 531 | 'the BERT model is frozen).')) 532 | parser.add_argument('--lstm_layers', 533 | type=int, 534 | default=1, 535 | help=('Number of LSTM layers (only used when the BERT ' 536 | 'model is frozen).')) 537 | # General 538 | parser.add_argument("--output_dir", default=None, type=str, required=True, 539 | help="The output directory where the model checkpoints" 540 | " and predictions will be written.") 541 | parser.add_argument("--no_cuda", 542 | action='store_true', 543 | help="Whether not to use CUDA when available") 544 | parser.add_argument("--verbose_logging", action='store_true', 545 | help="If true, all of the warnings related to data " 546 | "processing will be printed.") 547 | parser.add_argument('--override_cache', 548 | action='store_true', 549 | help='Override feature caches of input files.') 550 | 551 | # Training related 552 | parser.add_argument("--do_train", action='store_true', 553 | help="Whether to run training.") 554 | parser.add_argument("--train_file", default=None, 555 | type=str, help="JSON for training.") 556 | parser.add_argument("--valid_file", default=None, type=str, 557 | help="JSON for validating during training.") 558 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, 559 | help="Batch size per GPU/CPU for training.") 560 | parser.add_argument("--learning_rate", default=5e-5, 561 | type=float, help="The initial learning rate for Adam.") 562 | parser.add_argument('--classifier_lr', 563 | type=float, 564 | default=1e-3, 565 | help='Learning rate of the classifier and CRF layers.') 566 | parser.add_argument("--num_train_epochs", default=3.0, type=float, 567 | help="Total number of training epochs to perform.") 568 | parser.add_argument("--warmup_proportion", default=0.1, type=float, 569 | help="Proportion of training to perform linear " 570 | "learning rate warmup for. 
E.g., 0.1 = 10%% " 571 | "of training.") 572 | parser.add_argument('--seed', 573 | type=int, 574 | default=42, 575 | help="random seed for initialization") 576 | parser.add_argument('--gradient_accumulation_steps', 577 | type=int, 578 | default=1, 579 | help="Number of update steps to accumulate before " 580 | "performing a backward/update pass.") 581 | parser.add_argument('--max_grad_norm', 582 | type=float, 583 | default=1., 584 | help="Maximum value of gradient norm on update.") 585 | parser.add_argument("--local_rank", 586 | type=int, 587 | default=-1, 588 | help="local_rank for distributed training on gpus") 589 | parser.add_argument('--fp16', 590 | action='store_true', 591 | help="Whether to use 16-bit float precision instead of" 592 | " 32-bit") 593 | parser.add_argument('--loss_scale', 594 | type=float, default=0, 595 | help="Loss scaling to improve fp16 numeric stability. " 596 | "Only used when fp16 set to True.\n" 597 | "0 (default value): dynamic loss scaling.\n" 598 | "Positive power of 2: static loss scaling " 599 | "value.\n") 600 | parser.add_argument('--few_samples', 601 | type=int, default=-1, 602 | help="Turn on few samples for training.") 603 | parser.add_argument('--results_file', 604 | default=None, 605 | required=False, 606 | help='Optional JSONlines file to log train runs.') 607 | 608 | # Evaluation related 609 | parser.add_argument("--do_eval", action='store_true', 610 | help="Whether to run eval on the test set.") 611 | parser.add_argument("--eval_file", default=None, type=str, 612 | help="JSON for evaluating the model.") 613 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, 614 | help="Batch size per GPU/CPU for evaluation.") 615 | 616 | args = parser.parse_args() 617 | 618 | # Setup CUDA, GPU & distributed training 619 | if args.local_rank == -1 or args.no_cuda: 620 | device = torch.device( 621 | "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 622 | args.n_gpu = torch.cuda.device_count() 623 | else: 624 | # Initializes the distributed backend which will take care of 625 | # synchronizing nodes/GPUs 626 | torch.cuda.set_device(args.local_rank) 627 | device = torch.device("cuda", args.local_rank) 628 | torch.distributed.init_process_group(backend='nccl') 629 | args.n_gpu = 1 630 | args.device = device 631 | 632 | logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits " 633 | "training: {}".format( 634 | device, args.n_gpu, bool(args.local_rank != -1), args.fp16)) 635 | logger.info("seed: {}, output_dir: {}".format(args.seed, args.output_dir)) 636 | 637 | if args.gradient_accumulation_steps < 1: 638 | message = ("Invalid gradient_accumulation_steps parameter: {}, should " 639 | "be >= 1".format(args.gradient_accumulation_steps)) 640 | raise ValueError(message) 641 | 642 | set_seed(args.seed) 643 | 644 | if not args.do_train and not args.do_eval: 645 | raise ValueError( 646 | "At least one of `do_train` or `do_eval` must be " 647 | "True.") 648 | 649 | if args.do_train: 650 | if not args.train_file: 651 | raise ValueError( 652 | "If `do_train` is True, then `train_file` must be specified.") 653 | if args.do_eval: 654 | if not args.eval_file: 655 | raise ValueError( 656 | "If `do_eval` is True, then `eval_file` must be " 657 | "specified.") 658 | 659 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \ 660 | and args.do_train: 661 | raise ValueError( 662 | "Output directory ({}) already exists and is not empty.".format(args.output_dir)) 663 | if not os.path.exists(args.output_dir): 664 | os.makedirs(args.output_dir) 665
| 666 | tokenizer_path = args.tokenizer_model or args.bert_model 667 | tokenizer = BertTokenizer.from_pretrained( 668 | tokenizer_path, do_lower_case=args.do_lower_case) 669 | 670 | # Instantiate NER Tag encoder 671 | tag_encoder = NERTagsEncoder.from_labels_file( 672 | args.labels_file, scheme=args.scheme.upper()) 673 | 674 | args.num_labels = tag_encoder.num_labels 675 | 676 | # Load a pretrained model 677 | model = load_model(args, args.bert_model, training=args.do_train) 678 | model.to(device) 679 | 680 | train_examples = None 681 | valid_dataset, valid_examples, valid_features = None, None, None 682 | valid_metrics, valid_output_comp = None, None 683 | # Train 684 | if args.do_train: 685 | # Read examples and get features and dataset 686 | train_dataset, train_examples, train_features = load_and_cache_examples_fn( 687 | args, 688 | tokenizer, 689 | tag_encoder, 690 | mode='train', 691 | ) 692 | 693 | # Instantiate OutputComposer to post-process train examples 694 | train_output_comp = OutputComposer( 695 | train_examples, 696 | train_features, 697 | output_transform_fn=tag_encoder.convert_ids_to_tags) 698 | 699 | if args.valid_file: 700 | logger.info("Reading validation examples.") 701 | 702 | valid_dataset, valid_examples, valid_features = load_and_cache_examples_fn( 703 | args, 704 | tokenizer, 705 | tag_encoder, 706 | mode='valid', 707 | ) 708 | # Instantiate OutputComposer to post-process valid examples 709 | valid_output_comp = OutputComposer( 710 | valid_examples, 711 | valid_features, 712 | output_transform_fn=tag_encoder.convert_ids_to_tags) 713 | 714 | if args.freeze_bert: 715 | # Freeze BERT layers 716 | logger.info("Freezing BERT layers.") 717 | model.freeze_bert() 718 | assert model.frozen_bert 719 | 720 | logger.info("Creating BERT encoded datasets...") 721 | 722 | train_dataset = get_bert_encoded_dataset( 723 | model, train_dataset, args.per_gpu_train_batch_size, 724 | args.device) 725 | if valid_dataset: 726 | valid_dataset = get_bert_encoded_dataset( 727 | model, valid_dataset, args.per_gpu_train_batch_size, 728 | args.device) 729 | 730 | # Initialize Metrics tracker 731 | train_metrics = get_train_metrics_fn(tag_encoder) 732 | 733 | if args.valid_file: 734 | valid_metrics = get_valid_metrics_fn(tag_encoder) 735 | 736 | # Training loop 737 | train( 738 | args, 739 | model, 740 | train_dataset, 741 | train_metrics=train_metrics, 742 | train_output_composer=train_output_comp, 743 | valid_dataset=valid_dataset, 744 | valid_metrics=valid_metrics, 745 | valid_output_composer=valid_output_comp, 746 | ) 747 | 748 | # Save tokenizer 749 | tokenizer.save_pretrained(args.output_dir) 750 | 751 | # Load a trained model and config that you have fine-tuned 752 | logger.info('Loading best model') 753 | model = load_model(args, model_path=args.output_dir, training=False) 754 | model.to(device) 755 | 756 | if args.do_eval and ( 757 | args.local_rank == -1 or torch.distributed.get_rank() == 0): 758 | 759 | logger.info("Reading evaluation examples.") 760 | eval_dataset, eval_examples, eval_features = load_and_cache_examples_fn( 761 | args, 762 | tokenizer, 763 | tag_encoder, 764 | mode='eval', 765 | ) 766 | # Instantiate OutputComposer to post-process eval examples 767 | eval_output_comp = OutputComposer( 768 | eval_examples, 769 | eval_features, 770 | output_transform_fn=tag_encoder.convert_ids_to_tags) 771 | 772 | logger.info("***** Running evaluation predictions *****") 773 | logger.info(" Num orig examples = %d", len(eval_examples)) 774 | logger.info(" Num split examples = %d", len(eval_features)) 775 | logger.info(" Batch size 
= %d", args.per_gpu_eval_batch_size) 776 | 777 | # Run prediction for full data 778 | eval_sampler = SequentialSampler(eval_dataset) 779 | eval_dataloader = DataLoader(eval_dataset, 780 | sampler=eval_sampler, 781 | batch_size=args.per_gpu_eval_batch_size, 782 | num_workers=os.cpu_count()) 783 | 784 | # Define SequenceMetrics that handle the postprocessing 785 | eval_metrics = get_eval_metrics_fn(tag_encoder) 786 | 787 | model.frozen_bert = False 788 | 789 | metrics = evaluate( 790 | args, 791 | model, 792 | tqdm(eval_dataloader, desc="Evaluation"), 793 | eval_output_comp, 794 | eval_metrics, 795 | reset=False, 796 | ) 797 | 798 | # Display and save test metrics 799 | metrics_values = [] 800 | for metric_name in ('f1_score', 'precision', 'recall'): 801 | metric_value = metrics[metric_name] 802 | metrics_values.append(metric_value) 803 | logger.info("%s: %s", metric_name, metric_value) 804 | 805 | with open(os.path.join(args.output_dir, 'metrics.txt'), 'w') as fd: 806 | fd.write(','.join(map(str, metrics_values))) 807 | 808 | logger.info('Classification report:') 809 | logger.info('\n%s', metrics['classification_report']) 810 | 811 | conll_file = os.path.join(args.output_dir, 'predictions_conll.txt') 812 | logger.info('Writing CoNLL style prediction file to %s.', conll_file) 813 | 814 | # Get predictions for all examples 815 | y_pred = eval_output_comp.get_outputs() 816 | # Filter invalid predictions 817 | y_pred_filt = [tag_encoder.decode_valid(preds) for preds in y_pred] 818 | 819 | # Write CoNLL file 820 | write_conll_prediction_file(conll_file, eval_examples, y_pred_filt) 821 | -------------------------------------------------------------------------------- /ner_evaluation/data/FirstHAREM-selective-dev.json: -------------------------------------------------------------------------------- 1 | [{"doc_id": "HAREM-361-02413", "doc_text": "\nFernando Ferreira\n[click for a page in english]\nCMAF- Universidade de Lisboa Gabinete A2-31 Avenida Professor Gama Pinto, 2 Telefone do Gabinete: 217904893 P-1649-003 Lisboa Extens\u00e3o interna: 4293 Portugal Email: ferferr@cii.fc.ul.pt Departamento de Matem\u00e1tica | Faculdade de Ci\u00eancias | Universidade de Lisboa | CMAF\nApresenta\u00e7\u00e3o\nBem vindos \u00e0minha p\u00e1gina pessoal. Sou Professor Associado do Departamento de Matem\u00e1tica da Universidade de Lisboa e membro do Centro de Matem\u00e1tica e Aplica\u00e7\u00f5es Fundamentais - CMAF. Clique aqui para obter o meu CV.\nInteresses Acad\u00e9micos\nL\u00f3gica Matem\u00e1tica, em especial teorias fracas da aritm\u00e9tica e da an\u00e1lise. Filosofia e Fundamentos de Matem\u00e1tica . Tenho um interesse amador (no sentido latino da palavra) por alguns problemas da Filosofia Antiga , particularmente no problema da falsidade em Parm\u00e9nides e Plat\u00e3o. Tamb\u00e9m escrevi alguns ensaios exposit\u00f3rios sobre temas da l\u00f3gica: clique aqui para os ver.\nEnsino\nNo presente semestre dou aulas te\u00f3rico-pr\u00e1ticas de \u00c1lgebra 2, cadeira do segundo ano das licenciaturas em Matem\u00e1tica. O Professor Jos\u00e9 Perdig\u00e3o Dias da Silva \u00e9o regente da cadeira.\nNo semestre passado fui respons\u00e1vel pelas cadeiras de Topologia e Introdu\u00e7\u00e3o \u00e0An\u00e1lise Funcional, do terceiro ano das licenciaturas em Matem\u00e1tica, e de Teoria da Demonstra\u00e7\u00e3o, do \nMestrado em Matem\u00e1tica.\nNo ano passado ensinei a cadeira de L\u00f3gica Matem\u00e1tica aos finalistas de Matem\u00e1tica e licenciaturas relacionadas. 
Clique aqui para\nver a p\u00e1gina desta cadeira. Tamb\u00e9m dei a cadeira L\u00f3gica de Primeira-Ordem ao primeiro ano das licenciaturas em Inform\u00e1tica e Engenharia da Linguagem e do Conhecimento. A p\u00e1gina web desta cadeira ainda se encontra dispon\u00edvel on-line em html://www.alf1.cii.fc.ul.pt/~ferferr/lpo.html.\nTamb\u00e9m colaboro no Mestrado em Filosofia da Linguagem e da Consci\u00eancia da Faculdade de Letras.\nEventos\nDe 25 a 28 de Junho decorrer\u00e1 em Lisboa, no CMAF, a School on Real Algebraic and Analytic Geometry and O-minimal Structures .\n\u00c0s quintas-feiras decorre o Semin\u00e1rio de L\u00f3gica Matem\u00e1tica (SLM), organizado por mim e pelo Professor Narciso Garcia do Instituto Superior T\u00e9cnico. Se quiser ter not\u00edcias regulares sobre o SLM, por favor contacte-me.\nV\u00e1ria\nSou co-editor da Disputatio , uma revista de Filosofia Anal\u00edtica.\n", "entities": [{"entity_id": "23", "text": "Fernando Ferreira", "label": "PESSOA", "start_offset": 1, "end_offset": 18}, {"entity_id": "24", "text": "CMAF", "label": "ORGANIZACAO", "start_offset": 49, "end_offset": 53}, {"entity_id": "25", "text": "Universidade de Lisboa", "label": "ORGANIZACAO", "start_offset": 55, "end_offset": 77}, {"entity_id": "26", "text": "Gabinete A2-31 Avenida Professor Gama Pinto, 2", "label": "LOCAL", "start_offset": 78, "end_offset": 124}, {"entity_id": "28", "text": "Lisboa", "label": "LOCAL", "start_offset": 168, "end_offset": 174}, {"entity_id": "30", "text": "Portugal", "label": "LOCAL", "start_offset": 198, "end_offset": 206}, {"entity_id": "32", "text": "Departamento de Matem\u00e1tica", "label": "ORGANIZACAO", "start_offset": 235, "end_offset": 261}, {"entity_id": "33", "text": "Faculdade de Ci\u00eancias", "label": "ORGANIZACAO", "start_offset": 264, "end_offset": 285}, {"entity_id": "34", "text": "Universidade de Lisboa", "label": "ORGANIZACAO", "start_offset": 288, "end_offset": 310}, {"entity_id": "35", "text": "CMAF", "label": "ORGANIZACAO", "start_offset": 313, "end_offset": 317}, {"entity_id": "36", "text": "Professor Associado", "label": "PESSOA", "start_offset": 369, "end_offset": 388}, {"entity_id": "37", "text": "Departamento de Matem\u00e1tica da Universidade de Lisboa", "label": "ORGANIZACAO", "start_offset": 392, "end_offset": 444}, {"entity_id": "38", "text": "Centro de Matem\u00e1tica e Aplica\u00e7\u00f5es Fundamentais", "label": "ORGANIZACAO", "start_offset": 457, "end_offset": 503}, {"entity_id": "39", "text": "CMAF", "label": "ORGANIZACAO", "start_offset": 506, "end_offset": 510}, {"entity_id": "48", "text": "Professor Jos\u00e9 Perdig\u00e3o Dias da Silva", "label": "PESSOA", "start_offset": 1065, "end_offset": 1102}, {"entity_id": "61", "text": "Faculdade de Letras", "label": "ORGANIZACAO", "start_offset": 1816, "end_offset": 1835}, {"entity_id": "62", "text": "25", "label": "TEMPO", "start_offset": 1848, "end_offset": 1850}, {"entity_id": "63", "text": "28 de Junho", "label": "TEMPO", "start_offset": 1853, "end_offset": 1864}, {"entity_id": "64", "text": "Lisboa", "label": "LOCAL", "start_offset": 1878, "end_offset": 1884}, {"entity_id": "65", "text": "CMAF", "label": "LOCAL", "start_offset": 1889, "end_offset": 1893}, {"entity_id": "69", "text": "Professor Narciso Garcia", "label": "PESSOA", "start_offset": 2063, "end_offset": 2087}, {"entity_id": "70", "text": "Instituto Superior T\u00e9cnico", "label": "ORGANIZACAO", "start_offset": 2091, "end_offset": 2117}]}, {"doc_id": "HAREM-281-01176", "doc_text": "\nBOMBEIROS VOLUNT\u00c1RIOS DE 
VILA NOVA DE OLIVEIRINHA\nClique aqui para ENTRAR NO MENU\n(Fotografia do Quartel Constru\u00eddo em 1935)\nCLIQUE AQUI para enviar uma mensagem\nVisitas desde 13/05/2001\nMensagem do Presidente da Direc\u00e7\u00e3o\nCaros amigos dos Bombeiros:\nA nossa p\u00e1gina na Internet j\u00e1 est\u00e1 activa desde o dia 13 de Maio de 2001, data em que se comemorou mais uma Festa dos Carolos (2001) .\nA Festa dos Carolos \u00e9 uma tradi\u00e7\u00e3o desta terra que os Bombeiros querem manter viva neste come\u00e7o do novo mil\u00e9nio.\nE como nesta nova era as solicita\u00e7\u00f5es s\u00e3o diversas, os Bombeiros Volunt\u00e1rios de Vila Nova de Oliveirinha t\u00eam bem presente os novos desafios.\nAssim, paralelamente \u00e0 constru\u00e7\u00e3o do Novo Quartel dos Bombeiros --temos dado passos bastante importantes!--, vamos continuar a melhorar esta p\u00e1gina na Internet.\nContinuamos a receber conte\u00fados para dotarmos esta p\u00e1gina com bastante informa\u00e7\u00e3o, pelo que a vossa ajuda pode ser determinante. Para tal,\npodem escrever-nos, enviar um fax ou uma mensagem via correio electr\u00f3nico. Para isso, visite a p\u00e1gina de CONTACTOS .\nVamos todos ajudar os Bombeiros.\nVamos todos divulgar aquilo que \u00e9 este corpo, o Corpo dos Bombeiros Volunt\u00e1rios de V. Nova de Oliveirinha.\nCaros amigos, fiquem pois atentos \u00e0s actualiza\u00e7\u00f5es desta p\u00e1gina.\nCom os melhores cumprimentos.\nEduardo Pereira\n(Presidente da Direc\u00e7\u00e3o)\nNOTA:\nClique aqui ou na imagem do Quartel para continuar a navegar!\n", "entities": [{"entity_id": "74", "text": "BOMBEIROS VOLUNT\u00c1RIOS DE VILA NOVA DE OLIVEIRINHA", "label": "ORGANIZACAO", "start_offset": 1, "end_offset": 50}, {"entity_id": "75", "text": "1935", "label": "TEMPO", "start_offset": 120, "end_offset": 124}, {"entity_id": "76", "text": "13/05/2001", "label": "TEMPO", "start_offset": 177, "end_offset": 187}, {"entity_id": "77", "text": "Presidente da Direc\u00e7\u00e3o", "label": "PESSOA", "start_offset": 200, "end_offset": 222}, {"entity_id": "78", "text": "Bombeiros", "label": "PESSOA", "start_offset": 240, "end_offset": 249}, {"entity_id": "79", "text": "Internet", "label": "LOCAL", "start_offset": 269, "end_offset": 277}, {"entity_id": "80", "text": "13 de Maio de 2001", "label": "TEMPO", "start_offset": 305, "end_offset": 323}, {"entity_id": "82", "text": "2001", "label": "TEMPO", "start_offset": 378, "end_offset": 382}, {"entity_id": "84", "text": "Bombeiros", "label": "PESSOA", "start_offset": 440, "end_offset": 449}, {"entity_id": "85", "text": "Bombeiros Volunt\u00e1rios de Vila Nova de Oliveirinha", "label": "ORGANIZACAO", "start_offset": 554, "end_offset": 603}, {"entity_id": "86", "text": "Internet", "label": "LOCAL", "start_offset": 791, "end_offset": 799}, {"entity_id": "87", "text": "Bombeiros", "label": "PESSOA", "start_offset": 1079, "end_offset": 1088}, {"entity_id": "88", "text": "Corpo dos Bombeiros Volunt\u00e1rios de V. 
Nova de Oliveirinha", "label": "ORGANIZACAO", "start_offset": 1138, "end_offset": 1195}, {"entity_id": "89", "text": "Eduardo Pereira", "label": "PESSOA", "start_offset": 1292, "end_offset": 1307}, {"entity_id": "90", "text": "Presidente da Direc\u00e7\u00e3o", "label": "PESSOA", "start_offset": 1309, "end_offset": 1331}]}, {"doc_id": "HAREM-284-04226", "doc_text": "\nSunab autua empresas por alta abusiva dos pre\u00e7os \nDa Sucursal de Bras\u00edlia e da Reportagem Local\nA Sunab (Superintend\u00eancia Nacional de Abastecimento) autuou 62 estabelecimentos comerciais em 16 Estados entre 27 de junho e 8 de julho \u00faltimo. \nO motivo da autua\u00e7\u00e3o foi a pr\u00e1tica de aumento abusivo de pre\u00e7os acima da varia\u00e7\u00e3o dos custos de acordo com a nova Lei Antitruste (n\u00ba 8.884/94). \nA fiscaliza\u00e7\u00e3o tamb\u00e9m foi motivada pelo descumprimento de normas de comercializa\u00e7\u00e3o. \nEntre os autuados, est\u00e3o seis supermercados e oito ind\u00fastrias. \nA Sunab tamb\u00e9m constatou a pr\u00e1tica de aumento abusivo de pre\u00e7os em outros 23 estabelecimentos comerciais. \nSupermercados \nA Procuradoria do Estado de S\u00e3o Paulo deve finalizar os pareceres sobre os sete supermercados autuados pelo Procon dentro de uma semana . \nOs autuados foram: O Barateiro, Carrefour, P\u00e3o de A\u00e7\u00facar, C\u00e2ndia, Extra, Eldorado e Paes Mendon\u00e7a. \nEles teriam vendido em mar\u00e7o acima da m\u00e9dia dos \u00faltimos quatro meses de 93. \nOs supermercadistas apresentaram defesa. \nAverigua\u00e7\u00e3o \nAs empresas de vale-refei\u00e7\u00e3o dever\u00e3o ser alvo de um processo de averigua\u00e7\u00e3o preliminar feito pelo governo. \nEm representa\u00e7\u00e3o entregue ontem ao Minist\u00e9rio da Justi\u00e7a, elas foram acusadas de terem formado cartel para aumentar em at\u00e9 200% a taxa cobrada pelos seus servi\u00e7os. \nA representa\u00e7\u00e3o foi encaminhada pelo comerciante paulista Ronaldo Cheguri de Almeida, em nome de cerca de 300 donos de bares e restaurantes de S\u00e3o Paulo. 
\n", "entities": [{"entity_id": "370", "text": "Sunab", "label": "ORGANIZACAO", "start_offset": 1, "end_offset": 6}, {"entity_id": "371", "text": "Sucursal de Bras\u00edlia", "label": "ORGANIZACAO", "start_offset": 54, "end_offset": 74}, {"entity_id": "372", "text": "Reportagem Local", "label": "ORGANIZACAO", "start_offset": 80, "end_offset": 96}, {"entity_id": "373", "text": "Sunab", "label": "ORGANIZACAO", "start_offset": 99, "end_offset": 104}, {"entity_id": "374", "text": "Superintend\u00eancia Nacional de Abastecimento", "label": "ORGANIZACAO", "start_offset": 106, "end_offset": 148}, {"entity_id": "375", "text": "62", "label": "VALOR", "start_offset": 157, "end_offset": 159}, {"entity_id": "376", "text": "16", "label": "VALOR", "start_offset": 191, "end_offset": 193}, {"entity_id": "377", "text": "27 de junho", "label": "TEMPO", "start_offset": 208, "end_offset": 219}, {"entity_id": "378", "text": "8 de julho", "label": "TEMPO", "start_offset": 222, "end_offset": 232}, {"entity_id": "380", "text": "Sunab", "label": "ORGANIZACAO", "start_offset": 539, "end_offset": 544}, {"entity_id": "381", "text": "23", "label": "VALOR", "start_offset": 611, "end_offset": 613}, {"entity_id": "382", "text": "Procuradoria do Estado de S\u00e3o Paulo", "label": "ORGANIZACAO", "start_offset": 661, "end_offset": 696}, {"entity_id": "383", "text": "Procon", "label": "ORGANIZACAO", "start_offset": 767, "end_offset": 773}, {"entity_id": "384", "text": "Barateiro", "label": "ORGANIZACAO", "start_offset": 819, "end_offset": 828}, {"entity_id": "385", "text": "Carrefour", "label": "ORGANIZACAO", "start_offset": 830, "end_offset": 839}, {"entity_id": "386", "text": "P\u00e3o de A\u00e7\u00facar", "label": "ORGANIZACAO", "start_offset": 841, "end_offset": 854}, {"entity_id": "387", "text": "C\u00e2ndia", "label": "ORGANIZACAO", "start_offset": 856, "end_offset": 862}, {"entity_id": "388", "text": "Extra", "label": "ORGANIZACAO", "start_offset": 864, "end_offset": 869}, {"entity_id": "389", "text": "Eldorado", "label": "ORGANIZACAO", "start_offset": 871, "end_offset": 879}, {"entity_id": "390", "text": "Paes Mendon\u00e7a", "label": "ORGANIZACAO", "start_offset": 882, "end_offset": 895}, {"entity_id": "391", "text": "mar\u00e7o", "label": "TEMPO", "start_offset": 921, "end_offset": 926}, {"entity_id": "392", "text": "93", "label": "TEMPO", "start_offset": 970, "end_offset": 972}, {"entity_id": "393", "text": "Minist\u00e9rio da Justi\u00e7a", "label": "ORGANIZACAO", "start_offset": 1173, "end_offset": 1194}, {"entity_id": "394", "text": "200%", "label": "VALOR", "start_offset": 1261, "end_offset": 1265}, {"entity_id": "395", "text": "Ronaldo Cheguri de Almeida", "label": "PESSOA", "start_offset": 1361, "end_offset": 1387}, {"entity_id": "396", "text": "cerca de 300", "label": "VALOR", "start_offset": 1400, "end_offset": 1412}, {"entity_id": "397", "text": "S\u00e3o Paulo", "label": "LOCAL", "start_offset": 1446, "end_offset": 1455}]}, {"doc_id": "HAREM-367-06201", "doc_text": "\n A REVISTA S\u00c3O PAULO EM PERSPECTIVA, da Fundacao Seade, Estado de Sao Paulo, acaba de lancar seu ultimo numero (v+12 ,n 4) dedicado \u00e0 Comunicacai e informacao. 
\n Nas palavras de seu editor Miguel Chaia \"Neste n\u00famero, S\u00e3o Paulo em Perspectiva traz artigos que discutem e refletem a natureza da comunica\u00e7\u00e3o e, particularmente da informa\u00e7\u00e3o, numa situa\u00e7\u00e3o na qual avan\u00e7am rapidamente as conquistas tecnol\u00f3gicas da inform\u00e1tica e acentuam-se os efeitos dos meios de comunica\u00e7\u00e3o de massa. \n Simultaneamente, continuam a funcionar de forma significativa institui\u00e7\u00f5es acad\u00eamicas, de pesquisa ou t\u00e9cnicas que buscam produzir e disseminar conhecimento voltado ao desenvolvimento das ci\u00eancias sociais, \u00e0 continuidade de pesquisas e ao subs\u00eddio a debates e programas p\u00fablicos, propiciando maior racionaliza\u00e7\u00e3o \u00e0s interven\u00e7\u00f5es na realidade social. \n Considerando estas duas tend\u00eancias, os textos apresentados analisam as caracter\u00edsticas de uma sociedade globalizada que se fundamenta na m\u00eddia eletr\u00f4nica, na velocidade da comunica\u00e7\u00e3o e na heterogeneidade da produ\u00e7\u00e3o, troca e consumo da informa\u00e7\u00e3o. \n Tal processo torna-se cada vez mais sofisticado, exigindo avan\u00e7ados servi\u00e7os e aparelhagens tecnol\u00f3gicas, novas rela\u00e7\u00f5es entre emiss\u00e3o e recep\u00e7\u00e3o de mensagens e, tamb\u00e9m, novas formas de produ\u00e7\u00e3o de conhecimento. \n Nesta situa\u00e7\u00e3o, os sujeitos devem estar preparados para a inser\u00e7\u00e3o em in\u00e9ditos processos cognitivos, tanto aqueles que s\u00e3o profissionais da \u00e1rea da comunica\u00e7\u00e3o, quanto os usu\u00e1rios dos servi\u00e7os oferecidos. \n\n O Conteudo da Revista pode ser oservado a partir do seu sumario: \n\n SUM\u00c1RIO \n\n COMUNICA\u00c7\u00c3O & INFORMA\u00c7\u00c3O: \n\nO Rumor do Conhecimento\nAldo de Albuquerque Barreto \nGest\u00e3o e Tratamento da Informa\u00e7\u00e3o na Sociedade Tecnol\u00f3gica Othon Jambeiro \n Comunica\u00e7\u00e3o,M\u00eddiaeCultura \nNorval Baitello Junior \n Muito Al\u00e9m da lnforma\u00e7\u00e3o: m\u00eddia ,cidadania e o dilema democr\u00e1tico Mauro P+ Porto \n Sociedade da Informa\u00e7\u00e3o, Comunica\u00e7\u00f5es e Democracia Ven\u00edcio A+ de Linia \n O Mal-Estar Brasileiro na Sociedade de Informa\u00e7\u00e3o \nAna Malin\n Desmidiatizar o Pensamento: economia das representa\u00e7\u00f5es e subdesenvolvimento informacional \nMargaretihe Born Steinberger\n O Imagin\u00e1rio da Cibercultura \nAndr\u00e9 Lemos\n Fontes Eletr\u00f4nicas de Informa\u00e7\u00e3o: novas formas de comunica\u00e7\u00e3o e de produ\u00e7\u00e3o do conhecimento \nSolange Puntel Mostafa / Marisa Terra\n Comunica\u00e7\u00e3o da Ci\u00eancia\nIsaac Epstein\n Informa\u00e7\u00e3o e Sociedade: novos par\u00e2metros te\u00f3rico-pr\u00e1ticos de gest\u00e3o e transfer\u00eancia informacional \nRegina Maria Marteleto\n Sociedade Civil, Estado e Terceiro Setor \nMaria do Carmo Brant de Carvalho\n A Coordena\u00e7\u00e3o, a Argumenta\u00e7\u00e3o e a Comunica\u00e7\u00e3o das Estat\u00edsticas: v\u00e9rtices de um mesmo tri\u00e2ngulo \n Nelson de Castro Senra \n A Arquitetura de Sistemas de Informa\u00e7\u00f5es Estat\u00edsticas na Internet Marilda Lopes Ginez de Lara \n As Novas e Velhas Demandas por Informa\u00e7\u00e3o Estat\u00edstica \nPaulo de Martino Jannuzzi\n O Sistema Banc\u00e1rio e o Aparecimento da Moeda Eletr\u00f4nica Maria Cristina Penido de Freitas\n A Revista pode ser obtida atraves da Internet no site da Funda\u00e7\u00e3o SEADE: ou pelo email : com Cleide \n ou Tania, 
Tel.011-2241654.\n\n O artigo que tenho na Revista eh fruto de pesquisa em dase de finaliza\u00e7\u00e3o, financiada pelo CNPq e que trata de: \n\n Informacao e conhecimento, pois a informa\u00e7\u00e3o modificou o seu status na academia quando o seu destino se vinculou ao conhecimento, como fato cognitivo do sujeito e ao desenvolvimento como decorr\u00eancia social natural da acumula\u00e7\u00e3o deste conhecimento. \n A ess\u00eancia do fen\u00f4meno da informa\u00e7\u00e3o passou a ser esta condi\u00e7\u00e3o de intencionalidade em gerar conhecimento no indiv\u00edduo e em sua realidade. \n As modifica\u00e7\u00f5es na esfera de influ\u00eancia da informa\u00e7\u00e3o n\u00e3o foram acompanhadas de uma explana\u00e7\u00e3o te\u00f3rica em que, poss\u00edveis evid\u00eancias do processo de transforma\u00e7\u00e3o: informacao-conhecimento, fossem esclarecidos. \n Esta e outras condi\u00e7\u00f5es espec\u00edficas da manifesta\u00e7\u00e3o da informa\u00e7\u00e3o como participante deste processo s\u00e3o estudadas neste artigo. \n Assim, dividimos o artigo em duas partes: a primeira procura mostrar as poss\u00edveis evid\u00eancias conceituais da exist\u00eancia da rela\u00e7\u00e3o informa\u00e7\u00e3o e conhecimento; e a segunda pretende apresentar os resultados iniciais de pesquisa ainda em andamento, onde se procura qualificar os mecanismos de elabora\u00e7\u00e3o do pensamento nesta rela\u00e7\u00e3o de transforma\u00e7\u00e3o, com dados emp\u00edricos paratr\u00eas \u00e1reas do conhecimento ou comunidades ling\u00fc\u00edsticas ou grupos informacionais diferenciados: a comunica\u00e7\u00e3o, a fisica e a ciencia da informa\u00e7\u00e3o.\n", "entities": [{"entity_id": "473", "text": "Fundacao Seade", "label": "ORGANIZACAO", "start_offset": 41, "end_offset": 55}, {"entity_id": "474", "text": "Estado de Sao Paulo", "label": "LOCAL", "start_offset": 57, "end_offset": 76}, {"entity_id": "476", "text": "Miguel Chaia", "label": "PESSOA", "start_offset": 190, "end_offset": 202}, {"entity_id": "479", "text": "Aldo de Albuquerque Barreto", "label": "PESSOA", "start_offset": 1647, "end_offset": 1674}, {"entity_id": "481", "text": "Othon Jambeiro ", "label": "PESSOA", "start_offset": 1735, "end_offset": 1750}, {"entity_id": "483", "text": "Norval Baitello Junior", "label": "PESSOA", "start_offset": 1779, "end_offset": 1801}, {"entity_id": "485", "text": "Mauro P+ Porto", "label": "PESSOA", "start_offset": 1870, "end_offset": 1884}, {"entity_id": "487", "text": "Ven\u00edcio A+ de Linia", "label": "PESSOA", "start_offset": 1938, "end_offset": 1957}, {"entity_id": "489", "text": "Ana Malin", "label": "PESSOA", "start_offset": 2011, "end_offset": 2020}, {"entity_id": "491", "text": "Margaretihe Born Steinberger", "label": "PESSOA", "start_offset": 2114, "end_offset": 2142}, {"entity_id": "493", "text": "Andr\u00e9 Lemos", "label": "PESSOA", "start_offset": 2174, "end_offset": 2185}, {"entity_id": "495", "text": "Solange Puntel Mostafa", "label": "PESSOA", "start_offset": 2280, "end_offset": 2302}, {"entity_id": "496", "text": "Marisa Terra", "label": "PESSOA", "start_offset": 2305, "end_offset": 2317}, {"entity_id": "498", "text": "Isaac Epstein", "label": "PESSOA", "start_offset": 2342, "end_offset": 2355}, {"entity_id": "500", "text": "Regina Maria Marteleto", "label": "PESSOA", "start_offset": 2456, "end_offset": 2478}, {"entity_id": "502", "text": "Maria do Carmo Brant de Carvalho", "label": "PESSOA", "start_offset": 2522, "end_offset": 2554}, {"entity_id": "504", "text": "Nelson de Castro 
Senra", "label": "PESSOA", "start_offset": 2653, "end_offset": 2675}, {"entity_id": "506", "text": "Marilda Lopes Ginez de Lara ", "label": "PESSOA", "start_offset": 2744, "end_offset": 2772}, {"entity_id": "508", "text": "Paulo de Martino Jannuzzi", "label": "PESSOA", "start_offset": 2829, "end_offset": 2854}, {"entity_id": "510", "text": "Maria Cristina Penido de Freitas", "label": "PESSOA", "start_offset": 2912, "end_offset": 2944}, {"entity_id": "511", "text": "Internet", "label": "LOCAL", "start_offset": 2983, "end_offset": 2991}, {"entity_id": "512", "text": "Funda\u00e7\u00e3o SEADE", "label": "ORGANIZACAO", "start_offset": 3003, "end_offset": 3017}, {"entity_id": "513", "text": "Cleide", "label": "PESSOA", "start_offset": 3039, "end_offset": 3045}, {"entity_id": "514", "text": "Tania", "label": "PESSOA", "start_offset": 3051, "end_offset": 3056}, {"entity_id": "516", "text": "CNPq", "label": "ORGANIZACAO", "start_offset": 3168, "end_offset": 3172}]}, {"doc_id": "HAREM-862-03412", "doc_text": "\nConcurso Para Auditor Fiscal do INSS \n J\u00e1 est\u00e1 pronta a minuta do edital do concurso para auditor fiscal do INSS, que oferecer\u00e1 150 vagas, prometidas pelo governo federal, conforme revelou o chefe de Divis\u00e3o na Coordena\u00e7\u00e3o Geral do INSS, Maur\u00edlio Gon\u00e7alves Dias. \nO INSS aguarda apenas a autoriza\u00e7\u00e3o oficial, para dar in\u00edcio ao processo seletivo. \nNo dia 30 de junho, a Comiss\u00e3o de Controle e Gest\u00e3o Fiscal, do Minist\u00e9rio da Fazenda, publicou, no Di\u00e1rio Oficial, uma recomenda\u00e7\u00e3o ao Minist\u00e9rio de Or\u00e7amento e Gest\u00e3o, pela autoriza\u00e7\u00e3o para a abertura de concursos. \nNo total, a oferta ser\u00e1 de 3.728 vagas, em carreiras de n\u00edvel superior, conforme promessa feita no dia 30 de maio. \nUma fonte do INSS disse que \u00e9 grande a possibilidade da Universidade de Bras\u00edlia (UnB) organizar o seu concurso. \nA institui\u00e7\u00e3o \u00e9 a mesma que coordenou o \u00faltimo processo seletivo do INSS, realizado em 1998. \nPara participar do concurso \u00e9 necess\u00e1rio ter conclu\u00eddo curso superior em qualquer \u00e1rea. \nA remunera\u00e7\u00e3o \u00e9 de R$2.409,66, podendo chegar a R$3.613, com a Gratifica\u00e7\u00e3o de Desempenho por Atividade Tribut\u00e1ria (GDAT), obtida em fun\u00e7\u00e3o do alcance das metas de arrecada\u00e7\u00e3o e dos resultados obtidos com a fiscaliza\u00e7\u00e3o. \nO diretor de Arrecada\u00e7\u00e3o Fiscal do INSS, Luiz Alberto Lazinho, acredita que o conte\u00fado program\u00e1tico das provas seguir\u00e1 o modelo do \u00faltimo processo seletivo, realizado em 1998. \n\"Os candidatos devem dar especial aten\u00e7\u00e3o a Contabilidade, Direito Tribut\u00e1rio e Legisla\u00e7\u00e3o Previdenci\u00e1ria\", sugeriu Luiz Alberto Lazinho. \nPara aqueles que v\u00e3o participar do processo seletivo, o professor de Direito Previdenci\u00e1rio F\u00e1bio Zambite d\u00e1 uma dica importante: os candidatos devem estudar com bastante aten\u00e7\u00e3o o Decreto 3.048/99, que aprova o Regulamento da Previd\u00eancia Social. \n\"A Legisla\u00e7\u00e3o \u00e9 muito extensa. \nAo inv\u00e9s de estudar as Leis de Custeio e de Benef\u00edcios, al\u00e9m do Regulamento da Previd\u00eancia Social sugiro que o candidato estude diretamente o Decreto 3.048/99, que reproduz o que dizem essas leis. \nAssim, o candidato ganha tempo na hora de estudar\", orienta. 
\nOutra sugest\u00e3o do professor \u00e9 que os concorrentes analisem com especial aten\u00e7\u00e3o a Lei 9.876/99, que introduz altera\u00e7\u00f5es na Previd\u00eancia Social. \n\"Uma dessas altera\u00e7\u00f5es diz respeito \u00e0 mudan\u00e7a no c\u00e1lculo das aposentadorias. \nEssa lei tamb\u00e9m cria o fator previdenci\u00e1rio, que certamente ser\u00e1 uma das quest\u00f5es da prova\", disse. \n", "entities": [{"entity_id": "908", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 33, "end_offset": 37}, {"entity_id": "909", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 122, "end_offset": 126}, {"entity_id": "910", "text": "150", "label": "VALOR", "start_offset": 142, "end_offset": 145}, {"entity_id": "911", "text": "Divis\u00e3o na Coordena\u00e7\u00e3o Geral do INSS", "label": "ORGANIZACAO", "start_offset": 214, "end_offset": 250}, {"entity_id": "912", "text": "Maur\u00edlio Gon\u00e7alves Dias", "label": "PESSOA", "start_offset": 252, "end_offset": 275}, {"entity_id": "913", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 293, "end_offset": 297}, {"entity_id": "914", "text": "30 de junho", "label": "TEMPO", "start_offset": 395, "end_offset": 406}, {"entity_id": "915", "text": "Comiss\u00e3o de Controle e Gest\u00e3o Fiscal", "label": "ORGANIZACAO", "start_offset": 410, "end_offset": 446}, {"entity_id": "916", "text": "Minist\u00e9rio da Fazenda", "label": "ORGANIZACAO", "start_offset": 451, "end_offset": 472}, {"entity_id": "917", "text": "Di\u00e1rio Oficial", "label": "LOCAL", "start_offset": 487, "end_offset": 501}, {"entity_id": "918", "text": "Minist\u00e9rio de Or\u00e7amento e Gest\u00e3o", "label": "ORGANIZACAO", "start_offset": 523, "end_offset": 555}, {"entity_id": "919", "text": "3.728", "label": "VALOR", "start_offset": 645, "end_offset": 650}, {"entity_id": "920", "text": "30 de maio", "label": "TEMPO", "start_offset": 721, "end_offset": 731}, {"entity_id": "921", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 760, "end_offset": 764}, {"entity_id": "922", "text": "Universidade de Bras\u00edlia", "label": "ORGANIZACAO", "start_offset": 803, "end_offset": 827}, {"entity_id": "923", "text": "UnB", "label": "ORGANIZACAO", "start_offset": 829, "end_offset": 832}, {"entity_id": "924", "text": "INSS", "label": "ORGANIZACAO", "start_offset": 942, "end_offset": 946}, {"entity_id": "925", "text": "1998", "label": "TEMPO", "start_offset": 961, "end_offset": 965}, {"entity_id": "926", "text": "R$2.409,66", "label": "VALOR", "start_offset": 1102, "end_offset": 1112}, {"entity_id": "927", "text": "R$3.613", "label": "VALOR", "start_offset": 1131, "end_offset": 1138}, {"entity_id": "930", "text": "Arrecada\u00e7\u00e3o Fiscal do INSS", "label": "ORGANIZACAO", "start_offset": 1331, "end_offset": 1357}, {"entity_id": "931", "text": "Luiz Alberto Lazinho", "label": "PESSOA", "start_offset": 1359, "end_offset": 1379}, {"entity_id": "932", "text": "1998", "label": "TEMPO", "start_offset": 1488, "end_offset": 1492}, {"entity_id": "936", "text": "Luiz Alberto Lazinho", "label": "PESSOA", "start_offset": 1624, "end_offset": 1644}, {"entity_id": "938", "text": "F\u00e1bio Zambite", "label": "PESSOA", "start_offset": 1752, "end_offset": 1765}]}, {"doc_id": "HAREM-071-00386", "doc_text": "\nUma vila no Interior\nUm simbolo de grande beleza\nAs terras de S. 
Martinho foram povoadas desde remotas eras, gra\u00e7as \u00e0 fertilidade do rio Bestan\u00e7a e \u00e0 facilidade de defesas naturais e pontos estrat\u00e9gicos como a Pena ou Pedra Sobreposta, em Paus, e a Mogueira, perto de S. Martinho.\nNo morro da Mogueira h\u00e1 vest\u00edgios evidentes da presen\u00e7a dos Celtas, e dos Romanos.\nTrata-se de um castro romanizado.\nA estes povos seguiram-se os Suevos, os Visigodos, depois os Mouros que lhe deram o nome, e por fim, os povoadores crist\u00e3os da Reconquista.Atendendo ao nome \u00abS. Martinho\u00bb, pode concluir-se que deve ter sido par\u00f3quia desde os prim\u00f3rdios da cristianiza\u00e7\u00e3o destas paragens.\nPaus, S. Jo\u00e3o e Gosende eram, nessa \u00e9poca, simples povoados desta freguesia.\nDada a sua fertilidade, os Mouros com todas as suas for\u00e7as a reconquista crist\u00e3, motivo pelo qual, sendo j\u00e1 crist\u00e3o todo o noroeste(de Resende ao Porto) e estando ainda S. Martinho nas m\u00e3os dos Mouros, os crist\u00e3os de Resende, falando de S. Martinho, lhe chamavam de \u00abMouros\u00bb.\nAp\u00f3s a reconquista, em 1058, tentou-se o repovoamento com a doa\u00e7\u00e3o de terras a senhores da nobreza, concretamente com as Honras de Cardoso, de Cantim, de Fonseca, de Paredes e de Temonde.\nS. Martinho foi concelho desde tempos anteriores \u00e0 nacionalidade, pois recebeu foral de Fernando Magno, confirmado por D. Teresa em 1 de Mar\u00e7o de 1121, e novo foral do rei D. Manuel em 20 de Outubro de 1513.\nFoi tamb\u00e9m julgado medieval, abrangendo uma longa faixa de territ\u00f3rio, desde o Douro \u00e0 cruz do Ross\u00e3o no montemuro, e desde a serra das Meadas a terras do concelho de Aregos e da honra de Resende.\nO julgado foi suprimido por decreto de 28 de Dezembro de 1840 e incorporado na comarca de Lamego e o concelho foi extinto em 24 de Outubro de 1855, data em que, tanto o concelho como o julgado passaram a fazer parte do concelho e da comarca de Resende.\n", "entities": [{"entity_id": "1289", "text": "Interior", "label": "LOCAL", "start_offset": 13, "end_offset": 21}, {"entity_id": "1290", "text": "S. Martinho", "label": "LOCAL", "start_offset": 63, "end_offset": 74}, {"entity_id": "1291", "text": "Bestan\u00e7a", "label": "LOCAL", "start_offset": 138, "end_offset": 146}, {"entity_id": "1292", "text": "Pena", "label": "LOCAL", "start_offset": 211, "end_offset": 215}, {"entity_id": "1293", "text": "Pedra Sobreposta", "label": "LOCAL", "start_offset": 219, "end_offset": 235}, {"entity_id": "1294", "text": "Paus", "label": "LOCAL", "start_offset": 240, "end_offset": 244}, {"entity_id": "1295", "text": "Mogueira", "label": "LOCAL", "start_offset": 250, "end_offset": 258}, {"entity_id": "1296", "text": "S. 
Martinho", "label": "LOCAL", "start_offset": 269, "end_offset": 280}, {"entity_id": "1297", "text": "Mogueira", "label": "LOCAL", "start_offset": 294, "end_offset": 302}, {"entity_id": "1298", "text": "Celtas", "label": "PESSOA", "start_offset": 342, "end_offset": 348}, {"entity_id": "1299", "text": "Romanos", "label": "PESSOA", "start_offset": 356, "end_offset": 363}, {"entity_id": "1300", "text": "Suevos", "label": "PESSOA", "start_offset": 428, "end_offset": 434}, {"entity_id": "1301", "text": "Visigodos", "label": "PESSOA", "start_offset": 439, "end_offset": 448}, {"entity_id": "1302", "text": "Mouros", "label": "PESSOA", "start_offset": 460, "end_offset": 466}, {"entity_id": "1303", "text": "Reconquista", "label": "TEMPO", "start_offset": 526, "end_offset": 537}, {"entity_id": "1305", "text": "Paus", "label": "LOCAL", "start_offset": 670, "end_offset": 674}, {"entity_id": "1306", "text": "S. Jo\u00e3o", "label": "LOCAL", "start_offset": 676, "end_offset": 683}, {"entity_id": "1307", "text": "Gosende", "label": "LOCAL", "start_offset": 686, "end_offset": 693}, {"entity_id": "1308", "text": "Mouros", "label": "PESSOA", "start_offset": 774, "end_offset": 780}, {"entity_id": "1309", "text": "Resende", "label": "LOCAL", "start_offset": 882, "end_offset": 889}, {"entity_id": "1310", "text": "Porto", "label": "LOCAL", "start_offset": 893, "end_offset": 898}, {"entity_id": "1311", "text": "S. Martinho", "label": "LOCAL", "start_offset": 916, "end_offset": 927}, {"entity_id": "1312", "text": "Mouros", "label": "PESSOA", "start_offset": 941, "end_offset": 947}, {"entity_id": "1313", "text": "Resende", "label": "LOCAL", "start_offset": 964, "end_offset": 971}, {"entity_id": "1314", "text": "S. Martinho", "label": "LOCAL", "start_offset": 984, "end_offset": 995}, {"entity_id": "1316", "text": "1058", "label": "TEMPO", "start_offset": 1046, "end_offset": 1050}, {"entity_id": "1318", "text": "Cardoso", "label": "LOCAL", "start_offset": 1154, "end_offset": 1161}, {"entity_id": "1319", "text": "Cantim", "label": "LOCAL", "start_offset": 1166, "end_offset": 1172}, {"entity_id": "1320", "text": "Fonseca", "label": "LOCAL", "start_offset": 1177, "end_offset": 1184}, {"entity_id": "1321", "text": "Paredes", "label": "LOCAL", "start_offset": 1189, "end_offset": 1196}, {"entity_id": "1322", "text": "Temonde", "label": "LOCAL", "start_offset": 1202, "end_offset": 1209}, {"entity_id": "1324", "text": "S. Martinho", "label": "LOCAL", "start_offset": 1211, "end_offset": 1222}, {"entity_id": "1325", "text": "Fernando Magno", "label": "PESSOA", "start_offset": 1299, "end_offset": 1313}, {"entity_id": "1326", "text": "D. Teresa", "label": "PESSOA", "start_offset": 1330, "end_offset": 1339}, {"entity_id": "1327", "text": "1 de Mar\u00e7o de 1121", "label": "TEMPO", "start_offset": 1343, "end_offset": 1361}, {"entity_id": "1328", "text": "D. 
Manuel", "label": "PESSOA", "start_offset": 1383, "end_offset": 1392}, {"entity_id": "1329", "text": "20 de Outubro de 1513", "label": "TEMPO", "start_offset": 1396, "end_offset": 1417}, {"entity_id": "1330", "text": "Douro", "label": "LOCAL", "start_offset": 1498, "end_offset": 1503}, {"entity_id": "1331", "text": "Ross\u00e3o", "label": "LOCAL", "start_offset": 1514, "end_offset": 1520}, {"entity_id": "1332", "text": "Meadas", "label": "LOCAL", "start_offset": 1555, "end_offset": 1561}, {"entity_id": "1333", "text": "Aregos", "label": "LOCAL", "start_offset": 1586, "end_offset": 1592}, {"entity_id": "1334", "text": "Resende", "label": "LOCAL", "start_offset": 1607, "end_offset": 1614}, {"entity_id": "1335", "text": "28 de Dezembro de 1840", "label": "TEMPO", "start_offset": 1655, "end_offset": 1677}, {"entity_id": "1336", "text": "Lamego", "label": "LOCAL", "start_offset": 1706, "end_offset": 1712}, {"entity_id": "1337", "text": "24 de Outubro de 1855", "label": "TEMPO", "start_offset": 1741, "end_offset": 1762}, {"entity_id": "1338", "text": "Resende", "label": "LOCAL", "start_offset": 1860, "end_offset": 1867}]}, {"doc_id": "HAREM-19H-01369", "doc_text": "\nNorte-americanos disparam contra posi\u00e7\u00f5es iraquianas\nAparelhos norte-americanos dispararam m\u00edsseis e lan\u00e7aram bombas sobre posi\u00e7\u00f5es iraquianas no norte do pa\u00eds \u00abem resposta a disparos de artilharia anti-a\u00e9rea\u00bb do Iraque, anunciou o Pent\u00e1gono em comunicado.\nSegundo o departamento da defesa, avi\u00f5es F-15 \u00abque efectuavam voos de rotina na zona de exclus\u00e3o a\u00e9rea no norte do Iraque\u00bb dispararam primeiro tr\u00eas m\u00edsseis e lan\u00e7aram bombas guiadas por laser contra um centro de comando militar e uma esta\u00e7\u00e3o de r\u00e1dio.\nUm pouco mais tarde, outros aparelhos F-15 lan\u00e7aram cinco bombas contra objectivos n\u00e3o identificados perto de Mossul.\nOs aparelhos regressaram \u00e0 sua base na Turquia, acrescenta o comunicado do Pent\u00e1gono.\n", "entities": [{"entity_id": "2057", "text": "Iraque", "label": "ORGANIZACAO", "start_offset": 214, "end_offset": 220}, {"entity_id": "2058", "text": "Pent\u00e1gono", "label": "ORGANIZACAO", "start_offset": 233, "end_offset": 242}, {"entity_id": "2060", "text": "Iraque", "label": "LOCAL", "start_offset": 373, "end_offset": 379}, {"entity_id": "2062", "text": "Mossul", "label": "LOCAL", "start_offset": 620, "end_offset": 626}, {"entity_id": "2063", "text": "Turquia", "label": "LOCAL", "start_offset": 667, "end_offset": 674}, {"entity_id": "2064", "text": "Pent\u00e1gono", "label": "ORGANIZACAO", "start_offset": 703, "end_offset": 712}]}, {"doc_id": "HAREM-276-04861", "doc_text": "\nORGANIZA\u00c7\u00c3O ESTRUTURAL DA MEMBRANA \nA membrana celular \u00e9 uma camada com apenas 7,5 a 10 nm de espessura, constitu\u00edda por l\u00edpidos intercalados com prote\u00ednas que define os limites de cada c\u00e9lula. \nFunciona como uma barreira de permeabilidade que permite \u00e0 c\u00e9lula manter um meio qu\u00edmico apropriado para os seus processos metab\u00f3licos, regular o volume citoplasm\u00e1tico e transferir informa\u00e7\u00e3o sob a forma de sinais qu\u00edmicos e el\u00e9ctricos. \nAs membranas que revestem os v\u00e1rios organelos (n\u00facleo, mitoc\u00f4ndria, ret\u00edculo endoplasm\u00e1tico, lisossomas e aparelho de Golgi) permitem a compartimentaliza\u00e7\u00e3o funcional da c\u00e9lula, com possibilidade de limitar processos bioqu\u00edmicas a certos locais. 
\nApesar das particularidades individuais, todas as membranas biol\u00f3gicas s\u00e3o formadas por uma dupla camada fosfol\u00edpidica e por prote\u00ednas unidas por liga\u00e7\u00f5es covalentes e que se comportam segundo o Modelo Mosaico Flu\u00eddo. \nA maioria dos l\u00edpidos e das prote\u00ednas movem-se livremente no plano da membrana. \nEm alguns casos , h\u00e1 restri\u00e7\u00e3o deste movimento de forma a permitir \u00e0 c\u00e9lula a realiza\u00e7\u00e3o de algumas fun\u00e7\u00f5es em partes selectivas da sua membrana. \n\u00c9 o caso da sequestra\u00e7\u00e3o de receptores de acetilcolina ao n\u00edvel da placa motora das c\u00e9lulas musculares esquel\u00e9ticas. \nOs principais l\u00edpidos presentes na membrana celular s\u00e3o os fosfol\u00edpidos, o colesterol e os glicol\u00edpidos. \nA sua distribui\u00e7\u00e3o pelas duas camadas \u00e9 assim\u00e9trica, o que pode reflectir as diferentes fun\u00e7\u00f5es das duas superf\u00edcies da membrana. \nOs fosfol\u00edpidos s\u00e3o mol\u00e9culas antip\u00e1ticas e disp\u00f5em-se em bicamada com a por\u00e7\u00e3o hidr\u00f3foba n\u00e3o polar ( caudas de \u00e1cidos gordos) dirigida para o centro da membrana e com a por\u00e7\u00e3o hidrof\u00edlica polar (cabe\u00e7a com terminal fosfato) direccionada para o exterior ou interior da c\u00e9lula. \nOs fosfol\u00edpidos mais abundantes s\u00e3o os fosfol\u00edpidos ligados \u00e0 colina (fosfatidilcolina e esfingomielina) e os aminofosfol\u00edpidos (fosfatidilserina e fosfatidiletanolamina). \nO fosfatidilglicerol, o fosfatidilinositol e a cardiolipina s\u00e3o tamb\u00e9m importantes mas est\u00e3o presentes em menores quantidades.\nAs Dimens\u00f5es da via de difus\u00e3o incluem a \u00e1rea de sec\u00e7\u00e3o e a dist\u00e2ncia. \nQuanto maior a \u00e1rea de sec\u00e7\u00e3o e menor a dist\u00e2ncia a percorrer, maior o fluxo. \nNo pulm\u00e3o e no intestino, onde a difus\u00e3o \u00e9 importante para a troca de subst\u00e2ncias entre os meios interno e externo, a \u00e1rea de difus\u00e3o \u00e9 grande e a dist\u00e2ncia a percorrer pequena. \n", "entities": [{"entity_id": "3519", "text": "7,5 a 10 nm", "label": "VALOR", "start_offset": 80, "end_offset": 91}]}] --------------------------------------------------------------------------------
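The dev file above stores one JSON object per document: a `doc_id`, the raw `doc_text`, and a list of `entities`, each with an `entity_id`, its surface `text`, a `label`, and character-level `start_offset`/`end_offset` into `doc_text`. The snippet below is a minimal inspection sketch, not part of the repository; the relative path assumes it is run from inside the ner_evaluation/ directory.

import json
from collections import Counter

# Load the HAREM dev split shown above (a list of document dicts).
with open('data/FirstHAREM-selective-dev.json', encoding='utf-8') as fd:
    documents = json.load(fd)

label_counts = Counter()
for doc in documents:
    text = doc['doc_text']
    for entity in doc['entities']:
        label_counts[entity['label']] += 1
        # Offsets are character indices into doc_text; flag any span that
        # does not reproduce the annotated surface text.
        span = text[entity['start_offset']:entity['end_offset']]
        if span != entity['text']:
            print('Offset mismatch:', doc['doc_id'], 'entity', entity['entity_id'])

print(len(documents), 'documents,', sum(label_counts.values()), 'entities')
print(label_counts)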