├── README.md ├── alphabet.py ├── bert ├── __pycache__ │ ├── modeling.cpython-36.pyc │ ├── modeling.cpython-37.pyc │ ├── optimization.cpython-36.pyc │ ├── optimization.cpython-37.pyc │ ├── sentiment_modeling.cpython-37.pyc │ └── tokenization.cpython-37.pyc ├── modeling.py ├── optimization.py ├── sentiment_modeling.py └── tokenization.py ├── crf_new.py ├── data └── readme ├── dataProcess.py ├── main.py ├── opinionMining.py └── relationAttention.py /README.md: -------------------------------------------------------------------------------- 1 | # SDRN 2 | Source code of the paper "Synchronous Double-channel Recurrent Network for Aspect-Opinion Pair Extraction, ACL 2020." 3 | 4 | #### Requirement: 5 | 6 | ``` 7 | python==3.6.8 8 | torch==0.4.0 9 | numpy==1.15.4 10 | ``` 11 | 12 | #### Dataset: 13 | 14-Res, 14-Lap, 15-Res: Download from https://drive.google.com/drive/folders/1wWK6fIvfYP-54afGDRN44VWlXuUAHs-l?usp=sharing 14 | 15 | MPQA:Download from http://www.cs.pitt.edu/mpqa/ 16 | 17 | JDPA: Download from http://verbs.colorado.edu/jdpacorpus/ 18 | 19 | #### Download BERT_Base: 20 | https://github.com/google-research/bert 21 | 22 | #### How to run: 23 | ``` 24 | python main.py --mode train # For training 25 | python main.py --mode test --test_model ./modelFinal.model # For testing 26 | ``` 27 | -------------------------------------------------------------------------------- /alphabet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | """ 5 | Alphabet maps objects to integer ids. It provides two way mapping from the index to the objects. 6 | """ 7 | import json 8 | import os 9 | import sys 10 | 11 | 12 | class Alphabet: 13 | def __init__(self, name, label=False, keep_growing=True): 14 | self.name = name 15 | self.UNKNOWN = "" 16 | self.label = label 17 | self.instance2index = {} 18 | self.instances = [] 19 | self.keep_growing = keep_growing 20 | 21 | # Index 0 is occupied by default, all else following. 22 | self.default_index = 0 23 | self.next_index = 1 24 | if not self.label: 25 | self.add(self.UNKNOWN) 26 | 27 | def clear(self, keep_growing=True): 28 | self.instance2index = {} 29 | self.instances = [] 30 | self.keep_growing = keep_growing 31 | 32 | # Index 0 is occupied by default, all else following. 33 | self.default_index = 0 34 | self.next_index = 1 35 | 36 | def add(self, instance): 37 | if instance not in self.instance2index: 38 | self.instances.append(instance) 39 | self.instance2index[instance] = self.next_index 40 | self.next_index += 1 41 | 42 | def get_index(self, instance): 43 | try: 44 | return self.instance2index[instance] 45 | except KeyError: 46 | if self.keep_growing: 47 | index = self.next_index 48 | self.add(instance) 49 | return index 50 | else: 51 | return self.instance2index[self.UNKNOWN] 52 | 53 | def get_instance(self, index): 54 | if index == 0: 55 | if self.label: 56 | return self.instances[0] 57 | # First index is occupied by the wildcard element. 
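            # Note (added comment): instance2index assigns ids starting from 1 (index 0 is reserved as the
            # default/padding index with no stored instance), while self.instances is a plain 0-based list,
            # hence the index - 1 lookup below.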
58 |             return None
59 |         try:
60 |             return self.instances[index - 1]
61 |         except IndexError:
62 |             print('WARNING: Alphabet get_instance, unknown instance, returning the first label.')
63 |             return self.instances[0]
64 | 
65 |     def size(self):
66 |         # if self.label:
67 |         #     return len(self.instances)
68 |         # else:
69 |         return len(self.instances) + 1
70 | 
71 |     def iteritems(self):
72 |         if sys.version_info[0] < 3:  # Python 2 uses iteritems(); Python 3 uses items()
73 |             return self.instance2index.iteritems()
74 |         else:
75 |             return self.instance2index.items()
76 | 
77 |     def enumerate_items(self, start=1):
78 |         if start < 1 or start >= self.size():
79 |             raise IndexError("Enumerate is allowed between [1 : size of the alphabet)")
80 |         return zip(range(start, len(self.instances) + 1), self.instances[start - 1:])
81 | 
82 |     def close(self):
83 |         self.keep_growing = False
84 | 
85 |     def open(self):
86 |         self.keep_growing = True
87 | 
88 |     def get_content(self):
89 |         return {'instance2index': self.instance2index, 'instances': self.instances}
90 | 
91 |     def from_json(self, data):
92 |         self.instances = data["instances"]
93 |         self.instance2index = data["instance2index"]
94 | 
95 |     def save(self, output_directory, name=None):
96 |         """
97 |         Save both alphabet records to the given directory.
98 |         :param output_directory: Directory to save model and weights.
99 |         :param name: The alphabet saving name, optional.
100 |         :return:
101 |         """
102 |         saving_name = name if name else self.name
103 |         try:
104 |             json.dump(self.get_content(), open(os.path.join(output_directory, saving_name + ".json"), 'w'))
105 |         except Exception as e:
106 |             print("Exception: Alphabet is not saved: %s" % repr(e))
107 | 
108 |     def load(self, input_directory, name=None):
109 |         """
110 |         Load model architecture and weights from the given directory. This allows us to use old models even if the structure
111 |         changes.
112 | :param input_directory: Directory to save model and weights 113 | :return: 114 | """ 115 | loading_name = name if name else self.__name 116 | self.from_json(json.load(open(os.path.join(input_directory, loading_name + ".json")))) 117 | -------------------------------------------------------------------------------- /bert/__pycache__/modeling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshaowei57/SDRN/454ed1d1afcfd30d81d6bf8bf63eeacae619dcc0/bert/__pycache__/modeling.cpython-36.pyc -------------------------------------------------------------------------------- /bert/__pycache__/modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshaowei57/SDRN/454ed1d1afcfd30d81d6bf8bf63eeacae619dcc0/bert/__pycache__/modeling.cpython-37.pyc -------------------------------------------------------------------------------- /bert/__pycache__/optimization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshaowei57/SDRN/454ed1d1afcfd30d81d6bf8bf63eeacae619dcc0/bert/__pycache__/optimization.cpython-36.pyc -------------------------------------------------------------------------------- /bert/__pycache__/optimization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshaowei57/SDRN/454ed1d1afcfd30d81d6bf8bf63eeacae619dcc0/bert/__pycache__/optimization.cpython-37.pyc -------------------------------------------------------------------------------- /bert/__pycache__/sentiment_modeling.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshaowei57/SDRN/454ed1d1afcfd30d81d6bf8bf63eeacae619dcc0/bert/__pycache__/sentiment_modeling.cpython-37.pyc -------------------------------------------------------------------------------- /bert/__pycache__/tokenization.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chenshaowei57/SDRN/454ed1d1afcfd30d81d6bf8bf63eeacae619dcc0/bert/__pycache__/tokenization.cpython-37.pyc -------------------------------------------------------------------------------- /bert/modeling.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
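# Note (added comment): this file appears to be a vendored copy of the early HuggingFace
# pytorch-pretrained-bert BERT implementation. Shape comments below use N = batch size,
# L = sequence length, K = number of attention heads, H = hidden size.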
15 | """PyTorch BERT model.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import copy 22 | import json 23 | import math 24 | import six 25 | import torch 26 | import torch.nn as nn 27 | from torch.nn import CrossEntropyLoss 28 | 29 | 30 | def gelu(x): 31 | """Implementation of the gelu activation function. 32 | For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 33 | 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 34 | """ 35 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 36 | 37 | 38 | class BertConfig(object): 39 | """Configuration class to store the configuration of a `BertModel`. 40 | """ 41 | def __init__(self, 42 | vocab_size, 43 | hidden_size=768, 44 | num_hidden_layers=12, 45 | num_attention_heads=12, 46 | intermediate_size=3072, 47 | hidden_act="gelu", 48 | hidden_dropout_prob=0.1, 49 | attention_probs_dropout_prob=0.1, 50 | max_position_embeddings=512, 51 | type_vocab_size=16, 52 | initializer_range=0.02): 53 | """Constructs BertConfig. 54 | 55 | Args: 56 | vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. 57 | hidden_size: Size of the encoder layers and the pooler layer. 58 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 59 | num_attention_heads: Number of attention heads for each attention layer in 60 | the Transformer encoder. 61 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 62 | layer in the Transformer encoder. 63 | hidden_act: The non-linear activation function (function or string) in the 64 | encoder and pooler. 65 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 66 | layers in the embeddings, encoder, and pooler. 67 | attention_probs_dropout_prob: The dropout ratio for the attention 68 | probabilities. 69 | max_position_embeddings: The maximum sequence length that this model might 70 | ever be used with. Typically set this to something large just in case 71 | (e.g., 512 or 1024 or 2048). 72 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 73 | `BertModel`. 74 | initializer_range: The sttdev of the truncated_normal_initializer for 75 | initializing all weight matrices. 
76 | """ 77 | self.vocab_size = vocab_size 78 | self.hidden_size = hidden_size 79 | self.num_hidden_layers = num_hidden_layers 80 | self.num_attention_heads = num_attention_heads 81 | self.hidden_act = hidden_act 82 | self.intermediate_size = intermediate_size 83 | self.hidden_dropout_prob = hidden_dropout_prob 84 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 85 | self.max_position_embeddings = max_position_embeddings 86 | self.type_vocab_size = type_vocab_size 87 | self.initializer_range = initializer_range 88 | 89 | @classmethod 90 | def from_dict(cls, json_object): 91 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 92 | config = BertConfig(vocab_size=None) 93 | for (key, value) in six.iteritems(json_object): 94 | config.__dict__[key] = value 95 | return config 96 | 97 | @classmethod 98 | def from_json_file(cls, json_file): 99 | """Constructs a `BertConfig` from a json file of parameters.""" 100 | with open(json_file, "r") as reader: 101 | text = reader.read() 102 | return cls.from_dict(json.loads(text)) 103 | 104 | def to_dict(self): 105 | """Serializes this instance to a Python dictionary.""" 106 | output = copy.deepcopy(self.__dict__) 107 | return output 108 | 109 | def to_json_string(self): 110 | """Serializes this instance to a JSON string.""" 111 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 112 | 113 | 114 | class BERTLayerNorm(nn.Module): 115 | def __init__(self, config, variance_epsilon=1e-12): 116 | """Construct a layernorm module in the TF style (epsilon inside the square root). 117 | """ 118 | super(BERTLayerNorm, self).__init__() 119 | self.gamma = nn.Parameter(torch.ones(config.hidden_size)) 120 | self.beta = nn.Parameter(torch.zeros(config.hidden_size)) 121 | self.variance_epsilon = variance_epsilon 122 | 123 | def forward(self, x): 124 | u = x.mean(-1, keepdim=True) 125 | s = (x - u).pow(2).mean(-1, keepdim=True) 126 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 127 | return self.gamma * x + self.beta 128 | 129 | 130 | class BERTEmbeddings(nn.Module): 131 | def __init__(self, config): 132 | super(BERTEmbeddings, self).__init__() 133 | """Construct the embedding module from word, position and token_type embeddings. 
134 | """ 135 | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size) 136 | self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) 137 | self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) 138 | 139 | # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load 140 | # any TensorFlow checkpoint file 141 | self.LayerNorm = BERTLayerNorm(config) 142 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 143 | 144 | def forward(self, input_ids, token_type_ids=None): 145 | seq_length = input_ids.size(1) 146 | position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) 147 | position_ids = position_ids.unsqueeze(0).expand_as(input_ids) 148 | if token_type_ids is None: 149 | token_type_ids = torch.zeros_like(input_ids) 150 | 151 | words_embeddings = self.word_embeddings(input_ids) 152 | position_embeddings = self.position_embeddings(position_ids) 153 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 154 | 155 | embeddings = words_embeddings + position_embeddings + token_type_embeddings 156 | embeddings = self.LayerNorm(embeddings) 157 | embeddings = self.dropout(embeddings) 158 | return embeddings 159 | 160 | 161 | class BERTSelfAttention(nn.Module): 162 | def __init__(self, config): 163 | super(BERTSelfAttention, self).__init__() 164 | if config.hidden_size % config.num_attention_heads != 0: 165 | raise ValueError( 166 | "The hidden size (%d) is not a multiple of the number of attention " 167 | "heads (%d)" % (config.hidden_size, config.num_attention_heads)) 168 | self.num_attention_heads = config.num_attention_heads 169 | self.attention_head_size = int(config.hidden_size / config.num_attention_heads) 170 | self.all_head_size = self.num_attention_heads * self.attention_head_size 171 | 172 | self.query = nn.Linear(config.hidden_size, self.all_head_size) 173 | self.key = nn.Linear(config.hidden_size, self.all_head_size) 174 | self.value = nn.Linear(config.hidden_size, self.all_head_size) 175 | 176 | self.dropout = nn.Dropout(config.attention_probs_dropout_prob) 177 | 178 | def transpose_for_scores(self, x): 179 | new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) 180 | x = x.view(*new_x_shape) 181 | return x.permute(0, 2, 1, 3) 182 | 183 | def forward(self, hidden_states, attention_mask): 184 | mixed_query_layer = self.query(hidden_states) # [N, L, H] 185 | mixed_key_layer = self.key(hidden_states) 186 | mixed_value_layer = self.value(hidden_states) 187 | 188 | query_layer = self.transpose_for_scores(mixed_query_layer) # [N, K, L, H//K] 189 | key_layer = self.transpose_for_scores(mixed_key_layer) 190 | value_layer = self.transpose_for_scores(mixed_value_layer) 191 | 192 | # Take the dot product between "query" and "key" to get the raw attention scores. 193 | attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) # [N, K, L, L] 194 | attention_scores = attention_scores / math.sqrt(self.attention_head_size) 195 | # Apply the attention mask is (precomputed for all layers in BertModel forward() function) 196 | attention_scores = attention_scores + attention_mask 197 | 198 | # Normalize the attention scores to probabilities. 199 | attention_probs = nn.Softmax(dim=-1)(attention_scores) 200 | 201 | # This is actually dropping out entire tokens to attend to, which might 202 | # seem a bit unusual, but is taken from the original Transformer paper. 
203 | attention_probs = self.dropout(attention_probs) 204 | 205 | context_layer = torch.matmul(attention_probs, value_layer) # [N, K, L, H//K] 206 | context_layer = context_layer.permute(0, 2, 1, 3).contiguous() # [N, L, K, H//K] 207 | new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) 208 | context_layer = context_layer.view(*new_context_layer_shape) # [N, L, H] 209 | return context_layer 210 | 211 | 212 | class BERTSelfOutput(nn.Module): 213 | def __init__(self, config): 214 | super(BERTSelfOutput, self).__init__() 215 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 216 | self.LayerNorm = BERTLayerNorm(config) 217 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 218 | 219 | def forward(self, hidden_states, input_tensor): 220 | hidden_states = self.dense(hidden_states) 221 | hidden_states = self.dropout(hidden_states) 222 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 223 | return hidden_states 224 | 225 | 226 | class BERTAttention(nn.Module): 227 | def __init__(self, config): 228 | super(BERTAttention, self).__init__() 229 | self.self = BERTSelfAttention(config) 230 | self.output = BERTSelfOutput(config) 231 | 232 | def forward(self, input_tensor, attention_mask): 233 | self_output = self.self(input_tensor, attention_mask) 234 | attention_output = self.output(self_output, input_tensor) 235 | return attention_output 236 | 237 | 238 | class BERTIntermediate(nn.Module): 239 | def __init__(self, config): 240 | super(BERTIntermediate, self).__init__() 241 | self.dense = nn.Linear(config.hidden_size, config.intermediate_size) 242 | self.intermediate_act_fn = gelu 243 | 244 | def forward(self, hidden_states): 245 | hidden_states = self.dense(hidden_states) 246 | hidden_states = self.intermediate_act_fn(hidden_states) 247 | return hidden_states 248 | 249 | 250 | class BERTOutput(nn.Module): 251 | def __init__(self, config): 252 | super(BERTOutput, self).__init__() 253 | self.dense = nn.Linear(config.intermediate_size, config.hidden_size) 254 | self.LayerNorm = BERTLayerNorm(config) 255 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 256 | 257 | def forward(self, hidden_states, input_tensor): 258 | hidden_states = self.dense(hidden_states) 259 | hidden_states = self.dropout(hidden_states) 260 | hidden_states = self.LayerNorm(hidden_states + input_tensor) 261 | return hidden_states 262 | 263 | 264 | class BERTLayer(nn.Module): 265 | def __init__(self, config): 266 | super(BERTLayer, self).__init__() 267 | self.attention = BERTAttention(config) 268 | self.intermediate = BERTIntermediate(config) 269 | self.output = BERTOutput(config) 270 | 271 | def forward(self, hidden_states, attention_mask): 272 | attention_output = self.attention(hidden_states, attention_mask) 273 | intermediate_output = self.intermediate(attention_output) 274 | layer_output = self.output(intermediate_output, attention_output) 275 | return layer_output 276 | 277 | 278 | class BERTEncoder(nn.Module): 279 | def __init__(self, config): 280 | super(BERTEncoder, self).__init__() 281 | layer = BERTLayer(config) 282 | self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) 283 | 284 | def forward(self, hidden_states, attention_mask): 285 | all_encoder_layers = [] 286 | for layer_module in self.layer: 287 | hidden_states = layer_module(hidden_states, attention_mask) 288 | all_encoder_layers.append(hidden_states) 289 | return all_encoder_layers 290 | 291 | 292 | class BERTPooler(nn.Module): 293 | def __init__(self, config): 294 | 
super(BERTPooler, self).__init__() 295 | self.dense = nn.Linear(config.hidden_size, config.hidden_size) 296 | self.activation = nn.Tanh() 297 | 298 | def forward(self, hidden_states): 299 | # We "pool" the model by simply taking the hidden state corresponding 300 | # to the first token. 301 | first_token_tensor = hidden_states[:, 0] 302 | pooled_output = self.dense(first_token_tensor) 303 | pooled_output = self.activation(pooled_output) 304 | return pooled_output 305 | 306 | 307 | class BertModel(nn.Module): 308 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 309 | 310 | Example usage: 311 | ```python 312 | # Already been converted into WordPiece token ids 313 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 314 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 315 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]]) 316 | 317 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 318 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 319 | 320 | model = modeling.BertModel(config=config) 321 | all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) 322 | ``` 323 | """ 324 | def __init__(self, config: BertConfig): 325 | """Constructor for BertModel. 326 | 327 | Args: 328 | config: `BertConfig` instance. 329 | """ 330 | super(BertModel, self).__init__() 331 | self.embeddings = BERTEmbeddings(config) 332 | self.encoder = BERTEncoder(config) 333 | self.pooler = BERTPooler(config) 334 | 335 | def forward(self, input_ids, token_type_ids=None, attention_mask=None): 336 | if attention_mask is None: 337 | attention_mask = torch.ones_like(input_ids) 338 | if token_type_ids is None: 339 | token_type_ids = torch.zeros_like(input_ids) 340 | 341 | # We create a 3D attention mask from a 2D tensor mask. 342 | # Sizes are [batch_size, 1, 1, to_seq_length] 343 | # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] 344 | # this attention mask is more simple than the triangular masking of causal attention 345 | # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 346 | extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) 347 | 348 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 349 | # masked positions, this operation will create a tensor which is 0.0 for 350 | # positions we want to attend and -10000.0 for masked positions. 351 | # Since we are adding it to the raw scores before the softmax, this is 352 | # effectively the same as removing these entirely. 353 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility 354 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 355 | 356 | embedding_output = self.embeddings(input_ids, token_type_ids) 357 | all_encoder_layers = self.encoder(embedding_output, extended_attention_mask) 358 | sequence_output = all_encoder_layers[-1] 359 | pooled_output = self.pooler(sequence_output) 360 | return all_encoder_layers, pooled_output 361 | 362 | 363 | class BertForSequenceClassification(nn.Module): 364 | """BERT model for classification. 365 | This module is composed of the BERT model with a linear layer on top of 366 | the pooled output. 
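    `num_labels` sets the output size of the classifier; when `labels` are provided, forward() returns (loss, logits), otherwise only the logits.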
367 | 368 | Example usage: 369 | ```python 370 | # Already been converted into WordPiece token ids 371 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 372 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 373 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]]) 374 | 375 | config = BertConfig(vocab_size=32000, hidden_size=512, 376 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 377 | 378 | num_labels = 2 379 | 380 | model = BertForSequenceClassification(config, num_labels) 381 | logits = model(input_ids, token_type_ids, input_mask) 382 | ``` 383 | """ 384 | def __init__(self, config, num_labels): 385 | super(BertForSequenceClassification, self).__init__() 386 | self.bert = BertModel(config) 387 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 388 | self.classifier = nn.Linear(config.hidden_size, num_labels) 389 | 390 | def init_weights(module): 391 | if isinstance(module, (nn.Linear, nn.Embedding)): 392 | # Slightly different from the TF version which uses truncated_normal for initialization 393 | # cf https://github.com/pytorch/pytorch/pull/5617 394 | module.weight.data.normal_(mean=0.0, std=config.initializer_range) 395 | elif isinstance(module, BERTLayerNorm): 396 | module.beta.data.normal_(mean=0.0, std=config.initializer_range) 397 | module.gamma.data.normal_(mean=0.0, std=config.initializer_range) 398 | if isinstance(module, nn.Linear): 399 | module.bias.data.zero_() 400 | self.apply(init_weights) 401 | 402 | def forward(self, input_ids, token_type_ids, attention_mask, labels=None): 403 | _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask) 404 | pooled_output = self.dropout(pooled_output) 405 | logits = self.classifier(pooled_output) 406 | 407 | if labels is not None: 408 | loss_fct = CrossEntropyLoss() 409 | loss = loss_fct(logits, labels) 410 | return loss, logits 411 | else: 412 | return logits 413 | 414 | 415 | class BertForQuestionAnswering(nn.Module): 416 | """BERT model for Question Answering (span extraction). 
417 | This module is composed of the BERT model with a linear layer on top of 418 | the sequence output that computes start_logits and end_logits 419 | 420 | Example usage: 421 | ```python 422 | # Already been converted into WordPiece token ids 423 | input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) 424 | input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) 425 | token_type_ids = torch.LongTensor([[0, 0, 1], [0, 2, 0]]) 426 | 427 | config = BertConfig(vocab_size=32000, hidden_size=512, 428 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 429 | 430 | model = BertForQuestionAnswering(config) 431 | start_logits, end_logits = model(input_ids, token_type_ids, input_mask) 432 | ``` 433 | """ 434 | def __init__(self, config): 435 | super(BertForQuestionAnswering, self).__init__() 436 | self.bert = BertModel(config) 437 | # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version 438 | # self.dropout = nn.Dropout(config.hidden_dropout_prob) 439 | self.qa_outputs = nn.Linear(config.hidden_size, 2) 440 | 441 | def init_weights(module): 442 | if isinstance(module, (nn.Linear, nn.Embedding)): 443 | # Slightly different from the TF version which uses truncated_normal for initialization 444 | # cf https://github.com/pytorch/pytorch/pull/5617 445 | module.weight.data.normal_(mean=0.0, std=config.initializer_range) 446 | elif isinstance(module, BERTLayerNorm): 447 | module.beta.data.normal_(mean=0.0, std=config.initializer_range) 448 | module.gamma.data.normal_(mean=0.0, std=config.initializer_range) 449 | if isinstance(module, nn.Linear): 450 | module.bias.data.zero_() 451 | self.apply(init_weights) 452 | 453 | def forward(self, input_ids, token_type_ids, attention_mask, start_positions=None, end_positions=None): 454 | all_encoder_layers, _ = self.bert(input_ids, token_type_ids, attention_mask) 455 | sequence_output = all_encoder_layers[-1] 456 | logits = self.qa_outputs(sequence_output) 457 | start_logits, end_logits = logits.split(1, dim=-1) 458 | start_logits = start_logits.squeeze(-1) 459 | end_logits = end_logits.squeeze(-1) 460 | 461 | if start_positions is not None and end_positions is not None: 462 | # If we are on multi-GPU, split add a dimension 463 | if len(start_positions.size()) > 1: 464 | start_positions = start_positions.squeeze(-1) 465 | if len(end_positions.size()) > 1: 466 | end_positions = end_positions.squeeze(-1) 467 | # sometimes the start/end positions are outside our model inputs, we ignore these terms 468 | ignored_index = start_logits.size(1) 469 | start_positions.clamp_(0, ignored_index) 470 | end_positions.clamp_(0, ignored_index) 471 | 472 | loss_fct = CrossEntropyLoss(ignore_index=ignored_index) 473 | start_loss = loss_fct(start_logits, start_positions) 474 | end_loss = loss_fct(end_logits, end_positions) 475 | total_loss = (start_loss + end_loss) / 2 476 | return total_loss 477 | else: 478 | return start_logits, end_logits -------------------------------------------------------------------------------- /bert/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import math 18 | import torch 19 | from torch.optim import Optimizer 20 | from torch.nn.utils import clip_grad_norm_ 21 | 22 | def warmup_cosine(x, warmup=0.002): 23 | if x < warmup: 24 | return x/warmup 25 | return 0.5 * (1.0 + torch.cos(math.pi * x)) 26 | 27 | def warmup_constant(x, warmup=0.002): 28 | if x < warmup: 29 | return x/warmup 30 | return 1.0 31 | 32 | def warmup_linear(x, warmup=0.002): 33 | if x < warmup: 34 | return x/warmup 35 | return 1.0 - x 36 | 37 | SCHEDULES = { 38 | 'warmup_cosine':warmup_cosine, 39 | 'warmup_constant':warmup_constant, 40 | 'warmup_linear':warmup_linear, 41 | } 42 | 43 | 44 | class BERTAdam(Optimizer): 45 | """Implements BERT version of Adam algorithm with weight decay fix (and no ). 46 | Params: 47 | lr: learning rate 48 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 49 | t_total: total number of training steps for the learning 50 | rate schedule, -1 means constant learning rate. Default: -1 51 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 52 | b1: Adams b1. Default: 0.9 53 | b2: Adams b2. Default: 0.999 54 | e: Adams epsilon. Default: 1e-6 55 | weight_decay_rate: Weight decay. Default: 0.01 56 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 57 | """ 58 | def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear', 59 | b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01, 60 | max_grad_norm=1.0): 61 | if not lr >= 0.0: 62 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 63 | if schedule not in SCHEDULES: 64 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 65 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 66 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 67 | if not 0.0 <= b1 < 1.0: 68 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 69 | if not 0.0 <= b2 < 1.0: 70 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 71 | if not e >= 0.0: 72 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 73 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 74 | b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate, 75 | max_grad_norm=max_grad_norm) 76 | super(BERTAdam, self).__init__(params, defaults) 77 | 78 | def get_lr(self): 79 | lr = [] 80 | for group in self.param_groups: 81 | for p in group['params']: 82 | state = self.state[p] 83 | if len(state) == 0: 84 | return [0] 85 | if group['t_total'] != -1: 86 | schedule_fct = SCHEDULES[group['schedule']] 87 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 88 | else: 89 | lr_scheduled = group['lr'] 90 | lr.append(lr_scheduled) 91 | return lr 92 | 93 | def step(self, closure=None): 94 | """Performs a single optimization step. 95 | 96 | Arguments: 97 | closure (callable, optional): A closure that reevaluates the model 98 | and returns the loss. 
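        Note: following the original BERT TensorFlow optimizer, this implementation applies decoupled weight decay to the update and omits Adam's bias correction (see the commented-out bias_correction terms below).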
99 | """ 100 | loss = None 101 | if closure is not None: 102 | loss = closure() 103 | 104 | for group in self.param_groups: 105 | for p in group['params']: 106 | if p.grad is None: 107 | continue 108 | grad = p.grad.data 109 | if grad.is_sparse: 110 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 111 | 112 | state = self.state[p] 113 | 114 | # State initialization 115 | if len(state) == 0: 116 | state['step'] = 0 117 | # Exponential moving average of gradient values 118 | state['next_m'] = torch.zeros_like(p.data) 119 | # Exponential moving average of squared gradient values 120 | state['next_v'] = torch.zeros_like(p.data) 121 | 122 | next_m, next_v = state['next_m'], state['next_v'] 123 | beta1, beta2 = group['b1'], group['b2'] 124 | 125 | # Add grad clipping 126 | if group['max_grad_norm'] > 0: 127 | clip_grad_norm_(p, group['max_grad_norm']) 128 | 129 | # Decay the first and second moment running average coefficient 130 | # In-place operations to update the averages at the same time 131 | next_m.mul_(beta1).add_(1 - beta1, grad) 132 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 133 | update = next_m / (next_v.sqrt() + group['e']) 134 | 135 | # Just adding the square of the weights to the loss function is *not* 136 | # the correct way of using L2 regularization/weight decay with Adam, 137 | # since that will interact with the m and v parameters in strange ways. 138 | # 139 | # Instead we want ot decay the weights in a manner that doesn't interact 140 | # with the m/v parameters. This is equivalent to adding the square 141 | # of the weights to the loss with plain (non-momentum) SGD. 142 | if group['weight_decay_rate'] > 0.0: 143 | update += group['weight_decay_rate'] * p.data 144 | 145 | if group['t_total'] != -1: 146 | schedule_fct = SCHEDULES[group['schedule']] 147 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 148 | else: 149 | lr_scheduled = group['lr'] 150 | 151 | update_with_lr = lr_scheduled * update 152 | p.data.add_(-update_with_lr) 153 | 154 | state['step'] += 1 155 | 156 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 157 | # bias_correction1 = 1 - beta1 ** state['step'] 158 | # bias_correction2 = 1 - beta2 ** state['step'] 159 | 160 | return loss 161 | -------------------------------------------------------------------------------- /bert/sentiment_modeling.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.nn import CrossEntropyLoss 8 | from bert.modeling import BertModel, BERTLayerNorm 9 | from allennlp.modules import ConditionalRandomField 10 | 11 | def flatten(x): 12 | if len(x.size()) == 2: 13 | batch_size = x.size()[0] 14 | seq_length = x.size()[1] 15 | return x.view([batch_size * seq_length]) 16 | elif len(x.size()) == 3: 17 | batch_size = x.size()[0] 18 | seq_length = x.size()[1] 19 | hidden_size = x.size()[2] 20 | return x.view([batch_size * seq_length, hidden_size]) 21 | else: 22 | raise Exception() 23 | 24 | def reconstruct(x, ref): 25 | if len(x.size()) == 1: 26 | batch_size = ref.size()[0] 27 | turn_num = ref.size()[1] 28 | return x.view([batch_size, turn_num]) 29 | elif len(x.size()) == 2: 30 | batch_size = ref.size()[0] 31 | turn_num = ref.size()[1] 32 | sequence_length = x.size()[1] 33 | return x.view([batch_size, 
turn_num, sequence_length]) 34 | else: 35 | raise Exception() 36 | 37 | def flatten_emb_by_sentence(emb, emb_mask): 38 | batch_size = emb.size()[0] 39 | seq_length = emb.size()[1] 40 | flat_emb = flatten(emb) 41 | flat_emb_mask = emb_mask.view([batch_size * seq_length]) 42 | return flat_emb[flat_emb_mask.nonzero().squeeze(), :] 43 | 44 | def get_span_representation(span_starts, span_ends, input, input_mask): 45 | ''' 46 | :param span_starts: [N, M] 47 | :param span_ends: [N, M] 48 | :param input: [N, L, D] 49 | :param input_mask: [N, L] 50 | :return: [N*M, JR, D], [N*M, JR] 51 | ''' 52 | input_mask = input_mask.to(dtype=span_starts.dtype) # fp16 compatibility 53 | input_len = torch.sum(input_mask, dim=-1) # [N] 54 | word_offset = torch.cumsum(input_len, dim=0) # [N] 55 | word_offset -= input_len 56 | 57 | span_starts_offset = span_starts + word_offset.unsqueeze(1) 58 | span_ends_offset = span_ends + word_offset.unsqueeze(1) 59 | 60 | span_starts_offset = span_starts_offset.view([-1]) # [N*M] 61 | span_ends_offset = span_ends_offset.view([-1]) 62 | 63 | span_width = span_ends_offset - span_starts_offset + 1 64 | JR = torch.max(span_width) 65 | 66 | context_outputs = flatten_emb_by_sentence(input, input_mask) # [= 0x4E00 and cp <= 0x9FFF) or # 213 | (cp >= 0x3400 and cp <= 0x4DBF) or # 214 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 215 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 216 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 217 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 218 | (cp >= 0xF900 and cp <= 0xFAFF) or # 219 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 220 | return True 221 | 222 | return False 223 | 224 | def _clean_text(self, text): 225 | """Performs invalid character removal and whitespace cleanup on text.""" 226 | output = [] 227 | for char in text: 228 | cp = ord(char) 229 | if cp == 0 or cp == 0xfffd or _is_control(char): 230 | continue 231 | if _is_whitespace(char): 232 | output.append(" ") 233 | else: 234 | output.append(char) 235 | return "".join(output) 236 | 237 | 238 | class WordpieceTokenizer(object): 239 | """Runs WordPiece tokenization.""" 240 | 241 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 242 | self.vocab = vocab 243 | self.unk_token = unk_token 244 | self.max_input_chars_per_word = max_input_chars_per_word 245 | 246 | def tokenize(self, text): 247 | """Tokenizes a piece of text into its word pieces. 248 | 249 | This uses a greedy longest-match-first algorithm to perform tokenization 250 | using the given vocabulary. 251 | 252 | For example: 253 | input = "unaffable" 254 | output = ["un", "##aff", "##able"] 255 | 256 | Args: 257 | text: A single token or whitespace separated tokens. This should have 258 | already been passed through `BasicTokenizer. 259 | 260 | Returns: 261 | A list of wordpiece tokens. 
262 | """ 263 | 264 | text = convert_to_unicode(text) 265 | 266 | output_tokens = [] 267 | for token in whitespace_tokenize(text): 268 | chars = list(token) 269 | if len(chars) > self.max_input_chars_per_word: 270 | output_tokens.append(self.unk_token) 271 | continue 272 | 273 | is_bad = False 274 | start = 0 275 | sub_tokens = [] 276 | while start < len(chars): 277 | end = len(chars) 278 | cur_substr = None 279 | while start < end: 280 | substr = "".join(chars[start:end]) 281 | if start > 0: 282 | substr = "##" + substr 283 | if substr in self.vocab: 284 | cur_substr = substr 285 | break 286 | end -= 1 287 | if cur_substr is None: 288 | is_bad = True 289 | break 290 | sub_tokens.append(cur_substr) 291 | start = end 292 | 293 | if is_bad: 294 | output_tokens.append(self.unk_token) 295 | else: 296 | output_tokens.extend(sub_tokens) 297 | return output_tokens 298 | 299 | 300 | def _is_whitespace(char): 301 | """Checks whether `chars` is a whitespace character.""" 302 | # \t, \n, and \r are technically contorl characters but we treat them 303 | # as whitespace since they are generally considered as such. 304 | if char == " " or char == "\t" or char == "\n" or char == "\r": 305 | return True 306 | cat = unicodedata.category(char) 307 | if cat == "Zs": 308 | return True 309 | return False 310 | 311 | 312 | def _is_control(char): 313 | """Checks whether `chars` is a control character.""" 314 | # These are technically control characters but we count them as whitespace 315 | # characters. 316 | if char == "\t" or char == "\n" or char == "\r": 317 | return False 318 | cat = unicodedata.category(char) 319 | if cat.startswith("C"): 320 | return True 321 | return False 322 | 323 | 324 | def _is_punctuation(char): 325 | """Checks whether `chars` is a punctuation character.""" 326 | cp = ord(char) 327 | # We treat all non-letter/number ASCII as punctuation. 328 | # Characters such as "^", "$", and "`" are not in the Unicode 329 | # Punctuation class but we treat them as punctuation anyways, for 330 | # consistency. 331 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 332 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 333 | return True 334 | cat = unicodedata.category(char) 335 | if cat.startswith("P"): 336 | return True 337 | return False 338 | -------------------------------------------------------------------------------- /crf_new.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | import torch 4 | import torch.autograd as autograd 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | START_TAG = -2 8 | STOP_TAG = -1 9 | 10 | 11 | # Compute log sum exp in a numerically stable way for the forward algorithm 12 | def log_sum_exp(vec, m_size): 13 | """ 14 | calculate log of exp sum 15 | args: 16 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 17 | m_size : hidden_dim 18 | return: 19 | batch_size, hidden_dim 20 | """ 21 | _, idx = torch.max(vec, 1) # B * 1 * M 22 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 23 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, m_size) # B * M 24 | 25 | class CRF(nn.Module): 26 | 27 | def __init__(self, tagset_size, gpu): 28 | super(CRF, self).__init__() 29 | print("build CRF...") 30 | self.gpu = gpu 31 | # Matrix of transition parameters. Entry i,j is the score of transitioning from i to j. 
32 | self.tagset_size = tagset_size 33 | # # We add 2 here, because of START_TAG and STOP_TAG 34 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 35 | init_transitions = torch.zeros(self.tagset_size+2, self.tagset_size+2) 36 | init_transitions[:,START_TAG] = -10000.0 37 | init_transitions[STOP_TAG,:] = -10000.0 38 | init_transitions[:,0] = -10000.0 39 | init_transitions[0,:] = -10000.0 40 | if self.gpu: 41 | init_transitions = init_transitions.cuda() 42 | self.transitions = nn.Parameter(init_transitions) 43 | self.softmax = nn.Softmax(dim=1) 44 | # self.transitions = nn.Parameter(torch.Tensor(self.tagset_size+2, self.tagset_size+2)) 45 | # self.transitions.data.zero_() 46 | 47 | def _calculate_PZ(self, feats, mask): 48 | """ 49 | input: 50 | feats: (batch, seq_len, self.tag_size+2) 51 | masks: (batch, seq_len) 52 | """ 53 | batch_size = feats.size(0) 54 | seq_len = feats.size(1) 55 | tag_size = feats.size(2) 56 | # print feats.view(seq_len, tag_size) 57 | assert(tag_size == self.tagset_size+2) 58 | mask = mask.transpose(1,0).contiguous() 59 | ins_num = seq_len * batch_size 60 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 61 | feats = feats.transpose(1,0).contiguous().view(ins_num,1, tag_size).expand(ins_num, tag_size, tag_size) 62 | ## need to consider start 63 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 64 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 65 | # build iter 66 | seq_iter = enumerate(scores) 67 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 68 | # only need start from start_tag 69 | partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 70 | 71 | ## add start score (from start to all tag, duplicate to batch_size) 72 | # partition = partition + self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 73 | # iter over last scores 74 | for idx, cur_values in seq_iter: 75 | # previous to_target is current from_target 76 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 77 | # cur_values: bat_size * from_target * to_target 78 | 79 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 80 | cur_partition = log_sum_exp(cur_values, tag_size) 81 | # print cur_partition.data 82 | 83 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 84 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 85 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 86 | 87 | ## effective updated partition part, only keep the partition value of mask value = 1 88 | masked_cur_partition = cur_partition.masked_select(mask_idx) 89 | ## let mask_idx broadcastable, to disable warning 90 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 91 | 92 | ## replace the partition where the maskvalue=1, other partition value keeps the same 93 | partition.masked_scatter_(mask_idx, masked_cur_partition) 94 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 95 | cur_values = self.transitions.view(1,tag_size, tag_size).expand(batch_size, tag_size, tag_size) + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 96 | cur_partition = 
log_sum_exp(cur_values, tag_size) 97 | final_partition = cur_partition[:, STOP_TAG] 98 | return final_partition.sum(), scores 99 | 100 | 101 | def _viterbi_decode(self, feats, mask): 102 | """ 103 | input: 104 | feats: (batch, seq_len, self.tag_size+2) 105 | mask: (batch, seq_len) 106 | output: 107 | decode_idx: (batch, seq_len) decoded sequence 108 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 109 | """ 110 | batch_size = feats.size(0) 111 | seq_len = feats.size(1) 112 | tag_size = feats.size(2) 113 | assert(tag_size == self.tagset_size+2) 114 | ## calculate sentence length for each sentence 115 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 116 | ## mask to (seq_len, batch_size) 117 | mask = mask.transpose(1,0).contiguous() 118 | ins_num = seq_len * batch_size 119 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 120 | feats = feats.transpose(1,0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 121 | ## need to consider start 122 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 123 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 124 | 125 | # build iter 126 | seq_iter = enumerate(scores) 127 | ## record the position of best score 128 | back_points = list() 129 | partition_history = list() 130 | probLists = list() 131 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 132 | # mask = 1 + (-1)*mask 133 | mask = (1 - mask.long()).byte() 134 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 135 | # only need start from start_tag 136 | partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 137 | # print "init part:",partition.size() 138 | partition_history.append(partition) 139 | # iter over last scores 140 | for idx, cur_values in seq_iter: 141 | # previous to_target is current from_target 142 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 143 | # cur_values: batch_size * from_target * to_target 144 | prb = self.softmax(cur_values) 145 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 146 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 147 | # print "cur value:", cur_values.size() 148 | partition, cur_bp = torch.max(cur_values, 1) 149 | prb = torch.gather(prb, 1, cur_bp.unsqueeze(1)) 150 | probLists.append(prb.squeeze(1)) 151 | # print "partsize:",partition.size() 152 | # exit(0) 153 | # print partition 154 | # print cur_bp 155 | # print "one best, ",idx 156 | partition_history.append(partition) 157 | ## cur_bp: (batch_size, tag_size) max source score position in current tag 158 | ## set padded label as 0, which will be filtered in post processing 159 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 160 | back_points.append(cur_bp) 161 | # exit(0) 162 | ### add score to final STOP_TAG 163 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1,0).contiguous() ## (batch_size, seq_len. 
tag_size) 164 | ### get the last position for each setences, and select the last partitions using gather() 165 | last_position = length_mask.view(batch_size,1,1).expand(batch_size, 1, tag_size) -1 166 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size,tag_size,1) 167 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 168 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1,tag_size, tag_size).expand(batch_size, tag_size, tag_size) 169 | _, last_bp = torch.max(last_values, 1) 170 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 171 | pad_zero1 = autograd.Variable(torch.zeros(batch_size, tag_size)).float() 172 | if self.gpu: 173 | pad_zero = pad_zero.cuda() 174 | pad_zero1 = pad_zero1.cuda() 175 | back_points.append(pad_zero) 176 | probLists.append(pad_zero1) 177 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 178 | probLists = torch.cat(probLists).view(seq_len, batch_size, tag_size) 179 | 180 | ## select end ids in STOP_TAG 181 | pointer = last_bp[:, STOP_TAG] 182 | insert_last = pointer.contiguous().view(batch_size,1,1).expand(batch_size,1, tag_size) 183 | back_points = back_points.transpose(1,0).contiguous() 184 | 185 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 186 | # print "lp:",last_position 187 | # print "il:",insert_last 188 | back_points.scatter_(1, last_position, insert_last) 189 | 190 | # print "bp:",back_points 191 | # exit(0) 192 | back_points = back_points.transpose(1,0).contiguous() 193 | ## decode from the end, padded position ids are 0, which will be filtered if following evaluation 194 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 195 | decode_prob = autograd.Variable(torch.FloatTensor(seq_len, batch_size)) 196 | 197 | if self.gpu: 198 | decode_idx = decode_idx.cuda() 199 | decode_prob = decode_prob.cuda() 200 | decode_idx[-1] = pointer.data 201 | decode_prob[-1] = torch.gather(probLists[-1], 1, pointer.contiguous().view(batch_size,1)).view(batch_size).data 202 | for idx in range(len(back_points)-2, -1, -1): 203 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size,1)).view(batch_size) 204 | dec_prob = torch.gather(probLists[idx], 1, pointer.contiguous().view(batch_size,1)).view(batch_size) 205 | decode_idx[idx] = pointer.data 206 | decode_prob[idx] = dec_prob.data 207 | path_score = None 208 | decode_idx = decode_idx.transpose(1,0) 209 | decode_prob = decode_prob.transpose(1,0) 210 | return decode_prob, decode_idx 211 | 212 | 213 | 214 | def forward(self, feats): 215 | path_score, best_path = self._viterbi_decode(feats) 216 | return path_score, best_path 217 | 218 | 219 | def _score_sentence(self, scores, mask, tags): 220 | """ 221 | input: 222 | scores: variable (seq_len, batch, tag_size, tag_size) 223 | mask: (batch, seq_len) 224 | tags: tensor (batch, seq_len) 225 | output: 226 | score: sum of score for gold sequences within whole batch 227 | """ 228 | # Gives the score of a provided tag sequence 229 | batch_size = scores.size(1) 230 | seq_len = scores.size(0) 231 | tag_size = scores.size(2) 232 | ## convert tag value into a new format, recorded label bigram information to index 233 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 234 | if self.gpu: 235 | new_tags = new_tags.cuda() 236 | for idx in range(seq_len): 237 | if idx == 0: 238 | ## start -> first 
score 239 | new_tags[:,0] = (tag_size - 2)*tag_size + tags[:,0] 240 | 241 | else: 242 | new_tags[:,idx] = tags[:,idx-1]*tag_size + tags[:,idx] 243 | 244 | ## transition for label to STOP_TAG 245 | end_transition = self.transitions[:,STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 246 | ## length for batch, last word position = length - 1 247 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 248 | ## index the label id of last word 249 | end_ids = torch.gather(tags, 1, length_mask - 1) 250 | 251 | ## index the transition score for end_id to STOP_TAG 252 | end_energy = torch.gather(end_transition, 1, end_ids) 253 | 254 | ## convert tag as (seq_len, batch_size, 1) 255 | new_tags = new_tags.transpose(1,0).contiguous().view(seq_len, batch_size, 1) 256 | ### need convert tags id to search from 400 positions of scores 257 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, batch_size) # seq_len * bat_size 258 | ## mask transpose to (seq_len, batch_size) 259 | tg_energy = tg_energy.masked_select(mask.transpose(1,0)) 260 | 261 | # ## calculate the score from START_TAG to first label 262 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 263 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 264 | 265 | ## add all score together 266 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 267 | gold_score = tg_energy.sum() + end_energy.sum() 268 | return gold_score 269 | 270 | def neg_log_likelihood_loss(self, feats, mask, tags): 271 | # nonegative log likelihood 272 | batch_size = feats.size(0) 273 | forward_score, scores = self._calculate_PZ(feats, mask) 274 | gold_score = self._score_sentence(scores, mask, tags) 275 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 276 | # exit(0) 277 | return forward_score - gold_score 278 | 279 | 280 | 281 | def _viterbi_decode_nbest(self, feats, mask, nbest): 282 | """ 283 | input: 284 | feats: (batch, seq_len, self.tag_size+2) 285 | mask: (batch, seq_len) 286 | output: 287 | decode_idx: (batch, nbest, seq_len) decoded sequence 288 | path_score: (batch, nbest) corresponding score for each sequence (to be implementated) 289 | nbest decode for sentence with one token is not well supported, to be optimized 290 | """ 291 | batch_size = feats.size(0) 292 | seq_len = feats.size(1) 293 | tag_size = feats.size(2) 294 | assert(tag_size == self.tagset_size+2) 295 | ## calculate sentence length for each sentence 296 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 297 | ## mask to (seq_len, batch_size) 298 | mask = mask.transpose(1,0).contiguous() 299 | ins_num = seq_len * batch_size 300 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 301 | feats = feats.transpose(1,0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 302 | ## need to consider start 303 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 304 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 305 | 306 | # build iter 307 | seq_iter = enumerate(scores) 308 | ## record the position of best score 309 | back_points = list() 310 | partition_history = list() 311 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 312 | # mask = 1 + (-1)*mask 313 | mask = (1 - mask.long()).byte() 314 | _, 
inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 315 | # only need start from start_tag 316 | partition = inivalues[:, START_TAG, :].clone() # bat_size * to_target_size 317 | ## initial partition [batch_size, tag_size] 318 | partition_history.append(partition.view(batch_size, tag_size, 1).expand(batch_size, tag_size, nbest)) 319 | # iter over last scores 320 | for idx, cur_values in seq_iter: 321 | if idx == 1: 322 | cur_values = cur_values.view(batch_size, tag_size, tag_size) + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 323 | else: 324 | # previous to_target is current from_target 325 | # partition: previous results log(exp(from_target)), #(batch_size * nbest * from_target) 326 | # cur_values: batch_size * from_target * to_target 327 | cur_values = cur_values.view(batch_size, tag_size, 1, tag_size).expand(batch_size, tag_size, nbest, tag_size) + partition.contiguous().view(batch_size, tag_size, nbest, 1).expand(batch_size, tag_size, nbest, tag_size) 328 | ## compare all nbest and all from target 329 | cur_values = cur_values.view(batch_size, tag_size*nbest, tag_size) 330 | # print "cur size:",cur_values.size() 331 | partition, cur_bp = torch.topk(cur_values, nbest, 1) 332 | ## cur_bp/partition: [batch_size, nbest, tag_size], id should be normize through nbest in following backtrace step 333 | # print partition[:,0,:] 334 | # print cur_bp[:,0,:] 335 | # print "nbest, ",idx 336 | if idx == 1: 337 | cur_bp = cur_bp*nbest 338 | partition = partition.transpose(2,1) 339 | cur_bp = cur_bp.transpose(2,1) 340 | 341 | # print partition 342 | # exit(0) 343 | #partition: (batch_size * to_target * nbest) 344 | #cur_bp: (batch_size * to_target * nbest) Notice the cur_bp number is the whole position of tag_size*nbest, need to convert when decode 345 | partition_history.append(partition) 346 | ## cur_bp: (batch_size,nbest, tag_size) topn source score position in current tag 347 | ## set padded label as 0, which will be filtered in post processing 348 | ## mask[idx] ? 
mask[idx-1] 349 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1, 1).expand(batch_size, tag_size, nbest), 0) 350 | # print cur_bp[0] 351 | back_points.append(cur_bp) 352 | ### add score to final STOP_TAG 353 | partition_history = torch.cat(partition_history,0).view(seq_len, batch_size, tag_size, nbest).transpose(1,0).contiguous() ## (batch_size, seq_len, nbest, tag_size) 354 | ### get the last position for each setences, and select the last partitions using gather() 355 | last_position = length_mask.view(batch_size,1,1,1).expand(batch_size, 1, tag_size, nbest) - 1 356 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, nbest, 1) 357 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 358 | last_values = last_partition.expand(batch_size, tag_size, nbest, tag_size) + self.transitions.view(1, tag_size, 1, tag_size).expand(batch_size, tag_size, nbest, tag_size) 359 | last_values = last_values.view(batch_size, tag_size*nbest, tag_size) 360 | end_partition, end_bp = torch.topk(last_values, nbest, 1) 361 | ## end_partition: (batch, nbest, tag_size) 362 | end_bp = end_bp.transpose(2,1) 363 | # end_bp: (batch, tag_size, nbest) 364 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size, nbest)).long() 365 | if self.gpu: 366 | pad_zero = pad_zero.cuda() 367 | back_points.append(pad_zero) 368 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size, nbest) 369 | 370 | ## select end ids in STOP_TAG 371 | pointer = end_bp[:, STOP_TAG, :] ## (batch_size, nbest) 372 | insert_last = pointer.contiguous().view(batch_size, 1, 1, nbest).expand(batch_size, 1, tag_size, nbest) 373 | back_points = back_points.transpose(1,0).contiguous() 374 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 375 | # print "lp:",last_position 376 | # print "il:",insert_last[0] 377 | # exit(0) 378 | ## copy the ids of last position:insert_last to back_points, though the last_position index 379 | ## last_position includes the length of batch sentences 380 | # print "old:", back_points[9,0,:,:] 381 | back_points.scatter_(1, last_position, insert_last) 382 | ## back_points: [batch_size, seq_length, tag_size, nbest] 383 | # print "new:", back_points[9,0,:,:] 384 | # exit(0) 385 | # print pointer[2] 386 | ''' 387 | back_points: in simple demonstratration 388 | x,x,x,x,x,x,x,x,x,7 389 | x,x,x,x,x,4,0,0,0,0 390 | x,x,6,0,0,0,0,0,0,0 391 | ''' 392 | 393 | back_points = back_points.transpose(1,0).contiguous() 394 | # print back_points[0] 395 | ## back_points: (seq_len, batch, tag_size, nbest) 396 | ## decode from the end, padded position ids are 0, which will be filtered in following evaluation 397 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size, nbest)) 398 | if self.gpu: 399 | decode_idx = decode_idx.cuda() 400 | decode_idx[-1] = pointer.data/nbest 401 | # print "pointer-1:",pointer[2] 402 | # exit(0) 403 | # use old mask, let 0 means has token 404 | for idx in range(len(back_points)-2, -1, -1): 405 | # print "pointer: ",idx, pointer[3] 406 | # print "back:",back_points[idx][3] 407 | # print "mask:",mask[idx+1,3] 408 | new_pointer = torch.gather(back_points[idx].view(batch_size, tag_size*nbest), 1, pointer.contiguous().view(batch_size,nbest)) 409 | decode_idx[idx] = new_pointer.data/nbest 410 | # # use new pointer to remember the last end nbest ids for non longest 411 | pointer = new_pointer + 
pointer.contiguous().view(batch_size,nbest)*mask[idx].view(batch_size,1).expand(batch_size, nbest).long() 412 | 413 | # exit(0) 414 | path_score = None 415 | decode_idx = decode_idx.transpose(1,0) 416 | ## decode_idx: [batch, seq_len, nbest] 417 | # print decode_idx[:,:,0] 418 | # print "nbest:",nbest 419 | # print "diff:", decode_idx[:,:,0]- decode_idx[:,:,4] 420 | # print decode_idx[:,0,:] 421 | # exit(0) 422 | 423 | ### calculate probability for each sequence 424 | scores = end_partition[:, :, STOP_TAG] 425 | ## scores: [batch_size, nbest] 426 | max_scores,_ = torch.max(scores, 1) 427 | minus_scores = scores - max_scores.view(batch_size,1).expand(batch_size, nbest) 428 | path_score = F.softmax(minus_scores, 1) 429 | ## path_score: [batch_size, nbest] 430 | # exit(0) 431 | return path_score, decode_idx 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | -------------------------------------------------------------------------------- /data/readme: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /dataProcess.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import sys 6 | import os 7 | import json 8 | import pickle 9 | import argparse 10 | import collections 11 | sys.path.append("../") 12 | import numpy as np 13 | import torch 14 | from torch.utils.data import TensorDataset, DataLoader 15 | from torch.utils.data.sampler import RandomSampler, SequentialSampler 16 | from torch.utils.data.distributed import DistributedSampler 17 | 18 | import bert.tokenization as tokenization 19 | 20 | try: 21 | import xml.etree.ElementTree as ET, getopt, logging, sys, random, re, copy 22 | from xml.sax.saxutils import escape 23 | except: 24 | sys.exit('Some package is missing... 
Perhaps ?') 25 | 26 | class InputFeatures(object): 27 | """A single set of features of data.""" 28 | 29 | def __init__(self, 30 | tokens, 31 | token_ids, 32 | token_mask, 33 | segmentId, 34 | labels, 35 | label_ids, 36 | relations, 37 | gold_relations, 38 | token_to_orig_map): 39 | self.tokens = tokens # 40 | self.token_ids = token_ids 41 | self.token_mask = token_mask 42 | self.segmentId = segmentId 43 | self.labels = labels # 44 | self.label_ids = label_ids 45 | self.relations = relations 46 | self.gold_relations = gold_relations # 47 | self.token_to_orig_map = token_to_orig_map 48 | 49 | def readDataFromFile(path): 50 | f = open(path, "r", encoding="utf-8") 51 | lines = f.readlines() 52 | f.close() 53 | seq = 0 54 | datasets = [] 55 | words = [] 56 | labels = [] 57 | relations = [] 58 | relations_gold = [] 59 | for l in lines: 60 | if l.strip() == "#Relations": 61 | continue 62 | elif l.strip() == "" and len(words)>0: 63 | # if "B-T" in labels or "B-P" in labels: 64 | datasets.append({"words": words, "labels": labels, "relations": relations, "relations_gold": relations_gold}) 65 | if len(words)>seq: 66 | seq = len(words) 67 | words = [] 68 | labels = [] 69 | relations = [] 70 | relations_gold = [] 71 | elif len(l.strip().split("\t")) == 2: 72 | tempLine = l.strip().split("\t") 73 | # WORD 74 | words.append(tempLine[0].lower()) 75 | # LABEL 76 | labels.append(tempLine[1]) 77 | elif len(l.strip().split("\t")) == 4: 78 | rel = list(map(int, l.strip().split("\t"))) 79 | relations_gold.append([rel[2],rel[3],rel[0],rel[1]]) 80 | if -1 not in rel: 81 | relations.append(rel) 82 | print("max_seq_length"+str(seq)) 83 | return datasets 84 | 85 | def convert_examples_to_features(examples, tokenizer, max_seq_length=100): 86 | seq = 0 87 | features = [] 88 | labelDic = {"O":1, "B-T":2, "I-T":3,"B-P":4,"I-P":5} 89 | num = 0 90 | relationlen = 0 91 | aspectlen = 0 92 | opinionlen = 0 93 | sentlen = 0 94 | for (example_index, example) in enumerate(examples): 95 | tok_to_orig_index = [] 96 | orig_to_tok_index = [] 97 | all_doc_tokens = [] 98 | labels = [] 99 | gold_relations = [] 100 | #### split words and labels #### 101 | for (i, token) in enumerate(example["words"]): 102 | orig_to_tok_index.append(len(all_doc_tokens)) 103 | sub_tokens = tokenizer.tokenize(token) 104 | label = example["labels"][i] 105 | for sub_token in sub_tokens: 106 | tok_to_orig_index.append(i) 107 | all_doc_tokens.append(sub_token) 108 | if label == "B-T" or label == "B-P": 109 | for i in range(len(sub_tokens)): 110 | if i == 0: 111 | labels.append(label) 112 | else: 113 | labels.append("I-"+label.split("-")[-1]) 114 | else: 115 | for i in range(len(sub_tokens)): 116 | labels.append(label) 117 | #### update relations #### 118 | for r in example["relations"]: 119 | temp = [] 120 | for rr in r: 121 | if rr < len(example["words"]): 122 | temp.append(orig_to_tok_index[rr]) 123 | else: 124 | temp.append(len(all_doc_tokens)) 125 | gold_relations.append(temp) 126 | 127 | 128 | # Account for [CLS] and [SEP] with "- 2" 129 | if len(all_doc_tokens) > max_seq_length - 2: 130 | all_doc_tokens = all_doc_tokens[0:(max_seq_length - 2)] 131 | 132 | #### add start and end to token, make segment_ids #### 133 | tokens = [] 134 | token_to_orig_map = {} 135 | segment_ids = [] 136 | tokens.append("[CLS]") 137 | segment_ids.append(0) 138 | for index, token in enumerate(all_doc_tokens): 139 | token_to_orig_map[len(tokens)] = tok_to_orig_index[index] 140 | tokens.append(token) 141 | segment_ids.append(0) 142 | tokens.append("[SEP]") 143 | 
segment_ids.append(0) 144 | if len(tokens)>seq: 145 | seq = len(tokens) 146 | if len(tokens)>=120: 147 | num+=1 148 | continue 149 | #### make token_ids #### 150 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 151 | input_mask = [1] * len(input_ids) 152 | while len(input_ids) < max_seq_length: 153 | input_ids.append(0) 154 | input_mask.append(0) 155 | segment_ids.append(0) 156 | #### make label_id and relations #### 157 | label_ids = [0] * len(input_ids) 158 | label_ids[0] = 1 159 | relations = [] 160 | for i in range(len(input_ids)): 161 | relations.append([0]*len(input_ids)) 162 | for idx in range(len(labels)): 163 | if idx+1>len(labels): 164 | print(1) 165 | label_ids[idx+1] = labelDic[labels[idx]] 166 | label_ids[len(labels)+1] = 1 167 | for gr in gold_relations: 168 | for idx in range(gr[0]+1,gr[1]+1): 169 | for idy in range(gr[2]+1,gr[3]+1): 170 | relations[idx][idy] = 1 171 | for idx in range(gr[2]+1,gr[3]+1): 172 | for idy in range(gr[0]+1,gr[1]+1): 173 | relations[idx][idy] = 1 174 | # 175 | # 176 | sentlen+=1 177 | relationlen+=len(gold_relations) 178 | aspectlen+=labels.count("B-T") 179 | opinionlen+=labels.count("B-P") 180 | features.append( 181 | InputFeatures( 182 | tokens, 183 | input_ids, 184 | input_mask, 185 | segment_ids, 186 | labels, 187 | label_ids, 188 | relations, 189 | gold_relations, 190 | token_to_orig_map)) 191 | print(sentlen) 192 | print(aspectlen) 193 | print(opinionlen) 194 | print(relationlen) 195 | print("\n") 196 | return features 197 | 198 | if __name__ == '__main__': 199 | parser = argparse.ArgumentParser() 200 | ## Required parameters 201 | parser.add_argument("--train_file", type=str, required=True) 202 | parser.add_argument("--dev_file", type=str, required=True) 203 | parser.add_argument("--test_file", type=str, required=True) 204 | parser.add_argument("--output_file", type=str, required=True) 205 | parser.add_argument("--vocab_file", type=str, required=True) 206 | parser.add_argument("--do_lower_case", default=True, action='store_true', 207 | help="Whether to lower case the input text. Should be True for uncased " 208 | "models and False for cased models.") 209 | parser.add_argument("--max_seq_length", default=150, type=int, 210 | help="The maximum total input sequence length after WordPiece tokenization. 
Sequences " 211 | "longer than this will be truncated, and sequences shorter than this will be padded.") 212 | args = parser.parse_args() 213 | train_set = readDataFromFile(args.train_file) 214 | dev_set = readDataFromFile(args.dev_file) 215 | test_set = readDataFromFile(args.test_file) 216 | 217 | tokenizer = tokenization.FullTokenizer( 218 | vocab_file=args.vocab_file, do_lower_case=args.do_lower_case) 219 | # A = tokenizer.tokenize("disappointed") 220 | # print(A) 221 | train_features = convert_examples_to_features(train_set, tokenizer, max_seq_length=120) 222 | dev_features = convert_examples_to_features(dev_set, tokenizer, max_seq_length=120) 223 | test_features = convert_examples_to_features(test_set, tokenizer, max_seq_length=120) 224 | # torch.save({"train":train_features, "test":test_features, "dev":dev_features}, args.output_file) 225 | # print(1) -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Shaowei Chen, Contact: chenshaowei0507@163.com 3 | # @Date: 2020-4-26 16:47:32 4 | 5 | import time 6 | import gc 7 | import torch 8 | from alphabet import Alphabet 9 | from opinionMining import opinionMining 10 | import sys 11 | import numpy as np 12 | import random 13 | import os 14 | import argparse 15 | from bert.modeling import BertConfig 16 | from torch.utils.data import TensorDataset, DataLoader 17 | from torch.utils.data.sampler import RandomSampler, SequentialSampler 18 | from bert.optimization import BERTAdam 19 | 20 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 21 | 22 | seed_num = 57 23 | random.seed(seed_num) 24 | torch.manual_seed(seed_num) 25 | np.random.seed(seed_num) 26 | 27 | 28 | class InputFeatures(object): 29 | """A single set of features of data.""" 30 | 31 | def __init__(self, 32 | tokens, 33 | token_ids, 34 | token_mask, 35 | segmentId, 36 | labels, 37 | label_ids, 38 | relations, 39 | gold_relations, 40 | token_to_orig_map): 41 | self.tokens = tokens # 42 | self.token_ids = token_ids 43 | self.token_mask = token_mask 44 | self.segmentId = segmentId 45 | self.labels = labels # 46 | self.label_ids = label_ids 47 | self.relations = relations 48 | self.gold_relations = gold_relations # 49 | self.token_to_orig_map = token_to_orig_map 50 | 51 | 52 | #### target token level precision #### 53 | def targetPredictCheck(targetPredict, batch_target_label, mask): 54 | pred = targetPredict.cpu().data.numpy() 55 | gold = batch_target_label.cpu().data.numpy() 56 | mask = mask.cpu().data.numpy() 57 | overlaped = (pred == gold) 58 | right_token = np.sum(overlaped * mask) 59 | total_token = mask.sum() 60 | return right_token, total_token 61 | 62 | 63 | #### relation token level precision #### 64 | def relationPredictCheck(relationPredict, batch_relation): 65 | relationCheck = torch.ones(relationPredict.size(0), relationPredict.size(1), relationPredict.size(2)) * 0.1 66 | pred = relationPredict.cpu() 67 | pred = torch.gt(pred, relationCheck).data.numpy() 68 | gold = batch_relation.cpu().data.numpy() 69 | overlaped = (pred == gold) 70 | right_token = np.sum(overlaped * gold) 71 | total_token = gold.sum() 72 | return right_token, total_token 73 | 74 | 75 | def recover_label(targetPredict, all_labels, all_input_mask): 76 | pred_variable = targetPredict 77 | gold_variable = all_labels 78 | mask_variable = all_input_mask 79 | batch_size = gold_variable.size(0) 80 | seq_len = gold_variable.size(1) 81 | mask = 
mask_variable.cpu().data.numpy() 82 | pred_tag = pred_variable.cpu().data.numpy() 83 | gold_tag = gold_variable.cpu().data.numpy() 84 | pred_label = [] 85 | gold_label = [] 86 | for idx in range(batch_size): 87 | pred = [pred_tag[idx][idy] - 1 for idy in range(seq_len) if mask[idx][idy] != 0] 88 | gold = [gold_tag[idx][idy] - 1 for idy in range(seq_len) if mask[idx][idy] != 0] 89 | assert (len(pred) == len(gold)) 90 | pred_label.append(pred) 91 | gold_label.append(gold) 92 | return pred_label, gold_label 93 | 94 | 95 | def get_ner_fmeasure(gold_results, pred_results, tagScheme): 96 | target_gold, opinion_gold = splitTandO(gold_results) 97 | target_pred, opinion_pred = splitTandO(pred_results) 98 | 99 | assert (len(target_gold) == len(target_pred)) 100 | assert (len(opinion_gold) == len(opinion_pred)) 101 | 102 | if tagScheme == "BIO": 103 | TP, TR, TF = evalForBIO(target_gold, target_pred) 104 | OP, OR, OF = evalForBIO(opinion_gold, opinion_pred) 105 | else: 106 | print("erro tagScheme!") 107 | exit(0) 108 | 109 | return TP, TR, TF, OP, OR, OF 110 | 111 | 112 | def evalForBIO(gold, pred): 113 | correct = 0 114 | predicted = 0 115 | relevant = 0 116 | # count correct 117 | for num in range(len(gold)): 118 | if gold[num] == '1': 119 | if num < len(gold) - 1: 120 | if gold[num + 1] != '2': 121 | if pred[num] == '1' and pred[num + 1] != '2': 122 | correct += 1 123 | else: 124 | if pred[num] == '1': 125 | for j in range(num + 1, len(gold)): 126 | if gold[j] == '2': 127 | if pred[j] == '2' and j < len(gold) - 1: 128 | continue 129 | elif pred[j] == '2' and j == len(gold) - 1: 130 | correct += 1 131 | break 132 | else: 133 | break 134 | 135 | else: 136 | if pred[j] != '2': 137 | correct += 1 138 | break 139 | else: 140 | if pred[num] == '1': 141 | correct += 1 142 | # count predict 143 | for num in range(len(pred)): 144 | if pred[num] == '1': 145 | predicted += 1 146 | # count gold 147 | for num in range(len(gold)): 148 | if gold[num] == '1': 149 | relevant += 1 150 | 151 | precision = float(correct) / (predicted + 1e-6) 152 | recall = float(correct) / (relevant + 1e-6) 153 | f1 = 2 * precision * recall / (precision + recall + 1e-6) 154 | 155 | return precision, recall, f1 156 | 157 | 158 | def splitTandO(result): 159 | target = [] 160 | opinion = [] 161 | for idx in range(len(result)): 162 | for idy in range(len(result[idx])): 163 | if result[idx][idy] == 0 or result[idx][idy] == -1: 164 | target.append('0') 165 | opinion.append('0') 166 | elif result[idx][idy] == 1: 167 | target.append('1') 168 | opinion.append('0') 169 | elif result[idx][idy] == 2: 170 | target.append('2') 171 | opinion.append('0') 172 | elif result[idx][idy] == 3: 173 | target.append('0') 174 | opinion.append('1') 175 | elif result[idx][idy] == 4: 176 | target.append('0') 177 | opinion.append('2') 178 | return target, opinion 179 | 180 | 181 | def fmeasure_strict(pred_relations, raw_Ids): 182 | goldTotal = 0 183 | correct = 0 184 | predictTotal = 0 185 | for idx in range(len(pred_relations)): 186 | standard = raw_Ids[idx].gold_relations 187 | pred = pred_relations[idx] 188 | goldTotal += len(standard) 189 | predictTotal += len(pred) 190 | for r in standard: 191 | if r in pred: 192 | correct += 1 193 | precision = float(correct) / (predictTotal + 1e-6) 194 | recall = float(correct) / (goldTotal + 1e-6) 195 | f1 = 2 * precision * recall / (precision + recall + 1e-6) 196 | return precision, recall, f1 197 | 198 | def make_relation(R_tensor, instance_text, thred): 199 | total_result = [] 200 | for idx in 
range(len(instance_text)): 201 | opinionList = [] 202 | targetList = [] 203 | relationResult = [] 204 | for idy in range(len(instance_text[idx])): 205 | if instance_text[idx][idy] == 3: 206 | if idy == len(instance_text[idx]) - 1: 207 | opinionList.append([idy, idy + 1]) 208 | else: 209 | for k in range(idy + 1, len(instance_text[idx])): 210 | if instance_text[idx][k] != 4: 211 | opinionList.append([idy, k]) 212 | break 213 | elif instance_text[idx][k] == 4 and k == len(instance_text[idx]) - 1: 214 | opinionList.append([idy, k + 1]) 215 | break 216 | elif instance_text[idx][idy] == 1: 217 | if idy == len(instance_text[idx]) - 1: 218 | targetList.append([idy - 1, idy]) 219 | else: 220 | for k in range(idy + 1, len(instance_text[idx])): 221 | if instance_text[idx][k] != 2: 222 | targetList.append([idy, k]) 223 | break 224 | elif instance_text[idx][k] == 2 and k == len(instance_text[idx]) - 1: 225 | targetList.append([idy, k + 1]) 226 | break 227 | for o in opinionList: 228 | for t in targetList: 229 | score1 = np.sum(R_tensor[idx][o[0]:o[1], t[0]:t[1]]) / (o[1] - o[0]) # *(t[1]-t[0])) 230 | score2 = np.sum(R_tensor[idx][t[0]:t[1], o[0]:o[1]]) / (t[1] - t[0]) # *(t[1]-t[0])) 231 | if (score1 + score2) / 2 > thred: 232 | if [o[0] - 1, o[1] - 1, t[0] - 1, t[1] - 1] not in relationResult: 233 | relationResult.append([o[0] - 1, o[1] - 1, t[0] - 1, t[1] - 1]) 234 | total_result.append(relationResult) 235 | return total_result 236 | 237 | 238 | def evaluate(eval_dataloader, test_set, model, output_file_path, args): 239 | pred_results = [] 240 | gold_results = [] 241 | relation_result = [] 242 | 243 | model.eval() 244 | 245 | for step, batch in enumerate(eval_dataloader): 246 | if args.ifgpu: 247 | batch = tuple(t.cuda() for t in batch) # multi-gpu does scattering it-self 248 | all_input_ids, all_input_mask, all_segment_ids, all_relations, all_labels = batch 249 | max_seq_len = torch.max(torch.sum(all_input_mask, dim=1)) 250 | all_input_ids = all_input_ids[:, :max_seq_len].contiguous() 251 | all_input_mask = all_input_mask[:, :max_seq_len].contiguous() 252 | all_segment_ids = all_segment_ids[:, :max_seq_len].contiguous() 253 | all_labels = all_labels[:, :max_seq_len].contiguous() 254 | targetPredict, relationPredict = model(all_input_ids, all_segment_ids, all_input_mask) 255 | 256 | # get real label 257 | pred_label, gold_label = recover_label(targetPredict, all_labels, all_input_mask) 258 | pred_results += pred_label 259 | gold_results += gold_label 260 | relation_result += list(relationPredict.cpu().data.numpy()) 261 | 262 | # evaluate 263 | TP, TR, TF, OP, OR, OF = get_ner_fmeasure(gold_results, pred_results, args.tagScheme) 264 | pred_relations = make_relation(relation_result, pred_results, args.inference_threds) 265 | RP, RR, RF = fmeasure_strict(pred_relations, test_set) 266 | 267 | # write to file 268 | labelDic = ["O", "B-T", "I-T", "B-P", "I-P", "O"] 269 | output_file = open(output_file_path, "w", encoding="utf-8") 270 | for k in range(len(test_set)): 271 | words = test_set[k].tokens 272 | pred = pred_results[k] 273 | gold = test_set[k].labels 274 | relations = pred_relations[k] 275 | for j in range(len(gold)): 276 | output_file.write(words[j + 1] + "\t" + gold[j] + "\t" + labelDic[pred[j + 1]] + "\n") 277 | output_file.write("#Relations\n") 278 | for r in relations: 279 | output_file.write(str(r[0]) + "\t" + str(r[1]) + "\t" + str(r[2]) + "\t" + str(r[3]) + "\n") 280 | output_file.write("\n") 281 | output_file.close() 282 | 283 | 284 | return RP, RR, RF, TP, TR, TF, OP, OR, OF 285 | 286 | 
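# A minimal, self-contained sketch of the pair-decoding rule implemented by
# make_relation() above (names and span extraction here are simplified and
# purely illustrative, not the code path used at train/test time): spans are
# read off the shifted tag ids produced by recover_label (1=B-T, 2=I-T,
# 3=B-P, 4=I-P; position 0 is [CLS]), and an (opinion, target) pair is kept
# when the mean of the two directional relation scores exceeds the threshold.
def _demo_pair_decoding(tags, R, thred=0.5):
    """tags: shifted tag ids for one sentence; R: (seq_len, seq_len) numpy
    array of relation scores for the same sentence."""
    def spans(begin_id, inside_id):
        result, idy = [], 0
        while idy < len(tags):
            if tags[idy] == begin_id:
                k = idy + 1
                while k < len(tags) and tags[k] == inside_id:
                    k += 1
                result.append([idy, k])
                idy = k
            else:
                idy += 1
        return result

    pairs = []
    for o in spans(3, 4):          # opinion spans
        for t in spans(1, 2):      # target spans
            score1 = R[o[0]:o[1], t[0]:t[1]].sum() / (o[1] - o[0])
            score2 = R[t[0]:t[1], o[0]:o[1]].sum() / (t[1] - t[0])
            if (score1 + score2) / 2 > thred:
                # shift by -1 to drop the [CLS] offset, as in make_relation
                pairs.append([o[0] - 1, o[1] - 1, t[0] - 1, t[1] - 1])
    return pairs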
287 | 288 | def bert_load_state_dict(model, state_dict): 289 | missing_keys = [] 290 | unexpected_keys = [] 291 | error_msgs = [] 292 | 293 | # copy state_dict so _load_from_state_dict can modify it 294 | metadata = getattr(state_dict, '_metadata', None) 295 | state_dict = state_dict.copy() 296 | if metadata is not None: 297 | state_dict._metadata = metadata 298 | 299 | def load(module, prefix=''): 300 | local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) 301 | module._load_from_state_dict( 302 | state_dict, prefix, True, missing_keys, unexpected_keys, error_msgs) 303 | for name, child in module._modules.items(): 304 | if child is not None: 305 | load(child, prefix + name + '.') 306 | 307 | load(model, prefix='' if hasattr(model, 'bert') else 'bert.') 308 | return model 309 | 310 | 311 | def read_data(train_features, type, batchsize): 312 | assert type in ["train", "test"] 313 | all_input_ids = torch.tensor([f.token_ids for f in train_features], dtype=torch.long) 314 | all_input_mask = torch.tensor([f.token_mask for f in train_features], dtype=torch.long) 315 | all_segment_ids = torch.tensor([f.segmentId for f in train_features], dtype=torch.long) 316 | all_relations = torch.tensor([f.relations for f in train_features], dtype=torch.long) 317 | all_labels = torch.tensor([f.label_ids for f in train_features], dtype=torch.long) 318 | all_labels[:, :1] = torch.ones(all_labels.size(0), 1).long() 319 | if type == "train": 320 | train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_relations, all_labels) 321 | train_sampler = RandomSampler(train_data) 322 | else: 323 | train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_relations, all_labels) 324 | train_sampler = SequentialSampler(train_data) 325 | train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batchsize) 326 | return train_dataloader 327 | 328 | 329 | def main(args): 330 | if not os.path.exists(args.test_eval_dir): 331 | os.makedirs(args.test_eval_dir) 332 | if not os.path.exists(args.eval_dir): 333 | os.makedirs(args.eval_dir) 334 | if not os.path.exists(args.model_dir): 335 | os.makedirs(args.model_dir) 336 | 337 | #### print config #### 338 | print(args) 339 | 340 | #### add label #### 341 | label_alphabet = Alphabet('label', True) 342 | label_alphabet.add("O") 343 | label_alphabet.add("B-T") 344 | label_alphabet.add("I-T") 345 | label_alphabet.add("B-P") 346 | label_alphabet.add("I-P") 347 | 348 | #### read data 349 | print("Loading data....") 350 | datasets = torch.load(args.data) 351 | train_set = datasets["train"] 352 | test_set = datasets["test"] 353 | train_dataloader = read_data(train_set, "train", args.batchSize) 354 | eval_dataloader = read_data(test_set, "test", args.batchSize) 355 | 356 | #### load BERT config #### 357 | print("Loading BERT config....") 358 | bert_config = BertConfig.from_json_file(args.bert_json_dir) 359 | 360 | #### defined model #### 361 | model = opinionMining(args, bert_config, label_alphabet) 362 | if args.mode == "test": 363 | assert args.test_model != "" 364 | model = torch.load(args.test_model) 365 | test_start = time.time() 366 | # evaluate 367 | RP, RR, RF, TP, TR, TF, OP, OR, OF = evaluate( 368 | eval_dataloader, test_set, model, args.test_eval_dir + "/test_output", args) 369 | test_finish = time.time() 370 | test_cost = test_finish - test_start 371 | print("test: time: %.2fs, speed: %.2fst/s" % (test_cost, 0)) 372 | print("relation result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (RP, RR, RF)) 373 
| print("target result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (TP, TR, TF)) 374 | print("opinion result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (OP, OR, OF)) 375 | else: 376 | print("Loading model from pretrained checkpoint: " + args.bert_checkpoint_dir) 377 | model = bert_load_state_dict(model, torch.load(args.bert_checkpoint_dir, map_location='cpu')) 378 | 379 | #### define optimizer #### 380 | num_train_steps = int(len(train_set) / args.batchSize * args.iteration) 381 | param_optimizer = list(model.named_parameters()) 382 | optimizer_grouped_parameters = [ 383 | {'params': [p for n, p in param_optimizer if "bert" in n], 'weight_decay': 0.01}, 384 | {'params': [p for n, p in param_optimizer if "bert" not in n], 'lr': args.lr_rate, 'weight_decay': 0.01}] 385 | optimizer_grouped_parameters_r = [ 386 | {'params': [p for n, p in param_optimizer if "bert" in n], 'weight_decay': 0.01}, 387 | {'params': [p for n, p in param_optimizer if "relation" in n], 'lr': args.R_lr_rate, 'weight_decay': 0.01}] 388 | optimizer = BERTAdam(optimizer_grouped_parameters, 389 | lr=2e-05, 390 | warmup=0.1, 391 | t_total=num_train_steps) 392 | optimizer_r = BERTAdam(optimizer_grouped_parameters_r, 393 | lr=2e-05, 394 | warmup=0.1, 395 | t_total=num_train_steps) 396 | 397 | #### train #### 398 | print("start training......") 399 | best_Score = -10000 400 | lr = args.lr_rate 401 | for idx in range(args.iteration): 402 | epoch_start = time.time() 403 | temp_start = epoch_start 404 | print("Epoch: %s/%s" % (idx, args.iteration)) 405 | 406 | if idx>10: 407 | lr = lr*args.lr_decay 408 | print(lr) 409 | optimizer.param_groups[1]["lr"] = lr 410 | optimizer_r.param_groups[1]["lr"] = lr 411 | 412 | sample_loss = 0 413 | total_loss = 0 414 | right_target_token = 0 415 | whole_target_token = 0 416 | right_relation_token = 0 417 | whole_relation_token = 0 418 | 419 | model.train() 420 | model.zero_grad() 421 | for step, batch in enumerate(train_dataloader): 422 | if args.ifgpu: 423 | batch = tuple(t.cuda() for t in batch) 424 | all_input_ids, all_input_mask, all_segment_ids, all_relations, all_labels = batch 425 | max_seq_len = torch.max(torch.sum(all_input_mask, dim=1)) 426 | all_input_ids = all_input_ids[:, :max_seq_len].contiguous() 427 | all_input_mask = all_input_mask[:, :max_seq_len].contiguous() 428 | all_segment_ids = all_segment_ids[:, :max_seq_len].contiguous() 429 | all_relations = all_relations[:, :max_seq_len, :max_seq_len].contiguous() 430 | all_labels = all_labels[:, :max_seq_len].contiguous() 431 | tloss, rloss, targetPredict, relationPredict = model.neg_log_likelihood_loss(all_input_ids, 432 | all_segment_ids, 433 | all_labels, 434 | all_relations, 435 | all_input_mask) 436 | # check right number 437 | targetRight, targetWhole = targetPredictCheck(targetPredict, all_labels, all_input_mask) 438 | relationRight, relationWhole = relationPredictCheck(relationPredict, all_relations) 439 | 440 | # cal right and whole label number 441 | right_target_token += targetRight 442 | whole_target_token += targetWhole 443 | right_relation_token += relationRight 444 | whole_relation_token += relationWhole 445 | # cal loss 446 | sample_loss += rloss.data[0] + tloss.data[0] 447 | total_loss += rloss.data[0] + tloss.data[0] 448 | # print train info 449 | if step % 20 == 0: 450 | temp_time = time.time() 451 | temp_cost = temp_time - temp_start 452 | temp_start = temp_time 453 | print(" Instance: %s; Time: %.2fs; loss: %.4f; target_acc: %s/%s=%.4f; relation_acc: %s/%s=%.4f" % ( 454 | step * args.batchSize, temp_cost, 
sample_loss, right_target_token, whole_target_token, 455 | (right_target_token + 0.) / whole_target_token, right_relation_token, whole_relation_token, 456 | (right_relation_token + 0.) / whole_relation_token)) 457 | if sample_loss > 1e8 or str(sample_loss) == "nan": 458 | print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....") 459 | exit(1) 460 | sys.stdout.flush() 461 | sample_loss = 0 462 | 463 | if step % 2 == 0: 464 | loss = 9*rloss + tloss # 465 | loss.backward() 466 | optimizer.step() 467 | optimizer.zero_grad() 468 | else: 469 | rloss.backward() 470 | optimizer_r.step() 471 | optimizer_r.zero_grad() 472 | 473 | temp_time = time.time() 474 | temp_cost = temp_time - temp_start 475 | print(" Instance: %s; Time: %.2fs; loss: %.4f; target_acc: %s/%s=%.4f; relation_acc: %s/%s=%.4f" % ( 476 | step * args.batchSize, temp_cost, sample_loss, right_target_token, whole_target_token, 477 | (right_target_token + 0.) / whole_target_token, right_relation_token, whole_relation_token, 478 | (right_relation_token + 0.) / whole_relation_token)) 479 | 480 | epoch_finish = time.time() 481 | epoch_cost = epoch_finish - epoch_start 482 | print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s" % ( 483 | idx, epoch_cost, len(train_set) / epoch_cost, total_loss)) 484 | print("totalloss:", total_loss) 485 | if total_loss > 1e8 or str(total_loss) == "nan": 486 | print("ERROR: LOSS EXPLOSION (>1e8) ! PLEASE SET PROPER PARAMETERS AND STRUCTURE! EXIT....") 487 | exit(1) 488 | 489 | # evaluate 490 | RP, RR, RF, TP, TR, TF, OP, OR, OF = evaluate( 491 | eval_dataloader, test_set, model, args.eval_dir + "/test_output_" + str(idx), args) 492 | test_finish = time.time() 493 | test_cost = test_finish - epoch_finish 494 | current_Score = RF 495 | 496 | print("test: time: %.2fs, speed: %.2fst/s" % (test_cost, 0)) 497 | print("relation result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (RP, RR, RF)) 498 | print("target result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (TP, TR, TF)) 499 | print("opinion result: Precision: %.4f; Recall: %.4f; F1: %.4f" % (OP, OR, OF)) 500 | 501 | if current_Score > best_Score: 502 | print("Exceed previous best f score with target f: %.4f and opinion f: %.4f and relation f: %.4f" % ( 503 | TF, OF, RF)) 504 | model_name = args.model_dir + "/modelFinal.model" 505 | print("Save current best model in file:", model_name) 506 | torch.save(model, model_name) 507 | best_Score = current_Score 508 | 509 | gc.collect() 510 | 511 | 512 | if __name__ == '__main__': 513 | parser = argparse.ArgumentParser() 514 | parser.add_argument('--mode', type=str, default="test", choices=["train", "test"]) 515 | parser.add_argument('--data', type=str, default="./data/2014Lap.pt") 516 | 517 | ## if test 518 | parser.add_argument('--test_model', type=str, default="./model/2014Lap2/modelFinal.model") 519 | parser.add_argument('--test_eval_dir', type=str, default="./test_eval/2014Lap2") 520 | 521 | ## if train 522 | parser.add_argument('--model_dir', type=str, default="./model/2014Lap2") 523 | parser.add_argument('--eval_dir', type=str, default="./eval/2014Lap2") 524 | parser.add_argument('--bert_json_dir', type=str, 525 | default="/home/ramon/chenshaowei_summer/IJCAI2020_Rebuttal/bert-base-uncased/bert_config.json") 526 | parser.add_argument('--bert_checkpoint_dir', type=str, 527 | default="/home/ramon/chenshaowei_summer/IJCAI2020_Rebuttal/bert-base-uncased/pytorch_model.bin") 528 | 529 | parser.add_argument('--tagScheme', type=str, default="BIO") 530 | 
parser.add_argument('--ifgpu', type=bool, default=True) 531 | 532 | parser.add_argument('--target_hidden_dim', type=int, default=250) 533 | parser.add_argument('--relation_hidden_dim', type=int, default=250) 534 | parser.add_argument('--relation_attention_dim', type=int, default=250) 535 | parser.add_argument('--relation_threds', type=float, default=0.1) 536 | parser.add_argument('--inference_threds', type=float, default=0.5) 537 | parser.add_argument('--iteration', type=int, default=70) 538 | parser.add_argument('--batchSize', type=int, default=10) 539 | parser.add_argument('--dropout', type=float, default=0.5) 540 | parser.add_argument('--lr_rate', type=float, default=0.001) 541 | parser.add_argument('--R_lr_rate', type=float, default=0.001) 542 | parser.add_argument('--lr_decay', type=float, default=0.98) 543 | parser.add_argument('--step', type=int, default=1) 544 | 545 | args = parser.parse_args() 546 | main(args) -------------------------------------------------------------------------------- /opinionMining.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Shaowei Chen, Contact: chenshaowei0507@163.com 3 | # @Date: 2020-4-26 16:47:32 4 | 5 | import torch 6 | import torch.nn as nn 7 | from relationAttention import RelationAttention 8 | from crf_new import CRF 9 | from bert.modeling import BertModel, BERTLayerNorm 10 | import threading 11 | import torch.nn.functional as F 12 | import torch.nn.init as init 13 | 14 | 15 | class opinionMining(nn.Module): 16 | def __init__(self, args, config, label_alphabet): 17 | super(opinionMining, self).__init__() 18 | print("build network...") 19 | self.gpu = args.ifgpu 20 | self.label_size = label_alphabet.size() 21 | self.bert_encoder_dim = config.hidden_size 22 | self.target_hidden_dim = args.target_hidden_dim 23 | self.relation_hidden_dim = args.relation_hidden_dim 24 | self.relation_threds = args.relation_threds 25 | self.drop = args.dropout 26 | self.step = args.step 27 | 28 | # encoder 29 | self.bert = BertModel(config) 30 | 31 | # target syn 32 | self.targetSyn_r = nn.Parameter(torch.Tensor(self.target_hidden_dim, self.bert_encoder_dim)) 33 | self.targetSyn_s = nn.Parameter(torch.Tensor(self.target_hidden_dim, self.bert_encoder_dim)) 34 | # relation syn 35 | self.relationSyn_u = nn.Parameter(torch.Tensor(self.relation_hidden_dim, self.bert_encoder_dim)) 36 | self.relationSyn_s = nn.Parameter(torch.Tensor(self.relation_hidden_dim, self.bert_encoder_dim)) 37 | init.xavier_uniform(self.targetSyn_r) 38 | init.xavier_uniform(self.targetSyn_s) 39 | init.xavier_uniform(self.relationSyn_u) 40 | init.xavier_uniform(self.relationSyn_s) 41 | 42 | # crf 43 | self.targetHidden2Tag = nn.Parameter(torch.Tensor(self.label_size + 2, self.target_hidden_dim)) 44 | self.targetHidden2Tag_b = nn.Parameter(torch.Tensor(1, self.label_size + 2)) 45 | init.xavier_uniform(self.targetHidden2Tag) 46 | init.xavier_uniform(self.targetHidden2Tag_b) 47 | 48 | self.crf = CRF(self.label_size, self.gpu) 49 | 50 | # relation 51 | self.relationAttention = RelationAttention(args) 52 | 53 | # other 54 | self.dropout = nn.Dropout(self.drop) 55 | self.softmax = nn.Softmax(dim=2) 56 | 57 | if self.gpu: 58 | self.bert = self.bert.cuda() 59 | self.targetSyn_r.data = self.targetSyn_r.cuda() 60 | self.targetSyn_s.data = self.targetSyn_s.cuda() 61 | self.relationSyn_u.data = self.relationSyn_u.cuda() 62 | self.relationSyn_s.data = self.relationSyn_s.cuda() 63 | self.targetHidden2Tag.data = 
self.targetHidden2Tag.cuda() 64 | self.targetHidden2Tag_b.data = self.targetHidden2Tag_b.cuda() 65 | self.relationAttention = self.relationAttention.cuda() 66 | self.dropout = self.dropout.cuda() 67 | self.softmax = self.softmax.cuda() 68 | 69 | def init_weights(module): 70 | if isinstance(module, BERTLayerNorm): 71 | module.beta.data.normal_(mean=0.0, std=config.initializer_range) 72 | module.gamma.data.normal_(mean=0.0, std=config.initializer_range) 73 | 74 | self.apply(init_weights) 75 | 76 | def neg_log_likelihood_loss(self, all_input_ids, all_segment_ids, all_labels, all_relations, all_input_mask): 77 | batch_size = all_input_ids.size(0) 78 | seq_len = all_input_ids.size(1) 79 | maskTemp1 = all_input_mask.view(batch_size, 1, seq_len).repeat(1, seq_len, 1) 80 | maskTemp2 = all_input_mask.view(batch_size, seq_len, 1).repeat(1, 1, seq_len) 81 | maskMatrix = maskTemp1 * maskTemp2 82 | 83 | targetPredictScore, r_tensor = self.mainStructure(maskMatrix, all_input_ids, all_segment_ids, self.step, 84 | all_input_mask) 85 | # target Loss 86 | target_loss = self.crf.neg_log_likelihood_loss(targetPredictScore, all_input_mask.byte(), all_labels) 87 | scores, tag_seq = self.crf._viterbi_decode(targetPredictScore, all_input_mask.byte()) 88 | target_loss = target_loss / batch_size 89 | 90 | # relation Loss 91 | weight = torch.FloatTensor([0.01, 1.0]).cuda() 92 | relation_loss_function = nn.CrossEntropyLoss(weight=weight) 93 | relationScoreLoss = r_tensor.view(-1, 1) 94 | relationlabelLoss = all_relations.view(batch_size * seq_len * seq_len) 95 | relationScoreLoss = torch.cat([1 - relationScoreLoss, relationScoreLoss], 1) 96 | relation_loss = relation_loss_function(relationScoreLoss, relationlabelLoss) 97 | 98 | return target_loss, relation_loss, tag_seq, r_tensor 99 | 100 | def forward(self, all_input_ids, all_segment_ids, all_input_mask): 101 | batch_size = all_input_ids.size(0) 102 | seq_len = all_input_ids.size(1) 103 | maskTemp1 = all_input_mask.view(batch_size, 1, seq_len).repeat(1, seq_len, 1) 104 | maskTemp2 = all_input_mask.view(batch_size, seq_len, 1).repeat(1, 1, seq_len) 105 | maskMatrix = maskTemp1 * maskTemp2 106 | 107 | targetPredictScore, r_tensor = self.mainStructure(maskMatrix, all_input_ids, all_segment_ids, self.step, 108 | all_input_mask) 109 | scores, tag_seq = self.crf._viterbi_decode(targetPredictScore, all_input_mask.byte()) 110 | 111 | return tag_seq, r_tensor 112 | 113 | def mainStructure(self, maskMatrix, all_input_ids, all_segment_ids, steps, all_input_mask): 114 | 115 | batch_size = all_input_ids.size(0) 116 | seq_len = all_input_ids.size(1) 117 | # bert 118 | all_encoder_layers, _ = self.bert(all_input_ids, all_segment_ids, all_input_mask) 119 | sequence_output = all_encoder_layers[-1] 120 | sequence_output = self.dropout(sequence_output) 121 | 122 | # T tensor and R tensor 123 | t_tensor = torch.zeros(batch_size, seq_len, seq_len) 124 | r_tensor = torch.zeros(batch_size, seq_len, seq_len) 125 | if self.gpu: 126 | t_tensor = t_tensor.cuda() 127 | r_tensor = r_tensor.cuda() 128 | 129 | for i in range(steps): 130 | # target syn 131 | r_temp = r_tensor.ge(self.relation_threds).float() 132 | r_tensor = r_tensor * r_temp # b x s x s 133 | target_weighted = torch.bmm(r_tensor, sequence_output) 134 | target_div = torch.sum(r_tensor, 2) 135 | targetIfZero = target_div.eq(0).float() 136 | target_div = target_div + targetIfZero 137 | target_div = target_div.unsqueeze(2).repeat(1, 1, self.bert_encoder_dim) 138 | target_r = torch.div(target_weighted, target_div) 139 | target_hidden 
= F.linear(sequence_output, self.targetSyn_s, None) + F.linear(target_r, self.targetSyn_r, None) 140 | target_hidden = F.tanh(target_hidden) 141 | 142 | # relation syn 143 | relation_weighted = torch.bmm(t_tensor, sequence_output) 144 | relation_div = torch.sum(t_tensor, 2) 145 | relationIfZero = relation_div.eq(0).float() 146 | relation_div = relation_div + relationIfZero 147 | relation_div = relation_div.unsqueeze(2).repeat(1, 1, self.bert_encoder_dim) 148 | relation_a = torch.div(relation_weighted, relation_div) 149 | relation_hidden = F.linear(sequence_output, self.relationSyn_s, None)+F.linear(relation_a, self.relationSyn_u, None) 150 | relation_hidden = F.tanh(relation_hidden) 151 | 152 | # crf 153 | targetPredictInput = F.linear(target_hidden, self.targetHidden2Tag, self.targetHidden2Tag_b)#self.targetHidden2Tag(target_hidden) 154 | 155 | # Relation Attention 156 | relationScore = self.relationAttention(relation_hidden) 157 | 158 | 159 | # update T_tensor 160 | tag_score, tag_seq = self.crf._viterbi_decode(targetPredictInput, all_input_mask.byte()) 161 | threads = [] 162 | temp_T_tensor = torch.zeros(batch_size, seq_len, seq_len) 163 | if self.gpu: 164 | temp_T_tensor = temp_T_tensor.cuda() 165 | for i in range(batch_size): 166 | t = threading.Thread(target=self.makeEntity, args=(i, tag_seq[i, :], temp_T_tensor, seq_len)) 167 | threads.append(t) 168 | for i in range(batch_size): 169 | threads[i].start() 170 | for i in range(batch_size): 171 | threads[i].join() 172 | tag_score_final = tag_score.unsqueeze(2).repeat(1, 1, seq_len)+tag_score.unsqueeze(1).repeat(1, seq_len, 1) 173 | t_tensor = tag_score_final * temp_T_tensor 174 | 175 | # Update R_tensor 176 | r_tensor = relationScore * (maskMatrix.float()) 177 | 178 | return targetPredictInput, r_tensor 179 | def makeEntity(self, idx, tag_seq, temp_T_tensor, seq_len): 180 | # don't consider the entity which starts with "I-X" 181 | tag_seq = tag_seq.cpu() 182 | Abegin = -1 183 | Aend = -1 184 | Obegin = -1 185 | Oend = -1 186 | for idy in range(seq_len): 187 | if tag_seq[idy] in [0, 1, 2, 4]: 188 | if Abegin != -1: 189 | temp_T_tensor[idx, Abegin:Aend, Abegin:Aend] = torch.ones(Aend - Abegin, Aend - Abegin) 190 | Abegin = -1 191 | Aend = -1 192 | if Obegin != -1: 193 | temp_T_tensor[idx, Obegin:Oend, Obegin:Oend] = torch.ones(Oend - Obegin, Oend - Obegin) 194 | Obegin = -1 195 | Oend = -1 196 | if tag_seq[idy] == 2: 197 | Abegin = idy 198 | Aend = idy + 1 199 | if tag_seq[idy] == 3 and Abegin != -1: 200 | Aend += 1 201 | if tag_seq[idy] == 4: 202 | Obegin = idy 203 | Oend = idy + 1 204 | if tag_seq[idy] == 5 and Obegin != -1: 205 | Oend += 1 206 | if Abegin != -1: 207 | temp_T_tensor[idx, Abegin:Aend, Abegin:Aend] = torch.ones(Aend - Abegin, Aend - Abegin) 208 | if Obegin != -1: 209 | temp_T_tensor[idx, Obegin:Oend, Obegin:Oend] = torch.ones(Oend - Obegin, Oend - Obegin) 210 | 211 | return temp_T_tensor -------------------------------------------------------------------------------- /relationAttention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Shaowei Chen, Contact: chenshaowei0507@163.com 3 | # @Date: 2020-4-26 16:47:32 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torch.nn.init as init 9 | 10 | 11 | class RelationAttention(nn.Module): 12 | def __init__(self, args): 13 | super(RelationAttention, self).__init__() 14 | self.relation_hidden_dim = args.relation_hidden_dim 15 | self.relation_attention_dim = 
args.relation_attention_dim
16 | 
17 |         self.w_ta = nn.Parameter(torch.Tensor(self.relation_attention_dim, self.relation_hidden_dim))
18 |         self.w_ja = nn.Parameter(torch.Tensor(self.relation_attention_dim, self.relation_hidden_dim))
19 |         self.b = nn.Parameter(torch.Tensor(1, 1, 1, self.relation_attention_dim))
20 |         self.v = nn.Parameter(torch.Tensor(1, self.relation_attention_dim))
21 | 
22 |         init.xavier_uniform(self.w_ta)
23 |         init.xavier_uniform(self.w_ja)
24 |         init.xavier_uniform(self.b)
25 |         init.xavier_uniform(self.v)
26 | 
27 |         self.softmax = nn.Softmax(dim=2)
28 | 
29 |     def forward(self, relation_hidden):
30 |         """
31 |         input:
32 |             relation_hidden: (batch_size, sent_len, relation_hidden_dim)
33 |         output:
34 |             attention_alpha: (batch_size, sent_len, sent_len), relation attention
35 |             scores softmax-normalized over the last dimension
36 |         """
37 |         batchSize = relation_hidden.size(0)
38 |         seqLen = relation_hidden.size(1)
39 | 
40 |         ta_result = F.linear(relation_hidden, self.w_ta, None).view(batchSize, seqLen, 1, self.relation_attention_dim).repeat(1, 1, seqLen,
41 |                                                                                                                                1)
42 |         ja_result = F.linear(relation_hidden, self.w_ja, None).view(batchSize, 1, seqLen, self.relation_attention_dim).repeat(1, seqLen, 1,
43 |                                                                                                                                1)
44 |         attention_alpha = torch.tanh(ta_result + ja_result + self.b)
45 |         attention_alpha = F.linear(attention_alpha, self.v, None)
46 | 
47 |         attention_alpha = self.softmax(attention_alpha.view(batchSize, seqLen, seqLen))
48 |         return attention_alpha
49 | 
--------------------------------------------------------------------------------
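
For reference, a minimal usage sketch of `RelationAttention` (assuming `relationAttention.py` above is on the import path; the dummy batch and dimension sizes are illustrative only):

```
import argparse
import torch
from relationAttention import RelationAttention

# Dummy hyper-parameters matching the defaults in main.py.
args = argparse.Namespace(relation_hidden_dim=250, relation_attention_dim=250)
attention = RelationAttention(args)

batch_size, seq_len = 2, 7
relation_hidden = torch.randn(batch_size, seq_len, args.relation_hidden_dim)
scores = attention(relation_hidden)

print(scores.shape)        # torch.Size([2, 7, 7])
print(scores.sum(dim=2))   # each row sums to ~1 (softmax over the sentence)
```

In `opinionMining.py`, this score matrix is multiplied by the padding mask matrix to form the R tensor that feeds the next synchronization step.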