├── .gitignore ├── .travis.yml ├── README.md ├── bert ├── extract_features.py ├── modeling.py └── tokenization.py ├── bert_embedding.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | multi_cased_L-12_H-768_A-12/ 2 | bert/multi_cased_L-12_H-768_A-12/ 3 | bert/multi_cased_L-12_H-768_A-12 4 | .ipynb_checkpoints/ 5 | bert/.ipynb_checkpoints/ 6 | bert/__pycache__/ 7 | __pycache__/ 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | script: bert_embedding 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BERT-embedding 2 | A simple wrapper class for extracting features(embedding) and comparing them using BERT 3 | 4 | ## How to Use 5 | 6 | ### Installation 7 | ```bash 8 | git clone https://github.com/seriousmac/BERT-embedding.git 9 | cd BERT-embedding 10 | pip install -r requirements.txt 11 | wget https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip 12 | unzip multi_cased_L-12_H-768_A-12.zip -d bert/ 13 | ``` 14 | 15 | ### Run a test 16 | ```bash 17 | python bert_embedding.py 18 | ``` 19 | 20 | ### Major functions 21 | - `bert.init()` #초기화 22 | 23 | - `bert.extract(sentence)` #모든 결과 추출, 아래의 input and output에서 입출력 구조 자세히 설명 24 | - `bert.extracts(sentences)` #string list를 입력받음 25 | 26 | - `bert.extract_v1(sentence)` #embedding 값만 추출 27 | - `bert.extracts_v1(sentences)` 28 | 29 | - `bert.cal_dif_cls(result1, result2)` #extract 혹은 extracts의 출력 결과를 이용하여 distance 계산 30 | - `bert.cal_dif_cls_layer(result1, result2, layer_num)` #위의 함수에서 특정 layer에 대해서만 계산 31 | 32 | 33 | ### Input and output 34 | - bert.extracts(sentences) 35 | - input: list of string 36 | - output: list of dict 37 | - 'features': 
입력한 문장 내 토큰 갯수 만큼의 list 38 | - 'token': 토큰 값 39 | - 'layers': list of layer dict 40 | - 'index': layer 번호 41 | - 'values': 768길이의 float값 list =extracting features(embedding) 42 | 43 | 44 | ## Examples 45 | 46 | ### Example 1 - 한 문장에서 embedding 추출하기 47 | ```python 48 | from bert_embedding import BERT 49 | 50 | bert = BERT() 51 | bert.init() 52 | 53 | sentence = "[OBS 독특한 연예뉴스 조연수 기자] 가수 겸 배우 수지가 '국민' 타이틀을 거머쥔 스타로 꼽혔다." 54 | result = bert.extract(sentence) 55 | ``` 56 | 57 | ### Example 2 - 여러 문장에서 embedding 추출하기 58 | ```python 59 | from bert_embedding import BERT 60 | 61 | bert = BERT() 62 | bert.init() 63 | 64 | sentences = ['‘세계의 공장’으로 막대한 달러를 쓸어담으며 경제력을 키웠던 중국의 좋은 시절도 오래가지 않을 듯>하다.', 65 | '자본 유출과 서비스 수지 적자 폭이 커지며 경상수지 적자를 향해 빠르게 다가가고 있어서다.', 66 | "[OBS 독특한 연예뉴스 조연수 기자] 가수 겸 배우 수지가 '국민' 타이틀을 거머쥔 스타로 꼽혔다.", 67 | "OBS '독특한 연예뉴스'(기획·연출·감수 윤경철, 작가 박은경·김현선)가 '국민 신드롬'을 일으킨 첫사랑의 아이콘 >김연아, 수지, 설현의 근황을 살펴봤다."] 68 | results = bert.extracts(sentences) 69 | ``` 70 | 71 | 72 | ### Example 3 - CLS만 이용해 distance가 가장 가까운 문장 찾기 73 | ```python 74 | from bert_embedding import BERT 75 | 76 | bert = BERT() 77 | bert.init() 78 | 79 | sentences = ['‘세계의 공장’으로 막대한 달러를 쓸어담으며 경제력을 키웠던 중국의 좋은 시절도 오래가지 않을 듯>하다.', 80 | '자본 유출과 서비스 수지 적자 폭이 커지며 경상수지 적자를 향해 빠르게 다가가고 있어서다.', 81 | '[OBS 독특한 연예뉴스 조연수 기자] 가수 겸 배우 수지가 국민 타이틀을 거머쥔 스타로 꼽혔다.', 82 | 'OBS 독특한 연예뉴스(기획·연출·감수 윤경철, 작가 박은경·김현선)가 국민 신드롬을 일으킨 첫사랑의 아이콘 >김연아, 수지, 설현의 근황을 살펴봤다.', 83 | '오늘은 날씨가 좋습니다. 맛집을 찾아 가볼까요? 
아이들이 좋아하더라구요.', 84 | '보쌈집에서는 보쌈을 맛있게 하면 그만입니다.ㅋㅋ'] 85 | 86 | results = bert.extracts(sentences) 87 | 88 | distances = [] 89 | for i in range(len(results)): 90 | distance = [] 91 | for j in range(len(results)): 92 | if i == j: 93 | distance.append(99999) 94 | else: 95 | distance.append(bert.cal_dif_cls(results[i], results[j])) 96 | distances.append(distance) 97 | 98 | for idx in range(len(sentences)): 99 | print(sentences[idx]) 100 | print(sentences[distances[idx].index(min(distances[idx]))]) 101 | print() 102 | ``` 103 | 104 | 출력 결과 105 | ``` 106 | ‘세계의 공장’으로 막대한 달러를 쓸어담으며 경제력을 키웠던 중국의 좋은 시절도 오래가지 않을 듯>하다. 107 | 자본 유출과 서비스 수지 적자 폭이 커지며 경상수지 적자를 향해 빠르게 다가가고 있어서다. 108 | 109 | 자본 유출과 서비스 수지 적자 폭이 커지며 경상수지 적자를 향해 빠르게 다가가고 있어서다. 110 | ‘세계의 공장’으로 막대한 달러를 쓸어담으며 경제력을 키웠던 중국의 좋은 시절도 오래가지 않을 듯>하다. 111 | 112 | [OBS 독특한 연예뉴스 조연수 기자] 가수 겸 배우 수지가 국민 타이틀을 거머쥔 스타로 꼽혔다. 113 | OBS 독특한 연예뉴스(기획·연출·감수 윤경철, 작가 박은경·김현선)가 국민 신드롬을 일으킨 첫사랑의 아이콘 >김연아, 수지, 설현의 근황을 살펴봤다. 114 | 115 | OBS 독특한 연예뉴스(기획·연출·감수 윤경철, 작가 박은경·김현선)가 국민 신드롬을 일으킨 첫사랑의 아이콘 >김연아, 수지, 설현의 근황을 살펴봤다. 116 | [OBS 독특한 연예뉴스 조연수 기자] 가수 겸 배우 수지가 국민 타이틀을 거머쥔 스타로 꼽혔다. 117 | 118 | 오늘은 날씨가 좋습니다. 맛집을 찾아 가볼까요? 아이들이 좋아하더라구요. 119 | 보쌈집에서는 보쌈을 맛있게 하면 그만입니다.ㅋㅋ 120 | 121 | 보쌈집에서는 보쌈을 맛있게 하면 그만입니다.ㅋㅋ 122 | 오늘은 날씨가 좋습니다. 맛집을 찾아 가볼까요? 아이들이 좋아하더라구요. 
123 | ``` 124 | 125 | ### Example 4 - 문장 내 특정 token의 embedding만 비교하기 126 | 127 | ```python 128 | from bert_embedding import BERT 129 | bert = BERT() 130 | bert.init() 131 | 132 | sentences = ["마치 화보 컷을 방불케 한 이번 이미지는 해외 로케촬영 시 촬영된 컷으로 특히, 옐로우 컬러의 레트로한 틴트선글라스를 착용한 채 지프차를 운전하는 수지의 모습에서 기존의 청순한 모습과는 다른 도회적인 분위기와 한층 성숙해진 모습을 보여주며 극 중 캐릭터에 대한 기대감을 높였다.", 133 | '자본 유출과 서비스 수지 적자 폭이 커지며 경상 수지 적자를 향해 빠르게 다가가고 있어서다.', 134 | "[조연수 기자] 가수 겸 배우 수지가 국민 타이틀을 거머쥔 스타로 꼽혔다."] 135 | 136 | results = bert.extracts(sentences) 137 | 138 | for i in range(len(results)): 139 | for j in range(len(results)): 140 | print(sentences[i]) 141 | print(sentences[j]) 142 | bert.cal_dif_keyword(results[i], results[j], '수지') 143 | ``` 144 | 145 | 146 | 147 | ## To Do List 148 | - [x] Define class 149 | - [x] embedding 쉽게 추출하기 150 | - [x] CLS만을 이용해 문장의 distance 계산하기 151 | - [x] 문장 내 모든 token들의 embedding을 이용해 distance 계산하기 152 | - [x] 문장 내 특정 token만 비교하기 (예: 경제의 '수지'와 연예의 '수지' 값의 차이 확인하기) 153 | -------------------------------------------------------------------------------- /bert/extract_features.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Extract pre-computed feature vectors from BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import codecs 22 | import collections 23 | import json 24 | import re 25 | 26 | from . import modeling 27 | from . import tokenization 28 | import tensorflow as tf 29 | 30 | flags = tf.flags 31 | 32 | FLAGS = flags.FLAGS 33 | 34 | flags.DEFINE_string("input_file", None, "") 35 | 36 | flags.DEFINE_string("output_file", None, "") 37 | 38 | flags.DEFINE_string("layers", "-1,-2,-3,-4", "") 39 | 40 | flags.DEFINE_string( 41 | "bert_config_file", None, 42 | "The config json file corresponding to the pre-trained BERT model. " 43 | "This specifies the model architecture.") 44 | 45 | flags.DEFINE_integer( 46 | "max_seq_length", 128, 47 | "The maximum total input sequence length after WordPiece tokenization. " 48 | "Sequences longer than this will be truncated, and sequences shorter " 49 | "than this will be padded.") 50 | 51 | flags.DEFINE_string( 52 | "init_checkpoint", None, 53 | "Initial checkpoint (usually from a pre-trained BERT model).") 54 | 55 | flags.DEFINE_string("vocab_file", None, 56 | "The vocabulary file that the BERT model was trained on.") 57 | 58 | flags.DEFINE_bool( 59 | "do_lower_case", True, 60 | "Whether to lower case the input text. Should be True for uncased " 61 | "models and False for cased models.") 62 | 63 | flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") 64 | 65 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 66 | 67 | flags.DEFINE_string("master", None, 68 | "If using a TPU, the address of the master.") 69 | 70 | flags.DEFINE_integer( 71 | "num_tpu_cores", 8, 72 | "Only used if `use_tpu` is True. 
class InputFeatures(object):
  """Container for the model-ready encoding of one example.

  Every constructor argument is stored unchanged (no copy is made) on the
  attribute of the same name.
  """

  def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
    # Identifier of the example and the token strings it was split into.
    self.unique_id, self.tokens = unique_id, tokens
    # The three per-position sequences that accompany the tokens.
    self.input_ids, self.input_mask, self.input_type_ids = (
        input_ids, input_mask, input_type_ids)
def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
                     use_one_hot_embeddings):
  """Returns `model_fn` closure for TPUEstimator.

  Args:
    bert_config: Passed as `config` to `modeling.BertModel`.
    init_checkpoint: Checkpoint path handed to
      `modeling.get_assignment_map_from_checkpoint` and
      `tf.train.init_from_checkpoint`.
    layer_indexes: Iterable of indexes into the list returned by
      `BertModel.get_all_encoder_layers()`; one prediction entry is emitted
      per index, in the given order.
    use_tpu: If true, checkpoint initialization is deferred to a scaffold
      function instead of being performed while the graph is built.
    use_one_hot_embeddings: Forwarded unchanged to `modeling.BertModel`.

  Returns:
    A `model_fn(features, labels, mode, params)` that builds the model in
    inference mode and returns a `TPUEstimatorSpec` containing predictions
    only.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Reads "unique_ids", "input_ids", "input_mask" and "input_type_ids" from
    `features`; `labels` and `params` are not used.

    Raises:
      ValueError: If `mode` is not `tf.estimator.ModeKeys.PREDICT`.
    """

    unique_ids = features["unique_ids"]
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    input_type_ids = features["input_type_ids"]

    # The model is always built with is_training=False.
    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=input_type_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Note: this check runs after the model graph has already been built.
    if mode != tf.estimator.ModeKeys.PREDICT:
      raise ValueError("Only PREDICT modes are supported: %s" % (mode))

    tvars = tf.trainable_variables()
    scaffold_fn = None
    # Work out which trainable variables have a same-named checkpoint entry.
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, init_checkpoint)
    if use_tpu:

      # On TPU the checkpoint restore is wrapped in a function that is passed
      # to the estimator spec as `scaffold_fn` rather than executed here.
      def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

      scaffold_fn = tpu_scaffold
    else:
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    # Log every trainable variable, flagging the ones found in the checkpoint.
    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    all_layers = model.get_all_encoder_layers()

    # "unique_id" lets the caller match each prediction to its input feature.
    predictions = {
        "unique_id": unique_ids,
    }

    # Keys are numbered by position in `layer_indexes` (layer_output_0, ...),
    # not by the layer index itself.
    for (i, layer_index) in enumerate(layer_indexes):
      predictions["layer_output_%d" % i] = all_layers[layer_index]

    output_spec = tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn
The embedding vectors for `type=0` and 241 | # `type=1` were learned during pre-training and are added to the wordpiece 242 | # embedding vector (and position vector). This is not *strictly* necessary 243 | # since the [SEP] token unambiguously separates the sequences, but it makes 244 | # it easier for the model to learn the concept of sequences. 245 | # 246 | # For classification tasks, the first vector (corresponding to [CLS]) is 247 | # used as as the "sentence vector". Note that this only makes sense because 248 | # the entire model is fine-tuned. 249 | tokens = [] 250 | input_type_ids = [] 251 | tokens.append("[CLS]") 252 | input_type_ids.append(0) 253 | for token in tokens_a: 254 | tokens.append(token) 255 | input_type_ids.append(0) 256 | tokens.append("[SEP]") 257 | input_type_ids.append(0) 258 | 259 | if tokens_b: 260 | for token in tokens_b: 261 | tokens.append(token) 262 | input_type_ids.append(1) 263 | tokens.append("[SEP]") 264 | input_type_ids.append(1) 265 | 266 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 267 | 268 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 269 | # tokens are attended to. 270 | input_mask = [1] * len(input_ids) 271 | 272 | # Zero-pad up to the sequence length. 
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
  """Truncates a sequence pair in place to the maximum length.

  Tokens are removed one at a time from the end of whichever list is
  currently longer (from `tokens_b` when both have the same length) until
  `len(tokens_a) + len(tokens_b) <= max_length`.

  Args:
    tokens_a: Mutable list of tokens for the first sequence; modified in place.
    tokens_b: Mutable list of tokens for the second sequence; modified in
      place.
    max_length: Maximum allowed combined length. If it is negative, both
      lists are emptied and the function returns (instead of raising
      `IndexError` from popping an empty list).

  Returns:
    None. The result is the mutation of `tokens_a` and `tokens_b`.
  """

  # This is a simple heuristic which will always truncate the longer sequence
  # one token at a time. This makes more sense than truncating an equal percent
  # of tokens from each, since if one sequence is very short then each token
  # that's truncated likely contains more information than a longer sequence.
  while len(tokens_a) + len(tokens_b) > max_length:
    longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
    if not longer:
      # Both lists are empty, which is only reachable when max_length < 0;
      # nothing more can be removed.
      break
    longer.pop()
use_tpu=FLAGS.use_tpu, 374 | use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) 375 | 376 | # If TPU is not available, this will fall back to normal Estimator on CPU 377 | # or GPU. 378 | estimator = tf.contrib.tpu.TPUEstimator( 379 | use_tpu=FLAGS.use_tpu, 380 | model_fn=model_fn, 381 | config=run_config, 382 | predict_batch_size=FLAGS.batch_size) 383 | 384 | input_fn = input_fn_builder( 385 | features=features, seq_length=FLAGS.max_seq_length) 386 | 387 | with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, 388 | "w")) as writer: 389 | for result in estimator.predict(input_fn, yield_single_examples=True): 390 | unique_id = int(result["unique_id"]) 391 | feature = unique_id_to_feature[unique_id] 392 | output_json = collections.OrderedDict() 393 | output_json["linex_index"] = unique_id 394 | all_features = [] 395 | for (i, token) in enumerate(feature.tokens): 396 | all_layers = [] 397 | for (j, layer_index) in enumerate(layer_indexes): 398 | layer_output = result["layer_output_%d" % j] 399 | layers = collections.OrderedDict() 400 | layers["index"] = layer_index 401 | layers["values"] = [ 402 | round(float(x), 6) for x in layer_output[i:(i + 1)].flat 403 | ] 404 | all_layers.append(layers) 405 | features = collections.OrderedDict() 406 | features["token"] = token 407 | features["layers"] = all_layers 408 | all_features.append(features) 409 | output_json["features"] = all_features 410 | writer.write(json.dumps(output_json) + "\n") 411 | 412 | 413 | if __name__ == "__main__": 414 | flags.mark_flag_as_required("input_file") 415 | flags.mark_flag_as_required("vocab_file") 416 | flags.mark_flag_as_required("bert_config_file") 417 | flags.mark_flag_as_required("init_checkpoint") 418 | flags.mark_flag_as_required("output_file") 419 | tf.app.run() 420 | -------------------------------------------------------------------------------- /bert/modeling.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 
class BertConfig(object):
  """Configuration for `BertModel`.

  Stores the model hyperparameters as plain instance attributes. Instances
  can be built from a dictionary or a JSON file and serialized back to a
  dictionary or a JSON string.
  """

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    """Constructs BertConfig.

    Args:
      vocab_size: Vocabulary size of `inputs_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer in
        the Transformer encoder.
      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
      hidden_act: The non-linear activation function (function or string) in the
        encoder and pooler.
      hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
      max_position_embeddings: The maximum sequence length that this model might
        ever be used with. Typically set this to something large just in case
        (e.g., 512 or 1024 or 2048).
      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: The stdev of the truncated_normal_initializer for
        initializing all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    """Constructs a config from a Python dictionary of parameters.

    Args:
      json_object: Mapping of attribute name to value. Every key is set on the
        instance, overriding the constructor defaults; keys that are not
        constructor arguments are stored as extra attributes.

    Returns:
      An instance of `cls` (so subclasses get an instance of their own type).
    """
    # Instantiate through `cls` so that subclasses calling from_dict() or
    # from_json_file() do not silently receive a base-class object.
    config = cls(vocab_size=None)
    for (key, value) in json_object.items():
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a config from a json file of parameters.

    Args:
      json_file: Path of the JSON file, opened with `tf.gfile.GFile`.

    Returns:
      An instance of `cls` populated via `from_dict`.
    """
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Serializes this instance to a Python dictionary (a deep copy)."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string ending with a newline."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
155 | """ 156 | config = copy.deepcopy(config) 157 | if not is_training: 158 | config.hidden_dropout_prob = 0.0 159 | config.attention_probs_dropout_prob = 0.0 160 | 161 | input_shape = get_shape_list(input_ids, expected_rank=2) 162 | batch_size = input_shape[0] 163 | seq_length = input_shape[1] 164 | 165 | if input_mask is None: 166 | input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) 167 | 168 | if token_type_ids is None: 169 | token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) 170 | 171 | with tf.variable_scope(scope, default_name="bert"): 172 | with tf.variable_scope("embeddings"): 173 | # Perform embedding lookup on the word ids. 174 | (self.embedding_output, self.embedding_table) = embedding_lookup( 175 | input_ids=input_ids, 176 | vocab_size=config.vocab_size, 177 | embedding_size=config.hidden_size, 178 | initializer_range=config.initializer_range, 179 | word_embedding_name="word_embeddings", 180 | use_one_hot_embeddings=use_one_hot_embeddings) 181 | 182 | # Add positional embeddings and token type embeddings, then layer 183 | # normalize and perform dropout. 184 | self.embedding_output = embedding_postprocessor( 185 | input_tensor=self.embedding_output, 186 | use_token_type=True, 187 | token_type_ids=token_type_ids, 188 | token_type_vocab_size=config.type_vocab_size, 189 | token_type_embedding_name="token_type_embeddings", 190 | use_position_embeddings=True, 191 | position_embedding_name="position_embeddings", 192 | initializer_range=config.initializer_range, 193 | max_position_embeddings=config.max_position_embeddings, 194 | dropout_prob=config.hidden_dropout_prob) 195 | 196 | with tf.variable_scope("encoder"): 197 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 198 | # mask of shape [batch_size, seq_length, seq_length] which is used 199 | # for the attention scores. 
200 | attention_mask = create_attention_mask_from_input_mask( 201 | input_ids, input_mask) 202 | 203 | # Run the stacked transformer. 204 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 205 | self.all_encoder_layers = transformer_model( 206 | input_tensor=self.embedding_output, 207 | attention_mask=attention_mask, 208 | hidden_size=config.hidden_size, 209 | num_hidden_layers=config.num_hidden_layers, 210 | num_attention_heads=config.num_attention_heads, 211 | intermediate_size=config.intermediate_size, 212 | intermediate_act_fn=get_activation(config.hidden_act), 213 | hidden_dropout_prob=config.hidden_dropout_prob, 214 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 215 | initializer_range=config.initializer_range, 216 | do_return_all_layers=True) 217 | 218 | self.sequence_output = self.all_encoder_layers[-1] 219 | # The "pooler" converts the encoded sequence tensor of shape 220 | # [batch_size, seq_length, hidden_size] to a tensor of shape 221 | # [batch_size, hidden_size]. This is necessary for segment-level 222 | # (or segment-pair-level) classification tasks where we need a fixed 223 | # dimensional representation of the segment. 224 | with tf.variable_scope("pooler"): 225 | # We "pool" the model by simply taking the hidden state corresponding 226 | # to the first token. We assume that this has been pre-trained 227 | first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) 228 | self.pooled_output = tf.layers.dense( 229 | first_token_tensor, 230 | config.hidden_size, 231 | activation=tf.tanh, 232 | kernel_initializer=create_initializer(config.initializer_range)) 233 | 234 | def get_pooled_output(self): 235 | return self.pooled_output 236 | 237 | def get_sequence_output(self): 238 | """Gets final hidden layer of encoder. 239 | 240 | Returns: 241 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 242 | to the final hidden of the transformer encoder. 
def gelu(x):
  """Gaussian Error Linear Unit.

  A smoother variant of the RELU, computed with the tanh-based formula
  below. Original paper: https://arxiv.org/abs/1606.08415

  Args:
    x: float Tensor to perform activation.

  Returns:
    `x` with the GELU activation applied.
  """
  # x + 0.044715 * x^3
  cubic = x + 0.044715 * tf.pow(x, 3)
  # Scale by sqrt(2 / pi) before the tanh.
  scaled = np.sqrt(2 / np.pi) * cubic
  cdf = 0.5 * (1.0 + tf.tanh(scaled))
  return x * cdf
def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
  """Find the checkpoint variables that also exist among `tvars`.

  Despite the historical "union" wording, the result is the intersection:
  only names present both in the checkpoint and in `tvars` are kept.

  Args:
    tvars: iterable of variables; each must expose a `.name` attribute. A
      trailing ":<digits>" suffix on the name is ignored when matching.
    init_checkpoint: checkpoint location handed to `tf.train.list_variables`.

  Returns:
    A tuple `(assignment_map, initialized_variable_names)`:
      assignment_map: OrderedDict mapping each matched checkpoint variable
        name to itself, in the order the checkpoint lists them.
      initialized_variable_names: dict whose keys are every matched name and
        that name with ":0" appended; every value is 1.
  """
  # Index the current variables by name with any ":<digits>" suffix removed.
  current_by_name = collections.OrderedDict()
  for variable in tvars:
    full_name = variable.name
    suffix_match = re.match("^(.*):\\d+$", full_name)
    key = full_name if suffix_match is None else suffix_match.group(1)
    current_by_name[key] = variable

  assignment_map = collections.OrderedDict()
  initialized_variable_names = {}
  for entry in tf.train.list_variables(init_checkpoint):
    # Each entry is indexed as (name, <second field>); only the name is used.
    ckpt_name, _ = entry[0], entry[1]
    if ckpt_name in current_by_name:
      assignment_map[ckpt_name] = ckpt_name
      initialized_variable_names[ckpt_name] = 1
      initialized_variable_names[ckpt_name + ":0"] = 1

  return (assignment_map, initialized_variable_names)
def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Apply `layer_norm` to `input_tensor`, then `dropout` to the result.

  Args:
    input_tensor: Tensor passed to `layer_norm`.
    dropout_prob: drop probability forwarded to `dropout`.
    name: optional scope name forwarded to `layer_norm`.

  Returns:
    The normalized tensor after dropout has been applied.
  """
  normalized = layer_norm(input_tensor, name)
  return dropout(normalized, dropout_prob)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  """Performs various post-processing on a word embedding tensor.

  Optionally adds token-type embeddings and position embeddings to
  `input_tensor`, then applies layer normalization followed by dropout.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      # The two literals are concatenated, so the first one must end with a
      # space to keep the words of the message apart.
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    # Fail at run time if the sequence is longer than the position table.
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
      # perform a slice.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`), so
      # we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = [1] * (num_dims - 2) + [seq_length, width]
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled by
  `1 / sqrt(size_per_head)`. These are softmaxed to obtain attention
  probabilities. The value tensors are then interpolated by these
  probabilities, then concatenated back to a single tensor and returned.

  In practice, the multi-headed attention is done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width]. A rank 2 tensor is also accepted (see `batch_size`).
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      Must have the same rank as `from_tensor`.
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores have -10000.0 added (effectively -infinity once
      softmaxed) for any positions in the mask that are 0, and are unchanged
      for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    do_return_2d_tensor: bool. If True, the output will be of shape [batch_size
      * from_seq_length, num_attention_heads * size_per_head]. If False, the
      output will be of shape [batch_size, from_seq_length, num_attention_heads
      * size_per_head].
    batch_size: (Optional) int. Required when the inputs are rank 2: the batch
      size of the 3D version of `from_tensor` and `to_tensor`. Ignored (and
      overwritten from the tensor shape) when the inputs are rank 3.
    from_seq_length: (Optional) Required when the inputs are rank 2: the seq
      length of the 3D version of `from_tensor`. Overwritten for rank 3 inputs.
    to_seq_length: (Optional) Required when the inputs are rank 2: the seq
      length of the 3D version of `to_tensor`. Overwritten for rank 3 inputs.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: If the ranks of `from_tensor` and `to_tensor` differ, or if
      rank 2 tensors are passed without `batch_size`, `from_seq_length` and
      `to_seq_length` all being specified.
  """

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    # Splits the last dimension into heads, then moves the head axis in front
    # of the sequence axis:
    # [B*S, N*W] -> [B, S, N, W] -> [B, N, S, W].
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])

    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    # Rank 3 inputs carry their own dimensions; any caller-supplied values for
    # these three arguments are replaced.
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    # Rank 2 inputs have batch and sequence folded together, so the caller
    # must say how to unfold them.
    if (batch_size is None or from_seq_length is None or to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # The three projections are created in query, key, value order under the
  # layer names "query", "key" and "value".

  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # `query_layer` = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores, scaled by 1 / sqrt(H).
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # Insert a head axis so the mask broadcasts over all N heads.
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # The value tensor gets the same reshape + transpose as query/key, written
  # out inline here instead of via `transpose_for_scores`.
  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer
780 | hidden_size: int. Hidden size of the Transformer. 781 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 782 | num_attention_heads: int. Number of attention heads in the Transformer. 783 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 784 | forward) layer. 785 | intermediate_act_fn: function. The non-linear activation function to apply 786 | to the output of the intermediate/feed-forward layer. 787 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 788 | attention_probs_dropout_prob: float. Dropout probability of the attention 789 | probabilities. 790 | initializer_range: float. Range of the initializer (stddev of truncated 791 | normal). 792 | do_return_all_layers: Whether to also return all layers or just the final 793 | layer. 794 | 795 | Returns: 796 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 797 | hidden layer of the Transformer. 798 | 799 | Raises: 800 | ValueError: A Tensor shape or parameter is invalid. 801 | """ 802 | if hidden_size % num_attention_heads != 0: 803 | raise ValueError( 804 | "The hidden size (%d) is not a multiple of the number of attention " 805 | "heads (%d)" % (hidden_size, num_attention_heads)) 806 | 807 | attention_head_size = int(hidden_size / num_attention_heads) 808 | input_shape = get_shape_list(input_tensor, expected_rank=3) 809 | batch_size = input_shape[0] 810 | seq_length = input_shape[1] 811 | input_width = input_shape[2] 812 | 813 | # The Transformer performs sum residuals on all layers so the input needs 814 | # to be the same as the hidden size. 815 | if input_width != hidden_size: 816 | raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % 817 | (input_width, hidden_size)) 818 | 819 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 820 | # forth from a 3D tensor to a 2D tensor. 
Re-shapes are normally free on 821 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 822 | # help the optimizer. 823 | prev_output = reshape_to_matrix(input_tensor) 824 | 825 | all_layer_outputs = [] 826 | for layer_idx in range(num_hidden_layers): 827 | with tf.variable_scope("layer_%d" % layer_idx): 828 | layer_input = prev_output 829 | 830 | with tf.variable_scope("attention"): 831 | attention_heads = [] 832 | with tf.variable_scope("self"): 833 | attention_head = attention_layer( 834 | from_tensor=layer_input, 835 | to_tensor=layer_input, 836 | attention_mask=attention_mask, 837 | num_attention_heads=num_attention_heads, 838 | size_per_head=attention_head_size, 839 | attention_probs_dropout_prob=attention_probs_dropout_prob, 840 | initializer_range=initializer_range, 841 | do_return_2d_tensor=True, 842 | batch_size=batch_size, 843 | from_seq_length=seq_length, 844 | to_seq_length=seq_length) 845 | attention_heads.append(attention_head) 846 | 847 | attention_output = None 848 | if len(attention_heads) == 1: 849 | attention_output = attention_heads[0] 850 | else: 851 | # In the case where we have other sequences, we just concatenate 852 | # them to the self-attention head before the projection. 853 | attention_output = tf.concat(attention_heads, axis=-1) 854 | 855 | # Run a linear projection of `hidden_size` then add a residual 856 | # with `layer_input`. 857 | with tf.variable_scope("output"): 858 | attention_output = tf.layers.dense( 859 | attention_output, 860 | hidden_size, 861 | kernel_initializer=create_initializer(initializer_range)) 862 | attention_output = dropout(attention_output, hidden_dropout_prob) 863 | attention_output = layer_norm(attention_output + layer_input) 864 | 865 | # The activation is only applied to the "intermediate" hidden layer. 
866 | with tf.variable_scope("intermediate"): 867 | intermediate_output = tf.layers.dense( 868 | attention_output, 869 | intermediate_size, 870 | activation=intermediate_act_fn, 871 | kernel_initializer=create_initializer(initializer_range)) 872 | 873 | # Down-project back to `hidden_size` then add the residual. 874 | with tf.variable_scope("output"): 875 | layer_output = tf.layers.dense( 876 | intermediate_output, 877 | hidden_size, 878 | kernel_initializer=create_initializer(initializer_range)) 879 | layer_output = dropout(layer_output, hidden_dropout_prob) 880 | layer_output = layer_norm(layer_output + attention_output) 881 | prev_output = layer_output 882 | all_layer_outputs.append(layer_output) 883 | 884 | if do_return_all_layers: 885 | final_outputs = [] 886 | for layer_output in all_layer_outputs: 887 | final_output = reshape_from_matrix(layer_output, input_shape) 888 | final_outputs.append(final_output) 889 | return final_outputs 890 | else: 891 | final_output = reshape_from_matrix(prev_output, input_shape) 892 | return final_output 893 | 894 | 895 | def get_shape_list(tensor, expected_rank=None, name=None): 896 | """Returns a list of the shape of tensor, preferring static dimensions. 897 | 898 | Args: 899 | tensor: A tf.Tensor object to find the shape of. 900 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 901 | specified and the `tensor` has a different rank, and exception will be 902 | thrown. 903 | name: Optional name of the tensor for the error message. 904 | 905 | Returns: 906 | A list of dimensions of the shape of tensor. All static dimensions will 907 | be returned as python integers, and dynamic dimensions will be returned 908 | as tf.Tensor scalars. 
def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or iterable of integers, expected rank(s).
    name: Optional name of the tensor for the error message. Defaults to
      `tensor.name`, which is only looked up when an error is raised.

  Raises:
    ValueError: If the expected shape doesn't match the actual shape. This
      includes a tensor whose rank is unknown (`tensor.shape.ndims` is None).
  """
  if isinstance(expected_rank, six.integer_types):
    allowed_ranks = {expected_rank}
  else:
    allowed_ranks = set(expected_rank)

  actual_rank = tensor.shape.ndims
  if actual_rank not in allowed_ranks:
    if name is None:
      name = tensor.name
    scope_name = tf.get_variable_scope().name
    # `actual_rank` is formatted with %s rather than %d: it is None when the
    # rank is unknown, and %d would then raise TypeError instead of the
    # intended ValueError.
    raise ValueError(
        "For the tensor `%s` in scope `%s`, the actual rank "
        "`%s` (shape = %s) is not equal to the expected rank `%s`" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
  """Checks whether the casing config is consistent with the checkpoint name.

  The casing flag is supplied by the user and is not stored with the
  checkpoint, so the model name is extracted heuristically from the directory
  that contains `bert_model.ckpt` and compared against lists of known models.
  Nothing is checked when `init_checkpoint` is empty, does not match that
  path pattern, or names a model that is not in either list.

  Args:
    do_lower_case: truthy if the caller lower-cases the input text.
    init_checkpoint: path of the checkpoint, or a falsy value for none.

  Raises:
    ValueError: if a known lower-cased model is used with
      `do_lower_case` false, or a known cased model with it true.
  """
  if not init_checkpoint:
    return

  match = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
  if match is None:
    return
  model_name = match.group(1)

  lower_models = [
      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
  ]
  cased_models = [
      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
      "multi_cased_L-12_H-768_A-12"
  ]

  # Only one mismatch is possible for a given flag value: lower-casing a
  # cased model, or not lower-casing a lower-cased one.
  if do_lower_case:
    if model_name not in cased_models:
      return
    actual_flag, case_name, opposite_flag = "True", "cased", "False"
  else:
    if model_name not in lower_models:
      return
    actual_flag, case_name, opposite_flag = "False", "lowercased", "True"

  raise ValueError(
      "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
      "However, `%s` seems to be a %s model, so you "
      "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
      "how the model was pre-training. If this error is wrong, please "
      "just comment out this check." % (actual_flag, init_checkpoint,
                                        model_name, case_name, opposite_flag))
class BasicTokenizer(object):
  """Coarse tokenizer: cleanup, CJK isolation, casing, punctuation splitting.

  `tokenize` runs these steps in order: drop invalid/control characters and
  normalise whitespace, surround CJK ideographs with spaces, split on
  whitespace, optionally lower-case and strip accents, then split punctuation
  into stand-alone tokens.
  """

  def __init__(self, do_lower_case=True):
    """Creates the tokenizer.

    Args:
      do_lower_case: When true, `tokenize` lower-cases every token and removes
        its combining accent marks.
    """
    self.do_lower_case = do_lower_case

  def tokenize(self, text):
    """Returns the list of basic tokens found in `text`."""
    cleaned = self._clean_text(convert_to_unicode(text))

    # CJK isolation was introduced for the multilingual and Chinese models.
    # It is applied unconditionally; per the upstream note this is harmless
    # for English models, which were not trained on Chinese data.
    spaced = self._tokenize_chinese_chars(cleaned)

    pieces = []
    for word in whitespace_tokenize(spaced):
      if self.do_lower_case:
        # Accent stripping only happens together with lower-casing.
        word = self._run_strip_accents(word.lower())
      pieces.extend(self._run_split_on_punc(word))

    # Re-split so that no empty or whitespace-bearing piece survives.
    return whitespace_tokenize(" ".join(pieces))

  def _run_strip_accents(self, text):
    """Returns `text` in NFD form with all "Mn" (nonspacing mark) chars removed."""
    decomposed = unicodedata.normalize("NFD", text)
    return "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn")

  def _run_split_on_punc(self, text):
    """Splits `text` so that each punctuation character is its own piece.

    Runs of non-punctuation characters are kept together; the pieces are
    returned in their original order.
    """
    groups = []
    need_new_group = True
    for ch in text:
      if _is_punctuation(ch):
        groups.append([ch])
        need_new_group = True
        continue
      if need_new_group:
        groups.append([])
        need_new_group = False
      groups[-1].append(ch)
    return ["".join(group) for group in groups]

  def _tokenize_chinese_chars(self, text):
    """Returns `text` with a space inserted on both sides of each CJK char."""
    return "".join(
        " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
        for ch in text)

  def _is_chinese_char(self, cp):
    """Reports whether code point `cp` lies in one of the CJK ideograph ranges.

    "Chinese character" here means anything in the CJK Unicode blocks:
    https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)

    Despite the name, those blocks do not contain all Japanese and Korean
    characters: Hangul, Hiragana and Katakana live in other blocks. They are
    used to write space-separated words, so they get no special treatment.
    """
    cjk_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )
    for low, high in cjk_ranges:
      if low <= cp <= high:
        return True
    return False

  def _clean_text(self, text):
    """Drops NUL, U+FFFD and control chars; maps every whitespace char to " "."""
    kept = []
    for ch in text:
      if ord(ch) in (0, 0xfffd) or _is_control(ch):
        continue
      kept.append(" " if _is_whitespace(ch) else ch)
    return "".join(kept)
"""Tokenizes a piece of text into its word pieces. 310 | 311 | This uses a greedy longest-match-first algorithm to perform tokenization 312 | using the given vocabulary. 313 | 314 | For example: 315 | input = "unaffable" 316 | output = ["un", "##aff", "##able"] 317 | 318 | Args: 319 | text: A single token or whitespace separated tokens. This should have 320 | already been passed through `BasicTokenizer. 321 | 322 | Returns: 323 | A list of wordpiece tokens. 324 | """ 325 | 326 | text = convert_to_unicode(text) 327 | 328 | output_tokens = [] 329 | for token in whitespace_tokenize(text): 330 | chars = list(token) 331 | if len(chars) > self.max_input_chars_per_word: 332 | output_tokens.append(self.unk_token) 333 | continue 334 | 335 | is_bad = False 336 | start = 0 337 | sub_tokens = [] 338 | while start < len(chars): 339 | end = len(chars) 340 | cur_substr = None 341 | while start < end: 342 | substr = "".join(chars[start:end]) 343 | if start > 0: 344 | substr = "##" + substr 345 | if substr in self.vocab: 346 | cur_substr = substr 347 | break 348 | end -= 1 349 | if cur_substr is None: 350 | is_bad = True 351 | break 352 | sub_tokens.append(cur_substr) 353 | start = end 354 | 355 | if is_bad: 356 | output_tokens.append(self.unk_token) 357 | else: 358 | output_tokens.extend(sub_tokens) 359 | return output_tokens 360 | 361 | 362 | def _is_whitespace(char): 363 | """Checks whether `chars` is a whitespace character.""" 364 | # \t, \n, and \r are technically contorl characters but we treat them 365 | # as whitespace since they are generally considered as such. 366 | if char == " " or char == "\t" or char == "\n" or char == "\r": 367 | return True 368 | cat = unicodedata.category(char) 369 | if cat == "Zs": 370 | return True 371 | return False 372 | 373 | 374 | def _is_control(char): 375 | """Checks whether `chars` is a control character.""" 376 | # These are technically control characters but we count them as whitespace 377 | # characters. 
class BERT(object):
    """Wrapper around a BERT estimator for extracting and comparing embeddings.

    `init()` must be called before any `extract*` method or
    `cal_dif_keyword`, because it creates `self._estimator` and
    `self._tokenizer`.
    """

    def init(self):
        """Builds the estimator and tokenizer from the module-level model paths."""
        bert_config = modeling.BertConfig.from_json_file(CONFIG)
        model_fn = model_fn_builder(bert_config=bert_config,
                                    init_checkpoint=CKPT,
                                    layer_indexes=_layers,
                                    use_tpu=False,
                                    use_one_hot_embeddings=False)
        run_config = tf.contrib.tpu.RunConfig(
            master=None,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=8,
                per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
        self._estimator = tf.contrib.tpu.TPUEstimator(model_fn=model_fn,
                                                      model_dir=MODEL_DIR,
                                                      use_tpu=False,
                                                      predict_batch_size=32,
                                                      config=run_config)
        self._tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB, do_lower_case=False)

    def _predict(self, sentences):
        """Runs the estimator over `sentences`.

        Returns:
            A `(features, predictions)` pair: the list produced by
            `convert_examples_to_features` and the iterator returned by the
            estimator's `predict`.
        """
        examples = [InputExample(unique_id=idx, text_a=sentence, text_b=None)
                    for idx, sentence in enumerate(sentences)]
        features = convert_examples_to_features(examples=examples,
                                                seq_length=_max_seq_length,
                                                tokenizer=self._tokenizer)
        input_fn = input_fn_builder(features=features,
                                    seq_length=_max_seq_length)
        return features, self._estimator.predict(input_fn)

    @staticmethod
    def _build_output(feature, output):
        """Converts one raw prediction into the nested `features` structure.

        Returns:
            An OrderedDict `{"features": [{"token": ..., "layers": [{"index":
            ..., "values": [...]}, ...]}, ...]}` with one entry per token of
            `feature.tokens` and one layer entry per element of `_layers`.
        """
        token_entries = []
        for pos, token in enumerate(feature.tokens):
            layer_entries = []
            for j, layer_index in enumerate(_layers):
                layer_output = output["layer_output_%d" % j]
                layer_entry = collections.OrderedDict()
                layer_entry["index"] = layer_index
                # Row `pos` of this layer's output, flattened and rounded.
                layer_entry["values"] = [
                    round(float(x), 6) for x in layer_output[pos:(pos + 1)].flat
                ]
                layer_entries.append(layer_entry)
            token_entry = collections.OrderedDict()
            token_entry["token"] = token
            token_entry["layers"] = layer_entries
            token_entries.append(token_entry)
        result = collections.OrderedDict()
        result["features"] = token_entries
        return result

    @staticmethod
    def _rms_diff(values1, values2):
        """Returns the root-mean-square of the element-wise difference.

        The mean is taken over the actual vector length (768 for the default
        model). Two empty vectors have distance 0.0.
        """
        diff = np.asarray(values1, dtype=float) - np.asarray(values2, dtype=float)
        if diff.size == 0:
            return 0.0
        return math.sqrt(float(np.sum(diff ** 2)) / diff.size)

    @staticmethod
    def _sum_keyword_layers(features, tokens):
        """Sums, per layer, the vectors of the first occurrence of `tokens`.

        Args:
            features: The `'features'` list of an `extract`/`extracts` result.
            tokens: Non-empty list of tokens that must appear consecutively.

        Returns:
            A list with one summed vector per layer, or None when the token
            sequence does not occur in `features`.
        """
        span = len(tokens)
        # Only start positions that leave room for the whole keyword.
        for start in range(len(features) - span + 1):
            window = features[start:start + span]
            if any(f['token'] != t for f, t in zip(window, tokens)):
                continue
            num_layers = len(window[0]['layers'])
            # A fresh array per layer: layers must not share storage.
            return [
                np.sum([np.asarray(f['layers'][i]['values'], dtype=float)
                        for f in window], axis=0)
                for i in range(num_layers)
            ]
        return None

    def extract_v1(self, sentence):
        """Returns the raw estimator prediction for a single sentence."""
        return self.extracts_v1([sentence])[0]

    def extracts_v1(self, sentences):
        """Returns the list of raw estimator predictions, one per sentence."""
        _, predictions = self._predict(sentences)
        return list(predictions)

    def extract(self, sentence):
        """Returns the structured per-token, per-layer result for one sentence.

        See `_build_output` for the result layout.
        """
        features, predictions = self._predict([sentence])
        return [self._build_output(features[0], output) for output in predictions][0]

    def extracts(self, sentences):
        """Returns one structured result per sentence, in prediction order.

        See `_build_output` for the layout of each result.
        """
        features, predictions = self._predict(sentences)
        unique_id_to_feature = {feature.unique_id: feature for feature in features}
        # The n-th prediction is paired with the feature whose unique_id is n.
        # NOTE(review): relies on predict() yielding in input order - confirm.
        return [self._build_output(unique_id_to_feature[idx], output)
                for idx, output in enumerate(predictions)]

    def cal_dif_cls(self, emb1, emb2):
        """Sums, over all layers, the RMS difference of the first token's vectors.

        Args:
            emb1, emb2: Results of `extract` (or elements of `extracts`). The
                first token is presumably `[CLS]` - confirm against
                `convert_examples_to_features`.

        Returns:
            The summed distance; smaller means more similar.
        """
        layers1 = emb1['features'][0]['layers']
        layers2 = emb2['features'][0]['layers']
        return sum(self._rms_diff(a['values'], b['values'])
                   for a, b in zip(layers1, layers2))

    def cal_dif_cls_layer(self, emb1, emb2, i):
        """Returns the RMS difference of the first token's vectors at layer slot `i`."""
        return self._rms_diff(emb1['features'][0]['layers'][i]['values'],
                              emb2['features'][0]['layers'][i]['values'])

    def cal_dif_keyword(self, emb1, emb2, keyword):
        """Compares how `keyword` is embedded in two extracted sentences.

        The keyword is tokenized with the wrapper's tokenizer. In each result
        the first consecutive occurrence of those tokens is located and their
        vectors are summed per layer. The result is the sum over layers of the
        RMS difference between the two summed vectors.

        Args:
            emb1, emb2: Results of `extract` (or elements of `extracts`).
            keyword: Text to look for in both sentences.

        Returns:
            The distance, or -1 when the keyword tokenizes to nothing or is
            missing from either sentence.
        """
        tokens = self._tokenizer.tokenize(keyword)
        if not tokens:
            return -1

        summed = []
        for emb in (emb1, emb2):
            per_layer = self._sum_keyword_layers(emb['features'], tokens)
            if per_layer is None:
                return -1
            summed.append(per_layer)

        return sum(self._rms_diff(a, b) for a, b in zip(summed[0], summed[1]))


if __name__ == "__main__":
    bert = BERT()
    bert.init()

    sentences = ['‘세계의 공장’으로 막대한 달러를 쓸어담으며 경제력을 키웠던 중국의 좋은 시절도 오래가지 않을 듯>하다.',
                 '자본 유출과 서비스 수지 적자 폭이 커지며 경상수지 적자를 향해 빠르게 다가가고 있어서다.',
                 "[OBS 독특한 연예뉴스 조연수 기자] 가수 겸 배우 수지가 '국민' 타이틀을 거머쥔 스타로 꼽혔다.",
                 "OBS '독특한 연예뉴스'(기획·연출·감수 윤경철, 작가 박은경·김현선)가 '국민 신드롬'을 일으킨 첫사랑의 아이콘 >김연아, 수지, 설현의 근황을 살펴봤다.",
                 '오늘은 날씨가 좋습니다. 맛집을 찾아 가볼까요? 아이들이 좋아하더라구요.',
                 '보쌈집에서는 보쌈을 맛있게 하면 그만입니다.ㅋㅋ']

    # Structured embeddings for every sentence in one estimator pass.
    results = bert.extracts(sentences)

    distances = []
    for i, result_i in enumerate(results):
        row = []
        for j, result_j in enumerate(results):
            if i == j:
                # Large sentinel so a sentence is never its own nearest match.
                row.append(99999)
            else:
                row.append(bert.cal_dif_cls(result_i, result_j))
        distances.append(row)

    # Print each sentence followed by its closest other sentence.
    for sentence, row in zip(sentences, distances):
        print(sentence)
        print(sentences[row.index(min(row))])
        print()