├── BERT-CNN ├── BERT_CNN.py ├── BertLayer.py ├── __pycache__ │ ├── BertLayer.cpython-36.pyc │ ├── data_pre.cpython-36.pyc │ ├── extract_features.cpython-36.pyc │ ├── freeze_keras_model.cpython-36.pyc │ ├── modeling.cpython-36.pyc │ └── tokenization.cpython-36.pyc ├── bert_experimental │ ├── README.md │ ├── bert_experimental │ │ ├── feature_extraction │ │ │ ├── l2_retriever.py │ │ │ └── text_preprocessing.py │ │ └── finetuning │ │ │ ├── __init__.py │ │ │ ├── bert_layer.py │ │ │ ├── graph_ops.py │ │ │ ├── modeling.py │ │ │ └── text_preprocessing.py │ ├── requirements.txt │ └── setup.py ├── data │ ├── read.me │ ├── test.tsv │ └── train.tsv ├── data_pre.py ├── eval.py ├── extract_features.py ├── freeze_keras_model.py ├── model.json ├── modeling.py ├── optimization.py ├── test_demo.tsv ├── tokenization.py └── uncased_L-12_H-768_A-12 │ └── file-should be here.txt ├── BERT ├── README.md ├── data │ ├── dev.tsv │ ├── test.tsv │ └── train.tsv.zip ├── modeling.py ├── outputs │ └── need-this.txt ├── run_classifier.py ├── tokenization.py ├── train_VC_b.py ├── train_model_VC.py └── uncased_L-12_H-768_A-12 │ └── file-should be here.txt ├── BERT_CNN_Visual_re_ranker_demo.ipynb ├── COCO_train2014_000000000009.jpg ├── COCO_val2014_000000000042.jpg ├── Evaluation ├── Result_tune_BERT_0.4.json ├── captions_val2014.json.zip └── coco_eval.py ├── LRCE_figure_1.png ├── Pre-trained.png ├── README.md ├── approch.png ├── data ├── README.md ├── test.tsv └── train.tsv ├── dataset_v1-1.png ├── hist.jpg ├── main.png ├── overlap_text.py ├── overlaping_result_v1.txt ├── overview.png ├── pre-trained ├── README.md ├── Visual_re-rank_re-ranked_output.txt ├── Visual_re-ranker.txt ├── caption.txt ├── model.py ├── sample_best.json ├── sample_best_baseline.json ├── visual-context_label.txt └── visual-context_prob.txt ├── visual_context ├── README.md ├── imagenet_classes.txt ├── imgs │ ├── COCO_val2014_000000185210.jpg │ └── COCO_val2014_000000235692.jpg ├── model.py ├── run-visual.py └── run-visual_CLIP.py └── word-count-hisgram.py /BERT-CNN/BERT_CNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ##!/usr/bin/env python3 3 | #!/bin/env python 4 | import sys 5 | import argparse 6 | import re 7 | import os 8 | import sys 9 | import json 10 | 11 | import logging 12 | import numpy as np 13 | import pandas as pd 14 | import tensorflow as tf 15 | import tensorflow_hub as hub 16 | from BertLayer import BertLayer 17 | from BertLayer import build_preprocessor 18 | from freeze_keras_model import freeze_keras_model 19 | 20 | from data_pre import * 21 | from tensorflow import keras 22 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 23 | from sklearn.model_selection import train_test_split 24 | 25 | 26 | if not 'bert_repo' in sys.path: 27 | sys.path.insert(0, 'bert_repo') 28 | 29 | from modeling import BertModel, BertConfig 30 | from tokenization import FullTokenizer, convert_to_unicode 31 | from extract_features import InputExample, convert_examples_to_features 32 | 33 | 34 | # get TF logger 35 | log = logging.getLogger('tensorflow') 36 | log.handlers = [] 37 | 38 | 39 | parser=argparse.ArgumentParser() 40 | parser.add_argument('--train', default='/home/asabir/BERT_layers-git/data/train.tsv', help='beam serach', type=str,required=False) 41 | parser.add_argument('--num_bert_layer', default='12', help='truned layers', type=int,required=False) 42 | parser.add_argument('--batch_size', default='128', help='truned layers', type=int,required=False) 43 
| parser.add_argument('--epochs', default='5', help='', type=int,required=False) 44 | parser.add_argument('--seq_len', default='64', help='', type=int,required=False) 45 | parser.add_argument('--CNN_kernel_size', default='3', help='', type=int,required=False) 46 | parser.add_argument('--CNN_filters', default='32', help='', type=int,required=False) 47 | args = parser.parse_args() 48 | 49 | 50 | # Downlaod the pre-trained model 51 | 52 | #!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 53 | #!unzip uncased_L-12_H-768_A-12.zip 54 | 55 | 56 | # tf.Module 57 | def build_module_fn(config_path, vocab_path, do_lower_case=True): 58 | 59 | def bert_module_fn(is_training): 60 | """Spec function for a token embedding module.""" 61 | 62 | input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids") 63 | input_mask = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_mask") 64 | token_type = tf.placeholder(shape=[None, None], dtype=tf.int32, name="segment_ids") 65 | 66 | config = BertConfig.from_json_file(config_path) 67 | model = BertModel(config=config, is_training=is_training, 68 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type) 69 | 70 | seq_output = model.all_encoder_layers[-1] 71 | pool_output = model.get_pooled_output() 72 | 73 | config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file") 74 | vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file") 75 | lower_case = tf.constant(do_lower_case) 76 | 77 | tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file) 78 | tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file) 79 | 80 | input_map = {"input_ids": input_ids, 81 | "input_mask": input_mask, 82 | "segment_ids": token_type} 83 | 84 | output_map = {"pooled_output": pool_output, 85 | "sequence_output": seq_output} 86 | 87 | output_info_map = {"vocab_file": vocab_file, 88 | "do_lower_case": lower_case} 89 | 90 | hub.add_signature(name="tokens", inputs=input_map, outputs=output_map) 91 | hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map) 92 | 93 | return bert_module_fn 94 | 95 | 96 | MODEL_DIR = "/Users/asabir/BERT_layers-main/uncased_L-12_H-768_A-12" 97 | config_path = "/{}/bert_config.json".format(MODEL_DIR) 98 | vocab_path = "/{}/vocab.txt".format(MODEL_DIR) 99 | 100 | 101 | tags_and_args = [] 102 | for is_training in (True, False): 103 | tags = set() 104 | if is_training: 105 | tags.add("train") 106 | tags_and_args.append((tags, dict(is_training=is_training))) 107 | 108 | module_fn = build_module_fn(config_path, vocab_path) 109 | spec = hub.create_module_spec(module_fn, tags_and_args=tags_and_args) 110 | spec.export("bert-module", 111 | checkpoint_path="/{}/bert_model.ckpt".format(MODEL_DIR)) 112 | 113 | class BertLayer(tf.keras.layers.Layer): 114 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3, 115 | pooling="cls", do_preprocessing=True, verbose=False, 116 | tune_embeddings=False, trainable=True, **kwargs): 117 | 118 | self.trainable = trainable 119 | self.n_tune_layers = n_tune_layers 120 | self.tune_embeddings = tune_embeddings 121 | self.do_preprocessing = do_preprocessing 122 | 123 | self.verbose = verbose 124 | self.seq_len = seq_len 125 | self.pooling = pooling 126 | self.bert_path = bert_path 127 | 128 | self.var_per_encoder = 16 129 | if self.pooling not in ["cls", "mean", None]: 130 | raise NameError( 131 | f"Undefined pooling type (must be either 'cls', 'mean', or None, but is {self.pooling}" 
132 | ) 133 | 134 | super(BertLayer, self).__init__(**kwargs) 135 | 136 | def build(self, input_shape): 137 | 138 | self.bert = hub.Module(self.build_abspath(self.bert_path), 139 | trainable=self.trainable, name=f"{self.name}_module") 140 | 141 | trainable_layers = [] 142 | if self.tune_embeddings: 143 | trainable_layers.append("embeddings") 144 | 145 | if self.pooling == "cls": 146 | trainable_layers.append("pooler") 147 | 148 | if self.n_tune_layers > 0: 149 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name] 150 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder) 151 | for i in range(self.n_tune_layers): 152 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/") 153 | 154 | # Add module variables to layer's trainable weights 155 | for var in self.bert.variables: 156 | if any([l in var.name for l in trainable_layers]): 157 | self._trainable_weights.append(var) 158 | else: 159 | self._non_trainable_weights.append(var) 160 | 161 | if self.verbose: 162 | print("*** TRAINABLE VARS *** ") 163 | for var in self._trainable_weights: 164 | print(var) 165 | 166 | self.build_preprocessor() 167 | self.initialize_module() 168 | 169 | super(BertLayer, self).build(input_shape) 170 | 171 | def build_abspath(self, path): 172 | if path.startswith("https://") or path.startswith("gs://"): 173 | return path 174 | else: 175 | return os.path.abspath(path) 176 | 177 | def build_preprocessor(self): 178 | sess = tf.keras.backend.get_session() 179 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True) 180 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 181 | tokenization_info["do_lower_case"]]) 182 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case) 183 | 184 | def initialize_module(self): 185 | sess = tf.keras.backend.get_session() 186 | 187 | vars_initialized = sess.run([tf.is_variable_initialized(var) 188 | for var in self.bert.variables]) 189 | 190 | uninitialized = [] 191 | for var, is_initialized in zip(self.bert.variables, vars_initialized): 192 | if not is_initialized: 193 | uninitialized.append(var) 194 | 195 | if len(uninitialized): 196 | sess.run(tf.variables_initializer(uninitialized)) 197 | 198 | def call(self, input): 199 | 200 | if self.do_preprocessing: 201 | input = tf.numpy_function(self.preprocessor, 202 | [input], [tf.int32, tf.int32, tf.int32], 203 | name='preprocessor') 204 | for feature in input: 205 | feature.set_shape((None, self.seq_len)) 206 | 207 | input_ids, input_mask, segment_ids = input 208 | 209 | bert_inputs = dict( 210 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids 211 | ) 212 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True) 213 | 214 | if self.pooling == "cls": 215 | pooled = output["pooled_output"] 216 | else: 217 | result = output["sequence_output"] 218 | 219 | input_mask = tf.cast(input_mask, tf.float32) 220 | mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1) 221 | masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / ( 222 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) 223 | 224 | if self.pooling == "mean": 225 | pooled = masked_reduce_mean(result, input_mask) 226 | else: 227 | pooled = mul_mask(result, input_mask) 228 | 229 | return pooled 230 | 231 | def get_config(self): 232 | config_dict = { 233 | "bert_path": self.bert_path, 234 | "seq_len": self.seq_len, 235 | "pooling": self.pooling, 236 | "n_tune_layers": self.n_tune_layers, 237 | 
"tune_embeddings": self.tune_embeddings, 238 | "do_preprocessing": self.do_preprocessing, 239 | "verbose": self.verbose 240 | } 241 | super(BertLayer, self).get_config() 242 | return config_dict 243 | 244 | 245 | # read the train data 246 | #df = pd.read_csv("/home/asabir/BERT_layers-git/data/train.tsv", sep='\t') 247 | df = pd.read_csv(args.train, sep='\t') 248 | 249 | 250 | 251 | 252 | #labels = df.is_duplicate.values 253 | labels = df.is_related.values 254 | 255 | texts = [] 256 | delimiter = " ||| " 257 | 258 | for vis, cap in zip(df.visual.tolist(), df.caption.tolist()): 259 | texts.append(delimiter.join((str(vis), str(cap)))) 260 | 261 | 262 | texts = np.array(texts) 263 | 264 | trX, tsX, trY, tsY = train_test_split(texts, labels, shuffle=True, test_size=0.2) 265 | 266 | 267 | # Buliding the model 268 | 269 | embedding_size = 768 270 | 271 | inp = tf.keras.Input(shape=(1,), dtype=tf.string) 272 | # Three Layers 273 | #encoder = BertLayer(bert_path="./bert-module/", seq_len=48, tune_embeddings=False, 274 | # pooling='cls', n_tune_layers=3, verbose=False) 275 | 276 | # All Layers 277 | encoder = BertLayer(bert_path="./bert-module/", seq_len=args.seq_len, tune_embeddings=False, pooling=None, n_tune_layers=args.num_bert_layer, verbose=False) 278 | 279 | 280 | 281 | cnn_out = tf.keras.layers.Conv1D(args.CNN_filters, args.CNN_kernel_size, padding='VALID', activation=tf.nn.relu)(encoder(inp)) 282 | pool = tf.keras.layers.MaxPooling1D(pool_size=2)(cnn_out) 283 | flat = tf.keras.layers.Flatten()(pool) 284 | pred = tf.keras.layers.Dense(1, activation="sigmoid")(flat) 285 | 286 | 287 | model = tf.keras.models.Model(inputs=[inp], outputs=[pred]) 288 | 289 | model.summary() 290 | 291 | model.compile( 292 | optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5, ), 293 | loss="binary_crossentropy", 294 | metrics=["accuracy"]) 295 | 296 | # fit the data 297 | import logging 298 | logging.getLogger("tensorflow").setLevel(logging.WARNING) 299 | 300 | saver = keras.callbacks.ModelCheckpoint("bert_CNN_tuned.hdf5") 301 | 302 | model.fit(trX, trY, validation_data=[tsX, tsY], batch_size=args.batch_size, epochs=args.epochs, callbacks=[saver]) 303 | 304 | 305 | 306 | #save the model 307 | model.predict(trX[:10]) 308 | 309 | import json 310 | json.dump(model.to_json(), open("model.json", "w")) 311 | 312 | model = tf.keras.models.model_from_json(json.load(open("model.json")), 313 | custom_objects={"BertLayer": BertLayer}) 314 | 315 | model.load_weights("bert_CNN_tuned.hdf5") 316 | 317 | model.predict(trX[:10]) 318 | 319 | # For fast inference and less RAM usesage as post-processing we need to "freezing" the model. 
320 | from tensorflow.python.framework.graph_util import convert_variables_to_constants 321 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference 322 | 323 | def freeze_keras_model(model, export_path=None, clear_devices=True): 324 | sess = tf.keras.backend.get_session() 325 | graph = sess.graph 326 | 327 | with graph.as_default(): 328 | 329 | input_tensors = model.inputs 330 | output_tensors = model.outputs 331 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors] 332 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors] 333 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors] 334 | 335 | tmp_g = graph.as_graph_def() 336 | if clear_devices: 337 | for node in tmp_g.node: 338 | node.device = "" 339 | 340 | tmp_g = optimize_for_inference( 341 | tmp_g, input_ops, output_ops, dtypes, False) 342 | 343 | tmp_g = convert_variables_to_constants(sess, tmp_g, output_ops) 344 | 345 | if export_path is not None: 346 | with tf.gfile.GFile(export_path, "wb") as f: 347 | f.write(tmp_g.SerializeToString()) 348 | 349 | return tmp_g 350 | 351 | 352 | # freeze and save the model 353 | frozen_graph = freeze_keras_model(model, export_path="frozen_graph.pb") 354 | 355 | 356 | # inference 357 | #!git clone https://github.com/gaphex/bert_experimental/ 358 | 359 | import tensorflow as tf 360 | import numpy as np 361 | import sys 362 | 363 | sys.path.insert(0, "bert_experimental") 364 | 365 | from bert_experimental.finetuning.text_preprocessing import build_preprocessor 366 | from bert_experimental.finetuning.graph_ops import load_graph 367 | 368 | 369 | restored_graph = load_graph("frozen_graph.pb") 370 | graph_ops = restored_graph.get_operations() 371 | input_op, output_op = graph_ops[0].name, graph_ops[-1].name 372 | print(input_op, output_op) 373 | 374 | x = restored_graph.get_tensor_by_name(input_op + ':0') 375 | y = restored_graph.get_tensor_by_name(output_op + ':0') 376 | 377 | 378 | preprocessor = build_preprocessor("/Users/asabir/BERT_layers-main/uncased_L-12_H-768_A-12/vocab.txt", 64) 379 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32], name='preprocessor') 380 | 381 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32]) 382 | 383 | # predictions 384 | 385 | sess = tf.Session(graph=restored_graph) 386 | 387 | trX[:10] 388 | 389 | y_out = sess.run(y, feed_dict={ 390 | x: trX[:10].reshape((-1,1)) 391 | }) 392 | 393 | print(y_out) 394 | -------------------------------------------------------------------------------- /BERT-CNN/BertLayer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import os 5 | import sys 6 | import json 7 | 8 | import logging 9 | import numpy as np 10 | import pandas as pd 11 | import tensorflow as tf 12 | import tensorflow_hub as hub 13 | from tensorflow import keras 14 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 15 | 16 | from sklearn.model_selection import train_test_split 17 | 18 | 19 | if not 'bert_repo' in sys.path: 20 | sys.path.insert(0, 'bert_repo') 21 | 22 | from modeling import BertModel, BertConfig 23 | from tokenization import FullTokenizer, convert_to_unicode 24 | from extract_features import InputExample, convert_examples_to_features 25 | 26 | 27 | def build_preprocessor(voc_path, seq_len, lower=True): 28 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower) 29 | 30 | def strings_to_arrays(sents): 31 | 32 
| sents = np.atleast_1d(sents).reshape((-1,)) 33 | 34 | examples = [] 35 | for example in read_examples(sents): 36 | examples.append(example) 37 | 38 | features = convert_examples_to_features(examples, seq_len, tokenizer) 39 | arrays = features_to_arrays(features) 40 | return arrays 41 | 42 | 43 | class BertLayer(tf.keras.layers.Layer): 44 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3, 45 | pooling="cls", do_preprocessing=True, verbose=False, 46 | tune_embeddings=False, trainable=True, **kwargs): 47 | 48 | self.trainable = trainable 49 | self.n_tune_layers = n_tune_layers 50 | self.tune_embeddings = tune_embeddings 51 | self.do_preprocessing = do_preprocessing 52 | 53 | self.verbose = verbose 54 | self.seq_len = seq_len 55 | self.pooling = pooling 56 | self.bert_path = bert_path 57 | 58 | self.var_per_encoder = 16 59 | if self.pooling not in ["cls", "mean", None]: 60 | raise NameError( 61 | f"Undefined pooling type (must be either 'cls', 'mean', or None, but is {self.pooling}" 62 | ) 63 | 64 | super(BertLayer, self).__init__(**kwargs) 65 | 66 | def build(self, input_shape): 67 | 68 | self.bert = hub.Module(self.build_abspath(self.bert_path), 69 | trainable=self.trainable, name=f"{self.name}_module") 70 | 71 | trainable_layers = [] 72 | if self.tune_embeddings: 73 | trainable_layers.append("embeddings") 74 | 75 | if self.pooling == "cls": 76 | trainable_layers.append("pooler") 77 | 78 | if self.n_tune_layers > 0: 79 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name] 80 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder) 81 | for i in range(self.n_tune_layers): 82 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/") 83 | 84 | # Add module variables to layer's trainable weights 85 | for var in self.bert.variables: 86 | if any([l in var.name for l in trainable_layers]): 87 | self._trainable_weights.append(var) 88 | else: 89 | self._non_trainable_weights.append(var) 90 | 91 | if self.verbose: 92 | print("*** TRAINABLE VARS *** ") 93 | for var in self._trainable_weights: 94 | print(var) 95 | 96 | self.build_preprocessor() 97 | self.initialize_module() 98 | 99 | super(BertLayer, self).build(input_shape) 100 | 101 | def build_abspath(self, path): 102 | if path.startswith("https://") or path.startswith("gs://"): 103 | return path 104 | else: 105 | return os.path.abspath(path) 106 | 107 | def build_preprocessor(self): 108 | sess = tf.keras.backend.get_session() 109 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True) 110 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 111 | tokenization_info["do_lower_case"]]) 112 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case) 113 | 114 | def initialize_module(self): 115 | sess = tf.keras.backend.get_session() 116 | 117 | vars_initialized = sess.run([tf.is_variable_initialized(var) 118 | for var in self.bert.variables]) 119 | 120 | uninitialized = [] 121 | for var, is_initialized in zip(self.bert.variables, vars_initialized): 122 | if not is_initialized: 123 | uninitialized.append(var) 124 | 125 | if len(uninitialized): 126 | sess.run(tf.variables_initializer(uninitialized)) 127 | 128 | def call(self, input): 129 | 130 | if self.do_preprocessing: 131 | input = tf.numpy_function(self.preprocessor, 132 | [input], [tf.int32, tf.int32, tf.int32], 133 | name='preprocessor') 134 | for feature in input: 135 | feature.set_shape((None, self.seq_len)) 136 | 137 | input_ids, input_mask, segment_ids = 
input 138 | 139 | bert_inputs = dict( 140 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids 141 | ) 142 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True) 143 | 144 | if self.pooling == "cls": 145 | pooled = output["pooled_output"] 146 | else: 147 | result = output["sequence_output"] 148 | 149 | input_mask = tf.cast(input_mask, tf.float32) 150 | mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1) 151 | masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / ( 152 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) 153 | 154 | if self.pooling == "mean": 155 | pooled = masked_reduce_mean(result, input_mask) 156 | else: 157 | pooled = mul_mask(result, input_mask) 158 | 159 | return pooled 160 | 161 | def get_config(self): 162 | config_dict = { 163 | "bert_path": self.bert_path, 164 | "seq_len": self.seq_len, 165 | "pooling": self.pooling, 166 | "n_tune_layers": self.n_tune_layers, 167 | "tune_embeddings": self.tune_embeddings, 168 | "do_preprocessing": self.do_preprocessing, 169 | "verbose": self.verbose 170 | } 171 | super(BertLayer, self).get_config() 172 | return config_dict 173 | -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/BertLayer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/BertLayer.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/data_pre.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/data_pre.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/extract_features.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/extract_features.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/freeze_keras_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/freeze_keras_model.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/modeling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/modeling.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/tokenization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/tokenization.cpython-36.pyc -------------------------------------------------------------------------------- 
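A minimal usage sketch for the BertLayer defined in BertLayer.py above, assuming the ./bert-module export created by BERT_CNN.py and a TF 1.x + tensorflow_hub environment. It shows the "cls"-pooling variant (one 768-d vector per input pair), in contrast to the pooling=None + Conv1D head used in BERT_CNN.py:

import tensorflow as tf
from BertLayer import BertLayer

# String input; the layer tokenizes internally using the exported module's vocabulary,
# so inputs are raw "visual ||| caption" pairs, as in BERT_CNN.py.
inp = tf.keras.Input(shape=(1,), dtype=tf.string)

# pooling="cls" returns the pooled [CLS] vector of shape (batch, 768).
encoded = BertLayer(bert_path="./bert-module/", seq_len=64,
                    pooling="cls", n_tune_layers=3)(inp)
pred = tf.keras.layers.Dense(1, activation="sigmoid")(encoded)

model = tf.keras.models.Model(inputs=[inp], outputs=[pred])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss="binary_crossentropy",
              metrics=["accuracy"])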
/BERT-CNN/bert_experimental/README.md: -------------------------------------------------------------------------------- 1 | https://github.com/gaphex/bert_experimental/tree/master/bert_experimental 2 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/feature_extraction/l2_retriever.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class L2Retriever: 6 | def __init__(self, dim, top_k=3, use_norm=False, use_gpu=False): 7 | 8 | self.dim = dim 9 | self.top_k = top_k 10 | self.use_norm = use_norm 11 | config = tf.ConfigProto( 12 | device_count={'GPU': (1 if use_gpu else 0)} 13 | ) 14 | config.gpu_options.allow_growth = True 15 | self.session = tf.Session(config=config) 16 | self.dtype = "float32" 17 | 18 | self.query = tf.placeholder(self.dtype, [None, self.dim]) 19 | self.kbase = tf.placeholder(self.dtype, [None, self.dim]) 20 | if self.use_norm: 21 | self.norm = tf.placeholder(self.dtype, [None, 1]) 22 | else: 23 | self.norm = None 24 | 25 | self.build_graph() 26 | 27 | def build_graph(self): 28 | 29 | self.distance = self.euclidean_distances(self.kbase, self.query, self.norm) 30 | top_neg_dists, top_indices = tf.math.top_k( 31 | tf.negative(self.distance), k=self.top_k) 32 | top_dists = tf.sqrt(tf.abs(tf.negative(top_neg_dists))) 33 | 34 | self.top_distances = top_dists 35 | self.top_indices = top_indices 36 | 37 | def predict(self, kbase, query, norm=None): 38 | 39 | query = query.reshape((-1, self.dim)) 40 | feed_dict = {self.query: query, self.kbase: kbase} 41 | if self.use_norm: 42 | feed_dict[self.norm] = norm 43 | 44 | I, D = self.session.run([self.top_indices, self.top_distances], 45 | feed_dict=feed_dict) 46 | 47 | return I, D 48 | 49 | @staticmethod 50 | def euclidean_distances(kbase, query, norm=None): 51 | 52 | if norm is None: 53 | XX = tf.keras.backend.batch_dot(kbase, kbase, axes=1) 54 | else: 55 | XX = norm 56 | 57 | YY = tf.transpose(tf.keras.backend.batch_dot(query, query, axes=1)) 58 | XY = tf.matmul(kbase, tf.transpose(query)) 59 | 60 | distance = XX - 2 * XY + YY 61 | distance = tf.transpose(distance) 62 | 63 | return distance 64 | 65 | @staticmethod 66 | def compute_squared_l2_norm(mat): 67 | square_norms = np.sum(mat**2, axis=1, keepdims=True) 68 | return square_norms 69 | 70 | def __call__(self, kbase, query, norm=None): 71 | return self.predict(kbase, query, norm) 72 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/feature_extraction/text_preprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import tensorflow as tf 3 | import collections 4 | import unicodedata 5 | 6 | 7 | class FullTokenizer(object): 8 | """Runs end-to-end tokenziation.""" 9 | 10 | def __init__(self, vocab_file, do_lower_case=True): 11 | self.vocab = load_vocab(vocab_file) 12 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 13 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 14 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 15 | 16 | def tokenize(self, text): 17 | split_tokens = [] 18 | for token in self.basic_tokenizer.tokenize(text): 19 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 20 | split_tokens.append(sub_token) 21 | 22 | return split_tokens 23 | 24 | def convert_tokens_to_ids(self, tokens): 25 | return 
convert_by_vocab(self.vocab, tokens) 26 | 27 | def convert_ids_to_tokens(self, ids): 28 | return convert_by_vocab(self.inv_vocab, ids) 29 | 30 | def mark_unk_tokens(self, tokens, unk_token='[UNK]'): 31 | return [t if t in self.vocab else unk_token for t in tokens] 32 | 33 | 34 | class BasicTokenizer(object): 35 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 36 | 37 | def __init__(self, do_lower_case=True): 38 | """Constructs a BasicTokenizer. 39 | Args: 40 | do_lower_case: Whether to lower case the input. 41 | """ 42 | self.do_lower_case = do_lower_case 43 | 44 | def tokenize(self, text): 45 | """Tokenizes a piece of text.""" 46 | text = convert_to_unicode(text) 47 | text = self._clean_text(text) 48 | 49 | # This was added on November 1st, 2018 for the multilingual and Chinese 50 | # models. This is also applied to the English models now, but it doesn't 51 | # matter since the English models were not trained on any Chinese data 52 | # and generally don't have any Chinese data in them (there are Chinese 53 | # characters in the vocabulary because Wikipedia does have some Chinese 54 | # words in the English Wikipedia.). 55 | text = self._tokenize_chinese_chars(text) 56 | 57 | orig_tokens = whitespace_tokenize(text) 58 | split_tokens = [] 59 | for token in orig_tokens: 60 | if self.do_lower_case: 61 | token = token.lower() 62 | token = self._run_strip_accents(token) 63 | split_tokens.extend(self._run_split_on_punc(token)) 64 | 65 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 66 | return output_tokens 67 | 68 | def _run_strip_accents(self, text): 69 | """Strips accents from a piece of text.""" 70 | text = unicodedata.normalize("NFD", text) 71 | output = [] 72 | for char in text: 73 | cat = unicodedata.category(char) 74 | if cat == "Mn": 75 | continue 76 | output.append(char) 77 | return "".join(output) 78 | 79 | def _run_split_on_punc(self, text): 80 | """Splits punctuation on a piece of text.""" 81 | chars = list(text) 82 | i = 0 83 | start_new_word = True 84 | output = [] 85 | while i < len(chars): 86 | char = chars[i] 87 | if _is_punctuation(char): 88 | output.append([char]) 89 | start_new_word = True 90 | else: 91 | if start_new_word: 92 | output.append([]) 93 | start_new_word = False 94 | output[-1].append(char) 95 | i += 1 96 | 97 | return ["".join(x) for x in output] 98 | 99 | def _tokenize_chinese_chars(self, text): 100 | """Adds whitespace around any CJK character.""" 101 | output = [] 102 | for char in text: 103 | cp = ord(char) 104 | if self._is_chinese_char(cp): 105 | output.append(" ") 106 | output.append(char) 107 | output.append(" ") 108 | else: 109 | output.append(char) 110 | return "".join(output) 111 | 112 | def _is_chinese_char(self, cp): 113 | """Checks whether CP is the codepoint of a CJK character.""" 114 | # This defines a "chinese character" as anything in the CJK Unicode block: 115 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 116 | # 117 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 118 | # despite its name. The modern Korean Hangul alphabet is a different block, 119 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 120 | # space-separated words, so they are not treated specially and handled 121 | # like the all of the other languages. 
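# For example, ord('中') == 0x4E2D lies in the 0x4E00-0x9FFF block checked below,
# so _tokenize_chinese_chars wraps that character in spaces before whitespace splitting.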
122 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 123 | (cp >= 0x3400 and cp <= 0x4DBF) or # 124 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 125 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 126 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 127 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 128 | (cp >= 0xF900 and cp <= 0xFAFF) or # 129 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 130 | return True 131 | 132 | return False 133 | 134 | def _clean_text(self, text): 135 | """Performs invalid character removal and whitespace cleanup on text.""" 136 | output = [] 137 | for char in text: 138 | cp = ord(char) 139 | if cp == 0 or cp == 0xfffd or _is_control(char): 140 | continue 141 | if _is_whitespace(char): 142 | output.append(" ") 143 | else: 144 | output.append(char) 145 | return "".join(output) 146 | 147 | 148 | class WordpieceTokenizer(object): 149 | """Runs WordPiece tokenziation.""" 150 | 151 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 152 | self.vocab = vocab 153 | self.unk_token = unk_token 154 | self.max_input_chars_per_word = max_input_chars_per_word 155 | 156 | def tokenize(self, text): 157 | """Tokenizes a piece of text into its word pieces. 158 | This uses a greedy longest-match-first algorithm to perform tokenization 159 | using the given vocabulary. 160 | For example: 161 | input = "unaffable" 162 | output = ["un", "##aff", "##able"] 163 | Args: 164 | text: A single token or whitespace separated tokens. This should have 165 | already been passed through `BasicTokenizer. 166 | Returns: 167 | A list of wordpiece tokens. 168 | """ 169 | 170 | text = convert_to_unicode(text) 171 | 172 | output_tokens = [] 173 | for token in whitespace_tokenize(text): 174 | chars = list(token) 175 | if len(chars) > self.max_input_chars_per_word: 176 | output_tokens.append(self.unk_token) 177 | continue 178 | 179 | is_bad = False 180 | start = 0 181 | sub_tokens = [] 182 | while start < len(chars): 183 | end = len(chars) 184 | cur_substr = None 185 | while start < end: 186 | substr = "".join(chars[start:end]) 187 | if start > 0: 188 | substr = "##" + substr 189 | if substr in self.vocab: 190 | cur_substr = substr 191 | break 192 | end -= 1 193 | if cur_substr is None: 194 | is_bad = True 195 | break 196 | sub_tokens.append(cur_substr) 197 | start = end 198 | 199 | if is_bad: 200 | output_tokens.append(self.unk_token) 201 | else: 202 | output_tokens.extend(sub_tokens) 203 | return output_tokens 204 | 205 | 206 | class InputExample(object): 207 | 208 | def __init__(self, unique_id, text_a, text_b): 209 | self.unique_id = unique_id 210 | self.text_a = text_a 211 | self.text_b = text_b 212 | 213 | 214 | class InputFeatures(object): 215 | """A single set of features of data.""" 216 | 217 | def __init__(self, tokens, input_ids, input_mask, input_type_ids): 218 | # self.unique_id = unique_id 219 | self.tokens = tokens 220 | self.input_ids = input_ids 221 | self.input_mask = input_mask 222 | self.input_type_ids = input_type_ids 223 | 224 | 225 | def _is_whitespace(char): 226 | """Checks whether `chars` is a whitespace character.""" 227 | # \t, \n, and \r are technically contorl characters but we treat them 228 | # as whitespace since they are generally considered as such. 
229 | if char == " " or char == "\t" or char == "\n" or char == "\r": 230 | return True 231 | cat = unicodedata.category(char) 232 | if cat == "Zs": 233 | return True 234 | return False 235 | 236 | 237 | def _is_control(char): 238 | """Checks whether `chars` is a control character.""" 239 | # These are technically control characters but we count them as whitespace 240 | # characters. 241 | if char == "\t" or char == "\n" or char == "\r": 242 | return False 243 | cat = unicodedata.category(char) 244 | if cat.startswith("C"): 245 | return True 246 | return False 247 | 248 | 249 | def _is_punctuation(char): 250 | """Checks whether `chars` is a punctuation character.""" 251 | cp = ord(char) 252 | # We treat all non-letter/number ASCII as punctuation. 253 | # Characters such as "^", "$", and "`" are not in the Unicode 254 | # Punctuation class but we treat them as punctuation anyways, for 255 | # consistency. 256 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 257 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 258 | return True 259 | cat = unicodedata.category(char) 260 | if cat.startswith("P"): 261 | return True 262 | return False 263 | 264 | 265 | def convert_to_unicode(text): 266 | if isinstance(text, str): 267 | return text 268 | elif isinstance(text, bytes): 269 | return text.decode("utf-8", "ignore") 270 | else: 271 | raise ValueError("Unsupported string type: %s" % (type(text))) 272 | 273 | 274 | def printable_text(text): 275 | if isinstance(text, str): 276 | return text 277 | elif isinstance(text, bytes): 278 | return text.decode("utf-8", "ignore") 279 | else: 280 | raise ValueError("Unsupported string type: %s" % (type(text))) 281 | 282 | 283 | def load_vocab(vocab_file): 284 | """Loads a vocabulary file into a dictionary.""" 285 | vocab = collections.OrderedDict() 286 | index = 0 287 | with tf.gfile.GFile(vocab_file, "r") as reader: 288 | while True: 289 | token = convert_to_unicode(reader.readline()) 290 | if not token: 291 | break 292 | token = token.strip() 293 | vocab[token] = index 294 | index += 1 295 | return vocab 296 | 297 | 298 | def convert_by_vocab(vocab, items): 299 | """Converts a sequence of [tokens|ids] using the vocab.""" 300 | output = [] 301 | for item in items: 302 | output.append(vocab[item]) 303 | return output 304 | 305 | 306 | def convert_tokens_to_ids(vocab, tokens): 307 | return convert_by_vocab(vocab, tokens) 308 | 309 | 310 | def convert_ids_to_tokens(inv_vocab, ids): 311 | return convert_by_vocab(inv_vocab, ids) 312 | 313 | 314 | def whitespace_tokenize(text): 315 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 316 | text = text.strip() 317 | if not text: 318 | return [] 319 | tokens = text.split() 320 | return tokens 321 | 322 | 323 | def convert_lst_to_features(lst_str, max_seq_length, max_position_embeddings, 324 | tokenizer, is_tokenized=False, mask_cls_sep=False): 325 | """Loads a data file into a list of `InputBatch`s.""" 326 | 327 | examples = read_tokenized_examples(lst_str) if is_tokenized else read_examples(lst_str) 328 | 329 | _tokenize = lambda x: tokenizer.mark_unk_tokens(x) if is_tokenized else tokenizer.tokenize(x) 330 | 331 | all_tokens = [(_tokenize(ex.text_a), _tokenize(ex.text_b) if ex.text_b else []) for ex in examples] 332 | 333 | # user did not specify a meaningful sequence length 334 | # override the sequence length by the maximum seq length of the current batch 335 | if max_seq_length is None: 336 | max_seq_length = max(len(ta) + len(tb) for ta, tb in all_tokens) 337 | # add special 
tokens into account 338 | # case 1: Account for [CLS], tokens_a [SEP], tokens_b [SEP] -> 3 additional tokens 339 | # case 2: Account for [CLS], tokens_a [SEP] -> 2 additional tokens 340 | max_seq_length += 3 if any(len(tb) for _, tb in all_tokens) else 2 341 | max_seq_length = min(max_seq_length, max_position_embeddings) 342 | 343 | for (tokens_a, tokens_b) in all_tokens: 344 | if tokens_b: 345 | # Modifies `tokens_a` and `tokens_b` in place so that the total 346 | # length is less than the specified length. 347 | # Account for [CLS], [SEP], [SEP] with "- 3" 348 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 349 | else: 350 | # Account for [CLS] and [SEP] with "- 2" 351 | if len(tokens_a) > max_seq_length - 2: 352 | tokens_a = tokens_a[0:(max_seq_length - 2)] 353 | 354 | # The convention in BERT is: 355 | # (a) For sequence pairs: 356 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 357 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 358 | # (b) For single sequences: 359 | # tokens: [CLS] the dog is hairy . [SEP] 360 | # type_ids: 0 0 0 0 0 0 0 361 | # 362 | # Where "type_ids" are used to indicate whether this is the first 363 | # sequence or the second sequence. The embedding vectors for `type=0` and 364 | # `type=1` were learned during pre-training and are added to the wordpiece 365 | # embedding vector (and position vector). This is not *strictly* necessary 366 | # since the [SEP] token unambiguously separates the sequences, but it makes 367 | # it easier for the model to learn the concept of sequences. 368 | # 369 | # For classification tasks, the first vector (corresponding to [CLS]) is 370 | # used as as the "sentence vector". Note that this only makes sense because 371 | # the entire model is fine-tuned. 372 | tokens = ['[CLS]'] + tokens_a + ['[SEP]'] 373 | input_type_ids = [0] * len(tokens) 374 | input_mask = [int(not mask_cls_sep)] + [1] * len(tokens_a) + [int(not mask_cls_sep)] 375 | 376 | if tokens_b: 377 | tokens += tokens_b + ['[SEP]'] 378 | input_type_ids += [1] * (len(tokens_b) + 1) 379 | input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)] 380 | 381 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 382 | 383 | # Zero-pad up to the sequence length. more pythonic 384 | pad_len = max_seq_length - len(input_ids) 385 | input_ids += [0] * pad_len 386 | input_mask += [0] * pad_len 387 | input_type_ids += [0] * pad_len 388 | 389 | assert len(input_ids) == max_seq_length 390 | assert len(input_mask) == max_seq_length 391 | assert len(input_type_ids) == max_seq_length 392 | 393 | yield InputFeatures( 394 | # unique_id=example.unique_id, 395 | tokens=tokens, 396 | input_ids=input_ids, 397 | input_mask=input_mask, 398 | input_type_ids=input_type_ids) 399 | 400 | 401 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 402 | """Truncates a sequence pair in place to the maximum length.""" 403 | 404 | # This is a simple heuristic which will always truncate the longer sequence 405 | # one token at a time. This makes more sense than truncating an equal percent 406 | # of tokens from each, since if one sequence is very short then each token 407 | # that's truncated likely contains more information than a longer sequence. 
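# Worked example: with max_length=8, len(tokens_a)=6 and len(tokens_b)=5, the loop
# pops from the longer list (ties pop from tokens_b) until both hold 4 tokens.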
408 | while True: 409 | total_length = len(tokens_a) + len(tokens_b) 410 | if total_length <= max_length: 411 | break 412 | if len(tokens_a) > len(tokens_b): 413 | tokens_a.pop() 414 | else: 415 | tokens_b.pop() 416 | 417 | 418 | def read_examples(lst_strs): 419 | """Read a list of `InputExample`s from a list of strings.""" 420 | unique_id = 0 421 | for ss in lst_strs: 422 | line = convert_to_unicode(ss) 423 | if not line: 424 | continue 425 | line = line.strip() 426 | text_a = None 427 | text_b = None 428 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 429 | if m is None: 430 | text_a = line 431 | else: 432 | text_a = m.group(1) 433 | text_b = m.group(2) 434 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 435 | unique_id += 1 436 | 437 | 438 | def read_tokenized_examples(lst_strs): 439 | unique_id = 0 440 | lst_strs = [[convert_to_unicode(w) for w in s] for s in lst_strs] 441 | for ss in lst_strs: 442 | text_a = ss 443 | text_b = None 444 | try: 445 | j = ss.index('|||') 446 | text_a = ss[:j] 447 | text_b = ss[(j + 1):] 448 | except ValueError: 449 | pass 450 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 451 | unique_id += 1 452 | 453 | def stub_preprocessor(text): 454 | return text 455 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/finetuning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/bert_experimental/bert_experimental/finetuning/__init__.py -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/finetuning/bert_layer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import tensorflow_hub as hub 4 | 5 | from .text_preprocessing import build_preprocessor 6 | 7 | 8 | class BertLayer(tf.keras.layers.Layer): 9 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3, 10 | pooling="cls", do_preprocessing=True, verbose=False, 11 | tune_embeddings=False, trainable=True, use_layers=None, 12 | as_dict=False, **kwargs): 13 | 14 | self.trainable = trainable 15 | self.n_tune_layers = n_tune_layers 16 | self.tune_embeddings = tune_embeddings 17 | self.do_preprocessing = do_preprocessing 18 | 19 | self.as_dict = as_dict 20 | self.verbose = verbose 21 | self.seq_len = seq_len 22 | self.pooling = pooling 23 | self.bert_path = bert_path 24 | self.use_layers = use_layers 25 | 26 | self.var_per_encoder = 16 27 | if self.pooling not in ["cls", "mean", "sqrt_mean", None]: 28 | raise NameError( 29 | f"Undefined pooling type (must be either 'cls', 'mean', 'sqrt_mean' or None, but is {self.pooling}" 30 | ) 31 | 32 | super(BertLayer, self).__init__(**kwargs) 33 | 34 | def build(self, input_shape): 35 | 36 | self.bert = hub.Module(self.build_abspath(self.bert_path), 37 | trainable=self.trainable, name=f"{self.name}_module") 38 | 39 | trainable_layers = [] 40 | if self.tune_embeddings: 41 | trainable_layers.append("embeddings") 42 | 43 | if self.pooling == "cls": 44 | trainable_layers.append("pooler") 45 | 46 | if self.n_tune_layers > 0: 47 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name] 48 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder) 49 | if self.use_layers: 50 | n_encoder_layers = 
min(self.use_layers, n_encoder_layers) 51 | for i in range(self.n_tune_layers): 52 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/") 53 | 54 | # Add module variables to layer's trainable weights 55 | for var in self.bert.variables: 56 | if any([l in var.name for l in trainable_layers]): 57 | self._trainable_weights.append(var) 58 | else: 59 | self._non_trainable_weights.append(var) 60 | 61 | if self.verbose: 62 | print("*** TRAINABLE VARS *** ") 63 | for var in self._trainable_weights: 64 | print(var) 65 | 66 | self.build_preprocessor() 67 | self.initialize_module() 68 | 69 | super(BertLayer, self).build(input_shape) 70 | 71 | def build_abspath(self, path): 72 | if path.startswith("https://") or path.startswith("gs://"): 73 | return path 74 | else: 75 | return os.path.abspath(path) 76 | 77 | def build_preprocessor(self): 78 | sess = tf.compat.v1.keras.backend.get_session() 79 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True) 80 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 81 | tokenization_info["do_lower_case"]]) 82 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case) 83 | 84 | def initialize_module(self): 85 | sess = tf.compat.v1.keras.backend.get_session() 86 | 87 | vars_initialized = sess.run([tf.compat.v1.is_variable_initialized(var) 88 | for var in self.bert.variables]) 89 | 90 | uninitialized = [] 91 | for var, is_initialized in zip(self.bert.variables, vars_initialized): 92 | if not is_initialized: 93 | uninitialized.append(var) 94 | 95 | if len(uninitialized): 96 | sess.run(tf.compat.v1.variables_initializer(uninitialized)) 97 | 98 | def call(self, input): 99 | 100 | if self.do_preprocessing: 101 | input = tf.numpy_function(self.preprocessor, 102 | [input], [tf.int32, tf.int32, tf.int32], 103 | name='preprocessor') 104 | for feature in input: 105 | feature.set_shape((None, self.seq_len)) 106 | 107 | input_ids, input_mask, segment_ids = input 108 | 109 | bert_inputs = dict( 110 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids 111 | ) 112 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True) 113 | 114 | input_mask = tf.cast(input_mask, tf.float32) 115 | 116 | seq_output = output["sequence_output"] 117 | tok_output = mul_mask(output.get("token_output", seq_output), input_mask) 118 | 119 | if self.pooling == "cls": 120 | pooled = output["pooled_output"] 121 | else: 122 | if self.pooling == "mean": 123 | pooled = masked_reduce_mean(seq_output, input_mask) 124 | 125 | elif self.pooling == "sqrt_mean": 126 | pooled = masked_reduce_sqrt_mean(seq_output, input_mask) 127 | 128 | else: 129 | pooled = mul_mask(seq_output, input_mask) 130 | 131 | if self.as_dict: 132 | output = { 133 | "sequence_output": seq_output, 134 | "pooled_output": pooled, 135 | "token_output": tok_output 136 | } 137 | else: 138 | output = pooled 139 | 140 | return output 141 | 142 | def get_config(self): 143 | config_dict = { 144 | "bert_path": self.bert_path, 145 | "seq_len": self.seq_len, 146 | "pooling": self.pooling, 147 | "n_tune_layers": self.n_tune_layers, 148 | "tune_embeddings": self.tune_embeddings, 149 | "do_preprocessing": self.do_preprocessing, 150 | "use_layers": self.use_layers, 151 | "trainable": self.trainable, 152 | "as_dict": self.as_dict, 153 | "verbose": self.verbose 154 | } 155 | super(BertLayer, self).get_config() 156 | return config_dict 157 | 158 | 159 | class StatefulBertLayer(tf.keras.layers.Layer): 160 | def __init__(self, bert_path, 
seq_len=64, n_tune_layers=3, 161 | pooling="cls", do_preprocessing=True, verbose=False, 162 | tune_embeddings=False, trainable=True, use_layers=None, 163 | as_dict=False, **kwargs): 164 | 165 | self.trainable = trainable 166 | self.n_tune_layers = n_tune_layers 167 | self.tune_embeddings = tune_embeddings 168 | self.do_preprocessing = do_preprocessing 169 | 170 | self.as_dict = as_dict 171 | self.verbose = verbose 172 | self.seq_len = seq_len 173 | self.pooling = pooling 174 | self.bert_path = bert_path 175 | self.use_layers = use_layers 176 | 177 | self.var_per_encoder = 16 178 | if self.pooling not in ["cls", "mean", "sqrt_mean", None]: 179 | raise NameError( 180 | f"Undefined pooling type (must be either 'cls', 'mean', 'sqrt_mean' or None, but is {self.pooling}" 181 | ) 182 | 183 | super(StatefulBertLayer, self).__init__(**kwargs) 184 | 185 | def build(self, input_shape): 186 | 187 | self.bert = hub.Module(self.build_abspath(self.bert_path), 188 | trainable=self.trainable, name=f"{self.name}_module") 189 | 190 | trainable_layers = [] 191 | if self.tune_embeddings: 192 | trainable_layers.append("embeddings") 193 | 194 | if self.pooling == "cls": 195 | trainable_layers.append("pooler") 196 | 197 | if self.n_tune_layers > 0: 198 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name] 199 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder) 200 | if self.use_layers: 201 | n_encoder_layers = min(self.use_layers, n_encoder_layers) 202 | for i in range(self.n_tune_layers): 203 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/") 204 | 205 | # Add module variables to layer's trainable weights 206 | for var in self.bert.variables: 207 | if any([l in var.name for l in trainable_layers]): 208 | self._trainable_weights.append(var) 209 | else: 210 | self._non_trainable_weights.append(var) 211 | 212 | if self.verbose: 213 | print("*** TRAINABLE VARS *** ") 214 | for var in self._trainable_weights: 215 | print(var) 216 | 217 | self.build_preprocessor() 218 | self.initialize_module() 219 | 220 | super(StatefulBertLayer, self).build(input_shape) 221 | 222 | def build_abspath(self, path): 223 | if path.startswith("https://") or path.startswith("gs://"): 224 | return path 225 | else: 226 | return os.path.abspath(path) 227 | 228 | def build_preprocessor(self): 229 | sess = tf.compat.v1.keras.backend.get_session() 230 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True) 231 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 232 | tokenization_info["do_lower_case"]]) 233 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case) 234 | 235 | def initialize_module(self): 236 | sess = tf.compat.v1.keras.backend.get_session() 237 | 238 | vars_initialized = sess.run([tf.compat.v1.is_variable_initialized(var) 239 | for var in self.bert.variables]) 240 | 241 | uninitialized = [] 242 | for var, is_initialized in zip(self.bert.variables, vars_initialized): 243 | if not is_initialized: 244 | uninitialized.append(var) 245 | 246 | if len(uninitialized): 247 | sess.run(tf.compat.v1.variables_initializer(uninitialized)) 248 | 249 | def call(self, input): 250 | 251 | if self.do_preprocessing: 252 | input_text, input_state = input 253 | 254 | preprocessed_text = tf.numpy_function( 255 | self.preprocessor, [input_text], 256 | [tf.int32, tf.int32, tf.int32], 257 | name='preprocessor') 258 | for feature in preprocessed_text: 259 | feature.set_shape((None, self.seq_len)) 260 | 
input_ids, input_mask, segment_ids = preprocessed_text 261 | 262 | else: 263 | input_ids, input_mask, segment_ids, input_state = input 264 | 265 | bert_inputs = dict( 266 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, input_state=input_state 267 | ) 268 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True) 269 | 270 | input_mask = tf.cast(input_mask, tf.float32) 271 | 272 | seq_output = output["sequence_output"] 273 | tok_output = mul_mask(output.get("token_output", seq_output), input_mask) 274 | 275 | if self.pooling == "cls": 276 | pooled = output["pooled_output"] 277 | else: 278 | if self.pooling == "mean": 279 | pooled = masked_reduce_mean(seq_output, input_mask) 280 | 281 | elif self.pooling == "sqrt_mean": 282 | pooled = masked_reduce_sqrt_mean(seq_output, input_mask) 283 | 284 | else: 285 | pooled = mul_mask(seq_output, input_mask) 286 | 287 | if self.as_dict: 288 | output["pooled_output"] = pooled 289 | else: 290 | output = pooled 291 | 292 | return output 293 | 294 | def get_config(self): 295 | config_dict = { 296 | "bert_path": self.bert_path, 297 | "seq_len": self.seq_len, 298 | "pooling": self.pooling, 299 | "n_tune_layers": self.n_tune_layers, 300 | "tune_embeddings": self.tune_embeddings, 301 | "do_preprocessing": self.do_preprocessing, 302 | "use_layers": self.use_layers, 303 | "trainable": self.trainable, 304 | "as_dict": self.as_dict, 305 | "verbose": self.verbose 306 | } 307 | super(StatefulBertLayer, self).get_config() 308 | return config_dict 309 | 310 | def mul_mask(x, m): 311 | return x * tf.expand_dims(m, axis=-1) 312 | 313 | def masked_reduce_mean(x, m): 314 | return tf.reduce_sum(mul_mask(x, m), axis=1) / ( 315 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) 316 | 317 | def masked_reduce_sqrt_mean(x, m): 318 | return tf.reduce_sum(mul_mask(x, m), axis=1) / ( 319 | tf.sqrt(tf.reduce_sum(m, axis=1, keepdims=True)) + 1e-10) 320 | 321 | 322 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/finetuning/graph_ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorflow.python.framework.graph_util import convert_variables_to_constants 4 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference 5 | 6 | 7 | def load_graph(frozen_graph_filename): 8 | with tf.io.gfile.GFile(frozen_graph_filename, "rb") as f: 9 | graph_def = tf.compat.v1.GraphDef() 10 | graph_def.ParseFromString(f.read()) 11 | 12 | with tf.Graph().as_default() as graph: 13 | tf.import_graph_def(graph_def) 14 | return graph 15 | 16 | 17 | ### UPD old version to tf2/working with tf 1.x 18 | def freeze_keras_model(model, export_path=None, clear_devices=True): 19 | """ 20 | Freezes the state of a session into a pruned computation graph. 21 | 22 | @param model The Keras model to be optimized for inference. 23 | @param clear_devices Remove the device directives from the graph for better portability. 24 | @return The frozen graph definition. 
25 | """ 26 | from tensorflow.compat.v1.graph_util import convert_variables_to_constants 27 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference 28 | 29 | session = tf.compat.v1.keras.backend.get_session() 30 | graph = session.graph 31 | 32 | with graph.as_default(): 33 | 34 | input_tensors = model.inputs 35 | output_tensors = model.outputs 36 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors] 37 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors] 38 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors] 39 | 40 | tmp_g = graph.as_graph_def() 41 | if clear_devices: 42 | for node in tmp_g.node: 43 | node.device = "" 44 | 45 | tmp_g = optimize_for_inference( 46 | tmp_g, input_ops, output_ops, dtypes, False) 47 | 48 | tmp_g = convert_variables_to_constants(session, tmp_g, output_ops) 49 | 50 | if export_path is not None: 51 | with tf.io.gfile.GFile(export_path, "wb") as f: 52 | f.write(tmp_g.SerializeToString()) 53 | 54 | return tmp_g 55 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/finetuning/text_preprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import tensorflow as tf 3 | import numpy as np 4 | import collections 5 | import unicodedata 6 | 7 | 8 | class FullTokenizer(object): 9 | """Runs end-to-end tokenziation.""" 10 | 11 | def __init__(self, vocab_file, do_lower_case=True): 12 | self.vocab = load_vocab(vocab_file) 13 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 14 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 15 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 16 | 17 | def tokenize(self, text): 18 | split_tokens = [] 19 | for token in self.basic_tokenizer.tokenize(text): 20 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 21 | split_tokens.append(sub_token) 22 | 23 | return split_tokens 24 | 25 | def convert_tokens_to_ids(self, tokens): 26 | return convert_by_vocab(self.vocab, tokens) 27 | 28 | def convert_ids_to_tokens(self, ids): 29 | return convert_by_vocab(self.inv_vocab, ids) 30 | 31 | def mark_unk_tokens(self, tokens, unk_token='[UNK]'): 32 | return [t if t in self.vocab else unk_token for t in tokens] 33 | 34 | 35 | class BasicTokenizer(object): 36 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 37 | 38 | def __init__(self, do_lower_case=True): 39 | """Constructs a BasicTokenizer. 40 | Args: 41 | do_lower_case: Whether to lower case the input. 42 | """ 43 | self.do_lower_case = do_lower_case 44 | 45 | def tokenize(self, text): 46 | """Tokenizes a piece of text.""" 47 | text = convert_to_unicode(text) 48 | text = self._clean_text(text) 49 | 50 | # This was added on November 1st, 2018 for the multilingual and Chinese 51 | # models. This is also applied to the English models now, but it doesn't 52 | # matter since the English models were not trained on any Chinese data 53 | # and generally don't have any Chinese data in them (there are Chinese 54 | # characters in the vocabulary because Wikipedia does have some Chinese 55 | # words in the English Wikipedia.). 
56 | text = self._tokenize_chinese_chars(text) 57 | 58 | orig_tokens = whitespace_tokenize(text) 59 | split_tokens = [] 60 | for token in orig_tokens: 61 | if self.do_lower_case: 62 | token = token.lower() 63 | token = self._run_strip_accents(token) 64 | split_tokens.extend(self._run_split_on_punc(token)) 65 | 66 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 67 | return output_tokens 68 | 69 | def _run_strip_accents(self, text): 70 | """Strips accents from a piece of text.""" 71 | text = unicodedata.normalize("NFD", text) 72 | output = [] 73 | for char in text: 74 | cat = unicodedata.category(char) 75 | if cat == "Mn": 76 | continue 77 | output.append(char) 78 | return "".join(output) 79 | 80 | def _run_split_on_punc(self, text): 81 | """Splits punctuation on a piece of text.""" 82 | chars = list(text) 83 | i = 0 84 | start_new_word = True 85 | output = [] 86 | while i < len(chars): 87 | char = chars[i] 88 | if _is_punctuation(char): 89 | output.append([char]) 90 | start_new_word = True 91 | else: 92 | if start_new_word: 93 | output.append([]) 94 | start_new_word = False 95 | output[-1].append(char) 96 | i += 1 97 | 98 | return ["".join(x) for x in output] 99 | 100 | def _tokenize_chinese_chars(self, text): 101 | """Adds whitespace around any CJK character.""" 102 | output = [] 103 | for char in text: 104 | cp = ord(char) 105 | if self._is_chinese_char(cp): 106 | output.append(" ") 107 | output.append(char) 108 | output.append(" ") 109 | else: 110 | output.append(char) 111 | return "".join(output) 112 | 113 | def _is_chinese_char(self, cp): 114 | """Checks whether CP is the codepoint of a CJK character.""" 115 | # This defines a "chinese character" as anything in the CJK Unicode block: 116 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 117 | # 118 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 119 | # despite its name. The modern Korean Hangul alphabet is a different block, 120 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 121 | # space-separated words, so they are not treated specially and handled 122 | # like the all of the other languages. 123 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 124 | (cp >= 0x3400 and cp <= 0x4DBF) or # 125 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 126 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 127 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 128 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 129 | (cp >= 0xF900 and cp <= 0xFAFF) or # 130 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 131 | return True 132 | 133 | return False 134 | 135 | def _clean_text(self, text): 136 | """Performs invalid character removal and whitespace cleanup on text.""" 137 | output = [] 138 | for char in text: 139 | cp = ord(char) 140 | if cp == 0 or cp == 0xfffd or _is_control(char): 141 | continue 142 | if _is_whitespace(char): 143 | output.append(" ") 144 | else: 145 | output.append(char) 146 | return "".join(output) 147 | 148 | 149 | class WordpieceTokenizer(object): 150 | """Runs WordPiece tokenziation.""" 151 | 152 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 153 | self.vocab = vocab 154 | self.unk_token = unk_token 155 | self.max_input_chars_per_word = max_input_chars_per_word 156 | 157 | def tokenize(self, text): 158 | """Tokenizes a piece of text into its word pieces. 159 | This uses a greedy longest-match-first algorithm to perform tokenization 160 | using the given vocabulary. 
161 | For example: 162 | input = "unaffable" 163 | output = ["un", "##aff", "##able"] 164 | Args: 165 | text: A single token or whitespace separated tokens. This should have 166 | already been passed through `BasicTokenizer. 167 | Returns: 168 | A list of wordpiece tokens. 169 | """ 170 | 171 | text = convert_to_unicode(text) 172 | 173 | output_tokens = [] 174 | for token in whitespace_tokenize(text): 175 | chars = list(token) 176 | if len(chars) > self.max_input_chars_per_word: 177 | output_tokens.append(self.unk_token) 178 | continue 179 | 180 | is_bad = False 181 | start = 0 182 | sub_tokens = [] 183 | while start < len(chars): 184 | end = len(chars) 185 | cur_substr = None 186 | while start < end: 187 | substr = "".join(chars[start:end]) 188 | if start > 0: 189 | substr = "##" + substr 190 | if substr in self.vocab: 191 | cur_substr = substr 192 | break 193 | end -= 1 194 | if cur_substr is None: 195 | is_bad = True 196 | break 197 | sub_tokens.append(cur_substr) 198 | start = end 199 | 200 | if is_bad: 201 | output_tokens.append(self.unk_token) 202 | else: 203 | output_tokens.extend(sub_tokens) 204 | return output_tokens 205 | 206 | 207 | class InputExample(object): 208 | 209 | def __init__(self, unique_id, text_a, text_b): 210 | self.unique_id = unique_id 211 | self.text_a = text_a 212 | self.text_b = text_b 213 | 214 | 215 | class InputFeatures(object): 216 | """A single set of features of data.""" 217 | 218 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 219 | self.unique_id = unique_id 220 | self.tokens = tokens 221 | self.input_ids = input_ids 222 | self.input_mask = input_mask 223 | self.input_type_ids = input_type_ids 224 | 225 | 226 | def _is_whitespace(char): 227 | """Checks whether `chars` is a whitespace character.""" 228 | # \t, \n, and \r are technically contorl characters but we treat them 229 | # as whitespace since they are generally considered as such. 230 | if char == " " or char == "\t" or char == "\n" or char == "\r": 231 | return True 232 | cat = unicodedata.category(char) 233 | if cat == "Zs": 234 | return True 235 | return False 236 | 237 | 238 | def _is_control(char): 239 | """Checks whether `chars` is a control character.""" 240 | # These are technically control characters but we count them as whitespace 241 | # characters. 242 | if char == "\t" or char == "\n" or char == "\r": 243 | return False 244 | cat = unicodedata.category(char) 245 | if cat.startswith("C"): 246 | return True 247 | return False 248 | 249 | 250 | def _is_punctuation(char): 251 | """Checks whether `chars` is a punctuation character.""" 252 | cp = ord(char) 253 | # We treat all non-letter/number ASCII as punctuation. 254 | # Characters such as "^", "$", and "`" are not in the Unicode 255 | # Punctuation class but we treat them as punctuation anyways, for 256 | # consistency. 
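  # For reference, the ASCII ranges checked below correspond to:
  #   33-47   -> ! " # $ % & ' ( ) * + , - . /
  #   58-64   -> : ; < = > ? @
  #   91-96   -> [ \ ] ^ _ `
  #   123-126 -> { | } ~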
257 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 258 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 259 | return True 260 | cat = unicodedata.category(char) 261 | if cat.startswith("P"): 262 | return True 263 | return False 264 | 265 | 266 | def convert_to_unicode(text): 267 | if isinstance(text, str): 268 | return text 269 | elif isinstance(text, bytes): 270 | return text.decode("utf-8", "ignore") 271 | else: 272 | raise ValueError("Unsupported string type: %s" % (type(text))) 273 | 274 | 275 | def printable_text(text): 276 | if isinstance(text, str): 277 | return text 278 | elif isinstance(text, bytes): 279 | return text.decode("utf-8", "ignore") 280 | else: 281 | raise ValueError("Unsupported string type: %s" % (type(text))) 282 | 283 | 284 | def load_vocab(vocab_file): 285 | """Loads a vocabulary file into a dictionary.""" 286 | vocab = collections.OrderedDict() 287 | index = 0 288 | with tf.io.gfile.GFile(vocab_file, "r") as reader: 289 | while True: 290 | token = convert_to_unicode(reader.readline()) 291 | if not token: 292 | break 293 | token = token.strip() 294 | vocab[token] = index 295 | index += 1 296 | return vocab 297 | 298 | 299 | def convert_by_vocab(vocab, items): 300 | """Converts a sequence of [tokens|ids] using the vocab.""" 301 | output = [] 302 | for item in items: 303 | output.append(vocab[item]) 304 | return output 305 | 306 | 307 | def convert_tokens_to_ids(vocab, tokens): 308 | return convert_by_vocab(vocab, tokens) 309 | 310 | 311 | def convert_ids_to_tokens(inv_vocab, ids): 312 | return convert_by_vocab(inv_vocab, ids) 313 | 314 | 315 | def whitespace_tokenize(text): 316 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 317 | text = text.strip() 318 | if not text: 319 | return [] 320 | tokens = text.split() 321 | return tokens 322 | 323 | 324 | def convert_examples_to_features(examples, seq_length, tokenizer): 325 | """Loads a data file into a list of `InputBatch`s.""" 326 | 327 | features = [] 328 | for (ex_index, example) in enumerate(examples): 329 | tokens_a = tokenizer.tokenize(example.text_a) 330 | 331 | tokens_b = None 332 | if example.text_b: 333 | tokens_b = tokenizer.tokenize(example.text_b) 334 | 335 | if tokens_b: 336 | # Modifies `tokens_a` and `tokens_b` in place so that the total 337 | # length is less than the specified length. 338 | # Account for [CLS], [SEP], [SEP] with "- 3" 339 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 340 | else: 341 | # Account for [CLS] and [SEP] with "- 2" 342 | if len(tokens_a) > seq_length - 2: 343 | tokens_a = tokens_a[0:(seq_length - 2)] 344 | 345 | # The convention in BERT is: 346 | # (a) For sequence pairs: 347 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 348 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 349 | # (b) For single sequences: 350 | # tokens: [CLS] the dog is hairy . [SEP] 351 | # type_ids: 0 0 0 0 0 0 0 352 | # 353 | # Where "type_ids" are used to indicate whether this is the first 354 | # sequence or the second sequence. The embedding vectors for `type=0` and 355 | # `type=1` were learned during pre-training and are added to the wordpiece 356 | # embedding vector (and position vector). This is not *strictly* necessary 357 | # since the [SEP] token unambiguously separates the sequences, but it makes 358 | # it easier for the model to learn the concept of sequences. 359 | # 360 | # For classification tasks, the first vector (corresponding to [CLS]) is 361 | # used as as the "sentence vector". 
Note that this only makes sense because 362 | # the entire model is fine-tuned. 363 | tokens = [] 364 | input_type_ids = [] 365 | tokens.append("[CLS]") 366 | input_type_ids.append(0) 367 | for token in tokens_a: 368 | tokens.append(token) 369 | input_type_ids.append(0) 370 | tokens.append("[SEP]") 371 | input_type_ids.append(0) 372 | 373 | if tokens_b: 374 | for token in tokens_b: 375 | tokens.append(token) 376 | input_type_ids.append(1) 377 | tokens.append("[SEP]") 378 | input_type_ids.append(1) 379 | 380 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 381 | 382 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 383 | # tokens are attended to. 384 | input_mask = [1] * len(input_ids) 385 | 386 | # Zero-pad up to the sequence length. 387 | while len(input_ids) < seq_length: 388 | input_ids.append(0) 389 | input_mask.append(0) 390 | input_type_ids.append(0) 391 | 392 | assert len(input_ids) == seq_length 393 | assert len(input_mask) == seq_length 394 | assert len(input_type_ids) == seq_length 395 | 396 | features.append( 397 | InputFeatures( 398 | unique_id=example.unique_id, 399 | tokens=tokens, 400 | input_ids=input_ids, 401 | input_mask=input_mask, 402 | input_type_ids=input_type_ids)) 403 | return features 404 | 405 | 406 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 407 | """Truncates a sequence pair in place to the maximum length.""" 408 | 409 | # This is a simple heuristic which will always truncate the longer sequence 410 | # one token at a time. This makes more sense than truncating an equal percent 411 | # of tokens from each, since if one sequence is very short then each token 412 | # that's truncated likely contains more information than a longer sequence. 413 | while True: 414 | total_length = len(tokens_a) + len(tokens_b) 415 | if total_length <= max_length: 416 | break 417 | if len(tokens_a) > len(tokens_b): 418 | tokens_a.pop() 419 | else: 420 | tokens_b.pop() 421 | 422 | 423 | def read_examples(str_list): 424 | """Read a list of `InputExample`s from a list of strings.""" 425 | unique_id = 0 426 | for s in str_list: 427 | line = convert_to_unicode(s) 428 | line = line.strip() 429 | text_a = None 430 | text_b = None 431 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 432 | if m is None: 433 | text_a = line 434 | else: 435 | text_a = m.group(1) 436 | text_b = m.group(2) 437 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 438 | unique_id += 1 439 | 440 | 441 | def features_to_arrays(features): 442 | all_input_ids = [] 443 | all_input_mask = [] 444 | all_segment_ids = [] 445 | 446 | for feature in features: 447 | all_input_ids.append(feature.input_ids) 448 | all_input_mask.append(feature.input_mask) 449 | all_segment_ids.append(feature.input_type_ids) 450 | 451 | return (np.array(all_input_ids, dtype='int32'), 452 | np.array(all_input_mask, dtype='int32'), 453 | np.array(all_segment_ids, dtype='int32')) 454 | 455 | 456 | def build_preprocessor(voc_path, seq_len, lower=True): 457 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower) 458 | EMPTY_STR = "" 459 | PAD_STR = "pad" 460 | NULL_VAL = 0 461 | 462 | def strings_to_arrays(str_list): 463 | str_list = np.atleast_1d(str_list).reshape((-1,)) 464 | 465 | empty_id = (str_list == EMPTY_STR).nonzero()[0] 466 | str_list[empty_id] = PAD_STR 467 | 468 | examples = [] 469 | for example in read_examples(str_list): 470 | examples.append(example) 471 | 472 | features = convert_examples_to_features(examples, seq_len, tokenizer) 473 | arrays = 
features_to_arrays(features) 474 | 475 | for arr in arrays: 476 | arr[empty_id] = NULL_VAL 477 | str_list[empty_id] = EMPTY_STR 478 | return arrays 479 | 480 | return strings_to_arrays 481 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.4 2 | tensorflow-hub==0.7.0 3 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | __version__ = '1.0.4' 4 | 5 | setup( 6 | name='bert_experimental', 7 | version=__version__, 8 | description='Utilities for finetuning BERT-like models', 9 | url='https://github.com/gaphex/bert_experimental', 10 | long_description=open('README.md', 'r', encoding="utf8").read(), 11 | long_description_content_type='text/markdown', 12 | author='Denis Antyukhov', 13 | author_email='gaphex@gmail.com', 14 | license='MIT', 15 | packages=find_packages(), 16 | zip_safe=False, 17 | install_requires=[ 18 | 'tensorflow>=1.15, <2.0', 19 | 'tensorflow-hub==0.7.0', 20 | 'numpy' 21 | ], 22 | classifiers=( 23 | 'Programming Language :: Python :: 3.7', 24 | 'License :: OSI Approved :: MIT License', 25 | 'Operating System :: OS Independent', 26 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 27 | ), 28 | keywords='bert nlp tensorflow machine learning sentence encoding embedding finetuning', 29 | ) 30 | -------------------------------------------------------------------------------- /BERT-CNN/data/read.me: -------------------------------------------------------------------------------- 1 | put the train.tsv file here 2 | -------------------------------------------------------------------------------- /BERT-CNN/data/test.tsv: -------------------------------------------------------------------------------- 1 | id visual caption 2 | 0 standard poodle shopping cart footwear a close up of a dog laying in a basket 3 | 1 street sign traffic light tower a black and white photo of a street light 4 | 2 toilet seat a white toilet with its seat up in a bathroom 5 | 3 mobile home studio couch house a living room filled with furniture and a coffee table 6 | 4 french loaf conch person a basket filled with sandwiches on top of a table 7 | 5 indian elephant a group of people riding on the back of an elephant 8 | 6 bow tie windsor glasses a man wearing glasses and a tie in a room 9 | 7 sombrero bonnet woman a woman standing in front of a giant cake 10 | 8 diaper bassinet human a baby sitting in front of a giant cake 11 | 9 bobsled go-kart human a group of children sitting around a piece of luggage 12 | 10 vase spotlight plant a bunch of flowers that are in a vase 13 | -------------------------------------------------------------------------------- /BERT-CNN/data/train.tsv: -------------------------------------------------------------------------------- 1 | id id1 id2 visual caption is_related 2 | 220740 220741 220742 marimba dalmatian picket fence a horse jumping competition is going on with people in the stands 1 3 | 385729 385730 385731 dishwasher microwave barber chair a person riding a horse on a dirt ground 0 4 | 59422 59423 59424 laptop carton comicbook a laptop that has stickers on its cover is sitting on a table 1 5 | 46638 46639 46640 suit Windsortie woodenspoon a young bow wearing a pink shirt and a purple 
tie 1 6 | 11870 11871 11872 studiocouch four-poster quilt a couple of girls sitting in a bed in a bedroom 1 7 | 471676 471677 471678 streetcar fire engine passenger car a multi layer plate with cakes and food on it 0 8 | 186795 186796 186797 shoe shop television monitor a man playing a wii on a large projector screen 1 9 | 121836 121837 121838 ox water buffalo alp cattle standing on a hill in fog 1 10 | 396224 396225 396226 altar desk perfume oranges sitting in a blue bowl on a wooden table 0 11 | 430635 430636 430637 speedboat paddle lifeboat pots and other items sit on a stove and counter 0 12 | 145057 145058 145059 shopping cart ashcan park bench a coin meter that is laying down on grates 1 13 | 409778 409779 409780 web site fire engine comic book a painting of a man from the back 0 14 | 155568 155569 155570 grocery store patio restaurant a man and woman walking up the stairs in a backyard 1 15 | 213951 213952 213953 microwave washer dining table the kitchen is equipped with all the latest appliances 1 16 | 489266 489267 489268 traffic light aircraft carrier chain saw a laptop computer on a desk with cables a mug and bowl 0 17 | 257649 257650 257651 grocery store confectionery shopping basket a couple of wooden tale stopped with fresh fruit 1 18 | 113826 113827 113828 lab coat vestment West Highland white terrier a group of people standing in rows with frisbees for a photo 1 19 | 486413 486414 486415 snorkel ski tennis ball two frames of a woman in the air on a tennis court 0 20 | 400432 400433 400434 crutch lawn mower chain saw eight underneath on ambarella in the forest parrot 0 21 | 341153 341154 341155 washer microwave dishwasher a small propeller plane sitting underneath a covering at an airport 0 22 | 462067 462068 462069 ballplayer baseball scoreboard a plate full of bright green lettuce next to some bread 0 23 | 443392 443393 443394 grocery store pineapple pizza a man in black and white stripes with makeup smiling 0 24 | 486660 486661 486662 wombat wallaby titi a persons shadow on the ground of them skateboarding 0 25 | 336616 336617 336618 moped motor scooter crash helmet multiple street signs are attached to the post 0 26 | 124199 124200 124201 sorrel hog barrel a brown horse eating from a hallowed out metal barrel 1 27 | 238004 238005 238006 tray washbasin cradle a cat laying on a couch near a remote control 1 28 | 319195 319196 319197 airliner wing web site a propeller airplane parked inside and airplane hanger 1 29 | 412036 412037 412038 grey whale breakwater killer whale a stop sign is standing at a street intersection 0 30 | 491896 491897 491898 teddy wool toyshop a woman in an old-fashioned kitchen with pots and pans 0 31 | 487501 487502 487503 snowmobile steam locomotive tow truck the living room is clean and empty from people 0 32 | 277093 277094 277095 microwave dishwasher chest a chair holding a laptop that is facing towards an oven 1 33 | 135542 135543 135544 water buffalo warthog hog sheep grazing under a tree in a grassy meadow 1 34 | 8448 8449 8450 mountainbike unicycle bicycle-built-for-two a picture of a person throwing a frisbee 1 35 | 170686 170687 170688 police van minibus ambulance a person in the army greeting someone in a suit 1 36 | 372016 372017 372018 Great Dane Irish wolfhound English setter a man standing in a room holding a remote 0 37 | 351158 351159 351160 sunglass bullet train sunglasses a woman opening the trunk of her car 0 38 | 414542 414543 414544 killer whale great white shark paddle a dog running across a field with a frisbee in his mouth 0 
39 | 264998 264999 265000 bannister ski unicycle a man riding a skateboard along a metal hand rail 1 40 | 362868 362869 362870 zebra bustard gazelle a basket full of bananas with a net on top 0 41 | 88455 88456 88457 patio flagpole pole a fire hydrant and fire hose in a houses front yard 1 42 | 372512 372513 372514 seashore catamaran swimming trunks a man riding a surfboard on a wave in the ocean 0 43 | 387327 387328 387329 cellular telephone lab coat cash machine a baseball game ensues as people watch 0 44 | 248027 248028 248029 web site barbershop cinema a motor bike on the side of the street 1 45 | 347507 347508 347509 banana pineapple orange a bear itching itself on a bare tree 0 46 | 33714 33715 33716 picketfence streetcar mountainbike the red bike and the pink bike just started dating 1 47 | 173989 173990 173991 umbrella poncho jinrikisha a group of people walking down a street carrying umbrellas 1 48 | 20835 20836 20837 ballplayer baseball footballhelmet a man throwing a baseball from a mound on a field 1 49 | 16356 16357 16358 lumbermill barbershop turnstile a man working on a baseball bat while two others watch 1 50 | 193491 193492 193493 unicycle pole horizontal bar boy riding on his skateboard down a stair rail 1 51 | 384165 384166 384167 mixing bowl corn meat loaf a couple of sailors standing next to a woman 0 52 | 321736 321737 321738 ballplayer baseball football helmet a boys baseball game with a batter catcher and umpire 1 53 | 108395 108396 108397 crash helmet moped backpack a man with a suit and tie on a motor bike 1 54 | 215942 215943 215944 unicycle military uniform bearskin four guys are sitting on a bench in front of a building 1 55 | 134156 134157 134158 wine bottle eggnog red wine there is a bottle of wine next to a glass 1 56 | 297783 297784 297785 necklace thimble corkscrew this is an image of a meal and an avocado is included 1 57 | 110516 110517 110518 minivan cab police van a dog looking ahead with a stoic look in a car seat 1 58 | 3166 3167 3168 grocerystore headcabbage cauliflower a pile of vegetables on display at a grocery store 1 59 | 440075 440076 440077 ski curly-coatedretriever Gordonsetter elephants and their young in their natural habitat 0 60 | 71021 71022 71023 ballplayer baseball puck a baseball player and a flying black bat 1 61 | -------------------------------------------------------------------------------- /BERT-CNN/data_pre.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import json 5 | 6 | import logging 7 | import numpy as np 8 | import pandas as pd 9 | import tensorflow as tf 10 | 11 | from modeling import BertModel, BertConfig 12 | from tokenization import FullTokenizer, convert_to_unicode 13 | from extract_features import InputExample, convert_examples_to_features 14 | 15 | def read_examples(str_list): 16 | """Read a list of `InputExample`s from a list of strings.""" 17 | unique_id = 0 18 | for s in str_list: 19 | line = convert_to_unicode(s) 20 | if not line: 21 | continue 22 | line = line.strip() 23 | text_a = None 24 | text_b = None 25 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 26 | if m is None: 27 | text_a = line 28 | else: 29 | text_a = m.group(1) 30 | text_b = m.group(2) 31 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 32 | unique_id += 1 33 | 34 | # Convert theses features to np.arrays to use with tf.Keras. 
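# For n examples and a fixed seq_len, this returns three int32 arrays of
# shape (n, seq_len): input_ids, input_mask and the segment (input_type) ids,
# in the order the BertLayer preprocessing path unpacks them.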
35 | def features_to_arrays(features): 36 | 37 | all_input_ids = [] 38 | all_input_mask = [] 39 | all_segment_ids = [] 40 | 41 | for feature in features: 42 | all_input_ids.append(feature.input_ids) 43 | all_input_mask.append(feature.input_mask) 44 | all_segment_ids.append(feature.input_type_ids) 45 | 46 | return (np.array(all_input_ids, dtype='int32'), 47 | np.array(all_input_mask, dtype='int32'), 48 | np.array(all_segment_ids, dtype='int32')) 49 | 50 | 51 | # built all togehter 52 | def build_preprocessor(voc_path, seq_len, lower=True): 53 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower) 54 | 55 | def strings_to_arrays(sents): 56 | 57 | sents = np.atleast_1d(sents).reshape((-1,)) 58 | 59 | examples = [] 60 | for example in read_examples(sents): 61 | examples.append(example) 62 | 63 | features = convert_examples_to_features(examples, seq_len, tokenizer) 64 | arrays = features_to_arrays(features) 65 | return arrays 66 | 67 | return strings_to_arrays 68 | 69 | -------------------------------------------------------------------------------- /BERT-CNN/eval.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import pandas as pd 4 | import sys 5 | import argparse 6 | from sklearn.model_selection import train_test_split 7 | 8 | sys.path.insert(0, "bert_experimental") 9 | 10 | from bert_experimental.finetuning.text_preprocessing import build_preprocessor 11 | from bert_experimental.finetuning.graph_ops import load_graph 12 | 13 | 14 | 15 | parser=argparse.ArgumentParser(description='inference of the model') 16 | parser.add_argument('--testset', default='test.tsv', help='test file', type=str,required=True) 17 | parser.add_argument('--model', default='pre-trained model', help='', type=str, required=True) 18 | args = parser.parse_args() 19 | 20 | 21 | 22 | df = pd.read_csv(args.testset, sep='\t') 23 | 24 | 25 | texts = [] 26 | delimiter = " ||| " 27 | 28 | for vis, cap in zip(df.visual.tolist(), df.caption.tolist()): 29 | texts.append(delimiter.join((str(vis), str(cap)))) 30 | 31 | 32 | texts = np.array(texts) 33 | 34 | trX, tsX = train_test_split(texts, shuffle=False, test_size=0.01) 35 | 36 | 37 | restored_graph = load_graph(args.model) 38 | 39 | graph_ops = restored_graph.get_operations() 40 | input_op, output_op = graph_ops[0].name, graph_ops[-1].name 41 | print(input_op, output_op) 42 | 43 | x = restored_graph.get_tensor_by_name(input_op + ':0') 44 | y = restored_graph.get_tensor_by_name(output_op + ':0') 45 | 46 | preprocessor = build_preprocessor("uncased_L-12_H-768_A-12/vocab.txt", 64) 47 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32], name='preprocessor') 48 | 49 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32]) 50 | 51 | ##predictions 52 | 53 | sess = tf.Session(graph=restored_graph) 54 | 55 | print(trX[:2]) 56 | 57 | y = tf.print(y, summarize=-1) 58 | #x = tf.print(x, summarize=-1) 59 | y_out = sess.run(y, feed_dict={ 60 | x: trX[:2].reshape((-1,1)) 61 | 62 | }) 63 | 64 | print(y_out) 65 | 66 | -------------------------------------------------------------------------------- /BERT-CNN/extract_features.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Extract pre-computed feature vectors from BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import codecs 22 | import collections 23 | import json 24 | import re 25 | 26 | import modeling 27 | import tokenization 28 | import tensorflow as tf 29 | 30 | flags = tf.flags 31 | 32 | FLAGS = flags.FLAGS 33 | 34 | flags.DEFINE_string("input_file", None, "") 35 | 36 | flags.DEFINE_string("output_file", None, "") 37 | 38 | flags.DEFINE_string("layers", "-1,-2,-3,-4", "") 39 | 40 | flags.DEFINE_string( 41 | "bert_config_file", None, 42 | "The config json file corresponding to the pre-trained BERT model. " 43 | "This specifies the model architecture.") 44 | 45 | flags.DEFINE_integer( 46 | "max_seq_length", 128, 47 | "The maximum total input sequence length after WordPiece tokenization. " 48 | "Sequences longer than this will be truncated, and sequences shorter " 49 | "than this will be padded.") 50 | 51 | flags.DEFINE_string( 52 | "init_checkpoint", None, 53 | "Initial checkpoint (usually from a pre-trained BERT model).") 54 | 55 | flags.DEFINE_string("vocab_file", None, 56 | "The vocabulary file that the BERT model was trained on.") 57 | 58 | flags.DEFINE_bool( 59 | "do_lower_case", True, 60 | "Whether to lower case the input text. Should be True for uncased " 61 | "models and False for cased models.") 62 | 63 | flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") 64 | 65 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 66 | 67 | flags.DEFINE_string("master", None, 68 | "If using a TPU, the address of the master.") 69 | 70 | flags.DEFINE_integer( 71 | "num_tpu_cores", 8, 72 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 73 | 74 | flags.DEFINE_bool( 75 | "use_one_hot_embeddings", False, 76 | "If True, tf.one_hot will be used for embedding lookups, otherwise " 77 | "tf.nn.embedding_lookup will be used. 
On TPUs, this should be True " 78 | "since it is much faster.") 79 | 80 | 81 | class InputExample(object): 82 | 83 | def __init__(self, unique_id, text_a, text_b): 84 | self.unique_id = unique_id 85 | self.text_a = text_a 86 | self.text_b = text_b 87 | 88 | 89 | class InputFeatures(object): 90 | """A single set of features of data.""" 91 | 92 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 93 | self.unique_id = unique_id 94 | self.tokens = tokens 95 | self.input_ids = input_ids 96 | self.input_mask = input_mask 97 | self.input_type_ids = input_type_ids 98 | 99 | 100 | def input_fn_builder(features, seq_length): 101 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 102 | 103 | all_unique_ids = [] 104 | all_input_ids = [] 105 | all_input_mask = [] 106 | all_input_type_ids = [] 107 | 108 | for feature in features: 109 | all_unique_ids.append(feature.unique_id) 110 | all_input_ids.append(feature.input_ids) 111 | all_input_mask.append(feature.input_mask) 112 | all_input_type_ids.append(feature.input_type_ids) 113 | 114 | def input_fn(params): 115 | """The actual input function.""" 116 | batch_size = params["batch_size"] 117 | 118 | num_examples = len(features) 119 | 120 | # This is for demo purposes and does NOT scale to large data sets. We do 121 | # not use Dataset.from_generator() because that uses tf.py_func which is 122 | # not TPU compatible. The right way to load data is with TFRecordReader. 123 | d = tf.data.Dataset.from_tensor_slices({ 124 | "unique_ids": 125 | tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), 126 | "input_ids": 127 | tf.constant( 128 | all_input_ids, shape=[num_examples, seq_length], 129 | dtype=tf.int32), 130 | "input_mask": 131 | tf.constant( 132 | all_input_mask, 133 | shape=[num_examples, seq_length], 134 | dtype=tf.int32), 135 | "input_type_ids": 136 | tf.constant( 137 | all_input_type_ids, 138 | shape=[num_examples, seq_length], 139 | dtype=tf.int32), 140 | }) 141 | 142 | d = d.batch(batch_size=batch_size, drop_remainder=False) 143 | return d 144 | 145 | return input_fn 146 | 147 | 148 | def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, 149 | use_one_hot_embeddings): 150 | """Returns `model_fn` closure for TPUEstimator.""" 151 | 152 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 153 | """The `model_fn` for TPUEstimator.""" 154 | 155 | unique_ids = features["unique_ids"] 156 | input_ids = features["input_ids"] 157 | input_mask = features["input_mask"] 158 | input_type_ids = features["input_type_ids"] 159 | 160 | model = modeling.BertModel( 161 | config=bert_config, 162 | is_training=False, 163 | input_ids=input_ids, 164 | input_mask=input_mask, 165 | token_type_ids=input_type_ids, 166 | use_one_hot_embeddings=use_one_hot_embeddings) 167 | 168 | if mode != tf.estimator.ModeKeys.PREDICT: 169 | raise ValueError("Only PREDICT modes are supported: %s" % (mode)) 170 | 171 | tvars = tf.trainable_variables() 172 | scaffold_fn = None 173 | (assignment_map, 174 | initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( 175 | tvars, init_checkpoint) 176 | if use_tpu: 177 | 178 | def tpu_scaffold(): 179 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 180 | return tf.train.Scaffold() 181 | 182 | scaffold_fn = tpu_scaffold 183 | else: 184 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 185 | 186 | tf.logging.info("**** Trainable Variables ****") 187 | for var in tvars: 188 | init_string = "" 189 | if 
var.name in initialized_variable_names: 190 | init_string = ", *INIT_FROM_CKPT*" 191 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 192 | init_string) 193 | 194 | all_layers = model.get_all_encoder_layers() 195 | 196 | predictions = { 197 | "unique_id": unique_ids, 198 | } 199 | 200 | for (i, layer_index) in enumerate(layer_indexes): 201 | predictions["layer_output_%d" % i] = all_layers[layer_index] 202 | 203 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 204 | mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) 205 | return output_spec 206 | 207 | return model_fn 208 | 209 | 210 | def convert_examples_to_features(examples, seq_length, tokenizer): 211 | """Loads a data file into a list of `InputBatch`s.""" 212 | 213 | features = [] 214 | for (ex_index, example) in enumerate(examples): 215 | tokens_a = tokenizer.tokenize(example.text_a) 216 | 217 | tokens_b = None 218 | if example.text_b: 219 | tokens_b = tokenizer.tokenize(example.text_b) 220 | 221 | if tokens_b: 222 | # Modifies `tokens_a` and `tokens_b` in place so that the total 223 | # length is less than the specified length. 224 | # Account for [CLS], [SEP], [SEP] with "- 3" 225 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 226 | else: 227 | # Account for [CLS] and [SEP] with "- 2" 228 | if len(tokens_a) > seq_length - 2: 229 | tokens_a = tokens_a[0:(seq_length - 2)] 230 | 231 | # The convention in BERT is: 232 | # (a) For sequence pairs: 233 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 234 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 235 | # (b) For single sequences: 236 | # tokens: [CLS] the dog is hairy . [SEP] 237 | # type_ids: 0 0 0 0 0 0 0 238 | # 239 | # Where "type_ids" are used to indicate whether this is the first 240 | # sequence or the second sequence. The embedding vectors for `type=0` and 241 | # `type=1` were learned during pre-training and are added to the wordpiece 242 | # embedding vector (and position vector). This is not *strictly* necessary 243 | # since the [SEP] token unambiguously separates the sequences, but it makes 244 | # it easier for the model to learn the concept of sequences. 245 | # 246 | # For classification tasks, the first vector (corresponding to [CLS]) is 247 | # used as as the "sentence vector". Note that this only makes sense because 248 | # the entire model is fine-tuned. 249 | tokens = [] 250 | input_type_ids = [] 251 | tokens.append("[CLS]") 252 | input_type_ids.append(0) 253 | for token in tokens_a: 254 | tokens.append(token) 255 | input_type_ids.append(0) 256 | tokens.append("[SEP]") 257 | input_type_ids.append(0) 258 | 259 | if tokens_b: 260 | for token in tokens_b: 261 | tokens.append(token) 262 | input_type_ids.append(1) 263 | tokens.append("[SEP]") 264 | input_type_ids.append(1) 265 | 266 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 267 | 268 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 269 | # tokens are attended to. 270 | input_mask = [1] * len(input_ids) 271 | 272 | # Zero-pad up to the sequence length. 
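    # Worked example (illustrative): with seq_length=8 and the single
    # sequence [CLS] the dog is hairy . [SEP] (7 tokens), one pad slot is
    # filled, giving
    #   input_mask     = [1, 1, 1, 1, 1, 1, 1, 0]
    #   input_type_ids = [0, 0, 0, 0, 0, 0, 0, 0]
    # and a trailing 0 appended to input_ids.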
273 | while len(input_ids) < seq_length: 274 | input_ids.append(0) 275 | input_mask.append(0) 276 | input_type_ids.append(0) 277 | 278 | assert len(input_ids) == seq_length 279 | assert len(input_mask) == seq_length 280 | assert len(input_type_ids) == seq_length 281 | 282 | if ex_index < 5: 283 | tf.logging.info("*** Example ***") 284 | tf.logging.info("unique_id: %s" % (example.unique_id)) 285 | tf.logging.info("tokens: %s" % " ".join( 286 | [tokenization.printable_text(x) for x in tokens])) 287 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 288 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 289 | tf.logging.info( 290 | "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) 291 | 292 | features.append( 293 | InputFeatures( 294 | unique_id=example.unique_id, 295 | tokens=tokens, 296 | input_ids=input_ids, 297 | input_mask=input_mask, 298 | input_type_ids=input_type_ids)) 299 | return features 300 | 301 | 302 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 303 | """Truncates a sequence pair in place to the maximum length.""" 304 | 305 | # This is a simple heuristic which will always truncate the longer sequence 306 | # one token at a time. This makes more sense than truncating an equal percent 307 | # of tokens from each, since if one sequence is very short then each token 308 | # that's truncated likely contains more information than a longer sequence. 309 | while True: 310 | total_length = len(tokens_a) + len(tokens_b) 311 | if total_length <= max_length: 312 | break 313 | if len(tokens_a) > len(tokens_b): 314 | tokens_a.pop() 315 | else: 316 | tokens_b.pop() 317 | 318 | 319 | def read_examples(input_file): 320 | """Read a list of `InputExample`s from an input file.""" 321 | examples = [] 322 | unique_id = 0 323 | with tf.gfile.GFile(input_file, "r") as reader: 324 | while True: 325 | line = tokenization.convert_to_unicode(reader.readline()) 326 | if not line: 327 | break 328 | line = line.strip() 329 | text_a = None 330 | text_b = None 331 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 332 | if m is None: 333 | text_a = line 334 | else: 335 | text_a = m.group(1) 336 | text_b = m.group(2) 337 | examples.append( 338 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 339 | unique_id += 1 340 | return examples 341 | 342 | 343 | def main(_): 344 | tf.logging.set_verbosity(tf.logging.INFO) 345 | 346 | layer_indexes = [int(x) for x in FLAGS.layers.split(",")] 347 | 348 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 349 | 350 | tokenizer = tokenization.FullTokenizer( 351 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 352 | 353 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 354 | run_config = tf.contrib.tpu.RunConfig( 355 | master=FLAGS.master, 356 | tpu_config=tf.contrib.tpu.TPUConfig( 357 | num_shards=FLAGS.num_tpu_cores, 358 | per_host_input_for_training=is_per_host)) 359 | 360 | examples = read_examples(FLAGS.input_file) 361 | 362 | features = convert_examples_to_features( 363 | examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) 364 | 365 | unique_id_to_feature = {} 366 | for feature in features: 367 | unique_id_to_feature[feature.unique_id] = feature 368 | 369 | model_fn = model_fn_builder( 370 | bert_config=bert_config, 371 | init_checkpoint=FLAGS.init_checkpoint, 372 | layer_indexes=layer_indexes, 373 | use_tpu=FLAGS.use_tpu, 374 | use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) 375 | 376 | # If TPU is not 
available, this will fall back to normal Estimator on CPU 377 | # or GPU. 378 | estimator = tf.contrib.tpu.TPUEstimator( 379 | use_tpu=FLAGS.use_tpu, 380 | model_fn=model_fn, 381 | config=run_config, 382 | predict_batch_size=FLAGS.batch_size) 383 | 384 | input_fn = input_fn_builder( 385 | features=features, seq_length=FLAGS.max_seq_length) 386 | 387 | with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, 388 | "w")) as writer: 389 | for result in estimator.predict(input_fn, yield_single_examples=True): 390 | unique_id = int(result["unique_id"]) 391 | feature = unique_id_to_feature[unique_id] 392 | output_json = collections.OrderedDict() 393 | output_json["linex_index"] = unique_id 394 | all_features = [] 395 | for (i, token) in enumerate(feature.tokens): 396 | all_layers = [] 397 | for (j, layer_index) in enumerate(layer_indexes): 398 | layer_output = result["layer_output_%d" % j] 399 | layers = collections.OrderedDict() 400 | layers["index"] = layer_index 401 | layers["values"] = [ 402 | round(float(x), 6) for x in layer_output[i:(i + 1)].flat 403 | ] 404 | all_layers.append(layers) 405 | features = collections.OrderedDict() 406 | features["token"] = token 407 | features["layers"] = all_layers 408 | all_features.append(features) 409 | output_json["features"] = all_features 410 | writer.write(json.dumps(output_json) + "\n") 411 | 412 | 413 | if __name__ == "__main__": 414 | flags.mark_flag_as_required("input_file") 415 | flags.mark_flag_as_required("vocab_file") 416 | flags.mark_flag_as_required("bert_config_file") 417 | flags.mark_flag_as_required("init_checkpoint") 418 | flags.mark_flag_as_required("output_file") 419 | tf.app.run() 420 | -------------------------------------------------------------------------------- /BERT-CNN/freeze_keras_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import os 5 | import sys 6 | import json 7 | 8 | import logging 9 | import numpy as np 10 | import pandas as pd 11 | import tensorflow as tf 12 | import tensorflow_hub as hub 13 | from tensorflow import keras 14 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 15 | 16 | from sklearn.model_selection import train_test_split 17 | 18 | 19 | if not 'bert_repo' in sys.path: 20 | sys.path.insert(0, 'bert_repo') 21 | 22 | from modeling import BertModel, BertConfig 23 | from tokenization import FullTokenizer, convert_to_unicode 24 | from extract_features import InputExample, convert_examples_to_features 25 | 26 | 27 | def freeze_keras_model(model, export_path=None, clear_devices=True): 28 | sess = tf.keras.backend.get_session() 29 | graph = sess.graph 30 | 31 | with graph.as_default(): 32 | 33 | input_tensors = model.inputs 34 | output_tensors = model.outputs 35 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors] 36 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors] 37 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors] 38 | 39 | tmp_g = graph.as_graph_def() 40 | if clear_devices: 41 | for node in tmp_g.node: 42 | node.device = "" 43 | 44 | tmp_g = optimize_for_inference( 45 | tmp_g, input_ops, output_ops, dtypes, False) 46 | 47 | tmp_g = convert_variables_to_constants(sess, tmp_g, output_ops) 48 | 49 | if export_path is not None: 50 | with tf.gfile.GFile(export_path, "wb") as f: 51 | f.write(tmp_g.SerializeToString()) 52 | 53 | return tmp_g 54 | 55 | -------------------------------------------------------------------------------- 
/BERT-CNN/model.json: -------------------------------------------------------------------------------- 1 | "{\"class_name\": \"Model\", \"config\": {\"name\": \"model\", \"layers\": [{\"name\": \"input_1\", \"class_name\": \"InputLayer\", \"config\": {\"batch_input_shape\": [null, 1], \"dtype\": \"string\", \"sparse\": false, \"ragged\": false, \"name\": \"input_1\"}, \"inbound_nodes\": []}, {\"name\": \"bert_layer\", \"class_name\": \"BertLayer\", \"config\": {\"bert_path\": \"./bert-module/\", \"seq_len\": 64, \"pooling\": null, \"n_tune_layers\": 12, \"tune_embeddings\": false, \"do_preprocessing\": true, \"verbose\": false}, \"inbound_nodes\": [[[\"input_1\", 0, 0, {}]]]}, {\"name\": \"conv1d\", \"class_name\": \"Conv1D\", \"config\": {\"name\": \"conv1d\", \"trainable\": true, \"dtype\": \"float32\", \"filters\": 32, \"kernel_size\": [3], \"strides\": [1], \"padding\": \"valid\", \"data_format\": \"channels_last\", \"dilation_rate\": [1], \"activation\": \"relu\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"GlorotUniform\", \"config\": {\"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Zeros\", \"config\": {\"dtype\": \"float32\"}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"inbound_nodes\": [[[\"bert_layer\", 0, 0, {}]]]}, {\"name\": \"max_pooling1d\", \"class_name\": \"MaxPooling1D\", \"config\": {\"name\": \"max_pooling1d\", \"trainable\": true, \"dtype\": \"float32\", \"strides\": [2], \"pool_size\": [2], \"padding\": \"valid\", \"data_format\": \"channels_last\"}, \"inbound_nodes\": [[[\"conv1d\", 0, 0, {}]]]}, {\"name\": \"flatten\", \"class_name\": \"Flatten\", \"config\": {\"name\": \"flatten\", \"trainable\": true, \"dtype\": \"float32\", \"data_format\": \"channels_last\"}, \"inbound_nodes\": [[[\"max_pooling1d\", 0, 0, {}]]]}, {\"name\": \"dense\", \"class_name\": \"Dense\", \"config\": {\"name\": \"dense\", \"trainable\": true, \"dtype\": \"float32\", \"units\": 1, \"activation\": \"sigmoid\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"GlorotUniform\", \"config\": {\"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Zeros\", \"config\": {\"dtype\": \"float32\"}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"inbound_nodes\": [[[\"flatten\", 0, 0, {}]]]}], \"input_layers\": [[\"input_1\", 0, 0]], \"output_layers\": [[\"dense\", 0, 0]]}, \"keras_version\": \"2.2.4-tf\", \"backend\": \"tensorflow\"}" -------------------------------------------------------------------------------- /BERT-CNN/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 
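  # (Stock TF1 optimizers increment the step inside `apply_gradients` when a
  # `global_step` argument is passed; `AdamWeightDecayOptimizer.apply_gradients`
  # below accepts that argument but never uses it, hence the explicit
  # increment here.)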
82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 
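      # In update form (note: no Adam bias correction is applied here):
      #   m_t = beta_1 * m + (1 - beta_1) * g
      #   v_t = beta_2 * v + (1 - beta_2) * g^2
      #   w  <-  w - lr * ( m_t / (sqrt(v_t) + eps) + weight_decay_rate * w )
      # with the decay term skipped for LayerNorm and bias parameters.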
146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /BERT-CNN/test_demo.tsv: -------------------------------------------------------------------------------- 1 | test_id visual caption 2 | 0 standard poodle shopping cart footwear a close up of shoes and a dog in a basket 3 | 1 standard poodle shopping cart footwear a brown teddy bear laying on top of a pair of shoes 4 | 2 toilet seat a toilet with a hole in the floor 5 | 3 mobile home studio couch house a living room with a couch chair coffee table and a television 6 | 4 french loaf conch person a sandwich and a basket of food on a table 7 | 5 indian elephant a man and two children riding on an elephant 8 | -------------------------------------------------------------------------------- /BERT-CNN/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import re 23 | import unicodedata 24 | import six 25 | import tensorflow as tf 26 | 27 | 28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): 29 | """Checks whether the casing config is consistent with the checkpoint name.""" 30 | 31 | # The casing has to be passed in by the user and there is no explicit check 32 | # as to whether it matches the checkpoint. The casing information probably 33 | # should have been stored in the bert_config.json file, but it's not, so 34 | # we have to heuristically detect it to validate. 
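  # e.g. init_checkpoint="uncased_L-12_H-768_A-12/bert_model.ckpt" yields
  # model_name="uncased_L-12_H-768_A-12", which is only consistent with
  # --do_lower_case=True; the cased checkpoints require --do_lower_case=False.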
35 | 36 | if not init_checkpoint: 37 | return 38 | 39 | m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) 40 | if m is None: 41 | return 42 | 43 | model_name = m.group(1) 44 | 45 | lower_models = [ 46 | "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", 47 | "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" 48 | ] 49 | 50 | cased_models = [ 51 | "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", 52 | "multi_cased_L-12_H-768_A-12" 53 | ] 54 | 55 | is_bad_config = False 56 | if model_name in lower_models and not do_lower_case: 57 | is_bad_config = True 58 | actual_flag = "False" 59 | case_name = "lowercased" 60 | opposite_flag = "True" 61 | 62 | if model_name in cased_models and do_lower_case: 63 | is_bad_config = True 64 | actual_flag = "True" 65 | case_name = "cased" 66 | opposite_flag = "False" 67 | 68 | if is_bad_config: 69 | raise ValueError( 70 | "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " 71 | "However, `%s` seems to be a %s model, so you " 72 | "should pass in `--do_lower_case=%s` so that the fine-tuning matches " 73 | "how the model was pre-training. If this error is wrong, please " 74 | "just comment out this check." % (actual_flag, init_checkpoint, 75 | model_name, case_name, opposite_flag)) 76 | 77 | 78 | def convert_to_unicode(text): 79 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 80 | if six.PY3: 81 | if isinstance(text, str): 82 | return text 83 | elif isinstance(text, bytes): 84 | return text.decode("utf-8", "ignore") 85 | else: 86 | raise ValueError("Unsupported string type: %s" % (type(text))) 87 | elif six.PY2: 88 | if isinstance(text, str): 89 | return text.decode("utf-8", "ignore") 90 | elif isinstance(text, unicode): 91 | return text 92 | else: 93 | raise ValueError("Unsupported string type: %s" % (type(text))) 94 | else: 95 | raise ValueError("Not running on Python2 or Python 3?") 96 | 97 | 98 | def printable_text(text): 99 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 100 | 101 | # These functions want `str` for both Python2 and Python3, but in one case 102 | # it's a Unicode string and in the other it's a byte string. 
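  # e.g. on Python 3, printable_text(b"caf\xc3\xa9") returns the str "café",
  # while a str input is passed through unchanged.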
103 | if six.PY3: 104 | if isinstance(text, str): 105 | return text 106 | elif isinstance(text, bytes): 107 | return text.decode("utf-8", "ignore") 108 | else: 109 | raise ValueError("Unsupported string type: %s" % (type(text))) 110 | elif six.PY2: 111 | if isinstance(text, str): 112 | return text 113 | elif isinstance(text, unicode): 114 | return text.encode("utf-8") 115 | else: 116 | raise ValueError("Unsupported string type: %s" % (type(text))) 117 | else: 118 | raise ValueError("Not running on Python2 or Python 3?") 119 | 120 | 121 | def load_vocab(vocab_file): 122 | """Loads a vocabulary file into a dictionary.""" 123 | vocab = collections.OrderedDict() 124 | index = 0 125 | with tf.gfile.GFile(vocab_file, "r") as reader: 126 | while True: 127 | token = convert_to_unicode(reader.readline()) 128 | if not token: 129 | break 130 | token = token.strip() 131 | vocab[token] = index 132 | index += 1 133 | return vocab 134 | 135 | 136 | def convert_by_vocab(vocab, items): 137 | """Converts a sequence of [tokens|ids] using the vocab.""" 138 | output = [] 139 | for item in items: 140 | output.append(vocab[item]) 141 | return output 142 | 143 | 144 | def convert_tokens_to_ids(vocab, tokens): 145 | return convert_by_vocab(vocab, tokens) 146 | 147 | 148 | def convert_ids_to_tokens(inv_vocab, ids): 149 | return convert_by_vocab(inv_vocab, ids) 150 | 151 | 152 | def whitespace_tokenize(text): 153 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 154 | text = text.strip() 155 | if not text: 156 | return [] 157 | tokens = text.split() 158 | return tokens 159 | 160 | 161 | class FullTokenizer(object): 162 | """Runs end-to-end tokenziation.""" 163 | 164 | def __init__(self, vocab_file, do_lower_case=True): 165 | self.vocab = load_vocab(vocab_file) 166 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 167 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 168 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 169 | 170 | def tokenize(self, text): 171 | split_tokens = [] 172 | for token in self.basic_tokenizer.tokenize(text): 173 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 174 | split_tokens.append(sub_token) 175 | 176 | return split_tokens 177 | 178 | def convert_tokens_to_ids(self, tokens): 179 | return convert_by_vocab(self.vocab, tokens) 180 | 181 | def convert_ids_to_tokens(self, ids): 182 | return convert_by_vocab(self.inv_vocab, ids) 183 | 184 | 185 | class BasicTokenizer(object): 186 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 187 | 188 | def __init__(self, do_lower_case=True): 189 | """Constructs a BasicTokenizer. 190 | 191 | Args: 192 | do_lower_case: Whether to lower case the input. 193 | """ 194 | self.do_lower_case = do_lower_case 195 | 196 | def tokenize(self, text): 197 | """Tokenizes a piece of text.""" 198 | text = convert_to_unicode(text) 199 | text = self._clean_text(text) 200 | 201 | # This was added on November 1st, 2018 for the multilingual and Chinese 202 | # models. This is also applied to the English models now, but it doesn't 203 | # matter since the English models were not trained on any Chinese data 204 | # and generally don't have any Chinese data in them (there are Chinese 205 | # characters in the vocabulary because Wikipedia does have some Chinese 206 | # words in the English Wikipedia.). 
207 | text = self._tokenize_chinese_chars(text) 208 | 209 | orig_tokens = whitespace_tokenize(text) 210 | split_tokens = [] 211 | for token in orig_tokens: 212 | if self.do_lower_case: 213 | token = token.lower() 214 | token = self._run_strip_accents(token) 215 | split_tokens.extend(self._run_split_on_punc(token)) 216 | 217 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 218 | return output_tokens 219 | 220 | def _run_strip_accents(self, text): 221 | """Strips accents from a piece of text.""" 222 | text = unicodedata.normalize("NFD", text) 223 | output = [] 224 | for char in text: 225 | cat = unicodedata.category(char) 226 | if cat == "Mn": 227 | continue 228 | output.append(char) 229 | return "".join(output) 230 | 231 | def _run_split_on_punc(self, text): 232 | """Splits punctuation on a piece of text.""" 233 | chars = list(text) 234 | i = 0 235 | start_new_word = True 236 | output = [] 237 | while i < len(chars): 238 | char = chars[i] 239 | if _is_punctuation(char): 240 | output.append([char]) 241 | start_new_word = True 242 | else: 243 | if start_new_word: 244 | output.append([]) 245 | start_new_word = False 246 | output[-1].append(char) 247 | i += 1 248 | 249 | return ["".join(x) for x in output] 250 | 251 | def _tokenize_chinese_chars(self, text): 252 | """Adds whitespace around any CJK character.""" 253 | output = [] 254 | for char in text: 255 | cp = ord(char) 256 | if self._is_chinese_char(cp): 257 | output.append(" ") 258 | output.append(char) 259 | output.append(" ") 260 | else: 261 | output.append(char) 262 | return "".join(output) 263 | 264 | def _is_chinese_char(self, cp): 265 | """Checks whether CP is the codepoint of a CJK character.""" 266 | # This defines a "chinese character" as anything in the CJK Unicode block: 267 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 268 | # 269 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 270 | # despite its name. The modern Korean Hangul alphabet is a different block, 271 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 272 | # space-separated words, so they are not treated specially and handled 273 | # like the all of the other languages. 274 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 275 | (cp >= 0x3400 and cp <= 0x4DBF) or # 276 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 277 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 278 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 279 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 280 | (cp >= 0xF900 and cp <= 0xFAFF) or # 281 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 282 | return True 283 | 284 | return False 285 | 286 | def _clean_text(self, text): 287 | """Performs invalid character removal and whitespace cleanup on text.""" 288 | output = [] 289 | for char in text: 290 | cp = ord(char) 291 | if cp == 0 or cp == 0xfffd or _is_control(char): 292 | continue 293 | if _is_whitespace(char): 294 | output.append(" ") 295 | else: 296 | output.append(char) 297 | return "".join(output) 298 | 299 | 300 | class WordpieceTokenizer(object): 301 | """Runs WordPiece tokenziation.""" 302 | 303 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): 304 | self.vocab = vocab 305 | self.unk_token = unk_token 306 | self.max_input_chars_per_word = max_input_chars_per_word 307 | 308 | def tokenize(self, text): 309 | """Tokenizes a piece of text into its word pieces. 310 | 311 | This uses a greedy longest-match-first algorithm to perform tokenization 312 | using the given vocabulary. 
313 | 314 | For example: 315 | input = "unaffable" 316 | output = ["un", "##aff", "##able"] 317 | 318 | Args: 319 | text: A single token or whitespace separated tokens. This should have 320 | already been passed through `BasicTokenizer`. 321 | 322 | Returns: 323 | A list of wordpiece tokens. 324 | """ 325 | 326 | text = convert_to_unicode(text) 327 | 328 | output_tokens = [] 329 | for token in whitespace_tokenize(text): 330 | chars = list(token) 331 | if len(chars) > self.max_input_chars_per_word: 332 | output_tokens.append(self.unk_token) 333 | continue 334 | 335 | is_bad = False 336 | start = 0 337 | sub_tokens = [] 338 | while start < len(chars): 339 | end = len(chars) 340 | cur_substr = None 341 | while start < end: 342 | substr = "".join(chars[start:end]) 343 | if start > 0: 344 | substr = "##" + substr 345 | if substr in self.vocab: 346 | cur_substr = substr 347 | break 348 | end -= 1 349 | if cur_substr is None: 350 | is_bad = True 351 | break 352 | sub_tokens.append(cur_substr) 353 | start = end 354 | 355 | if is_bad: 356 | output_tokens.append(self.unk_token) 357 | else: 358 | output_tokens.extend(sub_tokens) 359 | return output_tokens 360 | 361 | 362 | def _is_whitespace(char): 363 | """Checks whether `chars` is a whitespace character.""" 364 | # \t, \n, and \r are technically control characters but we treat them 365 | # as whitespace since they are generally considered as such. 366 | if char == " " or char == "\t" or char == "\n" or char == "\r": 367 | return True 368 | cat = unicodedata.category(char) 369 | if cat == "Zs": 370 | return True 371 | return False 372 | 373 | 374 | def _is_control(char): 375 | """Checks whether `chars` is a control character.""" 376 | # These are technically control characters but we count them as whitespace 377 | # characters. 378 | if char == "\t" or char == "\n" or char == "\r": 379 | return False 380 | cat = unicodedata.category(char) 381 | if cat in ("Cc", "Cf"): 382 | return True 383 | return False 384 | 385 | 386 | def _is_punctuation(char): 387 | """Checks whether `chars` is a punctuation character.""" 388 | cp = ord(char) 389 | # We treat all non-letter/number ASCII as punctuation. 390 | # Characters such as "^", "$", and "`" are not in the Unicode 391 | # Punctuation class but we treat them as punctuation anyways, for 392 | # consistency. 393 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 394 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 395 | return True 396 | cat = unicodedata.category(char) 397 | if cat.startswith("P"): 398 | return True 399 | return False 400 | -------------------------------------------------------------------------------- /BERT-CNN/uncased_L-12_H-768_A-12/file-should be here.txt: -------------------------------------------------------------------------------- 1 | Download this from the BERT website 2 | 3 | -------------------------------------------------------------------------------- /BERT/README.md: -------------------------------------------------------------------------------- 1 | ## Semantic Relatedness with BERT 2 | Fine-tune BERT on the created dataset.
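The fine-tuning script `train_model_VC.py` reads tab-separated files from `data/`. Based on the provided `BERT/data/test.tsv` and the top-level `data/train.tsv` (the zipped `train.tsv` here is assumed to follow the same layout), `train.tsv`/`dev.tsv` carry six columns (`id`, `id1`, `id2`, `visual`, `caption`, `is_related`, where `is_related` is the 0/1 relatedness label), while `test.tsv` carries three (`id`, `visual`, `caption`):

```
# train.tsv / dev.tsv (tab-separated)
id      id1     id2     visual                     caption                                                          is_related
59422   59423   59424   laptop carton comicbook    a laptop that has stickers on its cover is sitting on a table   1

# test.tsv (tab-separated)
id      visual    caption
2       toilet    a white toilet with its seat up in a bathroom
```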
3 | 4 | ### Requirements 5 | - Tensorflow 1.15.0 6 | - Python 2.7 7 | 8 | ``` 9 | conda create -n BERT_visual python=2.7 anaconda 10 | conda activate BERT_visual 11 | pip install tensorflow==1.15.0 12 | ``` 13 | 14 | ``` 15 | python train_model_VC.py # training, validation, and inference 16 | ``` 17 | Main page example: 18 | ``` 19 | ## relatedness score 20 | 21 | image: COCO_val2014_000000156242.jpg - Karpathy test split 22 | ``` 23 | ``` 24 | BERT Base 25 | 26 | ('visual :', 'apple') # Visual (ours) 27 | ('caption :', 'a display of apple and orange at market') 28 | ('Prediction :', 0.9933211) 29 | ****** 30 | ('visual :', 'apple') # Greedy 31 | ('caption :', 'a fruit market with apples and orange') 32 | ('Prediction :', 0.98885113) 33 | ****** 34 | ('visual :', 'apple') # Beam Search 35 | ('caption :', 'a fruit stand with apples and oranges') 36 | ('Prediction :', 0.9911321) 37 | 38 | BERT Large 39 | 40 | ('visual :', 'apple') 41 | ('caption :', 'a display of apple and orange at market') 42 | ('Prediction :', 0.99782264) 43 | ****** 44 | ('visual :', 'apple') 45 | ('caption :', 'a fruit market with apples and orange') 46 | ('Prediction :', 0.99774504) 47 | ****** 48 | ('visual :', 'apple') 49 | ('caption :', 'a fruit stand with apples and oranges') 50 | ('Prediction :', 0.9977704) 51 | ``` 52 | -------------------------------------------------------------------------------- /BERT/data/test.tsv: -------------------------------------------------------------------------------- 1 | id visual caption 2 | 0 shopping a close up of a dog laying in a basket 3 | 1 traffic a black and white photo of a street light 4 | 2 toilet a white toilet with its seat up in a bathroom 5 | 3 bed a living room filled with furniture and a coffee table 6 | 4 hotdog a basket filled with sandwiches on top of a table 7 | 5 tusker a group of people riding on the back of an elephant 8 | 6 suit a man wearing glasses and a tie in a room 9 | -------------------------------------------------------------------------------- /BERT/data/train.tsv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT/data/train.tsv.zip -------------------------------------------------------------------------------- /BERT/outputs/need-this.txt: -------------------------------------------------------------------------------- 1 | Put the provided weights in this folder if you want to continue the training 2 | -------------------------------------------------------------------------------- /BERT/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python2 and Python3, but in one case 51 | # it's a Unicode string and in the other it's a byte string. 52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_by_vocab(vocab, items): 86 | """Converts a sequence of [tokens|ids] using the vocab.""" 87 | output = [] 88 | for item in items: 89 | output.append(vocab[item]) 90 | return output 91 | 92 | 93 | def convert_tokens_to_ids(vocab, tokens): 94 | return convert_by_vocab(vocab, tokens) 95 | 96 | 97 | def convert_ids_to_tokens(inv_vocab, ids): 98 | return convert_by_vocab(inv_vocab, ids) 99 | 100 | 101 | def whitespace_tokenize(text): 102 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 103 | text = text.strip() 104 | if not text: 105 | return [] 106 | tokens = text.split() 107 | return tokens 108 | 109 | 110 | class FullTokenizer(object): 111 | """Runs end-to-end tokenziation.""" 112 | 113 | def __init__(self, vocab_file, do_lower_case=True): 114 | self.vocab = load_vocab(vocab_file) 115 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 118 | 119 | def tokenize(self, text): 120 | split_tokens = [] 121 | for token in self.basic_tokenizer.tokenize(text): 122 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 123 | split_tokens.append(sub_token) 124 | 125 | return split_tokens 126 | 127 | def convert_tokens_to_ids(self, tokens): 128 | return convert_by_vocab(self.vocab, tokens) 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | return 
convert_by_vocab(self.inv_vocab, ids) 132 | 133 | 134 | class BasicTokenizer(object): 135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 136 | 137 | def __init__(self, do_lower_case=True): 138 | """Constructs a BasicTokenizer. 139 | 140 | Args: 141 | do_lower_case: Whether to lower case the input. 142 | """ 143 | self.do_lower_case = do_lower_case 144 | 145 | def tokenize(self, text): 146 | """Tokenizes a piece of text.""" 147 | text = convert_to_unicode(text) 148 | text = self._clean_text(text) 149 | 150 | # This was added on November 1st, 2018 for the multilingual and Chinese 151 | # models. This is also applied to the English models now, but it doesn't 152 | # matter since the English models were not trained on any Chinese data 153 | # and generally don't have any Chinese data in them (there are Chinese 154 | # characters in the vocabulary because Wikipedia does have some Chinese 155 | # words in the English Wikipedia.). 156 | text = self._tokenize_chinese_chars(text) 157 | 158 | orig_tokens = whitespace_tokenize(text) 159 | split_tokens = [] 160 | for token in orig_tokens: 161 | if self.do_lower_case: 162 | token = token.lower() 163 | token = self._run_strip_accents(token) 164 | split_tokens.extend(self._run_split_on_punc(token)) 165 | 166 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 167 | return output_tokens 168 | 169 | def _run_strip_accents(self, text): 170 | """Strips accents from a piece of text.""" 171 | text = unicodedata.normalize("NFD", text) 172 | output = [] 173 | for char in text: 174 | cat = unicodedata.category(char) 175 | if cat == "Mn": 176 | continue 177 | output.append(char) 178 | return "".join(output) 179 | 180 | def _run_split_on_punc(self, text): 181 | """Splits punctuation on a piece of text.""" 182 | chars = list(text) 183 | i = 0 184 | start_new_word = True 185 | output = [] 186 | while i < len(chars): 187 | char = chars[i] 188 | if _is_punctuation(char): 189 | output.append([char]) 190 | start_new_word = True 191 | else: 192 | if start_new_word: 193 | output.append([]) 194 | start_new_word = False 195 | output[-1].append(char) 196 | i += 1 197 | 198 | return ["".join(x) for x in output] 199 | 200 | def _tokenize_chinese_chars(self, text): 201 | """Adds whitespace around any CJK character.""" 202 | output = [] 203 | for char in text: 204 | cp = ord(char) 205 | if self._is_chinese_char(cp): 206 | output.append(" ") 207 | output.append(char) 208 | output.append(" ") 209 | else: 210 | output.append(char) 211 | return "".join(output) 212 | 213 | def _is_chinese_char(self, cp): 214 | """Checks whether CP is the codepoint of a CJK character.""" 215 | # This defines a "chinese character" as anything in the CJK Unicode block: 216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 217 | # 218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 219 | # despite its name. The modern Korean Hangul alphabet is a different block, 220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 221 | # space-separated words, so they are not treated specially and handled 222 | # like the all of the other languages. 
223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 224 | (cp >= 0x3400 and cp <= 0x4DBF) or # 225 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 226 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 227 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 229 | (cp >= 0xF900 and cp <= 0xFAFF) or # 230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 231 | return True 232 | 233 | return False 234 | 235 | def _clean_text(self, text): 236 | """Performs invalid character removal and whitespace cleanup on text.""" 237 | output = [] 238 | for char in text: 239 | cp = ord(char) 240 | if cp == 0 or cp == 0xfffd or _is_control(char): 241 | continue 242 | if _is_whitespace(char): 243 | output.append(" ") 244 | else: 245 | output.append(char) 246 | return "".join(output) 247 | 248 | 249 | class WordpieceTokenizer(object): 250 | """Runs WordPiece tokenziation.""" 251 | 252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 253 | self.vocab = vocab 254 | self.unk_token = unk_token 255 | self.max_input_chars_per_word = max_input_chars_per_word 256 | 257 | def tokenize(self, text): 258 | """Tokenizes a piece of text into its word pieces. 259 | 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary. 262 | 263 | For example: 264 | input = "unaffable" 265 | output = ["un", "##aff", "##able"] 266 | 267 | Args: 268 | text: A single token or whitespace separated tokens. This should have 269 | already been passed through `BasicTokenizer. 270 | 271 | Returns: 272 | A list of wordpiece tokens. 273 | """ 274 | 275 | text = convert_to_unicode(text) 276 | 277 | output_tokens = [] 278 | for token in whitespace_tokenize(text): 279 | chars = list(token) 280 | if len(chars) > self.max_input_chars_per_word: 281 | output_tokens.append(self.unk_token) 282 | continue 283 | 284 | is_bad = False 285 | start = 0 286 | sub_tokens = [] 287 | while start < len(chars): 288 | end = len(chars) 289 | cur_substr = None 290 | while start < end: 291 | substr = "".join(chars[start:end]) 292 | if start > 0: 293 | substr = "##" + substr 294 | if substr in self.vocab: 295 | cur_substr = substr 296 | break 297 | end -= 1 298 | if cur_substr is None: 299 | is_bad = True 300 | break 301 | sub_tokens.append(cur_substr) 302 | start = end 303 | 304 | if is_bad: 305 | output_tokens.append(self.unk_token) 306 | else: 307 | output_tokens.extend(sub_tokens) 308 | return output_tokens 309 | 310 | 311 | def _is_whitespace(char): 312 | """Checks whether `chars` is a whitespace character.""" 313 | # \t, \n, and \r are technically contorl characters but we treat them 314 | # as whitespace since they are generally considered as such. 315 | if char == " " or char == "\t" or char == "\n" or char == "\r": 316 | return True 317 | cat = unicodedata.category(char) 318 | if cat == "Zs": 319 | return True 320 | return False 321 | 322 | 323 | def _is_control(char): 324 | """Checks whether `chars` is a control character.""" 325 | # These are technically control characters but we count them as whitespace 326 | # characters. 327 | if char == "\t" or char == "\n" or char == "\r": 328 | return False 329 | cat = unicodedata.category(char) 330 | if cat.startswith("C"): 331 | return True 332 | return False 333 | 334 | 335 | def _is_punctuation(char): 336 | """Checks whether `chars` is a punctuation character.""" 337 | cp = ord(char) 338 | # We treat all non-letter/number ASCII as punctuation. 
339 | # Characters such as "^", "$", and "`" are not in the Unicode 340 | # Punctuation class but we treat them as punctuation anyways, for 341 | # consistency. 342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 344 | return True 345 | cat = unicodedata.category(char) 346 | if cat.startswith("P"): 347 | return True 348 | return False 349 | -------------------------------------------------------------------------------- /BERT/train_model_VC.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | #tensorflow 3 | # 1.15.0 4 | import os 5 | import sys 6 | import json 7 | import datetime 8 | import pprint 9 | import os 10 | import tensorflow as tf 11 | #pip install tensorflow==1.15 12 | config = tf.ConfigProto() 13 | 14 | 15 | #Fine-tuning with Cloud TPUs 16 | #https://github.com/google-research/bert 17 | # for the use TPU with colab for fast training and infernce 18 | # If you want to use TPU, first switch to tpu runtime in colab 19 | USE_TPU = False 20 | 21 | 22 | #https://github.com/google-research/bert#pre-trained-models 23 | # We will use base uncased bert model 24 | 25 | ## 12-layer, 768-hidden, 12-heads, 110M parameters 26 | BERT_MODEL = 'uncased_L-12_H-768_A-12' 27 | ## 12-layer, 768-hidden, 12-heads, 110M parameters 28 | #BERT_MODEL = 'uncased_L-24_H-1024_A-16' 29 | 30 | 31 | ## BERT checkpoint bucket 32 | ## 12-layer, 768-hidden, 12-heads, 110M parameters 33 | BERT_PRETRAINED_DIR = 'uncased_L-12_H-768_A-12' 34 | 35 | 36 | print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR)) 37 | ## uncased_L-12_H-768_A-12 (directory) 38 | # bert_model.ckpt.data-00000-of-00001 39 | # bert_model.ckpt.meta 40 | 41 | # output file 42 | #OUTPUT_DIR = '/home/asabir/Desktop/model_repo/outputs' 43 | OUTPUT_DIR ='outputs' 44 | #print(f'***** Model output directory: {OUTPUT_DIR} *****') 45 | print('***** Model output directory: {OUTPUT_DIR} *****') 46 | #print(f'***** BERT pretrained directory: {BERT_PRETRAINED_DIR} *****') 47 | print('***** BERT pretrained directory: {BERT_PRETRAINED_DIR} *****') 48 | 49 | 50 | print('***** Model output directory: {} *****'.format(OUTPUT_DIR)) 51 | 52 | 53 | #TASK_DATA_DIR = 'data/visual-caption' 54 | if not 'bert' in sys.path: 55 | sys.path += ['bert'] 56 | 57 | TASK_DATA_DIR = '/data/' 58 | # ## Model Configs and Hyper Parameters 59 | 60 | import modeling 61 | import optimization 62 | import tokenization 63 | import run_classifier 64 | 65 | # Model Hyper Parameters 66 | #TRAIN_BATCH_SIZE = 32 # For GPU, reduce to 16 67 | TRAIN_BATCH_SIZE = 16 # 68 | EVAL_BATCH_SIZE = 8 69 | PREDICT_BATCH_SIZE = 8 70 | LEARNING_RATE = 2e-5 71 | #NUM_TRAIN_EPOCHS = 2.0 72 | NUM_TRAIN_EPOCHS = 1.0 73 | WARMUP_PROPORTION = 0.1 74 | MAX_SEQ_LENGTH = 30 75 | 76 | # Model configs 77 | SAVE_CHECKPOINTS_STEPS = 1000 78 | ITERATIONS_PER_LOOP = 1000 79 | NUM_TPU_CORES = 8 80 | VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt') 81 | CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json') 82 | INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt') 83 | DO_LOWER_CASE = BERT_MODEL.startswith('uncased') 84 | 85 | 86 | # ## Read visual caption Pairs 87 | # Read data from TSV file and covert to list of InputExample. 
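# Expected input layout (based on the provided data files): train.tsv / dev.tsv carry six
# tab-separated columns -- id, id1, id2, visual, caption, is_related -- so the processor
# below uses line[3] as text_a (the visual context), line[4] as text_b (the caption) and
# int(line[5]) as the 0/1 relatedness label. test.tsv carries only id, visual, caption,
# so text_a = line[1], text_b = line[2], and a dummy label of 0 is used at prediction time.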
88 | #to [run_classifier](https://github.com/google-research/bert/blob/master/run_classifier.py) file 89 | 90 | 91 | class VCProcessor(run_classifier.DataProcessor): 92 | """Processor for the visual caption pair data set.""" 93 | 94 | def get_train_examples(self, data_dir): 95 | """Reading train.tsv and converting to list of InputExample""" 96 | return self._create_examples( 97 | self._read_tsv(os.path.join(data_dir,"train.tsv")), 'train') 98 | 99 | def get_dev_examples(self, data_dir): 100 | """Reading dev.tsv and converting to list of InputExample""" 101 | return self._create_examples( 102 | self._read_tsv(os.path.join(data_dir,"dev.tsv")), 'dev') 103 | 104 | def get_test_examples(self, data_dir): 105 | """Reading train.tsv and converting to list of InputExample""" 106 | return self._create_examples( 107 | self._read_tsv(os.path.join(data_dir,"test.tsv")), 'test') 108 | 109 | def get_predict_examples(self, sentence_pairs): 110 | """Given visual caption pairs, conevrting to list of InputExample""" 111 | examples = [] 112 | for (i, vcpair) in enumerate(sentence_pairs): 113 | guid = "predict-%d" % (i) 114 | # converting input text to utf-8 and creating InputExamples 115 | text_a = tokenization.convert_to_unicode(vcpair[0]) 116 | text_b = tokenization.convert_to_unicode(vcpair[1]) 117 | # We will add label as 0, because None is not supported in converting to features 118 | examples.append( 119 | run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=0)) 120 | return examples 121 | 122 | def _create_examples(self, lines, set_type): 123 | """Creates examples for the training, dev and test sets.""" 124 | examples = [] 125 | for (i, line) in enumerate(lines): 126 | guid = "%s-%d" % (set_type, i) 127 | if set_type=='test': 128 | # removing header and invalid data 129 | if i == 0 or len(line)!=3: 130 | print(guid, line) 131 | continue 132 | text_a = tokenization.convert_to_unicode(line[1]) 133 | text_b = tokenization.convert_to_unicode(line[2]) 134 | label = 0 # We will use zero for test as convert_example_to_features doesn't support None 135 | else: 136 | # removing header and invalid data 137 | if i == 0 or len(line)!=6: 138 | continue 139 | text_a = tokenization.convert_to_unicode(line[3]) 140 | text_b = tokenization.convert_to_unicode(line[4]) 141 | label = int(line[5]) 142 | examples.append( 143 | run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 144 | return examples 145 | 146 | def get_labels(self): 147 | "return class labels" 148 | return [0,1] 149 | 150 | 151 | # initialiation an instance of visual-caption VCProcessor and tokenizer 152 | processor = VCProcessor() 153 | label_list = processor.get_labels() 154 | tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE) 155 | 156 | 157 | # Converting training examples to features 158 | print("---------------- Processing Training Data ------------------") 159 | TRAIN_TF_RECORD = os.path.join(OUTPUT_DIR, "train.tf_record") 160 | train_examples = processor.get_train_examples(TASK_DATA_DIR) 161 | num_train_examples = len(train_examples) 162 | num_train_steps = int( num_train_examples / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS) 163 | num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION) 164 | run_classifier.file_based_convert_examples_to_features(train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, TRAIN_TF_RECORD) 165 | 166 | 167 | # ## Creating Classification Model 168 | 169 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 
170 | labels, num_labels, use_one_hot_embeddings): 171 | """Creates a classification model.""" 172 | # Bert Model instant 173 | model = modeling.BertModel( 174 | config=bert_config, 175 | is_training=is_training, 176 | input_ids=input_ids, 177 | input_mask=input_mask, 178 | token_type_ids=segment_ids, 179 | use_one_hot_embeddings=use_one_hot_embeddings) 180 | 181 | # Getting output for last layer of BERT 182 | output_layer = model.get_pooled_output() 183 | 184 | # Number of outputs for last layer 185 | hidden_size = output_layer.shape[-1].value 186 | 187 | # We will use one layer on top of BERT pretrained for creating classification model 188 | output_weights = tf.get_variable( 189 | "output_weights", [num_labels, hidden_size], 190 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 191 | 192 | output_bias = tf.get_variable( 193 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 194 | 195 | with tf.variable_scope("loss"): 196 | if is_training: 197 | # 0.1 dropout 198 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 199 | 200 | # Calcaulte prediction probabilites and loss 201 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 202 | logits = tf.nn.bias_add(logits, output_bias) 203 | probabilities = tf.nn.softmax(logits, axis=-1) 204 | log_probs = tf.nn.log_softmax(logits, axis=-1) 205 | 206 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 207 | 208 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 209 | loss = tf.reduce_mean(per_example_loss) 210 | 211 | return (loss, per_example_loss, logits, probabilities) 212 | 213 | 214 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 215 | num_train_steps, num_warmup_steps, use_tpu, 216 | use_one_hot_embeddings): 217 | """Returns `model_fn` closure for TPUEstimator.""" 218 | 219 | def model_fn(features, labels, mode, params): 220 | """The `model_fn` for TPUEstimator.""" 221 | 222 | # reading features input 223 | input_ids = features["input_ids"] 224 | input_mask = features["input_mask"] 225 | segment_ids = features["segment_ids"] 226 | label_ids = features["label_ids"] 227 | is_real_example = None 228 | if "is_real_example" in features: 229 | is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) 230 | else: 231 | is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) 232 | 233 | # checking if training mode 234 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 235 | 236 | # create simple classification model 237 | (total_loss, per_example_loss, logits, probabilities) = create_model( 238 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 239 | num_labels, use_one_hot_embeddings) 240 | 241 | # getting variables for intialization and using pretrained init checkpoint 242 | tvars = tf.trainable_variables() 243 | initialized_variable_names = {} 244 | scaffold_fn = None 245 | if init_checkpoint: 246 | (assignment_map, initialized_variable_names 247 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 248 | if use_tpu: 249 | 250 | def tpu_scaffold(): 251 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 252 | return tf.train.Scaffold() 253 | 254 | scaffold_fn = tpu_scaffold 255 | else: 256 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 257 | 258 | output_spec = None 259 | if mode == tf.estimator.ModeKeys.TRAIN: 260 | # defining optimizar function 261 | train_op = optimization.create_optimizer( 262 | total_loss, 
learning_rate, num_train_steps, num_warmup_steps, use_tpu) 263 | 264 | # Training estimator spec 265 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 266 | mode=mode, 267 | loss=total_loss, 268 | train_op=train_op, 269 | scaffold_fn=scaffold_fn) 270 | elif mode == tf.estimator.ModeKeys.EVAL: 271 | # accuracy, loss, auc, F1, precision and recall metrics for evaluation 272 | def metric_fn(per_example_loss, label_ids, logits, is_real_example): 273 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 274 | loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) 275 | accuracy = tf.metrics.accuracy( 276 | labels=label_ids, predictions=predictions, weights=is_real_example) 277 | f1_score = tf.contrib.metrics.f1_score( 278 | label_ids, 279 | predictions) 280 | auc = tf.metrics.auc( 281 | label_ids, 282 | predictions) 283 | recall = tf.metrics.recall( 284 | label_ids, 285 | predictions) 286 | precision = tf.metrics.precision( 287 | label_ids, 288 | predictions) 289 | return { 290 | "eval_accuracy": accuracy, 291 | "eval_loss": loss, 292 | "f1_score": f1_score, 293 | "auc": auc, 294 | "precision": precision, 295 | "recall": recall 296 | } 297 | 298 | eval_metrics = (metric_fn, 299 | [per_example_loss, label_ids, logits, is_real_example]) 300 | # estimator spec for evalaution 301 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 302 | mode=mode, 303 | loss=total_loss, 304 | eval_metrics=eval_metrics, 305 | scaffold_fn=scaffold_fn) 306 | else: 307 | # estimator spec for predictions 308 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 309 | mode=mode, 310 | predictions={"probabilities": probabilities}, 311 | scaffold_fn=scaffold_fn) 312 | return output_spec 313 | 314 | return model_fn 315 | 316 | 317 | # Define TPU configs 318 | if USE_TPU: 319 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS) 320 | else: 321 | tpu_cluster_resolver = None 322 | run_config = tf.contrib.tpu.RunConfig( 323 | cluster=tpu_cluster_resolver, 324 | model_dir=OUTPUT_DIR, 325 | save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS, 326 | tpu_config=tf.contrib.tpu.TPUConfig( 327 | iterations_per_loop=ITERATIONS_PER_LOOP, 328 | num_shards=NUM_TPU_CORES, 329 | per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)) 330 | 331 | 332 | # create model function for estimator using model function builder 333 | model_fn = model_fn_builder( 334 | bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE), 335 | num_labels=len(label_list), 336 | init_checkpoint=INIT_CHECKPOINT, 337 | learning_rate=LEARNING_RATE, 338 | num_train_steps=num_train_steps, 339 | num_warmup_steps=num_warmup_steps, 340 | use_tpu=USE_TPU, 341 | use_one_hot_embeddings=True) 342 | 343 | 344 | 345 | # Defining TPU Estimator 346 | estimator = tf.contrib.tpu.TPUEstimator( 347 | use_tpu=USE_TPU, 348 | model_fn=model_fn, 349 | config=run_config, 350 | train_batch_size=TRAIN_BATCH_SIZE, 351 | eval_batch_size=EVAL_BATCH_SIZE, 352 | predict_batch_size=PREDICT_BATCH_SIZE) 353 | 354 | 355 | 356 | # Train the model. 357 | #print('VCS on BERT base model normally takes about 1 hour on TPU and 15-20 hours on GPU. 
Please wait...') 358 | print('***** Started training at {} *****'.format(datetime.datetime.now())) 359 | print(' Num examples = {}'.format(num_train_examples)) 360 | print(' Batch size = {}'.format(TRAIN_BATCH_SIZE)) 361 | tf.logging.info(" Num steps = %d", num_train_steps) 362 | # we are using `file_based_input_fn_builder` for creating input function from TF_RECORD file 363 | train_input_fn = run_classifier.file_based_input_fn_builder(TRAIN_TF_RECORD, 364 | seq_length=MAX_SEQ_LENGTH, 365 | is_training=True, 366 | drop_remainder=True) 367 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 368 | print('***** Finished training at {} *****'.format(datetime.datetime.now())) 369 | 370 | 371 | ## Evalute FineTuned model 372 | 373 | 374 | # eval the model on train set. 375 | print('***** Started Train Set evaluation at {} *****'.format(datetime.datetime.now())) 376 | print(' Num examples = {}'.format(num_train_examples)) 377 | print(' Batch size = {}'.format(EVAL_BATCH_SIZE)) 378 | # eval input function for train set 379 | train_eval_input_fn = run_classifier.file_based_input_fn_builder(TRAIN_TF_RECORD, 380 | seq_length=MAX_SEQ_LENGTH, 381 | is_training=False, 382 | drop_remainder=True) 383 | # evalute on train set 384 | result = estimator.evaluate(input_fn=train_eval_input_fn, 385 | steps=int(num_train_examples/EVAL_BATCH_SIZE)) 386 | print('***** Finished evaluation at {} *****'.format(datetime.datetime.now())) 387 | print("***** Eval results *****") 388 | for key in sorted(result.keys()): 389 | print(' {} = {}'.format(key, str(result[key]))) 390 | 391 | 392 | 393 | # Converting eval examples to features 394 | print("--------------- Processing Dev Data ------------------") 395 | EVAL_TF_RECORD = os.path.join(OUTPUT_DIR, "eval.tf_record") 396 | eval_examples = processor.get_dev_examples(TASK_DATA_DIR) 397 | num_eval_examples = len(eval_examples) 398 | run_classifier.file_based_convert_examples_to_features(eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, EVAL_TF_RECORD) 399 | 400 | 401 | # Eval the model on Dev set. 
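# Note: the eval input functions below are built with drop_remainder=True and the estimator
# runs for int(num_examples / EVAL_BATCH_SIZE) steps, so any trailing examples that do not
# fill a complete batch of EVAL_BATCH_SIZE are skipped during evaluation.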
402 | print('***** Started Dev Set evaluation at {} *****'.format(datetime.datetime.now())) 403 | print(' Num examples = {}'.format(num_eval_examples)) 404 | print(' Batch size = {}'.format(EVAL_BATCH_SIZE)) 405 | 406 | # eval input function for dev set 407 | eval_input_fn = run_classifier.file_based_input_fn_builder(EVAL_TF_RECORD, 408 | seq_length=MAX_SEQ_LENGTH, 409 | is_training=False, 410 | drop_remainder=True) 411 | # evalute on dev set 412 | result = estimator.evaluate(input_fn=eval_input_fn, steps=int(num_eval_examples/EVAL_BATCH_SIZE)) 413 | print('***** Finished evaluation at {} *****'.format(datetime.datetime.now())) 414 | print("***** Eval results *****") 415 | for key in sorted(result.keys()): 416 | print(' {} = {}'.format(key, str(result[key]))) 417 | 418 | 419 | # examples sentences, feel free to change and try 420 | sent_pairs = [("apple", "a display of apple and orange at market"), ("apple","a fruit market with apples and orange"), 421 | ("apple","a fruit stand with apples and oranges")] 422 | 423 | 424 | print("----------- Predictions on Custom Data -------------------") 425 | # create `InputExample` for custom examples 426 | predict_examples = processor.get_predict_examples(sent_pairs) 427 | num_predict_examples = len(predict_examples) 428 | 429 | # For TPU, We will append `PaddingExample` for maintaining batch size 430 | if USE_TPU: 431 | while(len(predict_examples)%EVAL_BATCH_SIZE!=0): 432 | predict_examples.append(run_classifier.PaddingInputExample()) 433 | 434 | # Converting to features 435 | predict_features = run_classifier.convert_examples_to_features(predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer) 436 | 437 | print(' Num examples = {}'.format(num_predict_examples)) 438 | print(' Batch size = {}'.format(PREDICT_BATCH_SIZE)) 439 | 440 | # Input function for prediction 441 | predict_input_fn = run_classifier.input_fn_builder(predict_features, 442 | seq_length=MAX_SEQ_LENGTH, 443 | is_training=False, 444 | drop_remainder=False) 445 | result = list(estimator.predict(input_fn=predict_input_fn)) 446 | print(result) 447 | for ex_i in range(num_predict_examples): 448 | print("****** Example {} ******".format(ex_i)) 449 | print("visual :", sent_pairs[ex_i][0]) 450 | print("caption :", sent_pairs[ex_i][1]) 451 | print("Prediction :", result[ex_i]['probabilities'][1]) 452 | 453 | 454 | 455 | ################################################# Test ################################################### 456 | 457 | # Converting test examples to features 458 | print("--------------------- Processing Test Data -------------------") 459 | TEST_TF_RECORD = os.path.join(OUTPUT_DIR, "test.tf_record") 460 | test_examples = processor.get_test_examples(TASK_DATA_DIR) 461 | num_test_examples = len(test_examples) 462 | run_classifier.file_based_convert_examples_to_features(test_examples, label_list, MAX_SEQ_LENGTH, tokenizer, TEST_TF_RECORD) 463 | 464 | 465 | # Predictions on test set. 
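# Note: estimator.predict() yields one dict per test example whose "probabilities" entry is
# the softmax over the two classes from get_labels() ([0, 1]); probabilities[1] is the
# visual-caption relatedness score that gets written to outputs/test_score.txt below.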
466 | print('***** Started Prediction at {} *****'.format(datetime.datetime.now())) 467 | print(' Num examples = {}'.format(num_test_examples)) 468 | print(' Batch size = {}'.format(PREDICT_BATCH_SIZE)) 469 | # predict input function for test set 470 | test_input_fn = run_classifier.file_based_input_fn_builder(TEST_TF_RECORD, 471 | seq_length=MAX_SEQ_LENGTH, 472 | is_training=False, 473 | drop_remainder=True) 474 | tf.logging.set_verbosity(tf.logging.ERROR) 475 | # predict on test set 476 | result = list(estimator.predict(input_fn=test_input_fn)) 477 | print('***** Finished Prediction at {} *****'.format(datetime.datetime.now())) 478 | 479 | # saving test predictions 480 | output_test_file = os.path.join(OUTPUT_DIR, "test_score.txt") 481 | with tf.gfile.GFile(output_test_file, "w") as writer: 482 | for (example_i, predictions_i) in enumerate(result): 483 | writer.write("%s , %s\n" % (test_examples[example_i].guid, str(predictions_i['probabilities'][1]))) 484 | 485 | -------------------------------------------------------------------------------- /BERT/uncased_L-12_H-768_A-12/file-should be here.txt: -------------------------------------------------------------------------------- 1 | Download this from bert website 2 | 3 | -------------------------------------------------------------------------------- /COCO_train2014_000000000009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/COCO_train2014_000000000009.jpg -------------------------------------------------------------------------------- /COCO_val2014_000000000042.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/COCO_val2014_000000000042.jpg -------------------------------------------------------------------------------- /Evaluation/captions_val2014.json.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/Evaluation/captions_val2014.json.zip -------------------------------------------------------------------------------- /Evaluation/coco_eval.py: -------------------------------------------------------------------------------- 1 | from pycocotools.coco import COCO 2 | from pycocoevalcap.eval import COCOEvalCap 3 | import sys 4 | import argparse 5 | 6 | 7 | 8 | parser=argparse.ArgumentParser() 9 | parser.add_argument('--f', default='', help='', type=str,required=True) 10 | args = parser.parse_args() 11 | 12 | 13 | annotation_file = 'captions_val2014.json' 14 | results_file = args.f 15 | 16 | # create coco object and coco_result object 17 | coco = COCO(annotation_file) 18 | coco_result = coco.loadRes(results_file) 19 | 20 | # create coco_eval object by taking coco and coco_result 21 | coco_eval = COCOEvalCap(coco, coco_result) 22 | 23 | # evaluate on a subset of images by setting 24 | # coco_eval.params['image_id'] = coco_result.getImgIds() 25 | # please remove this line when evaluating the full validation set 26 | coco_eval.params['image_id'] = coco_result.getImgIds() 27 | 28 | # evaluate results 29 | # SPICE will take a few minutes the first time, but speeds up due to caching 30 | coco_eval.evaluate() 31 | 32 | # print output evaluation scores 33 | for metric, score in 
coco_eval.eval.items(): 34 | print(f'{metric}: {score:.3f}') 35 | -------------------------------------------------------------------------------- /LRCE_figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/LRCE_figure_1.png -------------------------------------------------------------------------------- /Pre-trained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/Pre-trained.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Visual Semantic Relatedness Dataset for Image Captioning 2 | 3 | 12 | 13 | 14 | 15 | Modern image captioning relies heavily on extracting knowledge, from images such as objects, to capture the concept of a static story in the image. 16 | In this paper, we propose a textual visual context dataset for image captioning, where the publicly available dataset COCO Captions [(Lin et al., 2014)](https://arxiv.org/pdf/1405.0312.pdf) has been extended with information about the scene (such as objects in the image). Since this information has textual form, it can be used to leverage any NLP task, such as text similarity or semantic relation methods, into captioning systems, either as an end-to-end training strategy or a post-processing based approach. 17 | 18 | 19 | 20 | This repository contains the implementation of the paper [Visual Semantic Relatedness Dataset for Image Captioning](https://arxiv.org/abs/2301.08784). 21 | 22 | [![arXiv](https://img.shields.io/badge/arXiv-2301.08784-b31b1b.svg)](https://arxiv.org/abs/2301.08784) [![Website shields.io](https://img.shields.io/website-up-down-green-red/http/shields.io.svg)](https://ahmed.jp/project_page/Dataset_2022/index.html) 23 | [![huggingface](https://img.shields.io/badge/%F0%9F%A4%97-huggingface-yellow)](https://huggingface.co/datasets/AhmedSSabir/Textual-Image-Caption-Dataset) 24 | [![O-DRUM - poster](https://img.shields.io/badge/O--DRUM-poster-0065BD)](https://ahmed.jp/project_page/Dataset_2022/poster_20.pdf) 25 | [![O-DRUM - slide](https://img.shields.io/badge/O--DRUM-slide-0065BD)](https://ahmed.jp/project_page/Dataset_2022/spotlight_ppt_ID_20.pdf) 26 | 27 | 28 | ## News 29 | Add v2 with recent SoTA model swinV2 classifier for both soft/hard-label visual_caption_cosine_score_v2 with person label (0.2, 0.3 and 0.4). Please refer to huggingface repository. 30 | 31 | ## Contents 32 | 0. [Overview](#overview) 33 | 1. [Visual semantic with BERT ](#Visual-semantic-with-BERT-CNN) 34 | 2. [Dataset](#dataset) 35 | 3. [Visual semantic with pre-trained model](#Visual-semantic-with-pre-trained-model) 36 | 4. [Evaluation](#evaluation) 37 | 5. [Citation](#Citation) 38 | 39 | 40 | ## Overview 41 | 42 | 43 | We enrich COCO-Captions with **Textual Visual Context** information. We use [ResNet152](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf), [CLIP](https://github.com/openai/CLIP) and [Faster R-CNN](https://github.com/tensorflow/models/tree/master/research/object_detection) to extract 44 | object information for each COCO-caption image. 
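As a rough illustration of this extraction step (a minimal sketch only, not the exact pipeline used to build the dataset; it assumes `torchvision` is installed and that an `imagenet_classes.txt` file with the 1000 ImageNet class names is available), an ImageNet-pretrained ResNet152 can be run over each image and its top-k labels kept, together with their confidences, as the textual visual context:

```python
import torch
from PIL import Image
from torchvision import models, transforms

# Standard ImageNet preprocessing for ResNet152.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

model = models.resnet152(pretrained=True).eval()

# Assumption: imagenet_classes.txt holds the 1000 ImageNet class names, one per line.
with open("imagenet_classes.txt") as f:
    classes = [line.strip() for line in f]

image = Image.open("COCO_val2014_000000000042.jpg").convert("RGB")
batch = preprocess(image).unsqueeze(0)

with torch.no_grad():
    probs = torch.nn.functional.softmax(model(batch)[0], dim=0)

# Keep the top-3 labels and their confidences as the textual visual context.
top_prob, top_idx = probs.topk(3)
for p, i in zip(top_prob, top_idx):
    print(classes[int(i)], float(p))
```

The same loop can be pointed at a CLIP or Faster R-CNN backbone to obtain the other visual-context sources mentioned above.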
We use three filter approaches to ensure the quality of the dataset: (1) Threshold: we filter out predictions where the object classifier is not confident enough; (2) semantic alignment: we use semantic similarity to remove duplicated objects; and (3) semantic relatedness score as Soft-Label: to guarantee that the visual context and the caption are strongly related, we use [Sentence RoBERTa-sts](https://www.sbert.net) to assign a soft label via cosine similarity, and then apply a **th**reshold to annotate the final label (if th ≥ 0.2, 0.3, 0.4 then [1,0]). Finally, to take advantage of the overlap between the visual context and the caption, and to extract global information from each visual context, we use BERT followed by a shallow CNN [(Kim, 2014)](https://arxiv.org/pdf/1408.5882.pdf) to estimate the visual relatedness score. 45 | 46 | 47 | 48 | ## Quick Start 49 | For a quick start, please have a look at this [project page](https://sabirdvd.github.io/project_page/Dataset_2022/index.html) 50 | and the [Demo](https://github.com/ahmedssabir/Textual-Visual-Semantic-Dataset/blob/main/BERT_CNN_Visual_re_ranker_demo.ipynb) 51 | 52 | 55 | ## Dataset 56 | 57 | ### Sample 58 | 59 | | VC1 | VC2 | VC3 | human annotated caption | 60 | | ------------- | ------------- |------------- | ------------- | 61 | | cheeseburger | plate | hotdog | a plate with a hamburger fries and tomatoes | 62 | | bakery | dining table | website | a table having tea and a cake on it | 63 | | gown | groom | apron | its time to cut the cake at this couples wedding | 64 | 65 | 66 | ### Download 67 | 68 | 0. [Download Raw data with ID and Visual context](https://www.dropbox.com/s/xuov24on8477zg8/All_Caption_ID.csv?dl=0) -> the original dataset with the related caption IDs from [train2014](https://cocodataset.org/#download) 69 | 1. [Download Data with cosine score](https://www.dropbox.com/s/55sit8ow9tems4u/visual_caption_cosine_score.zip?dl=0) -> soft cosine label with **th** 0.2, 0.3, 0.4 and 0.5, plus the hard label 70 | 2. [Download Overlapping visual with caption](https://www.dropbox.com/s/br8nhnlf4k2czo8/COCO_overlaping_dataset.txt?dl=0) -> overlap between the visual context and the human annotated caption 71 | 3. [Download Dataset (tsv file)](https://www.dropbox.com/s/dh38xibtjpohbeg/train_all.zip?dl=0) 0.0 -> raw data with hard label, without cosine similarity, and with **th**reshold cosine sim (degree of the relation between the visual and caption) = 0.2, 0.3, 0.4 72 | 4. [Download Dataset GenderBias](https://www.dropbox.com/s/1wki0b0d21078mj/gender%20natural.zip?dl=0) -> man/woman labels replaced with the person class label 73 | 74 | 75 | ## Visual semantic with BERT-CNN 76 | Fine-tune [BERT](https://github.com/google-research/bert) on the created dataset.
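As a schematic sketch only (the actual model is built in `BERT-CNN/BERT_CNN.py`; the dense layer size and other details here are assumptions), the idea is to feed the BERT token embeddings of a visual-context/caption pair into a shallow Kim (2014)-style 1D CNN that outputs a relatedness score. The filter count, kernel size and sequence length match the default training arguments listed further below:

```python
import tensorflow as tf
from tensorflow import keras

SEQ_LEN, BERT_DIM = 64, 768  # --seq_len default and BERT-base hidden size

# Per-token BERT embeddings for one "visual context / caption" pair.
token_embeddings = keras.layers.Input(shape=(SEQ_LEN, BERT_DIM), name="bert_sequence_output")

# Shallow Kim (2014)-style CNN head: Conv1D -> global max pooling -> dense -> sigmoid score.
x = keras.layers.Conv1D(filters=32, kernel_size=3, activation="relu")(token_embeddings)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dense(64, activation="relu")(x)  # hidden size is an assumption
score = keras.layers.Dense(1, activation="sigmoid", name="relatedness_score")(x)

model = keras.Model(inputs=token_embeddings, outputs=score)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()
```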
77 | 78 | ### Requirements 79 | - Tensorflow 1.15.0 80 | - Python 3.6 81 | 82 | ``` 83 | conda create -n BERT_visual python=3.6 anaconda 84 | conda activate BERT_visual 85 | pip install tensorflow==1.15.0 86 | pip install --upgrade tensorflow_hub==0.7.0 87 | ``` 88 | 89 | Download BERT check point [uncased_L-12_H-768_A-12](https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1) 90 | ``` 91 | wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 92 | unzip uncased_L-12_H-768_A-12.zip 93 | git clone https://github.com/gaphex/bert_experimental/ 94 | ``` 95 | like this ```BERT-CNN/uncased_L-12_H-768_A-12 ``` and ```BERT-CNN/bert_experimental ``` 96 | 97 | Download dataset 98 | 99 | ``` 100 | wget https://www.dropbox.com/s/dh38xibtjpohbeg/train_all.zip 101 | unzip train_all.zip 102 | ``` 103 | 104 | for Training 105 | 106 | ``` 107 | parser.add_argument('--train', default='train.tsv', help='beam serach', type=str,required=False) 108 | parser.add_argument('--num_bert_layer', default='12', help='truned layers', type=int,required=False) 109 | parser.add_argument('--batch_size', default='128', help='truned layers', type=int,required=False) 110 | parser.add_argument('--epochs', default='5', help='', type=int,required=False) 111 | parser.add_argument('--seq_len', default='64', help='', type=int,required=False) 112 | parser.add_argument('--CNN_kernel_size', default='3', help='', type=int,required=False) 113 | parser.add_argument('--CNN_filters', default='32', help='', type=int,required=False) 114 | ``` 115 | 116 | ``` 117 | python BERT_CNN.py --train /train_0.4.tsv --epochs 5 118 | ``` 119 | 120 | for inference only, download pre-trained model 121 | 122 | ``` 123 | wget https://www.dropbox.com/s/ip7p0wiwkwvph5k/0.4_bert-cnn.zip 124 | unzip 0.4_bert-cnn.zip 125 | ``` 126 | 127 | ``` 128 | python eval.py --testset test_demo.tsv --model 0.4_bert-cnn/frozen_graph.pb 129 | ``` 130 | ### Example 131 | 132 | Re-rank the most related caption to the image using the visual context information. 133 | 134 | 135 | 136 | ``` 137 | visual information, candidate caption (beam search) 138 | standard poodle shopping cart footwear, a close up of shoes and a dog in a basket, 0.99774158 139 | standard poodle shopping cart footwear, a brown teddy bear laying on top of a pair of shoes, 0.0621758029 140 | ``` 141 | 142 | ## Visual semantic with pre-trained model 143 | 144 | 145 | 148 | 149 | 150 | 151 | 152 | 153 | Although this approach is proposed to take the advantage of the dataset (_e.g._ visual semantic model), we also investigate the use of out-of-the-box tools to estimate the relatedness score between the short text (_i.e._ caption) and its environmental visual context (we call it visual classifier). 154 | 155 | For this we follow similarity to probability based approach but 156 | 157 | we use only the cosine similarity from a pre-trained model and the top-3 averaged prob (confidence) from the object classifier as: 158 | 159 | 162 | 163 | $\text{P}(w \mid c)=\text{}sim(w,c)^{\text{P}(c)}$ 164 | where the main components of the visual semantics re-ranker: 165 | 168 | 1. Simialrity/relatedness between the caption and the object context $\text{}sim(w,c)$ 169 | 170 | 173 | 174 | 2. 
$\text{P}(c)$ is the classifier object confident in the image $\text{P}(w \mid \text{object})$ 175 | 176 | 177 | with Pre-trained [SBERT](https://www.sbert.net) 178 | 179 | ``` 180 | python model.py --vis visual-context_label.txt --vis_prob visual-context_prob.txt --c caption.txt 181 | ``` 182 | Please refer to this [repository](https://github.com/ahmedssabir/Belief-Revision-Score) for more information about pre-trained visual re-ranker [probability from similarity](https://cdn.aaai.org/Symposia/Spring/2003/SS-03-05/SS03-05-005.pdf) 183 | 184 | ## Evaluation 185 | 186 | [Download pycocoevalcap](https://github.com/salaniz/pycocoevalcap) 187 | 188 | ``` 189 | pip install pycocoevalcap 190 | ``` 191 | 192 | Then run 193 | ``` 194 | python Evaluation/coco_eval.py --f Result_tune_BERT_0.4.json 195 | ``` 196 | For more evaluation ([Lexical and Semantic Diversity](https://github.com/ahmedssabir/Belief-Revision-Score/tree/main/SBERT-caption-eval)) 197 | 213 | 214 | 215 | ## Citation 216 | 217 | The details of this repo are described in the following paper. If you find this repo useful, please kindly cite it: 218 | 219 | ```bibtex 220 | @article{sabir2023visual, 221 | title={Visual Semantic Relatedness Dataset for Image Captioning}, 222 | author={Sabir, Ahmed and Moreno-Noguer, Francesc and Padr{\'o}, Llu{\'\i}s}, 223 | journal={arXiv preprint arXiv:2301.08784}, 224 | year={2023} 225 | } 226 | ``` 227 | 228 | -------------------------------------------------------------------------------- /approch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/approch.png -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | train.tsv file here 2 | -------------------------------------------------------------------------------- /data/test.tsv: -------------------------------------------------------------------------------- 1 | id visual caption 2 | 0 standard poodle shopping cart footwear a close up of a dog laying in a basket 3 | 1 street sign traffic light tower a black and white photo of a street light 4 | 2 toilet seat a white toilet with its seat up in a bathroom 5 | 3 mobile home studio couch house a living room filled with furniture and a coffee table 6 | 4 french loaf conch person a basket filled with sandwiches on top of a table 7 | 5 indian elephant a group of people riding on the back of an elephant 8 | 6 bow tie windsor glasses a man wearing glasses and a tie in a room 9 | 7 sombrero bonnet woman a woman standing in front of a giant cake 10 | 8 diaper bassinet human a baby sitting in front of a giant cake 11 | 9 bobsled go-kart human a group of children sitting around a piece of luggage 12 | 10 vase spotlight plant a bunch of flowers that are in a vase 13 | -------------------------------------------------------------------------------- /data/train.tsv: -------------------------------------------------------------------------------- 1 | id id1 id2 visual caption is_related 2 | 220740 220741 220742 marimba dalmatian picket fence a horse jumping competition is going on with people in the stands 1 3 | 385729 385730 385731 dishwasher microwave barber chair a person riding a horse on a dirt ground 0 4 | 59422 59423 59424 laptop carton comicbook a laptop that has stickers on its cover is sitting on a table 1 5 | 46638 46639 
46640 suit Windsortie woodenspoon a young bow wearing a pink shirt and a purple tie 1 6 | 11870 11871 11872 studiocouch four-poster quilt a couple of girls sitting in a bed in a bedroom 1 7 | 471676 471677 471678 streetcar fire engine passenger car a multi layer plate with cakes and food on it 0 8 | 186795 186796 186797 shoe shop television monitor a man playing a wii on a large projector screen 1 9 | 121836 121837 121838 ox water buffalo alp cattle standing on a hill in fog 1 10 | 396224 396225 396226 altar desk perfume oranges sitting in a blue bowl on a wooden table 0 11 | 430635 430636 430637 speedboat paddle lifeboat pots and other items sit on a stove and counter 0 12 | 145057 145058 145059 shopping cart ashcan park bench a coin meter that is laying down on grates 1 13 | 409778 409779 409780 web site fire engine comic book a painting of a man from the back 0 14 | 155568 155569 155570 grocery store patio restaurant a man and woman walking up the stairs in a backyard 1 15 | 213951 213952 213953 microwave washer dining table the kitchen is equipped with all the latest appliances 1 16 | 489266 489267 489268 traffic light aircraft carrier chain saw a laptop computer on a desk with cables a mug and bowl 0 17 | 257649 257650 257651 grocery store confectionery shopping basket a couple of wooden tale stopped with fresh fruit 1 18 | 113826 113827 113828 lab coat vestment West Highland white terrier a group of people standing in rows with frisbees for a photo 1 19 | 486413 486414 486415 snorkel ski tennis ball two frames of a woman in the air on a tennis court 0 20 | 400432 400433 400434 crutch lawn mower chain saw eight underneath on ambarella in the forest parrot 0 21 | 341153 341154 341155 washer microwave dishwasher a small propeller plane sitting underneath a covering at an airport 0 22 | 462067 462068 462069 ballplayer baseball scoreboard a plate full of bright green lettuce next to some bread 0 23 | 443392 443393 443394 grocery store pineapple pizza a man in black and white stripes with makeup smiling 0 24 | 486660 486661 486662 wombat wallaby titi a persons shadow on the ground of them skateboarding 0 25 | 336616 336617 336618 moped motor scooter crash helmet multiple street signs are attached to the post 0 26 | 124199 124200 124201 sorrel hog barrel a brown horse eating from a hallowed out metal barrel 1 27 | 238004 238005 238006 tray washbasin cradle a cat laying on a couch near a remote control 1 28 | 319195 319196 319197 airliner wing web site a propeller airplane parked inside and airplane hanger 1 29 | 412036 412037 412038 grey whale breakwater killer whale a stop sign is standing at a street intersection 0 30 | 491896 491897 491898 teddy wool toyshop a woman in an old-fashioned kitchen with pots and pans 0 31 | 487501 487502 487503 snowmobile steam locomotive tow truck the living room is clean and empty from people 0 32 | 277093 277094 277095 microwave dishwasher chest a chair holding a laptop that is facing towards an oven 1 33 | 135542 135543 135544 water buffalo warthog hog sheep grazing under a tree in a grassy meadow 1 34 | 8448 8449 8450 mountainbike unicycle bicycle-built-for-two a picture of a person throwing a frisbee 1 35 | 170686 170687 170688 police van minibus ambulance a person in the army greeting someone in a suit 1 36 | 372016 372017 372018 Great Dane Irish wolfhound English setter a man standing in a room holding a remote 0 37 | 351158 351159 351160 sunglass bullet train sunglasses a woman opening the trunk of her car 0 38 | 414542 414543 414544 killer whale 
great white shark paddle a dog running across a field with a frisbee in his mouth 0 39 | 264998 264999 265000 bannister ski unicycle a man riding a skateboard along a metal hand rail 1 40 | 362868 362869 362870 zebra bustard gazelle a basket full of bananas with a net on top 0 41 | 88455 88456 88457 patio flagpole pole a fire hydrant and fire hose in a houses front yard 1 42 | 372512 372513 372514 seashore catamaran swimming trunks a man riding a surfboard on a wave in the ocean 0 43 | 387327 387328 387329 cellular telephone lab coat cash machine a baseball game ensues as people watch 0 44 | 248027 248028 248029 web site barbershop cinema a motor bike on the side of the street 1 45 | 347507 347508 347509 banana pineapple orange a bear itching itself on a bare tree 0 46 | 33714 33715 33716 picketfence streetcar mountainbike the red bike and the pink bike just started dating 1 47 | 173989 173990 173991 umbrella poncho jinrikisha a group of people walking down a street carrying umbrellas 1 48 | 20835 20836 20837 ballplayer baseball footballhelmet a man throwing a baseball from a mound on a field 1 49 | 16356 16357 16358 lumbermill barbershop turnstile a man working on a baseball bat while two others watch 1 50 | 193491 193492 193493 unicycle pole horizontal bar boy riding on his skateboard down a stair rail 1 51 | 384165 384166 384167 mixing bowl corn meat loaf a couple of sailors standing next to a woman 0 52 | 321736 321737 321738 ballplayer baseball football helmet a boys baseball game with a batter catcher and umpire 1 53 | 108395 108396 108397 crash helmet moped backpack a man with a suit and tie on a motor bike 1 54 | 215942 215943 215944 unicycle military uniform bearskin four guys are sitting on a bench in front of a building 1 55 | 134156 134157 134158 wine bottle eggnog red wine there is a bottle of wine next to a glass 1 56 | 297783 297784 297785 necklace thimble corkscrew this is an image of a meal and an avocado is included 1 57 | 110516 110517 110518 minivan cab police van a dog looking ahead with a stoic look in a car seat 1 58 | 3166 3167 3168 grocerystore headcabbage cauliflower a pile of vegetables on display at a grocery store 1 59 | 440075 440076 440077 ski curly-coatedretriever Gordonsetter elephants and their young in their natural habitat 0 60 | 71021 71022 71023 ballplayer baseball puck a baseball player and a flying black bat 1 61 | -------------------------------------------------------------------------------- /dataset_v1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/dataset_v1-1.png -------------------------------------------------------------------------------- /hist.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/hist.jpg -------------------------------------------------------------------------------- /main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/main.png -------------------------------------------------------------------------------- /overlap_text.py: -------------------------------------------------------------------------------- 1 | 2 | file1 = [] 3 | 4 | file2 = [] 5 | 6 | with 
open('train_visual.txt','rU') as f: 7 | for line in f: 8 | file1.append(line.rstrip()) 9 | 10 | 11 | with open('caption_anot.txt') as f1: 12 | for line1 in f1: 13 | file2.append(line1.rstrip()) 14 | #break 15 | 16 | f=open('intersection_caption_visual.txt', "w") 17 | for i in range(len(file1)): 18 | temp =[] 19 | messages = file1[i] 20 | messages1 = file2[i] 21 | 22 | words1 = messages.lower().split() 23 | words2 = messages1.lower().split() 24 | 25 | w = set(words1) & set(words2) 26 | 27 | 28 | #words1 = "This is a simple test of set intersection".lower().split() 29 | #words2 = "Intersection of sets is easy using Python".lower().split() 30 | 31 | 32 | temp.append(w) 33 | 34 | result= file1[i]+','+file2[i]+','+str(w) 35 | 36 | f.write(result) 37 | #f.write(result) 38 | f.write('\n') 39 | print(result) 40 | #del result 41 | #close.sess() 42 | 43 | f.close() 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/overview.png -------------------------------------------------------------------------------- /pre-trained/README.md: -------------------------------------------------------------------------------- 1 | Please refer to this [repository](https://github.com/ahmedssabir/Belief-Revision-Score) for more information about pre-trained visual re-ranker [probability from similarity](https://cdn.aaai.org/Symposia/Spring/2003/SS-03-05/SS03-05-005.pdf) 2 | -------------------------------------------------------------------------------- /pre-trained/Visual_re-rank_re-ranked_output.txt: -------------------------------------------------------------------------------- 1 | a man riding on the back of a motorcycle 0.8895639974564639 2 | a person riding a motorcycle on a city street 0.8699054868636436 3 | a person riding a motorcycle down a city street 0.8665321958170883 4 | a man riding on the back of a motorcycle down a street 0.8645537987336105 5 | a man riding a motorcycle down a street 0.8582269252364088 6 | a man riding on the back of a motorcycle down a sidewalk 0.8581149928539996 7 | a man riding a motorcycle down the street 0.8569102761752505 8 | a man riding a motorcycle on a city street 0.85454545827468 9 | a man riding a motorcycle down a sidewalk 0.8493932857280806 10 | -------------------------------------------------------------------------------- /pre-trained/Visual_re-ranker.txt: -------------------------------------------------------------------------------- 1 | a man riding a motorcycle down a street,0.8582269252364088 2 | a person riding a motorcycle on a city street,0.8699054868636436 3 | a man riding on the back of a motorcycle,0.8895639974564639 4 | a man riding a motorcycle on a city street,0.85454545827468 5 | a man riding on the back of a motorcycle down a street,0.8645537987336105 6 | a person riding a motorcycle down a city street,0.8665321958170883 7 | a man riding on the back of a motorcycle down a sidewalk,0.8581149928539996 8 | a man riding a motorcycle down the street,0.8569102761752505 9 | a man riding a motorcycle down a sidewalk,0.8493932857280806 10 | -------------------------------------------------------------------------------- /pre-trained/caption.txt: -------------------------------------------------------------------------------- 1 | a man riding a motorcycle down a street 2 | a person riding a 
motorcycle on a city street 3 | a man riding on the back of a motorcycle 4 | a man riding a motorcycle on a city street 5 | a man riding on the back of a motorcycle down a street 6 | a person riding a motorcycle down a city street 7 | a man riding on the back of a motorcycle down a sidewalk 8 | a man riding a motorcycle down the street 9 | a man riding a motorcycle down a sidewalk 10 | 11 | -------------------------------------------------------------------------------- /pre-trained/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import argparse 4 | import torch 5 | import re 6 | import os 7 | 8 | 9 | 10 | parser=argparse.ArgumentParser(description='call all scores and compute the visual context based re-ranker') 11 | parser.add_argument('--sim', default='sim-score.txt', help='similarity score from fine_tune_BERT', type=str,required=False) 12 | parser.add_argument('--vis', default='visual-context_label.txt',help='class-label from the classifier (Resent152)', type=str, required=True) 13 | parser.add_argument('--vis_prob', default='visual-context.txt', help='prob from the classifier (Resent152)', type=str, required=True) 14 | parser.add_argument('--c', default='caption.txt', help='caption from the baseline (any)', type=str, required=True) 15 | args = parser.parse_args() 16 | 17 | # Download from here S-BERT 18 | # pip install -U sentence-transformers 19 | from sentence_transformers import SentenceTransformer, util 20 | from sklearn.metrics.pairwise import cosine_similarity 21 | 22 | model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens') 23 | #model = SentenceTransformer('nq-distilbert-base-v1') 24 | 25 | 26 | def cos_sim(a, b): 27 | return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b))) 28 | 29 | 30 | def get_lines(file_path): 31 | with open(file_path) as f: 32 | return f.read().strip().split('\n') 33 | 34 | 35 | # visual confident based visual re-ranker 36 | class Visual_re_ranker: 37 | def __init__(self, visual_context_prob, sim): 38 | self.visual_context_prob = visual_context_prob 39 | self.sim = sim 40 | def p_minus (self): 41 | score = pow(float(sim), float(visual_context_prob)) 42 | 43 | return score 44 | 45 | @staticmethod 46 | def remove_duplicate_caption_re_rank(input_path, output_path): 47 | with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file: 48 | seen_lines = set() 49 | 50 | def add_line(line): 51 | seen_lines.add(line) 52 | return line 53 | 54 | output_file.writelines((add_line(line) for line in input_file 55 | if line not in seen_lines)) 56 | re_ranked_scores = [] 57 | with open(output_path) as f: 58 | for line in f: 59 | caption, score = line.split(',') 60 | score = float(score) 61 | re_ranked_scores.append((caption, score)) 62 | re_ranked_scores.sort(key=lambda s: float(s[1]), reverse=True) 63 | with open(output_path, 'w') as f: 64 | for caption, score in re_ranked_scores: 65 | f.write("%s %s\n" % (caption, score)) 66 | 67 | 68 | 69 | # all beam with visual context 70 | input_path= 'Visual_re-ranker.txt' 71 | # re-ranked beam with visual context 72 | output_path = 'Visual_re-rank_re-ranked_output.txt' 73 | 74 | # compute visual context 75 | f=open(input_path, "w") 76 | for i in range(len(get_lines(args.vis))): 77 | temp =[] 78 | visual_context_label = get_lines(args.vis)[i] 79 | visual_context_prob = get_lines(args.vis_prob)[i] 80 | caption = get_lines(args.c)[i] 81 | 82 | 83 | caption_emb = model.encode(caption, convert_to_tensor=True) 84 
| visual_context_label_emb = model.encode(visual_context_label, convert_to_tensor=True) 85 | 86 | #def cos_sim(a, b): 87 | # return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b))) 88 | 89 | 90 | 91 | sim = cosine_scores = util.pytorch_cos_sim(caption_emb, visual_context_label_emb) 92 | sim = sim.cpu().numpy() 93 | sim = sim.item() 94 | 95 | 96 | score = Visual_re_ranker(visual_context_prob, sim) 97 | score = score.p_minus() 98 | #score = score.real 99 | temp.append(score) 100 | 101 | #result = ','.join((caption, LM, str(score))) 102 | result = ','.join((caption, str(score))) 103 | result = re.sub(r'\s*,\s*', ',', result) 104 | 105 | 106 | #print(result) 107 | 108 | f.write(result) 109 | f.write('\n') 110 | 111 | 112 | f.close() 113 | 114 | if __name__ == "__main__": 115 | 116 | # re-rank and print top visual beam captions 117 | Visual_re_ranker.remove_duplicate_caption_re_rank(input_path, output_path) 118 | -------------------------------------------------------------------------------- /pre-trained/sample_best.json: -------------------------------------------------------------------------------- 1 | [{"image_id":24343,"caption":"a man riding on the back of a motorcycle"}] 2 | -------------------------------------------------------------------------------- /pre-trained/sample_best_baseline.json: -------------------------------------------------------------------------------- 1 | [{"image_id":24343,"caption":"a man riding a motorcycle down a street"}] 2 | -------------------------------------------------------------------------------- /pre-trained/visual-context_label.txt: -------------------------------------------------------------------------------- 1 | motor scooter crash helmet motorcycle 2 | motor scooter crash helmet motorcycle 3 | motor scooter crash helmet motorcycle 4 | motor scooter crash helmet motorcycle 5 | motor scooter crash helmet motorcycle 6 | motor scooter crash helmet motorcycle 7 | motor scooter crash helmet motorcycle 8 | motor scooter crash helmet motorcycle 9 | motor scooter crash helmet motorcycle 10 | -------------------------------------------------------------------------------- /pre-trained/visual-context_prob.txt: -------------------------------------------------------------------------------- 1 | 0.203588580197762 2 | 0.203588580197762 3 | 0.203588580197762 4 | 0.203588580197762 5 | 0.203588580197762 6 | 0.203588580197762 7 | 0.203588580197762 8 | 0.203588580197762 9 | 0.203588580197762 10 | -------------------------------------------------------------------------------- /visual_context/README.md: -------------------------------------------------------------------------------- 1 | ## Extract visual information 2 | ``` 3 | conda create -n Resnet python=3.7 anaconda 4 | conda activate Resnet 5 | pip install tensorflow==1.15.0 6 | pip install keras==2.1.5 7 | ``` 8 | 9 | For [ResNet](https://arxiv.org/abs/1512.03385) 10 | 11 | ``` 12 | python run-visual.py 13 | ``` 14 | 15 | ``` 16 | COCO_val2014_000000185210.jpg 'traffic_light', 0.7458004 17 | COCO_val2014_000000235692.jpg 'ox', 0.49095494 18 | ``` 19 | 20 | For [CLIP](https://github.com/openai/CLIP) with zero-shot prediction 21 | 22 | ``` 23 | # torch 1.7.1 24 | conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=10.1 25 | pip install ftfy regex tqdm 26 | pip install git+https://github.com/openai/CLIP.git 27 | ``` 28 | 29 | run 30 | 31 | ``` 32 | python run-visual_CLIP.py 33 | ``` 34 | 35 | ``` 36 | COCO_val2014_000000185210.jpg 'barrow', 0.0954 37 | COCO_val2014_000000235692.jpg 
'ox', 0.5092 38 | 39 | For more visual classifiers (e.g., ViT, SwinV2, etc.), please refer to this [page](https://github.com/ahmedssabir/Belief-Revision-Score/tree/main/model/Resent-152) 40 | -------------------------------------------------------------------------------- /visual_context/imgs/COCO_val2014_000000185210.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/visual_context/imgs/COCO_val2014_000000185210.jpg -------------------------------------------------------------------------------- /visual_context/imgs/COCO_val2014_000000235692.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/visual_context/imgs/COCO_val2014_000000235692.jpg -------------------------------------------------------------------------------- /visual_context/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import keras.backend as K 3 | 4 | from keras import initializers 5 | from keras.layers import Input 6 | from keras.layers import Dense 7 | from keras.layers import Conv2D 8 | from keras.layers import MaxPooling2D 9 | from keras.layers import AveragePooling2D 10 | from keras.layers import ZeroPadding2D 11 | from keras.layers import Flatten 12 | from keras.layers import Activation 13 | from keras.layers import add 14 | from keras.layers import BatchNormalization 15 | from keras.layers import GlobalAveragePooling2D 16 | from keras.layers import GlobalMaxPooling2D 17 | 18 | from keras.models import Model 19 | from keras.engine import Layer, InputSpec 20 | from keras.engine import get_source_inputs 21 | 22 | 23 | from keras.utils.data_utils import get_file 24 | #from keras.applications.imagenet_utils import _obtain_input_shape 25 | #from keras.applications.imagenet_utils import _obtain_input_shape 26 | 27 | from keras_applications.imagenet_utils import _obtain_input_shape 28 | 29 | WEIGHTS_PATH = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels.h5' 30 | WEIGHTS_PATH_NO_TOP = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5' 31 | 32 | 33 | class Scale(Layer): 34 | """ Custom Layer for ResNet used for BatchNormalization. 35 | 36 | Learns a set of weights and biases used for scaling the input data. 37 | The output consists simply of an element-wise multiplication of the input 38 | and a sum of a set of constants: 39 | out = in * gamma + beta, 40 | where 'gamma' and 'beta' are the weights and biases learned. 41 | # Arguments 42 | axis: integer, axis along which to normalize in mode 0. For instance, 43 | if your input tensor has shape (samples, channels, rows, cols), 44 | set axis to 1 to normalize per feature map (channels axis). 45 | momentum: momentum in the computation of the 46 | exponential average of the mean and standard deviation 47 | of the data, for feature-wise normalization. 48 | weights: Initialization weights. 49 | List of 2 Numpy arrays, with shapes: 50 | `[(input_shape,), (input_shape,)]` 51 | beta_init: name of initialization function for shift parameter 52 | (see [initializers](../initializers.md)), or alternatively, 53 | Theano/TensorFlow function to use for weights initialization. 
54 | This parameter is only relevant if you don't pass a `weights` argument. 55 | gamma_init: name of initialization function for scale parameter (see 56 | [initializers](../initializers.md)), or alternatively, 57 | Theano/TensorFlow function to use for weights initialization. 58 | This parameter is only relevant if you don't pass a `weights` argument. 59 | """ 60 | 61 | def __init__(self, weights=None, axis=-1, momentum=0.9, beta_init='zero', gamma_init='one', **kwargs): 62 | self.momentum = momentum 63 | self.axis = axis 64 | self.beta_init = initializers.get(beta_init) 65 | self.gamma_init = initializers.get(gamma_init) 66 | self.initial_weights = weights 67 | super(Scale, self).__init__(**kwargs) 68 | 69 | def build(self, input_shape): 70 | self.input_spec = [InputSpec(shape=input_shape)] 71 | shape = (int(input_shape[self.axis]),) 72 | 73 | self.gamma = K.variable(self.gamma_init(shape), name='%s_gamma' % self.name) 74 | self.beta = K.variable(self.beta_init(shape), name='%s_beta' % self.name) 75 | self.trainable_weights = [self.gamma, self.beta] 76 | 77 | if self.initial_weights is not None: 78 | self.set_weights(self.initial_weights) 79 | del self.initial_weights 80 | 81 | def call(self, x, mask=None): 82 | input_shape = self.input_spec[0].shape 83 | broadcast_shape = [1] * len(input_shape) 84 | broadcast_shape[self.axis] = input_shape[self.axis] 85 | 86 | out = K.reshape(self.gamma, broadcast_shape) * x + K.reshape(self.beta, broadcast_shape) 87 | return out 88 | 89 | def get_config(self): 90 | config = {"momentum": self.momentum, "axis": self.axis} 91 | base_config = super(Scale, self).get_config() 92 | return dict(list(base_config.items()) + list(config.items())) 93 | 94 | 95 | def identity_block(input_tensor, kernel_size, filters, stage, block): 96 | """ 97 | The identity_block is the block that has no conv layer at shortcut 98 | # Arguments 99 | input_tensor: input tensor 100 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 101 | filters: list of integers, the nb_filters of 3 conv layer at main path 102 | stage: integer, current stage label, used for generating layer names 103 | block: 'a','b'..., current block label, used for generating layer names 104 | """ 105 | eps = 1.1e-5 106 | nb_filter1, nb_filter2, nb_filter3 = filters 107 | conv_name_base = 'res' + str(stage) + block + '_branch' 108 | bn_name_base = 'bn' + str(stage) + block + '_branch' 109 | scale_name_base = 'scale' + str(stage) + block + '_branch' 110 | 111 | if K.image_data_format() == 'channels_last': 112 | bn_axis = 3 113 | else: 114 | bn_axis = 1 115 | 116 | x = Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=False)(input_tensor) 117 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x) 118 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x) 119 | x = Activation('relu', name=conv_name_base + '2a_relu')(x) 120 | 121 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x) 122 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), name=conv_name_base + '2b', use_bias=False)(x) 123 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x) 124 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x) 125 | x = Activation('relu', name=conv_name_base + '2b_relu')(x) 126 | 127 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x) 128 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x) 129 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x) 130 | 131 
| x = add([x, input_tensor], name='res' + str(stage) + block) 132 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x) 133 | return x 134 | 135 | 136 | def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)): 137 | """ conv_block is the block that has a conv layer at shortcut 138 | # Arguments 139 | input_tensor: input tensor 140 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 141 | filters: list of integers, the nb_filters of 3 conv layer at main path 142 | stage: integer, current stage label, used for generating layer names 143 | block: 'a','b'..., current block label, used for generating layer names 144 | Note that from stage 3, the first conv layer at main path is with subsample=(2,2) 145 | And the shortcut should have subsample=(2,2) as well 146 | """ 147 | 148 | eps = 1.1e-5 149 | nb_filter1, nb_filter2, nb_filter3 = filters 150 | conv_name_base = 'res' + str(stage) + block + '_branch' 151 | bn_name_base = 'bn' + str(stage) + block + '_branch' 152 | scale_name_base = 'scale' + str(stage) + block + '_branch' 153 | 154 | if K.image_data_format() == 'channels_last': 155 | bn_axis = 3 156 | else: 157 | bn_axis = 1 158 | 159 | x = Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=False)(input_tensor) 160 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x) 161 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x) 162 | x = Activation('relu', name=conv_name_base + '2a_relu')(x) 163 | 164 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x) 165 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), 166 | name=conv_name_base + '2b', use_bias=False)(x) 167 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x) 168 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x) 169 | x = Activation('relu', name=conv_name_base + '2b_relu')(x) 170 | 171 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x) 172 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x) 173 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x) 174 | 175 | shortcut = Conv2D(nb_filter3, (1, 1), strides=strides, 176 | name=conv_name_base + '1', use_bias=False)(input_tensor) 177 | shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut) 178 | shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut) 179 | 180 | x = add([x, shortcut], name='res' + str(stage) + block) 181 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x) 182 | return x 183 | 184 | 185 | def ResNet152(include_top=True, weights='imagenet', 186 | input_tensor=None, input_shape=None, pooling=None, classes=1000): 187 | """ Instantiates the ResNet152 architecture. 188 | Optionally loads weights pre-trained 189 | on ImageNet. Note that when using TensorFlow, 190 | for best performance you should set 191 | `image_data_format='channels_last'` in your Keras config 192 | at ~/.keras/keras.json. 193 | The model and the weights are compatible only with 194 | TensorFlow. The data format 195 | convention used by the model is the one 196 | specified in your Keras config file. 197 | # Arguments 198 | include_top: whether to include the fully-connected 199 | layer at the top of the network. 200 | weights: one of `None` (random initialization), 201 | 'imagenet' (pre-training on ImageNet), 202 | or the path to the weights file to be loaded. 203 | input_tensor: optional Keras tensor (i.e. 
output of `layers.Input()`) 204 | to use as image input for the model. 205 | input_shape: optional shape tuple, only to be specified 206 | if `include_top` is False (otherwise the input shape 207 | has to be `(224, 224, 3)` (with `channels_last` data format) 208 | or `(3, 224, 224)` (with `channels_first` data format). 209 | It should have exactly 3 inputs channels, 210 | and width and height should be no smaller than 197. 211 | E.g. `(200, 200, 3)` would be one valid value. 212 | pooling: Optional pooling mode for feature extraction 213 | when `include_top` is `False`. 214 | - `None` means that the output of the model will be 215 | the 4D tensor output of the 216 | last convolutional layer. 217 | - `avg` means that global average pooling 218 | will be applied to the output of the 219 | last convolutional layer, and thus 220 | the output of the model will be a 2D tensor. 221 | - `max` means that global max pooling will 222 | be applied. 223 | classes: optional number of classes to classify images 224 | into, only to be specified if `include_top` is True, and 225 | if no `weights` argument is specified. 226 | # Returns 227 | A Keras model instance. 228 | # Raises 229 | ValueError: in case of invalid argument for `weights`, 230 | or invalid input shape. 231 | """ 232 | 233 | eps = 1.1e-5 234 | 235 | if not (weights in {'imagenet', None} or os.path.exists(weights)): 236 | raise ValueError('The `weights` argument should be either ' 237 | '`None` (random initialization), `imagenet` ' 238 | '(pre-training on ImageNet), ' 239 | 'or the path to the weights file to be loaded.') 240 | 241 | if weights == 'imagenet' and include_top and classes != 1000: 242 | raise ValueError('If using `weights` as imagenet with `include_top`' 243 | ' as true, `classes` should be 1000') 244 | 245 | # Determine proper input shape 246 | input_shape = _obtain_input_shape(input_shape, 247 | default_size=224, 248 | min_size=197, 249 | data_format=K.image_data_format(), 250 | require_flatten=include_top, 251 | weights=weights) 252 | 253 | if input_tensor is None: 254 | img_input = Input(shape=input_shape) 255 | else: 256 | if not K.is_keras_tensor(input_tensor): 257 | img_input = Input(tensor=input_tensor, shape=input_shape, name='data') 258 | else: 259 | img_input = input_tensor 260 | 261 | # Handle dimension ordering for different backends 262 | #if K.image_dim_ordering() == 'tf': 263 | if K.common.image_dim_ordering() == 'tf': 264 | bn_axis = 3 265 | else: 266 | bn_axis = 1 267 | 268 | x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input) 269 | x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=False)(x) 270 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x) 271 | x = Scale(axis=bn_axis, name='scale_conv1')(x) 272 | x = Activation('relu', name='conv1_relu')(x) 273 | x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1', padding='same')(x) 274 | 275 | x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) 276 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') 277 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') 278 | 279 | x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') 280 | for i in range(1, 8): 281 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='b' + str(i)) 282 | 283 | x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') 284 | for i in range(1, 36): 285 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b' + str(i)) 286 | 287 | x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') 288 | x = 
identity_block(x, 3, [512, 512, 2048], stage=5, block='b') 289 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') 290 | 291 | if include_top: 292 | # Classification block 293 | x = AveragePooling2D((7, 7), name='avg_pool')(x) 294 | x = Flatten()(x) 295 | x = Dense(classes, activation='softmax', name='fc1000')(x) 296 | else: 297 | if pooling == 'avg': 298 | x = GlobalAveragePooling2D()(x) 299 | elif pooling == 'max': 300 | x = GlobalMaxPooling2D()(x) 301 | 302 | # Ensure that the model takes into account 303 | # any potential predecessors of `input_tensor`. 304 | if input_tensor is not None: 305 | inputs = get_source_inputs(input_tensor) 306 | else: 307 | inputs = img_input 308 | 309 | # Create model 310 | model = Model(inputs, x, name='resnet152') 311 | 312 | # Load weights 313 | if weights == 'imagenet': 314 | if include_top: 315 | weights_path = get_file( 316 | 'resnet152_weights_tf_dim_ordering_tf_kernels.h5', 317 | WEIGHTS_PATH, 318 | cache_subdir='models', 319 | md5_hash='cdb18a2158b88e392c0905d47dcef965') 320 | else: 321 | weights_path = get_file( 322 | 'resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5', 323 | WEIGHTS_PATH_NO_TOP, 324 | cache_subdir='models', 325 | md5_hash='02cb9130cc51543cd703c79697baa592') 326 | model.load_weights(weights_path) 327 | 328 | elif weights is not None: 329 | model.load_weights(weights) 330 | 331 | return model 332 | -------------------------------------------------------------------------------- /visual_context/run-visual.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from skimage.io import imread 4 | from skimage.transform import resize 5 | from keras.applications.imagenet_utils import decode_predictions 6 | from keras.applications.imagenet_utils import preprocess_input 7 | from tensorflow.keras.preprocessing import image 8 | from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions 9 | import numpy as np 10 | from model import ResNet152 11 | import tensorflow as tf 12 | 13 | import os 14 | 15 | 16 | image_dir = 'imgs' 17 | import keras as K 18 | from keras_applications.imagenet_utils import _obtain_input_shape 19 | 20 | os.environ['CUDA_VISIBLE_DEVICES'] = "-1" 21 | 22 | def preprocess(x): 23 | x = resize(x, (224,224), mode='constant') * 255 24 | x = preprocess_input(x) 25 | if x.ndim == 3: 26 | x = np.expand_dims(x, 0) 27 | return x 28 | model = ResNet152() 29 | 30 | for img_file in os.listdir(image_dir): 31 | #img = mpimg.imread(image_dir + '/' + img_file) 32 | img = image.load_img(image_dir + '/' + img_file, target_size=(224, 224)) 33 | x = image.img_to_array(img) 34 | x = np.expand_dims(x, axis=0) 35 | x = preprocess_input(x) 36 | 37 | preds = model.predict(x) 38 | print(img_file, decode_predictions(preds, top=1)[0]) 39 | 40 | 41 | -------------------------------------------------------------------------------- /visual_context/run-visual_CLIP.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import glob 4 | import sys 5 | import torch 6 | import torchvision.transforms as Transforms 7 | import clip 8 | from PIL import Image 9 | 10 | 11 | 12 | # Check device 13 | #device = "cuda" if torch.cuda.is_available() else "cpu" 14 | device = torch.device("cpu") 15 | print(f"Device - {device}") 16 | 17 | # Load CLIP model 18 | clip_model, clip_preprocess = clip.load('ViT-B/32', device) 19 | clip_model.eval() 20 | 21 | # 22 | with open("imagenet_classes.txt", "r") as f: 23 | 
categories = [s.strip() for s in f.readlines()] 24 | 25 | text = clip.tokenize(categories).to(device) 26 | 27 | def predict_clip(image_file_path): 28 | image = clip_preprocess(Image.open(image_file_path)).unsqueeze(0).to(device) 29 | # the CLIP model is already loaded above; no need to reload it for every image 30 | 31 | # Calculate features 32 | with torch.no_grad(): 33 | image_features = clip_model.encode_image(image) 34 | text_features = clip_model.encode_text(text) 35 | 36 | # Pick the top 5 most similar labels for the image 37 | image_features /= image_features.norm(dim=-1, keepdim=True) 38 | text_features /= text_features.norm(dim=-1, keepdim=True) 39 | similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1) 40 | values, indices = similarity[0].topk(5) 41 | 42 | predictions = {} 43 | for value, index in zip(values, indices): 44 | predictions[f"{categories[index]:>16s}"] = f"{value.item():.4f}" 45 | 46 | return predictions 47 | 48 | 49 | # run prediction over all images in the imgs/ directory 50 | filenames = glob.glob("imgs/*.jpg") 51 | filenames.sort() 52 | for image_path in filenames: 53 | print(os.path.basename(image_path), predict_clip(image_path)) 54 | #print(predict_clip("image.jpg")) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /word-count-hisgram.py: -------------------------------------------------------------------------------- 1 | """Python script to create a histogram of words in a text file. 2 | Usage: python word-count-hisgram.py -f "/path/to/file.txt" -n 200 3 | Specify the path to the text file as above. Manually specify the top N words to report (default 100). 4 | Text file can contain punctuation, new lines, etc., but special characters aren't handled well. 5 | """ 6 | 7 | import os 8 | import sys 9 | import string 10 | import argparse 11 | import operator 12 | 13 | import numpy as np 14 | import pandas as pd 15 | import matplotlib.pyplot as plt 16 | 17 | from collections import Counter 18 | 19 | __author__ = 'Nick Powell (PhD student, CMIC & CABI, UCL, UK), nicholas.powell.11@ucl.ac.uk' 20 | __version__ = '0.2.20150303' 21 | __created__ = '2014-12-18, Thursday' 22 | 23 | 24 | def main(): 25 | 26 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 27 | parser.add_argument('-f','--filepath',dest='filepath',metavar='file path',help='Path to text input file to be analysed.', required=True) 28 | parser.add_argument('-n','--number',dest='number',metavar='number',help='Most frequent n words will be displayed and plotted.', required=False, default=100, type=int) 29 | args = parser.parse_args() 30 | 31 | # Path to text file to analyse 32 | rawfilepath = args.filepath 33 | 34 | # Print a histogram containing the top N words, and print them and their counts. 
35 | top_n = args.number 36 | 37 | # Load the file 38 | filepath = os.path.normpath(os.path.join(rawfilepath)) 39 | file = open(filepath, 'r') 40 | 41 | # Parse as a list, removing lines 42 | content_sublists = [line.split(',') for line in file.readlines()] 43 | 44 | # Parse into a single list (from a list of lists) 45 | content_list = [item for sublist in content_sublists for item in sublist] 46 | 47 | # Remove whitespace so we can concatenate appropriately, and unify case 48 | content_list_strip = [str.strip().lower() for str in content_list] 49 | 50 | # Concatenate strings into a single string 51 | content_concat = ' '.join(content_list_strip) 52 | 53 | # Remove punctuation and new lines 54 | punct = set(string.punctuation) 55 | unpunct_content = ''.join(x for x in content_concat if x not in punct) 56 | 57 | # Split string into list of strings, again 58 | word_list = unpunct_content.split() 59 | 60 | # Perform count 61 | counts_all = Counter(word_list) 62 | 63 | words, count_values = zip(*counts_all.items()) 64 | 65 | # Sort both lists by frequency in values (Schwartzian transform) - thanks, http://stackoverflow.com/questions/9543211/sorting-a-list-in-python-using-the-result-from-sorting-another-list 66 | values_sorted, words_sorted = zip(*sorted(zip(count_values, words), key=operator.itemgetter(0), reverse=True)) 67 | 68 | # Top N 69 | words_sorted_top = words_sorted[0:top_n] 70 | values_sorted_top = values_sorted[0:top_n] 71 | 72 | print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") 73 | print("{0} unique words identified in the text file, {1}".format(len(values_sorted), filepath)) 74 | print("The top {0} words are: \n{1}".format(top_n, words_sorted_top)) 75 | print("... their respective frequencies: \n{0}".format(values_sorted_top)) 76 | print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") 77 | # Pandas DataFrame just for visualisation 78 | df = pd.DataFrame({'count': values_sorted_top, 'word': words_sorted_top}) 79 | print("{0}".format(df)) 80 | sys.stdout.flush() 81 | 82 | # Histogram 83 | 84 | # Make xticklabels comprehensible by matplotlib 85 | xticklabels = str(list(words_sorted_top)).split() 86 | # Remove the single quotes, commas and enclosing square brackets 87 | xtlabs = [xstr.replace("'","").replace(",","").replace("]","").replace("[","") for xstr in xticklabels] 88 | 89 | 90 | indices = np.arange(len(words_sorted_top)) 91 | width = 1 92 | fig = plt.figure() 93 | fig.suptitle('Word frequency histogram, top {0}'.format(top_n), fontsize=16) 94 | plt.xlabel('word', fontsize=12) 95 | plt.ylabel('count', fontsize=12) 96 | plt.bar(indices, values_sorted_top, width*0.9, alpha=0.7, color='blue') 97 | plt.xticks(indices + width * 0.5, xtlabs, rotation='vertical', fontsize=8) 98 | plt.show() 99 | 100 | if __name__ == '__main__': 101 | main() 102 | --------------------------------------------------------------------------------