├── BERT-CNN ├── BERT_CNN.py ├── BertLayer.py ├── __pycache__ │ ├── BertLayer.cpython-36.pyc │ ├── data_pre.cpython-36.pyc │ ├── extract_features.cpython-36.pyc │ ├── freeze_keras_model.cpython-36.pyc │ ├── modeling.cpython-36.pyc │ └── tokenization.cpython-36.pyc ├── bert_experimental │ ├── README.md │ ├── bert_experimental │ │ ├── feature_extraction │ │ │ ├── l2_retriever.py │ │ │ └── text_preprocessing.py │ │ └── finetuning │ │ │ ├── __init__.py │ │ │ ├── bert_layer.py │ │ │ ├── graph_ops.py │ │ │ ├── modeling.py │ │ │ └── text_preprocessing.py │ ├── requirements.txt │ └── setup.py ├── data │ ├── read.me │ ├── test.tsv │ └── train.tsv ├── data_pre.py ├── eval.py ├── extract_features.py ├── freeze_keras_model.py ├── model.json ├── modeling.py ├── optimization.py ├── test_demo.tsv ├── tokenization.py └── uncased_L-12_H-768_A-12 │ └── file-should be here.txt ├── BERT ├── README.md ├── data │ ├── dev.tsv │ ├── test.tsv │ └── train.tsv.zip ├── modeling.py ├── outputs │ └── need-this.txt ├── run_classifier.py ├── tokenization.py ├── train_VC_b.py ├── train_model_VC.py └── uncased_L-12_H-768_A-12 │ └── file-should be here.txt ├── BERT_CNN_Visual_re_ranker_demo.ipynb ├── COCO_train2014_000000000009.jpg ├── COCO_val2014_000000000042.jpg ├── Evaluation ├── Result_tune_BERT_0.4.json ├── captions_val2014.json.zip └── coco_eval.py ├── LRCE_figure_1.png ├── Pre-trained.png ├── README.md ├── approch.png ├── data ├── README.md ├── test.tsv └── train.tsv ├── dataset_v1-1.png ├── hist.jpg ├── main.png ├── overlap_text.py ├── overlaping_result_v1.txt ├── overview.png ├── pre-trained ├── README.md ├── Visual_re-rank_re-ranked_output.txt ├── Visual_re-ranker.txt ├── caption.txt ├── model.py ├── sample_best.json ├── sample_best_baseline.json ├── visual-context_label.txt └── visual-context_prob.txt ├── visual_context ├── README.md ├── imagenet_classes.txt ├── imgs │ ├── COCO_val2014_000000185210.jpg │ └── COCO_val2014_000000235692.jpg ├── model.py ├── run-visual.py └── run-visual_CLIP.py └── word-count-hisgram.py /BERT-CNN/BERT_CNN.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ##!/usr/bin/env python3 3 | #!/bin/env python 4 | import sys 5 | import argparse 6 | import re 7 | import os 8 | import sys 9 | import json 10 | 11 | import logging 12 | import numpy as np 13 | import pandas as pd 14 | import tensorflow as tf 15 | import tensorflow_hub as hub 16 | from BertLayer import BertLayer 17 | from BertLayer import build_preprocessor 18 | from freeze_keras_model import freeze_keras_model 19 | 20 | from data_pre import * 21 | from tensorflow import keras 22 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 23 | from sklearn.model_selection import train_test_split 24 | 25 | 26 | if not 'bert_repo' in sys.path: 27 | sys.path.insert(0, 'bert_repo') 28 | 29 | from modeling import BertModel, BertConfig 30 | from tokenization import FullTokenizer, convert_to_unicode 31 | from extract_features import InputExample, convert_examples_to_features 32 | 33 | 34 | # get TF logger 35 | log = logging.getLogger('tensorflow') 36 | log.handlers = [] 37 | 38 | 39 | parser=argparse.ArgumentParser() 40 | parser.add_argument('--train', default='/home/asabir/BERT_layers-git/data/train.tsv', help='beam serach', type=str,required=False) 41 | parser.add_argument('--num_bert_layer', default='12', help='truned layers', type=int,required=False) 42 | parser.add_argument('--batch_size', default='128', help='truned layers', type=int,required=False) 43 
| parser.add_argument('--epochs', default='5', help='', type=int,required=False) 44 | parser.add_argument('--seq_len', default='64', help='', type=int,required=False) 45 | parser.add_argument('--CNN_kernel_size', default='3', help='', type=int,required=False) 46 | parser.add_argument('--CNN_filters', default='32', help='', type=int,required=False) 47 | args = parser.parse_args() 48 | 49 | 50 | # Downlaod the pre-trained model 51 | 52 | #!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 53 | #!unzip uncased_L-12_H-768_A-12.zip 54 | 55 | 56 | # tf.Module 57 | def build_module_fn(config_path, vocab_path, do_lower_case=True): 58 | 59 | def bert_module_fn(is_training): 60 | """Spec function for a token embedding module.""" 61 | 62 | input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids") 63 | input_mask = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_mask") 64 | token_type = tf.placeholder(shape=[None, None], dtype=tf.int32, name="segment_ids") 65 | 66 | config = BertConfig.from_json_file(config_path) 67 | model = BertModel(config=config, is_training=is_training, 68 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type) 69 | 70 | seq_output = model.all_encoder_layers[-1] 71 | pool_output = model.get_pooled_output() 72 | 73 | config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file") 74 | vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file") 75 | lower_case = tf.constant(do_lower_case) 76 | 77 | tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file) 78 | tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file) 79 | 80 | input_map = {"input_ids": input_ids, 81 | "input_mask": input_mask, 82 | "segment_ids": token_type} 83 | 84 | output_map = {"pooled_output": pool_output, 85 | "sequence_output": seq_output} 86 | 87 | output_info_map = {"vocab_file": vocab_file, 88 | "do_lower_case": lower_case} 89 | 90 | hub.add_signature(name="tokens", inputs=input_map, outputs=output_map) 91 | hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map) 92 | 93 | return bert_module_fn 94 | 95 | 96 | MODEL_DIR = "/Users/asabir/BERT_layers-main/uncased_L-12_H-768_A-12" 97 | config_path = "/{}/bert_config.json".format(MODEL_DIR) 98 | vocab_path = "/{}/vocab.txt".format(MODEL_DIR) 99 | 100 | 101 | tags_and_args = [] 102 | for is_training in (True, False): 103 | tags = set() 104 | if is_training: 105 | tags.add("train") 106 | tags_and_args.append((tags, dict(is_training=is_training))) 107 | 108 | module_fn = build_module_fn(config_path, vocab_path) 109 | spec = hub.create_module_spec(module_fn, tags_and_args=tags_and_args) 110 | spec.export("bert-module", 111 | checkpoint_path="/{}/bert_model.ckpt".format(MODEL_DIR)) 112 | 113 | class BertLayer(tf.keras.layers.Layer): 114 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3, 115 | pooling="cls", do_preprocessing=True, verbose=False, 116 | tune_embeddings=False, trainable=True, **kwargs): 117 | 118 | self.trainable = trainable 119 | self.n_tune_layers = n_tune_layers 120 | self.tune_embeddings = tune_embeddings 121 | self.do_preprocessing = do_preprocessing 122 | 123 | self.verbose = verbose 124 | self.seq_len = seq_len 125 | self.pooling = pooling 126 | self.bert_path = bert_path 127 | 128 | self.var_per_encoder = 16 129 | if self.pooling not in ["cls", "mean", None]: 130 | raise NameError( 131 | f"Undefined pooling type (must be either 'cls', 'mean', or None, but is {self.pooling}" 
132 | ) 133 | 134 | super(BertLayer, self).__init__(**kwargs) 135 | 136 | def build(self, input_shape): 137 | 138 | self.bert = hub.Module(self.build_abspath(self.bert_path), 139 | trainable=self.trainable, name=f"{self.name}_module") 140 | 141 | trainable_layers = [] 142 | if self.tune_embeddings: 143 | trainable_layers.append("embeddings") 144 | 145 | if self.pooling == "cls": 146 | trainable_layers.append("pooler") 147 | 148 | if self.n_tune_layers > 0: 149 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name] 150 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder) 151 | for i in range(self.n_tune_layers): 152 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/") 153 | 154 | # Add module variables to layer's trainable weights 155 | for var in self.bert.variables: 156 | if any([l in var.name for l in trainable_layers]): 157 | self._trainable_weights.append(var) 158 | else: 159 | self._non_trainable_weights.append(var) 160 | 161 | if self.verbose: 162 | print("*** TRAINABLE VARS *** ") 163 | for var in self._trainable_weights: 164 | print(var) 165 | 166 | self.build_preprocessor() 167 | self.initialize_module() 168 | 169 | super(BertLayer, self).build(input_shape) 170 | 171 | def build_abspath(self, path): 172 | if path.startswith("https://") or path.startswith("gs://"): 173 | return path 174 | else: 175 | return os.path.abspath(path) 176 | 177 | def build_preprocessor(self): 178 | sess = tf.keras.backend.get_session() 179 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True) 180 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 181 | tokenization_info["do_lower_case"]]) 182 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case) 183 | 184 | def initialize_module(self): 185 | sess = tf.keras.backend.get_session() 186 | 187 | vars_initialized = sess.run([tf.is_variable_initialized(var) 188 | for var in self.bert.variables]) 189 | 190 | uninitialized = [] 191 | for var, is_initialized in zip(self.bert.variables, vars_initialized): 192 | if not is_initialized: 193 | uninitialized.append(var) 194 | 195 | if len(uninitialized): 196 | sess.run(tf.variables_initializer(uninitialized)) 197 | 198 | def call(self, input): 199 | 200 | if self.do_preprocessing: 201 | input = tf.numpy_function(self.preprocessor, 202 | [input], [tf.int32, tf.int32, tf.int32], 203 | name='preprocessor') 204 | for feature in input: 205 | feature.set_shape((None, self.seq_len)) 206 | 207 | input_ids, input_mask, segment_ids = input 208 | 209 | bert_inputs = dict( 210 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids 211 | ) 212 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True) 213 | 214 | if self.pooling == "cls": 215 | pooled = output["pooled_output"] 216 | else: 217 | result = output["sequence_output"] 218 | 219 | input_mask = tf.cast(input_mask, tf.float32) 220 | mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1) 221 | masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / ( 222 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) 223 | 224 | if self.pooling == "mean": 225 | pooled = masked_reduce_mean(result, input_mask) 226 | else: 227 | pooled = mul_mask(result, input_mask) 228 | 229 | return pooled 230 | 231 | def get_config(self): 232 | config_dict = { 233 | "bert_path": self.bert_path, 234 | "seq_len": self.seq_len, 235 | "pooling": self.pooling, 236 | "n_tune_layers": self.n_tune_layers, 237 | 
"tune_embeddings": self.tune_embeddings, 238 | "do_preprocessing": self.do_preprocessing, 239 | "verbose": self.verbose 240 | } 241 | super(BertLayer, self).get_config() 242 | return config_dict 243 | 244 | 245 | # read the train data 246 | #df = pd.read_csv("/home/asabir/BERT_layers-git/data/train.tsv", sep='\t') 247 | df = pd.read_csv(args.train, sep='\t') 248 | 249 | 250 | 251 | 252 | #labels = df.is_duplicate.values 253 | labels = df.is_related.values 254 | 255 | texts = [] 256 | delimiter = " ||| " 257 | 258 | for vis, cap in zip(df.visual.tolist(), df.caption.tolist()): 259 | texts.append(delimiter.join((str(vis), str(cap)))) 260 | 261 | 262 | texts = np.array(texts) 263 | 264 | trX, tsX, trY, tsY = train_test_split(texts, labels, shuffle=True, test_size=0.2) 265 | 266 | 267 | # Buliding the model 268 | 269 | embedding_size = 768 270 | 271 | inp = tf.keras.Input(shape=(1,), dtype=tf.string) 272 | # Three Layers 273 | #encoder = BertLayer(bert_path="./bert-module/", seq_len=48, tune_embeddings=False, 274 | # pooling='cls', n_tune_layers=3, verbose=False) 275 | 276 | # All Layers 277 | encoder = BertLayer(bert_path="./bert-module/", seq_len=args.seq_len, tune_embeddings=False, pooling=None, n_tune_layers=args.num_bert_layer, verbose=False) 278 | 279 | 280 | 281 | cnn_out = tf.keras.layers.Conv1D(args.CNN_filters, args.CNN_kernel_size, padding='VALID', activation=tf.nn.relu)(encoder(inp)) 282 | pool = tf.keras.layers.MaxPooling1D(pool_size=2)(cnn_out) 283 | flat = tf.keras.layers.Flatten()(pool) 284 | pred = tf.keras.layers.Dense(1, activation="sigmoid")(flat) 285 | 286 | 287 | model = tf.keras.models.Model(inputs=[inp], outputs=[pred]) 288 | 289 | model.summary() 290 | 291 | model.compile( 292 | optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5, ), 293 | loss="binary_crossentropy", 294 | metrics=["accuracy"]) 295 | 296 | # fit the data 297 | import logging 298 | logging.getLogger("tensorflow").setLevel(logging.WARNING) 299 | 300 | saver = keras.callbacks.ModelCheckpoint("bert_CNN_tuned.hdf5") 301 | 302 | model.fit(trX, trY, validation_data=[tsX, tsY], batch_size=args.batch_size, epochs=args.epochs, callbacks=[saver]) 303 | 304 | 305 | 306 | #save the model 307 | model.predict(trX[:10]) 308 | 309 | import json 310 | json.dump(model.to_json(), open("model.json", "w")) 311 | 312 | model = tf.keras.models.model_from_json(json.load(open("model.json")), 313 | custom_objects={"BertLayer": BertLayer}) 314 | 315 | model.load_weights("bert_CNN_tuned.hdf5") 316 | 317 | model.predict(trX[:10]) 318 | 319 | # For fast inference and less RAM usesage as post-processing we need to "freezing" the model. 
320 | from tensorflow.python.framework.graph_util import convert_variables_to_constants 321 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference 322 | 323 | def freeze_keras_model(model, export_path=None, clear_devices=True): 324 | sess = tf.keras.backend.get_session() 325 | graph = sess.graph 326 | 327 | with graph.as_default(): 328 | 329 | input_tensors = model.inputs 330 | output_tensors = model.outputs 331 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors] 332 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors] 333 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors] 334 | 335 | tmp_g = graph.as_graph_def() 336 | if clear_devices: 337 | for node in tmp_g.node: 338 | node.device = "" 339 | 340 | tmp_g = optimize_for_inference( 341 | tmp_g, input_ops, output_ops, dtypes, False) 342 | 343 | tmp_g = convert_variables_to_constants(sess, tmp_g, output_ops) 344 | 345 | if export_path is not None: 346 | with tf.gfile.GFile(export_path, "wb") as f: 347 | f.write(tmp_g.SerializeToString()) 348 | 349 | return tmp_g 350 | 351 | 352 | # freeze and save the model 353 | frozen_graph = freeze_keras_model(model, export_path="frozen_graph.pb") 354 | 355 | 356 | # inference 357 | #!git clone https://github.com/gaphex/bert_experimental/ 358 | 359 | import tensorflow as tf 360 | import numpy as np 361 | import sys 362 | 363 | sys.path.insert(0, "bert_experimental") 364 | 365 | from bert_experimental.finetuning.text_preprocessing import build_preprocessor 366 | from bert_experimental.finetuning.graph_ops import load_graph 367 | 368 | 369 | restored_graph = load_graph("frozen_graph.pb") 370 | graph_ops = restored_graph.get_operations() 371 | input_op, output_op = graph_ops[0].name, graph_ops[-1].name 372 | print(input_op, output_op) 373 | 374 | x = restored_graph.get_tensor_by_name(input_op + ':0') 375 | y = restored_graph.get_tensor_by_name(output_op + ':0') 376 | 377 | 378 | preprocessor = build_preprocessor("/Users/asabir/BERT_layers-main/uncased_L-12_H-768_A-12/vocab.txt", 64) 379 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32], name='preprocessor') 380 | 381 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32]) 382 | 383 | # predictions 384 | 385 | sess = tf.Session(graph=restored_graph) 386 | 387 | trX[:10] 388 | 389 | y_out = sess.run(y, feed_dict={ 390 | x: trX[:10].reshape((-1,1)) 391 | }) 392 | 393 | print(y_out) 394 | -------------------------------------------------------------------------------- /BERT-CNN/BertLayer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import os 5 | import sys 6 | import json 7 | 8 | import logging 9 | import numpy as np 10 | import pandas as pd 11 | import tensorflow as tf 12 | import tensorflow_hub as hub 13 | from tensorflow import keras 14 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 15 | 16 | from sklearn.model_selection import train_test_split 17 | 18 | 19 | if not 'bert_repo' in sys.path: 20 | sys.path.insert(0, 'bert_repo') 21 | 22 | from modeling import BertModel, BertConfig 23 | from tokenization import FullTokenizer, convert_to_unicode 24 | from extract_features import InputExample, convert_examples_to_features 25 | 26 | 27 | def build_preprocessor(voc_path, seq_len, lower=True): 28 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower) 29 | 30 | def strings_to_arrays(sents): 31 | 32 
| sents = np.atleast_1d(sents).reshape((-1,)) 33 | 34 | examples = [] 35 | for example in read_examples(sents): 36 | examples.append(example) 37 | 38 | features = convert_examples_to_features(examples, seq_len, tokenizer) 39 | arrays = features_to_arrays(features) 40 | return arrays 41 | 42 | 43 | class BertLayer(tf.keras.layers.Layer): 44 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3, 45 | pooling="cls", do_preprocessing=True, verbose=False, 46 | tune_embeddings=False, trainable=True, **kwargs): 47 | 48 | self.trainable = trainable 49 | self.n_tune_layers = n_tune_layers 50 | self.tune_embeddings = tune_embeddings 51 | self.do_preprocessing = do_preprocessing 52 | 53 | self.verbose = verbose 54 | self.seq_len = seq_len 55 | self.pooling = pooling 56 | self.bert_path = bert_path 57 | 58 | self.var_per_encoder = 16 59 | if self.pooling not in ["cls", "mean", None]: 60 | raise NameError( 61 | f"Undefined pooling type (must be either 'cls', 'mean', or None, but is {self.pooling}" 62 | ) 63 | 64 | super(BertLayer, self).__init__(**kwargs) 65 | 66 | def build(self, input_shape): 67 | 68 | self.bert = hub.Module(self.build_abspath(self.bert_path), 69 | trainable=self.trainable, name=f"{self.name}_module") 70 | 71 | trainable_layers = [] 72 | if self.tune_embeddings: 73 | trainable_layers.append("embeddings") 74 | 75 | if self.pooling == "cls": 76 | trainable_layers.append("pooler") 77 | 78 | if self.n_tune_layers > 0: 79 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name] 80 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder) 81 | for i in range(self.n_tune_layers): 82 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/") 83 | 84 | # Add module variables to layer's trainable weights 85 | for var in self.bert.variables: 86 | if any([l in var.name for l in trainable_layers]): 87 | self._trainable_weights.append(var) 88 | else: 89 | self._non_trainable_weights.append(var) 90 | 91 | if self.verbose: 92 | print("*** TRAINABLE VARS *** ") 93 | for var in self._trainable_weights: 94 | print(var) 95 | 96 | self.build_preprocessor() 97 | self.initialize_module() 98 | 99 | super(BertLayer, self).build(input_shape) 100 | 101 | def build_abspath(self, path): 102 | if path.startswith("https://") or path.startswith("gs://"): 103 | return path 104 | else: 105 | return os.path.abspath(path) 106 | 107 | def build_preprocessor(self): 108 | sess = tf.keras.backend.get_session() 109 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True) 110 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 111 | tokenization_info["do_lower_case"]]) 112 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case) 113 | 114 | def initialize_module(self): 115 | sess = tf.keras.backend.get_session() 116 | 117 | vars_initialized = sess.run([tf.is_variable_initialized(var) 118 | for var in self.bert.variables]) 119 | 120 | uninitialized = [] 121 | for var, is_initialized in zip(self.bert.variables, vars_initialized): 122 | if not is_initialized: 123 | uninitialized.append(var) 124 | 125 | if len(uninitialized): 126 | sess.run(tf.variables_initializer(uninitialized)) 127 | 128 | def call(self, input): 129 | 130 | if self.do_preprocessing: 131 | input = tf.numpy_function(self.preprocessor, 132 | [input], [tf.int32, tf.int32, tf.int32], 133 | name='preprocessor') 134 | for feature in input: 135 | feature.set_shape((None, self.seq_len)) 136 | 137 | input_ids, input_mask, segment_ids = 
input 138 | 139 | bert_inputs = dict( 140 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids 141 | ) 142 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True) 143 | 144 | if self.pooling == "cls": 145 | pooled = output["pooled_output"] 146 | else: 147 | result = output["sequence_output"] 148 | 149 | input_mask = tf.cast(input_mask, tf.float32) 150 | mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1) 151 | masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / ( 152 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) 153 | 154 | if self.pooling == "mean": 155 | pooled = masked_reduce_mean(result, input_mask) 156 | else: 157 | pooled = mul_mask(result, input_mask) 158 | 159 | return pooled 160 | 161 | def get_config(self): 162 | config_dict = { 163 | "bert_path": self.bert_path, 164 | "seq_len": self.seq_len, 165 | "pooling": self.pooling, 166 | "n_tune_layers": self.n_tune_layers, 167 | "tune_embeddings": self.tune_embeddings, 168 | "do_preprocessing": self.do_preprocessing, 169 | "verbose": self.verbose 170 | } 171 | super(BertLayer, self).get_config() 172 | return config_dict 173 | -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/BertLayer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/BertLayer.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/data_pre.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/data_pre.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/extract_features.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/extract_features.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/freeze_keras_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/freeze_keras_model.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/modeling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/modeling.cpython-36.pyc -------------------------------------------------------------------------------- /BERT-CNN/__pycache__/tokenization.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/tokenization.cpython-36.pyc -------------------------------------------------------------------------------- 
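A minimal usage sketch for the BertLayer defined in BertLayer.py above, assuming the ./bert-module export created by BERT_CNN.py and a TF 1.x + tensorflow_hub environment. It shows the "cls"-pooling variant (one 768-d vector per input pair), in contrast to the pooling=None + Conv1D head used in BERT_CNN.py:

import tensorflow as tf
from BertLayer import BertLayer

# String input; the layer tokenizes internally using the exported module's vocabulary,
# so inputs are raw "visual ||| caption" pairs, as in BERT_CNN.py.
inp = tf.keras.Input(shape=(1,), dtype=tf.string)

# pooling="cls" returns the pooled [CLS] vector of shape (batch, 768).
encoded = BertLayer(bert_path="./bert-module/", seq_len=64,
                    pooling="cls", n_tune_layers=3)(inp)
pred = tf.keras.layers.Dense(1, activation="sigmoid")(encoded)

model = tf.keras.models.Model(inputs=[inp], outputs=[pred])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss="binary_crossentropy",
              metrics=["accuracy"])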
/BERT-CNN/bert_experimental/README.md: -------------------------------------------------------------------------------- 1 | https://github.com/gaphex/bert_experimental/tree/master/bert_experimental 2 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/feature_extraction/l2_retriever.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | class L2Retriever: 6 | def __init__(self, dim, top_k=3, use_norm=False, use_gpu=False): 7 | 8 | self.dim = dim 9 | self.top_k = top_k 10 | self.use_norm = use_norm 11 | config = tf.ConfigProto( 12 | device_count={'GPU': (1 if use_gpu else 0)} 13 | ) 14 | config.gpu_options.allow_growth = True 15 | self.session = tf.Session(config=config) 16 | self.dtype = "float32" 17 | 18 | self.query = tf.placeholder(self.dtype, [None, self.dim]) 19 | self.kbase = tf.placeholder(self.dtype, [None, self.dim]) 20 | if self.use_norm: 21 | self.norm = tf.placeholder(self.dtype, [None, 1]) 22 | else: 23 | self.norm = None 24 | 25 | self.build_graph() 26 | 27 | def build_graph(self): 28 | 29 | self.distance = self.euclidean_distances(self.kbase, self.query, self.norm) 30 | top_neg_dists, top_indices = tf.math.top_k( 31 | tf.negative(self.distance), k=self.top_k) 32 | top_dists = tf.sqrt(tf.abs(tf.negative(top_neg_dists))) 33 | 34 | self.top_distances = top_dists 35 | self.top_indices = top_indices 36 | 37 | def predict(self, kbase, query, norm=None): 38 | 39 | query = query.reshape((-1, self.dim)) 40 | feed_dict = {self.query: query, self.kbase: kbase} 41 | if self.use_norm: 42 | feed_dict[self.norm] = norm 43 | 44 | I, D = self.session.run([self.top_indices, self.top_distances], 45 | feed_dict=feed_dict) 46 | 47 | return I, D 48 | 49 | @staticmethod 50 | def euclidean_distances(kbase, query, norm=None): 51 | 52 | if norm is None: 53 | XX = tf.keras.backend.batch_dot(kbase, kbase, axes=1) 54 | else: 55 | XX = norm 56 | 57 | YY = tf.transpose(tf.keras.backend.batch_dot(query, query, axes=1)) 58 | XY = tf.matmul(kbase, tf.transpose(query)) 59 | 60 | distance = XX - 2 * XY + YY 61 | distance = tf.transpose(distance) 62 | 63 | return distance 64 | 65 | @staticmethod 66 | def compute_squared_l2_norm(mat): 67 | square_norms = np.sum(mat**2, axis=1, keepdims=True) 68 | return square_norms 69 | 70 | def __call__(self, kbase, query, norm=None): 71 | return self.predict(kbase, query, norm) 72 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/feature_extraction/text_preprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import tensorflow as tf 3 | import collections 4 | import unicodedata 5 | 6 | 7 | class FullTokenizer(object): 8 | """Runs end-to-end tokenziation.""" 9 | 10 | def __init__(self, vocab_file, do_lower_case=True): 11 | self.vocab = load_vocab(vocab_file) 12 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 13 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 14 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 15 | 16 | def tokenize(self, text): 17 | split_tokens = [] 18 | for token in self.basic_tokenizer.tokenize(text): 19 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 20 | split_tokens.append(sub_token) 21 | 22 | return split_tokens 23 | 24 | def convert_tokens_to_ids(self, tokens): 25 | return 
convert_by_vocab(self.vocab, tokens) 26 | 27 | def convert_ids_to_tokens(self, ids): 28 | return convert_by_vocab(self.inv_vocab, ids) 29 | 30 | def mark_unk_tokens(self, tokens, unk_token='[UNK]'): 31 | return [t if t in self.vocab else unk_token for t in tokens] 32 | 33 | 34 | class BasicTokenizer(object): 35 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 36 | 37 | def __init__(self, do_lower_case=True): 38 | """Constructs a BasicTokenizer. 39 | Args: 40 | do_lower_case: Whether to lower case the input. 41 | """ 42 | self.do_lower_case = do_lower_case 43 | 44 | def tokenize(self, text): 45 | """Tokenizes a piece of text.""" 46 | text = convert_to_unicode(text) 47 | text = self._clean_text(text) 48 | 49 | # This was added on November 1st, 2018 for the multilingual and Chinese 50 | # models. This is also applied to the English models now, but it doesn't 51 | # matter since the English models were not trained on any Chinese data 52 | # and generally don't have any Chinese data in them (there are Chinese 53 | # characters in the vocabulary because Wikipedia does have some Chinese 54 | # words in the English Wikipedia.). 55 | text = self._tokenize_chinese_chars(text) 56 | 57 | orig_tokens = whitespace_tokenize(text) 58 | split_tokens = [] 59 | for token in orig_tokens: 60 | if self.do_lower_case: 61 | token = token.lower() 62 | token = self._run_strip_accents(token) 63 | split_tokens.extend(self._run_split_on_punc(token)) 64 | 65 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 66 | return output_tokens 67 | 68 | def _run_strip_accents(self, text): 69 | """Strips accents from a piece of text.""" 70 | text = unicodedata.normalize("NFD", text) 71 | output = [] 72 | for char in text: 73 | cat = unicodedata.category(char) 74 | if cat == "Mn": 75 | continue 76 | output.append(char) 77 | return "".join(output) 78 | 79 | def _run_split_on_punc(self, text): 80 | """Splits punctuation on a piece of text.""" 81 | chars = list(text) 82 | i = 0 83 | start_new_word = True 84 | output = [] 85 | while i < len(chars): 86 | char = chars[i] 87 | if _is_punctuation(char): 88 | output.append([char]) 89 | start_new_word = True 90 | else: 91 | if start_new_word: 92 | output.append([]) 93 | start_new_word = False 94 | output[-1].append(char) 95 | i += 1 96 | 97 | return ["".join(x) for x in output] 98 | 99 | def _tokenize_chinese_chars(self, text): 100 | """Adds whitespace around any CJK character.""" 101 | output = [] 102 | for char in text: 103 | cp = ord(char) 104 | if self._is_chinese_char(cp): 105 | output.append(" ") 106 | output.append(char) 107 | output.append(" ") 108 | else: 109 | output.append(char) 110 | return "".join(output) 111 | 112 | def _is_chinese_char(self, cp): 113 | """Checks whether CP is the codepoint of a CJK character.""" 114 | # This defines a "chinese character" as anything in the CJK Unicode block: 115 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 116 | # 117 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 118 | # despite its name. The modern Korean Hangul alphabet is a different block, 119 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 120 | # space-separated words, so they are not treated specially and handled 121 | # like the all of the other languages. 
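# For example, ord('中') == 0x4E2D lies in the 0x4E00-0x9FFF block checked below,
# so _tokenize_chinese_chars wraps that character in spaces before whitespace splitting.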
122 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 123 | (cp >= 0x3400 and cp <= 0x4DBF) or # 124 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 125 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 126 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 127 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 128 | (cp >= 0xF900 and cp <= 0xFAFF) or # 129 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 130 | return True 131 | 132 | return False 133 | 134 | def _clean_text(self, text): 135 | """Performs invalid character removal and whitespace cleanup on text.""" 136 | output = [] 137 | for char in text: 138 | cp = ord(char) 139 | if cp == 0 or cp == 0xfffd or _is_control(char): 140 | continue 141 | if _is_whitespace(char): 142 | output.append(" ") 143 | else: 144 | output.append(char) 145 | return "".join(output) 146 | 147 | 148 | class WordpieceTokenizer(object): 149 | """Runs WordPiece tokenziation.""" 150 | 151 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 152 | self.vocab = vocab 153 | self.unk_token = unk_token 154 | self.max_input_chars_per_word = max_input_chars_per_word 155 | 156 | def tokenize(self, text): 157 | """Tokenizes a piece of text into its word pieces. 158 | This uses a greedy longest-match-first algorithm to perform tokenization 159 | using the given vocabulary. 160 | For example: 161 | input = "unaffable" 162 | output = ["un", "##aff", "##able"] 163 | Args: 164 | text: A single token or whitespace separated tokens. This should have 165 | already been passed through `BasicTokenizer. 166 | Returns: 167 | A list of wordpiece tokens. 168 | """ 169 | 170 | text = convert_to_unicode(text) 171 | 172 | output_tokens = [] 173 | for token in whitespace_tokenize(text): 174 | chars = list(token) 175 | if len(chars) > self.max_input_chars_per_word: 176 | output_tokens.append(self.unk_token) 177 | continue 178 | 179 | is_bad = False 180 | start = 0 181 | sub_tokens = [] 182 | while start < len(chars): 183 | end = len(chars) 184 | cur_substr = None 185 | while start < end: 186 | substr = "".join(chars[start:end]) 187 | if start > 0: 188 | substr = "##" + substr 189 | if substr in self.vocab: 190 | cur_substr = substr 191 | break 192 | end -= 1 193 | if cur_substr is None: 194 | is_bad = True 195 | break 196 | sub_tokens.append(cur_substr) 197 | start = end 198 | 199 | if is_bad: 200 | output_tokens.append(self.unk_token) 201 | else: 202 | output_tokens.extend(sub_tokens) 203 | return output_tokens 204 | 205 | 206 | class InputExample(object): 207 | 208 | def __init__(self, unique_id, text_a, text_b): 209 | self.unique_id = unique_id 210 | self.text_a = text_a 211 | self.text_b = text_b 212 | 213 | 214 | class InputFeatures(object): 215 | """A single set of features of data.""" 216 | 217 | def __init__(self, tokens, input_ids, input_mask, input_type_ids): 218 | # self.unique_id = unique_id 219 | self.tokens = tokens 220 | self.input_ids = input_ids 221 | self.input_mask = input_mask 222 | self.input_type_ids = input_type_ids 223 | 224 | 225 | def _is_whitespace(char): 226 | """Checks whether `chars` is a whitespace character.""" 227 | # \t, \n, and \r are technically contorl characters but we treat them 228 | # as whitespace since they are generally considered as such. 
229 | if char == " " or char == "\t" or char == "\n" or char == "\r": 230 | return True 231 | cat = unicodedata.category(char) 232 | if cat == "Zs": 233 | return True 234 | return False 235 | 236 | 237 | def _is_control(char): 238 | """Checks whether `chars` is a control character.""" 239 | # These are technically control characters but we count them as whitespace 240 | # characters. 241 | if char == "\t" or char == "\n" or char == "\r": 242 | return False 243 | cat = unicodedata.category(char) 244 | if cat.startswith("C"): 245 | return True 246 | return False 247 | 248 | 249 | def _is_punctuation(char): 250 | """Checks whether `chars` is a punctuation character.""" 251 | cp = ord(char) 252 | # We treat all non-letter/number ASCII as punctuation. 253 | # Characters such as "^", "$", and "`" are not in the Unicode 254 | # Punctuation class but we treat them as punctuation anyways, for 255 | # consistency. 256 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 257 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 258 | return True 259 | cat = unicodedata.category(char) 260 | if cat.startswith("P"): 261 | return True 262 | return False 263 | 264 | 265 | def convert_to_unicode(text): 266 | if isinstance(text, str): 267 | return text 268 | elif isinstance(text, bytes): 269 | return text.decode("utf-8", "ignore") 270 | else: 271 | raise ValueError("Unsupported string type: %s" % (type(text))) 272 | 273 | 274 | def printable_text(text): 275 | if isinstance(text, str): 276 | return text 277 | elif isinstance(text, bytes): 278 | return text.decode("utf-8", "ignore") 279 | else: 280 | raise ValueError("Unsupported string type: %s" % (type(text))) 281 | 282 | 283 | def load_vocab(vocab_file): 284 | """Loads a vocabulary file into a dictionary.""" 285 | vocab = collections.OrderedDict() 286 | index = 0 287 | with tf.gfile.GFile(vocab_file, "r") as reader: 288 | while True: 289 | token = convert_to_unicode(reader.readline()) 290 | if not token: 291 | break 292 | token = token.strip() 293 | vocab[token] = index 294 | index += 1 295 | return vocab 296 | 297 | 298 | def convert_by_vocab(vocab, items): 299 | """Converts a sequence of [tokens|ids] using the vocab.""" 300 | output = [] 301 | for item in items: 302 | output.append(vocab[item]) 303 | return output 304 | 305 | 306 | def convert_tokens_to_ids(vocab, tokens): 307 | return convert_by_vocab(vocab, tokens) 308 | 309 | 310 | def convert_ids_to_tokens(inv_vocab, ids): 311 | return convert_by_vocab(inv_vocab, ids) 312 | 313 | 314 | def whitespace_tokenize(text): 315 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 316 | text = text.strip() 317 | if not text: 318 | return [] 319 | tokens = text.split() 320 | return tokens 321 | 322 | 323 | def convert_lst_to_features(lst_str, max_seq_length, max_position_embeddings, 324 | tokenizer, is_tokenized=False, mask_cls_sep=False): 325 | """Loads a data file into a list of `InputBatch`s.""" 326 | 327 | examples = read_tokenized_examples(lst_str) if is_tokenized else read_examples(lst_str) 328 | 329 | _tokenize = lambda x: tokenizer.mark_unk_tokens(x) if is_tokenized else tokenizer.tokenize(x) 330 | 331 | all_tokens = [(_tokenize(ex.text_a), _tokenize(ex.text_b) if ex.text_b else []) for ex in examples] 332 | 333 | # user did not specify a meaningful sequence length 334 | # override the sequence length by the maximum seq length of the current batch 335 | if max_seq_length is None: 336 | max_seq_length = max(len(ta) + len(tb) for ta, tb in all_tokens) 337 | # add special 
tokens into account 338 | # case 1: Account for [CLS], tokens_a [SEP], tokens_b [SEP] -> 3 additional tokens 339 | # case 2: Account for [CLS], tokens_a [SEP] -> 2 additional tokens 340 | max_seq_length += 3 if any(len(tb) for _, tb in all_tokens) else 2 341 | max_seq_length = min(max_seq_length, max_position_embeddings) 342 | 343 | for (tokens_a, tokens_b) in all_tokens: 344 | if tokens_b: 345 | # Modifies `tokens_a` and `tokens_b` in place so that the total 346 | # length is less than the specified length. 347 | # Account for [CLS], [SEP], [SEP] with "- 3" 348 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 349 | else: 350 | # Account for [CLS] and [SEP] with "- 2" 351 | if len(tokens_a) > max_seq_length - 2: 352 | tokens_a = tokens_a[0:(max_seq_length - 2)] 353 | 354 | # The convention in BERT is: 355 | # (a) For sequence pairs: 356 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 357 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 358 | # (b) For single sequences: 359 | # tokens: [CLS] the dog is hairy . [SEP] 360 | # type_ids: 0 0 0 0 0 0 0 361 | # 362 | # Where "type_ids" are used to indicate whether this is the first 363 | # sequence or the second sequence. The embedding vectors for `type=0` and 364 | # `type=1` were learned during pre-training and are added to the wordpiece 365 | # embedding vector (and position vector). This is not *strictly* necessary 366 | # since the [SEP] token unambiguously separates the sequences, but it makes 367 | # it easier for the model to learn the concept of sequences. 368 | # 369 | # For classification tasks, the first vector (corresponding to [CLS]) is 370 | # used as as the "sentence vector". Note that this only makes sense because 371 | # the entire model is fine-tuned. 372 | tokens = ['[CLS]'] + tokens_a + ['[SEP]'] 373 | input_type_ids = [0] * len(tokens) 374 | input_mask = [int(not mask_cls_sep)] + [1] * len(tokens_a) + [int(not mask_cls_sep)] 375 | 376 | if tokens_b: 377 | tokens += tokens_b + ['[SEP]'] 378 | input_type_ids += [1] * (len(tokens_b) + 1) 379 | input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)] 380 | 381 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 382 | 383 | # Zero-pad up to the sequence length. more pythonic 384 | pad_len = max_seq_length - len(input_ids) 385 | input_ids += [0] * pad_len 386 | input_mask += [0] * pad_len 387 | input_type_ids += [0] * pad_len 388 | 389 | assert len(input_ids) == max_seq_length 390 | assert len(input_mask) == max_seq_length 391 | assert len(input_type_ids) == max_seq_length 392 | 393 | yield InputFeatures( 394 | # unique_id=example.unique_id, 395 | tokens=tokens, 396 | input_ids=input_ids, 397 | input_mask=input_mask, 398 | input_type_ids=input_type_ids) 399 | 400 | 401 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 402 | """Truncates a sequence pair in place to the maximum length.""" 403 | 404 | # This is a simple heuristic which will always truncate the longer sequence 405 | # one token at a time. This makes more sense than truncating an equal percent 406 | # of tokens from each, since if one sequence is very short then each token 407 | # that's truncated likely contains more information than a longer sequence. 
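# Worked example: with max_length=8, len(tokens_a)=6 and len(tokens_b)=5, the loop
# pops from the longer list (ties pop from tokens_b) until both hold 4 tokens.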
408 | while True: 409 | total_length = len(tokens_a) + len(tokens_b) 410 | if total_length <= max_length: 411 | break 412 | if len(tokens_a) > len(tokens_b): 413 | tokens_a.pop() 414 | else: 415 | tokens_b.pop() 416 | 417 | 418 | def read_examples(lst_strs): 419 | """Read a list of `InputExample`s from a list of strings.""" 420 | unique_id = 0 421 | for ss in lst_strs: 422 | line = convert_to_unicode(ss) 423 | if not line: 424 | continue 425 | line = line.strip() 426 | text_a = None 427 | text_b = None 428 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 429 | if m is None: 430 | text_a = line 431 | else: 432 | text_a = m.group(1) 433 | text_b = m.group(2) 434 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 435 | unique_id += 1 436 | 437 | 438 | def read_tokenized_examples(lst_strs): 439 | unique_id = 0 440 | lst_strs = [[convert_to_unicode(w) for w in s] for s in lst_strs] 441 | for ss in lst_strs: 442 | text_a = ss 443 | text_b = None 444 | try: 445 | j = ss.index('|||') 446 | text_a = ss[:j] 447 | text_b = ss[(j + 1):] 448 | except ValueError: 449 | pass 450 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 451 | unique_id += 1 452 | 453 | def stub_preprocessor(text): 454 | return text 455 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/finetuning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/bert_experimental/bert_experimental/finetuning/__init__.py -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/finetuning/bert_layer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | import tensorflow_hub as hub 4 | 5 | from .text_preprocessing import build_preprocessor 6 | 7 | 8 | class BertLayer(tf.keras.layers.Layer): 9 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3, 10 | pooling="cls", do_preprocessing=True, verbose=False, 11 | tune_embeddings=False, trainable=True, use_layers=None, 12 | as_dict=False, **kwargs): 13 | 14 | self.trainable = trainable 15 | self.n_tune_layers = n_tune_layers 16 | self.tune_embeddings = tune_embeddings 17 | self.do_preprocessing = do_preprocessing 18 | 19 | self.as_dict = as_dict 20 | self.verbose = verbose 21 | self.seq_len = seq_len 22 | self.pooling = pooling 23 | self.bert_path = bert_path 24 | self.use_layers = use_layers 25 | 26 | self.var_per_encoder = 16 27 | if self.pooling not in ["cls", "mean", "sqrt_mean", None]: 28 | raise NameError( 29 | f"Undefined pooling type (must be either 'cls', 'mean', 'sqrt_mean' or None, but is {self.pooling}" 30 | ) 31 | 32 | super(BertLayer, self).__init__(**kwargs) 33 | 34 | def build(self, input_shape): 35 | 36 | self.bert = hub.Module(self.build_abspath(self.bert_path), 37 | trainable=self.trainable, name=f"{self.name}_module") 38 | 39 | trainable_layers = [] 40 | if self.tune_embeddings: 41 | trainable_layers.append("embeddings") 42 | 43 | if self.pooling == "cls": 44 | trainable_layers.append("pooler") 45 | 46 | if self.n_tune_layers > 0: 47 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name] 48 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder) 49 | if self.use_layers: 50 | n_encoder_layers = 
min(self.use_layers, n_encoder_layers) 51 | for i in range(self.n_tune_layers): 52 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/") 53 | 54 | # Add module variables to layer's trainable weights 55 | for var in self.bert.variables: 56 | if any([l in var.name for l in trainable_layers]): 57 | self._trainable_weights.append(var) 58 | else: 59 | self._non_trainable_weights.append(var) 60 | 61 | if self.verbose: 62 | print("*** TRAINABLE VARS *** ") 63 | for var in self._trainable_weights: 64 | print(var) 65 | 66 | self.build_preprocessor() 67 | self.initialize_module() 68 | 69 | super(BertLayer, self).build(input_shape) 70 | 71 | def build_abspath(self, path): 72 | if path.startswith("https://") or path.startswith("gs://"): 73 | return path 74 | else: 75 | return os.path.abspath(path) 76 | 77 | def build_preprocessor(self): 78 | sess = tf.compat.v1.keras.backend.get_session() 79 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True) 80 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 81 | tokenization_info["do_lower_case"]]) 82 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case) 83 | 84 | def initialize_module(self): 85 | sess = tf.compat.v1.keras.backend.get_session() 86 | 87 | vars_initialized = sess.run([tf.compat.v1.is_variable_initialized(var) 88 | for var in self.bert.variables]) 89 | 90 | uninitialized = [] 91 | for var, is_initialized in zip(self.bert.variables, vars_initialized): 92 | if not is_initialized: 93 | uninitialized.append(var) 94 | 95 | if len(uninitialized): 96 | sess.run(tf.compat.v1.variables_initializer(uninitialized)) 97 | 98 | def call(self, input): 99 | 100 | if self.do_preprocessing: 101 | input = tf.numpy_function(self.preprocessor, 102 | [input], [tf.int32, tf.int32, tf.int32], 103 | name='preprocessor') 104 | for feature in input: 105 | feature.set_shape((None, self.seq_len)) 106 | 107 | input_ids, input_mask, segment_ids = input 108 | 109 | bert_inputs = dict( 110 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids 111 | ) 112 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True) 113 | 114 | input_mask = tf.cast(input_mask, tf.float32) 115 | 116 | seq_output = output["sequence_output"] 117 | tok_output = mul_mask(output.get("token_output", seq_output), input_mask) 118 | 119 | if self.pooling == "cls": 120 | pooled = output["pooled_output"] 121 | else: 122 | if self.pooling == "mean": 123 | pooled = masked_reduce_mean(seq_output, input_mask) 124 | 125 | elif self.pooling == "sqrt_mean": 126 | pooled = masked_reduce_sqrt_mean(seq_output, input_mask) 127 | 128 | else: 129 | pooled = mul_mask(seq_output, input_mask) 130 | 131 | if self.as_dict: 132 | output = { 133 | "sequence_output": seq_output, 134 | "pooled_output": pooled, 135 | "token_output": tok_output 136 | } 137 | else: 138 | output = pooled 139 | 140 | return output 141 | 142 | def get_config(self): 143 | config_dict = { 144 | "bert_path": self.bert_path, 145 | "seq_len": self.seq_len, 146 | "pooling": self.pooling, 147 | "n_tune_layers": self.n_tune_layers, 148 | "tune_embeddings": self.tune_embeddings, 149 | "do_preprocessing": self.do_preprocessing, 150 | "use_layers": self.use_layers, 151 | "trainable": self.trainable, 152 | "as_dict": self.as_dict, 153 | "verbose": self.verbose 154 | } 155 | super(BertLayer, self).get_config() 156 | return config_dict 157 | 158 | 159 | class StatefulBertLayer(tf.keras.layers.Layer): 160 | def __init__(self, bert_path, 
seq_len=64, n_tune_layers=3, 161 | pooling="cls", do_preprocessing=True, verbose=False, 162 | tune_embeddings=False, trainable=True, use_layers=None, 163 | as_dict=False, **kwargs): 164 | 165 | self.trainable = trainable 166 | self.n_tune_layers = n_tune_layers 167 | self.tune_embeddings = tune_embeddings 168 | self.do_preprocessing = do_preprocessing 169 | 170 | self.as_dict = as_dict 171 | self.verbose = verbose 172 | self.seq_len = seq_len 173 | self.pooling = pooling 174 | self.bert_path = bert_path 175 | self.use_layers = use_layers 176 | 177 | self.var_per_encoder = 16 178 | if self.pooling not in ["cls", "mean", "sqrt_mean", None]: 179 | raise NameError( 180 | f"Undefined pooling type (must be either 'cls', 'mean', 'sqrt_mean' or None, but is {self.pooling}" 181 | ) 182 | 183 | super(StatefulBertLayer, self).__init__(**kwargs) 184 | 185 | def build(self, input_shape): 186 | 187 | self.bert = hub.Module(self.build_abspath(self.bert_path), 188 | trainable=self.trainable, name=f"{self.name}_module") 189 | 190 | trainable_layers = [] 191 | if self.tune_embeddings: 192 | trainable_layers.append("embeddings") 193 | 194 | if self.pooling == "cls": 195 | trainable_layers.append("pooler") 196 | 197 | if self.n_tune_layers > 0: 198 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name] 199 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder) 200 | if self.use_layers: 201 | n_encoder_layers = min(self.use_layers, n_encoder_layers) 202 | for i in range(self.n_tune_layers): 203 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/") 204 | 205 | # Add module variables to layer's trainable weights 206 | for var in self.bert.variables: 207 | if any([l in var.name for l in trainable_layers]): 208 | self._trainable_weights.append(var) 209 | else: 210 | self._non_trainable_weights.append(var) 211 | 212 | if self.verbose: 213 | print("*** TRAINABLE VARS *** ") 214 | for var in self._trainable_weights: 215 | print(var) 216 | 217 | self.build_preprocessor() 218 | self.initialize_module() 219 | 220 | super(StatefulBertLayer, self).build(input_shape) 221 | 222 | def build_abspath(self, path): 223 | if path.startswith("https://") or path.startswith("gs://"): 224 | return path 225 | else: 226 | return os.path.abspath(path) 227 | 228 | def build_preprocessor(self): 229 | sess = tf.compat.v1.keras.backend.get_session() 230 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True) 231 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], 232 | tokenization_info["do_lower_case"]]) 233 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case) 234 | 235 | def initialize_module(self): 236 | sess = tf.compat.v1.keras.backend.get_session() 237 | 238 | vars_initialized = sess.run([tf.compat.v1.is_variable_initialized(var) 239 | for var in self.bert.variables]) 240 | 241 | uninitialized = [] 242 | for var, is_initialized in zip(self.bert.variables, vars_initialized): 243 | if not is_initialized: 244 | uninitialized.append(var) 245 | 246 | if len(uninitialized): 247 | sess.run(tf.compat.v1.variables_initializer(uninitialized)) 248 | 249 | def call(self, input): 250 | 251 | if self.do_preprocessing: 252 | input_text, input_state = input 253 | 254 | preprocessed_text = tf.numpy_function( 255 | self.preprocessor, [input_text], 256 | [tf.int32, tf.int32, tf.int32], 257 | name='preprocessor') 258 | for feature in preprocessed_text: 259 | feature.set_shape((None, self.seq_len)) 260 | 
input_ids, input_mask, segment_ids = preprocessed_text 261 | 262 | else: 263 | input_ids, input_mask, segment_ids, input_state = input 264 | 265 | bert_inputs = dict( 266 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, input_state=input_state 267 | ) 268 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True) 269 | 270 | input_mask = tf.cast(input_mask, tf.float32) 271 | 272 | seq_output = output["sequence_output"] 273 | tok_output = mul_mask(output.get("token_output", seq_output), input_mask) 274 | 275 | if self.pooling == "cls": 276 | pooled = output["pooled_output"] 277 | else: 278 | if self.pooling == "mean": 279 | pooled = masked_reduce_mean(seq_output, input_mask) 280 | 281 | elif self.pooling == "sqrt_mean": 282 | pooled = masked_reduce_sqrt_mean(seq_output, input_mask) 283 | 284 | else: 285 | pooled = mul_mask(seq_output, input_mask) 286 | 287 | if self.as_dict: 288 | output["pooled_output"] = pooled 289 | else: 290 | output = pooled 291 | 292 | return output 293 | 294 | def get_config(self): 295 | config_dict = { 296 | "bert_path": self.bert_path, 297 | "seq_len": self.seq_len, 298 | "pooling": self.pooling, 299 | "n_tune_layers": self.n_tune_layers, 300 | "tune_embeddings": self.tune_embeddings, 301 | "do_preprocessing": self.do_preprocessing, 302 | "use_layers": self.use_layers, 303 | "trainable": self.trainable, 304 | "as_dict": self.as_dict, 305 | "verbose": self.verbose 306 | } 307 | super(StatefulBertLayer, self).get_config() 308 | return config_dict 309 | 310 | def mul_mask(x, m): 311 | return x * tf.expand_dims(m, axis=-1) 312 | 313 | def masked_reduce_mean(x, m): 314 | return tf.reduce_sum(mul_mask(x, m), axis=1) / ( 315 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10) 316 | 317 | def masked_reduce_sqrt_mean(x, m): 318 | return tf.reduce_sum(mul_mask(x, m), axis=1) / ( 319 | tf.sqrt(tf.reduce_sum(m, axis=1, keepdims=True)) + 1e-10) 320 | 321 | 322 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/finetuning/graph_ops.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from tensorflow.python.framework.graph_util import convert_variables_to_constants 4 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference 5 | 6 | 7 | def load_graph(frozen_graph_filename): 8 | with tf.io.gfile.GFile(frozen_graph_filename, "rb") as f: 9 | graph_def = tf.compat.v1.GraphDef() 10 | graph_def.ParseFromString(f.read()) 11 | 12 | with tf.Graph().as_default() as graph: 13 | tf.import_graph_def(graph_def) 14 | return graph 15 | 16 | 17 | ### UPD old version to tf2/working with tf 1.x 18 | def freeze_keras_model(model, export_path=None, clear_devices=True): 19 | """ 20 | Freezes the state of a session into a pruned computation graph. 21 | 22 | @param model The Keras model to be optimized for inference. 23 | @param clear_devices Remove the device directives from the graph for better portability. 24 | @return The frozen graph definition. 
25 | """ 26 | from tensorflow.compat.v1.graph_util import convert_variables_to_constants 27 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference 28 | 29 | session = tf.compat.v1.keras.backend.get_session() 30 | graph = session.graph 31 | 32 | with graph.as_default(): 33 | 34 | input_tensors = model.inputs 35 | output_tensors = model.outputs 36 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors] 37 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors] 38 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors] 39 | 40 | tmp_g = graph.as_graph_def() 41 | if clear_devices: 42 | for node in tmp_g.node: 43 | node.device = "" 44 | 45 | tmp_g = optimize_for_inference( 46 | tmp_g, input_ops, output_ops, dtypes, False) 47 | 48 | tmp_g = convert_variables_to_constants(session, tmp_g, output_ops) 49 | 50 | if export_path is not None: 51 | with tf.io.gfile.GFile(export_path, "wb") as f: 52 | f.write(tmp_g.SerializeToString()) 53 | 54 | return tmp_g 55 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/bert_experimental/finetuning/text_preprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import tensorflow as tf 3 | import numpy as np 4 | import collections 5 | import unicodedata 6 | 7 | 8 | class FullTokenizer(object): 9 | """Runs end-to-end tokenziation.""" 10 | 11 | def __init__(self, vocab_file, do_lower_case=True): 12 | self.vocab = load_vocab(vocab_file) 13 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 14 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 15 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 16 | 17 | def tokenize(self, text): 18 | split_tokens = [] 19 | for token in self.basic_tokenizer.tokenize(text): 20 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 21 | split_tokens.append(sub_token) 22 | 23 | return split_tokens 24 | 25 | def convert_tokens_to_ids(self, tokens): 26 | return convert_by_vocab(self.vocab, tokens) 27 | 28 | def convert_ids_to_tokens(self, ids): 29 | return convert_by_vocab(self.inv_vocab, ids) 30 | 31 | def mark_unk_tokens(self, tokens, unk_token='[UNK]'): 32 | return [t if t in self.vocab else unk_token for t in tokens] 33 | 34 | 35 | class BasicTokenizer(object): 36 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 37 | 38 | def __init__(self, do_lower_case=True): 39 | """Constructs a BasicTokenizer. 40 | Args: 41 | do_lower_case: Whether to lower case the input. 42 | """ 43 | self.do_lower_case = do_lower_case 44 | 45 | def tokenize(self, text): 46 | """Tokenizes a piece of text.""" 47 | text = convert_to_unicode(text) 48 | text = self._clean_text(text) 49 | 50 | # This was added on November 1st, 2018 for the multilingual and Chinese 51 | # models. This is also applied to the English models now, but it doesn't 52 | # matter since the English models were not trained on any Chinese data 53 | # and generally don't have any Chinese data in them (there are Chinese 54 | # characters in the vocabulary because Wikipedia does have some Chinese 55 | # words in the English Wikipedia.). 
56 | text = self._tokenize_chinese_chars(text) 57 | 58 | orig_tokens = whitespace_tokenize(text) 59 | split_tokens = [] 60 | for token in orig_tokens: 61 | if self.do_lower_case: 62 | token = token.lower() 63 | token = self._run_strip_accents(token) 64 | split_tokens.extend(self._run_split_on_punc(token)) 65 | 66 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 67 | return output_tokens 68 | 69 | def _run_strip_accents(self, text): 70 | """Strips accents from a piece of text.""" 71 | text = unicodedata.normalize("NFD", text) 72 | output = [] 73 | for char in text: 74 | cat = unicodedata.category(char) 75 | if cat == "Mn": 76 | continue 77 | output.append(char) 78 | return "".join(output) 79 | 80 | def _run_split_on_punc(self, text): 81 | """Splits punctuation on a piece of text.""" 82 | chars = list(text) 83 | i = 0 84 | start_new_word = True 85 | output = [] 86 | while i < len(chars): 87 | char = chars[i] 88 | if _is_punctuation(char): 89 | output.append([char]) 90 | start_new_word = True 91 | else: 92 | if start_new_word: 93 | output.append([]) 94 | start_new_word = False 95 | output[-1].append(char) 96 | i += 1 97 | 98 | return ["".join(x) for x in output] 99 | 100 | def _tokenize_chinese_chars(self, text): 101 | """Adds whitespace around any CJK character.""" 102 | output = [] 103 | for char in text: 104 | cp = ord(char) 105 | if self._is_chinese_char(cp): 106 | output.append(" ") 107 | output.append(char) 108 | output.append(" ") 109 | else: 110 | output.append(char) 111 | return "".join(output) 112 | 113 | def _is_chinese_char(self, cp): 114 | """Checks whether CP is the codepoint of a CJK character.""" 115 | # This defines a "chinese character" as anything in the CJK Unicode block: 116 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 117 | # 118 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 119 | # despite its name. The modern Korean Hangul alphabet is a different block, 120 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 121 | # space-separated words, so they are not treated specially and handled 122 | # like the all of the other languages. 123 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 124 | (cp >= 0x3400 and cp <= 0x4DBF) or # 125 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 126 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 127 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 128 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 129 | (cp >= 0xF900 and cp <= 0xFAFF) or # 130 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 131 | return True 132 | 133 | return False 134 | 135 | def _clean_text(self, text): 136 | """Performs invalid character removal and whitespace cleanup on text.""" 137 | output = [] 138 | for char in text: 139 | cp = ord(char) 140 | if cp == 0 or cp == 0xfffd or _is_control(char): 141 | continue 142 | if _is_whitespace(char): 143 | output.append(" ") 144 | else: 145 | output.append(char) 146 | return "".join(output) 147 | 148 | 149 | class WordpieceTokenizer(object): 150 | """Runs WordPiece tokenziation.""" 151 | 152 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 153 | self.vocab = vocab 154 | self.unk_token = unk_token 155 | self.max_input_chars_per_word = max_input_chars_per_word 156 | 157 | def tokenize(self, text): 158 | """Tokenizes a piece of text into its word pieces. 159 | This uses a greedy longest-match-first algorithm to perform tokenization 160 | using the given vocabulary. 
161 | For example: 162 | input = "unaffable" 163 | output = ["un", "##aff", "##able"] 164 | Args: 165 | text: A single token or whitespace separated tokens. This should have 166 | already been passed through `BasicTokenizer. 167 | Returns: 168 | A list of wordpiece tokens. 169 | """ 170 | 171 | text = convert_to_unicode(text) 172 | 173 | output_tokens = [] 174 | for token in whitespace_tokenize(text): 175 | chars = list(token) 176 | if len(chars) > self.max_input_chars_per_word: 177 | output_tokens.append(self.unk_token) 178 | continue 179 | 180 | is_bad = False 181 | start = 0 182 | sub_tokens = [] 183 | while start < len(chars): 184 | end = len(chars) 185 | cur_substr = None 186 | while start < end: 187 | substr = "".join(chars[start:end]) 188 | if start > 0: 189 | substr = "##" + substr 190 | if substr in self.vocab: 191 | cur_substr = substr 192 | break 193 | end -= 1 194 | if cur_substr is None: 195 | is_bad = True 196 | break 197 | sub_tokens.append(cur_substr) 198 | start = end 199 | 200 | if is_bad: 201 | output_tokens.append(self.unk_token) 202 | else: 203 | output_tokens.extend(sub_tokens) 204 | return output_tokens 205 | 206 | 207 | class InputExample(object): 208 | 209 | def __init__(self, unique_id, text_a, text_b): 210 | self.unique_id = unique_id 211 | self.text_a = text_a 212 | self.text_b = text_b 213 | 214 | 215 | class InputFeatures(object): 216 | """A single set of features of data.""" 217 | 218 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 219 | self.unique_id = unique_id 220 | self.tokens = tokens 221 | self.input_ids = input_ids 222 | self.input_mask = input_mask 223 | self.input_type_ids = input_type_ids 224 | 225 | 226 | def _is_whitespace(char): 227 | """Checks whether `chars` is a whitespace character.""" 228 | # \t, \n, and \r are technically contorl characters but we treat them 229 | # as whitespace since they are generally considered as such. 230 | if char == " " or char == "\t" or char == "\n" or char == "\r": 231 | return True 232 | cat = unicodedata.category(char) 233 | if cat == "Zs": 234 | return True 235 | return False 236 | 237 | 238 | def _is_control(char): 239 | """Checks whether `chars` is a control character.""" 240 | # These are technically control characters but we count them as whitespace 241 | # characters. 242 | if char == "\t" or char == "\n" or char == "\r": 243 | return False 244 | cat = unicodedata.category(char) 245 | if cat.startswith("C"): 246 | return True 247 | return False 248 | 249 | 250 | def _is_punctuation(char): 251 | """Checks whether `chars` is a punctuation character.""" 252 | cp = ord(char) 253 | # We treat all non-letter/number ASCII as punctuation. 254 | # Characters such as "^", "$", and "`" are not in the Unicode 255 | # Punctuation class but we treat them as punctuation anyways, for 256 | # consistency. 
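  # For reference, the ASCII ranges checked below correspond to:
  #   33-47   -> ! " # $ % & ' ( ) * + , - . /
  #   58-64   -> : ; < = > ? @
  #   91-96   -> [ \ ] ^ _ `
  #   123-126 -> { | } ~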
257 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 258 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 259 | return True 260 | cat = unicodedata.category(char) 261 | if cat.startswith("P"): 262 | return True 263 | return False 264 | 265 | 266 | def convert_to_unicode(text): 267 | if isinstance(text, str): 268 | return text 269 | elif isinstance(text, bytes): 270 | return text.decode("utf-8", "ignore") 271 | else: 272 | raise ValueError("Unsupported string type: %s" % (type(text))) 273 | 274 | 275 | def printable_text(text): 276 | if isinstance(text, str): 277 | return text 278 | elif isinstance(text, bytes): 279 | return text.decode("utf-8", "ignore") 280 | else: 281 | raise ValueError("Unsupported string type: %s" % (type(text))) 282 | 283 | 284 | def load_vocab(vocab_file): 285 | """Loads a vocabulary file into a dictionary.""" 286 | vocab = collections.OrderedDict() 287 | index = 0 288 | with tf.io.gfile.GFile(vocab_file, "r") as reader: 289 | while True: 290 | token = convert_to_unicode(reader.readline()) 291 | if not token: 292 | break 293 | token = token.strip() 294 | vocab[token] = index 295 | index += 1 296 | return vocab 297 | 298 | 299 | def convert_by_vocab(vocab, items): 300 | """Converts a sequence of [tokens|ids] using the vocab.""" 301 | output = [] 302 | for item in items: 303 | output.append(vocab[item]) 304 | return output 305 | 306 | 307 | def convert_tokens_to_ids(vocab, tokens): 308 | return convert_by_vocab(vocab, tokens) 309 | 310 | 311 | def convert_ids_to_tokens(inv_vocab, ids): 312 | return convert_by_vocab(inv_vocab, ids) 313 | 314 | 315 | def whitespace_tokenize(text): 316 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 317 | text = text.strip() 318 | if not text: 319 | return [] 320 | tokens = text.split() 321 | return tokens 322 | 323 | 324 | def convert_examples_to_features(examples, seq_length, tokenizer): 325 | """Loads a data file into a list of `InputBatch`s.""" 326 | 327 | features = [] 328 | for (ex_index, example) in enumerate(examples): 329 | tokens_a = tokenizer.tokenize(example.text_a) 330 | 331 | tokens_b = None 332 | if example.text_b: 333 | tokens_b = tokenizer.tokenize(example.text_b) 334 | 335 | if tokens_b: 336 | # Modifies `tokens_a` and `tokens_b` in place so that the total 337 | # length is less than the specified length. 338 | # Account for [CLS], [SEP], [SEP] with "- 3" 339 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 340 | else: 341 | # Account for [CLS] and [SEP] with "- 2" 342 | if len(tokens_a) > seq_length - 2: 343 | tokens_a = tokens_a[0:(seq_length - 2)] 344 | 345 | # The convention in BERT is: 346 | # (a) For sequence pairs: 347 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 348 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 349 | # (b) For single sequences: 350 | # tokens: [CLS] the dog is hairy . [SEP] 351 | # type_ids: 0 0 0 0 0 0 0 352 | # 353 | # Where "type_ids" are used to indicate whether this is the first 354 | # sequence or the second sequence. The embedding vectors for `type=0` and 355 | # `type=1` were learned during pre-training and are added to the wordpiece 356 | # embedding vector (and position vector). This is not *strictly* necessary 357 | # since the [SEP] token unambiguously separates the sequences, but it makes 358 | # it easier for the model to learn the concept of sequences. 359 | # 360 | # For classification tasks, the first vector (corresponding to [CLS]) is 361 | # used as as the "sentence vector". 
Note that this only makes sense because 362 | # the entire model is fine-tuned. 363 | tokens = [] 364 | input_type_ids = [] 365 | tokens.append("[CLS]") 366 | input_type_ids.append(0) 367 | for token in tokens_a: 368 | tokens.append(token) 369 | input_type_ids.append(0) 370 | tokens.append("[SEP]") 371 | input_type_ids.append(0) 372 | 373 | if tokens_b: 374 | for token in tokens_b: 375 | tokens.append(token) 376 | input_type_ids.append(1) 377 | tokens.append("[SEP]") 378 | input_type_ids.append(1) 379 | 380 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 381 | 382 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 383 | # tokens are attended to. 384 | input_mask = [1] * len(input_ids) 385 | 386 | # Zero-pad up to the sequence length. 387 | while len(input_ids) < seq_length: 388 | input_ids.append(0) 389 | input_mask.append(0) 390 | input_type_ids.append(0) 391 | 392 | assert len(input_ids) == seq_length 393 | assert len(input_mask) == seq_length 394 | assert len(input_type_ids) == seq_length 395 | 396 | features.append( 397 | InputFeatures( 398 | unique_id=example.unique_id, 399 | tokens=tokens, 400 | input_ids=input_ids, 401 | input_mask=input_mask, 402 | input_type_ids=input_type_ids)) 403 | return features 404 | 405 | 406 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 407 | """Truncates a sequence pair in place to the maximum length.""" 408 | 409 | # This is a simple heuristic which will always truncate the longer sequence 410 | # one token at a time. This makes more sense than truncating an equal percent 411 | # of tokens from each, since if one sequence is very short then each token 412 | # that's truncated likely contains more information than a longer sequence. 413 | while True: 414 | total_length = len(tokens_a) + len(tokens_b) 415 | if total_length <= max_length: 416 | break 417 | if len(tokens_a) > len(tokens_b): 418 | tokens_a.pop() 419 | else: 420 | tokens_b.pop() 421 | 422 | 423 | def read_examples(str_list): 424 | """Read a list of `InputExample`s from a list of strings.""" 425 | unique_id = 0 426 | for s in str_list: 427 | line = convert_to_unicode(s) 428 | line = line.strip() 429 | text_a = None 430 | text_b = None 431 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 432 | if m is None: 433 | text_a = line 434 | else: 435 | text_a = m.group(1) 436 | text_b = m.group(2) 437 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 438 | unique_id += 1 439 | 440 | 441 | def features_to_arrays(features): 442 | all_input_ids = [] 443 | all_input_mask = [] 444 | all_segment_ids = [] 445 | 446 | for feature in features: 447 | all_input_ids.append(feature.input_ids) 448 | all_input_mask.append(feature.input_mask) 449 | all_segment_ids.append(feature.input_type_ids) 450 | 451 | return (np.array(all_input_ids, dtype='int32'), 452 | np.array(all_input_mask, dtype='int32'), 453 | np.array(all_segment_ids, dtype='int32')) 454 | 455 | 456 | def build_preprocessor(voc_path, seq_len, lower=True): 457 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower) 458 | EMPTY_STR = "" 459 | PAD_STR = "pad" 460 | NULL_VAL = 0 461 | 462 | def strings_to_arrays(str_list): 463 | str_list = np.atleast_1d(str_list).reshape((-1,)) 464 | 465 | empty_id = (str_list == EMPTY_STR).nonzero()[0] 466 | str_list[empty_id] = PAD_STR 467 | 468 | examples = [] 469 | for example in read_examples(str_list): 470 | examples.append(example) 471 | 472 | features = convert_examples_to_features(examples, seq_len, tokenizer) 473 | arrays = 
features_to_arrays(features) 474 | 475 | for arr in arrays: 476 | arr[empty_id] = NULL_VAL 477 | str_list[empty_id] = EMPTY_STR 478 | return arrays 479 | 480 | return strings_to_arrays 481 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.4 2 | tensorflow-hub==0.7.0 3 | -------------------------------------------------------------------------------- /BERT-CNN/bert_experimental/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | __version__ = '1.0.4' 4 | 5 | setup( 6 | name='bert_experimental', 7 | version=__version__, 8 | description='Utilities for finetuning BERT-like models', 9 | url='https://github.com/gaphex/bert_experimental', 10 | long_description=open('README.md', 'r', encoding="utf8").read(), 11 | long_description_content_type='text/markdown', 12 | author='Denis Antyukhov', 13 | author_email='gaphex@gmail.com', 14 | license='MIT', 15 | packages=find_packages(), 16 | zip_safe=False, 17 | install_requires=[ 18 | 'tensorflow>=1.15, <2.0', 19 | 'tensorflow-hub==0.7.0', 20 | 'numpy' 21 | ], 22 | classifiers=( 23 | 'Programming Language :: Python :: 3.7', 24 | 'License :: OSI Approved :: MIT License', 25 | 'Operating System :: OS Independent', 26 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 27 | ), 28 | keywords='bert nlp tensorflow machine learning sentence encoding embedding finetuning', 29 | ) 30 | -------------------------------------------------------------------------------- /BERT-CNN/data/read.me: -------------------------------------------------------------------------------- 1 | put the train.tsv file here 2 | -------------------------------------------------------------------------------- /BERT-CNN/data/test.tsv: -------------------------------------------------------------------------------- 1 | id visual caption 2 | 0 standard poodle shopping cart footwear a close up of a dog laying in a basket 3 | 1 street sign traffic light tower a black and white photo of a street light 4 | 2 toilet seat a white toilet with its seat up in a bathroom 5 | 3 mobile home studio couch house a living room filled with furniture and a coffee table 6 | 4 french loaf conch person a basket filled with sandwiches on top of a table 7 | 5 indian elephant a group of people riding on the back of an elephant 8 | 6 bow tie windsor glasses a man wearing glasses and a tie in a room 9 | 7 sombrero bonnet woman a woman standing in front of a giant cake 10 | 8 diaper bassinet human a baby sitting in front of a giant cake 11 | 9 bobsled go-kart human a group of children sitting around a piece of luggage 12 | 10 vase spotlight plant a bunch of flowers that are in a vase 13 | -------------------------------------------------------------------------------- /BERT-CNN/data/train.tsv: -------------------------------------------------------------------------------- 1 | id id1 id2 visual caption is_related 2 | 220740 220741 220742 marimba dalmatian picket fence a horse jumping competition is going on with people in the stands 1 3 | 385729 385730 385731 dishwasher microwave barber chair a person riding a horse on a dirt ground 0 4 | 59422 59423 59424 laptop carton comicbook a laptop that has stickers on its cover is sitting on a table 1 5 | 46638 46639 46640 suit Windsortie woodenspoon a young bow wearing a pink shirt and a purple 
tie 1 6 | 11870 11871 11872 studiocouch four-poster quilt a couple of girls sitting in a bed in a bedroom 1 7 | 471676 471677 471678 streetcar fire engine passenger car a multi layer plate with cakes and food on it 0 8 | 186795 186796 186797 shoe shop television monitor a man playing a wii on a large projector screen 1 9 | 121836 121837 121838 ox water buffalo alp cattle standing on a hill in fog 1 10 | 396224 396225 396226 altar desk perfume oranges sitting in a blue bowl on a wooden table 0 11 | 430635 430636 430637 speedboat paddle lifeboat pots and other items sit on a stove and counter 0 12 | 145057 145058 145059 shopping cart ashcan park bench a coin meter that is laying down on grates 1 13 | 409778 409779 409780 web site fire engine comic book a painting of a man from the back 0 14 | 155568 155569 155570 grocery store patio restaurant a man and woman walking up the stairs in a backyard 1 15 | 213951 213952 213953 microwave washer dining table the kitchen is equipped with all the latest appliances 1 16 | 489266 489267 489268 traffic light aircraft carrier chain saw a laptop computer on a desk with cables a mug and bowl 0 17 | 257649 257650 257651 grocery store confectionery shopping basket a couple of wooden tale stopped with fresh fruit 1 18 | 113826 113827 113828 lab coat vestment West Highland white terrier a group of people standing in rows with frisbees for a photo 1 19 | 486413 486414 486415 snorkel ski tennis ball two frames of a woman in the air on a tennis court 0 20 | 400432 400433 400434 crutch lawn mower chain saw eight underneath on ambarella in the forest parrot 0 21 | 341153 341154 341155 washer microwave dishwasher a small propeller plane sitting underneath a covering at an airport 0 22 | 462067 462068 462069 ballplayer baseball scoreboard a plate full of bright green lettuce next to some bread 0 23 | 443392 443393 443394 grocery store pineapple pizza a man in black and white stripes with makeup smiling 0 24 | 486660 486661 486662 wombat wallaby titi a persons shadow on the ground of them skateboarding 0 25 | 336616 336617 336618 moped motor scooter crash helmet multiple street signs are attached to the post 0 26 | 124199 124200 124201 sorrel hog barrel a brown horse eating from a hallowed out metal barrel 1 27 | 238004 238005 238006 tray washbasin cradle a cat laying on a couch near a remote control 1 28 | 319195 319196 319197 airliner wing web site a propeller airplane parked inside and airplane hanger 1 29 | 412036 412037 412038 grey whale breakwater killer whale a stop sign is standing at a street intersection 0 30 | 491896 491897 491898 teddy wool toyshop a woman in an old-fashioned kitchen with pots and pans 0 31 | 487501 487502 487503 snowmobile steam locomotive tow truck the living room is clean and empty from people 0 32 | 277093 277094 277095 microwave dishwasher chest a chair holding a laptop that is facing towards an oven 1 33 | 135542 135543 135544 water buffalo warthog hog sheep grazing under a tree in a grassy meadow 1 34 | 8448 8449 8450 mountainbike unicycle bicycle-built-for-two a picture of a person throwing a frisbee 1 35 | 170686 170687 170688 police van minibus ambulance a person in the army greeting someone in a suit 1 36 | 372016 372017 372018 Great Dane Irish wolfhound English setter a man standing in a room holding a remote 0 37 | 351158 351159 351160 sunglass bullet train sunglasses a woman opening the trunk of her car 0 38 | 414542 414543 414544 killer whale great white shark paddle a dog running across a field with a frisbee in his mouth 0 
39 | 264998 264999 265000 bannister ski unicycle a man riding a skateboard along a metal hand rail 1 40 | 362868 362869 362870 zebra bustard gazelle a basket full of bananas with a net on top 0 41 | 88455 88456 88457 patio flagpole pole a fire hydrant and fire hose in a houses front yard 1 42 | 372512 372513 372514 seashore catamaran swimming trunks a man riding a surfboard on a wave in the ocean 0 43 | 387327 387328 387329 cellular telephone lab coat cash machine a baseball game ensues as people watch 0 44 | 248027 248028 248029 web site barbershop cinema a motor bike on the side of the street 1 45 | 347507 347508 347509 banana pineapple orange a bear itching itself on a bare tree 0 46 | 33714 33715 33716 picketfence streetcar mountainbike the red bike and the pink bike just started dating 1 47 | 173989 173990 173991 umbrella poncho jinrikisha a group of people walking down a street carrying umbrellas 1 48 | 20835 20836 20837 ballplayer baseball footballhelmet a man throwing a baseball from a mound on a field 1 49 | 16356 16357 16358 lumbermill barbershop turnstile a man working on a baseball bat while two others watch 1 50 | 193491 193492 193493 unicycle pole horizontal bar boy riding on his skateboard down a stair rail 1 51 | 384165 384166 384167 mixing bowl corn meat loaf a couple of sailors standing next to a woman 0 52 | 321736 321737 321738 ballplayer baseball football helmet a boys baseball game with a batter catcher and umpire 1 53 | 108395 108396 108397 crash helmet moped backpack a man with a suit and tie on a motor bike 1 54 | 215942 215943 215944 unicycle military uniform bearskin four guys are sitting on a bench in front of a building 1 55 | 134156 134157 134158 wine bottle eggnog red wine there is a bottle of wine next to a glass 1 56 | 297783 297784 297785 necklace thimble corkscrew this is an image of a meal and an avocado is included 1 57 | 110516 110517 110518 minivan cab police van a dog looking ahead with a stoic look in a car seat 1 58 | 3166 3167 3168 grocerystore headcabbage cauliflower a pile of vegetables on display at a grocery store 1 59 | 440075 440076 440077 ski curly-coatedretriever Gordonsetter elephants and their young in their natural habitat 0 60 | 71021 71022 71023 ballplayer baseball puck a baseball player and a flying black bat 1 61 | -------------------------------------------------------------------------------- /BERT-CNN/data_pre.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import sys 4 | import json 5 | 6 | import logging 7 | import numpy as np 8 | import pandas as pd 9 | import tensorflow as tf 10 | 11 | from modeling import BertModel, BertConfig 12 | from tokenization import FullTokenizer, convert_to_unicode 13 | from extract_features import InputExample, convert_examples_to_features 14 | 15 | def read_examples(str_list): 16 | """Read a list of `InputExample`s from a list of strings.""" 17 | unique_id = 0 18 | for s in str_list: 19 | line = convert_to_unicode(s) 20 | if not line: 21 | continue 22 | line = line.strip() 23 | text_a = None 24 | text_b = None 25 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 26 | if m is None: 27 | text_a = line 28 | else: 29 | text_a = m.group(1) 30 | text_b = m.group(2) 31 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b) 32 | unique_id += 1 33 | 34 | # Convert theses features to np.arrays to use with tf.Keras. 
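# For n examples and a fixed seq_len, this returns three int32 arrays of
# shape (n, seq_len): input_ids, input_mask and the segment (input_type) ids,
# in the order the BertLayer preprocessing path unpacks them.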
35 | def features_to_arrays(features): 36 | 37 | all_input_ids = [] 38 | all_input_mask = [] 39 | all_segment_ids = [] 40 | 41 | for feature in features: 42 | all_input_ids.append(feature.input_ids) 43 | all_input_mask.append(feature.input_mask) 44 | all_segment_ids.append(feature.input_type_ids) 45 | 46 | return (np.array(all_input_ids, dtype='int32'), 47 | np.array(all_input_mask, dtype='int32'), 48 | np.array(all_segment_ids, dtype='int32')) 49 | 50 | 51 | # built all togehter 52 | def build_preprocessor(voc_path, seq_len, lower=True): 53 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower) 54 | 55 | def strings_to_arrays(sents): 56 | 57 | sents = np.atleast_1d(sents).reshape((-1,)) 58 | 59 | examples = [] 60 | for example in read_examples(sents): 61 | examples.append(example) 62 | 63 | features = convert_examples_to_features(examples, seq_len, tokenizer) 64 | arrays = features_to_arrays(features) 65 | return arrays 66 | 67 | return strings_to_arrays 68 | 69 | -------------------------------------------------------------------------------- /BERT-CNN/eval.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import pandas as pd 4 | import sys 5 | import argparse 6 | from sklearn.model_selection import train_test_split 7 | 8 | sys.path.insert(0, "bert_experimental") 9 | 10 | from bert_experimental.finetuning.text_preprocessing import build_preprocessor 11 | from bert_experimental.finetuning.graph_ops import load_graph 12 | 13 | 14 | 15 | parser=argparse.ArgumentParser(description='inference of the model') 16 | parser.add_argument('--testset', default='test.tsv', help='test file', type=str,required=True) 17 | parser.add_argument('--model', default='pre-trained model', help='', type=str, required=True) 18 | args = parser.parse_args() 19 | 20 | 21 | 22 | df = pd.read_csv(args.testset, sep='\t') 23 | 24 | 25 | texts = [] 26 | delimiter = " ||| " 27 | 28 | for vis, cap in zip(df.visual.tolist(), df.caption.tolist()): 29 | texts.append(delimiter.join((str(vis), str(cap)))) 30 | 31 | 32 | texts = np.array(texts) 33 | 34 | trX, tsX = train_test_split(texts, shuffle=False, test_size=0.01) 35 | 36 | 37 | restored_graph = load_graph(args.model) 38 | 39 | graph_ops = restored_graph.get_operations() 40 | input_op, output_op = graph_ops[0].name, graph_ops[-1].name 41 | print(input_op, output_op) 42 | 43 | x = restored_graph.get_tensor_by_name(input_op + ':0') 44 | y = restored_graph.get_tensor_by_name(output_op + ':0') 45 | 46 | preprocessor = build_preprocessor("uncased_L-12_H-768_A-12/vocab.txt", 64) 47 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32], name='preprocessor') 48 | 49 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32]) 50 | 51 | ##predictions 52 | 53 | sess = tf.Session(graph=restored_graph) 54 | 55 | print(trX[:2]) 56 | 57 | y = tf.print(y, summarize=-1) 58 | #x = tf.print(x, summarize=-1) 59 | y_out = sess.run(y, feed_dict={ 60 | x: trX[:2].reshape((-1,1)) 61 | 62 | }) 63 | 64 | print(y_out) 65 | 66 | -------------------------------------------------------------------------------- /BERT-CNN/extract_features.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Extract pre-computed feature vectors from BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import codecs 22 | import collections 23 | import json 24 | import re 25 | 26 | import modeling 27 | import tokenization 28 | import tensorflow as tf 29 | 30 | flags = tf.flags 31 | 32 | FLAGS = flags.FLAGS 33 | 34 | flags.DEFINE_string("input_file", None, "") 35 | 36 | flags.DEFINE_string("output_file", None, "") 37 | 38 | flags.DEFINE_string("layers", "-1,-2,-3,-4", "") 39 | 40 | flags.DEFINE_string( 41 | "bert_config_file", None, 42 | "The config json file corresponding to the pre-trained BERT model. " 43 | "This specifies the model architecture.") 44 | 45 | flags.DEFINE_integer( 46 | "max_seq_length", 128, 47 | "The maximum total input sequence length after WordPiece tokenization. " 48 | "Sequences longer than this will be truncated, and sequences shorter " 49 | "than this will be padded.") 50 | 51 | flags.DEFINE_string( 52 | "init_checkpoint", None, 53 | "Initial checkpoint (usually from a pre-trained BERT model).") 54 | 55 | flags.DEFINE_string("vocab_file", None, 56 | "The vocabulary file that the BERT model was trained on.") 57 | 58 | flags.DEFINE_bool( 59 | "do_lower_case", True, 60 | "Whether to lower case the input text. Should be True for uncased " 61 | "models and False for cased models.") 62 | 63 | flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") 64 | 65 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 66 | 67 | flags.DEFINE_string("master", None, 68 | "If using a TPU, the address of the master.") 69 | 70 | flags.DEFINE_integer( 71 | "num_tpu_cores", 8, 72 | "Only used if `use_tpu` is True. Total number of TPU cores to use.") 73 | 74 | flags.DEFINE_bool( 75 | "use_one_hot_embeddings", False, 76 | "If True, tf.one_hot will be used for embedding lookups, otherwise " 77 | "tf.nn.embedding_lookup will be used. 
On TPUs, this should be True " 78 | "since it is much faster.") 79 | 80 | 81 | class InputExample(object): 82 | 83 | def __init__(self, unique_id, text_a, text_b): 84 | self.unique_id = unique_id 85 | self.text_a = text_a 86 | self.text_b = text_b 87 | 88 | 89 | class InputFeatures(object): 90 | """A single set of features of data.""" 91 | 92 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 93 | self.unique_id = unique_id 94 | self.tokens = tokens 95 | self.input_ids = input_ids 96 | self.input_mask = input_mask 97 | self.input_type_ids = input_type_ids 98 | 99 | 100 | def input_fn_builder(features, seq_length): 101 | """Creates an `input_fn` closure to be passed to TPUEstimator.""" 102 | 103 | all_unique_ids = [] 104 | all_input_ids = [] 105 | all_input_mask = [] 106 | all_input_type_ids = [] 107 | 108 | for feature in features: 109 | all_unique_ids.append(feature.unique_id) 110 | all_input_ids.append(feature.input_ids) 111 | all_input_mask.append(feature.input_mask) 112 | all_input_type_ids.append(feature.input_type_ids) 113 | 114 | def input_fn(params): 115 | """The actual input function.""" 116 | batch_size = params["batch_size"] 117 | 118 | num_examples = len(features) 119 | 120 | # This is for demo purposes and does NOT scale to large data sets. We do 121 | # not use Dataset.from_generator() because that uses tf.py_func which is 122 | # not TPU compatible. The right way to load data is with TFRecordReader. 123 | d = tf.data.Dataset.from_tensor_slices({ 124 | "unique_ids": 125 | tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), 126 | "input_ids": 127 | tf.constant( 128 | all_input_ids, shape=[num_examples, seq_length], 129 | dtype=tf.int32), 130 | "input_mask": 131 | tf.constant( 132 | all_input_mask, 133 | shape=[num_examples, seq_length], 134 | dtype=tf.int32), 135 | "input_type_ids": 136 | tf.constant( 137 | all_input_type_ids, 138 | shape=[num_examples, seq_length], 139 | dtype=tf.int32), 140 | }) 141 | 142 | d = d.batch(batch_size=batch_size, drop_remainder=False) 143 | return d 144 | 145 | return input_fn 146 | 147 | 148 | def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, 149 | use_one_hot_embeddings): 150 | """Returns `model_fn` closure for TPUEstimator.""" 151 | 152 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument 153 | """The `model_fn` for TPUEstimator.""" 154 | 155 | unique_ids = features["unique_ids"] 156 | input_ids = features["input_ids"] 157 | input_mask = features["input_mask"] 158 | input_type_ids = features["input_type_ids"] 159 | 160 | model = modeling.BertModel( 161 | config=bert_config, 162 | is_training=False, 163 | input_ids=input_ids, 164 | input_mask=input_mask, 165 | token_type_ids=input_type_ids, 166 | use_one_hot_embeddings=use_one_hot_embeddings) 167 | 168 | if mode != tf.estimator.ModeKeys.PREDICT: 169 | raise ValueError("Only PREDICT modes are supported: %s" % (mode)) 170 | 171 | tvars = tf.trainable_variables() 172 | scaffold_fn = None 173 | (assignment_map, 174 | initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( 175 | tvars, init_checkpoint) 176 | if use_tpu: 177 | 178 | def tpu_scaffold(): 179 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 180 | return tf.train.Scaffold() 181 | 182 | scaffold_fn = tpu_scaffold 183 | else: 184 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 185 | 186 | tf.logging.info("**** Trainable Variables ****") 187 | for var in tvars: 188 | init_string = "" 189 | if 
var.name in initialized_variable_names: 190 | init_string = ", *INIT_FROM_CKPT*" 191 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 192 | init_string) 193 | 194 | all_layers = model.get_all_encoder_layers() 195 | 196 | predictions = { 197 | "unique_id": unique_ids, 198 | } 199 | 200 | for (i, layer_index) in enumerate(layer_indexes): 201 | predictions["layer_output_%d" % i] = all_layers[layer_index] 202 | 203 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 204 | mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) 205 | return output_spec 206 | 207 | return model_fn 208 | 209 | 210 | def convert_examples_to_features(examples, seq_length, tokenizer): 211 | """Loads a data file into a list of `InputBatch`s.""" 212 | 213 | features = [] 214 | for (ex_index, example) in enumerate(examples): 215 | tokens_a = tokenizer.tokenize(example.text_a) 216 | 217 | tokens_b = None 218 | if example.text_b: 219 | tokens_b = tokenizer.tokenize(example.text_b) 220 | 221 | if tokens_b: 222 | # Modifies `tokens_a` and `tokens_b` in place so that the total 223 | # length is less than the specified length. 224 | # Account for [CLS], [SEP], [SEP] with "- 3" 225 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 226 | else: 227 | # Account for [CLS] and [SEP] with "- 2" 228 | if len(tokens_a) > seq_length - 2: 229 | tokens_a = tokens_a[0:(seq_length - 2)] 230 | 231 | # The convention in BERT is: 232 | # (a) For sequence pairs: 233 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 234 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 235 | # (b) For single sequences: 236 | # tokens: [CLS] the dog is hairy . [SEP] 237 | # type_ids: 0 0 0 0 0 0 0 238 | # 239 | # Where "type_ids" are used to indicate whether this is the first 240 | # sequence or the second sequence. The embedding vectors for `type=0` and 241 | # `type=1` were learned during pre-training and are added to the wordpiece 242 | # embedding vector (and position vector). This is not *strictly* necessary 243 | # since the [SEP] token unambiguously separates the sequences, but it makes 244 | # it easier for the model to learn the concept of sequences. 245 | # 246 | # For classification tasks, the first vector (corresponding to [CLS]) is 247 | # used as as the "sentence vector". Note that this only makes sense because 248 | # the entire model is fine-tuned. 249 | tokens = [] 250 | input_type_ids = [] 251 | tokens.append("[CLS]") 252 | input_type_ids.append(0) 253 | for token in tokens_a: 254 | tokens.append(token) 255 | input_type_ids.append(0) 256 | tokens.append("[SEP]") 257 | input_type_ids.append(0) 258 | 259 | if tokens_b: 260 | for token in tokens_b: 261 | tokens.append(token) 262 | input_type_ids.append(1) 263 | tokens.append("[SEP]") 264 | input_type_ids.append(1) 265 | 266 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 267 | 268 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 269 | # tokens are attended to. 270 | input_mask = [1] * len(input_ids) 271 | 272 | # Zero-pad up to the sequence length. 
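    # Worked example (illustrative): with seq_length=8 and the single
    # sequence [CLS] the dog is hairy . [SEP] (7 tokens), one pad slot is
    # filled, giving
    #   input_mask     = [1, 1, 1, 1, 1, 1, 1, 0]
    #   input_type_ids = [0, 0, 0, 0, 0, 0, 0, 0]
    # and a trailing 0 appended to input_ids.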
273 | while len(input_ids) < seq_length: 274 | input_ids.append(0) 275 | input_mask.append(0) 276 | input_type_ids.append(0) 277 | 278 | assert len(input_ids) == seq_length 279 | assert len(input_mask) == seq_length 280 | assert len(input_type_ids) == seq_length 281 | 282 | if ex_index < 5: 283 | tf.logging.info("*** Example ***") 284 | tf.logging.info("unique_id: %s" % (example.unique_id)) 285 | tf.logging.info("tokens: %s" % " ".join( 286 | [tokenization.printable_text(x) for x in tokens])) 287 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 288 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 289 | tf.logging.info( 290 | "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) 291 | 292 | features.append( 293 | InputFeatures( 294 | unique_id=example.unique_id, 295 | tokens=tokens, 296 | input_ids=input_ids, 297 | input_mask=input_mask, 298 | input_type_ids=input_type_ids)) 299 | return features 300 | 301 | 302 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 303 | """Truncates a sequence pair in place to the maximum length.""" 304 | 305 | # This is a simple heuristic which will always truncate the longer sequence 306 | # one token at a time. This makes more sense than truncating an equal percent 307 | # of tokens from each, since if one sequence is very short then each token 308 | # that's truncated likely contains more information than a longer sequence. 309 | while True: 310 | total_length = len(tokens_a) + len(tokens_b) 311 | if total_length <= max_length: 312 | break 313 | if len(tokens_a) > len(tokens_b): 314 | tokens_a.pop() 315 | else: 316 | tokens_b.pop() 317 | 318 | 319 | def read_examples(input_file): 320 | """Read a list of `InputExample`s from an input file.""" 321 | examples = [] 322 | unique_id = 0 323 | with tf.gfile.GFile(input_file, "r") as reader: 324 | while True: 325 | line = tokenization.convert_to_unicode(reader.readline()) 326 | if not line: 327 | break 328 | line = line.strip() 329 | text_a = None 330 | text_b = None 331 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 332 | if m is None: 333 | text_a = line 334 | else: 335 | text_a = m.group(1) 336 | text_b = m.group(2) 337 | examples.append( 338 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 339 | unique_id += 1 340 | return examples 341 | 342 | 343 | def main(_): 344 | tf.logging.set_verbosity(tf.logging.INFO) 345 | 346 | layer_indexes = [int(x) for x in FLAGS.layers.split(",")] 347 | 348 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 349 | 350 | tokenizer = tokenization.FullTokenizer( 351 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) 352 | 353 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 354 | run_config = tf.contrib.tpu.RunConfig( 355 | master=FLAGS.master, 356 | tpu_config=tf.contrib.tpu.TPUConfig( 357 | num_shards=FLAGS.num_tpu_cores, 358 | per_host_input_for_training=is_per_host)) 359 | 360 | examples = read_examples(FLAGS.input_file) 361 | 362 | features = convert_examples_to_features( 363 | examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) 364 | 365 | unique_id_to_feature = {} 366 | for feature in features: 367 | unique_id_to_feature[feature.unique_id] = feature 368 | 369 | model_fn = model_fn_builder( 370 | bert_config=bert_config, 371 | init_checkpoint=FLAGS.init_checkpoint, 372 | layer_indexes=layer_indexes, 373 | use_tpu=FLAGS.use_tpu, 374 | use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) 375 | 376 | # If TPU is not 
available, this will fall back to normal Estimator on CPU 377 | # or GPU. 378 | estimator = tf.contrib.tpu.TPUEstimator( 379 | use_tpu=FLAGS.use_tpu, 380 | model_fn=model_fn, 381 | config=run_config, 382 | predict_batch_size=FLAGS.batch_size) 383 | 384 | input_fn = input_fn_builder( 385 | features=features, seq_length=FLAGS.max_seq_length) 386 | 387 | with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, 388 | "w")) as writer: 389 | for result in estimator.predict(input_fn, yield_single_examples=True): 390 | unique_id = int(result["unique_id"]) 391 | feature = unique_id_to_feature[unique_id] 392 | output_json = collections.OrderedDict() 393 | output_json["linex_index"] = unique_id 394 | all_features = [] 395 | for (i, token) in enumerate(feature.tokens): 396 | all_layers = [] 397 | for (j, layer_index) in enumerate(layer_indexes): 398 | layer_output = result["layer_output_%d" % j] 399 | layers = collections.OrderedDict() 400 | layers["index"] = layer_index 401 | layers["values"] = [ 402 | round(float(x), 6) for x in layer_output[i:(i + 1)].flat 403 | ] 404 | all_layers.append(layers) 405 | features = collections.OrderedDict() 406 | features["token"] = token 407 | features["layers"] = all_layers 408 | all_features.append(features) 409 | output_json["features"] = all_features 410 | writer.write(json.dumps(output_json) + "\n") 411 | 412 | 413 | if __name__ == "__main__": 414 | flags.mark_flag_as_required("input_file") 415 | flags.mark_flag_as_required("vocab_file") 416 | flags.mark_flag_as_required("bert_config_file") 417 | flags.mark_flag_as_required("init_checkpoint") 418 | flags.mark_flag_as_required("output_file") 419 | tf.app.run() 420 | -------------------------------------------------------------------------------- /BERT-CNN/freeze_keras_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import os 5 | import sys 6 | import json 7 | 8 | import logging 9 | import numpy as np 10 | import pandas as pd 11 | import tensorflow as tf 12 | import tensorflow_hub as hub 13 | from tensorflow import keras 14 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint 15 | 16 | from sklearn.model_selection import train_test_split 17 | 18 | 19 | if not 'bert_repo' in sys.path: 20 | sys.path.insert(0, 'bert_repo') 21 | 22 | from modeling import BertModel, BertConfig 23 | from tokenization import FullTokenizer, convert_to_unicode 24 | from extract_features import InputExample, convert_examples_to_features 25 | 26 | 27 | def freeze_keras_model(model, export_path=None, clear_devices=True): 28 | sess = tf.keras.backend.get_session() 29 | graph = sess.graph 30 | 31 | with graph.as_default(): 32 | 33 | input_tensors = model.inputs 34 | output_tensors = model.outputs 35 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors] 36 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors] 37 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors] 38 | 39 | tmp_g = graph.as_graph_def() 40 | if clear_devices: 41 | for node in tmp_g.node: 42 | node.device = "" 43 | 44 | tmp_g = optimize_for_inference( 45 | tmp_g, input_ops, output_ops, dtypes, False) 46 | 47 | tmp_g = convert_variables_to_constants(sess, tmp_g, output_ops) 48 | 49 | if export_path is not None: 50 | with tf.gfile.GFile(export_path, "wb") as f: 51 | f.write(tmp_g.SerializeToString()) 52 | 53 | return tmp_g 54 | 55 | -------------------------------------------------------------------------------- 
/BERT-CNN/model.json: -------------------------------------------------------------------------------- 1 | "{\"class_name\": \"Model\", \"config\": {\"name\": \"model\", \"layers\": [{\"name\": \"input_1\", \"class_name\": \"InputLayer\", \"config\": {\"batch_input_shape\": [null, 1], \"dtype\": \"string\", \"sparse\": false, \"ragged\": false, \"name\": \"input_1\"}, \"inbound_nodes\": []}, {\"name\": \"bert_layer\", \"class_name\": \"BertLayer\", \"config\": {\"bert_path\": \"./bert-module/\", \"seq_len\": 64, \"pooling\": null, \"n_tune_layers\": 12, \"tune_embeddings\": false, \"do_preprocessing\": true, \"verbose\": false}, \"inbound_nodes\": [[[\"input_1\", 0, 0, {}]]]}, {\"name\": \"conv1d\", \"class_name\": \"Conv1D\", \"config\": {\"name\": \"conv1d\", \"trainable\": true, \"dtype\": \"float32\", \"filters\": 32, \"kernel_size\": [3], \"strides\": [1], \"padding\": \"valid\", \"data_format\": \"channels_last\", \"dilation_rate\": [1], \"activation\": \"relu\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"GlorotUniform\", \"config\": {\"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Zeros\", \"config\": {\"dtype\": \"float32\"}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"inbound_nodes\": [[[\"bert_layer\", 0, 0, {}]]]}, {\"name\": \"max_pooling1d\", \"class_name\": \"MaxPooling1D\", \"config\": {\"name\": \"max_pooling1d\", \"trainable\": true, \"dtype\": \"float32\", \"strides\": [2], \"pool_size\": [2], \"padding\": \"valid\", \"data_format\": \"channels_last\"}, \"inbound_nodes\": [[[\"conv1d\", 0, 0, {}]]]}, {\"name\": \"flatten\", \"class_name\": \"Flatten\", \"config\": {\"name\": \"flatten\", \"trainable\": true, \"dtype\": \"float32\", \"data_format\": \"channels_last\"}, \"inbound_nodes\": [[[\"max_pooling1d\", 0, 0, {}]]]}, {\"name\": \"dense\", \"class_name\": \"Dense\", \"config\": {\"name\": \"dense\", \"trainable\": true, \"dtype\": \"float32\", \"units\": 1, \"activation\": \"sigmoid\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"GlorotUniform\", \"config\": {\"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Zeros\", \"config\": {\"dtype\": \"float32\"}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"inbound_nodes\": [[[\"flatten\", 0, 0, {}]]]}], \"input_layers\": [[\"input_1\", 0, 0]], \"output_layers\": [[\"dense\", 0, 0]]}, \"keras_version\": \"2.2.4-tf\", \"backend\": \"tensorflow\"}" -------------------------------------------------------------------------------- /BERT-CNN/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | import tensorflow as tf 23 | 24 | 25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): 26 | """Creates an optimizer training op.""" 27 | global_step = tf.train.get_or_create_global_step() 28 | 29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 30 | 31 | # Implements linear decay of the learning rate. 32 | learning_rate = tf.train.polynomial_decay( 33 | learning_rate, 34 | global_step, 35 | num_train_steps, 36 | end_learning_rate=0.0, 37 | power=1.0, 38 | cycle=False) 39 | 40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 41 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 42 | if num_warmup_steps: 43 | global_steps_int = tf.cast(global_step, tf.int32) 44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 45 | 46 | global_steps_float = tf.cast(global_steps_int, tf.float32) 47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 48 | 49 | warmup_percent_done = global_steps_float / warmup_steps_float 50 | warmup_learning_rate = init_lr * warmup_percent_done 51 | 52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 53 | learning_rate = ( 54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) 55 | 56 | # It is recommended that you use this optimizer for fine tuning, since this 57 | # is how the model was trained (note that the Adam m/v variables are NOT 58 | # loaded from init_checkpoint.) 59 | optimizer = AdamWeightDecayOptimizer( 60 | learning_rate=learning_rate, 61 | weight_decay_rate=0.01, 62 | beta_1=0.9, 63 | beta_2=0.999, 64 | epsilon=1e-6, 65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 66 | 67 | if use_tpu: 68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 69 | 70 | tvars = tf.trainable_variables() 71 | grads = tf.gradients(loss, tvars) 72 | 73 | # This is how the model was pre-trained. 74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) 75 | 76 | train_op = optimizer.apply_gradients( 77 | zip(grads, tvars), global_step=global_step) 78 | 79 | # Normally the global step update is done inside of `apply_gradients`. 80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use 81 | # a different optimizer, you should probably take this line out. 
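  # (Stock TF1 optimizers increment the step inside `apply_gradients` when a
  # `global_step` argument is passed; `AdamWeightDecayOptimizer.apply_gradients`
  # below accepts that argument but never uses it, hence the explicit
  # increment here.)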
82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = ( 132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = ( 134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, 135 | tf.square(grad))) 136 | 137 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 138 | 139 | # Just adding the square of the weights to the loss function is *not* 140 | # the correct way of using L2 regularization/weight decay with Adam, 141 | # since that will interact with the m and v parameters in strange ways. 142 | # 143 | # Instead we want ot decay the weights in a manner that doesn't interact 144 | # with the m/v parameters. This is equivalent to adding the square 145 | # of the weights to the loss with plain (non-momentum) SGD. 
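      # In update form (note: no Adam bias correction is applied here):
      #   m_t = beta_1 * m + (1 - beta_1) * g
      #   v_t = beta_2 * v + (1 - beta_2) * g^2
      #   w  <-  w - lr * ( m_t / (sqrt(v_t) + eps) + weight_decay_rate * w )
      # with the decay term skipped for LayerNorm and bias parameters.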
146 | if self._do_use_weight_decay(param_name): 147 | update += self.weight_decay_rate * param 148 | 149 | update_with_lr = self.learning_rate * update 150 | 151 | next_param = param - update_with_lr 152 | 153 | assignments.extend( 154 | [param.assign(next_param), 155 | m.assign(next_m), 156 | v.assign(next_v)]) 157 | return tf.group(*assignments, name=name) 158 | 159 | def _do_use_weight_decay(self, param_name): 160 | """Whether to use L2 weight decay for `param_name`.""" 161 | if not self.weight_decay_rate: 162 | return False 163 | if self.exclude_from_weight_decay: 164 | for r in self.exclude_from_weight_decay: 165 | if re.search(r, param_name) is not None: 166 | return False 167 | return True 168 | 169 | def _get_variable_name(self, param_name): 170 | """Get the variable name from the tensor name.""" 171 | m = re.match("^(.*):\\d+$", param_name) 172 | if m is not None: 173 | param_name = m.group(1) 174 | return param_name 175 | -------------------------------------------------------------------------------- /BERT-CNN/test_demo.tsv: -------------------------------------------------------------------------------- 1 | test_id visual caption 2 | 0 standard poodle shopping cart footwear a close up of shoes and a dog in a basket 3 | 1 standard poodle shopping cart footwear a brown teddy bear laying on top of a pair of shoes 4 | 2 toilet seat a toilet with a hole in the floor 5 | 3 mobile home studio couch house a living room with a couch chair coffee table and a television 6 | 4 french loaf conch person a sandwich and a basket of food on a table 7 | 5 indian elephant a man and two children riding on an elephant 8 | -------------------------------------------------------------------------------- /BERT-CNN/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import re 23 | import unicodedata 24 | import six 25 | import tensorflow as tf 26 | 27 | 28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): 29 | """Checks whether the casing config is consistent with the checkpoint name.""" 30 | 31 | # The casing has to be passed in by the user and there is no explicit check 32 | # as to whether it matches the checkpoint. The casing information probably 33 | # should have been stored in the bert_config.json file, but it's not, so 34 | # we have to heuristically detect it to validate. 
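  # e.g. init_checkpoint="uncased_L-12_H-768_A-12/bert_model.ckpt" yields
  # model_name="uncased_L-12_H-768_A-12", which is only consistent with
  # --do_lower_case=True; the cased checkpoints require --do_lower_case=False.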
35 | 36 | if not init_checkpoint: 37 | return 38 | 39 | m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) 40 | if m is None: 41 | return 42 | 43 | model_name = m.group(1) 44 | 45 | lower_models = [ 46 | "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", 47 | "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" 48 | ] 49 | 50 | cased_models = [ 51 | "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", 52 | "multi_cased_L-12_H-768_A-12" 53 | ] 54 | 55 | is_bad_config = False 56 | if model_name in lower_models and not do_lower_case: 57 | is_bad_config = True 58 | actual_flag = "False" 59 | case_name = "lowercased" 60 | opposite_flag = "True" 61 | 62 | if model_name in cased_models and do_lower_case: 63 | is_bad_config = True 64 | actual_flag = "True" 65 | case_name = "cased" 66 | opposite_flag = "False" 67 | 68 | if is_bad_config: 69 | raise ValueError( 70 | "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " 71 | "However, `%s` seems to be a %s model, so you " 72 | "should pass in `--do_lower_case=%s` so that the fine-tuning matches " 73 | "how the model was pre-training. If this error is wrong, please " 74 | "just comment out this check." % (actual_flag, init_checkpoint, 75 | model_name, case_name, opposite_flag)) 76 | 77 | 78 | def convert_to_unicode(text): 79 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 80 | if six.PY3: 81 | if isinstance(text, str): 82 | return text 83 | elif isinstance(text, bytes): 84 | return text.decode("utf-8", "ignore") 85 | else: 86 | raise ValueError("Unsupported string type: %s" % (type(text))) 87 | elif six.PY2: 88 | if isinstance(text, str): 89 | return text.decode("utf-8", "ignore") 90 | elif isinstance(text, unicode): 91 | return text 92 | else: 93 | raise ValueError("Unsupported string type: %s" % (type(text))) 94 | else: 95 | raise ValueError("Not running on Python2 or Python 3?") 96 | 97 | 98 | def printable_text(text): 99 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 100 | 101 | # These functions want `str` for both Python2 and Python3, but in one case 102 | # it's a Unicode string and in the other it's a byte string. 
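  # e.g. on Python 3, printable_text(b"caf\xc3\xa9") returns the str "café",
  # while a str input is passed through unchanged.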
103 | if six.PY3: 104 | if isinstance(text, str): 105 | return text 106 | elif isinstance(text, bytes): 107 | return text.decode("utf-8", "ignore") 108 | else: 109 | raise ValueError("Unsupported string type: %s" % (type(text))) 110 | elif six.PY2: 111 | if isinstance(text, str): 112 | return text 113 | elif isinstance(text, unicode): 114 | return text.encode("utf-8") 115 | else: 116 | raise ValueError("Unsupported string type: %s" % (type(text))) 117 | else: 118 | raise ValueError("Not running on Python2 or Python 3?") 119 | 120 | 121 | def load_vocab(vocab_file): 122 | """Loads a vocabulary file into a dictionary.""" 123 | vocab = collections.OrderedDict() 124 | index = 0 125 | with tf.gfile.GFile(vocab_file, "r") as reader: 126 | while True: 127 | token = convert_to_unicode(reader.readline()) 128 | if not token: 129 | break 130 | token = token.strip() 131 | vocab[token] = index 132 | index += 1 133 | return vocab 134 | 135 | 136 | def convert_by_vocab(vocab, items): 137 | """Converts a sequence of [tokens|ids] using the vocab.""" 138 | output = [] 139 | for item in items: 140 | output.append(vocab[item]) 141 | return output 142 | 143 | 144 | def convert_tokens_to_ids(vocab, tokens): 145 | return convert_by_vocab(vocab, tokens) 146 | 147 | 148 | def convert_ids_to_tokens(inv_vocab, ids): 149 | return convert_by_vocab(inv_vocab, ids) 150 | 151 | 152 | def whitespace_tokenize(text): 153 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 154 | text = text.strip() 155 | if not text: 156 | return [] 157 | tokens = text.split() 158 | return tokens 159 | 160 | 161 | class FullTokenizer(object): 162 | """Runs end-to-end tokenziation.""" 163 | 164 | def __init__(self, vocab_file, do_lower_case=True): 165 | self.vocab = load_vocab(vocab_file) 166 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 167 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 168 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 169 | 170 | def tokenize(self, text): 171 | split_tokens = [] 172 | for token in self.basic_tokenizer.tokenize(text): 173 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 174 | split_tokens.append(sub_token) 175 | 176 | return split_tokens 177 | 178 | def convert_tokens_to_ids(self, tokens): 179 | return convert_by_vocab(self.vocab, tokens) 180 | 181 | def convert_ids_to_tokens(self, ids): 182 | return convert_by_vocab(self.inv_vocab, ids) 183 | 184 | 185 | class BasicTokenizer(object): 186 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 187 | 188 | def __init__(self, do_lower_case=True): 189 | """Constructs a BasicTokenizer. 190 | 191 | Args: 192 | do_lower_case: Whether to lower case the input. 193 | """ 194 | self.do_lower_case = do_lower_case 195 | 196 | def tokenize(self, text): 197 | """Tokenizes a piece of text.""" 198 | text = convert_to_unicode(text) 199 | text = self._clean_text(text) 200 | 201 | # This was added on November 1st, 2018 for the multilingual and Chinese 202 | # models. This is also applied to the English models now, but it doesn't 203 | # matter since the English models were not trained on any Chinese data 204 | # and generally don't have any Chinese data in them (there are Chinese 205 | # characters in the vocabulary because Wikipedia does have some Chinese 206 | # words in the English Wikipedia.). 
207 | text = self._tokenize_chinese_chars(text) 208 | 209 | orig_tokens = whitespace_tokenize(text) 210 | split_tokens = [] 211 | for token in orig_tokens: 212 | if self.do_lower_case: 213 | token = token.lower() 214 | token = self._run_strip_accents(token) 215 | split_tokens.extend(self._run_split_on_punc(token)) 216 | 217 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 218 | return output_tokens 219 | 220 | def _run_strip_accents(self, text): 221 | """Strips accents from a piece of text.""" 222 | text = unicodedata.normalize("NFD", text) 223 | output = [] 224 | for char in text: 225 | cat = unicodedata.category(char) 226 | if cat == "Mn": 227 | continue 228 | output.append(char) 229 | return "".join(output) 230 | 231 | def _run_split_on_punc(self, text): 232 | """Splits punctuation on a piece of text.""" 233 | chars = list(text) 234 | i = 0 235 | start_new_word = True 236 | output = [] 237 | while i < len(chars): 238 | char = chars[i] 239 | if _is_punctuation(char): 240 | output.append([char]) 241 | start_new_word = True 242 | else: 243 | if start_new_word: 244 | output.append([]) 245 | start_new_word = False 246 | output[-1].append(char) 247 | i += 1 248 | 249 | return ["".join(x) for x in output] 250 | 251 | def _tokenize_chinese_chars(self, text): 252 | """Adds whitespace around any CJK character.""" 253 | output = [] 254 | for char in text: 255 | cp = ord(char) 256 | if self._is_chinese_char(cp): 257 | output.append(" ") 258 | output.append(char) 259 | output.append(" ") 260 | else: 261 | output.append(char) 262 | return "".join(output) 263 | 264 | def _is_chinese_char(self, cp): 265 | """Checks whether CP is the codepoint of a CJK character.""" 266 | # This defines a "chinese character" as anything in the CJK Unicode block: 267 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 268 | # 269 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 270 | # despite its name. The modern Korean Hangul alphabet is a different block, 271 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 272 | # space-separated words, so they are not treated specially and handled 273 | # like the all of the other languages. 274 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 275 | (cp >= 0x3400 and cp <= 0x4DBF) or # 276 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 277 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 278 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 279 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 280 | (cp >= 0xF900 and cp <= 0xFAFF) or # 281 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 282 | return True 283 | 284 | return False 285 | 286 | def _clean_text(self, text): 287 | """Performs invalid character removal and whitespace cleanup on text.""" 288 | output = [] 289 | for char in text: 290 | cp = ord(char) 291 | if cp == 0 or cp == 0xfffd or _is_control(char): 292 | continue 293 | if _is_whitespace(char): 294 | output.append(" ") 295 | else: 296 | output.append(char) 297 | return "".join(output) 298 | 299 | 300 | class WordpieceTokenizer(object): 301 | """Runs WordPiece tokenziation.""" 302 | 303 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): 304 | self.vocab = vocab 305 | self.unk_token = unk_token 306 | self.max_input_chars_per_word = max_input_chars_per_word 307 | 308 | def tokenize(self, text): 309 | """Tokenizes a piece of text into its word pieces. 310 | 311 | This uses a greedy longest-match-first algorithm to perform tokenization 312 | using the given vocabulary. 
313 | 314 | For example: 315 | input = "unaffable" 316 | output = ["un", "##aff", "##able"] 317 | 318 | Args: 319 | text: A single token or whitespace separated tokens. This should have 320 | already been passed through `BasicTokenizer`. 321 | 322 | Returns: 323 | A list of wordpiece tokens. 324 | """ 325 | 326 | text = convert_to_unicode(text) 327 | 328 | output_tokens = [] 329 | for token in whitespace_tokenize(text): 330 | chars = list(token) 331 | if len(chars) > self.max_input_chars_per_word: 332 | output_tokens.append(self.unk_token) 333 | continue 334 | 335 | is_bad = False 336 | start = 0 337 | sub_tokens = [] 338 | while start < len(chars): 339 | end = len(chars) 340 | cur_substr = None 341 | while start < end: 342 | substr = "".join(chars[start:end]) 343 | if start > 0: 344 | substr = "##" + substr 345 | if substr in self.vocab: 346 | cur_substr = substr 347 | break 348 | end -= 1 349 | if cur_substr is None: 350 | is_bad = True 351 | break 352 | sub_tokens.append(cur_substr) 353 | start = end 354 | 355 | if is_bad: 356 | output_tokens.append(self.unk_token) 357 | else: 358 | output_tokens.extend(sub_tokens) 359 | return output_tokens 360 | 361 | 362 | def _is_whitespace(char): 363 | """Checks whether `chars` is a whitespace character.""" 364 | # \t, \n, and \r are technically control characters but we treat them 365 | # as whitespace since they are generally considered as such. 366 | if char == " " or char == "\t" or char == "\n" or char == "\r": 367 | return True 368 | cat = unicodedata.category(char) 369 | if cat == "Zs": 370 | return True 371 | return False 372 | 373 | 374 | def _is_control(char): 375 | """Checks whether `chars` is a control character.""" 376 | # These are technically control characters but we count them as whitespace 377 | # characters. 378 | if char == "\t" or char == "\n" or char == "\r": 379 | return False 380 | cat = unicodedata.category(char) 381 | if cat in ("Cc", "Cf"): 382 | return True 383 | return False 384 | 385 | 386 | def _is_punctuation(char): 387 | """Checks whether `chars` is a punctuation character.""" 388 | cp = ord(char) 389 | # We treat all non-letter/number ASCII as punctuation. 390 | # Characters such as "^", "$", and "`" are not in the Unicode 391 | # Punctuation class but we treat them as punctuation anyways, for 392 | # consistency. 393 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 394 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 395 | return True 396 | cat = unicodedata.category(char) 397 | if cat.startswith("P"): 398 | return True 399 | return False 400 | -------------------------------------------------------------------------------- /BERT-CNN/uncased_L-12_H-768_A-12/file-should be here.txt: -------------------------------------------------------------------------------- 1 | Download this from the BERT website 2 | 3 | -------------------------------------------------------------------------------- /BERT/README.md: -------------------------------------------------------------------------------- 1 | ## Semantic Relatedness with BERT 2 | Fine-tune BERT on the created dataset.
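The fine-tuning script `train_model_VC.py` reads tab-separated files from `data/`. Based on the provided `BERT/data/test.tsv` and the top-level `data/train.tsv` (the zipped `train.tsv` here is assumed to follow the same layout), `train.tsv`/`dev.tsv` carry six columns (`id`, `id1`, `id2`, `visual`, `caption`, `is_related`, where `is_related` is the 0/1 relatedness label), while `test.tsv` carries three (`id`, `visual`, `caption`):

```
# train.tsv / dev.tsv (tab-separated)
id      id1     id2     visual                     caption                                                          is_related
59422   59423   59424   laptop carton comicbook    a laptop that has stickers on its cover is sitting on a table   1

# test.tsv (tab-separated)
id      visual    caption
2       toilet    a white toilet with its seat up in a bathroom
```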
3 | 4 | ### Requirements 5 | - Tensorflow 1.15.0 6 | - Python 2.7 7 | 8 | ``` 9 | conda create -n BERT_visual python=2.7 anaconda 10 | conda activate BERT_visual 11 | pip install tensorflow==1.15.0 12 | ``` 13 | 14 | ``` 15 | python train_model_VC.py # training, validation, and inference 16 | ``` 17 | Main page example: 18 | ``` 19 | ## relatedness score 20 | 21 | image: COCO_val2014_000000156242.jpg - Karpathy test split 22 | ``` 23 | ``` 24 | BERT Base 25 | 26 | ('visual :', 'apple') # Visual (ours) 27 | ('caption :', 'a display of apple and orange at market') 28 | ('Prediction :', 0.9933211) 29 | ****** 30 | ('visual :', 'apple') # Greedy 31 | ('caption :', 'a fruit market with apples and orange') 32 | ('Prediction :', 0.98885113) 33 | ****** 34 | ('visual :', 'apple') # Beam Search 35 | ('caption :', 'a fruit stand with apples and oranges') 36 | ('Prediction :', 0.9911321) 37 | 38 | BERT Large 39 | 40 | ('visual :', 'apple') 41 | ('caption :', 'a display of apple and orange at market') 42 | ('Prediction :', 0.99782264) 43 | ****** 44 | ('visual :', 'apple') 45 | ('caption :', 'a fruit market with apples and orange') 46 | ('Prediction :', 0.99774504) 47 | ****** 48 | ('visual :', 'apple') 49 | ('caption :', 'a fruit stand with apples and oranges') 50 | ('Prediction :', 0.9977704) 51 | ``` 52 | -------------------------------------------------------------------------------- /BERT/data/test.tsv: -------------------------------------------------------------------------------- 1 | id visual caption 2 | 0 shopping a close up of a dog laying in a basket 3 | 1 traffic a black and white photo of a street light 4 | 2 toilet a white toilet with its seat up in a bathroom 5 | 3 bed a living room filled with furniture and a coffee table 6 | 4 hotdog a basket filled with sandwiches on top of a table 7 | 5 tusker a group of people riding on the back of an elephant 8 | 6 suit a man wearing glasses and a tie in a room 9 | -------------------------------------------------------------------------------- /BERT/data/train.tsv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT/data/train.tsv.zip -------------------------------------------------------------------------------- /BERT/outputs/need-this.txt: -------------------------------------------------------------------------------- 1 | Put the provided weights in this folder if you want to continue the training 2 | -------------------------------------------------------------------------------- /BERT/tokenization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import unicodedata 23 | import six 24 | import tensorflow as tf 25 | 26 | 27 | def convert_to_unicode(text): 28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" 29 | if six.PY3: 30 | if isinstance(text, str): 31 | return text 32 | elif isinstance(text, bytes): 33 | return text.decode("utf-8", "ignore") 34 | else: 35 | raise ValueError("Unsupported string type: %s" % (type(text))) 36 | elif six.PY2: 37 | if isinstance(text, str): 38 | return text.decode("utf-8", "ignore") 39 | elif isinstance(text, unicode): 40 | return text 41 | else: 42 | raise ValueError("Unsupported string type: %s" % (type(text))) 43 | else: 44 | raise ValueError("Not running on Python2 or Python 3?") 45 | 46 | 47 | def printable_text(text): 48 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 49 | 50 | # These functions want `str` for both Python2 and Python3, but in one case 51 | # it's a Unicode string and in the other it's a byte string. 52 | if six.PY3: 53 | if isinstance(text, str): 54 | return text 55 | elif isinstance(text, bytes): 56 | return text.decode("utf-8", "ignore") 57 | else: 58 | raise ValueError("Unsupported string type: %s" % (type(text))) 59 | elif six.PY2: 60 | if isinstance(text, str): 61 | return text 62 | elif isinstance(text, unicode): 63 | return text.encode("utf-8") 64 | else: 65 | raise ValueError("Unsupported string type: %s" % (type(text))) 66 | else: 67 | raise ValueError("Not running on Python2 or Python 3?") 68 | 69 | 70 | def load_vocab(vocab_file): 71 | """Loads a vocabulary file into a dictionary.""" 72 | vocab = collections.OrderedDict() 73 | index = 0 74 | with tf.gfile.GFile(vocab_file, "r") as reader: 75 | while True: 76 | token = convert_to_unicode(reader.readline()) 77 | if not token: 78 | break 79 | token = token.strip() 80 | vocab[token] = index 81 | index += 1 82 | return vocab 83 | 84 | 85 | def convert_by_vocab(vocab, items): 86 | """Converts a sequence of [tokens|ids] using the vocab.""" 87 | output = [] 88 | for item in items: 89 | output.append(vocab[item]) 90 | return output 91 | 92 | 93 | def convert_tokens_to_ids(vocab, tokens): 94 | return convert_by_vocab(vocab, tokens) 95 | 96 | 97 | def convert_ids_to_tokens(inv_vocab, ids): 98 | return convert_by_vocab(inv_vocab, ids) 99 | 100 | 101 | def whitespace_tokenize(text): 102 | """Runs basic whitespace cleaning and splitting on a peice of text.""" 103 | text = text.strip() 104 | if not text: 105 | return [] 106 | tokens = text.split() 107 | return tokens 108 | 109 | 110 | class FullTokenizer(object): 111 | """Runs end-to-end tokenziation.""" 112 | 113 | def __init__(self, vocab_file, do_lower_case=True): 114 | self.vocab = load_vocab(vocab_file) 115 | self.inv_vocab = {v: k for k, v in self.vocab.items()} 116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) 117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) 118 | 119 | def tokenize(self, text): 120 | split_tokens = [] 121 | for token in self.basic_tokenizer.tokenize(text): 122 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 123 | split_tokens.append(sub_token) 124 | 125 | return split_tokens 126 | 127 | def convert_tokens_to_ids(self, tokens): 128 | return convert_by_vocab(self.vocab, tokens) 129 | 130 | def convert_ids_to_tokens(self, ids): 131 | return 
convert_by_vocab(self.inv_vocab, ids) 132 | 133 | 134 | class BasicTokenizer(object): 135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 136 | 137 | def __init__(self, do_lower_case=True): 138 | """Constructs a BasicTokenizer. 139 | 140 | Args: 141 | do_lower_case: Whether to lower case the input. 142 | """ 143 | self.do_lower_case = do_lower_case 144 | 145 | def tokenize(self, text): 146 | """Tokenizes a piece of text.""" 147 | text = convert_to_unicode(text) 148 | text = self._clean_text(text) 149 | 150 | # This was added on November 1st, 2018 for the multilingual and Chinese 151 | # models. This is also applied to the English models now, but it doesn't 152 | # matter since the English models were not trained on any Chinese data 153 | # and generally don't have any Chinese data in them (there are Chinese 154 | # characters in the vocabulary because Wikipedia does have some Chinese 155 | # words in the English Wikipedia.). 156 | text = self._tokenize_chinese_chars(text) 157 | 158 | orig_tokens = whitespace_tokenize(text) 159 | split_tokens = [] 160 | for token in orig_tokens: 161 | if self.do_lower_case: 162 | token = token.lower() 163 | token = self._run_strip_accents(token) 164 | split_tokens.extend(self._run_split_on_punc(token)) 165 | 166 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 167 | return output_tokens 168 | 169 | def _run_strip_accents(self, text): 170 | """Strips accents from a piece of text.""" 171 | text = unicodedata.normalize("NFD", text) 172 | output = [] 173 | for char in text: 174 | cat = unicodedata.category(char) 175 | if cat == "Mn": 176 | continue 177 | output.append(char) 178 | return "".join(output) 179 | 180 | def _run_split_on_punc(self, text): 181 | """Splits punctuation on a piece of text.""" 182 | chars = list(text) 183 | i = 0 184 | start_new_word = True 185 | output = [] 186 | while i < len(chars): 187 | char = chars[i] 188 | if _is_punctuation(char): 189 | output.append([char]) 190 | start_new_word = True 191 | else: 192 | if start_new_word: 193 | output.append([]) 194 | start_new_word = False 195 | output[-1].append(char) 196 | i += 1 197 | 198 | return ["".join(x) for x in output] 199 | 200 | def _tokenize_chinese_chars(self, text): 201 | """Adds whitespace around any CJK character.""" 202 | output = [] 203 | for char in text: 204 | cp = ord(char) 205 | if self._is_chinese_char(cp): 206 | output.append(" ") 207 | output.append(char) 208 | output.append(" ") 209 | else: 210 | output.append(char) 211 | return "".join(output) 212 | 213 | def _is_chinese_char(self, cp): 214 | """Checks whether CP is the codepoint of a CJK character.""" 215 | # This defines a "chinese character" as anything in the CJK Unicode block: 216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 217 | # 218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 219 | # despite its name. The modern Korean Hangul alphabet is a different block, 220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 221 | # space-separated words, so they are not treated specially and handled 222 | # like the all of the other languages. 
223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 224 | (cp >= 0x3400 and cp <= 0x4DBF) or # 225 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 226 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 227 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 229 | (cp >= 0xF900 and cp <= 0xFAFF) or # 230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 231 | return True 232 | 233 | return False 234 | 235 | def _clean_text(self, text): 236 | """Performs invalid character removal and whitespace cleanup on text.""" 237 | output = [] 238 | for char in text: 239 | cp = ord(char) 240 | if cp == 0 or cp == 0xfffd or _is_control(char): 241 | continue 242 | if _is_whitespace(char): 243 | output.append(" ") 244 | else: 245 | output.append(char) 246 | return "".join(output) 247 | 248 | 249 | class WordpieceTokenizer(object): 250 | """Runs WordPiece tokenziation.""" 251 | 252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): 253 | self.vocab = vocab 254 | self.unk_token = unk_token 255 | self.max_input_chars_per_word = max_input_chars_per_word 256 | 257 | def tokenize(self, text): 258 | """Tokenizes a piece of text into its word pieces. 259 | 260 | This uses a greedy longest-match-first algorithm to perform tokenization 261 | using the given vocabulary. 262 | 263 | For example: 264 | input = "unaffable" 265 | output = ["un", "##aff", "##able"] 266 | 267 | Args: 268 | text: A single token or whitespace separated tokens. This should have 269 | already been passed through `BasicTokenizer. 270 | 271 | Returns: 272 | A list of wordpiece tokens. 273 | """ 274 | 275 | text = convert_to_unicode(text) 276 | 277 | output_tokens = [] 278 | for token in whitespace_tokenize(text): 279 | chars = list(token) 280 | if len(chars) > self.max_input_chars_per_word: 281 | output_tokens.append(self.unk_token) 282 | continue 283 | 284 | is_bad = False 285 | start = 0 286 | sub_tokens = [] 287 | while start < len(chars): 288 | end = len(chars) 289 | cur_substr = None 290 | while start < end: 291 | substr = "".join(chars[start:end]) 292 | if start > 0: 293 | substr = "##" + substr 294 | if substr in self.vocab: 295 | cur_substr = substr 296 | break 297 | end -= 1 298 | if cur_substr is None: 299 | is_bad = True 300 | break 301 | sub_tokens.append(cur_substr) 302 | start = end 303 | 304 | if is_bad: 305 | output_tokens.append(self.unk_token) 306 | else: 307 | output_tokens.extend(sub_tokens) 308 | return output_tokens 309 | 310 | 311 | def _is_whitespace(char): 312 | """Checks whether `chars` is a whitespace character.""" 313 | # \t, \n, and \r are technically contorl characters but we treat them 314 | # as whitespace since they are generally considered as such. 315 | if char == " " or char == "\t" or char == "\n" or char == "\r": 316 | return True 317 | cat = unicodedata.category(char) 318 | if cat == "Zs": 319 | return True 320 | return False 321 | 322 | 323 | def _is_control(char): 324 | """Checks whether `chars` is a control character.""" 325 | # These are technically control characters but we count them as whitespace 326 | # characters. 327 | if char == "\t" or char == "\n" or char == "\r": 328 | return False 329 | cat = unicodedata.category(char) 330 | if cat.startswith("C"): 331 | return True 332 | return False 333 | 334 | 335 | def _is_punctuation(char): 336 | """Checks whether `chars` is a punctuation character.""" 337 | cp = ord(char) 338 | # We treat all non-letter/number ASCII as punctuation. 
339 | # Characters such as "^", "$", and "`" are not in the Unicode 340 | # Punctuation class but we treat them as punctuation anyways, for 341 | # consistency. 342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 344 | return True 345 | cat = unicodedata.category(char) 346 | if cat.startswith("P"): 347 | return True 348 | return False 349 | -------------------------------------------------------------------------------- /BERT/train_model_VC.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | #tensorflow 3 | # 1.15.0 4 | import os 5 | import sys 6 | import json 7 | import datetime 8 | import pprint 9 | import os 10 | import tensorflow as tf 11 | #pip install tensorflow==1.15 12 | config = tf.ConfigProto() 13 | 14 | 15 | #Fine-tuning with Cloud TPUs 16 | #https://github.com/google-research/bert 17 | # for the use TPU with colab for fast training and infernce 18 | # If you want to use TPU, first switch to tpu runtime in colab 19 | USE_TPU = False 20 | 21 | 22 | #https://github.com/google-research/bert#pre-trained-models 23 | # We will use base uncased bert model 24 | 25 | ## 12-layer, 768-hidden, 12-heads, 110M parameters 26 | BERT_MODEL = 'uncased_L-12_H-768_A-12' 27 | ## 12-layer, 768-hidden, 12-heads, 110M parameters 28 | #BERT_MODEL = 'uncased_L-24_H-1024_A-16' 29 | 30 | 31 | ## BERT checkpoint bucket 32 | ## 12-layer, 768-hidden, 12-heads, 110M parameters 33 | BERT_PRETRAINED_DIR = 'uncased_L-12_H-768_A-12' 34 | 35 | 36 | print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR)) 37 | ## uncased_L-12_H-768_A-12 (directory) 38 | # bert_model.ckpt.data-00000-of-00001 39 | # bert_model.ckpt.meta 40 | 41 | # output file 42 | #OUTPUT_DIR = '/home/asabir/Desktop/model_repo/outputs' 43 | OUTPUT_DIR ='outputs' 44 | #print(f'***** Model output directory: {OUTPUT_DIR} *****') 45 | print('***** Model output directory: {OUTPUT_DIR} *****') 46 | #print(f'***** BERT pretrained directory: {BERT_PRETRAINED_DIR} *****') 47 | print('***** BERT pretrained directory: {BERT_PRETRAINED_DIR} *****') 48 | 49 | 50 | print('***** Model output directory: {} *****'.format(OUTPUT_DIR)) 51 | 52 | 53 | #TASK_DATA_DIR = 'data/visual-caption' 54 | if not 'bert' in sys.path: 55 | sys.path += ['bert'] 56 | 57 | TASK_DATA_DIR = '/data/' 58 | # ## Model Configs and Hyper Parameters 59 | 60 | import modeling 61 | import optimization 62 | import tokenization 63 | import run_classifier 64 | 65 | # Model Hyper Parameters 66 | #TRAIN_BATCH_SIZE = 32 # For GPU, reduce to 16 67 | TRAIN_BATCH_SIZE = 16 # 68 | EVAL_BATCH_SIZE = 8 69 | PREDICT_BATCH_SIZE = 8 70 | LEARNING_RATE = 2e-5 71 | #NUM_TRAIN_EPOCHS = 2.0 72 | NUM_TRAIN_EPOCHS = 1.0 73 | WARMUP_PROPORTION = 0.1 74 | MAX_SEQ_LENGTH = 30 75 | 76 | # Model configs 77 | SAVE_CHECKPOINTS_STEPS = 1000 78 | ITERATIONS_PER_LOOP = 1000 79 | NUM_TPU_CORES = 8 80 | VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt') 81 | CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json') 82 | INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt') 83 | DO_LOWER_CASE = BERT_MODEL.startswith('uncased') 84 | 85 | 86 | # ## Read visual caption Pairs 87 | # Read data from TSV file and covert to list of InputExample. 
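# Expected input layout (based on the provided data files): train.tsv / dev.tsv carry six
# tab-separated columns -- id, id1, id2, visual, caption, is_related -- so the processor
# below uses line[3] as text_a (the visual context), line[4] as text_b (the caption) and
# int(line[5]) as the 0/1 relatedness label. test.tsv carries only id, visual, caption,
# so text_a = line[1], text_b = line[2], and a dummy label of 0 is used at prediction time.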
88 | #to [run_classifier](https://github.com/google-research/bert/blob/master/run_classifier.py) file 89 | 90 | 91 | class VCProcessor(run_classifier.DataProcessor): 92 | """Processor for the visual caption pair data set.""" 93 | 94 | def get_train_examples(self, data_dir): 95 | """Reading train.tsv and converting to list of InputExample""" 96 | return self._create_examples( 97 | self._read_tsv(os.path.join(data_dir,"train.tsv")), 'train') 98 | 99 | def get_dev_examples(self, data_dir): 100 | """Reading dev.tsv and converting to list of InputExample""" 101 | return self._create_examples( 102 | self._read_tsv(os.path.join(data_dir,"dev.tsv")), 'dev') 103 | 104 | def get_test_examples(self, data_dir): 105 | """Reading train.tsv and converting to list of InputExample""" 106 | return self._create_examples( 107 | self._read_tsv(os.path.join(data_dir,"test.tsv")), 'test') 108 | 109 | def get_predict_examples(self, sentence_pairs): 110 | """Given visual caption pairs, conevrting to list of InputExample""" 111 | examples = [] 112 | for (i, vcpair) in enumerate(sentence_pairs): 113 | guid = "predict-%d" % (i) 114 | # converting input text to utf-8 and creating InputExamples 115 | text_a = tokenization.convert_to_unicode(vcpair[0]) 116 | text_b = tokenization.convert_to_unicode(vcpair[1]) 117 | # We will add label as 0, because None is not supported in converting to features 118 | examples.append( 119 | run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=0)) 120 | return examples 121 | 122 | def _create_examples(self, lines, set_type): 123 | """Creates examples for the training, dev and test sets.""" 124 | examples = [] 125 | for (i, line) in enumerate(lines): 126 | guid = "%s-%d" % (set_type, i) 127 | if set_type=='test': 128 | # removing header and invalid data 129 | if i == 0 or len(line)!=3: 130 | print(guid, line) 131 | continue 132 | text_a = tokenization.convert_to_unicode(line[1]) 133 | text_b = tokenization.convert_to_unicode(line[2]) 134 | label = 0 # We will use zero for test as convert_example_to_features doesn't support None 135 | else: 136 | # removing header and invalid data 137 | if i == 0 or len(line)!=6: 138 | continue 139 | text_a = tokenization.convert_to_unicode(line[3]) 140 | text_b = tokenization.convert_to_unicode(line[4]) 141 | label = int(line[5]) 142 | examples.append( 143 | run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) 144 | return examples 145 | 146 | def get_labels(self): 147 | "return class labels" 148 | return [0,1] 149 | 150 | 151 | # initialiation an instance of visual-caption VCProcessor and tokenizer 152 | processor = VCProcessor() 153 | label_list = processor.get_labels() 154 | tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE) 155 | 156 | 157 | # Converting training examples to features 158 | print("---------------- Processing Training Data ------------------") 159 | TRAIN_TF_RECORD = os.path.join(OUTPUT_DIR, "train.tf_record") 160 | train_examples = processor.get_train_examples(TASK_DATA_DIR) 161 | num_train_examples = len(train_examples) 162 | num_train_steps = int( num_train_examples / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS) 163 | num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION) 164 | run_classifier.file_based_convert_examples_to_features(train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, TRAIN_TF_RECORD) 165 | 166 | 167 | # ## Creating Classification Model 168 | 169 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 
170 | labels, num_labels, use_one_hot_embeddings): 171 | """Creates a classification model.""" 172 | # Bert Model instant 173 | model = modeling.BertModel( 174 | config=bert_config, 175 | is_training=is_training, 176 | input_ids=input_ids, 177 | input_mask=input_mask, 178 | token_type_ids=segment_ids, 179 | use_one_hot_embeddings=use_one_hot_embeddings) 180 | 181 | # Getting output for last layer of BERT 182 | output_layer = model.get_pooled_output() 183 | 184 | # Number of outputs for last layer 185 | hidden_size = output_layer.shape[-1].value 186 | 187 | # We will use one layer on top of BERT pretrained for creating classification model 188 | output_weights = tf.get_variable( 189 | "output_weights", [num_labels, hidden_size], 190 | initializer=tf.truncated_normal_initializer(stddev=0.02)) 191 | 192 | output_bias = tf.get_variable( 193 | "output_bias", [num_labels], initializer=tf.zeros_initializer()) 194 | 195 | with tf.variable_scope("loss"): 196 | if is_training: 197 | # 0.1 dropout 198 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) 199 | 200 | # Calcaulte prediction probabilites and loss 201 | logits = tf.matmul(output_layer, output_weights, transpose_b=True) 202 | logits = tf.nn.bias_add(logits, output_bias) 203 | probabilities = tf.nn.softmax(logits, axis=-1) 204 | log_probs = tf.nn.log_softmax(logits, axis=-1) 205 | 206 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) 207 | 208 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) 209 | loss = tf.reduce_mean(per_example_loss) 210 | 211 | return (loss, per_example_loss, logits, probabilities) 212 | 213 | 214 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, 215 | num_train_steps, num_warmup_steps, use_tpu, 216 | use_one_hot_embeddings): 217 | """Returns `model_fn` closure for TPUEstimator.""" 218 | 219 | def model_fn(features, labels, mode, params): 220 | """The `model_fn` for TPUEstimator.""" 221 | 222 | # reading features input 223 | input_ids = features["input_ids"] 224 | input_mask = features["input_mask"] 225 | segment_ids = features["segment_ids"] 226 | label_ids = features["label_ids"] 227 | is_real_example = None 228 | if "is_real_example" in features: 229 | is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) 230 | else: 231 | is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) 232 | 233 | # checking if training mode 234 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 235 | 236 | # create simple classification model 237 | (total_loss, per_example_loss, logits, probabilities) = create_model( 238 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, 239 | num_labels, use_one_hot_embeddings) 240 | 241 | # getting variables for intialization and using pretrained init checkpoint 242 | tvars = tf.trainable_variables() 243 | initialized_variable_names = {} 244 | scaffold_fn = None 245 | if init_checkpoint: 246 | (assignment_map, initialized_variable_names 247 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) 248 | if use_tpu: 249 | 250 | def tpu_scaffold(): 251 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 252 | return tf.train.Scaffold() 253 | 254 | scaffold_fn = tpu_scaffold 255 | else: 256 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 257 | 258 | output_spec = None 259 | if mode == tf.estimator.ModeKeys.TRAIN: 260 | # defining optimizar function 261 | train_op = optimization.create_optimizer( 262 | total_loss, 
learning_rate, num_train_steps, num_warmup_steps, use_tpu) 263 | 264 | # Training estimator spec 265 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 266 | mode=mode, 267 | loss=total_loss, 268 | train_op=train_op, 269 | scaffold_fn=scaffold_fn) 270 | elif mode == tf.estimator.ModeKeys.EVAL: 271 | # accuracy, loss, auc, F1, precision and recall metrics for evaluation 272 | def metric_fn(per_example_loss, label_ids, logits, is_real_example): 273 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) 274 | loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) 275 | accuracy = tf.metrics.accuracy( 276 | labels=label_ids, predictions=predictions, weights=is_real_example) 277 | f1_score = tf.contrib.metrics.f1_score( 278 | label_ids, 279 | predictions) 280 | auc = tf.metrics.auc( 281 | label_ids, 282 | predictions) 283 | recall = tf.metrics.recall( 284 | label_ids, 285 | predictions) 286 | precision = tf.metrics.precision( 287 | label_ids, 288 | predictions) 289 | return { 290 | "eval_accuracy": accuracy, 291 | "eval_loss": loss, 292 | "f1_score": f1_score, 293 | "auc": auc, 294 | "precision": precision, 295 | "recall": recall 296 | } 297 | 298 | eval_metrics = (metric_fn, 299 | [per_example_loss, label_ids, logits, is_real_example]) 300 | # estimator spec for evalaution 301 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 302 | mode=mode, 303 | loss=total_loss, 304 | eval_metrics=eval_metrics, 305 | scaffold_fn=scaffold_fn) 306 | else: 307 | # estimator spec for predictions 308 | output_spec = tf.contrib.tpu.TPUEstimatorSpec( 309 | mode=mode, 310 | predictions={"probabilities": probabilities}, 311 | scaffold_fn=scaffold_fn) 312 | return output_spec 313 | 314 | return model_fn 315 | 316 | 317 | # Define TPU configs 318 | if USE_TPU: 319 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS) 320 | else: 321 | tpu_cluster_resolver = None 322 | run_config = tf.contrib.tpu.RunConfig( 323 | cluster=tpu_cluster_resolver, 324 | model_dir=OUTPUT_DIR, 325 | save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS, 326 | tpu_config=tf.contrib.tpu.TPUConfig( 327 | iterations_per_loop=ITERATIONS_PER_LOOP, 328 | num_shards=NUM_TPU_CORES, 329 | per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)) 330 | 331 | 332 | # create model function for estimator using model function builder 333 | model_fn = model_fn_builder( 334 | bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE), 335 | num_labels=len(label_list), 336 | init_checkpoint=INIT_CHECKPOINT, 337 | learning_rate=LEARNING_RATE, 338 | num_train_steps=num_train_steps, 339 | num_warmup_steps=num_warmup_steps, 340 | use_tpu=USE_TPU, 341 | use_one_hot_embeddings=True) 342 | 343 | 344 | 345 | # Defining TPU Estimator 346 | estimator = tf.contrib.tpu.TPUEstimator( 347 | use_tpu=USE_TPU, 348 | model_fn=model_fn, 349 | config=run_config, 350 | train_batch_size=TRAIN_BATCH_SIZE, 351 | eval_batch_size=EVAL_BATCH_SIZE, 352 | predict_batch_size=PREDICT_BATCH_SIZE) 353 | 354 | 355 | 356 | # Train the model. 357 | #print('VCS on BERT base model normally takes about 1 hour on TPU and 15-20 hours on GPU. 
Please wait...') 358 | print('***** Started training at {} *****'.format(datetime.datetime.now())) 359 | print(' Num examples = {}'.format(num_train_examples)) 360 | print(' Batch size = {}'.format(TRAIN_BATCH_SIZE)) 361 | tf.logging.info(" Num steps = %d", num_train_steps) 362 | # we are using `file_based_input_fn_builder` for creating input function from TF_RECORD file 363 | train_input_fn = run_classifier.file_based_input_fn_builder(TRAIN_TF_RECORD, 364 | seq_length=MAX_SEQ_LENGTH, 365 | is_training=True, 366 | drop_remainder=True) 367 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) 368 | print('***** Finished training at {} *****'.format(datetime.datetime.now())) 369 | 370 | 371 | ## Evalute FineTuned model 372 | 373 | 374 | # eval the model on train set. 375 | print('***** Started Train Set evaluation at {} *****'.format(datetime.datetime.now())) 376 | print(' Num examples = {}'.format(num_train_examples)) 377 | print(' Batch size = {}'.format(EVAL_BATCH_SIZE)) 378 | # eval input function for train set 379 | train_eval_input_fn = run_classifier.file_based_input_fn_builder(TRAIN_TF_RECORD, 380 | seq_length=MAX_SEQ_LENGTH, 381 | is_training=False, 382 | drop_remainder=True) 383 | # evalute on train set 384 | result = estimator.evaluate(input_fn=train_eval_input_fn, 385 | steps=int(num_train_examples/EVAL_BATCH_SIZE)) 386 | print('***** Finished evaluation at {} *****'.format(datetime.datetime.now())) 387 | print("***** Eval results *****") 388 | for key in sorted(result.keys()): 389 | print(' {} = {}'.format(key, str(result[key]))) 390 | 391 | 392 | 393 | # Converting eval examples to features 394 | print("--------------- Processing Dev Data ------------------") 395 | EVAL_TF_RECORD = os.path.join(OUTPUT_DIR, "eval.tf_record") 396 | eval_examples = processor.get_dev_examples(TASK_DATA_DIR) 397 | num_eval_examples = len(eval_examples) 398 | run_classifier.file_based_convert_examples_to_features(eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, EVAL_TF_RECORD) 399 | 400 | 401 | # Eval the model on Dev set. 
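# Note: the eval input functions below are built with drop_remainder=True and the estimator
# runs for int(num_examples / EVAL_BATCH_SIZE) steps, so any trailing examples that do not
# fill a complete batch of EVAL_BATCH_SIZE are skipped during evaluation.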
402 | print('***** Started Dev Set evaluation at {} *****'.format(datetime.datetime.now())) 403 | print(' Num examples = {}'.format(num_eval_examples)) 404 | print(' Batch size = {}'.format(EVAL_BATCH_SIZE)) 405 | 406 | # eval input function for dev set 407 | eval_input_fn = run_classifier.file_based_input_fn_builder(EVAL_TF_RECORD, 408 | seq_length=MAX_SEQ_LENGTH, 409 | is_training=False, 410 | drop_remainder=True) 411 | # evalute on dev set 412 | result = estimator.evaluate(input_fn=eval_input_fn, steps=int(num_eval_examples/EVAL_BATCH_SIZE)) 413 | print('***** Finished evaluation at {} *****'.format(datetime.datetime.now())) 414 | print("***** Eval results *****") 415 | for key in sorted(result.keys()): 416 | print(' {} = {}'.format(key, str(result[key]))) 417 | 418 | 419 | # examples sentences, feel free to change and try 420 | sent_pairs = [("apple", "a display of apple and orange at market"), ("apple","a fruit market with apples and orange"), 421 | ("apple","a fruit stand with apples and oranges")] 422 | 423 | 424 | print("----------- Predictions on Custom Data -------------------") 425 | # create `InputExample` for custom examples 426 | predict_examples = processor.get_predict_examples(sent_pairs) 427 | num_predict_examples = len(predict_examples) 428 | 429 | # For TPU, We will append `PaddingExample` for maintaining batch size 430 | if USE_TPU: 431 | while(len(predict_examples)%EVAL_BATCH_SIZE!=0): 432 | predict_examples.append(run_classifier.PaddingInputExample()) 433 | 434 | # Converting to features 435 | predict_features = run_classifier.convert_examples_to_features(predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer) 436 | 437 | print(' Num examples = {}'.format(num_predict_examples)) 438 | print(' Batch size = {}'.format(PREDICT_BATCH_SIZE)) 439 | 440 | # Input function for prediction 441 | predict_input_fn = run_classifier.input_fn_builder(predict_features, 442 | seq_length=MAX_SEQ_LENGTH, 443 | is_training=False, 444 | drop_remainder=False) 445 | result = list(estimator.predict(input_fn=predict_input_fn)) 446 | print(result) 447 | for ex_i in range(num_predict_examples): 448 | print("****** Example {} ******".format(ex_i)) 449 | print("visual :", sent_pairs[ex_i][0]) 450 | print("caption :", sent_pairs[ex_i][1]) 451 | print("Prediction :", result[ex_i]['probabilities'][1]) 452 | 453 | 454 | 455 | ################################################# Test ################################################### 456 | 457 | # Converting test examples to features 458 | print("--------------------- Processing Test Data -------------------") 459 | TEST_TF_RECORD = os.path.join(OUTPUT_DIR, "test.tf_record") 460 | test_examples = processor.get_test_examples(TASK_DATA_DIR) 461 | num_test_examples = len(test_examples) 462 | run_classifier.file_based_convert_examples_to_features(test_examples, label_list, MAX_SEQ_LENGTH, tokenizer, TEST_TF_RECORD) 463 | 464 | 465 | # Predictions on test set. 
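# Note: estimator.predict() yields one dict per test example whose "probabilities" entry is
# the softmax over the two classes from get_labels() ([0, 1]); probabilities[1] is the
# visual-caption relatedness score that gets written to outputs/test_score.txt below.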
466 | print('***** Started Prediction at {} *****'.format(datetime.datetime.now())) 467 | print(' Num examples = {}'.format(num_test_examples)) 468 | print(' Batch size = {}'.format(PREDICT_BATCH_SIZE)) 469 | # predict input function for test set 470 | test_input_fn = run_classifier.file_based_input_fn_builder(TEST_TF_RECORD, 471 | seq_length=MAX_SEQ_LENGTH, 472 | is_training=False, 473 | drop_remainder=True) 474 | tf.logging.set_verbosity(tf.logging.ERROR) 475 | # predict on test set 476 | result = list(estimator.predict(input_fn=test_input_fn)) 477 | print('***** Finished Prediction at {} *****'.format(datetime.datetime.now())) 478 | 479 | # saving test predictions 480 | output_test_file = os.path.join(OUTPUT_DIR, "test_score.txt") 481 | with tf.gfile.GFile(output_test_file, "w") as writer: 482 | for (example_i, predictions_i) in enumerate(result): 483 | writer.write("%s , %s\n" % (test_examples[example_i].guid, str(predictions_i['probabilities'][1]))) 484 | 485 | -------------------------------------------------------------------------------- /BERT/uncased_L-12_H-768_A-12/file-should be here.txt: -------------------------------------------------------------------------------- 1 | Download this from bert website 2 | 3 | -------------------------------------------------------------------------------- /COCO_train2014_000000000009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/COCO_train2014_000000000009.jpg -------------------------------------------------------------------------------- /COCO_val2014_000000000042.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/COCO_val2014_000000000042.jpg -------------------------------------------------------------------------------- /Evaluation/captions_val2014.json.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/Evaluation/captions_val2014.json.zip -------------------------------------------------------------------------------- /Evaluation/coco_eval.py: -------------------------------------------------------------------------------- 1 | from pycocotools.coco import COCO 2 | from pycocoevalcap.eval import COCOEvalCap 3 | import sys 4 | import argparse 5 | 6 | 7 | 8 | parser=argparse.ArgumentParser() 9 | parser.add_argument('--f', default='', help='', type=str,required=True) 10 | args = parser.parse_args() 11 | 12 | 13 | annotation_file = 'captions_val2014.json' 14 | results_file = args.f 15 | 16 | # create coco object and coco_result object 17 | coco = COCO(annotation_file) 18 | coco_result = coco.loadRes(results_file) 19 | 20 | # create coco_eval object by taking coco and coco_result 21 | coco_eval = COCOEvalCap(coco, coco_result) 22 | 23 | # evaluate on a subset of images by setting 24 | # coco_eval.params['image_id'] = coco_result.getImgIds() 25 | # please remove this line when evaluating the full validation set 26 | coco_eval.params['image_id'] = coco_result.getImgIds() 27 | 28 | # evaluate results 29 | # SPICE will take a few minutes the first time, but speeds up due to caching 30 | coco_eval.evaluate() 31 | 32 | # print output evaluation scores 33 | for metric, score in 
coco_eval.eval.items(): 34 | print(f'{metric}: {score:.3f}') 35 | -------------------------------------------------------------------------------- /LRCE_figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/LRCE_figure_1.png -------------------------------------------------------------------------------- /Pre-trained.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/Pre-trained.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Visual Semantic Relatedness Dataset for Image Captioning 2 | 3 | 12 | 13 | 14 | 15 | Modern image captioning relies heavily on extracting knowledge, from images such as objects, to capture the concept of a static story in the image. 16 | In this paper, we propose a textual visual context dataset for image captioning, where the publicly available dataset COCO Captions [(Lin et al., 2014)](https://arxiv.org/pdf/1405.0312.pdf) has been extended with information about the scene (such as objects in the image). Since this information has textual form, it can be used to leverage any NLP task, such as text similarity or semantic relation methods, into captioning systems, either as an end-to-end training strategy or a post-processing based approach. 17 | 18 | 19 | 20 | This repository contains the implementation of the paper [Visual Semantic Relatedness Dataset for Image Captioning](https://arxiv.org/abs/2301.08784). 21 | 22 | [![arXiv](https://img.shields.io/badge/arXiv-2301.08784-b31b1b.svg)](https://arxiv.org/abs/2301.08784) [![Website shields.io](https://img.shields.io/website-up-down-green-red/http/shields.io.svg)](https://ahmed.jp/project_page/Dataset_2022/index.html) 23 | [![huggingface](https://img.shields.io/badge/%F0%9F%A4%97-huggingface-yellow)](https://huggingface.co/datasets/AhmedSSabir/Textual-Image-Caption-Dataset) 24 | [![O-DRUM - poster](https://img.shields.io/badge/O--DRUM-poster-0065BD)](https://ahmed.jp/project_page/Dataset_2022/poster_20.pdf) 25 | [![O-DRUM - slide](https://img.shields.io/badge/O--DRUM-slide-0065BD)](https://ahmed.jp/project_page/Dataset_2022/spotlight_ppt_ID_20.pdf) 26 | 27 | 28 | ## News 29 | Add v2 with recent SoTA model swinV2 classifier for both soft/hard-label visual_caption_cosine_score_v2 with person label (0.2, 0.3 and 0.4). Please refer to huggingface repository. 30 | 31 | ## Contents 32 | 0. [Overview](#overview) 33 | 1. [Visual semantic with BERT ](#Visual-semantic-with-BERT-CNN) 34 | 2. [Dataset](#dataset) 35 | 3. [Visual semantic with pre-trained model](#Visual-semantic-with-pre-trained-model) 36 | 4. [Evaluation](#evaluation) 37 | 5. [Citation](#Citation) 38 | 39 | 40 | ## Overview 41 | 42 | 43 | We enrich COCO-Captions with **Textual Visual Context** information. We use [ResNet152](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf), [CLIP](https://github.com/openai/CLIP) and [Faster R-CNN](https://github.com/tensorflow/models/tree/master/research/object_detection) to extract 44 | object information for each COCO-caption image. 
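As a rough illustration of this extraction step (a minimal sketch only, not the exact pipeline used to build the dataset; it assumes `torchvision` is installed and that an `imagenet_classes.txt` file with the 1000 ImageNet class names is available), an ImageNet-pretrained ResNet152 can be run over each image and its top-k labels kept, together with their confidences, as the textual visual context:

```python
import torch
from PIL import Image
from torchvision import models, transforms

# Standard ImageNet preprocessing for ResNet152.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

model = models.resnet152(pretrained=True).eval()

# Assumption: imagenet_classes.txt holds the 1000 ImageNet class names, one per line.
with open("imagenet_classes.txt") as f:
    classes = [line.strip() for line in f]

image = Image.open("COCO_val2014_000000000042.jpg").convert("RGB")
batch = preprocess(image).unsqueeze(0)

with torch.no_grad():
    probs = torch.nn.functional.softmax(model(batch)[0], dim=0)

# Keep the top-3 labels and their confidences as the textual visual context.
top_prob, top_idx = probs.topk(3)
for p, i in zip(top_prob, top_idx):
    print(classes[int(i)], float(p))
```

The same loop can be pointed at a CLIP or Faster R-CNN backbone to obtain the other visual-context sources mentioned above.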
We use three filter approaches to ensure the quality of the dataset: (1) Threshold: we filter out predictions where the object classifier is not confident enough; (2) semantic alignment: we use semantic similarity to remove duplicated objects; and (3) semantic relatedness score as Soft-Label: to guarantee that the visual context and the caption are strongly related, we use [Sentence RoBERTa-sts](https://www.sbert.net) to assign a soft label via cosine similarity, and then apply a **th**reshold to annotate the final label (if th ≥ 0.2, 0.3, 0.4 then [1,0]). Finally, to take advantage of the overlap between the visual context and the caption, and to extract global information from each visual context, we use BERT followed by a shallow CNN [(Kim, 2014)](https://arxiv.org/pdf/1408.5882.pdf) to estimate the visual relatedness score. 45 | 46 | 47 | 48 | ## Quick Start 49 | For a quick start, please have a look at this [project page](https://sabirdvd.github.io/project_page/Dataset_2022/index.html) 50 | and the [Demo](https://github.com/ahmedssabir/Textual-Visual-Semantic-Dataset/blob/main/BERT_CNN_Visual_re_ranker_demo.ipynb) 51 | 52 | 55 | ## Dataset 56 | 57 | ### Sample 58 | 59 | | VC1 | VC2 | VC3 | human annotated caption | 60 | | ------------- | ------------- |------------- | ------------- | 61 | | cheeseburger | plate | hotdog | a plate with a hamburger fries and tomatoes | 62 | | bakery | dining table | website | a table having tea and a cake on it | 63 | | gown | groom | apron | its time to cut the cake at this couples wedding | 64 | 65 | 66 | ### Download 67 | 68 | 0. [Download Raw data with ID and Visual context](https://www.dropbox.com/s/xuov24on8477zg8/All_Caption_ID.csv?dl=0) -> the original dataset with the related caption IDs from [train2014](https://cocodataset.org/#download) 69 | 1. [Download Data with cosine score](https://www.dropbox.com/s/55sit8ow9tems4u/visual_caption_cosine_score.zip?dl=0) -> soft cosine label with **th** 0.2, 0.3, 0.4 and 0.5, plus the hard label 70 | 2. [Download Overlapping visual with caption](https://www.dropbox.com/s/br8nhnlf4k2czo8/COCO_overlaping_dataset.txt?dl=0) -> overlap between the visual context and the human annotated caption 71 | 3. [Download Dataset (tsv file)](https://www.dropbox.com/s/dh38xibtjpohbeg/train_all.zip?dl=0) 0.0 -> raw data with hard label, without cosine similarity, and with **th**reshold cosine sim (degree of the relation between the visual and caption) = 0.2, 0.3, 0.4 72 | 4. [Download Dataset GenderBias](https://www.dropbox.com/s/1wki0b0d21078mj/gender%20natural.zip?dl=0) -> man/woman labels replaced with the person class label 73 | 74 | 75 | ## Visual semantic with BERT-CNN 76 | Fine-tune [BERT](https://github.com/google-research/bert) on the created dataset.
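As a schematic sketch only (the actual model is built in `BERT-CNN/BERT_CNN.py`; the dense layer size and other details here are assumptions), the idea is to feed the BERT token embeddings of a visual-context/caption pair into a shallow Kim (2014)-style 1D CNN that outputs a relatedness score. The filter count, kernel size and sequence length match the default training arguments listed further below:

```python
import tensorflow as tf
from tensorflow import keras

SEQ_LEN, BERT_DIM = 64, 768  # --seq_len default and BERT-base hidden size

# Per-token BERT embeddings for one "visual context / caption" pair.
token_embeddings = keras.layers.Input(shape=(SEQ_LEN, BERT_DIM), name="bert_sequence_output")

# Shallow Kim (2014)-style CNN head: Conv1D -> global max pooling -> dense -> sigmoid score.
x = keras.layers.Conv1D(filters=32, kernel_size=3, activation="relu")(token_embeddings)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dense(64, activation="relu")(x)  # hidden size is an assumption
score = keras.layers.Dense(1, activation="sigmoid", name="relatedness_score")(x)

model = keras.Model(inputs=token_embeddings, outputs=score)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()
```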
77 | 78 | ### Requirements 79 | - Tensorflow 1.15.0 80 | - Python 3.6 81 | 82 | ``` 83 | conda create -n BERT_visual python=3.6 anaconda 84 | conda activate BERT_visual 85 | pip install tensorflow==1.15.0 86 | pip install --upgrade tensorflow_hub==0.7.0 87 | ``` 88 | 89 | Download BERT check point [uncased_L-12_H-768_A-12](https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1) 90 | ``` 91 | wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip 92 | unzip uncased_L-12_H-768_A-12.zip 93 | git clone https://github.com/gaphex/bert_experimental/ 94 | ``` 95 | like this ```BERT-CNN/uncased_L-12_H-768_A-12 ``` and ```BERT-CNN/bert_experimental ``` 96 | 97 | Download dataset 98 | 99 | ``` 100 | wget https://www.dropbox.com/s/dh38xibtjpohbeg/train_all.zip 101 | unzip train_all.zip 102 | ``` 103 | 104 | for Training 105 | 106 | ``` 107 | parser.add_argument('--train', default='train.tsv', help='beam serach', type=str,required=False) 108 | parser.add_argument('--num_bert_layer', default='12', help='truned layers', type=int,required=False) 109 | parser.add_argument('--batch_size', default='128', help='truned layers', type=int,required=False) 110 | parser.add_argument('--epochs', default='5', help='', type=int,required=False) 111 | parser.add_argument('--seq_len', default='64', help='', type=int,required=False) 112 | parser.add_argument('--CNN_kernel_size', default='3', help='', type=int,required=False) 113 | parser.add_argument('--CNN_filters', default='32', help='', type=int,required=False) 114 | ``` 115 | 116 | ``` 117 | python BERT_CNN.py --train /train_0.4.tsv --epochs 5 118 | ``` 119 | 120 | for inference only, download pre-trained model 121 | 122 | ``` 123 | wget https://www.dropbox.com/s/ip7p0wiwkwvph5k/0.4_bert-cnn.zip 124 | unzip 0.4_bert-cnn.zip 125 | ``` 126 | 127 | ``` 128 | python eval.py --testset test_demo.tsv --model 0.4_bert-cnn/frozen_graph.pb 129 | ``` 130 | ### Example 131 | 132 | Re-rank the most related caption to the image using the visual context information. 133 | 134 | 135 | 136 | ``` 137 | visual information, candidate caption (beam search) 138 | standard poodle shopping cart footwear, a close up of shoes and a dog in a basket, 0.99774158 139 | standard poodle shopping cart footwear, a brown teddy bear laying on top of a pair of shoes, 0.0621758029 140 | ``` 141 | 142 | ## Visual semantic with pre-trained model 143 | 144 | 145 | 148 | 149 | 150 | 151 | 152 | 153 | Although this approach is proposed to take the advantage of the dataset (_e.g._ visual semantic model), we also investigate the use of out-of-the-box tools to estimate the relatedness score between the short text (_i.e._ caption) and its environmental visual context (we call it visual classifier). 154 | 155 | For this we follow similarity to probability based approach but 156 | 157 | we use only the cosine similarity from a pre-trained model and the top-3 averaged prob (confidence) from the object classifier as: 158 | 159 | 162 | 163 | $\text{P}(w \mid c)=\text{}sim(w,c)^{\text{P}(c)}$ 164 | where the main components of the visual semantics re-ranker: 165 | 168 | 1. Simialrity/relatedness between the caption and the object context $\text{}sim(w,c)$ 169 | 170 | 173 | 174 | 2. 
$\text{P}(c)$ is the classifier object confident in the image $\text{P}(w \mid \text{object})$ 175 | 176 | 177 | with Pre-trained [SBERT](https://www.sbert.net) 178 | 179 | ``` 180 | python model.py --vis visual-context_label.txt --vis_prob visual-context_prob.txt --c caption.txt 181 | ``` 182 | Please refer to this [repository](https://github.com/ahmedssabir/Belief-Revision-Score) for more information about pre-trained visual re-ranker [probability from similarity](https://cdn.aaai.org/Symposia/Spring/2003/SS-03-05/SS03-05-005.pdf) 183 | 184 | ## Evaluation 185 | 186 | [Download pycocoevalcap](https://github.com/salaniz/pycocoevalcap) 187 | 188 | ``` 189 | pip install pycocoevalcap 190 | ``` 191 | 192 | Then run 193 | ``` 194 | python Evaluation/coco_eval.py --f Result_tune_BERT_0.4.json 195 | ``` 196 | For more evaluation ([Lexical and Semantic Diversity](https://github.com/ahmedssabir/Belief-Revision-Score/tree/main/SBERT-caption-eval)) 197 | 213 | 214 | 215 | ## Citation 216 | 217 | The details of this repo are described in the following paper. If you find this repo useful, please kindly cite it: 218 | 219 | ```bibtex 220 | @article{sabir2023visual, 221 | title={Visual Semantic Relatedness Dataset for Image Captioning}, 222 | author={Sabir, Ahmed and Moreno-Noguer, Francesc and Padr{\'o}, Llu{\'\i}s}, 223 | journal={arXiv preprint arXiv:2301.08784}, 224 | year={2023} 225 | } 226 | ``` 227 | 228 | -------------------------------------------------------------------------------- /approch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/approch.png -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | train.tsv file here 2 | -------------------------------------------------------------------------------- /data/test.tsv: -------------------------------------------------------------------------------- 1 | id visual caption 2 | 0 standard poodle shopping cart footwear a close up of a dog laying in a basket 3 | 1 street sign traffic light tower a black and white photo of a street light 4 | 2 toilet seat a white toilet with its seat up in a bathroom 5 | 3 mobile home studio couch house a living room filled with furniture and a coffee table 6 | 4 french loaf conch person a basket filled with sandwiches on top of a table 7 | 5 indian elephant a group of people riding on the back of an elephant 8 | 6 bow tie windsor glasses a man wearing glasses and a tie in a room 9 | 7 sombrero bonnet woman a woman standing in front of a giant cake 10 | 8 diaper bassinet human a baby sitting in front of a giant cake 11 | 9 bobsled go-kart human a group of children sitting around a piece of luggage 12 | 10 vase spotlight plant a bunch of flowers that are in a vase 13 | -------------------------------------------------------------------------------- /data/train.tsv: -------------------------------------------------------------------------------- 1 | id id1 id2 visual caption is_related 2 | 220740 220741 220742 marimba dalmatian picket fence a horse jumping competition is going on with people in the stands 1 3 | 385729 385730 385731 dishwasher microwave barber chair a person riding a horse on a dirt ground 0 4 | 59422 59423 59424 laptop carton comicbook a laptop that has stickers on its cover is sitting on a table 1 5 | 46638 46639 
46640 suit Windsortie woodenspoon a young bow wearing a pink shirt and a purple tie 1 6 | 11870 11871 11872 studiocouch four-poster quilt a couple of girls sitting in a bed in a bedroom 1 7 | 471676 471677 471678 streetcar fire engine passenger car a multi layer plate with cakes and food on it 0 8 | 186795 186796 186797 shoe shop television monitor a man playing a wii on a large projector screen 1 9 | 121836 121837 121838 ox water buffalo alp cattle standing on a hill in fog 1 10 | 396224 396225 396226 altar desk perfume oranges sitting in a blue bowl on a wooden table 0 11 | 430635 430636 430637 speedboat paddle lifeboat pots and other items sit on a stove and counter 0 12 | 145057 145058 145059 shopping cart ashcan park bench a coin meter that is laying down on grates 1 13 | 409778 409779 409780 web site fire engine comic book a painting of a man from the back 0 14 | 155568 155569 155570 grocery store patio restaurant a man and woman walking up the stairs in a backyard 1 15 | 213951 213952 213953 microwave washer dining table the kitchen is equipped with all the latest appliances 1 16 | 489266 489267 489268 traffic light aircraft carrier chain saw a laptop computer on a desk with cables a mug and bowl 0 17 | 257649 257650 257651 grocery store confectionery shopping basket a couple of wooden tale stopped with fresh fruit 1 18 | 113826 113827 113828 lab coat vestment West Highland white terrier a group of people standing in rows with frisbees for a photo 1 19 | 486413 486414 486415 snorkel ski tennis ball two frames of a woman in the air on a tennis court 0 20 | 400432 400433 400434 crutch lawn mower chain saw eight underneath on ambarella in the forest parrot 0 21 | 341153 341154 341155 washer microwave dishwasher a small propeller plane sitting underneath a covering at an airport 0 22 | 462067 462068 462069 ballplayer baseball scoreboard a plate full of bright green lettuce next to some bread 0 23 | 443392 443393 443394 grocery store pineapple pizza a man in black and white stripes with makeup smiling 0 24 | 486660 486661 486662 wombat wallaby titi a persons shadow on the ground of them skateboarding 0 25 | 336616 336617 336618 moped motor scooter crash helmet multiple street signs are attached to the post 0 26 | 124199 124200 124201 sorrel hog barrel a brown horse eating from a hallowed out metal barrel 1 27 | 238004 238005 238006 tray washbasin cradle a cat laying on a couch near a remote control 1 28 | 319195 319196 319197 airliner wing web site a propeller airplane parked inside and airplane hanger 1 29 | 412036 412037 412038 grey whale breakwater killer whale a stop sign is standing at a street intersection 0 30 | 491896 491897 491898 teddy wool toyshop a woman in an old-fashioned kitchen with pots and pans 0 31 | 487501 487502 487503 snowmobile steam locomotive tow truck the living room is clean and empty from people 0 32 | 277093 277094 277095 microwave dishwasher chest a chair holding a laptop that is facing towards an oven 1 33 | 135542 135543 135544 water buffalo warthog hog sheep grazing under a tree in a grassy meadow 1 34 | 8448 8449 8450 mountainbike unicycle bicycle-built-for-two a picture of a person throwing a frisbee 1 35 | 170686 170687 170688 police van minibus ambulance a person in the army greeting someone in a suit 1 36 | 372016 372017 372018 Great Dane Irish wolfhound English setter a man standing in a room holding a remote 0 37 | 351158 351159 351160 sunglass bullet train sunglasses a woman opening the trunk of her car 0 38 | 414542 414543 414544 killer whale 
great white shark paddle a dog running across a field with a frisbee in his mouth 0 39 | 264998 264999 265000 bannister ski unicycle a man riding a skateboard along a metal hand rail 1 40 | 362868 362869 362870 zebra bustard gazelle a basket full of bananas with a net on top 0 41 | 88455 88456 88457 patio flagpole pole a fire hydrant and fire hose in a houses front yard 1 42 | 372512 372513 372514 seashore catamaran swimming trunks a man riding a surfboard on a wave in the ocean 0 43 | 387327 387328 387329 cellular telephone lab coat cash machine a baseball game ensues as people watch 0 44 | 248027 248028 248029 web site barbershop cinema a motor bike on the side of the street 1 45 | 347507 347508 347509 banana pineapple orange a bear itching itself on a bare tree 0 46 | 33714 33715 33716 picketfence streetcar mountainbike the red bike and the pink bike just started dating 1 47 | 173989 173990 173991 umbrella poncho jinrikisha a group of people walking down a street carrying umbrellas 1 48 | 20835 20836 20837 ballplayer baseball footballhelmet a man throwing a baseball from a mound on a field 1 49 | 16356 16357 16358 lumbermill barbershop turnstile a man working on a baseball bat while two others watch 1 50 | 193491 193492 193493 unicycle pole horizontal bar boy riding on his skateboard down a stair rail 1 51 | 384165 384166 384167 mixing bowl corn meat loaf a couple of sailors standing next to a woman 0 52 | 321736 321737 321738 ballplayer baseball football helmet a boys baseball game with a batter catcher and umpire 1 53 | 108395 108396 108397 crash helmet moped backpack a man with a suit and tie on a motor bike 1 54 | 215942 215943 215944 unicycle military uniform bearskin four guys are sitting on a bench in front of a building 1 55 | 134156 134157 134158 wine bottle eggnog red wine there is a bottle of wine next to a glass 1 56 | 297783 297784 297785 necklace thimble corkscrew this is an image of a meal and an avocado is included 1 57 | 110516 110517 110518 minivan cab police van a dog looking ahead with a stoic look in a car seat 1 58 | 3166 3167 3168 grocerystore headcabbage cauliflower a pile of vegetables on display at a grocery store 1 59 | 440075 440076 440077 ski curly-coatedretriever Gordonsetter elephants and their young in their natural habitat 0 60 | 71021 71022 71023 ballplayer baseball puck a baseball player and a flying black bat 1 61 | -------------------------------------------------------------------------------- /dataset_v1-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/dataset_v1-1.png -------------------------------------------------------------------------------- /hist.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/hist.jpg -------------------------------------------------------------------------------- /main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/main.png -------------------------------------------------------------------------------- /overlap_text.py: -------------------------------------------------------------------------------- 1 | 2 | file1 = [] 3 | 4 | file2 = [] 5 | 6 | with 
open('train_visual.txt','rU') as f: 7 | for line in f: 8 | file1.append(line.rstrip()) 9 | 10 | 11 | with open('caption_anot.txt') as f1: 12 | for line1 in f1: 13 | file2.append(line1.rstrip()) 14 | #break 15 | 16 | f=open('intersection_caption_visual.txt', "w") 17 | for i in range(len(file1)): 18 | temp =[] 19 | messages = file1[i] 20 | messages1 = file2[i] 21 | 22 | words1 = messages.lower().split() 23 | words2 = messages1.lower().split() 24 | 25 | w = set(words1) & set(words2) 26 | 27 | 28 | #words1 = "This is a simple test of set intersection".lower().split() 29 | #words2 = "Intersection of sets is easy using Python".lower().split() 30 | 31 | 32 | temp.append(w) 33 | 34 | result= file1[i]+','+file2[i]+','+str(w) 35 | 36 | f.write(result) 37 | #f.write(result) 38 | f.write('\n') 39 | print(result) 40 | #del result 41 | #close.sess() 42 | 43 | f.close() 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/overview.png -------------------------------------------------------------------------------- /pre-trained/README.md: -------------------------------------------------------------------------------- 1 | Please refer to this [repository](https://github.com/ahmedssabir/Belief-Revision-Score) for more information about pre-trained visual re-ranker [probability from similarity](https://cdn.aaai.org/Symposia/Spring/2003/SS-03-05/SS03-05-005.pdf) 2 | -------------------------------------------------------------------------------- /pre-trained/Visual_re-rank_re-ranked_output.txt: -------------------------------------------------------------------------------- 1 | a man riding on the back of a motorcycle 0.8895639974564639 2 | a person riding a motorcycle on a city street 0.8699054868636436 3 | a person riding a motorcycle down a city street 0.8665321958170883 4 | a man riding on the back of a motorcycle down a street 0.8645537987336105 5 | a man riding a motorcycle down a street 0.8582269252364088 6 | a man riding on the back of a motorcycle down a sidewalk 0.8581149928539996 7 | a man riding a motorcycle down the street 0.8569102761752505 8 | a man riding a motorcycle on a city street 0.85454545827468 9 | a man riding a motorcycle down a sidewalk 0.8493932857280806 10 | -------------------------------------------------------------------------------- /pre-trained/Visual_re-ranker.txt: -------------------------------------------------------------------------------- 1 | a man riding a motorcycle down a street,0.8582269252364088 2 | a person riding a motorcycle on a city street,0.8699054868636436 3 | a man riding on the back of a motorcycle,0.8895639974564639 4 | a man riding a motorcycle on a city street,0.85454545827468 5 | a man riding on the back of a motorcycle down a street,0.8645537987336105 6 | a person riding a motorcycle down a city street,0.8665321958170883 7 | a man riding on the back of a motorcycle down a sidewalk,0.8581149928539996 8 | a man riding a motorcycle down the street,0.8569102761752505 9 | a man riding a motorcycle down a sidewalk,0.8493932857280806 10 | -------------------------------------------------------------------------------- /pre-trained/caption.txt: -------------------------------------------------------------------------------- 1 | a man riding a motorcycle down a street 2 | a person riding a 
motorcycle on a city street 3 | a man riding on the back of a motorcycle 4 | a man riding a motorcycle on a city street 5 | a man riding on the back of a motorcycle down a street 6 | a person riding a motorcycle down a city street 7 | a man riding on the back of a motorcycle down a sidewalk 8 | a man riding a motorcycle down the street 9 | a man riding a motorcycle down a sidewalk 10 | 11 | -------------------------------------------------------------------------------- /pre-trained/model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys 3 | import argparse 4 | import torch 5 | import re 6 | import os 7 | 8 | 9 | 10 | parser=argparse.ArgumentParser(description='call all scores and compute the visual context based re-ranker') 11 | parser.add_argument('--sim', default='sim-score.txt', help='similarity score from fine_tune_BERT', type=str,required=False) 12 | parser.add_argument('--vis', default='visual-context_label.txt',help='class-label from the classifier (Resent152)', type=str, required=True) 13 | parser.add_argument('--vis_prob', default='visual-context.txt', help='prob from the classifier (Resent152)', type=str, required=True) 14 | parser.add_argument('--c', default='caption.txt', help='caption from the baseline (any)', type=str, required=True) 15 | args = parser.parse_args() 16 | 17 | # Download from here S-BERT 18 | # pip install -U sentence-transformers 19 | from sentence_transformers import SentenceTransformer, util 20 | from sklearn.metrics.pairwise import cosine_similarity 21 | 22 | model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens') 23 | #model = SentenceTransformer('nq-distilbert-base-v1') 24 | 25 | 26 | def cos_sim(a, b): 27 | return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b))) 28 | 29 | 30 | def get_lines(file_path): 31 | with open(file_path) as f: 32 | return f.read().strip().split('\n') 33 | 34 | 35 | # visual confident based visual re-ranker 36 | class Visual_re_ranker: 37 | def __init__(self, visual_context_prob, sim): 38 | self.visual_context_prob = visual_context_prob 39 | self.sim = sim 40 | def p_minus (self): 41 | score = pow(float(sim), float(visual_context_prob)) 42 | 43 | return score 44 | 45 | @staticmethod 46 | def remove_duplicate_caption_re_rank(input_path, output_path): 47 | with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file: 48 | seen_lines = set() 49 | 50 | def add_line(line): 51 | seen_lines.add(line) 52 | return line 53 | 54 | output_file.writelines((add_line(line) for line in input_file 55 | if line not in seen_lines)) 56 | re_ranked_scores = [] 57 | with open(output_path) as f: 58 | for line in f: 59 | caption, score = line.split(',') 60 | score = float(score) 61 | re_ranked_scores.append((caption, score)) 62 | re_ranked_scores.sort(key=lambda s: float(s[1]), reverse=True) 63 | with open(output_path, 'w') as f: 64 | for caption, score in re_ranked_scores: 65 | f.write("%s %s\n" % (caption, score)) 66 | 67 | 68 | 69 | # all beam with visual context 70 | input_path= 'Visual_re-ranker.txt' 71 | # re-ranked beam with visual context 72 | output_path = 'Visual_re-rank_re-ranked_output.txt' 73 | 74 | # compute visual context 75 | f=open(input_path, "w") 76 | for i in range(len(get_lines(args.vis))): 77 | temp =[] 78 | visual_context_label = get_lines(args.vis)[i] 79 | visual_context_prob = get_lines(args.vis_prob)[i] 80 | caption = get_lines(args.c)[i] 81 | 82 | 83 | caption_emb = model.encode(caption, convert_to_tensor=True) 84 
| visual_context_label_emb = model.encode(visual_context_label, convert_to_tensor=True) 85 | 86 | #def cos_sim(a, b): 87 | # return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b))) 88 | 89 | 90 | 91 | sim = cosine_scores = util.pytorch_cos_sim(caption_emb, visual_context_label_emb) 92 | sim = sim.cpu().numpy() 93 | sim = sim.item() 94 | 95 | 96 | score = Visual_re_ranker(visual_context_prob, sim) 97 | score = score.p_minus() 98 | #score = score.real 99 | temp.append(score) 100 | 101 | #result = ','.join((caption, LM, str(score))) 102 | result = ','.join((caption, str(score))) 103 | result = re.sub(r'\s*,\s*', ',', result) 104 | 105 | 106 | #print(result) 107 | 108 | f.write(result) 109 | f.write('\n') 110 | 111 | 112 | f.close() 113 | 114 | if __name__ == "__main__": 115 | 116 | # re-rank and print top visual beam captions 117 | Visual_re_ranker.remove_duplicate_caption_re_rank(input_path, output_path) 118 | -------------------------------------------------------------------------------- /pre-trained/sample_best.json: -------------------------------------------------------------------------------- 1 | [{"image_id":24343,"caption":"a man riding on the back of a motorcycle"}] 2 | -------------------------------------------------------------------------------- /pre-trained/sample_best_baseline.json: -------------------------------------------------------------------------------- 1 | [{"image_id":24343,"caption":"a man riding a motorcycle down a street"}] 2 | -------------------------------------------------------------------------------- /pre-trained/visual-context_label.txt: -------------------------------------------------------------------------------- 1 | motor scooter crash helmet motorcycle 2 | motor scooter crash helmet motorcycle 3 | motor scooter crash helmet motorcycle 4 | motor scooter crash helmet motorcycle 5 | motor scooter crash helmet motorcycle 6 | motor scooter crash helmet motorcycle 7 | motor scooter crash helmet motorcycle 8 | motor scooter crash helmet motorcycle 9 | motor scooter crash helmet motorcycle 10 | -------------------------------------------------------------------------------- /pre-trained/visual-context_prob.txt: -------------------------------------------------------------------------------- 1 | 0.203588580197762 2 | 0.203588580197762 3 | 0.203588580197762 4 | 0.203588580197762 5 | 0.203588580197762 6 | 0.203588580197762 7 | 0.203588580197762 8 | 0.203588580197762 9 | 0.203588580197762 10 | -------------------------------------------------------------------------------- /visual_context/README.md: -------------------------------------------------------------------------------- 1 | ## Extract visual information 2 | ``` 3 | conda create -n Resnet python=3.7 anaconda 4 | conda activate Resnet 5 | pip install tensorflow==1.15.0 6 | pip install keras==2.1.5 7 | ``` 8 | 9 | For [ResNet](https://arxiv.org/abs/1512.03385) 10 | 11 | ``` 12 | python run-visual.py 13 | ``` 14 | 15 | ``` 16 | COCO_val2014_000000185210.jpg 'traffic_light', 0.7458004 17 | COCO_val2014_000000235692.jpg 'ox', 0.49095494 18 | ``` 19 | 20 | For [CLIP](https://github.com/openai/CLIP) with zero-shot prediction 21 | 22 | ``` 23 | # torch 1.7.1 24 | conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=10.1 25 | pip install ftfy regex tqdm 26 | pip install git+https://github.com/openai/CLIP.git 27 | ``` 28 | 29 | run 30 | 31 | ``` 32 | python run-visual_CLIP.py 33 | ``` 34 | 35 | ``` 36 | COCO_val2014_000000185210.jpg 'barrow', 0.0954 37 | COCO_val2014_000000235692.jpg 
'ox', 0.5092 38 | 39 | For more visual classifiers (e.g., ViT, SwinV2, etc.), please refer to this [page](https://github.com/ahmedssabir/Belief-Revision-Score/tree/main/model/Resent-152) 40 | -------------------------------------------------------------------------------- /visual_context/imgs/COCO_val2014_000000185210.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/visual_context/imgs/COCO_val2014_000000185210.jpg -------------------------------------------------------------------------------- /visual_context/imgs/COCO_val2014_000000235692.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/visual_context/imgs/COCO_val2014_000000235692.jpg -------------------------------------------------------------------------------- /visual_context/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import keras.backend as K 3 | 4 | from keras import initializers 5 | from keras.layers import Input 6 | from keras.layers import Dense 7 | from keras.layers import Conv2D 8 | from keras.layers import MaxPooling2D 9 | from keras.layers import AveragePooling2D 10 | from keras.layers import ZeroPadding2D 11 | from keras.layers import Flatten 12 | from keras.layers import Activation 13 | from keras.layers import add 14 | from keras.layers import BatchNormalization 15 | from keras.layers import GlobalAveragePooling2D 16 | from keras.layers import GlobalMaxPooling2D 17 | 18 | from keras.models import Model 19 | from keras.engine import Layer, InputSpec 20 | from keras.engine import get_source_inputs 21 | 22 | 23 | from keras.utils.data_utils import get_file 24 | #from keras.applications.imagenet_utils import _obtain_input_shape 25 | #from keras.applications.imagenet_utils import _obtain_input_shape 26 | 27 | from keras_applications.imagenet_utils import _obtain_input_shape 28 | 29 | WEIGHTS_PATH = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels.h5' 30 | WEIGHTS_PATH_NO_TOP = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5' 31 | 32 | 33 | class Scale(Layer): 34 | """ Custom Layer for ResNet used for BatchNormalization. 35 | 36 | Learns a set of weights and biases used for scaling the input data. 37 | The output consists simply of an element-wise multiplication of the input 38 | and a sum of a set of constants: 39 | out = in * gamma + beta, 40 | where 'gamma' and 'beta' are the weights and biases learned. 41 | # Arguments 42 | axis: integer, axis along which to normalize in mode 0. For instance, 43 | if your input tensor has shape (samples, channels, rows, cols), 44 | set axis to 1 to normalize per feature map (channels axis). 45 | momentum: momentum in the computation of the 46 | exponential average of the mean and standard deviation 47 | of the data, for feature-wise normalization. 48 | weights: Initialization weights. 49 | List of 2 Numpy arrays, with shapes: 50 | `[(input_shape,), (input_shape,)]` 51 | beta_init: name of initialization function for shift parameter 52 | (see [initializers](../initializers.md)), or alternatively, 53 | Theano/TensorFlow function to use for weights initialization. 
54 | This parameter is only relevant if you don't pass a `weights` argument. 55 | gamma_init: name of initialization function for scale parameter (see 56 | [initializers](../initializers.md)), or alternatively, 57 | Theano/TensorFlow function to use for weights initialization. 58 | This parameter is only relevant if you don't pass a `weights` argument. 59 | """ 60 | 61 | def __init__(self, weights=None, axis=-1, momentum=0.9, beta_init='zero', gamma_init='one', **kwargs): 62 | self.momentum = momentum 63 | self.axis = axis 64 | self.beta_init = initializers.get(beta_init) 65 | self.gamma_init = initializers.get(gamma_init) 66 | self.initial_weights = weights 67 | super(Scale, self).__init__(**kwargs) 68 | 69 | def build(self, input_shape): 70 | self.input_spec = [InputSpec(shape=input_shape)] 71 | shape = (int(input_shape[self.axis]),) 72 | 73 | self.gamma = K.variable(self.gamma_init(shape), name='%s_gamma' % self.name) 74 | self.beta = K.variable(self.beta_init(shape), name='%s_beta' % self.name) 75 | self.trainable_weights = [self.gamma, self.beta] 76 | 77 | if self.initial_weights is not None: 78 | self.set_weights(self.initial_weights) 79 | del self.initial_weights 80 | 81 | def call(self, x, mask=None): 82 | input_shape = self.input_spec[0].shape 83 | broadcast_shape = [1] * len(input_shape) 84 | broadcast_shape[self.axis] = input_shape[self.axis] 85 | 86 | out = K.reshape(self.gamma, broadcast_shape) * x + K.reshape(self.beta, broadcast_shape) 87 | return out 88 | 89 | def get_config(self): 90 | config = {"momentum": self.momentum, "axis": self.axis} 91 | base_config = super(Scale, self).get_config() 92 | return dict(list(base_config.items()) + list(config.items())) 93 | 94 | 95 | def identity_block(input_tensor, kernel_size, filters, stage, block): 96 | """ 97 | The identity_block is the block that has no conv layer at shortcut 98 | # Arguments 99 | input_tensor: input tensor 100 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 101 | filters: list of integers, the nb_filters of 3 conv layer at main path 102 | stage: integer, current stage label, used for generating layer names 103 | block: 'a','b'..., current block label, used for generating layer names 104 | """ 105 | eps = 1.1e-5 106 | nb_filter1, nb_filter2, nb_filter3 = filters 107 | conv_name_base = 'res' + str(stage) + block + '_branch' 108 | bn_name_base = 'bn' + str(stage) + block + '_branch' 109 | scale_name_base = 'scale' + str(stage) + block + '_branch' 110 | 111 | if K.image_data_format() == 'channels_last': 112 | bn_axis = 3 113 | else: 114 | bn_axis = 1 115 | 116 | x = Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=False)(input_tensor) 117 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x) 118 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x) 119 | x = Activation('relu', name=conv_name_base + '2a_relu')(x) 120 | 121 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x) 122 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), name=conv_name_base + '2b', use_bias=False)(x) 123 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x) 124 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x) 125 | x = Activation('relu', name=conv_name_base + '2b_relu')(x) 126 | 127 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x) 128 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x) 129 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x) 130 | 131 
| x = add([x, input_tensor], name='res' + str(stage) + block) 132 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x) 133 | return x 134 | 135 | 136 | def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)): 137 | """ conv_block is the block that has a conv layer at shortcut 138 | # Arguments 139 | input_tensor: input tensor 140 | kernel_size: defualt 3, the kernel size of middle conv layer at main path 141 | filters: list of integers, the nb_filters of 3 conv layer at main path 142 | stage: integer, current stage label, used for generating layer names 143 | block: 'a','b'..., current block label, used for generating layer names 144 | Note that from stage 3, the first conv layer at main path is with subsample=(2,2) 145 | And the shortcut should have subsample=(2,2) as well 146 | """ 147 | 148 | eps = 1.1e-5 149 | nb_filter1, nb_filter2, nb_filter3 = filters 150 | conv_name_base = 'res' + str(stage) + block + '_branch' 151 | bn_name_base = 'bn' + str(stage) + block + '_branch' 152 | scale_name_base = 'scale' + str(stage) + block + '_branch' 153 | 154 | if K.image_data_format() == 'channels_last': 155 | bn_axis = 3 156 | else: 157 | bn_axis = 1 158 | 159 | x = Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=False)(input_tensor) 160 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x) 161 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x) 162 | x = Activation('relu', name=conv_name_base + '2a_relu')(x) 163 | 164 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x) 165 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), 166 | name=conv_name_base + '2b', use_bias=False)(x) 167 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x) 168 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x) 169 | x = Activation('relu', name=conv_name_base + '2b_relu')(x) 170 | 171 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x) 172 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x) 173 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x) 174 | 175 | shortcut = Conv2D(nb_filter3, (1, 1), strides=strides, 176 | name=conv_name_base + '1', use_bias=False)(input_tensor) 177 | shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut) 178 | shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut) 179 | 180 | x = add([x, shortcut], name='res' + str(stage) + block) 181 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x) 182 | return x 183 | 184 | 185 | def ResNet152(include_top=True, weights='imagenet', 186 | input_tensor=None, input_shape=None, pooling=None, classes=1000): 187 | """ Instantiates the ResNet152 architecture. 188 | Optionally loads weights pre-trained 189 | on ImageNet. Note that when using TensorFlow, 190 | for best performance you should set 191 | `image_data_format='channels_last'` in your Keras config 192 | at ~/.keras/keras.json. 193 | The model and the weights are compatible only with 194 | TensorFlow. The data format 195 | convention used by the model is the one 196 | specified in your Keras config file. 197 | # Arguments 198 | include_top: whether to include the fully-connected 199 | layer at the top of the network. 200 | weights: one of `None` (random initialization), 201 | 'imagenet' (pre-training on ImageNet), 202 | or the path to the weights file to be loaded. 203 | input_tensor: optional Keras tensor (i.e. 
output of `layers.Input()`) 204 | to use as image input for the model. 205 | input_shape: optional shape tuple, only to be specified 206 | if `include_top` is False (otherwise the input shape 207 | has to be `(224, 224, 3)` (with `channels_last` data format) 208 | or `(3, 224, 224)` (with `channels_first` data format). 209 | It should have exactly 3 inputs channels, 210 | and width and height should be no smaller than 197. 211 | E.g. `(200, 200, 3)` would be one valid value. 212 | pooling: Optional pooling mode for feature extraction 213 | when `include_top` is `False`. 214 | - `None` means that the output of the model will be 215 | the 4D tensor output of the 216 | last convolutional layer. 217 | - `avg` means that global average pooling 218 | will be applied to the output of the 219 | last convolutional layer, and thus 220 | the output of the model will be a 2D tensor. 221 | - `max` means that global max pooling will 222 | be applied. 223 | classes: optional number of classes to classify images 224 | into, only to be specified if `include_top` is True, and 225 | if no `weights` argument is specified. 226 | # Returns 227 | A Keras model instance. 228 | # Raises 229 | ValueError: in case of invalid argument for `weights`, 230 | or invalid input shape. 231 | """ 232 | 233 | eps = 1.1e-5 234 | 235 | if not (weights in {'imagenet', None} or os.path.exists(weights)): 236 | raise ValueError('The `weights` argument should be either ' 237 | '`None` (random initialization), `imagenet` ' 238 | '(pre-training on ImageNet), ' 239 | 'or the path to the weights file to be loaded.') 240 | 241 | if weights == 'imagenet' and include_top and classes != 1000: 242 | raise ValueError('If using `weights` as imagenet with `include_top`' 243 | ' as true, `classes` should be 1000') 244 | 245 | # Determine proper input shape 246 | input_shape = _obtain_input_shape(input_shape, 247 | default_size=224, 248 | min_size=197, 249 | data_format=K.image_data_format(), 250 | require_flatten=include_top, 251 | weights=weights) 252 | 253 | if input_tensor is None: 254 | img_input = Input(shape=input_shape) 255 | else: 256 | if not K.is_keras_tensor(input_tensor): 257 | img_input = Input(tensor=input_tensor, shape=input_shape, name='data') 258 | else: 259 | img_input = input_tensor 260 | 261 | # Handle dimension ordering for different backends 262 | #if K.image_dim_ordering() == 'tf': 263 | if K.common.image_dim_ordering() == 'tf': 264 | bn_axis = 3 265 | else: 266 | bn_axis = 1 267 | 268 | x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input) 269 | x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=False)(x) 270 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x) 271 | x = Scale(axis=bn_axis, name='scale_conv1')(x) 272 | x = Activation('relu', name='conv1_relu')(x) 273 | x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1', padding='same')(x) 274 | 275 | x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1)) 276 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='b') 277 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='c') 278 | 279 | x = conv_block(x, 3, [128, 128, 512], stage=3, block='a') 280 | for i in range(1, 8): 281 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='b' + str(i)) 282 | 283 | x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a') 284 | for i in range(1, 36): 285 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b' + str(i)) 286 | 287 | x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a') 288 | x = 
identity_block(x, 3, [512, 512, 2048], stage=5, block='b') 289 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c') 290 | 291 | if include_top: 292 | # Classification block 293 | x = AveragePooling2D((7, 7), name='avg_pool')(x) 294 | x = Flatten()(x) 295 | x = Dense(classes, activation='softmax', name='fc1000')(x) 296 | else: 297 | if pooling == 'avg': 298 | x = GlobalAveragePooling2D()(x) 299 | elif pooling == 'max': 300 | x = GlobalMaxPooling2D()(x) 301 | 302 | # Ensure that the model takes into account 303 | # any potential predecessors of `input_tensor`. 304 | if input_tensor is not None: 305 | inputs = get_source_inputs(input_tensor) 306 | else: 307 | inputs = img_input 308 | 309 | # Create model 310 | model = Model(inputs, x, name='resnet152') 311 | 312 | # Load weights 313 | if weights == 'imagenet': 314 | if include_top: 315 | weights_path = get_file( 316 | 'resnet152_weights_tf_dim_ordering_tf_kernels.h5', 317 | WEIGHTS_PATH, 318 | cache_subdir='models', 319 | md5_hash='cdb18a2158b88e392c0905d47dcef965') 320 | else: 321 | weights_path = get_file( 322 | 'resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5', 323 | WEIGHTS_PATH_NO_TOP, 324 | cache_subdir='models', 325 | md5_hash='02cb9130cc51543cd703c79697baa592') 326 | model.load_weights(weights_path) 327 | 328 | elif weights is not None: 329 | model.load_weights(weights) 330 | 331 | return model 332 | -------------------------------------------------------------------------------- /visual_context/run-visual.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from skimage.io import imread 4 | from skimage.transform import resize 5 | from keras.applications.imagenet_utils import decode_predictions 6 | from keras.applications.imagenet_utils import preprocess_input 7 | from tensorflow.keras.preprocessing import image 8 | from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions 9 | import numpy as np 10 | from model import ResNet152 11 | import tensorflow as tf 12 | 13 | import os 14 | 15 | 16 | image_dir = 'imgs' 17 | import keras as K 18 | from keras_applications.imagenet_utils import _obtain_input_shape 19 | 20 | os.environ['CUDA_VISIBLE_DEVICES'] = "-1" 21 | 22 | def preprocess(x): 23 | x = resize(x, (224,224), mode='constant') * 255 24 | x = preprocess_input(x) 25 | if x.ndim == 3: 26 | x = np.expand_dims(x, 0) 27 | return x 28 | model = ResNet152() 29 | 30 | for img_file in os.listdir(image_dir): 31 | #img = mpimg.imread(image_dir + '/' + img_file) 32 | img = image.load_img(image_dir + '/' + img_file, target_size=(224, 224)) 33 | x = image.img_to_array(img) 34 | x = np.expand_dims(x, axis=0) 35 | x = preprocess_input(x) 36 | 37 | preds = model.predict(x) 38 | print(img_file, decode_predictions(preds, top=1)[0]) 39 | 40 | 41 | -------------------------------------------------------------------------------- /visual_context/run-visual_CLIP.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import glob 4 | import sys 5 | import torch 6 | import torchvision.transforms as Transforms 7 | import clip 8 | from PIL import Image 9 | 10 | 11 | 12 | # Check device 13 | #device = "cuda" if torch.cuda.is_available() else "cpu" 14 | device = torch.device("cpu") 15 | print(f"Device - {device}") 16 | 17 | # Load CLIP model 18 | clip_model, clip_preprocess = clip.load('ViT-B/32', device) 19 | clip_model.eval() 20 | 21 | # 22 | with open("imagenet_classes.txt", "r") as f: 23 | 
categories = [s.strip() for s in f.readlines()] 24 | 25 | text = clip.tokenize(categories).to(device) 26 | 27 | def predict_clip(image_file_path): 28 | image = clip_preprocess(Image.open(image_file_path)).unsqueeze(0).to(device) 29 | # the CLIP model is already loaded above; no need to reload it for every image 30 | 31 | # Calculate features 32 | with torch.no_grad(): 33 | image_features = clip_model.encode_image(image) 34 | text_features = clip_model.encode_text(text) 35 | 36 | # Pick the top 5 most similar labels for the image 37 | image_features /= image_features.norm(dim=-1, keepdim=True) 38 | text_features /= text_features.norm(dim=-1, keepdim=True) 39 | similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1) 40 | values, indices = similarity[0].topk(5) 41 | 42 | predictions = {} 43 | for value, index in zip(values, indices): 44 | predictions[f"{categories[index]:>16s}"] = f"{value.item():.4f}" 45 | 46 | return predictions 47 | 48 | 49 | # run prediction over all images in the imgs/ directory 50 | filenames = glob.glob("imgs/*.jpg") 51 | filenames.sort() 52 | for image_path in filenames: 53 | print(os.path.basename(image_path), predict_clip(image_path)) 54 | #print(predict_clip("image.jpg")) 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /word-count-hisgram.py: -------------------------------------------------------------------------------- 1 | """Python script to create a histogram of words in a text file. 2 | Usage: python word-count-hisgram.py -f "/path/to/file.txt" -n 200 3 | Specify the path to the text file as above. Manually specify the top N words to report (default 100). 4 | Text file can contain punctuation, new lines, etc., but special characters aren't handled well. 5 | """ 6 | 7 | import os 8 | import sys 9 | import string 10 | import argparse 11 | import operator 12 | 13 | import numpy as np 14 | import pandas as pd 15 | import matplotlib.pyplot as plt 16 | 17 | from collections import Counter 18 | 19 | __author__ = 'Nick Powell (PhD student, CMIC & CABI, UCL, UK), nicholas.powell.11@ucl.ac.uk' 20 | __version__ = '0.2.20150303' 21 | __created__ = '2014-12-18, Thursday' 22 | 23 | 24 | def main(): 25 | 26 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) 27 | parser.add_argument('-f','--filepath',dest='filepath',metavar='file path',help='Path to text input file to be analysed.', required=True) 28 | parser.add_argument('-n','--number',dest='number',metavar='number',help='Most frequent n words will be displayed and plotted.', required=False, default=100, type=int) 29 | args = parser.parse_args() 30 | 31 | # Path to text file to analyse 32 | rawfilepath = args.filepath 33 | 34 | # Print a histogram containing the top N words, and print them and their counts. 
35 | top_n = args.number 36 | 37 | # Load the file 38 | filepath = os.path.normpath(os.path.join(rawfilepath)) 39 | file = open(filepath, 'r') 40 | 41 | # Parse as a list, removing lines 42 | content_sublists = [line.split(',') for line in file.readlines()] 43 | 44 | # Parse into a single list (from a list of lists) 45 | content_list = [item for sublist in content_sublists for item in sublist] 46 | 47 | # Remove whitespace so we can concatenate appropriately, and unify case 48 | content_list_strip = [str.strip().lower() for str in content_list] 49 | 50 | # Concatenate strings into a single string 51 | content_concat = ' '.join(content_list_strip) 52 | 53 | # Remove punctuation and new lines 54 | punct = set(string.punctuation) 55 | unpunct_content = ''.join(x for x in content_concat if x not in punct) 56 | 57 | # Split string into list of strings, again 58 | word_list = unpunct_content.split() 59 | 60 | # Perform count 61 | counts_all = Counter(word_list) 62 | 63 | words, count_values = zip(*counts_all.items()) 64 | 65 | # Sort both lists by frequency in values (Schwartzian transform) - thanks, http://stackoverflow.com/questions/9543211/sorting-a-list-in-python-using-the-result-from-sorting-another-list 66 | values_sorted, words_sorted = zip(*sorted(zip(count_values, words), key=operator.itemgetter(0), reverse=True)) 67 | 68 | # Top N 69 | words_sorted_top = words_sorted[0:top_n] 70 | values_sorted_top = values_sorted[0:top_n] 71 | 72 | print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") 73 | print("{0} unique words identified in the text file, {1}".format(len(values_sorted), filepath)) 74 | print("The top {0} words are: \n{1}".format(top_n, words_sorted_top)) 75 | print("... their respective frequencies: \n{0}".format(values_sorted_top)) 76 | print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -") 77 | # Pandas DataFrame just for visualisation 78 | df = pd.DataFrame({'count': values_sorted_top, 'word': words_sorted_top}) 79 | print("{0}".format(df)) 80 | sys.stdout.flush() 81 | 82 | # Histogram 83 | 84 | # Make xticklabels comprehensible by matplotlib 85 | xticklabels = str(list(words_sorted_top)).split() 86 | # Remove the single quotes, commas and enclosing square brackets 87 | xtlabs = [xstr.replace("'","").replace(",","").replace("]","").replace("[","") for xstr in xticklabels] 88 | 89 | 90 | indices = np.arange(len(words_sorted_top)) 91 | width = 1 92 | fig = plt.figure() 93 | fig.suptitle('Word frequency histogram, top {0}'.format(top_n), fontsize=16) 94 | plt.xlabel('word', fontsize=12) 95 | plt.ylabel('count', fontsize=12) 96 | plt.bar(indices, values_sorted_top, width*0.9, alpha=0.7, color='blue') 97 | plt.xticks(indices + width * 0.5, xtlabs, rotation='vertical', fontsize=8) 98 | plt.show() 99 | 100 | if __name__ == '__main__': 101 | main() 102 | --------------------------------------------------------------------------------