├── BERT-CNN
│   ├── BERT_CNN.py
│   ├── BertLayer.py
│   ├── __pycache__
│   │   ├── BertLayer.cpython-36.pyc
│   │   ├── data_pre.cpython-36.pyc
│   │   ├── extract_features.cpython-36.pyc
│   │   ├── freeze_keras_model.cpython-36.pyc
│   │   ├── modeling.cpython-36.pyc
│   │   └── tokenization.cpython-36.pyc
│   ├── bert_experimental
│   │   ├── README.md
│   │   ├── bert_experimental
│   │   │   ├── feature_extraction
│   │   │   │   ├── l2_retriever.py
│   │   │   │   └── text_preprocessing.py
│   │   │   └── finetuning
│   │   │       ├── __init__.py
│   │   │       ├── bert_layer.py
│   │   │       ├── graph_ops.py
│   │   │       ├── modeling.py
│   │   │       └── text_preprocessing.py
│   │   ├── requirements.txt
│   │   └── setup.py
│   ├── data
│   │   ├── read.me
│   │   ├── test.tsv
│   │   └── train.tsv
│   ├── data_pre.py
│   ├── eval.py
│   ├── extract_features.py
│   ├── freeze_keras_model.py
│   ├── model.json
│   ├── modeling.py
│   ├── optimization.py
│   ├── test_demo.tsv
│   ├── tokenization.py
│   └── uncased_L-12_H-768_A-12
│       └── file-should be here.txt
├── BERT
│   ├── README.md
│   ├── data
│   │   ├── dev.tsv
│   │   ├── test.tsv
│   │   └── train.tsv.zip
│   ├── modeling.py
│   ├── outputs
│   │   └── need-this.txt
│   ├── run_classifier.py
│   ├── tokenization.py
│   ├── train_VC_b.py
│   ├── train_model_VC.py
│   └── uncased_L-12_H-768_A-12
│       └── file-should be here.txt
├── BERT_CNN_Visual_re_ranker_demo.ipynb
├── COCO_train2014_000000000009.jpg
├── COCO_val2014_000000000042.jpg
├── Evaluation
│   ├── Result_tune_BERT_0.4.json
│   ├── captions_val2014.json.zip
│   └── coco_eval.py
├── LRCE_figure_1.png
├── Pre-trained.png
├── README.md
├── approch.png
├── data
│   ├── README.md
│   ├── test.tsv
│   └── train.tsv
├── dataset_v1-1.png
├── hist.jpg
├── main.png
├── overlap_text.py
├── overlaping_result_v1.txt
├── overview.png
├── pre-trained
│   ├── README.md
│   ├── Visual_re-rank_re-ranked_output.txt
│   ├── Visual_re-ranker.txt
│   ├── caption.txt
│   ├── model.py
│   ├── sample_best.json
│   ├── sample_best_baseline.json
│   ├── visual-context_label.txt
│   └── visual-context_prob.txt
├── visual_context
│   ├── README.md
│   ├── imagenet_classes.txt
│   ├── imgs
│   │   ├── COCO_val2014_000000185210.jpg
│   │   └── COCO_val2014_000000235692.jpg
│   ├── model.py
│   ├── run-visual.py
│   └── run-visual_CLIP.py
└── word-count-hisgram.py
/BERT-CNN/BERT_CNN.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | ##!/usr/bin/env python3
3 | #!/bin/env python
4 | import sys
5 | import argparse
6 | import re
7 | import os
8 | import sys
9 | import json
10 |
11 | import logging
12 | import numpy as np
13 | import pandas as pd
14 | import tensorflow as tf
15 | import tensorflow_hub as hub
16 | from BertLayer import BertLayer
17 | from BertLayer import build_preprocessor
18 | from freeze_keras_model import freeze_keras_model
19 |
20 | from data_pre import *
21 | from tensorflow import keras
22 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
23 | from sklearn.model_selection import train_test_split
24 |
25 |
26 | if 'bert_repo' not in sys.path:
27 | sys.path.insert(0, 'bert_repo')
28 |
29 | from modeling import BertModel, BertConfig
30 | from tokenization import FullTokenizer, convert_to_unicode
31 | from extract_features import InputExample, convert_examples_to_features
32 |
33 |
34 | # get TF logger
35 | log = logging.getLogger('tensorflow')
36 | log.handlers = []
37 |
38 |
39 | parser = argparse.ArgumentParser()
40 | parser.add_argument('--train', default='/home/asabir/BERT_layers-git/data/train.tsv', help='path to the training data (TSV)', type=str, required=False)
41 | parser.add_argument('--num_bert_layer', default=12, help='number of BERT layers to fine-tune', type=int, required=False)
42 | parser.add_argument('--batch_size', default=128, help='training batch size', type=int, required=False)
43 | parser.add_argument('--epochs', default=5, help='number of training epochs', type=int, required=False)
44 | parser.add_argument('--seq_len', default=64, help='maximum sequence length', type=int, required=False)
45 | parser.add_argument('--CNN_kernel_size', default=3, help='CNN kernel size', type=int, required=False)
46 | parser.add_argument('--CNN_filters', default=32, help='number of CNN filters', type=int, required=False)
47 | args = parser.parse_args()
48 |
49 |
50 | # Download the pre-trained model
51 |
52 | #!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
53 | #!unzip uncased_L-12_H-768_A-12.zip
54 |
55 |
56 | # tf.Module
57 | def build_module_fn(config_path, vocab_path, do_lower_case=True):
58 |
59 | def bert_module_fn(is_training):
60 | """Spec function for a token embedding module."""
61 |
62 | input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")
63 | input_mask = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_mask")
64 | token_type = tf.placeholder(shape=[None, None], dtype=tf.int32, name="segment_ids")
65 |
66 | config = BertConfig.from_json_file(config_path)
67 | model = BertModel(config=config, is_training=is_training,
68 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type)
69 |
70 | seq_output = model.all_encoder_layers[-1]
71 | pool_output = model.get_pooled_output()
72 |
73 | config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
74 | vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
75 | lower_case = tf.constant(do_lower_case)
76 |
77 | tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
78 | tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)
79 |
80 | input_map = {"input_ids": input_ids,
81 | "input_mask": input_mask,
82 | "segment_ids": token_type}
83 |
84 | output_map = {"pooled_output": pool_output,
85 | "sequence_output": seq_output}
86 |
87 | output_info_map = {"vocab_file": vocab_file,
88 | "do_lower_case": lower_case}
89 |
90 | hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
91 | hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map)
92 |
93 | return bert_module_fn
94 |
95 |
96 | MODEL_DIR = "/Users/asabir/BERT_layers-main/uncased_L-12_H-768_A-12"
97 | config_path = "{}/bert_config.json".format(MODEL_DIR)
98 | vocab_path = "{}/vocab.txt".format(MODEL_DIR)
99 |
100 |
101 | tags_and_args = []
102 | for is_training in (True, False):
103 | tags = set()
104 | if is_training:
105 | tags.add("train")
106 | tags_and_args.append((tags, dict(is_training=is_training)))
107 |
108 | module_fn = build_module_fn(config_path, vocab_path)
109 | spec = hub.create_module_spec(module_fn, tags_and_args=tags_and_args)
110 | spec.export("bert-module",
111 |             checkpoint_path="{}/bert_model.ckpt".format(MODEL_DIR))
112 |
113 | class BertLayer(tf.keras.layers.Layer):
114 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3,
115 | pooling="cls", do_preprocessing=True, verbose=False,
116 | tune_embeddings=False, trainable=True, **kwargs):
117 |
118 | self.trainable = trainable
119 | self.n_tune_layers = n_tune_layers
120 | self.tune_embeddings = tune_embeddings
121 | self.do_preprocessing = do_preprocessing
122 |
123 | self.verbose = verbose
124 | self.seq_len = seq_len
125 | self.pooling = pooling
126 | self.bert_path = bert_path
127 |
128 | self.var_per_encoder = 16
129 | if self.pooling not in ["cls", "mean", None]:
130 | raise NameError(
131 |             f"Undefined pooling type (must be either 'cls', 'mean', or None, but is {self.pooling})"
132 | )
133 |
134 | super(BertLayer, self).__init__(**kwargs)
135 |
136 | def build(self, input_shape):
137 |
138 | self.bert = hub.Module(self.build_abspath(self.bert_path),
139 | trainable=self.trainable, name=f"{self.name}_module")
140 |
141 | trainable_layers = []
142 | if self.tune_embeddings:
143 | trainable_layers.append("embeddings")
144 |
145 | if self.pooling == "cls":
146 | trainable_layers.append("pooler")
147 |
148 | if self.n_tune_layers > 0:
149 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name]
150 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder)
151 | for i in range(self.n_tune_layers):
152 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/")
153 |
154 | # Add module variables to layer's trainable weights
155 | for var in self.bert.variables:
156 | if any([l in var.name for l in trainable_layers]):
157 | self._trainable_weights.append(var)
158 | else:
159 | self._non_trainable_weights.append(var)
160 |
161 | if self.verbose:
162 | print("*** TRAINABLE VARS *** ")
163 | for var in self._trainable_weights:
164 | print(var)
165 |
166 | self.build_preprocessor()
167 | self.initialize_module()
168 |
169 | super(BertLayer, self).build(input_shape)
170 |
171 | def build_abspath(self, path):
172 | if path.startswith("https://") or path.startswith("gs://"):
173 | return path
174 | else:
175 | return os.path.abspath(path)
176 |
177 | def build_preprocessor(self):
178 | sess = tf.keras.backend.get_session()
179 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True)
180 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
181 | tokenization_info["do_lower_case"]])
182 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case)
183 |
184 | def initialize_module(self):
185 | sess = tf.keras.backend.get_session()
186 |
187 | vars_initialized = sess.run([tf.is_variable_initialized(var)
188 | for var in self.bert.variables])
189 |
190 | uninitialized = []
191 | for var, is_initialized in zip(self.bert.variables, vars_initialized):
192 | if not is_initialized:
193 | uninitialized.append(var)
194 |
195 | if len(uninitialized):
196 | sess.run(tf.variables_initializer(uninitialized))
197 |
198 | def call(self, input):
199 |
200 | if self.do_preprocessing:
201 | input = tf.numpy_function(self.preprocessor,
202 | [input], [tf.int32, tf.int32, tf.int32],
203 | name='preprocessor')
204 | for feature in input:
205 | feature.set_shape((None, self.seq_len))
206 |
207 | input_ids, input_mask, segment_ids = input
208 |
209 | bert_inputs = dict(
210 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
211 | )
212 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)
213 |
214 | if self.pooling == "cls":
215 | pooled = output["pooled_output"]
216 | else:
217 | result = output["sequence_output"]
218 |
219 | input_mask = tf.cast(input_mask, tf.float32)
220 | mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
221 | masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
222 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
223 |
224 | if self.pooling == "mean":
225 | pooled = masked_reduce_mean(result, input_mask)
226 | else:
227 | pooled = mul_mask(result, input_mask)
228 |
229 | return pooled
230 |
231 | def get_config(self):
232 | config_dict = {
233 | "bert_path": self.bert_path,
234 | "seq_len": self.seq_len,
235 | "pooling": self.pooling,
236 | "n_tune_layers": self.n_tune_layers,
237 | "tune_embeddings": self.tune_embeddings,
238 | "do_preprocessing": self.do_preprocessing,
239 | "verbose": self.verbose
240 | }
241 | super(BertLayer, self).get_config()
242 | return config_dict
243 |
244 |
245 | # read the train data
246 | #df = pd.read_csv("/home/asabir/BERT_layers-git/data/train.tsv", sep='\t')
247 | df = pd.read_csv(args.train, sep='\t')
248 |
249 |
250 |
251 |
252 | #labels = df.is_duplicate.values
253 | labels = df.is_related.values
254 |
255 | texts = []
256 | delimiter = " ||| "
257 |
258 | for vis, cap in zip(df.visual.tolist(), df.caption.tolist()):
259 | texts.append(delimiter.join((str(vis), str(cap))))
260 |
261 |
262 | texts = np.array(texts)
263 |
264 | trX, tsX, trY, tsY = train_test_split(texts, labels, shuffle=True, test_size=0.2)
265 |
266 |
267 | # Building the model
268 |
269 | embedding_size = 768
270 |
271 | inp = tf.keras.Input(shape=(1,), dtype=tf.string)
272 | # Three Layers
273 | #encoder = BertLayer(bert_path="./bert-module/", seq_len=48, tune_embeddings=False,
274 | # pooling='cls', n_tune_layers=3, verbose=False)
275 |
276 | # All Layers
277 | encoder = BertLayer(bert_path="./bert-module/", seq_len=args.seq_len, tune_embeddings=False, pooling=None, n_tune_layers=args.num_bert_layer, verbose=False)
278 |
279 |
280 |
281 | cnn_out = tf.keras.layers.Conv1D(args.CNN_filters, args.CNN_kernel_size, padding='VALID', activation=tf.nn.relu)(encoder(inp))
282 | pool = tf.keras.layers.MaxPooling1D(pool_size=2)(cnn_out)
283 | flat = tf.keras.layers.Flatten()(pool)
284 | pred = tf.keras.layers.Dense(1, activation="sigmoid")(flat)
285 |
286 |
287 | model = tf.keras.models.Model(inputs=[inp], outputs=[pred])
288 |
289 | model.summary()
290 |
291 | model.compile(
292 | optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5, ),
293 | loss="binary_crossentropy",
294 | metrics=["accuracy"])
295 |
296 | # fit the data
297 | import logging
298 | logging.getLogger("tensorflow").setLevel(logging.WARNING)
299 |
300 | saver = keras.callbacks.ModelCheckpoint("bert_CNN_tuned.hdf5")
301 |
302 | model.fit(trX, trY, validation_data=[tsX, tsY], batch_size=args.batch_size, epochs=args.epochs, callbacks=[saver])
303 |
304 |
305 |
306 | # sanity-check predictions, then save and reload the model
307 | model.predict(trX[:10])
308 |
309 | import json
310 | json.dump(model.to_json(), open("model.json", "w"))
311 |
312 | model = tf.keras.models.model_from_json(json.load(open("model.json")),
313 | custom_objects={"BertLayer": BertLayer})
314 |
315 | model.load_weights("bert_CNN_tuned.hdf5")
316 |
317 | model.predict(trX[:10])
318 |
319 | # For faster inference and lower RAM usage, we "freeze" the model as a post-processing step.
320 | from tensorflow.python.framework.graph_util import convert_variables_to_constants
321 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
322 |
323 | def freeze_keras_model(model, export_path=None, clear_devices=True):
324 | sess = tf.keras.backend.get_session()
325 | graph = sess.graph
326 |
327 | with graph.as_default():
328 |
329 | input_tensors = model.inputs
330 | output_tensors = model.outputs
331 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors]
332 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors]
333 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors]
334 |
335 | tmp_g = graph.as_graph_def()
336 | if clear_devices:
337 | for node in tmp_g.node:
338 | node.device = ""
339 |
340 | tmp_g = optimize_for_inference(
341 | tmp_g, input_ops, output_ops, dtypes, False)
342 |
343 | tmp_g = convert_variables_to_constants(sess, tmp_g, output_ops)
344 |
345 | if export_path is not None:
346 | with tf.gfile.GFile(export_path, "wb") as f:
347 | f.write(tmp_g.SerializeToString())
348 |
349 | return tmp_g
350 |
351 |
352 | # freeze and save the model
353 | frozen_graph = freeze_keras_model(model, export_path="frozen_graph.pb")
354 |
355 |
356 | # inference
357 | #!git clone https://github.com/gaphex/bert_experimental/
358 |
359 | import tensorflow as tf
360 | import numpy as np
361 | import sys
362 |
363 | sys.path.insert(0, "bert_experimental")
364 |
365 | from bert_experimental.finetuning.text_preprocessing import build_preprocessor
366 | from bert_experimental.finetuning.graph_ops import load_graph
367 |
368 |
369 | restored_graph = load_graph("frozen_graph.pb")
370 | graph_ops = restored_graph.get_operations()
371 | input_op, output_op = graph_ops[0].name, graph_ops[-1].name
372 | print(input_op, output_op)
373 |
374 | x = restored_graph.get_tensor_by_name(input_op + ':0')
375 | y = restored_graph.get_tensor_by_name(output_op + ':0')
376 |
377 |
378 | preprocessor = build_preprocessor("/Users/asabir/BERT_layers-main/uncased_L-12_H-768_A-12/vocab.txt", 64)
379 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32], name='preprocessor')
380 |
381 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32])
382 |
383 | # predictions
384 |
385 | sess = tf.Session(graph=restored_graph)
386 |
387 | trX[:10]
388 |
389 | y_out = sess.run(y, feed_dict={
390 | x: trX[:10].reshape((-1,1))
391 | })
392 |
393 | print(y_out)
394 |
--------------------------------------------------------------------------------
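Note: BERT_CNN.py expects a tab-separated training file with `visual`, `caption` and
`is_related` columns, and joins the two text fields with the " ||| " delimiter that
read_examples() later splits back into text_a / text_b. The sketch below (not part of
the repository; the example rows are invented) shows that layout:

import pandas as pd

rows = [
    {"visual": "dog", "caption": "a dog runs across the grass", "is_related": 1},
    {"visual": "car", "caption": "a plate of food on a table", "is_related": 0},
]
pd.DataFrame(rows).to_csv("train.tsv", sep="\t", index=False)

# BERT_CNN.py builds its model inputs from these columns like so:
df = pd.read_csv("train.tsv", sep="\t")
texts = [" ||| ".join((str(v), str(c))) for v, c in zip(df.visual, df.caption)]
print(texts[0])  # "dog ||| a dog runs across the grass"
--------------------------------------------------------------------------------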
/BERT-CNN/BertLayer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 | import os
5 | import sys
6 | import json
7 |
8 | import logging
9 | import numpy as np
10 | import pandas as pd
11 | import tensorflow as tf
12 | import tensorflow_hub as hub
13 | from tensorflow import keras
14 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
15 |
16 | from sklearn.model_selection import train_test_split
17 |
18 |
19 | if 'bert_repo' not in sys.path:
20 | sys.path.insert(0, 'bert_repo')
21 |
22 | from modeling import BertModel, BertConfig
23 | from tokenization import FullTokenizer, convert_to_unicode
24 | from extract_features import InputExample, convert_examples_to_features
25 |
26 |
27 | def build_preprocessor(voc_path, seq_len, lower=True):
28 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower)
29 |
30 | def strings_to_arrays(sents):
31 |
32 | sents = np.atleast_1d(sents).reshape((-1,))
33 |
34 | examples = []
35 | for example in read_examples(sents):
36 | examples.append(example)
37 |
38 | features = convert_examples_to_features(examples, seq_len, tokenizer)
39 | arrays = features_to_arrays(features)
40 | return arrays
41 |     return strings_to_arrays
42 |
43 | class BertLayer(tf.keras.layers.Layer):
44 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3,
45 | pooling="cls", do_preprocessing=True, verbose=False,
46 | tune_embeddings=False, trainable=True, **kwargs):
47 |
48 | self.trainable = trainable
49 | self.n_tune_layers = n_tune_layers
50 | self.tune_embeddings = tune_embeddings
51 | self.do_preprocessing = do_preprocessing
52 |
53 | self.verbose = verbose
54 | self.seq_len = seq_len
55 | self.pooling = pooling
56 | self.bert_path = bert_path
57 |
58 | self.var_per_encoder = 16
59 | if self.pooling not in ["cls", "mean", None]:
60 | raise NameError(
61 |             f"Undefined pooling type (must be either 'cls', 'mean', or None, but is {self.pooling})"
62 | )
63 |
64 | super(BertLayer, self).__init__(**kwargs)
65 |
66 | def build(self, input_shape):
67 |
68 | self.bert = hub.Module(self.build_abspath(self.bert_path),
69 | trainable=self.trainable, name=f"{self.name}_module")
70 |
71 | trainable_layers = []
72 | if self.tune_embeddings:
73 | trainable_layers.append("embeddings")
74 |
75 | if self.pooling == "cls":
76 | trainable_layers.append("pooler")
77 |
78 | if self.n_tune_layers > 0:
79 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name]
80 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder)
81 | for i in range(self.n_tune_layers):
82 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/")
83 |
84 | # Add module variables to layer's trainable weights
85 | for var in self.bert.variables:
86 | if any([l in var.name for l in trainable_layers]):
87 | self._trainable_weights.append(var)
88 | else:
89 | self._non_trainable_weights.append(var)
90 |
91 | if self.verbose:
92 | print("*** TRAINABLE VARS *** ")
93 | for var in self._trainable_weights:
94 | print(var)
95 |
96 | self.build_preprocessor()
97 | self.initialize_module()
98 |
99 | super(BertLayer, self).build(input_shape)
100 |
101 | def build_abspath(self, path):
102 | if path.startswith("https://") or path.startswith("gs://"):
103 | return path
104 | else:
105 | return os.path.abspath(path)
106 |
107 | def build_preprocessor(self):
108 | sess = tf.keras.backend.get_session()
109 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True)
110 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
111 | tokenization_info["do_lower_case"]])
112 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case)
113 |
114 | def initialize_module(self):
115 | sess = tf.keras.backend.get_session()
116 |
117 | vars_initialized = sess.run([tf.is_variable_initialized(var)
118 | for var in self.bert.variables])
119 |
120 | uninitialized = []
121 | for var, is_initialized in zip(self.bert.variables, vars_initialized):
122 | if not is_initialized:
123 | uninitialized.append(var)
124 |
125 | if len(uninitialized):
126 | sess.run(tf.variables_initializer(uninitialized))
127 |
128 | def call(self, input):
129 |
130 | if self.do_preprocessing:
131 | input = tf.numpy_function(self.preprocessor,
132 | [input], [tf.int32, tf.int32, tf.int32],
133 | name='preprocessor')
134 | for feature in input:
135 | feature.set_shape((None, self.seq_len))
136 |
137 | input_ids, input_mask, segment_ids = input
138 |
139 | bert_inputs = dict(
140 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
141 | )
142 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)
143 |
144 | if self.pooling == "cls":
145 | pooled = output["pooled_output"]
146 | else:
147 | result = output["sequence_output"]
148 |
149 | input_mask = tf.cast(input_mask, tf.float32)
150 | mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
151 | masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
152 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
153 |
154 | if self.pooling == "mean":
155 | pooled = masked_reduce_mean(result, input_mask)
156 | else:
157 | pooled = mul_mask(result, input_mask)
158 |
159 | return pooled
160 |
161 | def get_config(self):
162 | config_dict = {
163 | "bert_path": self.bert_path,
164 | "seq_len": self.seq_len,
165 | "pooling": self.pooling,
166 | "n_tune_layers": self.n_tune_layers,
167 | "tune_embeddings": self.tune_embeddings,
168 | "do_preprocessing": self.do_preprocessing,
169 | "verbose": self.verbose
170 | }
171 | super(BertLayer, self).get_config()
172 | return config_dict
173 |
--------------------------------------------------------------------------------
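Note: the masked pooling inside BertLayer.call() is easier to follow outside the TF
graph. A minimal NumPy illustration (not repository code) of the pooling="mean" path:

import numpy as np

seq_output = np.random.rand(2, 4, 8).astype("float32")   # (batch, seq_len, hidden)
input_mask = np.array([[1, 1, 1, 0],                      # 1 = real token, 0 = padding
                       [1, 1, 0, 0]], dtype="float32")

masked = seq_output * input_mask[:, :, None]              # mul_mask
pooled = masked.sum(axis=1) / (input_mask.sum(axis=1, keepdims=True) + 1e-10)
print(pooled.shape)  # (2, 8): one fixed-size vector per example
--------------------------------------------------------------------------------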
/BERT-CNN/__pycache__/BertLayer.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/BertLayer.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT-CNN/__pycache__/data_pre.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/data_pre.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT-CNN/__pycache__/extract_features.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/extract_features.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT-CNN/__pycache__/freeze_keras_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/freeze_keras_model.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT-CNN/__pycache__/modeling.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/modeling.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT-CNN/__pycache__/tokenization.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/__pycache__/tokenization.cpython-36.pyc
--------------------------------------------------------------------------------
/BERT-CNN/bert_experimental/README.md:
--------------------------------------------------------------------------------
1 | https://github.com/gaphex/bert_experimental/tree/master/bert_experimental
2 |
--------------------------------------------------------------------------------
/BERT-CNN/bert_experimental/bert_experimental/feature_extraction/l2_retriever.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 |
5 | class L2Retriever:
6 | def __init__(self, dim, top_k=3, use_norm=False, use_gpu=False):
7 |
8 | self.dim = dim
9 | self.top_k = top_k
10 | self.use_norm = use_norm
11 | config = tf.ConfigProto(
12 | device_count={'GPU': (1 if use_gpu else 0)}
13 | )
14 | config.gpu_options.allow_growth = True
15 | self.session = tf.Session(config=config)
16 | self.dtype = "float32"
17 |
18 | self.query = tf.placeholder(self.dtype, [None, self.dim])
19 | self.kbase = tf.placeholder(self.dtype, [None, self.dim])
20 | if self.use_norm:
21 | self.norm = tf.placeholder(self.dtype, [None, 1])
22 | else:
23 | self.norm = None
24 |
25 | self.build_graph()
26 |
27 | def build_graph(self):
28 |
29 | self.distance = self.euclidean_distances(self.kbase, self.query, self.norm)
30 | top_neg_dists, top_indices = tf.math.top_k(
31 | tf.negative(self.distance), k=self.top_k)
32 | top_dists = tf.sqrt(tf.abs(tf.negative(top_neg_dists)))
33 |
34 | self.top_distances = top_dists
35 | self.top_indices = top_indices
36 |
37 | def predict(self, kbase, query, norm=None):
38 |
39 | query = query.reshape((-1, self.dim))
40 | feed_dict = {self.query: query, self.kbase: kbase}
41 | if self.use_norm:
42 | feed_dict[self.norm] = norm
43 |
44 | I, D = self.session.run([self.top_indices, self.top_distances],
45 | feed_dict=feed_dict)
46 |
47 | return I, D
48 |
49 | @staticmethod
50 | def euclidean_distances(kbase, query, norm=None):
51 |
52 | if norm is None:
53 | XX = tf.keras.backend.batch_dot(kbase, kbase, axes=1)
54 | else:
55 | XX = norm
56 |
57 | YY = tf.transpose(tf.keras.backend.batch_dot(query, query, axes=1))
58 | XY = tf.matmul(kbase, tf.transpose(query))
59 |
60 | distance = XX - 2 * XY + YY
61 | distance = tf.transpose(distance)
62 |
63 | return distance
64 |
65 | @staticmethod
66 | def compute_squared_l2_norm(mat):
67 | square_norms = np.sum(mat**2, axis=1, keepdims=True)
68 | return square_norms
69 |
70 | def __call__(self, kbase, query, norm=None):
71 | return self.predict(kbase, query, norm)
72 |
--------------------------------------------------------------------------------
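Note: a usage sketch for L2Retriever, assuming a TF 1.x graph-mode environment (the
class relies on tf.placeholder, tf.ConfigProto and tf.Session). The embeddings below
are random placeholders and the import path reflects the assumed package layout:

import numpy as np
from bert_experimental.feature_extraction.l2_retriever import L2Retriever  # assumed path

dim = 768                                             # BERT-base hidden size
kbase = np.random.rand(1000, dim).astype("float32")   # stored sentence embeddings
query = np.random.rand(5, dim).astype("float32")      # new query embeddings

retriever = L2Retriever(dim, top_k=3, use_gpu=False)
indices, distances = retriever(kbase, query)          # __call__ -> predict
print(indices.shape, distances.shape)                 # (5, 3) each
--------------------------------------------------------------------------------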
/BERT-CNN/bert_experimental/bert_experimental/feature_extraction/text_preprocessing.py:
--------------------------------------------------------------------------------
1 | import re
2 | import tensorflow as tf
3 | import collections
4 | import unicodedata
5 |
6 |
7 | class FullTokenizer(object):
8 |     """Runs end-to-end tokenization."""
9 |
10 | def __init__(self, vocab_file, do_lower_case=True):
11 | self.vocab = load_vocab(vocab_file)
12 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
13 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
14 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
15 |
16 | def tokenize(self, text):
17 | split_tokens = []
18 | for token in self.basic_tokenizer.tokenize(text):
19 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
20 | split_tokens.append(sub_token)
21 |
22 | return split_tokens
23 |
24 | def convert_tokens_to_ids(self, tokens):
25 | return convert_by_vocab(self.vocab, tokens)
26 |
27 | def convert_ids_to_tokens(self, ids):
28 | return convert_by_vocab(self.inv_vocab, ids)
29 |
30 | def mark_unk_tokens(self, tokens, unk_token='[UNK]'):
31 | return [t if t in self.vocab else unk_token for t in tokens]
32 |
33 |
34 | class BasicTokenizer(object):
35 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
36 |
37 | def __init__(self, do_lower_case=True):
38 | """Constructs a BasicTokenizer.
39 | Args:
40 | do_lower_case: Whether to lower case the input.
41 | """
42 | self.do_lower_case = do_lower_case
43 |
44 | def tokenize(self, text):
45 | """Tokenizes a piece of text."""
46 | text = convert_to_unicode(text)
47 | text = self._clean_text(text)
48 |
49 | # This was added on November 1st, 2018 for the multilingual and Chinese
50 | # models. This is also applied to the English models now, but it doesn't
51 | # matter since the English models were not trained on any Chinese data
52 | # and generally don't have any Chinese data in them (there are Chinese
53 | # characters in the vocabulary because Wikipedia does have some Chinese
54 | # words in the English Wikipedia.).
55 | text = self._tokenize_chinese_chars(text)
56 |
57 | orig_tokens = whitespace_tokenize(text)
58 | split_tokens = []
59 | for token in orig_tokens:
60 | if self.do_lower_case:
61 | token = token.lower()
62 | token = self._run_strip_accents(token)
63 | split_tokens.extend(self._run_split_on_punc(token))
64 |
65 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
66 | return output_tokens
67 |
68 | def _run_strip_accents(self, text):
69 | """Strips accents from a piece of text."""
70 | text = unicodedata.normalize("NFD", text)
71 | output = []
72 | for char in text:
73 | cat = unicodedata.category(char)
74 | if cat == "Mn":
75 | continue
76 | output.append(char)
77 | return "".join(output)
78 |
79 | def _run_split_on_punc(self, text):
80 | """Splits punctuation on a piece of text."""
81 | chars = list(text)
82 | i = 0
83 | start_new_word = True
84 | output = []
85 | while i < len(chars):
86 | char = chars[i]
87 | if _is_punctuation(char):
88 | output.append([char])
89 | start_new_word = True
90 | else:
91 | if start_new_word:
92 | output.append([])
93 | start_new_word = False
94 | output[-1].append(char)
95 | i += 1
96 |
97 | return ["".join(x) for x in output]
98 |
99 | def _tokenize_chinese_chars(self, text):
100 | """Adds whitespace around any CJK character."""
101 | output = []
102 | for char in text:
103 | cp = ord(char)
104 | if self._is_chinese_char(cp):
105 | output.append(" ")
106 | output.append(char)
107 | output.append(" ")
108 | else:
109 | output.append(char)
110 | return "".join(output)
111 |
112 | def _is_chinese_char(self, cp):
113 | """Checks whether CP is the codepoint of a CJK character."""
114 | # This defines a "chinese character" as anything in the CJK Unicode block:
115 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
116 | #
117 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
118 | # despite its name. The modern Korean Hangul alphabet is a different block,
119 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
120 | # space-separated words, so they are not treated specially and handled
121 |         # like all of the other languages.
122 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
123 | (cp >= 0x3400 and cp <= 0x4DBF) or #
124 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
125 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
126 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
127 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
128 | (cp >= 0xF900 and cp <= 0xFAFF) or #
129 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
130 | return True
131 |
132 | return False
133 |
134 | def _clean_text(self, text):
135 | """Performs invalid character removal and whitespace cleanup on text."""
136 | output = []
137 | for char in text:
138 | cp = ord(char)
139 | if cp == 0 or cp == 0xfffd or _is_control(char):
140 | continue
141 | if _is_whitespace(char):
142 | output.append(" ")
143 | else:
144 | output.append(char)
145 | return "".join(output)
146 |
147 |
148 | class WordpieceTokenizer(object):
149 |     """Runs WordPiece tokenization."""
150 |
151 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
152 | self.vocab = vocab
153 | self.unk_token = unk_token
154 | self.max_input_chars_per_word = max_input_chars_per_word
155 |
156 | def tokenize(self, text):
157 | """Tokenizes a piece of text into its word pieces.
158 | This uses a greedy longest-match-first algorithm to perform tokenization
159 | using the given vocabulary.
160 | For example:
161 | input = "unaffable"
162 | output = ["un", "##aff", "##able"]
163 | Args:
164 | text: A single token or whitespace separated tokens. This should have
165 |             already been passed through `BasicTokenizer`.
166 | Returns:
167 | A list of wordpiece tokens.
168 | """
169 |
170 | text = convert_to_unicode(text)
171 |
172 | output_tokens = []
173 | for token in whitespace_tokenize(text):
174 | chars = list(token)
175 | if len(chars) > self.max_input_chars_per_word:
176 | output_tokens.append(self.unk_token)
177 | continue
178 |
179 | is_bad = False
180 | start = 0
181 | sub_tokens = []
182 | while start < len(chars):
183 | end = len(chars)
184 | cur_substr = None
185 | while start < end:
186 | substr = "".join(chars[start:end])
187 | if start > 0:
188 | substr = "##" + substr
189 | if substr in self.vocab:
190 | cur_substr = substr
191 | break
192 | end -= 1
193 | if cur_substr is None:
194 | is_bad = True
195 | break
196 | sub_tokens.append(cur_substr)
197 | start = end
198 |
199 | if is_bad:
200 | output_tokens.append(self.unk_token)
201 | else:
202 | output_tokens.extend(sub_tokens)
203 | return output_tokens
204 |
205 |
206 | class InputExample(object):
207 |
208 | def __init__(self, unique_id, text_a, text_b):
209 | self.unique_id = unique_id
210 | self.text_a = text_a
211 | self.text_b = text_b
212 |
213 |
214 | class InputFeatures(object):
215 | """A single set of features of data."""
216 |
217 | def __init__(self, tokens, input_ids, input_mask, input_type_ids):
218 | # self.unique_id = unique_id
219 | self.tokens = tokens
220 | self.input_ids = input_ids
221 | self.input_mask = input_mask
222 | self.input_type_ids = input_type_ids
223 |
224 |
225 | def _is_whitespace(char):
226 | """Checks whether `chars` is a whitespace character."""
227 |     # \t, \n, and \r are technically control characters but we treat them
228 | # as whitespace since they are generally considered as such.
229 | if char == " " or char == "\t" or char == "\n" or char == "\r":
230 | return True
231 | cat = unicodedata.category(char)
232 | if cat == "Zs":
233 | return True
234 | return False
235 |
236 |
237 | def _is_control(char):
238 | """Checks whether `chars` is a control character."""
239 | # These are technically control characters but we count them as whitespace
240 | # characters.
241 | if char == "\t" or char == "\n" or char == "\r":
242 | return False
243 | cat = unicodedata.category(char)
244 | if cat.startswith("C"):
245 | return True
246 | return False
247 |
248 |
249 | def _is_punctuation(char):
250 | """Checks whether `chars` is a punctuation character."""
251 | cp = ord(char)
252 | # We treat all non-letter/number ASCII as punctuation.
253 | # Characters such as "^", "$", and "`" are not in the Unicode
254 | # Punctuation class but we treat them as punctuation anyways, for
255 | # consistency.
256 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
257 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
258 | return True
259 | cat = unicodedata.category(char)
260 | if cat.startswith("P"):
261 | return True
262 | return False
263 |
264 |
265 | def convert_to_unicode(text):
266 | if isinstance(text, str):
267 | return text
268 | elif isinstance(text, bytes):
269 | return text.decode("utf-8", "ignore")
270 | else:
271 | raise ValueError("Unsupported string type: %s" % (type(text)))
272 |
273 |
274 | def printable_text(text):
275 | if isinstance(text, str):
276 | return text
277 | elif isinstance(text, bytes):
278 | return text.decode("utf-8", "ignore")
279 | else:
280 | raise ValueError("Unsupported string type: %s" % (type(text)))
281 |
282 |
283 | def load_vocab(vocab_file):
284 | """Loads a vocabulary file into a dictionary."""
285 | vocab = collections.OrderedDict()
286 | index = 0
287 | with tf.gfile.GFile(vocab_file, "r") as reader:
288 | while True:
289 | token = convert_to_unicode(reader.readline())
290 | if not token:
291 | break
292 | token = token.strip()
293 | vocab[token] = index
294 | index += 1
295 | return vocab
296 |
297 |
298 | def convert_by_vocab(vocab, items):
299 | """Converts a sequence of [tokens|ids] using the vocab."""
300 | output = []
301 | for item in items:
302 | output.append(vocab[item])
303 | return output
304 |
305 |
306 | def convert_tokens_to_ids(vocab, tokens):
307 | return convert_by_vocab(vocab, tokens)
308 |
309 |
310 | def convert_ids_to_tokens(inv_vocab, ids):
311 | return convert_by_vocab(inv_vocab, ids)
312 |
313 |
314 | def whitespace_tokenize(text):
315 |     """Runs basic whitespace cleaning and splitting on a piece of text."""
316 | text = text.strip()
317 | if not text:
318 | return []
319 | tokens = text.split()
320 | return tokens
321 |
322 |
323 | def convert_lst_to_features(lst_str, max_seq_length, max_position_embeddings,
324 | tokenizer, is_tokenized=False, mask_cls_sep=False):
325 | """Loads a data file into a list of `InputBatch`s."""
326 |
327 | examples = read_tokenized_examples(lst_str) if is_tokenized else read_examples(lst_str)
328 |
329 | _tokenize = lambda x: tokenizer.mark_unk_tokens(x) if is_tokenized else tokenizer.tokenize(x)
330 |
331 | all_tokens = [(_tokenize(ex.text_a), _tokenize(ex.text_b) if ex.text_b else []) for ex in examples]
332 |
333 | # user did not specify a meaningful sequence length
334 | # override the sequence length by the maximum seq length of the current batch
335 | if max_seq_length is None:
336 | max_seq_length = max(len(ta) + len(tb) for ta, tb in all_tokens)
337 | # add special tokens into account
338 | # case 1: Account for [CLS], tokens_a [SEP], tokens_b [SEP] -> 3 additional tokens
339 | # case 2: Account for [CLS], tokens_a [SEP] -> 2 additional tokens
340 | max_seq_length += 3 if any(len(tb) for _, tb in all_tokens) else 2
341 | max_seq_length = min(max_seq_length, max_position_embeddings)
342 |
343 | for (tokens_a, tokens_b) in all_tokens:
344 | if tokens_b:
345 | # Modifies `tokens_a` and `tokens_b` in place so that the total
346 | # length is less than the specified length.
347 | # Account for [CLS], [SEP], [SEP] with "- 3"
348 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
349 | else:
350 | # Account for [CLS] and [SEP] with "- 2"
351 | if len(tokens_a) > max_seq_length - 2:
352 | tokens_a = tokens_a[0:(max_seq_length - 2)]
353 |
354 | # The convention in BERT is:
355 | # (a) For sequence pairs:
356 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
357 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
358 | # (b) For single sequences:
359 | # tokens: [CLS] the dog is hairy . [SEP]
360 | # type_ids: 0 0 0 0 0 0 0
361 | #
362 | # Where "type_ids" are used to indicate whether this is the first
363 | # sequence or the second sequence. The embedding vectors for `type=0` and
364 | # `type=1` were learned during pre-training and are added to the wordpiece
365 | # embedding vector (and position vector). This is not *strictly* necessary
366 | # since the [SEP] token unambiguously separates the sequences, but it makes
367 | # it easier for the model to learn the concept of sequences.
368 | #
369 | # For classification tasks, the first vector (corresponding to [CLS]) is
370 |     # used as the "sentence vector". Note that this only makes sense because
371 | # the entire model is fine-tuned.
372 | tokens = ['[CLS]'] + tokens_a + ['[SEP]']
373 | input_type_ids = [0] * len(tokens)
374 | input_mask = [int(not mask_cls_sep)] + [1] * len(tokens_a) + [int(not mask_cls_sep)]
375 |
376 | if tokens_b:
377 | tokens += tokens_b + ['[SEP]']
378 | input_type_ids += [1] * (len(tokens_b) + 1)
379 | input_mask += [1] * len(tokens_b) + [int(not mask_cls_sep)]
380 |
381 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
382 |
383 | # Zero-pad up to the sequence length. more pythonic
384 | pad_len = max_seq_length - len(input_ids)
385 | input_ids += [0] * pad_len
386 | input_mask += [0] * pad_len
387 | input_type_ids += [0] * pad_len
388 |
389 | assert len(input_ids) == max_seq_length
390 | assert len(input_mask) == max_seq_length
391 | assert len(input_type_ids) == max_seq_length
392 |
393 | yield InputFeatures(
394 | # unique_id=example.unique_id,
395 | tokens=tokens,
396 | input_ids=input_ids,
397 | input_mask=input_mask,
398 | input_type_ids=input_type_ids)
399 |
400 |
401 | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
402 | """Truncates a sequence pair in place to the maximum length."""
403 |
404 | # This is a simple heuristic which will always truncate the longer sequence
405 | # one token at a time. This makes more sense than truncating an equal percent
406 | # of tokens from each, since if one sequence is very short then each token
407 | # that's truncated likely contains more information than a longer sequence.
408 | while True:
409 | total_length = len(tokens_a) + len(tokens_b)
410 | if total_length <= max_length:
411 | break
412 | if len(tokens_a) > len(tokens_b):
413 | tokens_a.pop()
414 | else:
415 | tokens_b.pop()
416 |
417 |
418 | def read_examples(lst_strs):
419 | """Read a list of `InputExample`s from a list of strings."""
420 | unique_id = 0
421 | for ss in lst_strs:
422 | line = convert_to_unicode(ss)
423 | if not line:
424 | continue
425 | line = line.strip()
426 | text_a = None
427 | text_b = None
428 | m = re.match(r"^(.*) \|\|\| (.*)$", line)
429 | if m is None:
430 | text_a = line
431 | else:
432 | text_a = m.group(1)
433 | text_b = m.group(2)
434 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
435 | unique_id += 1
436 |
437 |
438 | def read_tokenized_examples(lst_strs):
439 | unique_id = 0
440 | lst_strs = [[convert_to_unicode(w) for w in s] for s in lst_strs]
441 | for ss in lst_strs:
442 | text_a = ss
443 | text_b = None
444 | try:
445 | j = ss.index('|||')
446 | text_a = ss[:j]
447 | text_b = ss[(j + 1):]
448 | except ValueError:
449 | pass
450 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
451 | unique_id += 1
452 |
453 | def stub_preprocessor(text):
454 | return text
455 |
--------------------------------------------------------------------------------
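Note: WordpieceTokenizer above uses a greedy longest-match-first algorithm. A toy
check (not repository code; the tiny vocabulary is invented) reproduces the example
from its docstring:

from bert_experimental.feature_extraction.text_preprocessing import WordpieceTokenizer  # assumed path

vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}   # minimal stand-in for vocab.txt
tokenizer = WordpieceTokenizer(vocab=vocab)
print(tokenizer.tokenize("unaffable"))   # ['un', '##aff', '##able']
print(tokenizer.tokenize("xyz"))         # ['[UNK]'] -- no piece of "xyz" is in the vocab
--------------------------------------------------------------------------------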
/BERT-CNN/bert_experimental/bert_experimental/finetuning/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT-CNN/bert_experimental/bert_experimental/finetuning/__init__.py
--------------------------------------------------------------------------------
/BERT-CNN/bert_experimental/bert_experimental/finetuning/bert_layer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tensorflow as tf
3 | import tensorflow_hub as hub
4 |
5 | from .text_preprocessing import build_preprocessor
6 |
7 |
8 | class BertLayer(tf.keras.layers.Layer):
9 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3,
10 | pooling="cls", do_preprocessing=True, verbose=False,
11 | tune_embeddings=False, trainable=True, use_layers=None,
12 | as_dict=False, **kwargs):
13 |
14 | self.trainable = trainable
15 | self.n_tune_layers = n_tune_layers
16 | self.tune_embeddings = tune_embeddings
17 | self.do_preprocessing = do_preprocessing
18 |
19 | self.as_dict = as_dict
20 | self.verbose = verbose
21 | self.seq_len = seq_len
22 | self.pooling = pooling
23 | self.bert_path = bert_path
24 | self.use_layers = use_layers
25 |
26 | self.var_per_encoder = 16
27 | if self.pooling not in ["cls", "mean", "sqrt_mean", None]:
28 | raise NameError(
29 |             f"Undefined pooling type (must be either 'cls', 'mean', 'sqrt_mean' or None, but is {self.pooling})"
30 | )
31 |
32 | super(BertLayer, self).__init__(**kwargs)
33 |
34 | def build(self, input_shape):
35 |
36 | self.bert = hub.Module(self.build_abspath(self.bert_path),
37 | trainable=self.trainable, name=f"{self.name}_module")
38 |
39 | trainable_layers = []
40 | if self.tune_embeddings:
41 | trainable_layers.append("embeddings")
42 |
43 | if self.pooling == "cls":
44 | trainable_layers.append("pooler")
45 |
46 | if self.n_tune_layers > 0:
47 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name]
48 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder)
49 | if self.use_layers:
50 | n_encoder_layers = min(self.use_layers, n_encoder_layers)
51 | for i in range(self.n_tune_layers):
52 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/")
53 |
54 | # Add module variables to layer's trainable weights
55 | for var in self.bert.variables:
56 | if any([l in var.name for l in trainable_layers]):
57 | self._trainable_weights.append(var)
58 | else:
59 | self._non_trainable_weights.append(var)
60 |
61 | if self.verbose:
62 | print("*** TRAINABLE VARS *** ")
63 | for var in self._trainable_weights:
64 | print(var)
65 |
66 | self.build_preprocessor()
67 | self.initialize_module()
68 |
69 | super(BertLayer, self).build(input_shape)
70 |
71 | def build_abspath(self, path):
72 | if path.startswith("https://") or path.startswith("gs://"):
73 | return path
74 | else:
75 | return os.path.abspath(path)
76 |
77 | def build_preprocessor(self):
78 | sess = tf.compat.v1.keras.backend.get_session()
79 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True)
80 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
81 | tokenization_info["do_lower_case"]])
82 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case)
83 |
84 | def initialize_module(self):
85 | sess = tf.compat.v1.keras.backend.get_session()
86 |
87 | vars_initialized = sess.run([tf.compat.v1.is_variable_initialized(var)
88 | for var in self.bert.variables])
89 |
90 | uninitialized = []
91 | for var, is_initialized in zip(self.bert.variables, vars_initialized):
92 | if not is_initialized:
93 | uninitialized.append(var)
94 |
95 | if len(uninitialized):
96 | sess.run(tf.compat.v1.variables_initializer(uninitialized))
97 |
98 | def call(self, input):
99 |
100 | if self.do_preprocessing:
101 | input = tf.numpy_function(self.preprocessor,
102 | [input], [tf.int32, tf.int32, tf.int32],
103 | name='preprocessor')
104 | for feature in input:
105 | feature.set_shape((None, self.seq_len))
106 |
107 | input_ids, input_mask, segment_ids = input
108 |
109 | bert_inputs = dict(
110 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
111 | )
112 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)
113 |
114 | input_mask = tf.cast(input_mask, tf.float32)
115 |
116 | seq_output = output["sequence_output"]
117 | tok_output = mul_mask(output.get("token_output", seq_output), input_mask)
118 |
119 | if self.pooling == "cls":
120 | pooled = output["pooled_output"]
121 | else:
122 | if self.pooling == "mean":
123 | pooled = masked_reduce_mean(seq_output, input_mask)
124 |
125 | elif self.pooling == "sqrt_mean":
126 | pooled = masked_reduce_sqrt_mean(seq_output, input_mask)
127 |
128 | else:
129 | pooled = mul_mask(seq_output, input_mask)
130 |
131 | if self.as_dict:
132 | output = {
133 | "sequence_output": seq_output,
134 | "pooled_output": pooled,
135 | "token_output": tok_output
136 | }
137 | else:
138 | output = pooled
139 |
140 | return output
141 |
142 | def get_config(self):
143 | config_dict = {
144 | "bert_path": self.bert_path,
145 | "seq_len": self.seq_len,
146 | "pooling": self.pooling,
147 | "n_tune_layers": self.n_tune_layers,
148 | "tune_embeddings": self.tune_embeddings,
149 | "do_preprocessing": self.do_preprocessing,
150 | "use_layers": self.use_layers,
151 | "trainable": self.trainable,
152 | "as_dict": self.as_dict,
153 | "verbose": self.verbose
154 | }
155 | super(BertLayer, self).get_config()
156 | return config_dict
157 |
158 |
159 | class StatefulBertLayer(tf.keras.layers.Layer):
160 | def __init__(self, bert_path, seq_len=64, n_tune_layers=3,
161 | pooling="cls", do_preprocessing=True, verbose=False,
162 | tune_embeddings=False, trainable=True, use_layers=None,
163 | as_dict=False, **kwargs):
164 |
165 | self.trainable = trainable
166 | self.n_tune_layers = n_tune_layers
167 | self.tune_embeddings = tune_embeddings
168 | self.do_preprocessing = do_preprocessing
169 |
170 | self.as_dict = as_dict
171 | self.verbose = verbose
172 | self.seq_len = seq_len
173 | self.pooling = pooling
174 | self.bert_path = bert_path
175 | self.use_layers = use_layers
176 |
177 | self.var_per_encoder = 16
178 | if self.pooling not in ["cls", "mean", "sqrt_mean", None]:
179 | raise NameError(
180 |             f"Undefined pooling type (must be either 'cls', 'mean', 'sqrt_mean' or None, but is {self.pooling})"
181 | )
182 |
183 | super(StatefulBertLayer, self).__init__(**kwargs)
184 |
185 | def build(self, input_shape):
186 |
187 | self.bert = hub.Module(self.build_abspath(self.bert_path),
188 | trainable=self.trainable, name=f"{self.name}_module")
189 |
190 | trainable_layers = []
191 | if self.tune_embeddings:
192 | trainable_layers.append("embeddings")
193 |
194 | if self.pooling == "cls":
195 | trainable_layers.append("pooler")
196 |
197 | if self.n_tune_layers > 0:
198 | encoder_var_names = [var.name for var in self.bert.variables if 'encoder' in var.name]
199 | n_encoder_layers = int(len(encoder_var_names) / self.var_per_encoder)
200 | if self.use_layers:
201 | n_encoder_layers = min(self.use_layers, n_encoder_layers)
202 | for i in range(self.n_tune_layers):
203 | trainable_layers.append(f"encoder/layer_{str(n_encoder_layers - 1 - i)}/")
204 |
205 | # Add module variables to layer's trainable weights
206 | for var in self.bert.variables:
207 | if any([l in var.name for l in trainable_layers]):
208 | self._trainable_weights.append(var)
209 | else:
210 | self._non_trainable_weights.append(var)
211 |
212 | if self.verbose:
213 | print("*** TRAINABLE VARS *** ")
214 | for var in self._trainable_weights:
215 | print(var)
216 |
217 | self.build_preprocessor()
218 | self.initialize_module()
219 |
220 | super(StatefulBertLayer, self).build(input_shape)
221 |
222 | def build_abspath(self, path):
223 | if path.startswith("https://") or path.startswith("gs://"):
224 | return path
225 | else:
226 | return os.path.abspath(path)
227 |
228 | def build_preprocessor(self):
229 | sess = tf.compat.v1.keras.backend.get_session()
230 | tokenization_info = self.bert(signature="tokenization_info", as_dict=True)
231 | vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
232 | tokenization_info["do_lower_case"]])
233 | self.preprocessor = build_preprocessor(vocab_file, self.seq_len, do_lower_case)
234 |
235 | def initialize_module(self):
236 | sess = tf.compat.v1.keras.backend.get_session()
237 |
238 | vars_initialized = sess.run([tf.compat.v1.is_variable_initialized(var)
239 | for var in self.bert.variables])
240 |
241 | uninitialized = []
242 | for var, is_initialized in zip(self.bert.variables, vars_initialized):
243 | if not is_initialized:
244 | uninitialized.append(var)
245 |
246 | if len(uninitialized):
247 | sess.run(tf.compat.v1.variables_initializer(uninitialized))
248 |
249 | def call(self, input):
250 |
251 | if self.do_preprocessing:
252 | input_text, input_state = input
253 |
254 | preprocessed_text = tf.numpy_function(
255 | self.preprocessor, [input_text],
256 | [tf.int32, tf.int32, tf.int32],
257 | name='preprocessor')
258 | for feature in preprocessed_text:
259 | feature.set_shape((None, self.seq_len))
260 | input_ids, input_mask, segment_ids = preprocessed_text
261 |
262 | else:
263 | input_ids, input_mask, segment_ids, input_state = input
264 |
265 | bert_inputs = dict(
266 | input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, input_state=input_state
267 | )
268 | output = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)
269 |
270 | input_mask = tf.cast(input_mask, tf.float32)
271 |
272 | seq_output = output["sequence_output"]
273 | tok_output = mul_mask(output.get("token_output", seq_output), input_mask)
274 |
275 | if self.pooling == "cls":
276 | pooled = output["pooled_output"]
277 | else:
278 | if self.pooling == "mean":
279 | pooled = masked_reduce_mean(seq_output, input_mask)
280 |
281 | elif self.pooling == "sqrt_mean":
282 | pooled = masked_reduce_sqrt_mean(seq_output, input_mask)
283 |
284 | else:
285 | pooled = mul_mask(seq_output, input_mask)
286 |
287 | if self.as_dict:
288 | output["pooled_output"] = pooled
289 | else:
290 | output = pooled
291 |
292 | return output
293 |
294 | def get_config(self):
295 | config_dict = {
296 | "bert_path": self.bert_path,
297 | "seq_len": self.seq_len,
298 | "pooling": self.pooling,
299 | "n_tune_layers": self.n_tune_layers,
300 | "tune_embeddings": self.tune_embeddings,
301 | "do_preprocessing": self.do_preprocessing,
302 | "use_layers": self.use_layers,
303 | "trainable": self.trainable,
304 | "as_dict": self.as_dict,
305 | "verbose": self.verbose
306 | }
307 | super(StatefulBertLayer, self).get_config()
308 | return config_dict
309 |
310 | def mul_mask(x, m):
311 | return x * tf.expand_dims(m, axis=-1)
312 |
313 | def masked_reduce_mean(x, m):
314 | return tf.reduce_sum(mul_mask(x, m), axis=1) / (
315 | tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
316 |
317 | def masked_reduce_sqrt_mean(x, m):
318 | return tf.reduce_sum(mul_mask(x, m), axis=1) / (
319 | tf.sqrt(tf.reduce_sum(m, axis=1, keepdims=True)) + 1e-10)
320 |
321 |
322 |
--------------------------------------------------------------------------------
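Note: unlike BERT-CNN/BertLayer.py, this variant adds a "sqrt_mean" pooling mode via
the module-level helpers at the bottom of the file. A small eager-mode check of how
the two reductions differ (an illustration, not repository code; the import path is
assumed):

import tensorflow as tf
from bert_experimental.finetuning.bert_layer import masked_reduce_mean, masked_reduce_sqrt_mean  # assumed path

x = tf.random.uniform((2, 4, 8))             # (batch, seq_len, hidden)
m = tf.constant([[1., 1., 1., 0.],
                 [1., 1., 0., 0.]])          # 1 = real token, 0 = padding

mean_pooled = masked_reduce_mean(x, m)       # divide by the number of real tokens
sqrt_pooled = masked_reduce_sqrt_mean(x, m)  # divide by sqrt(number of real tokens)
print(mean_pooled.shape, sqrt_pooled.shape)  # (2, 8) (2, 8)
--------------------------------------------------------------------------------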
/BERT-CNN/bert_experimental/bert_experimental/finetuning/graph_ops.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from tensorflow.python.framework.graph_util import convert_variables_to_constants
4 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
5 |
6 |
7 | def load_graph(frozen_graph_filename):
8 | with tf.io.gfile.GFile(frozen_graph_filename, "rb") as f:
9 | graph_def = tf.compat.v1.GraphDef()
10 | graph_def.ParseFromString(f.read())
11 |
12 | with tf.Graph().as_default() as graph:
13 | tf.import_graph_def(graph_def)
14 | return graph
15 |
16 |
17 | ### Updated for TF 2.x (also works with TF 1.x)
18 | def freeze_keras_model(model, export_path=None, clear_devices=True):
19 | """
20 | Freezes the state of a session into a pruned computation graph.
21 |
22 | @param model The Keras model to be optimized for inference.
23 | @param clear_devices Remove the device directives from the graph for better portability.
24 | @return The frozen graph definition.
25 | """
26 | from tensorflow.compat.v1.graph_util import convert_variables_to_constants
27 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
28 |
29 | session = tf.compat.v1.keras.backend.get_session()
30 | graph = session.graph
31 |
32 | with graph.as_default():
33 |
34 | input_tensors = model.inputs
35 | output_tensors = model.outputs
36 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors]
37 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors]
38 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors]
39 |
40 | tmp_g = graph.as_graph_def()
41 | if clear_devices:
42 | for node in tmp_g.node:
43 | node.device = ""
44 |
45 | tmp_g = optimize_for_inference(
46 | tmp_g, input_ops, output_ops, dtypes, False)
47 |
48 | tmp_g = convert_variables_to_constants(session, tmp_g, output_ops)
49 |
50 | if export_path is not None:
51 | with tf.io.gfile.GFile(export_path, "wb") as f:
52 | f.write(tmp_g.SerializeToString())
53 |
54 | return tmp_g
55 |
--------------------------------------------------------------------------------
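Note: load_graph() is what the inference block at the end of BERT_CNN.py builds on. A
condensed sketch (assuming frozen_graph.pb was produced by freeze_keras_model() and
that a TF 1.x-style session is available via tf.compat.v1):

import numpy as np
import tensorflow as tf
from bert_experimental.finetuning.graph_ops import load_graph  # assumed path

graph = load_graph("frozen_graph.pb")
ops = graph.get_operations()
x = graph.get_tensor_by_name(ops[0].name + ":0")    # string input placeholder
y = graph.get_tensor_by_name(ops[-1].name + ":0")   # sigmoid output of the Dense head

sess = tf.compat.v1.Session(graph=graph)
samples = np.array(["dog ||| a dog runs across the grass"], dtype=object).reshape((-1, 1))
print(sess.run(y, feed_dict={x: samples}))          # relatedness score in [0, 1]
--------------------------------------------------------------------------------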
/BERT-CNN/bert_experimental/bert_experimental/finetuning/text_preprocessing.py:
--------------------------------------------------------------------------------
1 | import re
2 | import tensorflow as tf
3 | import numpy as np
4 | import collections
5 | import unicodedata
6 |
7 |
8 | class FullTokenizer(object):
9 |     """Runs end-to-end tokenization."""
10 |
11 | def __init__(self, vocab_file, do_lower_case=True):
12 | self.vocab = load_vocab(vocab_file)
13 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
14 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
15 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
16 |
17 | def tokenize(self, text):
18 | split_tokens = []
19 | for token in self.basic_tokenizer.tokenize(text):
20 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
21 | split_tokens.append(sub_token)
22 |
23 | return split_tokens
24 |
25 | def convert_tokens_to_ids(self, tokens):
26 | return convert_by_vocab(self.vocab, tokens)
27 |
28 | def convert_ids_to_tokens(self, ids):
29 | return convert_by_vocab(self.inv_vocab, ids)
30 |
31 | def mark_unk_tokens(self, tokens, unk_token='[UNK]'):
32 | return [t if t in self.vocab else unk_token for t in tokens]
33 |
34 |
35 | class BasicTokenizer(object):
36 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
37 |
38 | def __init__(self, do_lower_case=True):
39 | """Constructs a BasicTokenizer.
40 | Args:
41 | do_lower_case: Whether to lower case the input.
42 | """
43 | self.do_lower_case = do_lower_case
44 |
45 | def tokenize(self, text):
46 | """Tokenizes a piece of text."""
47 | text = convert_to_unicode(text)
48 | text = self._clean_text(text)
49 |
50 | # This was added on November 1st, 2018 for the multilingual and Chinese
51 | # models. This is also applied to the English models now, but it doesn't
52 | # matter since the English models were not trained on any Chinese data
53 | # and generally don't have any Chinese data in them (there are Chinese
54 | # characters in the vocabulary because Wikipedia does have some Chinese
55 | # words in the English Wikipedia.).
56 | text = self._tokenize_chinese_chars(text)
57 |
58 | orig_tokens = whitespace_tokenize(text)
59 | split_tokens = []
60 | for token in orig_tokens:
61 | if self.do_lower_case:
62 | token = token.lower()
63 | token = self._run_strip_accents(token)
64 | split_tokens.extend(self._run_split_on_punc(token))
65 |
66 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
67 | return output_tokens
68 |
69 | def _run_strip_accents(self, text):
70 | """Strips accents from a piece of text."""
71 | text = unicodedata.normalize("NFD", text)
72 | output = []
73 | for char in text:
74 | cat = unicodedata.category(char)
75 | if cat == "Mn":
76 | continue
77 | output.append(char)
78 | return "".join(output)
79 |
80 | def _run_split_on_punc(self, text):
81 | """Splits punctuation on a piece of text."""
82 | chars = list(text)
83 | i = 0
84 | start_new_word = True
85 | output = []
86 | while i < len(chars):
87 | char = chars[i]
88 | if _is_punctuation(char):
89 | output.append([char])
90 | start_new_word = True
91 | else:
92 | if start_new_word:
93 | output.append([])
94 | start_new_word = False
95 | output[-1].append(char)
96 | i += 1
97 |
98 | return ["".join(x) for x in output]
99 |
100 | def _tokenize_chinese_chars(self, text):
101 | """Adds whitespace around any CJK character."""
102 | output = []
103 | for char in text:
104 | cp = ord(char)
105 | if self._is_chinese_char(cp):
106 | output.append(" ")
107 | output.append(char)
108 | output.append(" ")
109 | else:
110 | output.append(char)
111 | return "".join(output)
112 |
113 | def _is_chinese_char(self, cp):
114 | """Checks whether CP is the codepoint of a CJK character."""
115 | # This defines a "chinese character" as anything in the CJK Unicode block:
116 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
117 | #
118 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
119 | # despite its name. The modern Korean Hangul alphabet is a different block,
120 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
121 | # space-separated words, so they are not treated specially and handled
122 | # like all of the other languages.
123 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
124 | (cp >= 0x3400 and cp <= 0x4DBF) or #
125 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
126 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
127 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
128 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
129 | (cp >= 0xF900 and cp <= 0xFAFF) or #
130 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
131 | return True
132 |
133 | return False
134 |
135 | def _clean_text(self, text):
136 | """Performs invalid character removal and whitespace cleanup on text."""
137 | output = []
138 | for char in text:
139 | cp = ord(char)
140 | if cp == 0 or cp == 0xfffd or _is_control(char):
141 | continue
142 | if _is_whitespace(char):
143 | output.append(" ")
144 | else:
145 | output.append(char)
146 | return "".join(output)
147 |
148 |
149 | class WordpieceTokenizer(object):
150 |   """Runs WordPiece tokenization."""
151 |
152 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
153 | self.vocab = vocab
154 | self.unk_token = unk_token
155 | self.max_input_chars_per_word = max_input_chars_per_word
156 |
157 | def tokenize(self, text):
158 | """Tokenizes a piece of text into its word pieces.
159 | This uses a greedy longest-match-first algorithm to perform tokenization
160 | using the given vocabulary.
161 | For example:
162 | input = "unaffable"
163 | output = ["un", "##aff", "##able"]
164 | Args:
165 | text: A single token or whitespace separated tokens. This should have
166 | already been passed through `BasicTokenizer`.
167 | Returns:
168 | A list of wordpiece tokens.
169 | """
170 |
171 | text = convert_to_unicode(text)
172 |
173 | output_tokens = []
174 | for token in whitespace_tokenize(text):
175 | chars = list(token)
176 | if len(chars) > self.max_input_chars_per_word:
177 | output_tokens.append(self.unk_token)
178 | continue
179 |
180 | is_bad = False
181 | start = 0
182 | sub_tokens = []
183 | while start < len(chars):
184 | end = len(chars)
185 | cur_substr = None
186 | while start < end:
187 | substr = "".join(chars[start:end])
188 | if start > 0:
189 | substr = "##" + substr
190 | if substr in self.vocab:
191 | cur_substr = substr
192 | break
193 | end -= 1
194 | if cur_substr is None:
195 | is_bad = True
196 | break
197 | sub_tokens.append(cur_substr)
198 | start = end
199 |
200 | if is_bad:
201 | output_tokens.append(self.unk_token)
202 | else:
203 | output_tokens.extend(sub_tokens)
204 | return output_tokens
205 |
206 |
207 | class InputExample(object):
208 |
209 | def __init__(self, unique_id, text_a, text_b):
210 | self.unique_id = unique_id
211 | self.text_a = text_a
212 | self.text_b = text_b
213 |
214 |
215 | class InputFeatures(object):
216 | """A single set of features of data."""
217 |
218 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
219 | self.unique_id = unique_id
220 | self.tokens = tokens
221 | self.input_ids = input_ids
222 | self.input_mask = input_mask
223 | self.input_type_ids = input_type_ids
224 |
225 |
226 | def _is_whitespace(char):
227 | """Checks whether `chars` is a whitespace character."""
228 | # \t, \n, and \r are technically control characters but we treat them
229 | # as whitespace since they are generally considered as such.
230 | if char == " " or char == "\t" or char == "\n" or char == "\r":
231 | return True
232 | cat = unicodedata.category(char)
233 | if cat == "Zs":
234 | return True
235 | return False
236 |
237 |
238 | def _is_control(char):
239 | """Checks whether `chars` is a control character."""
240 | # These are technically control characters but we count them as whitespace
241 | # characters.
242 | if char == "\t" or char == "\n" or char == "\r":
243 | return False
244 | cat = unicodedata.category(char)
245 | if cat.startswith("C"):
246 | return True
247 | return False
248 |
249 |
250 | def _is_punctuation(char):
251 | """Checks whether `chars` is a punctuation character."""
252 | cp = ord(char)
253 | # We treat all non-letter/number ASCII as punctuation.
254 | # Characters such as "^", "$", and "`" are not in the Unicode
255 | # Punctuation class but we treat them as punctuation anyways, for
256 | # consistency.
257 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
258 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
259 | return True
260 | cat = unicodedata.category(char)
261 | if cat.startswith("P"):
262 | return True
263 | return False
264 |
265 |
266 | def convert_to_unicode(text):
267 | if isinstance(text, str):
268 | return text
269 | elif isinstance(text, bytes):
270 | return text.decode("utf-8", "ignore")
271 | else:
272 | raise ValueError("Unsupported string type: %s" % (type(text)))
273 |
274 |
275 | def printable_text(text):
276 | if isinstance(text, str):
277 | return text
278 | elif isinstance(text, bytes):
279 | return text.decode("utf-8", "ignore")
280 | else:
281 | raise ValueError("Unsupported string type: %s" % (type(text)))
282 |
283 |
284 | def load_vocab(vocab_file):
285 | """Loads a vocabulary file into a dictionary."""
286 | vocab = collections.OrderedDict()
287 | index = 0
288 | with tf.io.gfile.GFile(vocab_file, "r") as reader:
289 | while True:
290 | token = convert_to_unicode(reader.readline())
291 | if not token:
292 | break
293 | token = token.strip()
294 | vocab[token] = index
295 | index += 1
296 | return vocab
297 |
298 |
299 | def convert_by_vocab(vocab, items):
300 | """Converts a sequence of [tokens|ids] using the vocab."""
301 | output = []
302 | for item in items:
303 | output.append(vocab[item])
304 | return output
305 |
306 |
307 | def convert_tokens_to_ids(vocab, tokens):
308 | return convert_by_vocab(vocab, tokens)
309 |
310 |
311 | def convert_ids_to_tokens(inv_vocab, ids):
312 | return convert_by_vocab(inv_vocab, ids)
313 |
314 |
315 | def whitespace_tokenize(text):
316 |   """Runs basic whitespace cleaning and splitting on a piece of text."""
317 | text = text.strip()
318 | if not text:
319 | return []
320 | tokens = text.split()
321 | return tokens
322 |
323 |
324 | def convert_examples_to_features(examples, seq_length, tokenizer):
325 | """Loads a data file into a list of `InputBatch`s."""
326 |
327 | features = []
328 | for (ex_index, example) in enumerate(examples):
329 | tokens_a = tokenizer.tokenize(example.text_a)
330 |
331 | tokens_b = None
332 | if example.text_b:
333 | tokens_b = tokenizer.tokenize(example.text_b)
334 |
335 | if tokens_b:
336 | # Modifies `tokens_a` and `tokens_b` in place so that the total
337 | # length is less than the specified length.
338 | # Account for [CLS], [SEP], [SEP] with "- 3"
339 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
340 | else:
341 | # Account for [CLS] and [SEP] with "- 2"
342 | if len(tokens_a) > seq_length - 2:
343 | tokens_a = tokens_a[0:(seq_length - 2)]
344 |
345 | # The convention in BERT is:
346 | # (a) For sequence pairs:
347 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
348 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
349 | # (b) For single sequences:
350 | # tokens: [CLS] the dog is hairy . [SEP]
351 | # type_ids: 0 0 0 0 0 0 0
352 | #
353 | # Where "type_ids" are used to indicate whether this is the first
354 | # sequence or the second sequence. The embedding vectors for `type=0` and
355 | # `type=1` were learned during pre-training and are added to the wordpiece
356 | # embedding vector (and position vector). This is not *strictly* necessary
357 | # since the [SEP] token unambiguously separates the sequences, but it makes
358 | # it easier for the model to learn the concept of sequences.
359 | #
360 | # For classification tasks, the first vector (corresponding to [CLS]) is
361 | # used as the "sentence vector". Note that this only makes sense because
362 | # the entire model is fine-tuned.
363 | tokens = []
364 | input_type_ids = []
365 | tokens.append("[CLS]")
366 | input_type_ids.append(0)
367 | for token in tokens_a:
368 | tokens.append(token)
369 | input_type_ids.append(0)
370 | tokens.append("[SEP]")
371 | input_type_ids.append(0)
372 |
373 | if tokens_b:
374 | for token in tokens_b:
375 | tokens.append(token)
376 | input_type_ids.append(1)
377 | tokens.append("[SEP]")
378 | input_type_ids.append(1)
379 |
380 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
381 |
382 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
383 | # tokens are attended to.
384 | input_mask = [1] * len(input_ids)
385 |
386 | # Zero-pad up to the sequence length.
387 | while len(input_ids) < seq_length:
388 | input_ids.append(0)
389 | input_mask.append(0)
390 | input_type_ids.append(0)
391 |
392 | assert len(input_ids) == seq_length
393 | assert len(input_mask) == seq_length
394 | assert len(input_type_ids) == seq_length
395 |
396 | features.append(
397 | InputFeatures(
398 | unique_id=example.unique_id,
399 | tokens=tokens,
400 | input_ids=input_ids,
401 | input_mask=input_mask,
402 | input_type_ids=input_type_ids))
403 | return features
404 |
405 |
406 | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
407 | """Truncates a sequence pair in place to the maximum length."""
408 |
409 | # This is a simple heuristic which will always truncate the longer sequence
410 | # one token at a time. This makes more sense than truncating an equal percent
411 | # of tokens from each, since if one sequence is very short then each token
412 | # that's truncated likely contains more information than a longer sequence.
413 | while True:
414 | total_length = len(tokens_a) + len(tokens_b)
415 | if total_length <= max_length:
416 | break
417 | if len(tokens_a) > len(tokens_b):
418 | tokens_a.pop()
419 | else:
420 | tokens_b.pop()
421 |
422 |
423 | def read_examples(str_list):
424 | """Read a list of `InputExample`s from a list of strings."""
425 | unique_id = 0
426 | for s in str_list:
427 | line = convert_to_unicode(s)
428 | line = line.strip()
429 | text_a = None
430 | text_b = None
431 | m = re.match(r"^(.*) \|\|\| (.*)$", line)
432 | if m is None:
433 | text_a = line
434 | else:
435 | text_a = m.group(1)
436 | text_b = m.group(2)
437 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
438 | unique_id += 1
439 |
440 |
441 | def features_to_arrays(features):
442 | all_input_ids = []
443 | all_input_mask = []
444 | all_segment_ids = []
445 |
446 | for feature in features:
447 | all_input_ids.append(feature.input_ids)
448 | all_input_mask.append(feature.input_mask)
449 | all_segment_ids.append(feature.input_type_ids)
450 |
451 | return (np.array(all_input_ids, dtype='int32'),
452 | np.array(all_input_mask, dtype='int32'),
453 | np.array(all_segment_ids, dtype='int32'))
454 |
455 |
456 | def build_preprocessor(voc_path, seq_len, lower=True):
457 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower)
458 | EMPTY_STR = ""
459 | PAD_STR = "pad"
460 | NULL_VAL = 0
461 |
462 | def strings_to_arrays(str_list):
463 | str_list = np.atleast_1d(str_list).reshape((-1,))
464 |
465 | empty_id = (str_list == EMPTY_STR).nonzero()[0]
466 | str_list[empty_id] = PAD_STR
467 |
468 | examples = []
469 | for example in read_examples(str_list):
470 | examples.append(example)
471 |
472 | features = convert_examples_to_features(examples, seq_len, tokenizer)
473 | arrays = features_to_arrays(features)
474 |
475 | for arr in arrays:
476 | arr[empty_id] = NULL_VAL
477 | str_list[empty_id] = EMPTY_STR
478 | return arrays
479 |
480 | return strings_to_arrays
481 |
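For reference, a minimal sketch of how the preprocessor built above is used; the vocab path assumes the uncased_L-12_H-768_A-12 checkpoint referenced elsewhere in the repo:

# Sketch: turn "visual ||| caption" strings into BERT input arrays.
preprocessor = build_preprocessor("uncased_L-12_H-768_A-12/vocab.txt", 64)
texts = ["standard poodle shopping cart footwear ||| a close up of a dog laying in a basket"]
input_ids, input_mask, segment_ids = preprocessor(texts)
print(input_ids.shape, input_mask.shape, segment_ids.shape)  # (1, 64) each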
--------------------------------------------------------------------------------
/BERT-CNN/bert_experimental/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow==1.15.4
2 | tensorflow-hub==0.7.0
3 |
--------------------------------------------------------------------------------
/BERT-CNN/bert_experimental/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | __version__ = '1.0.4'
4 |
5 | setup(
6 | name='bert_experimental',
7 | version=__version__,
8 | description='Utilities for finetuning BERT-like models',
9 | url='https://github.com/gaphex/bert_experimental',
10 | long_description=open('README.md', 'r', encoding="utf8").read(),
11 | long_description_content_type='text/markdown',
12 | author='Denis Antyukhov',
13 | author_email='gaphex@gmail.com',
14 | license='MIT',
15 | packages=find_packages(),
16 | zip_safe=False,
17 | install_requires=[
18 | 'tensorflow>=1.15, <2.0',
19 | 'tensorflow-hub==0.7.0',
20 | 'numpy'
21 | ],
22 | classifiers=(
23 | 'Programming Language :: Python :: 3.7',
24 | 'License :: OSI Approved :: MIT License',
25 | 'Operating System :: OS Independent',
26 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
27 | ),
28 | keywords='bert nlp tensorflow machine learning sentence encoding embedding finetuning',
29 | )
30 |
--------------------------------------------------------------------------------
/BERT-CNN/data/read.me:
--------------------------------------------------------------------------------
1 | put the train.tsv file here
2 |
--------------------------------------------------------------------------------
/BERT-CNN/data/test.tsv:
--------------------------------------------------------------------------------
1 | id visual caption
2 | 0 standard poodle shopping cart footwear a close up of a dog laying in a basket
3 | 1 street sign traffic light tower a black and white photo of a street light
4 | 2 toilet seat a white toilet with its seat up in a bathroom
5 | 3 mobile home studio couch house a living room filled with furniture and a coffee table
6 | 4 french loaf conch person a basket filled with sandwiches on top of a table
7 | 5 indian elephant a group of people riding on the back of an elephant
8 | 6 bow tie windsor glasses a man wearing glasses and a tie in a room
9 | 7 sombrero bonnet woman a woman standing in front of a giant cake
10 | 8 diaper bassinet human a baby sitting in front of a giant cake
11 | 9 bobsled go-kart human a group of children sitting around a piece of luggage
12 | 10 vase spotlight plant a bunch of flowers that are in a vase
13 |
--------------------------------------------------------------------------------
/BERT-CNN/data/train.tsv:
--------------------------------------------------------------------------------
1 | id id1 id2 visual caption is_related
2 | 220740 220741 220742 marimba dalmatian picket fence a horse jumping competition is going on with people in the stands 1
3 | 385729 385730 385731 dishwasher microwave barber chair a person riding a horse on a dirt ground 0
4 | 59422 59423 59424 laptop carton comicbook a laptop that has stickers on its cover is sitting on a table 1
5 | 46638 46639 46640 suit Windsortie woodenspoon a young bow wearing a pink shirt and a purple tie 1
6 | 11870 11871 11872 studiocouch four-poster quilt a couple of girls sitting in a bed in a bedroom 1
7 | 471676 471677 471678 streetcar fire engine passenger car a multi layer plate with cakes and food on it 0
8 | 186795 186796 186797 shoe shop television monitor a man playing a wii on a large projector screen 1
9 | 121836 121837 121838 ox water buffalo alp cattle standing on a hill in fog 1
10 | 396224 396225 396226 altar desk perfume oranges sitting in a blue bowl on a wooden table 0
11 | 430635 430636 430637 speedboat paddle lifeboat pots and other items sit on a stove and counter 0
12 | 145057 145058 145059 shopping cart ashcan park bench a coin meter that is laying down on grates 1
13 | 409778 409779 409780 web site fire engine comic book a painting of a man from the back 0
14 | 155568 155569 155570 grocery store patio restaurant a man and woman walking up the stairs in a backyard 1
15 | 213951 213952 213953 microwave washer dining table the kitchen is equipped with all the latest appliances 1
16 | 489266 489267 489268 traffic light aircraft carrier chain saw a laptop computer on a desk with cables a mug and bowl 0
17 | 257649 257650 257651 grocery store confectionery shopping basket a couple of wooden tale stopped with fresh fruit 1
18 | 113826 113827 113828 lab coat vestment West Highland white terrier a group of people standing in rows with frisbees for a photo 1
19 | 486413 486414 486415 snorkel ski tennis ball two frames of a woman in the air on a tennis court 0
20 | 400432 400433 400434 crutch lawn mower chain saw eight underneath on ambarella in the forest parrot 0
21 | 341153 341154 341155 washer microwave dishwasher a small propeller plane sitting underneath a covering at an airport 0
22 | 462067 462068 462069 ballplayer baseball scoreboard a plate full of bright green lettuce next to some bread 0
23 | 443392 443393 443394 grocery store pineapple pizza a man in black and white stripes with makeup smiling 0
24 | 486660 486661 486662 wombat wallaby titi a persons shadow on the ground of them skateboarding 0
25 | 336616 336617 336618 moped motor scooter crash helmet multiple street signs are attached to the post 0
26 | 124199 124200 124201 sorrel hog barrel a brown horse eating from a hallowed out metal barrel 1
27 | 238004 238005 238006 tray washbasin cradle a cat laying on a couch near a remote control 1
28 | 319195 319196 319197 airliner wing web site a propeller airplane parked inside and airplane hanger 1
29 | 412036 412037 412038 grey whale breakwater killer whale a stop sign is standing at a street intersection 0
30 | 491896 491897 491898 teddy wool toyshop a woman in an old-fashioned kitchen with pots and pans 0
31 | 487501 487502 487503 snowmobile steam locomotive tow truck the living room is clean and empty from people 0
32 | 277093 277094 277095 microwave dishwasher chest a chair holding a laptop that is facing towards an oven 1
33 | 135542 135543 135544 water buffalo warthog hog sheep grazing under a tree in a grassy meadow 1
34 | 8448 8449 8450 mountainbike unicycle bicycle-built-for-two a picture of a person throwing a frisbee 1
35 | 170686 170687 170688 police van minibus ambulance a person in the army greeting someone in a suit 1
36 | 372016 372017 372018 Great Dane Irish wolfhound English setter a man standing in a room holding a remote 0
37 | 351158 351159 351160 sunglass bullet train sunglasses a woman opening the trunk of her car 0
38 | 414542 414543 414544 killer whale great white shark paddle a dog running across a field with a frisbee in his mouth 0
39 | 264998 264999 265000 bannister ski unicycle a man riding a skateboard along a metal hand rail 1
40 | 362868 362869 362870 zebra bustard gazelle a basket full of bananas with a net on top 0
41 | 88455 88456 88457 patio flagpole pole a fire hydrant and fire hose in a houses front yard 1
42 | 372512 372513 372514 seashore catamaran swimming trunks a man riding a surfboard on a wave in the ocean 0
43 | 387327 387328 387329 cellular telephone lab coat cash machine a baseball game ensues as people watch 0
44 | 248027 248028 248029 web site barbershop cinema a motor bike on the side of the street 1
45 | 347507 347508 347509 banana pineapple orange a bear itching itself on a bare tree 0
46 | 33714 33715 33716 picketfence streetcar mountainbike the red bike and the pink bike just started dating 1
47 | 173989 173990 173991 umbrella poncho jinrikisha a group of people walking down a street carrying umbrellas 1
48 | 20835 20836 20837 ballplayer baseball footballhelmet a man throwing a baseball from a mound on a field 1
49 | 16356 16357 16358 lumbermill barbershop turnstile a man working on a baseball bat while two others watch 1
50 | 193491 193492 193493 unicycle pole horizontal bar boy riding on his skateboard down a stair rail 1
51 | 384165 384166 384167 mixing bowl corn meat loaf a couple of sailors standing next to a woman 0
52 | 321736 321737 321738 ballplayer baseball football helmet a boys baseball game with a batter catcher and umpire 1
53 | 108395 108396 108397 crash helmet moped backpack a man with a suit and tie on a motor bike 1
54 | 215942 215943 215944 unicycle military uniform bearskin four guys are sitting on a bench in front of a building 1
55 | 134156 134157 134158 wine bottle eggnog red wine there is a bottle of wine next to a glass 1
56 | 297783 297784 297785 necklace thimble corkscrew this is an image of a meal and an avocado is included 1
57 | 110516 110517 110518 minivan cab police van a dog looking ahead with a stoic look in a car seat 1
58 | 3166 3167 3168 grocerystore headcabbage cauliflower a pile of vegetables on display at a grocery store 1
59 | 440075 440076 440077 ski curly-coatedretriever Gordonsetter elephants and their young in their natural habitat 0
60 | 71021 71022 71023 ballplayer baseball puck a baseball player and a flying black bat 1
61 |
--------------------------------------------------------------------------------
/BERT-CNN/data_pre.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import sys
4 | import json
5 |
6 | import logging
7 | import numpy as np
8 | import pandas as pd
9 | import tensorflow as tf
10 |
11 | from modeling import BertModel, BertConfig
12 | from tokenization import FullTokenizer, convert_to_unicode
13 | from extract_features import InputExample, convert_examples_to_features
14 |
15 | def read_examples(str_list):
16 | """Read a list of `InputExample`s from a list of strings."""
17 | unique_id = 0
18 | for s in str_list:
19 | line = convert_to_unicode(s)
20 | if not line:
21 | continue
22 | line = line.strip()
23 | text_a = None
24 | text_b = None
25 | m = re.match(r"^(.*) \|\|\| (.*)$", line)
26 | if m is None:
27 | text_a = line
28 | else:
29 | text_a = m.group(1)
30 | text_b = m.group(2)
31 | yield InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)
32 | unique_id += 1
33 |
34 | # Convert these features to np.arrays for use with tf.keras.
35 | def features_to_arrays(features):
36 |
37 | all_input_ids = []
38 | all_input_mask = []
39 | all_segment_ids = []
40 |
41 | for feature in features:
42 | all_input_ids.append(feature.input_ids)
43 | all_input_mask.append(feature.input_mask)
44 | all_segment_ids.append(feature.input_type_ids)
45 |
46 | return (np.array(all_input_ids, dtype='int32'),
47 | np.array(all_input_mask, dtype='int32'),
48 | np.array(all_segment_ids, dtype='int32'))
49 |
50 |
51 | # Put it all together: strings -> BERT input arrays.
52 | def build_preprocessor(voc_path, seq_len, lower=True):
53 | tokenizer = FullTokenizer(vocab_file=voc_path, do_lower_case=lower)
54 |
55 | def strings_to_arrays(sents):
56 |
57 | sents = np.atleast_1d(sents).reshape((-1,))
58 |
59 | examples = []
60 | for example in read_examples(sents):
61 | examples.append(example)
62 |
63 | features = convert_examples_to_features(examples, seq_len, tokenizer)
64 | arrays = features_to_arrays(features)
65 | return arrays
66 |
67 | return strings_to_arrays
68 |
69 |
--------------------------------------------------------------------------------
/BERT-CNN/eval.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | import pandas as pd
4 | import sys
5 | import argparse
6 | from sklearn.model_selection import train_test_split
7 |
8 | sys.path.insert(0, "bert_experimental")
9 |
10 | from bert_experimental.finetuning.text_preprocessing import build_preprocessor
11 | from bert_experimental.finetuning.graph_ops import load_graph
12 |
13 |
14 |
15 | parser=argparse.ArgumentParser(description='inference of the model')
16 | parser.add_argument('--testset', default='test.tsv', help='path to the test .tsv file', type=str, required=True)
17 | parser.add_argument('--model', default='pre-trained model', help='path to the frozen pre-trained model (.pb)', type=str, required=True)
18 | args = parser.parse_args()
19 |
20 |
21 |
22 | df = pd.read_csv(args.testset, sep='\t')
23 |
24 |
25 | texts = []
26 | delimiter = " ||| "
27 |
28 | for vis, cap in zip(df.visual.tolist(), df.caption.tolist()):
29 | texts.append(delimiter.join((str(vis), str(cap))))
30 |
31 |
32 | texts = np.array(texts)
33 |
34 | trX, tsX = train_test_split(texts, shuffle=False, test_size=0.01)
35 |
36 |
37 | restored_graph = load_graph(args.model)
38 |
39 | graph_ops = restored_graph.get_operations()
40 | input_op, output_op = graph_ops[0].name, graph_ops[-1].name
41 | print(input_op, output_op)
42 |
43 | x = restored_graph.get_tensor_by_name(input_op + ':0')
44 | y = restored_graph.get_tensor_by_name(output_op + ':0')
45 |
46 | preprocessor = build_preprocessor("uncased_L-12_H-768_A-12/vocab.txt", 64)
47 | # Wrap the string preprocessor as a graph op so raw "visual ||| caption" strings can be fed.
48 | py_func = tf.numpy_function(preprocessor, [x], [tf.int32, tf.int32, tf.int32], name='preprocessor')
49 |
50 |
51 | ##predictions
52 |
53 | sess = tf.Session(graph=restored_graph)
54 |
55 | print(trX[:2])
56 |
57 | y = tf.print(y, summarize=-1)
58 | #x = tf.print(x, summarize=-1)
59 | y_out = sess.run(y, feed_dict={
60 | x: trX[:2].reshape((-1,1))
61 |
62 | })
63 |
64 | print(y_out)
65 |
66 |
--------------------------------------------------------------------------------
/BERT-CNN/extract_features.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Extract pre-computed feature vectors from BERT."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import codecs
22 | import collections
23 | import json
24 | import re
25 |
26 | import modeling
27 | import tokenization
28 | import tensorflow as tf
29 |
30 | flags = tf.flags
31 |
32 | FLAGS = flags.FLAGS
33 |
34 | flags.DEFINE_string("input_file", None, "")
35 |
36 | flags.DEFINE_string("output_file", None, "")
37 |
38 | flags.DEFINE_string("layers", "-1,-2,-3,-4", "")
39 |
40 | flags.DEFINE_string(
41 | "bert_config_file", None,
42 | "The config json file corresponding to the pre-trained BERT model. "
43 | "This specifies the model architecture.")
44 |
45 | flags.DEFINE_integer(
46 | "max_seq_length", 128,
47 | "The maximum total input sequence length after WordPiece tokenization. "
48 | "Sequences longer than this will be truncated, and sequences shorter "
49 | "than this will be padded.")
50 |
51 | flags.DEFINE_string(
52 | "init_checkpoint", None,
53 | "Initial checkpoint (usually from a pre-trained BERT model).")
54 |
55 | flags.DEFINE_string("vocab_file", None,
56 | "The vocabulary file that the BERT model was trained on.")
57 |
58 | flags.DEFINE_bool(
59 | "do_lower_case", True,
60 | "Whether to lower case the input text. Should be True for uncased "
61 | "models and False for cased models.")
62 |
63 | flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.")
64 |
65 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
66 |
67 | flags.DEFINE_string("master", None,
68 | "If using a TPU, the address of the master.")
69 |
70 | flags.DEFINE_integer(
71 | "num_tpu_cores", 8,
72 | "Only used if `use_tpu` is True. Total number of TPU cores to use.")
73 |
74 | flags.DEFINE_bool(
75 | "use_one_hot_embeddings", False,
76 | "If True, tf.one_hot will be used for embedding lookups, otherwise "
77 | "tf.nn.embedding_lookup will be used. On TPUs, this should be True "
78 | "since it is much faster.")
79 |
80 |
81 | class InputExample(object):
82 |
83 | def __init__(self, unique_id, text_a, text_b):
84 | self.unique_id = unique_id
85 | self.text_a = text_a
86 | self.text_b = text_b
87 |
88 |
89 | class InputFeatures(object):
90 | """A single set of features of data."""
91 |
92 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
93 | self.unique_id = unique_id
94 | self.tokens = tokens
95 | self.input_ids = input_ids
96 | self.input_mask = input_mask
97 | self.input_type_ids = input_type_ids
98 |
99 |
100 | def input_fn_builder(features, seq_length):
101 | """Creates an `input_fn` closure to be passed to TPUEstimator."""
102 |
103 | all_unique_ids = []
104 | all_input_ids = []
105 | all_input_mask = []
106 | all_input_type_ids = []
107 |
108 | for feature in features:
109 | all_unique_ids.append(feature.unique_id)
110 | all_input_ids.append(feature.input_ids)
111 | all_input_mask.append(feature.input_mask)
112 | all_input_type_ids.append(feature.input_type_ids)
113 |
114 | def input_fn(params):
115 | """The actual input function."""
116 | batch_size = params["batch_size"]
117 |
118 | num_examples = len(features)
119 |
120 | # This is for demo purposes and does NOT scale to large data sets. We do
121 | # not use Dataset.from_generator() because that uses tf.py_func which is
122 | # not TPU compatible. The right way to load data is with TFRecordReader.
123 | d = tf.data.Dataset.from_tensor_slices({
124 | "unique_ids":
125 | tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32),
126 | "input_ids":
127 | tf.constant(
128 | all_input_ids, shape=[num_examples, seq_length],
129 | dtype=tf.int32),
130 | "input_mask":
131 | tf.constant(
132 | all_input_mask,
133 | shape=[num_examples, seq_length],
134 | dtype=tf.int32),
135 | "input_type_ids":
136 | tf.constant(
137 | all_input_type_ids,
138 | shape=[num_examples, seq_length],
139 | dtype=tf.int32),
140 | })
141 |
142 | d = d.batch(batch_size=batch_size, drop_remainder=False)
143 | return d
144 |
145 | return input_fn
146 |
147 |
148 | def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu,
149 | use_one_hot_embeddings):
150 | """Returns `model_fn` closure for TPUEstimator."""
151 |
152 | def model_fn(features, labels, mode, params): # pylint: disable=unused-argument
153 | """The `model_fn` for TPUEstimator."""
154 |
155 | unique_ids = features["unique_ids"]
156 | input_ids = features["input_ids"]
157 | input_mask = features["input_mask"]
158 | input_type_ids = features["input_type_ids"]
159 |
160 | model = modeling.BertModel(
161 | config=bert_config,
162 | is_training=False,
163 | input_ids=input_ids,
164 | input_mask=input_mask,
165 | token_type_ids=input_type_ids,
166 | use_one_hot_embeddings=use_one_hot_embeddings)
167 |
168 | if mode != tf.estimator.ModeKeys.PREDICT:
169 | raise ValueError("Only PREDICT modes are supported: %s" % (mode))
170 |
171 | tvars = tf.trainable_variables()
172 | scaffold_fn = None
173 | (assignment_map,
174 | initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
175 | tvars, init_checkpoint)
176 | if use_tpu:
177 |
178 | def tpu_scaffold():
179 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
180 | return tf.train.Scaffold()
181 |
182 | scaffold_fn = tpu_scaffold
183 | else:
184 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
185 |
186 | tf.logging.info("**** Trainable Variables ****")
187 | for var in tvars:
188 | init_string = ""
189 | if var.name in initialized_variable_names:
190 | init_string = ", *INIT_FROM_CKPT*"
191 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
192 | init_string)
193 |
194 | all_layers = model.get_all_encoder_layers()
195 |
196 | predictions = {
197 | "unique_id": unique_ids,
198 | }
199 |
200 | for (i, layer_index) in enumerate(layer_indexes):
201 | predictions["layer_output_%d" % i] = all_layers[layer_index]
202 |
203 | output_spec = tf.contrib.tpu.TPUEstimatorSpec(
204 | mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
205 | return output_spec
206 |
207 | return model_fn
208 |
209 |
210 | def convert_examples_to_features(examples, seq_length, tokenizer):
211 | """Loads a data file into a list of `InputBatch`s."""
212 |
213 | features = []
214 | for (ex_index, example) in enumerate(examples):
215 | tokens_a = tokenizer.tokenize(example.text_a)
216 |
217 | tokens_b = None
218 | if example.text_b:
219 | tokens_b = tokenizer.tokenize(example.text_b)
220 |
221 | if tokens_b:
222 | # Modifies `tokens_a` and `tokens_b` in place so that the total
223 | # length is less than the specified length.
224 | # Account for [CLS], [SEP], [SEP] with "- 3"
225 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
226 | else:
227 | # Account for [CLS] and [SEP] with "- 2"
228 | if len(tokens_a) > seq_length - 2:
229 | tokens_a = tokens_a[0:(seq_length - 2)]
230 |
231 | # The convention in BERT is:
232 | # (a) For sequence pairs:
233 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
234 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
235 | # (b) For single sequences:
236 | # tokens: [CLS] the dog is hairy . [SEP]
237 | # type_ids: 0 0 0 0 0 0 0
238 | #
239 | # Where "type_ids" are used to indicate whether this is the first
240 | # sequence or the second sequence. The embedding vectors for `type=0` and
241 | # `type=1` were learned during pre-training and are added to the wordpiece
242 | # embedding vector (and position vector). This is not *strictly* necessary
243 | # since the [SEP] token unambiguously separates the sequences, but it makes
244 | # it easier for the model to learn the concept of sequences.
245 | #
246 | # For classification tasks, the first vector (corresponding to [CLS]) is
247 | # used as the "sentence vector". Note that this only makes sense because
248 | # the entire model is fine-tuned.
249 | tokens = []
250 | input_type_ids = []
251 | tokens.append("[CLS]")
252 | input_type_ids.append(0)
253 | for token in tokens_a:
254 | tokens.append(token)
255 | input_type_ids.append(0)
256 | tokens.append("[SEP]")
257 | input_type_ids.append(0)
258 |
259 | if tokens_b:
260 | for token in tokens_b:
261 | tokens.append(token)
262 | input_type_ids.append(1)
263 | tokens.append("[SEP]")
264 | input_type_ids.append(1)
265 |
266 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
267 |
268 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
269 | # tokens are attended to.
270 | input_mask = [1] * len(input_ids)
271 |
272 | # Zero-pad up to the sequence length.
273 | while len(input_ids) < seq_length:
274 | input_ids.append(0)
275 | input_mask.append(0)
276 | input_type_ids.append(0)
277 |
278 | assert len(input_ids) == seq_length
279 | assert len(input_mask) == seq_length
280 | assert len(input_type_ids) == seq_length
281 |
282 | if ex_index < 5:
283 | tf.logging.info("*** Example ***")
284 | tf.logging.info("unique_id: %s" % (example.unique_id))
285 | tf.logging.info("tokens: %s" % " ".join(
286 | [tokenization.printable_text(x) for x in tokens]))
287 | tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
288 | tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
289 | tf.logging.info(
290 | "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
291 |
292 | features.append(
293 | InputFeatures(
294 | unique_id=example.unique_id,
295 | tokens=tokens,
296 | input_ids=input_ids,
297 | input_mask=input_mask,
298 | input_type_ids=input_type_ids))
299 | return features
300 |
301 |
302 | def _truncate_seq_pair(tokens_a, tokens_b, max_length):
303 | """Truncates a sequence pair in place to the maximum length."""
304 |
305 | # This is a simple heuristic which will always truncate the longer sequence
306 | # one token at a time. This makes more sense than truncating an equal percent
307 | # of tokens from each, since if one sequence is very short then each token
308 | # that's truncated likely contains more information than a longer sequence.
309 | while True:
310 | total_length = len(tokens_a) + len(tokens_b)
311 | if total_length <= max_length:
312 | break
313 | if len(tokens_a) > len(tokens_b):
314 | tokens_a.pop()
315 | else:
316 | tokens_b.pop()
317 |
318 |
319 | def read_examples(input_file):
320 | """Read a list of `InputExample`s from an input file."""
321 | examples = []
322 | unique_id = 0
323 | with tf.gfile.GFile(input_file, "r") as reader:
324 | while True:
325 | line = tokenization.convert_to_unicode(reader.readline())
326 | if not line:
327 | break
328 | line = line.strip()
329 | text_a = None
330 | text_b = None
331 | m = re.match(r"^(.*) \|\|\| (.*)$", line)
332 | if m is None:
333 | text_a = line
334 | else:
335 | text_a = m.group(1)
336 | text_b = m.group(2)
337 | examples.append(
338 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
339 | unique_id += 1
340 | return examples
341 |
342 |
343 | def main(_):
344 | tf.logging.set_verbosity(tf.logging.INFO)
345 |
346 | layer_indexes = [int(x) for x in FLAGS.layers.split(",")]
347 |
348 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
349 |
350 | tokenizer = tokenization.FullTokenizer(
351 | vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
352 |
353 | is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
354 | run_config = tf.contrib.tpu.RunConfig(
355 | master=FLAGS.master,
356 | tpu_config=tf.contrib.tpu.TPUConfig(
357 | num_shards=FLAGS.num_tpu_cores,
358 | per_host_input_for_training=is_per_host))
359 |
360 | examples = read_examples(FLAGS.input_file)
361 |
362 | features = convert_examples_to_features(
363 | examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer)
364 |
365 | unique_id_to_feature = {}
366 | for feature in features:
367 | unique_id_to_feature[feature.unique_id] = feature
368 |
369 | model_fn = model_fn_builder(
370 | bert_config=bert_config,
371 | init_checkpoint=FLAGS.init_checkpoint,
372 | layer_indexes=layer_indexes,
373 | use_tpu=FLAGS.use_tpu,
374 | use_one_hot_embeddings=FLAGS.use_one_hot_embeddings)
375 |
376 | # If TPU is not available, this will fall back to normal Estimator on CPU
377 | # or GPU.
378 | estimator = tf.contrib.tpu.TPUEstimator(
379 | use_tpu=FLAGS.use_tpu,
380 | model_fn=model_fn,
381 | config=run_config,
382 | predict_batch_size=FLAGS.batch_size)
383 |
384 | input_fn = input_fn_builder(
385 | features=features, seq_length=FLAGS.max_seq_length)
386 |
387 | with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
388 | "w")) as writer:
389 | for result in estimator.predict(input_fn, yield_single_examples=True):
390 | unique_id = int(result["unique_id"])
391 | feature = unique_id_to_feature[unique_id]
392 | output_json = collections.OrderedDict()
393 | output_json["linex_index"] = unique_id
394 | all_features = []
395 | for (i, token) in enumerate(feature.tokens):
396 | all_layers = []
397 | for (j, layer_index) in enumerate(layer_indexes):
398 | layer_output = result["layer_output_%d" % j]
399 | layers = collections.OrderedDict()
400 | layers["index"] = layer_index
401 | layers["values"] = [
402 | round(float(x), 6) for x in layer_output[i:(i + 1)].flat
403 | ]
404 | all_layers.append(layers)
405 | features = collections.OrderedDict()
406 | features["token"] = token
407 | features["layers"] = all_layers
408 | all_features.append(features)
409 | output_json["features"] = all_features
410 | writer.write(json.dumps(output_json) + "\n")
411 |
412 |
413 | if __name__ == "__main__":
414 | flags.mark_flag_as_required("input_file")
415 | flags.mark_flag_as_required("vocab_file")
416 | flags.mark_flag_as_required("bert_config_file")
417 | flags.mark_flag_as_required("init_checkpoint")
418 | flags.mark_flag_as_required("output_file")
419 | tf.app.run()
420 |
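The input file read by read_examples above holds one example per line, with an optional " ||| " separating text_a from text_b. A small sketch of the format (the file name is illustrative):

# Sketch: read_examples splits each line on " ||| " into text_a / text_b.
with open("captions.txt", "w") as f:
    f.write("standard poodle shopping cart ||| a dog laying in a basket\n")
    f.write("a single sentence without a second segment\n")

examples = read_examples("captions.txt")
print(examples[0].text_a)  # standard poodle shopping cart
print(examples[0].text_b)  # a dog laying in a basket
print(examples[1].text_b)  # None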
--------------------------------------------------------------------------------
/BERT-CNN/freeze_keras_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 | import os
5 | import sys
6 | import json
7 |
8 | import logging
9 | import numpy as np
10 | import pandas as pd
11 | import tensorflow as tf
12 | import tensorflow_hub as hub
13 | from tensorflow import keras
14 | from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
15 |
16 | from sklearn.model_selection import train_test_split
17 |
18 |
19 | if not 'bert_repo' in sys.path:
20 | sys.path.insert(0, 'bert_repo')
21 |
22 | from modeling import BertModel, BertConfig
23 | from tokenization import FullTokenizer, convert_to_unicode
24 | from extract_features import InputExample, convert_examples_to_features
25 | from tensorflow.python.framework.graph_util import convert_variables_to_constants
26 | from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference
27 | def freeze_keras_model(model, export_path=None, clear_devices=True):
28 | sess = tf.keras.backend.get_session()
29 | graph = sess.graph
30 |
31 | with graph.as_default():
32 |
33 | input_tensors = model.inputs
34 | output_tensors = model.outputs
35 | dtypes = [t.dtype.as_datatype_enum for t in input_tensors]
36 | input_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in input_tensors]
37 | output_ops = [t.name.rsplit(":", maxsplit=1)[0] for t in output_tensors]
38 |
39 | tmp_g = graph.as_graph_def()
40 | if clear_devices:
41 | for node in tmp_g.node:
42 | node.device = ""
43 |
44 | tmp_g = optimize_for_inference(
45 | tmp_g, input_ops, output_ops, dtypes, False)
46 |
47 | tmp_g = convert_variables_to_constants(sess, tmp_g, output_ops)
48 |
49 | if export_path is not None:
50 | with tf.gfile.GFile(export_path, "wb") as f:
51 | f.write(tmp_g.SerializeToString())
52 |
53 | return tmp_g
54 |
55 |
--------------------------------------------------------------------------------
/BERT-CNN/model.json:
--------------------------------------------------------------------------------
1 | "{\"class_name\": \"Model\", \"config\": {\"name\": \"model\", \"layers\": [{\"name\": \"input_1\", \"class_name\": \"InputLayer\", \"config\": {\"batch_input_shape\": [null, 1], \"dtype\": \"string\", \"sparse\": false, \"ragged\": false, \"name\": \"input_1\"}, \"inbound_nodes\": []}, {\"name\": \"bert_layer\", \"class_name\": \"BertLayer\", \"config\": {\"bert_path\": \"./bert-module/\", \"seq_len\": 64, \"pooling\": null, \"n_tune_layers\": 12, \"tune_embeddings\": false, \"do_preprocessing\": true, \"verbose\": false}, \"inbound_nodes\": [[[\"input_1\", 0, 0, {}]]]}, {\"name\": \"conv1d\", \"class_name\": \"Conv1D\", \"config\": {\"name\": \"conv1d\", \"trainable\": true, \"dtype\": \"float32\", \"filters\": 32, \"kernel_size\": [3], \"strides\": [1], \"padding\": \"valid\", \"data_format\": \"channels_last\", \"dilation_rate\": [1], \"activation\": \"relu\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"GlorotUniform\", \"config\": {\"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Zeros\", \"config\": {\"dtype\": \"float32\"}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"inbound_nodes\": [[[\"bert_layer\", 0, 0, {}]]]}, {\"name\": \"max_pooling1d\", \"class_name\": \"MaxPooling1D\", \"config\": {\"name\": \"max_pooling1d\", \"trainable\": true, \"dtype\": \"float32\", \"strides\": [2], \"pool_size\": [2], \"padding\": \"valid\", \"data_format\": \"channels_last\"}, \"inbound_nodes\": [[[\"conv1d\", 0, 0, {}]]]}, {\"name\": \"flatten\", \"class_name\": \"Flatten\", \"config\": {\"name\": \"flatten\", \"trainable\": true, \"dtype\": \"float32\", \"data_format\": \"channels_last\"}, \"inbound_nodes\": [[[\"max_pooling1d\", 0, 0, {}]]]}, {\"name\": \"dense\", \"class_name\": \"Dense\", \"config\": {\"name\": \"dense\", \"trainable\": true, \"dtype\": \"float32\", \"units\": 1, \"activation\": \"sigmoid\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"GlorotUniform\", \"config\": {\"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Zeros\", \"config\": {\"dtype\": \"float32\"}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"inbound_nodes\": [[[\"flatten\", 0, 0, {}]]]}], \"input_layers\": [[\"input_1\", 0, 0]], \"output_layers\": [[\"dense\", 0, 0]]}, \"keras_version\": \"2.2.4-tf\", \"backend\": \"tensorflow\"}"
--------------------------------------------------------------------------------
/BERT-CNN/optimization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Functions and classes related to optimization (weight updates)."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import re
22 | import tensorflow as tf
23 |
24 |
25 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
26 | """Creates an optimizer training op."""
27 | global_step = tf.train.get_or_create_global_step()
28 |
29 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
30 |
31 | # Implements linear decay of the learning rate.
32 | learning_rate = tf.train.polynomial_decay(
33 | learning_rate,
34 | global_step,
35 | num_train_steps,
36 | end_learning_rate=0.0,
37 | power=1.0,
38 | cycle=False)
39 |
40 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
41 | # learning rate will be `global_step/num_warmup_steps * init_lr`.
42 | if num_warmup_steps:
43 | global_steps_int = tf.cast(global_step, tf.int32)
44 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
45 |
46 | global_steps_float = tf.cast(global_steps_int, tf.float32)
47 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
48 |
49 | warmup_percent_done = global_steps_float / warmup_steps_float
50 | warmup_learning_rate = init_lr * warmup_percent_done
51 |
52 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
53 | learning_rate = (
54 | (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
55 |
56 | # It is recommended that you use this optimizer for fine tuning, since this
57 | # is how the model was trained (note that the Adam m/v variables are NOT
58 | # loaded from init_checkpoint.)
59 | optimizer = AdamWeightDecayOptimizer(
60 | learning_rate=learning_rate,
61 | weight_decay_rate=0.01,
62 | beta_1=0.9,
63 | beta_2=0.999,
64 | epsilon=1e-6,
65 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
66 |
67 | if use_tpu:
68 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
69 |
70 | tvars = tf.trainable_variables()
71 | grads = tf.gradients(loss, tvars)
72 |
73 | # This is how the model was pre-trained.
74 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
75 |
76 | train_op = optimizer.apply_gradients(
77 | zip(grads, tvars), global_step=global_step)
78 |
79 | # Normally the global step update is done inside of `apply_gradients`.
80 | # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
81 | # a different optimizer, you should probably take this line out.
82 | new_global_step = global_step + 1
83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)])
84 | return train_op
85 |
86 |
87 | class AdamWeightDecayOptimizer(tf.train.Optimizer):
88 | """A basic Adam optimizer that includes "correct" L2 weight decay."""
89 |
90 | def __init__(self,
91 | learning_rate,
92 | weight_decay_rate=0.0,
93 | beta_1=0.9,
94 | beta_2=0.999,
95 | epsilon=1e-6,
96 | exclude_from_weight_decay=None,
97 | name="AdamWeightDecayOptimizer"):
98 |     """Constructs an AdamWeightDecayOptimizer."""
99 | super(AdamWeightDecayOptimizer, self).__init__(False, name)
100 |
101 | self.learning_rate = learning_rate
102 | self.weight_decay_rate = weight_decay_rate
103 | self.beta_1 = beta_1
104 | self.beta_2 = beta_2
105 | self.epsilon = epsilon
106 | self.exclude_from_weight_decay = exclude_from_weight_decay
107 |
108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None):
109 | """See base class."""
110 | assignments = []
111 | for (grad, param) in grads_and_vars:
112 | if grad is None or param is None:
113 | continue
114 |
115 | param_name = self._get_variable_name(param.name)
116 |
117 | m = tf.get_variable(
118 | name=param_name + "/adam_m",
119 | shape=param.shape.as_list(),
120 | dtype=tf.float32,
121 | trainable=False,
122 | initializer=tf.zeros_initializer())
123 | v = tf.get_variable(
124 | name=param_name + "/adam_v",
125 | shape=param.shape.as_list(),
126 | dtype=tf.float32,
127 | trainable=False,
128 | initializer=tf.zeros_initializer())
129 |
130 | # Standard Adam update.
131 | next_m = (
132 | tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
133 | next_v = (
134 | tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
135 | tf.square(grad)))
136 |
137 | update = next_m / (tf.sqrt(next_v) + self.epsilon)
138 |
139 | # Just adding the square of the weights to the loss function is *not*
140 | # the correct way of using L2 regularization/weight decay with Adam,
141 | # since that will interact with the m and v parameters in strange ways.
142 | #
143 | # Instead we want to decay the weights in a manner that doesn't interact
144 | # with the m/v parameters. This is equivalent to adding the square
145 | # of the weights to the loss with plain (non-momentum) SGD.
146 | if self._do_use_weight_decay(param_name):
147 | update += self.weight_decay_rate * param
148 |
149 | update_with_lr = self.learning_rate * update
150 |
151 | next_param = param - update_with_lr
152 |
153 | assignments.extend(
154 | [param.assign(next_param),
155 | m.assign(next_m),
156 | v.assign(next_v)])
157 | return tf.group(*assignments, name=name)
158 |
159 | def _do_use_weight_decay(self, param_name):
160 | """Whether to use L2 weight decay for `param_name`."""
161 | if not self.weight_decay_rate:
162 | return False
163 | if self.exclude_from_weight_decay:
164 | for r in self.exclude_from_weight_decay:
165 | if re.search(r, param_name) is not None:
166 | return False
167 | return True
168 |
169 | def _get_variable_name(self, param_name):
170 | """Get the variable name from the tensor name."""
171 | m = re.match("^(.*):\\d+$", param_name)
172 | if m is not None:
173 | param_name = m.group(1)
174 | return param_name
175 |
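The schedule assembled in create_optimizer is a linear warmup from 0 to init_lr over num_warmup_steps, followed by a linear (power=1.0) decay to 0 at num_train_steps. A plain-Python sketch of the effective learning rate, with illustrative numbers:

# Sketch: effective learning rate under the warmup + linear-decay schedule above.
def bert_lr(step, init_lr=2e-5, num_train_steps=1000, num_warmup_steps=100):
    if step < num_warmup_steps:
        return init_lr * step / num_warmup_steps      # linear warmup
    return init_lr * (1.0 - step / num_train_steps)   # linear decay to zero

print(bert_lr(50))    # 1e-05   (halfway through warmup)
print(bert_lr(100))   # 1.8e-05 (warmup finished, decay under way)
print(bert_lr(1000))  # 0.0     (end of training)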
--------------------------------------------------------------------------------
/BERT-CNN/test_demo.tsv:
--------------------------------------------------------------------------------
1 | test_id visual caption
2 | 0 standard poodle shopping cart footwear a close up of shoes and a dog in a basket
3 | 1 standard poodle shopping cart footwear a brown teddy bear laying on top of a pair of shoes
4 | 2 toilet seat a toilet with a hole in the floor
5 | 3 mobile home studio couch house a living room with a couch chair coffee table and a television
6 | 4 french loaf conch person a sandwich and a basket of food on a table
7 | 5 indian elephant a man and two children riding on an elephant
8 |
--------------------------------------------------------------------------------
/BERT-CNN/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import re
23 | import unicodedata
24 | import six
25 | import tensorflow as tf
26 |
27 |
28 | def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
29 | """Checks whether the casing config is consistent with the checkpoint name."""
30 |
31 | # The casing has to be passed in by the user and there is no explicit check
32 | # as to whether it matches the checkpoint. The casing information probably
33 | # should have been stored in the bert_config.json file, but it's not, so
34 | # we have to heuristically detect it to validate.
35 |
36 | if not init_checkpoint:
37 | return
38 |
39 | m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
40 | if m is None:
41 | return
42 |
43 | model_name = m.group(1)
44 |
45 | lower_models = [
46 | "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
47 | "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
48 | ]
49 |
50 | cased_models = [
51 | "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
52 | "multi_cased_L-12_H-768_A-12"
53 | ]
54 |
55 | is_bad_config = False
56 | if model_name in lower_models and not do_lower_case:
57 | is_bad_config = True
58 | actual_flag = "False"
59 | case_name = "lowercased"
60 | opposite_flag = "True"
61 |
62 | if model_name in cased_models and do_lower_case:
63 | is_bad_config = True
64 | actual_flag = "True"
65 | case_name = "cased"
66 | opposite_flag = "False"
67 |
68 | if is_bad_config:
69 | raise ValueError(
70 | "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
71 | "However, `%s` seems to be a %s model, so you "
72 | "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
73 | "how the model was pre-training. If this error is wrong, please "
74 | "just comment out this check." % (actual_flag, init_checkpoint,
75 | model_name, case_name, opposite_flag))
76 |
77 |
78 | def convert_to_unicode(text):
79 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
80 | if six.PY3:
81 | if isinstance(text, str):
82 | return text
83 | elif isinstance(text, bytes):
84 | return text.decode("utf-8", "ignore")
85 | else:
86 | raise ValueError("Unsupported string type: %s" % (type(text)))
87 | elif six.PY2:
88 | if isinstance(text, str):
89 | return text.decode("utf-8", "ignore")
90 | elif isinstance(text, unicode):
91 | return text
92 | else:
93 | raise ValueError("Unsupported string type: %s" % (type(text)))
94 | else:
95 | raise ValueError("Not running on Python2 or Python 3?")
96 |
97 |
98 | def printable_text(text):
99 | """Returns text encoded in a way suitable for print or `tf.logging`."""
100 |
101 | # These functions want `str` for both Python2 and Python3, but in one case
102 | # it's a Unicode string and in the other it's a byte string.
103 | if six.PY3:
104 | if isinstance(text, str):
105 | return text
106 | elif isinstance(text, bytes):
107 | return text.decode("utf-8", "ignore")
108 | else:
109 | raise ValueError("Unsupported string type: %s" % (type(text)))
110 | elif six.PY2:
111 | if isinstance(text, str):
112 | return text
113 | elif isinstance(text, unicode):
114 | return text.encode("utf-8")
115 | else:
116 | raise ValueError("Unsupported string type: %s" % (type(text)))
117 | else:
118 | raise ValueError("Not running on Python2 or Python 3?")
119 |
120 |
121 | def load_vocab(vocab_file):
122 | """Loads a vocabulary file into a dictionary."""
123 | vocab = collections.OrderedDict()
124 | index = 0
125 | with tf.gfile.GFile(vocab_file, "r") as reader:
126 | while True:
127 | token = convert_to_unicode(reader.readline())
128 | if not token:
129 | break
130 | token = token.strip()
131 | vocab[token] = index
132 | index += 1
133 | return vocab
134 |
135 |
136 | def convert_by_vocab(vocab, items):
137 | """Converts a sequence of [tokens|ids] using the vocab."""
138 | output = []
139 | for item in items:
140 | output.append(vocab[item])
141 | return output
142 |
143 |
144 | def convert_tokens_to_ids(vocab, tokens):
145 | return convert_by_vocab(vocab, tokens)
146 |
147 |
148 | def convert_ids_to_tokens(inv_vocab, ids):
149 | return convert_by_vocab(inv_vocab, ids)
150 |
151 |
152 | def whitespace_tokenize(text):
153 | """Runs basic whitespace cleaning and splitting on a piece of text."""
154 | text = text.strip()
155 | if not text:
156 | return []
157 | tokens = text.split()
158 | return tokens
159 |
160 |
161 | class FullTokenizer(object):
162 | """Runs end-to-end tokenziation."""
163 |
164 | def __init__(self, vocab_file, do_lower_case=True):
165 | self.vocab = load_vocab(vocab_file)
166 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
167 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
168 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
169 |
170 | def tokenize(self, text):
171 | split_tokens = []
172 | for token in self.basic_tokenizer.tokenize(text):
173 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
174 | split_tokens.append(sub_token)
175 |
176 | return split_tokens
177 |
178 | def convert_tokens_to_ids(self, tokens):
179 | return convert_by_vocab(self.vocab, tokens)
180 |
181 | def convert_ids_to_tokens(self, ids):
182 | return convert_by_vocab(self.inv_vocab, ids)
183 |
184 |
185 | class BasicTokenizer(object):
186 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
187 |
188 | def __init__(self, do_lower_case=True):
189 | """Constructs a BasicTokenizer.
190 |
191 | Args:
192 | do_lower_case: Whether to lower case the input.
193 | """
194 | self.do_lower_case = do_lower_case
195 |
196 | def tokenize(self, text):
197 | """Tokenizes a piece of text."""
198 | text = convert_to_unicode(text)
199 | text = self._clean_text(text)
200 |
201 | # This was added on November 1st, 2018 for the multilingual and Chinese
202 | # models. This is also applied to the English models now, but it doesn't
203 | # matter since the English models were not trained on any Chinese data
204 | # and generally don't have any Chinese data in them (there are Chinese
205 | # characters in the vocabulary because Wikipedia does have some Chinese
206 | # words in the English Wikipedia.).
207 | text = self._tokenize_chinese_chars(text)
208 |
209 | orig_tokens = whitespace_tokenize(text)
210 | split_tokens = []
211 | for token in orig_tokens:
212 | if self.do_lower_case:
213 | token = token.lower()
214 | token = self._run_strip_accents(token)
215 | split_tokens.extend(self._run_split_on_punc(token))
216 |
217 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
218 | return output_tokens
219 |
220 | def _run_strip_accents(self, text):
221 | """Strips accents from a piece of text."""
222 | text = unicodedata.normalize("NFD", text)
223 | output = []
224 | for char in text:
225 | cat = unicodedata.category(char)
226 | if cat == "Mn":
227 | continue
228 | output.append(char)
229 | return "".join(output)
230 |
231 | def _run_split_on_punc(self, text):
232 | """Splits punctuation on a piece of text."""
233 | chars = list(text)
234 | i = 0
235 | start_new_word = True
236 | output = []
237 | while i < len(chars):
238 | char = chars[i]
239 | if _is_punctuation(char):
240 | output.append([char])
241 | start_new_word = True
242 | else:
243 | if start_new_word:
244 | output.append([])
245 | start_new_word = False
246 | output[-1].append(char)
247 | i += 1
248 |
249 | return ["".join(x) for x in output]
250 |
251 | def _tokenize_chinese_chars(self, text):
252 | """Adds whitespace around any CJK character."""
253 | output = []
254 | for char in text:
255 | cp = ord(char)
256 | if self._is_chinese_char(cp):
257 | output.append(" ")
258 | output.append(char)
259 | output.append(" ")
260 | else:
261 | output.append(char)
262 | return "".join(output)
263 |
264 | def _is_chinese_char(self, cp):
265 | """Checks whether CP is the codepoint of a CJK character."""
266 | # This defines a "chinese character" as anything in the CJK Unicode block:
267 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
268 | #
269 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
270 | # despite its name. The modern Korean Hangul alphabet is a different block,
271 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
272 | # space-separated words, so they are not treated specially and handled
273 | # like all of the other languages.
274 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
275 | (cp >= 0x3400 and cp <= 0x4DBF) or #
276 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
277 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
278 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
279 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
280 | (cp >= 0xF900 and cp <= 0xFAFF) or #
281 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
282 | return True
283 |
284 | return False
285 |
286 | def _clean_text(self, text):
287 | """Performs invalid character removal and whitespace cleanup on text."""
288 | output = []
289 | for char in text:
290 | cp = ord(char)
291 | if cp == 0 or cp == 0xfffd or _is_control(char):
292 | continue
293 | if _is_whitespace(char):
294 | output.append(" ")
295 | else:
296 | output.append(char)
297 | return "".join(output)
298 |
299 |
300 | class WordpieceTokenizer(object):
301 | """Runs WordPiece tokenziation."""
302 |
303 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
304 | self.vocab = vocab
305 | self.unk_token = unk_token
306 | self.max_input_chars_per_word = max_input_chars_per_word
307 |
308 | def tokenize(self, text):
309 | """Tokenizes a piece of text into its word pieces.
310 |
311 | This uses a greedy longest-match-first algorithm to perform tokenization
312 | using the given vocabulary.
313 |
314 | For example:
315 | input = "unaffable"
316 | output = ["un", "##aff", "##able"]
317 |
318 | Args:
319 | text: A single token or whitespace separated tokens. This should have
320 | already been passed through `BasicTokenizer`.
321 |
322 | Returns:
323 | A list of wordpiece tokens.
324 | """
325 |
326 | text = convert_to_unicode(text)
327 |
328 | output_tokens = []
329 | for token in whitespace_tokenize(text):
330 | chars = list(token)
331 | if len(chars) > self.max_input_chars_per_word:
332 | output_tokens.append(self.unk_token)
333 | continue
334 |
335 | is_bad = False
336 | start = 0
337 | sub_tokens = []
338 | while start < len(chars):
339 | end = len(chars)
340 | cur_substr = None
341 | while start < end:
342 | substr = "".join(chars[start:end])
343 | if start > 0:
344 | substr = "##" + substr
345 | if substr in self.vocab:
346 | cur_substr = substr
347 | break
348 | end -= 1
349 | if cur_substr is None:
350 | is_bad = True
351 | break
352 | sub_tokens.append(cur_substr)
353 | start = end
354 |
355 | if is_bad:
356 | output_tokens.append(self.unk_token)
357 | else:
358 | output_tokens.extend(sub_tokens)
359 | return output_tokens
360 |
361 |
362 | def _is_whitespace(char):
363 | """Checks whether `chars` is a whitespace character."""
364 | # \t, \n, and \r are technically control characters but we treat them
365 | # as whitespace since they are generally considered as such.
366 | if char == " " or char == "\t" or char == "\n" or char == "\r":
367 | return True
368 | cat = unicodedata.category(char)
369 | if cat == "Zs":
370 | return True
371 | return False
372 |
373 |
374 | def _is_control(char):
375 | """Checks whether `chars` is a control character."""
376 | # These are technically control characters but we count them as whitespace
377 | # characters.
378 | if char == "\t" or char == "\n" or char == "\r":
379 | return False
380 | cat = unicodedata.category(char)
381 | if cat in ("Cc", "Cf"):
382 | return True
383 | return False
384 |
385 |
386 | def _is_punctuation(char):
387 | """Checks whether `chars` is a punctuation character."""
388 | cp = ord(char)
389 | # We treat all non-letter/number ASCII as punctuation.
390 | # Characters such as "^", "$", and "`" are not in the Unicode
391 | # Punctuation class but we treat them as punctuation anyways, for
392 | # consistency.
393 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
394 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
395 | return True
396 | cat = unicodedata.category(char)
397 | if cat.startswith("P"):
398 | return True
399 | return False
400 |
--------------------------------------------------------------------------------
/BERT-CNN/uncased_L-12_H-768_A-12/file-should be here.txt:
--------------------------------------------------------------------------------
1 | Download the pre-trained BERT checkpoint (uncased_L-12_H-768_A-12) from the BERT website.
2 |
3 |
--------------------------------------------------------------------------------
/BERT/README.md:
--------------------------------------------------------------------------------
1 | ## Semantic Relatedness with BERT
2 | Fine-tune BERT on the created dataset.
3 |
4 | ### Requirements
5 | - Tensorflow 1.15.0
6 | - Python 2.7
7 |
8 | ```
9 | conda create -n BERT_visual python=2.7 anaconda
10 | conda activate BERT_visual
11 | pip install tensorflow==1.15.0
12 | ```
13 |
14 | ```
15 | python train_model_VC.py # train/val and inference
16 | ```
17 | Main page example:
18 | ```
19 | ## relatedness score
20 |
21 | image: COCO_val2014_000000156242.jpg - Karpathy test split
22 | ```
23 | ```
24 | BERT Base
25 |
26 | ('visual :', 'apple') # Visual (ours)
27 | ('caption :', 'a display of apple and orange at market')
28 | ('Prediction :', 0.9933211)
29 | ******
30 | ('visual :', 'apple') # Greedy
31 | ('caption :', 'a fruit market with apples and orange')
32 | ('Prediction :', 0.98885113)
33 | ******
34 | ('visual :', 'apple') # Beam Search
35 | ('caption :', 'a fruit stand with apples and oranges')
36 | ('Prediction :', 0.9911321)
37 |
38 | BERT Large
39 |
40 | ('visual :', 'apple')
41 | ('caption :', 'a display of apple and orange at market')
42 | ('Prediction :', 0.99782264)
43 | ******
44 | ('visual :', 'apple')
45 | ('caption :', 'a fruit market with apples and orange')
46 | ('Prediction :', 0.99774504)
47 | ******
48 | ('visual :', 'apple')
49 | ('caption :', 'a fruit stand with apples and oranges')
50 | ('Prediction :', 0.9977704)
51 | ```
52 |
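The scores above come from `train_model_VC.py` run on (visual, caption) pairs. To score your own pairs, the test split is a plain three-column tsv (id, visual, caption) like `data/test.tsv`; a minimal sketch for writing one (an illustration, not part of the repo):

```
pairs = [("apple", "a display of apple and orange at market"),
         ("apple", "a fruit market with apples and orange")]

# write a header row plus one tab-separated row per pair,
# matching the 3-column test format read by VCProcessor.get_test_examples
with open("data/test.tsv", "w") as f:
    f.write("id\tvisual\tcaption\n")
    for i, (visual, caption) in enumerate(pairs):
        f.write("%d\t%s\t%s\n" % (i, visual, caption))
```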
--------------------------------------------------------------------------------
/BERT/data/test.tsv:
--------------------------------------------------------------------------------
1 | id visual caption
2 | 0 shopping a close up of a dog laying in a basket
3 | 1 traffic a black and white photo of a street light
4 | 2 toilet a white toilet with its seat up in a bathroom
5 | 3 bed a living room filled with furniture and a coffee table
6 | 4 hotdog a basket filled with sandwiches on top of a table
7 | 5 tusker a group of people riding on the back of an elephant
8 | 6 suit a man wearing glasses and a tie in a room
9 |
--------------------------------------------------------------------------------
/BERT/data/train.tsv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/BERT/data/train.tsv.zip
--------------------------------------------------------------------------------
/BERT/outputs/need-this.txt:
--------------------------------------------------------------------------------
1 | Put the provided weights in this folder if you want to continue training.
2 |
--------------------------------------------------------------------------------
/BERT/tokenization.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Tokenization classes."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import collections
22 | import unicodedata
23 | import six
24 | import tensorflow as tf
25 |
26 |
27 | def convert_to_unicode(text):
28 | """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
29 | if six.PY3:
30 | if isinstance(text, str):
31 | return text
32 | elif isinstance(text, bytes):
33 | return text.decode("utf-8", "ignore")
34 | else:
35 | raise ValueError("Unsupported string type: %s" % (type(text)))
36 | elif six.PY2:
37 | if isinstance(text, str):
38 | return text.decode("utf-8", "ignore")
39 | elif isinstance(text, unicode):
40 | return text
41 | else:
42 | raise ValueError("Unsupported string type: %s" % (type(text)))
43 | else:
44 | raise ValueError("Not running on Python2 or Python 3?")
45 |
46 |
47 | def printable_text(text):
48 | """Returns text encoded in a way suitable for print or `tf.logging`."""
49 |
50 | # These functions want `str` for both Python2 and Python3, but in one case
51 | # it's a Unicode string and in the other it's a byte string.
52 | if six.PY3:
53 | if isinstance(text, str):
54 | return text
55 | elif isinstance(text, bytes):
56 | return text.decode("utf-8", "ignore")
57 | else:
58 | raise ValueError("Unsupported string type: %s" % (type(text)))
59 | elif six.PY2:
60 | if isinstance(text, str):
61 | return text
62 | elif isinstance(text, unicode):
63 | return text.encode("utf-8")
64 | else:
65 | raise ValueError("Unsupported string type: %s" % (type(text)))
66 | else:
67 | raise ValueError("Not running on Python2 or Python 3?")
68 |
69 |
70 | def load_vocab(vocab_file):
71 | """Loads a vocabulary file into a dictionary."""
72 | vocab = collections.OrderedDict()
73 | index = 0
74 | with tf.gfile.GFile(vocab_file, "r") as reader:
75 | while True:
76 | token = convert_to_unicode(reader.readline())
77 | if not token:
78 | break
79 | token = token.strip()
80 | vocab[token] = index
81 | index += 1
82 | return vocab
83 |
84 |
85 | def convert_by_vocab(vocab, items):
86 | """Converts a sequence of [tokens|ids] using the vocab."""
87 | output = []
88 | for item in items:
89 | output.append(vocab[item])
90 | return output
91 |
92 |
93 | def convert_tokens_to_ids(vocab, tokens):
94 | return convert_by_vocab(vocab, tokens)
95 |
96 |
97 | def convert_ids_to_tokens(inv_vocab, ids):
98 | return convert_by_vocab(inv_vocab, ids)
99 |
100 |
101 | def whitespace_tokenize(text):
102 | """Runs basic whitespace cleaning and splitting on a peice of text."""
103 | text = text.strip()
104 | if not text:
105 | return []
106 | tokens = text.split()
107 | return tokens
108 |
109 |
110 | class FullTokenizer(object):
111 | """Runs end-to-end tokenziation."""
112 |
113 | def __init__(self, vocab_file, do_lower_case=True):
114 | self.vocab = load_vocab(vocab_file)
115 | self.inv_vocab = {v: k for k, v in self.vocab.items()}
116 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
117 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
118 |
119 | def tokenize(self, text):
120 | split_tokens = []
121 | for token in self.basic_tokenizer.tokenize(text):
122 | for sub_token in self.wordpiece_tokenizer.tokenize(token):
123 | split_tokens.append(sub_token)
124 |
125 | return split_tokens
126 |
127 | def convert_tokens_to_ids(self, tokens):
128 | return convert_by_vocab(self.vocab, tokens)
129 |
130 | def convert_ids_to_tokens(self, ids):
131 | return convert_by_vocab(self.inv_vocab, ids)
132 |
133 |
134 | class BasicTokenizer(object):
135 | """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
136 |
137 | def __init__(self, do_lower_case=True):
138 | """Constructs a BasicTokenizer.
139 |
140 | Args:
141 | do_lower_case: Whether to lower case the input.
142 | """
143 | self.do_lower_case = do_lower_case
144 |
145 | def tokenize(self, text):
146 | """Tokenizes a piece of text."""
147 | text = convert_to_unicode(text)
148 | text = self._clean_text(text)
149 |
150 | # This was added on November 1st, 2018 for the multilingual and Chinese
151 | # models. This is also applied to the English models now, but it doesn't
152 | # matter since the English models were not trained on any Chinese data
153 | # and generally don't have any Chinese data in them (there are Chinese
154 | # characters in the vocabulary because Wikipedia does have some Chinese
155 | # words in the English Wikipedia.).
156 | text = self._tokenize_chinese_chars(text)
157 |
158 | orig_tokens = whitespace_tokenize(text)
159 | split_tokens = []
160 | for token in orig_tokens:
161 | if self.do_lower_case:
162 | token = token.lower()
163 | token = self._run_strip_accents(token)
164 | split_tokens.extend(self._run_split_on_punc(token))
165 |
166 | output_tokens = whitespace_tokenize(" ".join(split_tokens))
167 | return output_tokens
168 |
169 | def _run_strip_accents(self, text):
170 | """Strips accents from a piece of text."""
171 | text = unicodedata.normalize("NFD", text)
172 | output = []
173 | for char in text:
174 | cat = unicodedata.category(char)
175 | if cat == "Mn":
176 | continue
177 | output.append(char)
178 | return "".join(output)
179 |
180 | def _run_split_on_punc(self, text):
181 | """Splits punctuation on a piece of text."""
182 | chars = list(text)
183 | i = 0
184 | start_new_word = True
185 | output = []
186 | while i < len(chars):
187 | char = chars[i]
188 | if _is_punctuation(char):
189 | output.append([char])
190 | start_new_word = True
191 | else:
192 | if start_new_word:
193 | output.append([])
194 | start_new_word = False
195 | output[-1].append(char)
196 | i += 1
197 |
198 | return ["".join(x) for x in output]
199 |
200 | def _tokenize_chinese_chars(self, text):
201 | """Adds whitespace around any CJK character."""
202 | output = []
203 | for char in text:
204 | cp = ord(char)
205 | if self._is_chinese_char(cp):
206 | output.append(" ")
207 | output.append(char)
208 | output.append(" ")
209 | else:
210 | output.append(char)
211 | return "".join(output)
212 |
213 | def _is_chinese_char(self, cp):
214 | """Checks whether CP is the codepoint of a CJK character."""
215 | # This defines a "chinese character" as anything in the CJK Unicode block:
216 | # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
217 | #
218 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
219 | # despite its name. The modern Korean Hangul alphabet is a different block,
220 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write
221 | # space-separated words, so they are not treated specially and handled
222 | # like all of the other languages.
223 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
224 | (cp >= 0x3400 and cp <= 0x4DBF) or #
225 | (cp >= 0x20000 and cp <= 0x2A6DF) or #
226 | (cp >= 0x2A700 and cp <= 0x2B73F) or #
227 | (cp >= 0x2B740 and cp <= 0x2B81F) or #
228 | (cp >= 0x2B820 and cp <= 0x2CEAF) or
229 | (cp >= 0xF900 and cp <= 0xFAFF) or #
230 | (cp >= 0x2F800 and cp <= 0x2FA1F)): #
231 | return True
232 |
233 | return False
234 |
235 | def _clean_text(self, text):
236 | """Performs invalid character removal and whitespace cleanup on text."""
237 | output = []
238 | for char in text:
239 | cp = ord(char)
240 | if cp == 0 or cp == 0xfffd or _is_control(char):
241 | continue
242 | if _is_whitespace(char):
243 | output.append(" ")
244 | else:
245 | output.append(char)
246 | return "".join(output)
247 |
248 |
249 | class WordpieceTokenizer(object):
250 | """Runs WordPiece tokenziation."""
251 |
252 | def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
253 | self.vocab = vocab
254 | self.unk_token = unk_token
255 | self.max_input_chars_per_word = max_input_chars_per_word
256 |
257 | def tokenize(self, text):
258 | """Tokenizes a piece of text into its word pieces.
259 |
260 | This uses a greedy longest-match-first algorithm to perform tokenization
261 | using the given vocabulary.
262 |
263 | For example:
264 | input = "unaffable"
265 | output = ["un", "##aff", "##able"]
266 |
267 | Args:
268 | text: A single token or whitespace separated tokens. This should have
269 | already been passed through `BasicTokenizer`.
270 |
271 | Returns:
272 | A list of wordpiece tokens.
273 | """
274 |
275 | text = convert_to_unicode(text)
276 |
277 | output_tokens = []
278 | for token in whitespace_tokenize(text):
279 | chars = list(token)
280 | if len(chars) > self.max_input_chars_per_word:
281 | output_tokens.append(self.unk_token)
282 | continue
283 |
284 | is_bad = False
285 | start = 0
286 | sub_tokens = []
287 | while start < len(chars):
288 | end = len(chars)
289 | cur_substr = None
290 | while start < end:
291 | substr = "".join(chars[start:end])
292 | if start > 0:
293 | substr = "##" + substr
294 | if substr in self.vocab:
295 | cur_substr = substr
296 | break
297 | end -= 1
298 | if cur_substr is None:
299 | is_bad = True
300 | break
301 | sub_tokens.append(cur_substr)
302 | start = end
303 |
304 | if is_bad:
305 | output_tokens.append(self.unk_token)
306 | else:
307 | output_tokens.extend(sub_tokens)
308 | return output_tokens
309 |
310 |
311 | def _is_whitespace(char):
312 | """Checks whether `chars` is a whitespace character."""
313 | # \t, \n, and \r are technically control characters but we treat them
314 | # as whitespace since they are generally considered as such.
315 | if char == " " or char == "\t" or char == "\n" or char == "\r":
316 | return True
317 | cat = unicodedata.category(char)
318 | if cat == "Zs":
319 | return True
320 | return False
321 |
322 |
323 | def _is_control(char):
324 | """Checks whether `chars` is a control character."""
325 | # These are technically control characters but we count them as whitespace
326 | # characters.
327 | if char == "\t" or char == "\n" or char == "\r":
328 | return False
329 | cat = unicodedata.category(char)
330 | if cat.startswith("C"):
331 | return True
332 | return False
333 |
334 |
335 | def _is_punctuation(char):
336 | """Checks whether `chars` is a punctuation character."""
337 | cp = ord(char)
338 | # We treat all non-letter/number ASCII as punctuation.
339 | # Characters such as "^", "$", and "`" are not in the Unicode
340 | # Punctuation class but we treat them as punctuation anyways, for
341 | # consistency.
342 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
343 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
344 | return True
345 | cat = unicodedata.category(char)
346 | if cat.startswith("P"):
347 | return True
348 | return False
349 |
--------------------------------------------------------------------------------
/BERT/train_model_VC.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | #tensorflow
3 | # 1.15.0
4 | import os
5 | import sys
6 | import json
7 | import datetime
8 | import pprint
9 | import os
10 | import tensorflow as tf
11 | #pip install tensorflow==1.15
12 | config = tf.ConfigProto()
13 |
14 |
15 | #Fine-tuning with Cloud TPUs
16 | #https://github.com/google-research/bert
17 | # to use a TPU with Colab for fast training and inference
18 | # If you want to use a TPU, first switch to the TPU runtime in Colab
19 | USE_TPU = False
20 |
21 |
22 | #https://github.com/google-research/bert#pre-trained-models
23 | # We will use base uncased bert model
24 |
25 | ## 12-layer, 768-hidden, 12-heads, 110M parameters
26 | BERT_MODEL = 'uncased_L-12_H-768_A-12'
27 | ## 24-layer, 1024-hidden, 16-heads, 340M parameters
28 | #BERT_MODEL = 'uncased_L-24_H-1024_A-16'
29 |
30 |
31 | ## BERT checkpoint bucket
32 | ## 12-layer, 768-hidden, 12-heads, 110M parameters
33 | BERT_PRETRAINED_DIR = 'uncased_L-12_H-768_A-12'
34 |
35 |
36 | print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
37 | ## uncased_L-12_H-768_A-12 (directory)
38 | # bert_model.ckpt.data-00000-of-00001
39 | # bert_model.ckpt.meta
40 |
41 | # output file
42 | #OUTPUT_DIR = '/home/asabir/Desktop/model_repo/outputs'
43 | OUTPUT_DIR ='outputs'
44 | #print(f'***** Model output directory: {OUTPUT_DIR} *****')
45 | print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
46 | #print(f'***** BERT pretrained directory: {BERT_PRETRAINED_DIR} *****')
47 | print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
48 |
49 |
50 | print('***** Model output directory: {} *****'.format(OUTPUT_DIR))
51 |
52 |
53 | #TASK_DATA_DIR = 'data/visual-caption'
54 | if not 'bert' in sys.path:
55 | sys.path += ['bert']
56 |
57 | TASK_DATA_DIR = 'data/'
58 | # ## Model Configs and Hyper Parameters
59 |
60 | import modeling
61 | import optimization
62 | import tokenization
63 | import run_classifier
64 |
65 | # Model Hyper Parameters
66 | #TRAIN_BATCH_SIZE = 32 # For GPU, reduce to 16
67 | TRAIN_BATCH_SIZE = 16 #
68 | EVAL_BATCH_SIZE = 8
69 | PREDICT_BATCH_SIZE = 8
70 | LEARNING_RATE = 2e-5
71 | #NUM_TRAIN_EPOCHS = 2.0
72 | NUM_TRAIN_EPOCHS = 1.0
73 | WARMUP_PROPORTION = 0.1
74 | MAX_SEQ_LENGTH = 30
75 |
76 | # Model configs
77 | SAVE_CHECKPOINTS_STEPS = 1000
78 | ITERATIONS_PER_LOOP = 1000
79 | NUM_TPU_CORES = 8
80 | VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
81 | CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
82 | INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
83 | DO_LOWER_CASE = BERT_MODEL.startswith('uncased')
84 |
85 |
86 | # ## Read visual caption Pairs
87 | # Read data from TSV file and convert to list of InputExample.
88 | #to [run_classifier](https://github.com/google-research/bert/blob/master/run_classifier.py) file
89 |
90 |
91 | class VCProcessor(run_classifier.DataProcessor):
92 | """Processor for the visual caption pair data set."""
93 |
94 | def get_train_examples(self, data_dir):
95 | """Reading train.tsv and converting to list of InputExample"""
96 | return self._create_examples(
97 | self._read_tsv(os.path.join(data_dir,"train.tsv")), 'train')
98 |
99 | def get_dev_examples(self, data_dir):
100 | """Reading dev.tsv and converting to list of InputExample"""
101 | return self._create_examples(
102 | self._read_tsv(os.path.join(data_dir,"dev.tsv")), 'dev')
103 |
104 | def get_test_examples(self, data_dir):
105 | """Reading train.tsv and converting to list of InputExample"""
106 | return self._create_examples(
107 | self._read_tsv(os.path.join(data_dir,"test.tsv")), 'test')
108 |
109 | def get_predict_examples(self, sentence_pairs):
110 | """Given visual caption pairs, conevrting to list of InputExample"""
111 | examples = []
112 | for (i, vcpair) in enumerate(sentence_pairs):
113 | guid = "predict-%d" % (i)
114 | # converting input text to utf-8 and creating InputExamples
115 | text_a = tokenization.convert_to_unicode(vcpair[0])
116 | text_b = tokenization.convert_to_unicode(vcpair[1])
117 | # We will add label as 0, because None is not supported in converting to features
118 | examples.append(
119 | run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=0))
120 | return examples
121 |
122 | def _create_examples(self, lines, set_type):
123 | """Creates examples for the training, dev and test sets."""
124 | examples = []
125 | for (i, line) in enumerate(lines):
126 | guid = "%s-%d" % (set_type, i)
127 | if set_type=='test':
128 | # removing header and invalid data
129 | if i == 0 or len(line)!=3:
130 | print(guid, line)
131 | continue
132 | text_a = tokenization.convert_to_unicode(line[1])
133 | text_b = tokenization.convert_to_unicode(line[2])
134 | label = 0 # We will use zero for test as convert_example_to_features doesn't support None
135 | else:
136 | # removing header and invalid data
137 | if i == 0 or len(line)!=6:
138 | continue
139 | text_a = tokenization.convert_to_unicode(line[3])
140 | text_b = tokenization.convert_to_unicode(line[4])
141 | label = int(line[5])
142 | examples.append(
143 | run_classifier.InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
144 | return examples
145 |
146 | def get_labels(self):
147 | "return class labels"
148 | return [0,1]
149 |
150 |
151 | # initialize an instance of the visual-caption VCProcessor and tokenizer
152 | processor = VCProcessor()
153 | label_list = processor.get_labels()
154 | tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)
155 |
156 |
157 | # Converting training examples to features
158 | print("---------------- Processing Training Data ------------------")
159 | TRAIN_TF_RECORD = os.path.join(OUTPUT_DIR, "train.tf_record")
160 | train_examples = processor.get_train_examples(TASK_DATA_DIR)
161 | num_train_examples = len(train_examples)
162 | num_train_steps = int( num_train_examples / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
163 | num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
164 | run_classifier.file_based_convert_examples_to_features(train_examples, label_list, MAX_SEQ_LENGTH, tokenizer, TRAIN_TF_RECORD)
165 |
166 |
167 | # ## Creating Classification Model
168 |
169 | def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
170 | labels, num_labels, use_one_hot_embeddings):
171 | """Creates a classification model."""
172 | # Bert Model instant
173 | model = modeling.BertModel(
174 | config=bert_config,
175 | is_training=is_training,
176 | input_ids=input_ids,
177 | input_mask=input_mask,
178 | token_type_ids=segment_ids,
179 | use_one_hot_embeddings=use_one_hot_embeddings)
180 |
181 | # Getting output for last layer of BERT
182 | output_layer = model.get_pooled_output()
183 |
184 | # Number of outputs for last layer
185 | hidden_size = output_layer.shape[-1].value
186 |
187 | # We will use one layer on top of BERT pretrained for creating classification model
188 | output_weights = tf.get_variable(
189 | "output_weights", [num_labels, hidden_size],
190 | initializer=tf.truncated_normal_initializer(stddev=0.02))
191 |
192 | output_bias = tf.get_variable(
193 | "output_bias", [num_labels], initializer=tf.zeros_initializer())
194 |
195 | with tf.variable_scope("loss"):
196 | if is_training:
197 | # 0.1 dropout
198 | output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
199 |
200 | # Calculate prediction probabilities and loss
201 | logits = tf.matmul(output_layer, output_weights, transpose_b=True)
202 | logits = tf.nn.bias_add(logits, output_bias)
203 | probabilities = tf.nn.softmax(logits, axis=-1)
204 | log_probs = tf.nn.log_softmax(logits, axis=-1)
205 |
206 | one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
207 |
208 | per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
209 | loss = tf.reduce_mean(per_example_loss)
210 |
211 | return (loss, per_example_loss, logits, probabilities)
212 |
213 |
214 | def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
215 | num_train_steps, num_warmup_steps, use_tpu,
216 | use_one_hot_embeddings):
217 | """Returns `model_fn` closure for TPUEstimator."""
218 |
219 | def model_fn(features, labels, mode, params):
220 | """The `model_fn` for TPUEstimator."""
221 |
222 | # reading features input
223 | input_ids = features["input_ids"]
224 | input_mask = features["input_mask"]
225 | segment_ids = features["segment_ids"]
226 | label_ids = features["label_ids"]
227 | is_real_example = None
228 | if "is_real_example" in features:
229 | is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
230 | else:
231 | is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
232 |
233 | # checking if training mode
234 | is_training = (mode == tf.estimator.ModeKeys.TRAIN)
235 |
236 | # create simple classification model
237 | (total_loss, per_example_loss, logits, probabilities) = create_model(
238 | bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
239 | num_labels, use_one_hot_embeddings)
240 |
241 | # getting variables for initialization and using pretrained init checkpoint
242 | tvars = tf.trainable_variables()
243 | initialized_variable_names = {}
244 | scaffold_fn = None
245 | if init_checkpoint:
246 | (assignment_map, initialized_variable_names
247 | ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
248 | if use_tpu:
249 |
250 | def tpu_scaffold():
251 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
252 | return tf.train.Scaffold()
253 |
254 | scaffold_fn = tpu_scaffold
255 | else:
256 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
257 |
258 | output_spec = None
259 | if mode == tf.estimator.ModeKeys.TRAIN:
260 | # defining optimizer function
261 | train_op = optimization.create_optimizer(
262 | total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
263 |
264 | # Training estimator spec
265 | output_spec = tf.contrib.tpu.TPUEstimatorSpec(
266 | mode=mode,
267 | loss=total_loss,
268 | train_op=train_op,
269 | scaffold_fn=scaffold_fn)
270 | elif mode == tf.estimator.ModeKeys.EVAL:
271 | # accuracy, loss, auc, F1, precision and recall metrics for evaluation
272 | def metric_fn(per_example_loss, label_ids, logits, is_real_example):
273 | predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
274 | loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
275 | accuracy = tf.metrics.accuracy(
276 | labels=label_ids, predictions=predictions, weights=is_real_example)
277 | f1_score = tf.contrib.metrics.f1_score(
278 | label_ids,
279 | predictions)
280 | auc = tf.metrics.auc(
281 | label_ids,
282 | predictions)
283 | recall = tf.metrics.recall(
284 | label_ids,
285 | predictions)
286 | precision = tf.metrics.precision(
287 | label_ids,
288 | predictions)
289 | return {
290 | "eval_accuracy": accuracy,
291 | "eval_loss": loss,
292 | "f1_score": f1_score,
293 | "auc": auc,
294 | "precision": precision,
295 | "recall": recall
296 | }
297 |
298 | eval_metrics = (metric_fn,
299 | [per_example_loss, label_ids, logits, is_real_example])
300 | # estimator spec for evaluation
301 | output_spec = tf.contrib.tpu.TPUEstimatorSpec(
302 | mode=mode,
303 | loss=total_loss,
304 | eval_metrics=eval_metrics,
305 | scaffold_fn=scaffold_fn)
306 | else:
307 | # estimator spec for predictions
308 | output_spec = tf.contrib.tpu.TPUEstimatorSpec(
309 | mode=mode,
310 | predictions={"probabilities": probabilities},
311 | scaffold_fn=scaffold_fn)
312 | return output_spec
313 |
314 | return model_fn
315 |
316 |
317 | # Define TPU configs
318 | if USE_TPU:
319 | tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
320 | else:
321 | tpu_cluster_resolver = None
322 | run_config = tf.contrib.tpu.RunConfig(
323 | cluster=tpu_cluster_resolver,
324 | model_dir=OUTPUT_DIR,
325 | save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
326 | tpu_config=tf.contrib.tpu.TPUConfig(
327 | iterations_per_loop=ITERATIONS_PER_LOOP,
328 | num_shards=NUM_TPU_CORES,
329 | per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
330 |
331 |
332 | # create model function for estimator using model function builder
333 | model_fn = model_fn_builder(
334 | bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
335 | num_labels=len(label_list),
336 | init_checkpoint=INIT_CHECKPOINT,
337 | learning_rate=LEARNING_RATE,
338 | num_train_steps=num_train_steps,
339 | num_warmup_steps=num_warmup_steps,
340 | use_tpu=USE_TPU,
341 | use_one_hot_embeddings=True)
342 |
343 |
344 |
345 | # Defining TPU Estimator
346 | estimator = tf.contrib.tpu.TPUEstimator(
347 | use_tpu=USE_TPU,
348 | model_fn=model_fn,
349 | config=run_config,
350 | train_batch_size=TRAIN_BATCH_SIZE,
351 | eval_batch_size=EVAL_BATCH_SIZE,
352 | predict_batch_size=PREDICT_BATCH_SIZE)
353 |
354 |
355 |
356 | # Train the model.
357 | #print('VCS on BERT base model normally takes about 1 hour on TPU and 15-20 hours on GPU. Please wait...')
358 | print('***** Started training at {} *****'.format(datetime.datetime.now()))
359 | print(' Num examples = {}'.format(num_train_examples))
360 | print(' Batch size = {}'.format(TRAIN_BATCH_SIZE))
361 | tf.logging.info(" Num steps = %d", num_train_steps)
362 | # we are using `file_based_input_fn_builder` for creating input function from TF_RECORD file
363 | train_input_fn = run_classifier.file_based_input_fn_builder(TRAIN_TF_RECORD,
364 | seq_length=MAX_SEQ_LENGTH,
365 | is_training=True,
366 | drop_remainder=True)
367 | estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
368 | print('***** Finished training at {} *****'.format(datetime.datetime.now()))
369 |
370 |
371 | ## Evaluate fine-tuned model
372 |
373 |
374 | # eval the model on train set.
375 | print('***** Started Train Set evaluation at {} *****'.format(datetime.datetime.now()))
376 | print(' Num examples = {}'.format(num_train_examples))
377 | print(' Batch size = {}'.format(EVAL_BATCH_SIZE))
378 | # eval input function for train set
379 | train_eval_input_fn = run_classifier.file_based_input_fn_builder(TRAIN_TF_RECORD,
380 | seq_length=MAX_SEQ_LENGTH,
381 | is_training=False,
382 | drop_remainder=True)
383 | # evaluate on train set
384 | result = estimator.evaluate(input_fn=train_eval_input_fn,
385 | steps=int(num_train_examples/EVAL_BATCH_SIZE))
386 | print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))
387 | print("***** Eval results *****")
388 | for key in sorted(result.keys()):
389 | print(' {} = {}'.format(key, str(result[key])))
390 |
391 |
392 |
393 | # Converting eval examples to features
394 | print("--------------- Processing Dev Data ------------------")
395 | EVAL_TF_RECORD = os.path.join(OUTPUT_DIR, "eval.tf_record")
396 | eval_examples = processor.get_dev_examples(TASK_DATA_DIR)
397 | num_eval_examples = len(eval_examples)
398 | run_classifier.file_based_convert_examples_to_features(eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer, EVAL_TF_RECORD)
399 |
400 |
401 | # Eval the model on Dev set.
402 | print('***** Started Dev Set evaluation at {} *****'.format(datetime.datetime.now()))
403 | print(' Num examples = {}'.format(num_eval_examples))
404 | print(' Batch size = {}'.format(EVAL_BATCH_SIZE))
405 |
406 | # eval input function for dev set
407 | eval_input_fn = run_classifier.file_based_input_fn_builder(EVAL_TF_RECORD,
408 | seq_length=MAX_SEQ_LENGTH,
409 | is_training=False,
410 | drop_remainder=True)
411 | # evaluate on dev set
412 | result = estimator.evaluate(input_fn=eval_input_fn, steps=int(num_eval_examples/EVAL_BATCH_SIZE))
413 | print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))
414 | print("***** Eval results *****")
415 | for key in sorted(result.keys()):
416 | print(' {} = {}'.format(key, str(result[key])))
417 |
418 |
419 | # examples sentences, feel free to change and try
420 | sent_pairs = [("apple", "a display of apple and orange at market"), ("apple","a fruit market with apples and orange"),
421 | ("apple","a fruit stand with apples and oranges")]
422 |
423 |
424 | print("----------- Predictions on Custom Data -------------------")
425 | # create `InputExample` for custom examples
426 | predict_examples = processor.get_predict_examples(sent_pairs)
427 | num_predict_examples = len(predict_examples)
428 |
429 | # For TPU, We will append `PaddingExample` for maintaining batch size
430 | if USE_TPU:
431 | while(len(predict_examples)%EVAL_BATCH_SIZE!=0):
432 | predict_examples.append(run_classifier.PaddingInputExample())
433 |
434 | # Converting to features
435 | predict_features = run_classifier.convert_examples_to_features(predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
436 |
437 | print(' Num examples = {}'.format(num_predict_examples))
438 | print(' Batch size = {}'.format(PREDICT_BATCH_SIZE))
439 |
440 | # Input function for prediction
441 | predict_input_fn = run_classifier.input_fn_builder(predict_features,
442 | seq_length=MAX_SEQ_LENGTH,
443 | is_training=False,
444 | drop_remainder=False)
445 | result = list(estimator.predict(input_fn=predict_input_fn))
446 | print(result)
447 | for ex_i in range(num_predict_examples):
448 | print("****** Example {} ******".format(ex_i))
449 | print("visual :", sent_pairs[ex_i][0])
450 | print("caption :", sent_pairs[ex_i][1])
451 | print("Prediction :", result[ex_i]['probabilities'][1])
452 |
453 |
454 |
455 | ################################################# Test ###################################################
456 |
457 | # Converting test examples to features
458 | print("--------------------- Processing Test Data -------------------")
459 | TEST_TF_RECORD = os.path.join(OUTPUT_DIR, "test.tf_record")
460 | test_examples = processor.get_test_examples(TASK_DATA_DIR)
461 | num_test_examples = len(test_examples)
462 | run_classifier.file_based_convert_examples_to_features(test_examples, label_list, MAX_SEQ_LENGTH, tokenizer, TEST_TF_RECORD)
463 |
464 |
465 | # Predictions on test set.
466 | print('***** Started Prediction at {} *****'.format(datetime.datetime.now()))
467 | print(' Num examples = {}'.format(num_test_examples))
468 | print(' Batch size = {}'.format(PREDICT_BATCH_SIZE))
469 | # predict input function for test set
470 | test_input_fn = run_classifier.file_based_input_fn_builder(TEST_TF_RECORD,
471 | seq_length=MAX_SEQ_LENGTH,
472 | is_training=False,
473 | drop_remainder=True)
474 | tf.logging.set_verbosity(tf.logging.ERROR)
475 | # predict on test set
476 | result = list(estimator.predict(input_fn=test_input_fn))
477 | print('***** Finished Prediction at {} *****'.format(datetime.datetime.now()))
478 |
479 | # saving test predictions
480 | output_test_file = os.path.join(OUTPUT_DIR, "test_score.txt")
481 | with tf.gfile.GFile(output_test_file, "w") as writer:
482 | for (example_i, predictions_i) in enumerate(result):
483 | writer.write("%s , %s\n" % (test_examples[example_i].guid, str(predictions_i['probabilities'][1])))
484 |
485 |
--------------------------------------------------------------------------------
/BERT/uncased_L-12_H-768_A-12/file-should be here.txt:
--------------------------------------------------------------------------------
1 | Download the pre-trained BERT checkpoint (uncased_L-12_H-768_A-12) from the BERT website.
2 |
3 |
--------------------------------------------------------------------------------
/COCO_train2014_000000000009.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/COCO_train2014_000000000009.jpg
--------------------------------------------------------------------------------
/COCO_val2014_000000000042.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/COCO_val2014_000000000042.jpg
--------------------------------------------------------------------------------
/Evaluation/captions_val2014.json.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/Evaluation/captions_val2014.json.zip
--------------------------------------------------------------------------------
/Evaluation/coco_eval.py:
--------------------------------------------------------------------------------
1 | from pycocotools.coco import COCO
2 | from pycocoevalcap.eval import COCOEvalCap
3 | import sys
4 | import argparse
5 |
6 |
7 |
8 | parser=argparse.ArgumentParser()
9 | parser.add_argument('--f', default='', help='', type=str,required=True)
10 | args = parser.parse_args()
11 |
12 |
13 | annotation_file = 'captions_val2014.json'
14 | results_file = args.f
15 |
16 | # create coco object and coco_result object
17 | coco = COCO(annotation_file)
18 | coco_result = coco.loadRes(results_file)
19 |
20 | # create coco_eval object by taking coco and coco_result
21 | coco_eval = COCOEvalCap(coco, coco_result)
22 |
23 | # evaluate on a subset of images by setting
24 | # coco_eval.params['image_id'] = coco_result.getImgIds()
25 | # please remove this line when evaluating the full validation set
26 | coco_eval.params['image_id'] = coco_result.getImgIds()
27 |
28 | # evaluate results
29 | # SPICE will take a few minutes the first time, but speeds up due to caching
30 | coco_eval.evaluate()
31 |
32 | # print output evaluation scores
33 | for metric, score in coco_eval.eval.items():
34 | print(f'{metric}: {score:.3f}')
35 |
--------------------------------------------------------------------------------
/LRCE_figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/LRCE_figure_1.png
--------------------------------------------------------------------------------
/Pre-trained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/Pre-trained.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Visual Semantic Relatedness Dataset for Image Captioning
2 |
3 |
12 |
13 |
14 |
15 | Modern image captioning relies heavily on extracting knowledge from images, such as objects, to capture the concept of a static story in the image.
16 | In this paper, we propose a textual visual context dataset for image captioning, where the publicly available dataset COCO Captions [(Lin et al., 2014)](https://arxiv.org/pdf/1405.0312.pdf) has been extended with information about the scene (such as objects in the image). Since this information has textual form, it can be used to leverage any NLP task, such as text similarity or semantic relation methods, into captioning systems, either as an end-to-end training strategy or a post-processing based approach.
17 |
18 |
19 |
20 | This repository contains the implementation of the paper [Visual Semantic Relatedness Dataset for Image Captioning](https://arxiv.org/abs/2301.08784).
21 |
22 | [](https://arxiv.org/abs/2301.08784) [](https://ahmed.jp/project_page/Dataset_2022/index.html)
23 | [](https://huggingface.co/datasets/AhmedSSabir/Textual-Image-Caption-Dataset)
24 | [](https://ahmed.jp/project_page/Dataset_2022/poster_20.pdf)
25 | [](https://ahmed.jp/project_page/Dataset_2022/spotlight_ppt_ID_20.pdf)
26 |
27 |
28 | ## News
29 | Added v2 with the recent SoTA SwinV2 classifier for both soft/hard-label visual_caption_cosine_score_v2 with the person label (0.2, 0.3 and 0.4). Please refer to the Hugging Face repository.
30 |
31 | ## Contents
32 | 0. [Overview](#overview)
33 | 1. [Visual semantic with BERT ](#Visual-semantic-with-BERT-CNN)
34 | 2. [Dataset](#dataset)
35 | 3. [Visual semantic with pre-trained model](#Visual-semantic-with-pre-trained-model)
36 | 4. [Evaluation](#evaluation)
37 | 5. [Citation](#Citation)
38 |
39 |
40 | ## Overview
41 |
42 |
43 | We enrich COCO-Captions with **Textual Visual Context** information. We use [ResNet152](https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf), [CLIP](https://github.com/openai/CLIP) and [Faster R-CNN](https://github.com/tensorflow/models/tree/master/research/object_detection) to extract
44 | object information for each COCO-caption image. We use three filtering approaches to ensure the quality of the dataset: (1) threshold: to filter out predictions where the object classifier is not confident enough; (2) semantic alignment: semantic similarity to remove duplicated objects; and (3) semantic relatedness score as soft label: to guarantee that the visual context and the caption are strongly related, we use [Sentence RoBERTa-sts](https://www.sbert.net) to give a soft label via cosine similarity and then apply a **th**reshold to annotate the final label (if th ≥ 0.2, 0.3, 0.4 then [1,0]). Finally, to take advantage of the overlap between the visual context and the caption, and to extract global information from each visual, we use BERT followed by a shallow CNN [(Kim, 2014)](https://arxiv.org/pdf/1408.5882.pdf) to estimate the visual relatedness score.
45 |
46 |
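As a rough illustration of the soft/hard labelling step described above, the sketch below (an assumption, not the exact dataset pipeline) scores one (visual context, caption) pair with a Sentence-BERT STS model and thresholds it:

```
from sentence_transformers import SentenceTransformer, util

# assumed checkpoint; the released dataset may have used a different STS model
model = SentenceTransformer('stsb-roberta-large')

def relatedness_labels(visual_context, caption, th=0.4):
    emb = model.encode([visual_context, caption], convert_to_tensor=True)
    soft = util.cos_sim(emb[0], emb[1]).item()  # soft label: cosine score
    hard = 1 if soft >= th else 0               # hard label at threshold th
    return soft, hard

print(relatedness_labels("cheeseburger plate hotdog",
                         "a plate with a hamburger fries and tomatoes"))
```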
47 |
48 | ## Quick Start
49 | For a quick start please have a look at this [project page](https://sabirdvd.github.io/project_page/Dataset_2022/index.html)
50 | and [Demo](https://github.com/ahmedssabir/Textual-Visual-Semantic-Dataset/blob/main/BERT_CNN_Visual_re_ranker_demo.ipynb)
51 |
52 |
55 | ## Dataset
56 |
57 | ### Sample
58 |
59 | | VC1 | VC2 | VC3 | human annotated caption |
60 | | ------------- | ------------- |------------- | ------------- |
61 | | cheeseburger | plate | hotdog | a plate with a hamburger fries and tomatoes |
62 | | bakery | dining table | website | a table having tea and a cake on it |
63 | | gown | groom | apron | its time to cut the cake at this couples wedding |
64 |
65 |
66 | ### Download
67 |
68 | 0. [Download raw data with ID and visual context](https://www.dropbox.com/s/xuov24on8477zg8/All_Caption_ID.csv?dl=0) -> original dataset with the related caption IDs from [train2014](https://cocodataset.org/#download)
69 | 1. [Download data with cosine score](https://www.dropbox.com/s/55sit8ow9tems4u/visual_caption_cosine_score.zip?dl=0) -> soft cosine label with **th** 0.2, 0.3, 0.4 and 0.5, plus hard label
70 | 2. [Download overlapping visual with caption](https://www.dropbox.com/s/br8nhnlf4k2czo8/COCO_overlaping_dataset.txt?dl=0) -> overlap between the visual context and the human annotated caption
71 | 3. [Download dataset (tsv file)](https://www.dropbox.com/s/dh38xibtjpohbeg/train_all.zip?dl=0) -> raw data with hard label (no cosine similarity) at **th**reshold cosine-similarity degree of relation between the visual and the caption = 0.2, 0.3, 0.4
72 | 4. [Download dataset GenderBias](https://www.dropbox.com/s/1wki0b0d21078mj/gender%20natural.zip?dl=0) -> man/woman replaced with the person class label
73 |
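The released tsv files follow the layout of `data/train.tsv` in this repository (ID columns, visual context, caption, `is_related` label); a minimal pandas sketch for inspecting them, assuming tab-separated columns with a header row:

```
import pandas as pd

# assumes a header row such as: id  id1  id2  visual  caption  is_related
df = pd.read_csv("train.tsv", sep="\t")
print(df[["visual", "caption", "is_related"]].head())
```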
74 |
75 | ## Visual semantic with BERT-CNN
76 | Fine-tune [BERT](https://github.com/google-research/bert) on the created dataset.
77 |
78 | ### Requirements
79 | - Tensorflow 1.15.0
80 | - Python 3.6
81 |
82 | ```
83 | conda create -n BERT_visual python=3.6 anaconda
84 | conda activate BERT_visual
85 | pip install tensorflow==1.15.0
86 | pip install --upgrade tensorflow_hub==0.7.0
87 | ```
88 |
89 | Download the BERT checkpoint [uncased_L-12_H-768_A-12](https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1)
90 | ```
91 | wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
92 | unzip uncased_L-12_H-768_A-12.zip
93 | git clone https://github.com/gaphex/bert_experimental/
94 | ```
95 | so that the directory layout is ```BERT-CNN/uncased_L-12_H-768_A-12``` and ```BERT-CNN/bert_experimental```
96 |
97 | Download dataset
98 |
99 | ```
100 | wget https://www.dropbox.com/s/dh38xibtjpohbeg/train_all.zip
101 | unzip train_all.zip
102 | ```
103 |
104 | For training, the main arguments are:
105 |
106 | ```
107 | parser.add_argument('--train', default='train.tsv', help='training data (tsv)', type=str,required=False)
108 | parser.add_argument('--num_bert_layer', default='12', help='number of tuned BERT layers', type=int,required=False)
109 | parser.add_argument('--batch_size', default='128', help='batch size', type=int,required=False)
110 | parser.add_argument('--epochs', default='5', help='', type=int,required=False)
111 | parser.add_argument('--seq_len', default='64', help='', type=int,required=False)
112 | parser.add_argument('--CNN_kernel_size', default='3', help='', type=int,required=False)
113 | parser.add_argument('--CNN_filters', default='32', help='', type=int,required=False)
114 | ```
115 |
116 | ```
117 | python BERT_CNN.py --train /train_0.4.tsv --epochs 5
118 | ```
119 |
120 | For inference only, download the pre-trained model:
121 |
122 | ```
123 | wget https://www.dropbox.com/s/ip7p0wiwkwvph5k/0.4_bert-cnn.zip
124 | unzip 0.4_bert-cnn.zip
125 | ```
126 |
127 | ```
128 | python eval.py --testset test_demo.tsv --model 0.4_bert-cnn/frozen_graph.pb
129 | ```
130 | ### Example
131 |
132 | Re-rank the candidate captions, selecting the one most related to the image, using the visual context information.
133 |
134 |
135 |
136 | ```
137 | visual information, candidate caption (beam search), relatedness score
138 | standard poodle shopping cart footwear, a close up of shoes and a dog in a basket, 0.99774158
139 | standard poodle shopping cart footwear, a brown teddy bear laying on top of a pair of shoes, 0.0621758029
140 | ```
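Given scores like those above, the re-ranking step itself is just an argmax over each image's candidate captions; a minimal sketch (scores copied from the example output):

```
# candidate captions for one image with their predicted relatedness scores
candidates = [
    ("a close up of shoes and a dog in a basket", 0.99774158),
    ("a brown teddy bear laying on top of a pair of shoes", 0.0621758029),
]
best_caption, best_score = max(candidates, key=lambda c: c[1])
print(best_caption, best_score)
```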
141 |
142 | ## Visual semantic with pre-trained model
143 |
144 |
145 |
148 |
149 |
150 |
151 |
152 |
153 | Although this approach is proposed to take advantage of the dataset (_e.g._ a visual semantic model), we also investigate the use of out-of-the-box tools to estimate the relatedness score between the short text (_i.e._ the caption) and its environmental visual context (which we call the visual classifier).
154 |
155 | For this, we follow a similarity-to-probability based approach, but we use only the cosine similarity from a pre-trained model and the top-3 averaged probability (confidence) from the object classifier:
158 |
159 |
162 |
163 | $\text{P}(w \mid c)=\mathrm{sim}(w,c)^{\text{P}(c)}$
164 | where the main components of the visual semantic re-ranker are:
165 |
168 | 1. Similarity/relatedness between the caption and the object context, $\mathrm{sim}(w,c)$
169 |
170 |
173 |
174 | 2. $\text{P}(c)$, the object classifier confidence in the image, $\text{P}(w \mid \text{object})$
175 |
176 |
177 | With the pre-trained [SBERT](https://www.sbert.net):
178 |
179 | ```
180 | python model.py --vis visual-context_label.txt --vis_prob visual-context_prob.txt --c caption.txt
181 | ```
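Conceptually, the score combines the two components above as $\text{P}(w \mid c)=\mathrm{sim}(w,c)^{\text{P}(c)}$; a minimal sketch with a pre-trained SBERT model (the checkpoint name and inputs are assumptions, not necessarily what `model.py` uses):

```
from sentence_transformers import SentenceTransformer, util

sbert = SentenceTransformer('all-MiniLM-L6-v2')  # assumed checkpoint

def visual_rerank_score(caption, visual_context, p_c):
    """p_c: top-3 averaged confidence of the object classifier."""
    emb = sbert.encode([caption, visual_context], convert_to_tensor=True)
    sim = util.cos_sim(emb[0], emb[1]).item()    # sim(w, c)
    return sim ** p_c                            # P(w|c) = sim(w,c)^P(c)

print(visual_rerank_score("a fruit stand with apples and oranges", "apple", 0.85))
```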
182 | Please refer to this [repository](https://github.com/ahmedssabir/Belief-Revision-Score) for more information about the pre-trained visual re-ranker and [probability from similarity](https://cdn.aaai.org/Symposia/Spring/2003/SS-03-05/SS03-05-005.pdf).
183 |
184 | ## Evaluation
185 |
186 | [Download pycocoevalcap](https://github.com/salaniz/pycocoevalcap)
187 |
188 | ```
189 | pip install pycocoevalcap
190 | ```
191 |
192 | Then run
193 | ```
194 | python Evaluation/coco_eval.py --f Result_tune_BERT_0.4.json
195 | ```
196 | For more evaluation metrics, see [Lexical and Semantic Diversity](https://github.com/ahmedssabir/Belief-Revision-Score/tree/main/SBERT-caption-eval).
197 |
213 |
214 |
215 | ## Citation
216 |
217 | The details of this repository are described in the following paper. If you find it useful, please cite:
218 |
219 | ```bibtex
220 | @article{sabir2023visual,
221 | title={Visual Semantic Relatedness Dataset for Image Captioning},
222 | author={Sabir, Ahmed and Moreno-Noguer, Francesc and Padr{\'o}, Llu{\'\i}s},
223 | journal={arXiv preprint arXiv:2301.08784},
224 | year={2023}
225 | }
226 | ```
227 |
228 |
--------------------------------------------------------------------------------
/approch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/approch.png
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | Place the train.tsv file here.
2 |
--------------------------------------------------------------------------------
/data/test.tsv:
--------------------------------------------------------------------------------
1 | id visual caption
2 | 0 standard poodle shopping cart footwear a close up of a dog laying in a basket
3 | 1 street sign traffic light tower a black and white photo of a street light
4 | 2 toilet seat a white toilet with its seat up in a bathroom
5 | 3 mobile home studio couch house a living room filled with furniture and a coffee table
6 | 4 french loaf conch person a basket filled with sandwiches on top of a table
7 | 5 indian elephant a group of people riding on the back of an elephant
8 | 6 bow tie windsor glasses a man wearing glasses and a tie in a room
9 | 7 sombrero bonnet woman a woman standing in front of a giant cake
10 | 8 diaper bassinet human a baby sitting in front of a giant cake
11 | 9 bobsled go-kart human a group of children sitting around a piece of luggage
12 | 10 vase spotlight plant a bunch of flowers that are in a vase
13 |
--------------------------------------------------------------------------------
/data/train.tsv:
--------------------------------------------------------------------------------
1 | id id1 id2 visual caption is_related
2 | 220740 220741 220742 marimba dalmatian picket fence a horse jumping competition is going on with people in the stands 1
3 | 385729 385730 385731 dishwasher microwave barber chair a person riding a horse on a dirt ground 0
4 | 59422 59423 59424 laptop carton comicbook a laptop that has stickers on its cover is sitting on a table 1
5 | 46638 46639 46640 suit Windsortie woodenspoon a young bow wearing a pink shirt and a purple tie 1
6 | 11870 11871 11872 studiocouch four-poster quilt a couple of girls sitting in a bed in a bedroom 1
7 | 471676 471677 471678 streetcar fire engine passenger car a multi layer plate with cakes and food on it 0
8 | 186795 186796 186797 shoe shop television monitor a man playing a wii on a large projector screen 1
9 | 121836 121837 121838 ox water buffalo alp cattle standing on a hill in fog 1
10 | 396224 396225 396226 altar desk perfume oranges sitting in a blue bowl on a wooden table 0
11 | 430635 430636 430637 speedboat paddle lifeboat pots and other items sit on a stove and counter 0
12 | 145057 145058 145059 shopping cart ashcan park bench a coin meter that is laying down on grates 1
13 | 409778 409779 409780 web site fire engine comic book a painting of a man from the back 0
14 | 155568 155569 155570 grocery store patio restaurant a man and woman walking up the stairs in a backyard 1
15 | 213951 213952 213953 microwave washer dining table the kitchen is equipped with all the latest appliances 1
16 | 489266 489267 489268 traffic light aircraft carrier chain saw a laptop computer on a desk with cables a mug and bowl 0
17 | 257649 257650 257651 grocery store confectionery shopping basket a couple of wooden tale stopped with fresh fruit 1
18 | 113826 113827 113828 lab coat vestment West Highland white terrier a group of people standing in rows with frisbees for a photo 1
19 | 486413 486414 486415 snorkel ski tennis ball two frames of a woman in the air on a tennis court 0
20 | 400432 400433 400434 crutch lawn mower chain saw eight underneath on ambarella in the forest parrot 0
21 | 341153 341154 341155 washer microwave dishwasher a small propeller plane sitting underneath a covering at an airport 0
22 | 462067 462068 462069 ballplayer baseball scoreboard a plate full of bright green lettuce next to some bread 0
23 | 443392 443393 443394 grocery store pineapple pizza a man in black and white stripes with makeup smiling 0
24 | 486660 486661 486662 wombat wallaby titi a persons shadow on the ground of them skateboarding 0
25 | 336616 336617 336618 moped motor scooter crash helmet multiple street signs are attached to the post 0
26 | 124199 124200 124201 sorrel hog barrel a brown horse eating from a hallowed out metal barrel 1
27 | 238004 238005 238006 tray washbasin cradle a cat laying on a couch near a remote control 1
28 | 319195 319196 319197 airliner wing web site a propeller airplane parked inside and airplane hanger 1
29 | 412036 412037 412038 grey whale breakwater killer whale a stop sign is standing at a street intersection 0
30 | 491896 491897 491898 teddy wool toyshop a woman in an old-fashioned kitchen with pots and pans 0
31 | 487501 487502 487503 snowmobile steam locomotive tow truck the living room is clean and empty from people 0
32 | 277093 277094 277095 microwave dishwasher chest a chair holding a laptop that is facing towards an oven 1
33 | 135542 135543 135544 water buffalo warthog hog sheep grazing under a tree in a grassy meadow 1
34 | 8448 8449 8450 mountainbike unicycle bicycle-built-for-two a picture of a person throwing a frisbee 1
35 | 170686 170687 170688 police van minibus ambulance a person in the army greeting someone in a suit 1
36 | 372016 372017 372018 Great Dane Irish wolfhound English setter a man standing in a room holding a remote 0
37 | 351158 351159 351160 sunglass bullet train sunglasses a woman opening the trunk of her car 0
38 | 414542 414543 414544 killer whale great white shark paddle a dog running across a field with a frisbee in his mouth 0
39 | 264998 264999 265000 bannister ski unicycle a man riding a skateboard along a metal hand rail 1
40 | 362868 362869 362870 zebra bustard gazelle a basket full of bananas with a net on top 0
41 | 88455 88456 88457 patio flagpole pole a fire hydrant and fire hose in a houses front yard 1
42 | 372512 372513 372514 seashore catamaran swimming trunks a man riding a surfboard on a wave in the ocean 0
43 | 387327 387328 387329 cellular telephone lab coat cash machine a baseball game ensues as people watch 0
44 | 248027 248028 248029 web site barbershop cinema a motor bike on the side of the street 1
45 | 347507 347508 347509 banana pineapple orange a bear itching itself on a bare tree 0
46 | 33714 33715 33716 picketfence streetcar mountainbike the red bike and the pink bike just started dating 1
47 | 173989 173990 173991 umbrella poncho jinrikisha a group of people walking down a street carrying umbrellas 1
48 | 20835 20836 20837 ballplayer baseball footballhelmet a man throwing a baseball from a mound on a field 1
49 | 16356 16357 16358 lumbermill barbershop turnstile a man working on a baseball bat while two others watch 1
50 | 193491 193492 193493 unicycle pole horizontal bar boy riding on his skateboard down a stair rail 1
51 | 384165 384166 384167 mixing bowl corn meat loaf a couple of sailors standing next to a woman 0
52 | 321736 321737 321738 ballplayer baseball football helmet a boys baseball game with a batter catcher and umpire 1
53 | 108395 108396 108397 crash helmet moped backpack a man with a suit and tie on a motor bike 1
54 | 215942 215943 215944 unicycle military uniform bearskin four guys are sitting on a bench in front of a building 1
55 | 134156 134157 134158 wine bottle eggnog red wine there is a bottle of wine next to a glass 1
56 | 297783 297784 297785 necklace thimble corkscrew this is an image of a meal and an avocado is included 1
57 | 110516 110517 110518 minivan cab police van a dog looking ahead with a stoic look in a car seat 1
58 | 3166 3167 3168 grocerystore headcabbage cauliflower a pile of vegetables on display at a grocery store 1
59 | 440075 440076 440077 ski curly-coatedretriever Gordonsetter elephants and their young in their natural habitat 0
60 | 71021 71022 71023 ballplayer baseball puck a baseball player and a flying black bat 1
61 |
--------------------------------------------------------------------------------
/dataset_v1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/dataset_v1-1.png
--------------------------------------------------------------------------------
/hist.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/hist.jpg
--------------------------------------------------------------------------------
/main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/main.png
--------------------------------------------------------------------------------
/overlap_text.py:
--------------------------------------------------------------------------------
1 | """Compute the word-level overlap between visual context labels and their captions."""
2 |
3 | file1 = []
4 | file2 = []
5 |
6 | # visual context labels, one per line
7 | with open('train_visual.txt') as f:
8 |     for line in f:
9 |         file1.append(line.rstrip())
10 |
11 | # captions, one per line (aligned with the visual file)
12 | with open('caption_anot.txt') as f1:
13 |     for line1 in f1:
14 |         file2.append(line1.rstrip())
15 |
16 | # write the word-level intersection of each (visual, caption) pair
17 | f = open('intersection_caption_visual.txt', "w")
18 | for i in range(len(file1)):
19 |     words1 = file1[i].lower().split()
20 |     words2 = file2[i].lower().split()
21 |
22 |     w = set(words1) & set(words2)
23 |
24 |     result = file1[i] + ',' + file2[i] + ',' + str(w)
25 |
26 |     f.write(result)
27 |     f.write('\n')
28 |     print(result)
29 |
30 | f.close()
31 |
--------------------------------------------------------------------------------
/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/overview.png
--------------------------------------------------------------------------------
/pre-trained/README.md:
--------------------------------------------------------------------------------
1 | Please refer to this [repository](https://github.com/ahmedssabir/Belief-Revision-Score) for more information about pre-trained visual re-ranker [probability from similarity](https://cdn.aaai.org/Symposia/Spring/2003/SS-03-05/SS03-05-005.pdf)
2 |
--------------------------------------------------------------------------------
/pre-trained/Visual_re-rank_re-ranked_output.txt:
--------------------------------------------------------------------------------
1 | a man riding on the back of a motorcycle 0.8895639974564639
2 | a person riding a motorcycle on a city street 0.8699054868636436
3 | a person riding a motorcycle down a city street 0.8665321958170883
4 | a man riding on the back of a motorcycle down a street 0.8645537987336105
5 | a man riding a motorcycle down a street 0.8582269252364088
6 | a man riding on the back of a motorcycle down a sidewalk 0.8581149928539996
7 | a man riding a motorcycle down the street 0.8569102761752505
8 | a man riding a motorcycle on a city street 0.85454545827468
9 | a man riding a motorcycle down a sidewalk 0.8493932857280806
10 |
--------------------------------------------------------------------------------
/pre-trained/Visual_re-ranker.txt:
--------------------------------------------------------------------------------
1 | a man riding a motorcycle down a street,0.8582269252364088
2 | a person riding a motorcycle on a city street,0.8699054868636436
3 | a man riding on the back of a motorcycle,0.8895639974564639
4 | a man riding a motorcycle on a city street,0.85454545827468
5 | a man riding on the back of a motorcycle down a street,0.8645537987336105
6 | a person riding a motorcycle down a city street,0.8665321958170883
7 | a man riding on the back of a motorcycle down a sidewalk,0.8581149928539996
8 | a man riding a motorcycle down the street,0.8569102761752505
9 | a man riding a motorcycle down a sidewalk,0.8493932857280806
10 |
--------------------------------------------------------------------------------
/pre-trained/caption.txt:
--------------------------------------------------------------------------------
1 | a man riding a motorcycle down a street
2 | a person riding a motorcycle on a city street
3 | a man riding on the back of a motorcycle
4 | a man riding a motorcycle on a city street
5 | a man riding on the back of a motorcycle down a street
6 | a person riding a motorcycle down a city street
7 | a man riding on the back of a motorcycle down a sidewalk
8 | a man riding a motorcycle down the street
9 | a man riding a motorcycle down a sidewalk
10 |
11 |
--------------------------------------------------------------------------------
/pre-trained/model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import sys
3 | import argparse
4 | import torch
5 | import re
6 | import os
7 | import numpy as np
8 |
9 |
10 | parser=argparse.ArgumentParser(description='call all scores and compute the visual context based re-ranker')
11 | parser.add_argument('--sim', default='sim-score.txt', help='similarity score from fine_tune_BERT', type=str,required=False)
12 | parser.add_argument('--vis', default='visual-context_label.txt', help='class label from the classifier (ResNet152)', type=str, required=True)
13 | parser.add_argument('--vis_prob', default='visual-context_prob.txt', help='probability from the classifier (ResNet152)', type=str, required=True)
14 | parser.add_argument('--c', default='caption.txt', help='caption from the baseline (any)', type=str, required=True)
15 | args = parser.parse_args()
16 |
17 | # S-BERT: install with
18 | # pip install -U sentence-transformers
19 | from sentence_transformers import SentenceTransformer, util
20 | from sklearn.metrics.pairwise import cosine_similarity
21 |
22 | model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
23 | #model = SentenceTransformer('nq-distilbert-base-v1')
24 |
25 |
26 | def cos_sim(a, b):
27 | return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))
28 |
29 |
30 | def get_lines(file_path):
31 | with open(file_path) as f:
32 | return f.read().strip().split('\n')
33 |
34 |
35 | # visual-confidence-based visual re-ranker
36 | class Visual_re_ranker:
37 | def __init__(self, visual_context_prob, sim):
38 | self.visual_context_prob = visual_context_prob
39 | self.sim = sim
40 |     def p_minus(self):
41 |         score = pow(float(self.sim), float(self.visual_context_prob))
42 |
43 | return score
44 |
45 | @staticmethod
46 | def remove_duplicate_caption_re_rank(input_path, output_path):
47 | with open(input_path, 'r') as input_file, open(output_path, 'w') as output_file:
48 | seen_lines = set()
49 |
50 | def add_line(line):
51 | seen_lines.add(line)
52 | return line
53 |
54 | output_file.writelines((add_line(line) for line in input_file
55 | if line not in seen_lines))
56 | re_ranked_scores = []
57 | with open(output_path) as f:
58 | for line in f:
59 | caption, score = line.split(',')
60 | score = float(score)
61 | re_ranked_scores.append((caption, score))
62 | re_ranked_scores.sort(key=lambda s: float(s[1]), reverse=True)
63 | with open(output_path, 'w') as f:
64 | for caption, score in re_ranked_scores:
65 | f.write("%s %s\n" % (caption, score))
66 |
67 |
68 |
69 | # all beam with visual context
70 | input_path= 'Visual_re-ranker.txt'
71 | # re-ranked beam with visual context
72 | output_path = 'Visual_re-rank_re-ranked_output.txt'
73 |
74 | # compute visual context
75 | f=open(input_path, "w")
76 | for i in range(len(get_lines(args.vis))):
77 | temp =[]
78 | visual_context_label = get_lines(args.vis)[i]
79 | visual_context_prob = get_lines(args.vis_prob)[i]
80 | caption = get_lines(args.c)[i]
81 |
82 |
83 | caption_emb = model.encode(caption, convert_to_tensor=True)
84 | visual_context_label_emb = model.encode(visual_context_label, convert_to_tensor=True)
85 |
86 | #def cos_sim(a, b):
87 | # return np.inner(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))
88 |
89 |
90 |
91 | sim = cosine_scores = util.pytorch_cos_sim(caption_emb, visual_context_label_emb)
92 | sim = sim.cpu().numpy()
93 | sim = sim.item()
94 |
95 |
96 | score = Visual_re_ranker(visual_context_prob, sim)
97 | score = score.p_minus()
98 | #score = score.real
99 | temp.append(score)
100 |
101 | #result = ','.join((caption, LM, str(score)))
102 | result = ','.join((caption, str(score)))
103 | result = re.sub(r'\s*,\s*', ',', result)
104 |
105 |
106 | #print(result)
107 |
108 | f.write(result)
109 | f.write('\n')
110 |
111 |
112 | f.close()
113 |
114 | if __name__ == "__main__":
115 |
116 | # re-rank and print top visual beam captions
117 | Visual_re_ranker.remove_duplicate_caption_re_rank(input_path, output_path)
118 |
--------------------------------------------------------------------------------
/pre-trained/sample_best.json:
--------------------------------------------------------------------------------
1 | [{"image_id":24343,"caption":"a man riding on the back of a motorcycle"}]
2 |
--------------------------------------------------------------------------------
/pre-trained/sample_best_baseline.json:
--------------------------------------------------------------------------------
1 | [{"image_id":24343,"caption":"a man riding a motorcycle down a street"}]
2 |
--------------------------------------------------------------------------------
/pre-trained/visual-context_label.txt:
--------------------------------------------------------------------------------
1 | motor scooter crash helmet motorcycle
2 | motor scooter crash helmet motorcycle
3 | motor scooter crash helmet motorcycle
4 | motor scooter crash helmet motorcycle
5 | motor scooter crash helmet motorcycle
6 | motor scooter crash helmet motorcycle
7 | motor scooter crash helmet motorcycle
8 | motor scooter crash helmet motorcycle
9 | motor scooter crash helmet motorcycle
10 |
--------------------------------------------------------------------------------
/pre-trained/visual-context_prob.txt:
--------------------------------------------------------------------------------
1 | 0.203588580197762
2 | 0.203588580197762
3 | 0.203588580197762
4 | 0.203588580197762
5 | 0.203588580197762
6 | 0.203588580197762
7 | 0.203588580197762
8 | 0.203588580197762
9 | 0.203588580197762
10 |
--------------------------------------------------------------------------------
/visual_context/README.md:
--------------------------------------------------------------------------------
1 | ## Extract visual information
2 | ```
3 | conda create -n Resnet python=3.7 anaconda
4 | conda activate Resnet
5 | pip install tensorflow==1.15.0
6 | pip install keras==2.1.5
7 | ```
8 |
9 | For [ResNet](https://arxiv.org/abs/1512.03385)
10 |
11 | ```
12 | python run-visual.py
13 | ```
14 |
15 | ```
16 | COCO_val2014_000000185210.jpg 'traffic_light', 0.7458004
17 | COCO_val2014_000000235692.jpg 'ox', 0.49095494
18 | ```
19 |
20 | For [CLIP](https://github.com/openai/CLIP) with zero-shot prediction
21 |
22 | ```
23 | # torch 1.7.1
24 | conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=10.1
25 | pip install ftfy regex tqdm
26 | pip install git+https://github.com/openai/CLIP.git
27 | ```
28 |
29 | run
30 |
31 | ```
32 | python run-visual_CLIP.py
33 | ```
34 |
35 | ```
36 | COCO_val2014_000000185210.jpg 'barrow', 0.0954
37 | COCO_val2014_000000235692.jpg 'ox', 0.5092
38 | ```
39 | For more visual classifiers (e.g., ViT, SwinV2, etc.), please refer to this [page](https://github.com/ahmedssabir/Belief-Revision-Score/tree/main/model/Resent-152)
40 |
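
The re-ranker in `pre-trained/` expects one line of visual context labels and one averaged confidence per caption (see `visual-context_label.txt` and `visual-context_prob.txt`). Below is a minimal sketch of how the top-3 ResNet152 output could be written in that format, assuming the same setup as `run-visual.py` (the top-3 averaging and the output filenames are illustrative, not part of the original script):

```
import os
import numpy as np
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
from model import ResNet152

model = ResNet152()

# one label line and one averaged confidence per image (illustrative filenames)
with open('visual-context_label.txt', 'w') as f_label, \
     open('visual-context_prob.txt', 'w') as f_prob:
    for img_file in sorted(os.listdir('imgs')):
        img = image.load_img(os.path.join('imgs', img_file), target_size=(224, 224))
        x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))
        preds = decode_predictions(model.predict(x), top=3)[0]  # [(id, label, prob), ...]

        labels = ' '.join(label.replace('_', ' ') for _, label, _ in preds)
        avg_prob = float(np.mean([prob for _, _, prob in preds]))

        f_label.write(labels + '\n')
        f_prob.write(str(avg_prob) + '\n')
```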
--------------------------------------------------------------------------------
/visual_context/imgs/COCO_val2014_000000185210.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/visual_context/imgs/COCO_val2014_000000185210.jpg
--------------------------------------------------------------------------------
/visual_context/imgs/COCO_val2014_000000235692.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedssabir/Textual-Visual-Semantic-Dataset/70ab427b0822dd176dfbf05546b6bd474af66b15/visual_context/imgs/COCO_val2014_000000235692.jpg
--------------------------------------------------------------------------------
/visual_context/model.py:
--------------------------------------------------------------------------------
1 | import os
2 | import keras.backend as K
3 |
4 | from keras import initializers
5 | from keras.layers import Input
6 | from keras.layers import Dense
7 | from keras.layers import Conv2D
8 | from keras.layers import MaxPooling2D
9 | from keras.layers import AveragePooling2D
10 | from keras.layers import ZeroPadding2D
11 | from keras.layers import Flatten
12 | from keras.layers import Activation
13 | from keras.layers import add
14 | from keras.layers import BatchNormalization
15 | from keras.layers import GlobalAveragePooling2D
16 | from keras.layers import GlobalMaxPooling2D
17 |
18 | from keras.models import Model
19 | from keras.engine import Layer, InputSpec
20 | from keras.engine import get_source_inputs
21 |
22 |
23 | from keras.utils.data_utils import get_file
24 | #from keras.applications.imagenet_utils import _obtain_input_shape
25 | #from keras.applications.imagenet_utils import _obtain_input_shape
26 |
27 | from keras_applications.imagenet_utils import _obtain_input_shape
28 |
29 | WEIGHTS_PATH = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels.h5'
30 | WEIGHTS_PATH_NO_TOP = 'https://github.com/qubvel/ResNet152/releases/download/v0.0.1/resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5'
31 |
32 |
33 | class Scale(Layer):
34 | """ Custom Layer for ResNet used for BatchNormalization.
35 |
36 | Learns a set of weights and biases used for scaling the input data.
37 |     The output is simply an element-wise multiplication of the input by a
38 |     learned scale, plus a learned shift:
39 |         out = in * gamma + beta,
40 |     where 'gamma' and 'beta' are the learned weights and biases.
41 | # Arguments
42 | axis: integer, axis along which to normalize in mode 0. For instance,
43 | if your input tensor has shape (samples, channels, rows, cols),
44 | set axis to 1 to normalize per feature map (channels axis).
45 | momentum: momentum in the computation of the
46 | exponential average of the mean and standard deviation
47 | of the data, for feature-wise normalization.
48 | weights: Initialization weights.
49 | List of 2 Numpy arrays, with shapes:
50 | `[(input_shape,), (input_shape,)]`
51 | beta_init: name of initialization function for shift parameter
52 | (see [initializers](../initializers.md)), or alternatively,
53 | Theano/TensorFlow function to use for weights initialization.
54 | This parameter is only relevant if you don't pass a `weights` argument.
55 | gamma_init: name of initialization function for scale parameter (see
56 | [initializers](../initializers.md)), or alternatively,
57 | Theano/TensorFlow function to use for weights initialization.
58 | This parameter is only relevant if you don't pass a `weights` argument.
59 | """
60 |
61 | def __init__(self, weights=None, axis=-1, momentum=0.9, beta_init='zero', gamma_init='one', **kwargs):
62 | self.momentum = momentum
63 | self.axis = axis
64 | self.beta_init = initializers.get(beta_init)
65 | self.gamma_init = initializers.get(gamma_init)
66 | self.initial_weights = weights
67 | super(Scale, self).__init__(**kwargs)
68 |
69 | def build(self, input_shape):
70 | self.input_spec = [InputSpec(shape=input_shape)]
71 | shape = (int(input_shape[self.axis]),)
72 |
73 | self.gamma = K.variable(self.gamma_init(shape), name='%s_gamma' % self.name)
74 | self.beta = K.variable(self.beta_init(shape), name='%s_beta' % self.name)
75 | self.trainable_weights = [self.gamma, self.beta]
76 |
77 | if self.initial_weights is not None:
78 | self.set_weights(self.initial_weights)
79 | del self.initial_weights
80 |
81 | def call(self, x, mask=None):
82 | input_shape = self.input_spec[0].shape
83 | broadcast_shape = [1] * len(input_shape)
84 | broadcast_shape[self.axis] = input_shape[self.axis]
85 |
86 | out = K.reshape(self.gamma, broadcast_shape) * x + K.reshape(self.beta, broadcast_shape)
87 | return out
88 |
89 | def get_config(self):
90 | config = {"momentum": self.momentum, "axis": self.axis}
91 | base_config = super(Scale, self).get_config()
92 | return dict(list(base_config.items()) + list(config.items()))
93 |
94 |
95 | def identity_block(input_tensor, kernel_size, filters, stage, block):
96 | """
97 | The identity_block is the block that has no conv layer at shortcut
98 | # Arguments
99 | input_tensor: input tensor
100 |         kernel_size: default 3, the kernel size of middle conv layer at main path
101 | filters: list of integers, the nb_filters of 3 conv layer at main path
102 | stage: integer, current stage label, used for generating layer names
103 | block: 'a','b'..., current block label, used for generating layer names
104 | """
105 | eps = 1.1e-5
106 | nb_filter1, nb_filter2, nb_filter3 = filters
107 | conv_name_base = 'res' + str(stage) + block + '_branch'
108 | bn_name_base = 'bn' + str(stage) + block + '_branch'
109 | scale_name_base = 'scale' + str(stage) + block + '_branch'
110 |
111 | if K.image_data_format() == 'channels_last':
112 | bn_axis = 3
113 | else:
114 | bn_axis = 1
115 |
116 | x = Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a', use_bias=False)(input_tensor)
117 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
118 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
119 | x = Activation('relu', name=conv_name_base + '2a_relu')(x)
120 |
121 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
122 | x = Conv2D(nb_filter2, (kernel_size, kernel_size), name=conv_name_base + '2b', use_bias=False)(x)
123 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
124 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
125 | x = Activation('relu', name=conv_name_base + '2b_relu')(x)
126 |
127 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x)
128 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
129 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
130 |
131 | x = add([x, input_tensor], name='res' + str(stage) + block)
132 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
133 | return x
134 |
135 |
136 | def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
137 | """ conv_block is the block that has a conv layer at shortcut
138 | # Arguments
139 | input_tensor: input tensor
140 |         kernel_size: default 3, the kernel size of middle conv layer at main path
141 | filters: list of integers, the nb_filters of 3 conv layer at main path
142 | stage: integer, current stage label, used for generating layer names
143 | block: 'a','b'..., current block label, used for generating layer names
144 | Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
145 | And the shortcut should have subsample=(2,2) as well
146 | """
147 |
148 | eps = 1.1e-5
149 | nb_filter1, nb_filter2, nb_filter3 = filters
150 | conv_name_base = 'res' + str(stage) + block + '_branch'
151 | bn_name_base = 'bn' + str(stage) + block + '_branch'
152 | scale_name_base = 'scale' + str(stage) + block + '_branch'
153 |
154 | if K.image_data_format() == 'channels_last':
155 | bn_axis = 3
156 | else:
157 | bn_axis = 1
158 |
159 | x = Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + '2a', use_bias=False)(input_tensor)
160 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2a')(x)
161 | x = Scale(axis=bn_axis, name=scale_name_base + '2a')(x)
162 | x = Activation('relu', name=conv_name_base + '2a_relu')(x)
163 |
164 | x = ZeroPadding2D((1, 1), name=conv_name_base + '2b_zeropadding')(x)
165 | x = Conv2D(nb_filter2, (kernel_size, kernel_size),
166 | name=conv_name_base + '2b', use_bias=False)(x)
167 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2b')(x)
168 | x = Scale(axis=bn_axis, name=scale_name_base + '2b')(x)
169 | x = Activation('relu', name=conv_name_base + '2b_relu')(x)
170 |
171 | x = Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=False)(x)
172 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '2c')(x)
173 | x = Scale(axis=bn_axis, name=scale_name_base + '2c')(x)
174 |
175 | shortcut = Conv2D(nb_filter3, (1, 1), strides=strides,
176 | name=conv_name_base + '1', use_bias=False)(input_tensor)
177 | shortcut = BatchNormalization(epsilon=eps, axis=bn_axis, name=bn_name_base + '1')(shortcut)
178 | shortcut = Scale(axis=bn_axis, name=scale_name_base + '1')(shortcut)
179 |
180 | x = add([x, shortcut], name='res' + str(stage) + block)
181 | x = Activation('relu', name='res' + str(stage) + block + '_relu')(x)
182 | return x
183 |
184 |
185 | def ResNet152(include_top=True, weights='imagenet',
186 | input_tensor=None, input_shape=None, pooling=None, classes=1000):
187 | """ Instantiates the ResNet152 architecture.
188 | Optionally loads weights pre-trained
189 | on ImageNet. Note that when using TensorFlow,
190 | for best performance you should set
191 | `image_data_format='channels_last'` in your Keras config
192 | at ~/.keras/keras.json.
193 | The model and the weights are compatible only with
194 | TensorFlow. The data format
195 | convention used by the model is the one
196 | specified in your Keras config file.
197 | # Arguments
198 | include_top: whether to include the fully-connected
199 | layer at the top of the network.
200 | weights: one of `None` (random initialization),
201 | 'imagenet' (pre-training on ImageNet),
202 | or the path to the weights file to be loaded.
203 | input_tensor: optional Keras tensor (i.e. output of `layers.Input()`)
204 | to use as image input for the model.
205 | input_shape: optional shape tuple, only to be specified
206 | if `include_top` is False (otherwise the input shape
207 | has to be `(224, 224, 3)` (with `channels_last` data format)
208 | or `(3, 224, 224)` (with `channels_first` data format).
209 | It should have exactly 3 inputs channels,
210 | and width and height should be no smaller than 197.
211 | E.g. `(200, 200, 3)` would be one valid value.
212 | pooling: Optional pooling mode for feature extraction
213 | when `include_top` is `False`.
214 | - `None` means that the output of the model will be
215 | the 4D tensor output of the
216 | last convolutional layer.
217 | - `avg` means that global average pooling
218 | will be applied to the output of the
219 | last convolutional layer, and thus
220 | the output of the model will be a 2D tensor.
221 | - `max` means that global max pooling will
222 | be applied.
223 | classes: optional number of classes to classify images
224 | into, only to be specified if `include_top` is True, and
225 | if no `weights` argument is specified.
226 | # Returns
227 | A Keras model instance.
228 | # Raises
229 | ValueError: in case of invalid argument for `weights`,
230 | or invalid input shape.
231 | """
232 |
233 | eps = 1.1e-5
234 |
235 | if not (weights in {'imagenet', None} or os.path.exists(weights)):
236 | raise ValueError('The `weights` argument should be either '
237 | '`None` (random initialization), `imagenet` '
238 | '(pre-training on ImageNet), '
239 | 'or the path to the weights file to be loaded.')
240 |
241 | if weights == 'imagenet' and include_top and classes != 1000:
242 | raise ValueError('If using `weights` as imagenet with `include_top`'
243 | ' as true, `classes` should be 1000')
244 |
245 | # Determine proper input shape
246 | input_shape = _obtain_input_shape(input_shape,
247 | default_size=224,
248 | min_size=197,
249 | data_format=K.image_data_format(),
250 | require_flatten=include_top,
251 | weights=weights)
252 |
253 | if input_tensor is None:
254 | img_input = Input(shape=input_shape)
255 | else:
256 | if not K.is_keras_tensor(input_tensor):
257 | img_input = Input(tensor=input_tensor, shape=input_shape, name='data')
258 | else:
259 | img_input = input_tensor
260 |
261 | # Handle dimension ordering for different backends
262 | #if K.image_dim_ordering() == 'tf':
263 | if K.common.image_dim_ordering() == 'tf':
264 | bn_axis = 3
265 | else:
266 | bn_axis = 1
267 |
268 | x = ZeroPadding2D((3, 3), name='conv1_zeropadding')(img_input)
269 | x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=False)(x)
270 | x = BatchNormalization(epsilon=eps, axis=bn_axis, name='bn_conv1')(x)
271 | x = Scale(axis=bn_axis, name='scale_conv1')(x)
272 | x = Activation('relu', name='conv1_relu')(x)
273 | x = MaxPooling2D((3, 3), strides=(2, 2), name='pool1', padding='same')(x)
274 |
275 | x = conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1))
276 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='b')
277 | x = identity_block(x, 3, [64, 64, 256], stage=2, block='c')
278 |
279 | x = conv_block(x, 3, [128, 128, 512], stage=3, block='a')
280 | for i in range(1, 8):
281 | x = identity_block(x, 3, [128, 128, 512], stage=3, block='b' + str(i))
282 |
283 | x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a')
284 | for i in range(1, 36):
285 | x = identity_block(x, 3, [256, 256, 1024], stage=4, block='b' + str(i))
286 |
287 | x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a')
288 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='b')
289 | x = identity_block(x, 3, [512, 512, 2048], stage=5, block='c')
290 |
291 | if include_top:
292 | # Classification block
293 | x = AveragePooling2D((7, 7), name='avg_pool')(x)
294 | x = Flatten()(x)
295 | x = Dense(classes, activation='softmax', name='fc1000')(x)
296 | else:
297 | if pooling == 'avg':
298 | x = GlobalAveragePooling2D()(x)
299 | elif pooling == 'max':
300 | x = GlobalMaxPooling2D()(x)
301 |
302 | # Ensure that the model takes into account
303 | # any potential predecessors of `input_tensor`.
304 | if input_tensor is not None:
305 | inputs = get_source_inputs(input_tensor)
306 | else:
307 | inputs = img_input
308 |
309 | # Create model
310 | model = Model(inputs, x, name='resnet152')
311 |
312 | # Load weights
313 | if weights == 'imagenet':
314 | if include_top:
315 | weights_path = get_file(
316 | 'resnet152_weights_tf_dim_ordering_tf_kernels.h5',
317 | WEIGHTS_PATH,
318 | cache_subdir='models',
319 | md5_hash='cdb18a2158b88e392c0905d47dcef965')
320 | else:
321 | weights_path = get_file(
322 | 'resnet152_weights_tf_dim_ordering_tf_kernels_no_top.h5',
323 | WEIGHTS_PATH_NO_TOP,
324 | cache_subdir='models',
325 | md5_hash='02cb9130cc51543cd703c79697baa592')
326 | model.load_weights(weights_path)
327 |
328 | elif weights is not None:
329 | model.load_weights(weights)
330 |
331 | return model
332 |
--------------------------------------------------------------------------------
/visual_context/run-visual.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 |
4 | from tensorflow.keras.preprocessing import image
5 | from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions
6 |
7 | from model import ResNet152
8 |
9 | # run on CPU
10 | os.environ['CUDA_VISIBLE_DEVICES'] = "-1"
11 |
12 | image_dir = 'imgs'
13 |
14 | model = ResNet152()
15 |
16 | # predict the top-1 ImageNet class (visual context) for each image in imgs/
17 | for img_file in os.listdir(image_dir):
18 |     img = image.load_img(os.path.join(image_dir, img_file), target_size=(224, 224))
19 |     x = image.img_to_array(img)
20 |     x = np.expand_dims(x, axis=0)
21 |     x = preprocess_input(x)
22 |
23 |     preds = model.predict(x)
24 |     print(img_file, decode_predictions(preds, top=1)[0])
25 |
--------------------------------------------------------------------------------
/visual_context/run-visual_CLIP.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import glob
4 | import sys
5 | import torch
6 | import torchvision.transforms as Transforms
7 | import clip
8 | from PIL import Image
9 |
10 |
11 |
12 | # Check device
13 | #device = "cuda" if torch.cuda.is_available() else "cpu"
14 | device = torch.device("cpu")
15 | print(f"Device - {device}")
16 |
17 | # Load CLIP model
18 | clip_model, clip_preprocess = clip.load('ViT-B/32', device)
19 | clip_model.eval()
20 |
21 | # ImageNet class names used as the zero-shot labels
22 | with open("imagenet_classes.txt", "r") as f:
23 | categories = [s.strip() for s in f.readlines()]
24 |
25 | text = clip.tokenize(categories).to(device)
26 |
27 | def predict_clip(image_file_path):
28 | image = clip_preprocess(Image.open(image_file_path)).unsqueeze(0).to(device)
29 |     # the CLIP model is already loaded globally above
30 |
31 | # Calculate features
32 | with torch.no_grad():
33 | image_features = clip_model.encode_image(image)
34 | text_features = clip_model.encode_text(text)
35 |
36 | # Pick the top 5 most similar labels for the image
37 | image_features /= image_features.norm(dim=-1, keepdim=True)
38 | text_features /= text_features.norm(dim=-1, keepdim=True)
39 | similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
40 | values, indices = similarity[0].topk(5)
41 |
42 | predictions = {}
43 | for value, index in zip(values, indices):
44 |         predictions[categories[index]] = f"{value.item():.4f}"
45 |
46 | return predictions
47 |
48 |
49 | # run pred
50 | filenames = glob.glob("imgs/*.jpg")
51 | filenames.sort()
52 | for image in filenames:
53 | print(os.path.basename(image), predict_clip(image))
54 | #print(predict_clip("image.jpg"))
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/word-count-hisgram.py:
--------------------------------------------------------------------------------
1 | """Python script to create a histogram of words in a text file.
2 | Usage: python word-count-hisgram.py -f "/path/to/file.txt" -n 200
3 | Specify the path to the text file as above. Manually specify the top N words to report (default 100).
4 | Text file can contain punctuation, new lines, etc., but special characters aren't handled well.
5 | """
6 |
7 | import os
8 | import sys
9 | import string
10 | import argparse
11 | import operator
12 |
13 | import numpy as np
14 | import pandas as pd
15 | import matplotlib.pyplot as plt
16 |
17 | from collections import Counter
18 |
19 | __author__ = 'Nick Powell (PhD student, CMIC & CABI, UCL, UK), nicholas.powell.11@ucl.ac.uk'
20 | __version__ = '0.2.20150303'
21 | __created__ = '2014-12-18, Thursday'
22 |
23 |
24 | def main():
25 |
26 | parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
27 | parser.add_argument('-f','--filepath',dest='filepath',metavar='file path',help='Path to text input file to be analysed.', required=True)
28 | parser.add_argument('-n','--number',dest='number',metavar='number',help='Most frequent n words will be displayed and plotted.', required=False, default=100, type=int)
29 | args = parser.parse_args()
30 |
31 | # Path to text file to analyse
32 | rawfilepath = args.filepath
33 |
34 | # Print a histogram containing the top N words, and print them and their counts.
35 | top_n = args.number
36 |
37 | # Load the file
38 | filepath = os.path.normpath(os.path.join(rawfilepath))
39 |     # Parse into a list of comma-separated fields (the file is closed automatically)
40 |     with open(filepath, 'r') as file:
41 |         content_sublists = [line.split(',') for line in file.readlines()]
42 |
43 |
44 | # Parse into a single list (from a list of lists)
45 | content_list = [item for sublist in content_sublists for item in sublist]
46 |
47 | # Remove whitespace so we can concatenate appropriately, and unify case
48 |     content_list_strip = [s.strip().lower() for s in content_list]
49 |
50 | # Concatenate strings into a single string
51 | content_concat = ' '.join(content_list_strip)
52 |
53 | # Remove punctuation and new lines
54 | punct = set(string.punctuation)
55 | unpunct_content = ''.join(x for x in content_concat if x not in punct)
56 |
57 | # Split string into list of strings, again
58 | word_list = unpunct_content.split()
59 |
60 | # Perform count
61 | counts_all = Counter(word_list)
62 |
63 | words, count_values = zip(*counts_all.items())
64 |
65 | # Sort both lists by frequency in values (Schwartzian transform) - thanks, http://stackoverflow.com/questions/9543211/sorting-a-list-in-python-using-the-result-from-sorting-another-list
66 | values_sorted, words_sorted = zip(*sorted(zip(count_values, words), key=operator.itemgetter(0), reverse=True))
67 |
68 | # Top N
69 | words_sorted_top = words_sorted[0:top_n]
70 | values_sorted_top = values_sorted[0:top_n]
71 |
72 | print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
73 | print("{0} unique words identified in the text file, {1}".format(len(values_sorted), filepath))
74 | print("The top {0} words are: \n{1}".format(top_n, words_sorted_top))
75 | print("... their respective frequencies: \n{0}".format(values_sorted_top))
76 | print("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -")
77 | # Pandas DataFrame just for visualisation
78 | df = pd.DataFrame({'count': values_sorted_top, 'word': words_sorted_top})
79 | print("{0}".format(df))
80 | sys.stdout.flush()
81 |
82 | # Histogram
83 |
84 | # Make xticklabels comprehensible by matplotlib
85 | xticklabels = str(list(words_sorted_top)).split()
86 | # Remove the single quotes, commas and enclosing square brackets
87 | xtlabs = [xstr.replace("'","").replace(",","").replace("]","").replace("[","") for xstr in xticklabels]
88 |
89 |
90 | indices = np.arange(len(words_sorted_top))
91 | width = 1
92 | fig = plt.figure()
93 | fig.suptitle('Word frequency histogram, top {0}'.format(top_n), fontsize=16)
94 | plt.xlabel('word', fontsize=12)
95 | plt.ylabel('count', fontsize=12)
96 | plt.bar(indices, values_sorted_top, width*0.9, alpha=0.7, color='blue')
97 | plt.xticks(indices + width * 0.5, xtlabs, rotation='vertical', fontsize=8)
98 | plt.show()
99 |
100 | if __name__ == '__main__':
101 | main()
102 |
--------------------------------------------------------------------------------