├── Data
│   └── phisher_account.txt
├── Model
│   ├── gen_finetune_data.py
│   ├── gen_pretrain_data.py
│   ├── gen_seq.py
│   ├── modeling.py
│   ├── optimization.py
│   ├── partitioning.py
│   ├── run_finetune.py
│   ├── run_pretrain.py
│   ├── run_zipzap.sh
│   ├── vocab.py
│   └── zipzap_config.json
└── README.md
/Model/gen_finetune_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | from vocab import FreqVocab 9 | import os 10 | import collections 11 | import random 12 | import functools 13 | # import tensorflow as tf 14 | import tensorflow.compat.v1 as tf 15 | tf.disable_v2_behavior() 16 | 17 | import numpy as np 18 | import sys 19 | import six 20 | import pickle as pkl 21 | import time 22 | 23 | flags = tf.flags 24 | FLAGS = flags.FLAGS 25 | 26 | random_seed = 12345 27 | rng = random.Random(random_seed) 28 | 29 | ## parameters 30 | flags.DEFINE_integer("max_seq_length", 100, "max sequence length.") 31 | # flags.DEFINE_integer("sliding_step", 30, "sliding window step size.") 32 | flags.DEFINE_string("data_dir", './data/', "data dir.") 33 | flags.DEFINE_string("dataset_name", 'eth',"dataset name.") 34 | flags.DEFINE_string("vocab_filename", "vocab", "vocab filename") 35 | flags.DEFINE_bool("total_drop", False, "whether to drop") 36 | 37 | SLIDING_STEP = round(FLAGS.max_seq_length * 0.6) 38 | 39 | print("MAX_SEQUENCE_LENGTH:", FLAGS.max_seq_length) 40 | print("SLIDING_STEP:", SLIDING_STEP) 41 | 42 | 43 | class FinetuneInstance(object): 44 | """A single training instance (sentence pair).""" 45 | 46 | def __init__(self, address, tokens, label): 47 | 48 | self.address = [address] 49 | self.tokens = list(map(lambda x: x[0], tokens)) 50 | self.block_timestamps = list(map(lambda x: x[2], tokens)) 51 | self.values = list(map(lambda x: x[3], tokens)) 52 | self.label = label 53 | 54 | def map_io_flag(token): 55 | flag = token[4] 56 | if flag == "OUT": 57 | return 1 58 | elif flag == "IN": 59 | return 2 60 | else: 61 | return 0 62 | 63 | self.io_flags = list(map(map_io_flag, tokens)) 64 | self.cnts = list(map(lambda x: x[5], tokens)) 65 | 66 | 67 | def __str__(self): 68 | s = "address: %s\n" % (self.address[0]) 69 | s += "tokens: %s\n" % ( 70 | " ".join([printable_text(x) for x in self.tokens])) 71 | s += "\n" 72 | return s 73 | 74 | def __repr__(self): 75 | return self.__str__() 76 | 77 | def printable_text(text): 78 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 79 | 80 | # These functions want `str` for both Python2 and Python3, but in one case 81 | # it's a Unicode string and in the other it's a byte string.
82 | if six.PY3: 83 | if isinstance(text, str): 84 | return text 85 | elif isinstance(text, bytes): 86 | return text.decode("utf-8", "ignore") 87 | else: 88 | raise ValueError("Unsupported string type: %s" % (type(text))) 89 | elif six.PY2: 90 | if isinstance(text, str): 91 | return text 92 | elif isinstance(text, unicode): 93 | return text.encode("utf-8") 94 | else: 95 | raise ValueError("Unsupported string type: %s" % (type(text))) 96 | else: 97 | raise ValueError("Not running on Python2 or Python 3?") 98 | 99 | def cmp_udf_reverse(x1, x2): 100 | time1 = int(x1[2]) 101 | time2 = int(x2[2]) 102 | 103 | if time1 < time2: 104 | return 1 105 | elif time1 > time2: 106 | return -1 107 | else: 108 | return 0 109 | 110 | 111 | def create_embedding_predictions(tokens): 112 | """Creates the predictions for the masked LM objective.""" 113 | address = tokens[0][0] 114 | output_tokens = tokens 115 | masked_lm_positions = [] 116 | masked_lm_labels = [] 117 | return (address, output_tokens, masked_lm_positions, masked_lm_labels) 118 | 119 | 120 | def gen_finetune_samples(sequences, label_list): 121 | instances = [] 122 | # create train 123 | start = time.time() 124 | for i in tqdm(range(len(sequences))): 125 | 126 | tokens = sequences[i] 127 | address = tokens[0][0] 128 | instance = FinetuneInstance( 129 | address=address, 130 | tokens=tokens, 131 | label=label_list[i]) 132 | instances.append(instance) 133 | 134 | end = time.time() 135 | print("=======Finish========") 136 | print("cost time:%.2f" % (end - start)) 137 | return instances 138 | 139 | def create_int_feature(values): 140 | feature = tf.train.Feature( 141 | int64_list=tf.train.Int64List(value=list(values))) 142 | return feature 143 | 144 | def create_float_feature(values): 145 | feature = tf.train.Feature( 146 | float_list=tf.train.FloatList(value=list(values))) 147 | return feature 148 | 149 | def convert_timestamp_to_position(block_timestamps): 150 | position = [0] 151 | if len(block_timestamps) <= 1: 152 | return position 153 | last_ts = block_timestamps[1] 154 | idx = 1 155 | for b_ts in block_timestamps[1:]: 156 | if b_ts != last_ts: 157 | last_ts = b_ts 158 | idx += 1 159 | position.append(idx) 160 | return position 161 | 162 | def write_finetune_instance_to_example_files(instances, max_seq_length, vocab, output_files): 163 | """Create TF example files from `TrainingInstance`s.""" 164 | writers = [] 165 | for output_file in output_files: 166 | writers.append(tf.python_io.TFRecordWriter(output_file)) 167 | 168 | writer_index = 0 169 | total_written = 0 170 | 171 | for inst_index in tqdm(range(len(instances))): 172 | instance = instances[inst_index] 173 | input_ids = vocab.convert_tokens_to_ids(instance.tokens) 174 | address = vocab.convert_tokens_to_ids(instance.address) 175 | counts = instance.cnts 176 | block_timestamps = instance.block_timestamps 177 | values = instance.cnts 178 | io_flags = instance.io_flags 179 | positions = convert_timestamp_to_position(block_timestamps) 180 | label = [instance.label] 181 | 182 | input_mask = [1] * len(input_ids) 183 | assert len(input_ids) <= max_seq_length 184 | assert len(counts) <= max_seq_length 185 | assert len(values) <= max_seq_length 186 | assert len(io_flags) <= max_seq_length 187 | assert len(positions) <= max_seq_length 188 | 189 | input_ids += [0] * (max_seq_length - len(input_ids)) 190 | counts += [0] * (max_seq_length - len(counts)) 191 | values += [0] * (max_seq_length - len(values)) 192 | io_flags += [0] * (max_seq_length - len(io_flags)) 193 | positions += [0] * (max_seq_length - 
len(positions)) 194 | input_mask += [0] * (max_seq_length - len(input_mask)) 195 | 196 | assert len(input_ids) == max_seq_length 197 | assert len(counts) == max_seq_length 198 | assert len(values) == max_seq_length 199 | assert len(io_flags) == max_seq_length 200 | assert len(positions) == max_seq_length 201 | assert len(input_mask) == max_seq_length 202 | 203 | features = collections.OrderedDict() 204 | features["address"] = create_int_feature(address) 205 | features["label"] = create_float_feature(label) 206 | features["input_ids"] = create_int_feature(input_ids) 207 | features["input_positions"] = create_int_feature(positions) 208 | features["input_counts"] = create_int_feature(counts) 209 | features["input_io_flags"] = create_int_feature(io_flags) 210 | features["input_values"] = create_int_feature(values) 211 | features["input_mask"] = create_int_feature(input_mask) 212 | 213 | tf_example = tf.train.Example( 214 | features=tf.train.Features(feature=features)) 215 | 216 | writers[writer_index].write(tf_example.SerializeToString()) 217 | writer_index = (writer_index + 1) % len(writers) 218 | 219 | total_written += 1 220 | 221 | if inst_index < 3: 222 | tf.logging.info("*** Example ***") 223 | tf.logging.info("tokens: %s" % " ".join( 224 | [printable_text(x) for x in instance.tokens])) 225 | 226 | for feature_name in features.keys(): 227 | feature = features[feature_name] 228 | values = [] 229 | if feature.int64_list.value: 230 | values = feature.int64_list.value 231 | elif feature.float_list.value: 232 | values = feature.float_list.value 233 | tf.logging.info("%s: %s" % (feature_name, 234 | " ".join([str(x) 235 | for x in values]))) 236 | 237 | for writer in writers: 238 | writer.close() 239 | 240 | tf.logging.info("Wrote %d total instances", total_written) 241 | 242 | 243 | def total_repeat_drop(eoa2seq): 244 | """ 245 | totally drop the repeat part. 
246 | """ 247 | new_eoa2seq = {} 248 | for eoa, seq in eoa2seq.items(): 249 | new_seq = [] 250 | exist_addr = set() 251 | for trans in seq: 252 | if trans[0] not in exist_addr: 253 | exist_addr.add(trans[0]) 254 | new_seq.append(trans) 255 | 256 | new_eoa2seq[eoa] = new_seq 257 | 258 | return new_eoa2seq 259 | 260 | 261 | def random_drop(eoa2seq, ratio=0.5): 262 | 263 | new_eoa2seq = {} 264 | for eoa, seq in eoa2seq.items(): 265 | filter_num = int(ratio * len(seq)) 266 | 267 | if len(seq) <= 2: 268 | new_eoa2seq[eoa] = seq 269 | 270 | else: 271 | remain_idx = set(np.random.choice(range(len(seq)), len(seq) - filter_num, replace=False)) 272 | new_seq = [] 273 | 274 | for id in remain_idx: 275 | new_seq.append(seq[id]) 276 | 277 | new_seq = sorted(new_seq, key=functools.cmp_to_key(cmp_udf_reverse)) 278 | new_eoa2seq[eoa] = new_seq 279 | 280 | return new_eoa2seq 281 | 282 | 283 | 284 | if __name__ == '__main__': 285 | 286 | # load label 287 | phisher_account = pd.read_csv("../Data/phisher_account.txt", names=["account"]) 288 | phisher_account_set = set(phisher_account.account.values) 289 | 290 | # load vocab 291 | vocab_file_name = FLAGS.data_dir + FLAGS.vocab_filename 292 | with open(vocab_file_name, "rb") as f: 293 | vocab = pkl.load(f) 294 | 295 | with open("./data/eoa2seq.pkl", "rb") as f: 296 | eoa2seq = pkl.load(f) 297 | 298 | print("number of target user account:", len(eoa2seq)) 299 | 300 | if FLAGS.total_drop: 301 | eoa2seq = total_repeat_drop(eoa2seq) 302 | 303 | eoa_list = list(eoa2seq.keys()) 304 | rng.shuffle(eoa_list) 305 | idx = round(len(eoa_list) * 0.7) 306 | train_eoa_list = set(eoa_list[:idx]) 307 | test_eoa_list = set(eoa_list[idx:]) 308 | print("------------------") 309 | print(len(train_eoa_list.intersection(test_eoa_list))) 310 | 311 | label_list = [] 312 | # clip and add label 313 | def is_phish(address): 314 | if address in phisher_account_set: 315 | return 1.0 316 | else: 317 | return 0.0 318 | 319 | max_num_tokens = FLAGS.max_seq_length - 1 320 | seqs = [] 321 | idx = 0 322 | for eoa, seq in eoa2seq.items(): 323 | if len(seq) <= max_num_tokens: 324 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 325 | seqs[idx] += seq 326 | idx += 1 327 | label_list.append(is_phish(eoa)) 328 | 329 | elif len(seq) > max_num_tokens: 330 | beg_idx = list(range(len(seq) - max_num_tokens, 0, -1 * SLIDING_STEP)) 331 | beg_idx.append(0) 332 | 333 | if len(beg_idx) > 500: 334 | beg_idx = list(np.random.permutation(beg_idx)[:500]) 335 | for i in beg_idx: 336 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 337 | seqs[idx] += seq[i:i + max_num_tokens] 338 | idx += 1 339 | label_list.append(is_phish(eoa)) 340 | 341 | else: 342 | for i in beg_idx[::-1]: 343 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 344 | seqs[idx] += seq[i:i + max_num_tokens] 345 | idx += 1 346 | label_list.append(is_phish(eoa)) 347 | 348 | # split into training and testing sequences 349 | train_seqs = [] 350 | test_seqs = [] 351 | train_label_list = [] 352 | test_label_list = [] 353 | print("Splitting the sequence..") 354 | for i in tqdm(range(len(seqs))): 355 | seq = seqs[i] 356 | label = label_list[i] 357 | if seq[0][0] in train_eoa_list: 358 | train_seqs.append(seq) 359 | train_label_list.append(label) 360 | elif seq[0][0] in test_eoa_list: 361 | test_seqs.append(seq) 362 | test_label_list.append(label) 363 | 364 | print("Generating training samples..") 365 | train_phish_instance = gen_finetune_samples(train_seqs, train_label_list) 366 | rng.shuffle(train_phish_instance) 367 | 368 | print("Generating testing samples..") 369 | test_phish_instance = 
gen_finetune_samples(test_seqs, test_label_list) 370 | rng.shuffle(test_phish_instance) 371 | 372 | print("*** Writing to output files ***") 373 | output_filename = FLAGS.data_dir + "finetune_train.tfrecord" 374 | print(" %s", output_filename) 375 | 376 | write_finetune_instance_to_example_files(train_phish_instance, FLAGS.max_seq_length, vocab, [output_filename]) 377 | 378 | print("*** Writing to output files ***") 379 | output_filename = FLAGS.data_dir + "finetune_test.tfrecord" 380 | print(" %s", output_filename) 381 | 382 | write_finetune_instance_to_example_files(test_phish_instance, FLAGS.max_seq_length, vocab, [output_filename]) 383 | print("Finished..") 384 | -------------------------------------------------------------------------------- /Model/gen_pretrain_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle as pkl 4 | from tqdm import tqdm 5 | import collections 6 | import functools 7 | import random 8 | import tensorflow.compat.v1 as tf 9 | 10 | tf.disable_v2_behavior() 11 | 12 | import six 13 | import time 14 | import math 15 | from vocab import FreqVocab 16 | 17 | tf.logging.set_verbosity(tf.logging.INFO) 18 | 19 | random_seed = 12345 20 | rng = random.Random(random_seed) 21 | 22 | short_seq_prob = 0 # Probability of creating sequences which are shorter than the maximum length。 23 | flags = tf.flags 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_integer("pool_size", 10, "multiprocesses pool size.") 27 | flags.DEFINE_integer("max_seq_length", 29, "max sequence length.") 28 | flags.DEFINE_float("masked_lm_prob", 0.8, "Masked LM probability.") 29 | flags.DEFINE_float("mask_prob", 1.0, "mask probabaility") 30 | flags.DEFINE_bool("do_eval", False, "") 31 | flags.DEFINE_bool("do_embed", True, "") 32 | flags.DEFINE_integer("dupe_factor", 10, "Number of times to duplicate the input data (with different masks).") 33 | flags.DEFINE_string("data_dir", './data/', "data dir.") 34 | flags.DEFINE_string("vocab_filename", "vocab", "vocab filename") 35 | flags.DEFINE_bool("total_drop", True, "whether to drop") 36 | flags.DEFINE_bool("drop", False, "whether to drop") 37 | 38 | HEADER = 'hash,nonce,block_hash,block_number,transaction_index,from_address,to_address,value,gas,gas_price,input,block_timestamp,max_fee_per_gas,max_priority_fee_per_gas,transaction_type'.split( 39 | ",") 40 | 41 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 42 | ["index", "label"]) 43 | 44 | MAX_PREDICTIONS_PER_SEQ = math.ceil(FLAGS.max_seq_length * FLAGS.masked_lm_prob) 45 | SLIDING_STEP = round(FLAGS.max_seq_length * 0.6) 46 | 47 | print("MAX_SEQUENCE_LENGTH:", FLAGS.max_seq_length) 48 | print("MAX_PREDICTIONS_PER_SEQ:", MAX_PREDICTIONS_PER_SEQ) 49 | print("SLIDING_STEP:", SLIDING_STEP) 50 | 51 | class TrainingInstance(object): 52 | """A single training instance (sentence pair).""" 53 | 54 | def __init__(self, address, tokens, masked_lm_positions, masked_lm_labels): 55 | 56 | self.address = [address] 57 | self.tokens = list(map(lambda x: x[0], tokens)) 58 | self.block_timestamps = list(map(lambda x: x[2], tokens)) 59 | self.values = list(map(lambda x: x[3], tokens)) 60 | 61 | def map_io_flag(token): 62 | flag = token[4] 63 | if flag == "OUT": 64 | return 1 65 | elif flag == "IN": 66 | return 2 67 | else: 68 | return 0 69 | 70 | self.io_flags = list(map(map_io_flag, tokens)) 71 | self.cnts = list(map(lambda x: x[5], tokens)) 72 | self.masked_lm_positions = masked_lm_positions 73 | 
self.masked_lm_labels = masked_lm_labels 74 | 75 | def __str__(self): 76 | s = "address: %s\n" % (self.address[0]) 77 | s += "tokens: %s\n" % ( 78 | " ".join([printable_text(x) for x in self.tokens])) 79 | s += "masked_lm_positions: %s\n" % ( 80 | " ".join([str(x) for x in self.masked_lm_positions])) 81 | s += "masked_lm_labels: %s\n" % ( 82 | " ".join([printable_text(x) for x in self.masked_lm_labels])) 83 | s += "\n" 84 | return s 85 | 86 | def __repr__(self): 87 | return self.__str__() 88 | 89 | 90 | def printable_text(text): 91 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 92 | 93 | # These functions want `str` for both Python2 and Python3, but in one case 94 | # it's a Unicode string and in the other it's a byte string. 95 | if six.PY3: 96 | if isinstance(text, str): 97 | return text 98 | elif isinstance(text, bytes): 99 | return text.decode("utf-8", "ignore") 100 | else: 101 | raise ValueError("Unsupported string type: %s" % (type(text))) 102 | elif six.PY2: 103 | if isinstance(text, str): 104 | return text 105 | elif isinstance(text, unicode): 106 | return text.encode("utf-8") 107 | else: 108 | raise ValueError("Unsupported string type: %s" % (type(text))) 109 | else: 110 | raise ValueError("Not running on Python2 or Python 3?") 111 | 112 | 113 | def create_int_feature(values): 114 | feature = tf.train.Feature( 115 | int64_list=tf.train.Int64List(value=list(values))) 116 | return feature 117 | 118 | 119 | def create_float_feature(values): 120 | feature = tf.train.Feature( 121 | float_list=tf.train.FloatList(value=list(values))) 122 | return feature 123 | 124 | 125 | def gen_samples(sequences, 126 | dupe_factor, 127 | masked_lm_prob, 128 | max_predictions_per_seq, 129 | pool_size, 130 | rng, 131 | force_head=False): 132 | instances = [] 133 | # create train 134 | if force_head: 135 | for step in range(dupe_factor): 136 | start = time.time() 137 | for tokens in sequences: 138 | (address, tokens, masked_lm_positions, 139 | masked_lm_labels) = create_masked_lm_predictions_force_head(tokens) 140 | instance = TrainingInstance( 141 | address=address, 142 | tokens=tokens, 143 | masked_lm_positions=masked_lm_positions, 144 | masked_lm_labels=masked_lm_labels) 145 | instances.append(instance) 146 | end = time.time() 147 | cost = end - start 148 | print("step=%d, time=%.2f" % (step, cost)) 149 | print("=======Finish========") 150 | 151 | else: 152 | for step in range(dupe_factor): 153 | start = time.time() 154 | for tokens in sequences: 155 | (address, tokens, masked_lm_positions, 156 | masked_lm_labels) = create_masked_lm_predictions( 157 | tokens, masked_lm_prob, max_predictions_per_seq, rng) 158 | instance = TrainingInstance( 159 | address=address, 160 | tokens=tokens, 161 | masked_lm_positions=masked_lm_positions, 162 | masked_lm_labels=masked_lm_labels) 163 | instances.append(instance) 164 | end = time.time() 165 | cost = end - start 166 | print("step=%d, time=%.2f" % (step, cost)) 167 | print("=======Finish========") 168 | return instances 169 | 170 | 171 | def create_masked_lm_predictions_force_head(tokens): 172 | """Creates the predictions for the masked LM objective.""" 173 | first_index = 0 174 | address = tokens[0][0] 175 | output_tokens = [list(i) for i in tokens] # note that change the value of output_tokens will also change tokens 176 | output_tokens[first_index] = ["[MASK]", 0, 0, 0, 0, 0] 177 | masked_lm_positions = [first_index] 178 | masked_lm_labels = [tokens[first_index][0]] 179 | 180 | return (address, output_tokens, masked_lm_positions, 
masked_lm_labels) 181 | 182 | 183 | def create_masked_lm_predictions(tokens, masked_lm_prob, 184 | max_predictions_per_seq, rng): 185 | """Creates the predictions for the masked LM objective.""" 186 | 187 | address = tokens[0][0] 188 | cand_indexes = [] 189 | for (i, token) in enumerate(tokens): 190 | cand_indexes.append(i) 191 | 192 | rng.shuffle(cand_indexes) 193 | output_tokens = [list(i) for i in tokens] # note that change the value of output_tokens will also change tokens 194 | num_to_predict = min(max_predictions_per_seq, 195 | max(1, int(len(tokens) * masked_lm_prob))) 196 | masked_lms = [] 197 | covered_indexes = set() 198 | for index in cand_indexes: 199 | if len(masked_lms) >= num_to_predict: 200 | break 201 | if index in covered_indexes: 202 | continue 203 | covered_indexes.add(index) 204 | masked_token = "[MASK]" 205 | masked_lms.append(MaskedLmInstance(index=index, label=tokens[index][0])) 206 | output_tokens[index][0] = masked_token 207 | 208 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 209 | masked_lm_positions = [] 210 | masked_lm_labels = [] 211 | for p in masked_lms: 212 | masked_lm_positions.append(p.index) 213 | masked_lm_labels.append(p.label) 214 | return (address, output_tokens, masked_lm_positions, masked_lm_labels) 215 | 216 | 217 | def create_embedding_predictions(tokens): 218 | """Creates the predictions for the masked LM objective.""" 219 | address = tokens[0][0] 220 | output_tokens = tokens 221 | masked_lm_positions = [] 222 | masked_lm_labels = [] 223 | return (address, output_tokens, masked_lm_positions, masked_lm_labels) 224 | 225 | 226 | def gen_embedding_samples(sequences): 227 | instances = [] 228 | # create train 229 | start = time.time() 230 | for tokens in sequences: 231 | (address, tokens, masked_lm_positions, 232 | masked_lm_labels) = create_embedding_predictions(tokens) 233 | instance = TrainingInstance( 234 | address=address, 235 | tokens=tokens, 236 | masked_lm_positions=masked_lm_positions, 237 | masked_lm_labels=masked_lm_labels) 238 | instances.append(instance) 239 | 240 | end = time.time() 241 | print("=======Finish========") 242 | print("cost time:%.2f" % (end - start)) 243 | return instances 244 | 245 | 246 | def convert_timestamp_to_position(block_timestamps): 247 | position = [0] 248 | if len(block_timestamps) <= 1: 249 | return position 250 | last_ts = block_timestamps[1] 251 | idx = 1 252 | for b_ts in block_timestamps[1:]: 253 | if b_ts != last_ts: 254 | last_ts = b_ts 255 | idx += 1 256 | position.append(idx) 257 | return position 258 | 259 | 260 | def write_instance_to_example_files(instances, max_seq_length, 261 | max_predictions_per_seq, vocab, 262 | output_files): 263 | """Create TF example files from `TrainingInstance`s.""" 264 | writers = [] 265 | for output_file in output_files: 266 | writers.append(tf.python_io.TFRecordWriter(output_file)) 267 | 268 | writer_index = 0 269 | total_written = 0 270 | 271 | for inst_index in tqdm(range(len(instances))): 272 | instance = instances[inst_index] 273 | input_ids = vocab.convert_tokens_to_ids(instance.tokens) 274 | address = vocab.convert_tokens_to_ids(instance.address) 275 | counts = instance.cnts 276 | block_timestamps = instance.block_timestamps 277 | values = instance.values 278 | io_flags = instance.io_flags 279 | positions = convert_timestamp_to_position(block_timestamps) 280 | 281 | input_mask = [1] * len(input_ids) 282 | assert len(input_ids) <= max_seq_length 283 | assert len(counts) <= max_seq_length 284 | assert len(values) <= max_seq_length 285 | assert 
len(io_flags) <= max_seq_length 286 | assert len(positions) <= max_seq_length 287 | 288 | input_ids += [0] * (max_seq_length - len(input_ids)) 289 | counts += [0] * (max_seq_length - len(counts)) 290 | values += [0] * (max_seq_length - len(values)) 291 | io_flags += [0] * (max_seq_length - len(io_flags)) 292 | positions += [0] * (max_seq_length - len(positions)) 293 | input_mask += [0] * (max_seq_length - len(input_mask)) 294 | 295 | assert len(input_ids) == max_seq_length 296 | assert len(counts) == max_seq_length 297 | assert len(values) == max_seq_length 298 | assert len(io_flags) == max_seq_length 299 | assert len(positions) == max_seq_length 300 | assert len(input_mask) == max_seq_length 301 | 302 | masked_lm_positions = list(instance.masked_lm_positions) 303 | masked_lm_ids = vocab.convert_tokens_to_ids(instance.masked_lm_labels) 304 | masked_lm_weights = [1.0] * len(masked_lm_ids) 305 | 306 | masked_lm_positions += [0] * (max_predictions_per_seq - len(masked_lm_positions)) 307 | masked_lm_ids += [0] * (max_predictions_per_seq - len(masked_lm_ids)) 308 | masked_lm_weights += [0.0] * (max_predictions_per_seq - len(masked_lm_weights)) 309 | 310 | features = collections.OrderedDict() 311 | features["address"] = create_int_feature(address) 312 | features["input_ids"] = create_int_feature(input_ids) 313 | features["input_positions"] = create_int_feature(positions) 314 | features["input_counts"] = create_int_feature(counts) 315 | features["input_io_flags"] = create_int_feature(io_flags) 316 | features["input_values"] = create_int_feature(values) 317 | 318 | features["input_mask"] = create_int_feature(input_mask) 319 | features["masked_lm_positions"] = create_int_feature(masked_lm_positions) 320 | features["masked_lm_ids"] = create_int_feature(masked_lm_ids) 321 | features["masked_lm_weights"] = create_float_feature(masked_lm_weights) 322 | 323 | tf_example = tf.train.Example( 324 | features=tf.train.Features(feature=features)) 325 | 326 | writers[writer_index].write(tf_example.SerializeToString()) 327 | writer_index = (writer_index + 1) % len(writers) 328 | 329 | total_written += 1 330 | 331 | if inst_index < 3: 332 | tf.logging.info("*** Example ***") 333 | tf.logging.info("tokens: %s" % " ".join( 334 | [printable_text(x) for x in instance.tokens])) 335 | 336 | for feature_name in features.keys(): 337 | feature = features[feature_name] 338 | values = [] 339 | if feature.int64_list.value: 340 | values = feature.int64_list.value 341 | elif feature.float_list.value: 342 | values = feature.float_list.value 343 | tf.logging.info("%s: %s" % (feature_name, 344 | " ".join([str(x) 345 | for x in values]))) 346 | 347 | for writer in writers: 348 | writer.close() 349 | 350 | tf.logging.info("Wrote %d total instances", total_written) 351 | 352 | 353 | 354 | def total_repeat_drop(eoa2seq): 355 | """ 356 | totally drop the repeat transaction based on time. 
357 | """ 358 | new_eoa2seq = {} 359 | for eoa, seq in eoa2seq.items(): 360 | new_seq = [] 361 | exist_addr = set() 362 | for trans in seq: 363 | if trans[0] not in exist_addr: 364 | exist_addr.add(trans[0]) 365 | new_seq.append(trans) 366 | 367 | new_eoa2seq[eoa] = new_seq 368 | 369 | return new_eoa2seq 370 | 371 | 372 | def cmp_udf_reverse(x1, x2): 373 | time1 = int(x1[2]) 374 | time2 = int(x2[2]) 375 | 376 | if time1 < time2: 377 | return 1 378 | elif time1 > time2: 379 | return -1 380 | else: 381 | return 0 382 | 383 | 384 | def main(): 385 | vocab = FreqVocab() 386 | print("===========Load Sequence===========") 387 | with open("./data/eoa2seq.pkl", "rb") as f: 388 | eoa2seq = pkl.load(f) 389 | 390 | print("number of target user account:", len(eoa2seq)) 391 | vocab.update(eoa2seq) 392 | # generate mapping 393 | vocab.generate_vocab() 394 | 395 | # save vocab 396 | print("token_size:{}".format(len(vocab.vocab_words))) 397 | vocab_file_name = FLAGS.data_dir + FLAGS.vocab_filename 398 | print('vocab pickle file: ' + vocab_file_name) 399 | with open(vocab_file_name, 'wb') as output_file: 400 | pkl.dump(vocab, output_file, protocol=2) 401 | 402 | print("===========Original===========") 403 | length_list = [] 404 | for eoa in eoa2seq.keys(): 405 | seq = eoa2seq[eoa] 406 | length_list.append(len(seq)) 407 | 408 | length_list = np.array(length_list) 409 | print("Median:", np.median(length_list)) 410 | print("Mean:", np.mean(length_list)) 411 | print("Seq num:", len(length_list)) 412 | 413 | if FLAGS.total_drop: 414 | eoa2seq = total_repeat_drop(eoa2seq) 415 | 416 | print("==========After Reduce==========") 417 | length_list = [] 418 | for eoa in eoa2seq.keys(): 419 | seq = eoa2seq[eoa] 420 | length_list.append(len(seq)) 421 | 422 | length_list = np.array(length_list) 423 | print("Median:", np.median(length_list)) 424 | print("Mean:", np.mean(length_list)) 425 | print("Seq num:", len(length_list)) 426 | 427 | # clip 428 | max_num_tokens = FLAGS.max_seq_length - 1 429 | seqs = [] 430 | idx = 0 431 | for eoa, seq in eoa2seq.items(): 432 | if len(seq) <= max_num_tokens: 433 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 434 | seqs[idx] += seq 435 | idx += 1 436 | elif len(seq) > max_num_tokens: 437 | beg_idx = list(range(len(seq) - max_num_tokens, 0, -1 * SLIDING_STEP)) 438 | beg_idx.append(0) 439 | 440 | if len(beg_idx) > 500: 441 | beg_idx = list(np.random.permutation(beg_idx)[:500]) 442 | for i in beg_idx: 443 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 444 | seqs[idx] += seq[i:i + max_num_tokens] 445 | idx += 1 446 | 447 | else: 448 | for i in beg_idx[::-1]: 449 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 450 | seqs[idx] += seq[i:i + max_num_tokens] 451 | idx += 1 452 | 453 | if FLAGS.do_embed: 454 | print("===========Generate Embedding Samples==========") 455 | write_instance = gen_embedding_samples(seqs) 456 | output_filename = FLAGS.data_dir + "embed.tfrecord" 457 | tf.logging.info("*** Writing to output embedding files ***") 458 | tf.logging.info(" %s", output_filename) 459 | 460 | write_instance_to_example_files(write_instance, FLAGS.max_seq_length, 461 | MAX_PREDICTIONS_PER_SEQ, vocab, 462 | [output_filename]) 463 | 464 | seqs = np.random.permutation(seqs) 465 | 466 | if FLAGS.do_eval: # select 20% for testing 467 | print("========Generate Evaluation Samples========") 468 | eval_seqs = seqs[:round(len(seqs) * 0.2)] 469 | seqs = seqs[round(len(seqs) * 0.2):] 470 | 471 | eval_normal_instances = gen_samples(eval_seqs, 472 | dupe_factor=FLAGS.dupe_factor, 473 | masked_lm_prob=FLAGS.masked_lm_prob, 474 | 
max_predictions_per_seq=MAX_PREDICTIONS_PER_SEQ, 475 | pool_size=FLAGS.pool_size, 476 | rng=rng, 477 | force_head=False) 478 | 479 | eval_write_instance = eval_normal_instances 480 | rng.shuffle(eval_write_instance) 481 | eval_output_filename = FLAGS.data_dir + "test.tfrecord" 482 | tf.logging.info("*** Writing to Testing files ***") 483 | tf.logging.info(" %s", eval_output_filename) 484 | 485 | write_instance_to_example_files(eval_write_instance, FLAGS.max_seq_length, 486 | MAX_PREDICTIONS_PER_SEQ, vocab, 487 | [eval_output_filename]) 488 | 489 | print("========Generate Training Samples========") 490 | normal_instances = gen_samples(seqs, 491 | dupe_factor=FLAGS.dupe_factor, 492 | masked_lm_prob=FLAGS.masked_lm_prob, 493 | max_predictions_per_seq=MAX_PREDICTIONS_PER_SEQ, 494 | pool_size=FLAGS.pool_size, 495 | rng=rng, 496 | force_head=False) 497 | 498 | write_instance = normal_instances 499 | rng.shuffle(write_instance) 500 | 501 | output_filename = FLAGS.data_dir + "train.tfrecord" 502 | tf.logging.info("*** Writing to Training files ***") 503 | tf.logging.info(" %s", output_filename) 504 | 505 | write_instance_to_example_files(write_instance, FLAGS.max_seq_length, 506 | MAX_PREDICTIONS_PER_SEQ, vocab, 507 | [output_filename]) 508 | 509 | return 510 | 511 | 512 | if __name__ == '__main__': 513 | main() 514 | 515 | 516 | -------------------------------------------------------------------------------- /Model/gen_seq.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pickle as pkl 4 | import functools 5 | import os 6 | from vocab import FreqVocab 7 | import tensorflow.compat.v1 as tf 8 | tf.disable_v2_behavior() 9 | 10 | flags = tf.flags 11 | FLAGS = flags.FLAGS 12 | 13 | flags.DEFINE_bool("phisher", True, "whether to include phisher detection dataset.") 14 | flags.DEFINE_string("data_dir", "../../Data", "data directory.") 15 | flags.DEFINE_string("dataset", "1M", "which dataset to use") 16 | flags.DEFINE_bool("dup", True, "whether to do transaction duplication") 17 | 18 | 19 | HEADER = 'hash,nonce,block_hash,block_number,transaction_index,from_address,to_address,value,gas,gas_price,input,block_timestamp,max_fee_per_gas,max_priority_fee_per_gas,transaction_type'.split(",") 20 | 21 | def cmp_udf(x1, x2): 22 | time1 = int(x1[2]) 23 | time2 = int(x2[2]) 24 | if time1 < time2: 25 | return -1 26 | elif time1 > time2: 27 | return 1 28 | else: 29 | return 0 30 | 31 | def cmp_udf_reverse(x1, x2): 32 | time1 = int(x1[2]) 33 | time2 = int(x2[2]) 34 | 35 | if time1 < time2: 36 | return 1 37 | elif time1 > time2: 38 | return -1 39 | else: 40 | return 0 41 | 42 | def load_data(f_in, f_out): 43 | eoa2seq_out = {} 44 | error_trans = [] 45 | while True: 46 | trans = f_out.readline() 47 | if trans == "": 48 | break 49 | record = trans.split(",") 50 | trans_hash = record[0] 51 | block_number = int(record[3]) 52 | from_address = record[5] 53 | to_address = record[6] 54 | value = int(record[7]) / (pow(10, 12)) 55 | gas = int(record[8]) 56 | gas_price = int(record[9]) 57 | block_timestamp = int(record[11]) 58 | if from_address == "" or to_address == "": 59 | error_trans.append(trans) 60 | continue 61 | try: 62 | eoa2seq_out[from_address].append([to_address, block_number, block_timestamp, value, "OUT", 1]) 63 | except: 64 | eoa2seq_out[from_address] = [[to_address, block_number, block_timestamp, value, "OUT", 1]] 65 | 66 | eoa2seq_in = {} 67 | while True: 68 | trans = f_in.readline() 69 | if trans == "": 70 | break 71 | 
record = trans.split(",") 72 | block_number = int(record[3]) 73 | from_address = record[5] 74 | to_address = record[6] 75 | value = int(record[7]) / (pow(10, 12)) 76 | gas = int(record[8]) 77 | gas_price = int(record[9]) 78 | block_timestamp = int(record[11]) 79 | if from_address == "" or to_address == "": 80 | error_trans.append(trans) 81 | continue 82 | try: 83 | eoa2seq_in[to_address].append([from_address, block_number, block_timestamp, value, "IN", 1]) # not process trans 84 | except: 85 | eoa2seq_in[to_address] = [[from_address, block_number, block_timestamp, value, "IN", 1]] # in/out, cnt 86 | return eoa2seq_in, eoa2seq_out 87 | 88 | def seq_duplicate(eoa2seq_in, eoa2seq_out): 89 | eoa2seq_agg_in = {} 90 | for eoa in eoa2seq_in.keys(): 91 | if len(eoa2seq_in[eoa]) >= 10000: 92 | continue 93 | seq_sorted = sorted(eoa2seq_in[eoa], key=functools.cmp_to_key(cmp_udf)) 94 | seq_tmp = [e.copy() for e in seq_sorted] 95 | for i in range(len(seq_tmp) - 1, 0, -1): 96 | l_acc = seq_tmp[i][0] # latter 97 | f_acc = seq_tmp[i - 1][0] # former 98 | l_time = int(seq_tmp[i][2]) 99 | f_time = int(seq_tmp[i - 1][2]) 100 | delta_time = l_time - f_time 101 | if f_acc != l_acc or delta_time > 86400 * 3: 102 | continue 103 | # value add 104 | seq_tmp[i - 1][3] += seq_tmp[i][3] 105 | seq_tmp[i - 1][5] += seq_tmp[i][5] 106 | del seq_tmp[i] 107 | eoa2seq_agg_in[eoa] = seq_tmp 108 | 109 | eoa2seq_agg_out = {} 110 | for eoa in eoa2seq_out.keys(): 111 | if len(eoa2seq_out[eoa])>=10000: 112 | continue 113 | seq_sorted = sorted(eoa2seq_out[eoa], key=functools.cmp_to_key(cmp_udf)) 114 | seq_tmp = [e.copy() for e in seq_sorted] 115 | for i in range(len(seq_tmp) - 1, 0, -1): 116 | l_acc = seq_tmp[i][0] # latter 117 | f_acc = seq_tmp[i - 1][0] # former 118 | l_time = int(seq_tmp[i][2]) 119 | f_time = int(seq_tmp[i - 1][2]) 120 | delta_time = l_time - f_time 121 | if f_acc != l_acc or delta_time > 86400 * 3: 122 | continue 123 | # value add 124 | seq_tmp[i - 1][3] += seq_tmp[i][3] 125 | seq_tmp[i - 1][5] += seq_tmp[i][5] 126 | del seq_tmp[i] 127 | eoa2seq_agg_out[eoa] = seq_tmp 128 | 129 | eoa_list = list(eoa2seq_agg_out.keys()) # eoa_list must include eoa account only (i.e., have out transaction at least) 130 | eoa2seq_agg = {} 131 | 132 | for eoa in eoa_list: 133 | out_seq = eoa2seq_agg_out[eoa] 134 | try: 135 | in_seq = eoa2seq_agg_in[eoa] 136 | except: 137 | in_seq = [] 138 | 139 | seq_agg = sorted(out_seq + in_seq, key=functools.cmp_to_key(cmp_udf_reverse)) 140 | cnt_all = 0 141 | for trans in seq_agg: 142 | cnt_all += trans[5] 143 | if cnt_all >= 5 and cnt_all<=10000: 144 | # if cnt_all > 2 and cnt_all<=10000: 145 | eoa2seq_agg[eoa] = seq_agg 146 | break 147 | 148 | return eoa2seq_agg 149 | 150 | def seq_generation(eoa2seq_in, eoa2seq_out): 151 | 152 | eoa_list = list(eoa2seq_out.keys()) # eoa_list must include eoa account only (i.e., have out transaction at least) 153 | eoa2seq = {} 154 | for eoa in eoa_list: 155 | out_seq = eoa2seq_out[eoa] 156 | try: 157 | in_seq = eoa2seq_in[eoa] 158 | except: 159 | in_seq = [] 160 | seq_agg = sorted(out_seq + in_seq, key=functools.cmp_to_key(cmp_udf_reverse)) 161 | cnt_all = 0 162 | for trans in seq_agg: 163 | cnt_all += 1 164 | if cnt_all >= 5 and cnt_all<=10000: 165 | # if cnt_all > 2 and cnt_all<=10000: 166 | eoa2seq[eoa] = seq_agg 167 | break 168 | 169 | return eoa2seq 170 | 171 | def feature_bucketization(eoa2seq_agg): 172 | 173 | for eoa in eoa2seq_agg.keys(): 174 | seq = eoa2seq_agg[eoa] 175 | for trans in seq: 176 | amount = trans[3] 177 | cnt = trans[5] 178 | 179 | 
if amount == 0: 180 | amount_bucket = 1 181 | elif amount<= 591: 182 | amount_bucket = 2 183 | elif amount<= 6195: 184 | amount_bucket = 3 185 | elif amount <= 21255: 186 | amount_bucket = 4 187 | elif amount <= 50161: 188 | amount_bucket = 5 189 | elif amount <= 100120: 190 | amount_bucket = 6 191 | elif amount <= 208727: 192 | amount_bucket = 7 193 | elif amount <= 508961: 194 | amount_bucket = 8 195 | elif amount <= 1360574: 196 | amount_bucket = 9 197 | elif amount <= 6500000: 198 | amount_bucket = 10 199 | elif amount <= 143791433950: 200 | amount_bucket = 11 201 | else: 202 | amount_bucket = 12 203 | 204 | trans[3] = amount_bucket 205 | 206 | if cnt == 0: 207 | cnt_bucket = 0 208 | elif cnt == 1: 209 | cnt_bucket = 1 210 | elif cnt == 2: 211 | cnt_bucket = 2 212 | elif cnt == 3: 213 | cnt_bucket = 3 214 | elif cnt == 4: 215 | cnt_bucket = 4 216 | elif cnt == 5: 217 | cnt_bucket = 5 218 | elif cnt == 6: 219 | cnt_bucket = 6 220 | elif cnt == 7: 221 | cnt_bucket = 7 222 | elif 8 < cnt <= 10: 223 | cnt_bucket = 8 224 | elif 10 < cnt <= 20: 225 | cnt_bucket = 9 226 | else: 227 | cnt_bucket = 10 228 | 229 | trans[5] = cnt_bucket 230 | 231 | return eoa2seq_agg 232 | 233 | def main(): 234 | 235 | if FLAGS.dataset in ("1000K", "1M"): 236 | f_in = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_in_slice_1000K.csv"), "r") 237 | f_out = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_out_slice_1000K.csv"), "r") 238 | 239 | elif FLAGS.dataset in ("3000K", "3M"): 240 | f_in = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_in_slice_3000K.csv"), "r") 241 | f_out = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_out_slice_3000K.csv"), "r") 242 | 243 | elif FLAGS.dataset in ("10M"): 244 | f_in = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_in_slice.csv"), "r") 245 | f_out = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_out_slice.csv"), "r") 246 | 247 | else: 248 | raise ValueError("Please choose right dataset") 249 | 250 | print("Add normal " + FLAGS.dataset) 251 | 252 | eoa2seq_in, eoa2seq_out = load_data(f_in, f_out) 253 | 254 | if FLAGS.dup: 255 | eoa2seq_agg = seq_duplicate(eoa2seq_in, eoa2seq_out) 256 | else: 257 | eoa2seq_agg = seq_generation(eoa2seq_in, eoa2seq_out) 258 | 259 | if FLAGS.phisher: 260 | print("Add phishing..") 261 | phisher_f_in = open(os.path.join(FLAGS.data_dir, "phisher_transaction_in.csv"), "r") 262 | phisher_f_out = open(os.path.join(FLAGS.data_dir, "phisher_transaction_out.csv"), "r") 263 | phisher_eoa2seq_in, phisher_eoa2seq_out = load_data(phisher_f_in, phisher_f_out) 264 | 265 | if FLAGS.dup: 266 | phisher_eoa2seq_agg = seq_duplicate(phisher_eoa2seq_in, phisher_eoa2seq_out) 267 | else: 268 | phisher_eoa2seq_agg = seq_generation(phisher_eoa2seq_in, phisher_eoa2seq_out) 269 | 270 | eoa2seq_agg.update(phisher_eoa2seq_agg) 271 | 272 | eoa2seq_agg = feature_bucketization(eoa2seq_agg) 273 | 274 | print("statistics:") 275 | length_list = [] 276 | for eoa in eoa2seq_agg.keys(): 277 | seq = eoa2seq_agg[eoa] 278 | length_list.append(len(seq)) 279 | 280 | length_list = np.array(length_list) 281 | print("Median:", np.median(length_list)) 282 | print("Mean:", np.mean(length_list)) 283 | print("Seq #:", len(length_list)) 284 | 285 | tf.gfile.MakeDirs("./data") 286 | 287 | with open("./data/eoa2seq.pkl", "wb") as f: 288 | pkl.dump(eoa2seq_agg, f) 289 | 290 | 291 | print("pause") 292 | 293 | if __name__ == '__main__': 294 | main() -------------------------------------------------------------------------------- 
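Before moving on to modeling.py, here is a minimal, self-contained sketch of the aggregation rule that seq_duplicate() above implements: consecutive transactions with the same counterparty that fall within a three-day window are merged, summing their transferred value and count. The helper name merge_repeats, its signature, and the demo rows are illustrative only and are not part of the repository.

# Token layout produced by load_data():
#   [counterparty, block_number, block_timestamp, value, "IN"/"OUT", count]

def merge_repeats(seq, window=3 * 86400):
    """Merge adjacent same-counterparty transactions that occur within `window` seconds."""
    seq = sorted((list(t) for t in seq), key=lambda t: int(t[2]))  # oldest first
    merged = []
    prev_ts = None
    for trans in seq:
        if merged and merged[-1][0] == trans[0] and int(trans[2]) - prev_ts <= window:
            merged[-1][3] += trans[3]  # accumulate transferred value
            merged[-1][5] += trans[5]  # accumulate transaction count
        else:
            merged.append(trans)
        prev_ts = int(trans[2])
    return merged

if __name__ == "__main__":
    demo = [
        ["0xabc", 100, 1600000000, 1.5, "OUT", 1],
        ["0xabc", 101, 1600050000, 2.0, "OUT", 1],  # same counterparty, well under 3 days later
        ["0xdef", 102, 1601000000, 0.7, "IN", 1],
    ]
    print(merge_repeats(demo))
    # [['0xabc', 100, 1600000000, 3.5, 'OUT', 2], ['0xdef', 102, 1601000000, 0.7, 'IN', 1]]

In the repository this merging is applied to the in- and out-sequences separately; the combined sequence is then ordered newest-first via cmp_udf_reverse, the value and count fields are discretized by feature_bucketization, and the result is pickled to ./data/eoa2seq.pkl for gen_pretrain_data.py and gen_finetune_data.py to consume.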
/Model/modeling.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """The main BERT model and related functions.""" 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import collections 8 | import copy 9 | import json 10 | import math 11 | import re 12 | import six 13 | import numpy as np 14 | # import tensorflow as tf 15 | import tensorflow.compat.v1 as tf 16 | tf.disable_v2_behavior() 17 | 18 | 19 | class BertConfig(object): 20 | """Configuration for `BertModel`.""" 21 | 22 | def __init__(self, 23 | vocab_size, 24 | hidden_size=768, 25 | num_hidden_layers=12, 26 | num_attention_heads=12, 27 | intermediate_size=3072, 28 | hidden_act="gelu", 29 | hidden_dropout_prob=0.1, 30 | attention_probs_dropout_prob=0.1, 31 | max_position_embeddings=512, 32 | type_vocab_size=16, 33 | initializer_range=0.02): 34 | """Constructs BertConfig. 35 | 36 | Args: 37 | vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. 38 | hidden_size: Size of the encoder layers and the pooler layer. 39 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 40 | num_attention_heads: Number of attention heads for each attention layer in 41 | the Transformer encoder. 42 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 43 | layer in the Transformer encoder. 44 | hidden_act: The non-linear activation function (function or string) in the 45 | encoder and pooler. 46 | hidden_dropout_prob: The dropout probability for all fully connected 47 | layers in the embeddings, encoder, and pooler. 48 | attention_probs_dropout_prob: The dropout ratio for the attention 49 | probabilities. 50 | max_position_embeddings: The maximum sequence length that this model might 51 | ever be used with. Typically set this to something large just in case 52 | (e.g., 512 or 1024 or 2048). 53 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 54 | `BertModel`. 55 | initializer_range: The stdev of the truncated_normal_initializer for 56 | initializing all weight matrices. 
57 | """ 58 | self.vocab_size = vocab_size 59 | self.hidden_size = hidden_size 60 | self.num_hidden_layers = num_hidden_layers 61 | self.num_attention_heads = num_attention_heads 62 | self.hidden_act = hidden_act 63 | self.intermediate_size = intermediate_size 64 | self.hidden_dropout_prob = hidden_dropout_prob 65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 66 | self.max_position_embeddings = max_position_embeddings 67 | self.type_vocab_size = type_vocab_size 68 | self.initializer_range = initializer_range 69 | 70 | @classmethod 71 | def from_dict(cls, json_object): 72 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 73 | config = BertConfig(vocab_size=None) 74 | for (key, value) in six.iteritems(json_object): 75 | config.__dict__[key] = value 76 | return config 77 | 78 | @classmethod 79 | def from_json_file(cls, json_file): 80 | """Constructs a `BertConfig` from a json file of parameters.""" 81 | with tf.gfile.GFile(json_file, "r") as reader: 82 | text = reader.read() 83 | return cls.from_dict(json.loads(text)) 84 | 85 | def to_dict(self): 86 | """Serializes this instance to a Python dictionary.""" 87 | output = copy.deepcopy(self.__dict__) 88 | return output 89 | 90 | def to_json_string(self): 91 | """Serializes this instance to a JSON string.""" 92 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 93 | 94 | 95 | class BertModel(object): 96 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 97 | 98 | Example usage: 99 | 100 | ```python 101 | # Already been converted into WordPiece token ids 102 | input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) 103 | input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) 104 | token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) 105 | 106 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 107 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 108 | 109 | model = modeling.BertModel(config=config, is_training=True, 110 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) 111 | 112 | label_embeddings = tf.get_variable(...) 113 | logits = tf.matmul(pooled_output, label_embeddings) 114 | ... 115 | ``` 116 | """ 117 | def __init__(self, 118 | config, 119 | is_training, 120 | input_ids, 121 | input_positions, 122 | input_io_flags, 123 | input_amounts, 124 | input_counts, 125 | input_mask=None, 126 | token_type_ids=None, 127 | use_one_hot_embeddings=False, 128 | cross_share=False, 129 | scope="bert"): 130 | """Constructor for BertModel. 131 | 132 | Args: 133 | config: `BertConfig` instance. 134 | is_training: bool. rue for training model, false for eval model. Controls 135 | whether dropout will be applied. 136 | input_ids: int32 Tensor of shape [batch_size, seq_length]. 137 | input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. 138 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 139 | use_one_hot_embeddings: (optional) bool. Whether to use one-hot word 140 | embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, 141 | it is must faster if this is True, on the CPU or GPU, it is faster if 142 | this is False. 143 | scope: (optional) variable scope. Defaults to "bert". 144 | 145 | Raises: 146 | ValueError: The config is invalid or one of the input tensor shapes 147 | is invalid. 
148 | """ 149 | config = copy.deepcopy(config) 150 | if not is_training: 151 | config.hidden_dropout_prob = 0.0 152 | config.attention_probs_dropout_prob = 0.0 153 | 154 | input_shape = get_shape_list(input_ids, expected_rank=2) 155 | batch_size = input_shape[0] 156 | seq_length = input_shape[1] 157 | 158 | if input_mask is None: 159 | input_mask = tf.ones( 160 | shape=[batch_size, seq_length], dtype=tf.int32) 161 | 162 | if token_type_ids is None: 163 | token_type_ids = tf.zeros( 164 | shape=[batch_size, seq_length], dtype=tf.int32) 165 | 166 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 167 | with tf.variable_scope("embeddings"): 168 | # Perform embedding lookup on the word ids. 169 | 170 | self.embedding_output, self.embedding_table_list, self.factorize_table_list = \ 171 | embedding_lookup(input_ids=input_ids, 172 | vocab_size=config.vocab_size, 173 | embedding_size=config.hidden_size, 174 | bucket_list=config.bucket_list, 175 | factor_list=config.factor_list, 176 | initializer_range=config.initializer_range, 177 | word_embedding_name="word_embeddings", 178 | use_one_hot_embeddings=use_one_hot_embeddings) 179 | 180 | self.embedding_output = feature_encoder(input_tensor=self.embedding_output, 181 | input_io_flags=input_io_flags, 182 | input_amounts=input_amounts, 183 | input_counts=input_counts, 184 | initializer_range=config.initializer_range) 185 | 186 | # Add positional embeddings and token type embeddings, then layer 187 | # normalize and perform dropout. 188 | self.embedding_output = embedding_postprocessor( 189 | input_tensor=self.embedding_output, 190 | input_positions=input_positions, 191 | initializer_range=config.initializer_range, 192 | max_position_embeddings=config.max_position_embeddings, 193 | dropout_prob=config.hidden_dropout_prob, 194 | is_sinusoidal=False) 195 | 196 | with tf.variable_scope("encoder"): 197 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 198 | # mask of shape [batch_size, seq_length, seq_length] which is used 199 | # for the attention scores. 200 | attention_mask = create_attention_mask_from_input_mask( 201 | input_ids, input_mask) 202 | 203 | # Run the stacked transformer. 204 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 
205 | if cross_share: 206 | self.all_encoder_layers = transformer_model_cross_share( 207 | input_tensor=self.embedding_output, 208 | attention_mask=attention_mask, 209 | hidden_size=config.hidden_size, 210 | num_hidden_layers=config.num_hidden_layers, 211 | num_attention_heads=config.num_attention_heads, 212 | intermediate_size=config.intermediate_size, 213 | intermediate_act_fn=get_activation(config.hidden_act), 214 | hidden_dropout_prob=config.hidden_dropout_prob, 215 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 216 | initializer_range=config.initializer_range, 217 | do_return_all_layers=True) 218 | 219 | else: 220 | self.all_encoder_layers = transformer_model( 221 | input_tensor=self.embedding_output, 222 | attention_mask=attention_mask, 223 | hidden_size=config.hidden_size, 224 | num_hidden_layers=config.num_hidden_layers, 225 | num_attention_heads=config.num_attention_heads, 226 | intermediate_size=config.intermediate_size, 227 | intermediate_act_fn=get_activation(config.hidden_act), 228 | hidden_dropout_prob=config.hidden_dropout_prob, 229 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 230 | initializer_range=config.initializer_range, 231 | do_return_all_layers=True) 232 | 233 | self.sequence_output = self.all_encoder_layers[-1] 234 | 235 | 236 | def get_pooled_output(self): 237 | return self.pooled_output 238 | 239 | def get_sequence_output(self): 240 | """Gets final hidden layer of encoder. 241 | 242 | Returns: 243 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 244 | to the final hidden of the transformer encoder. 245 | """ 246 | return self.sequence_output 247 | 248 | def get_all_encoder_layers(self): 249 | return self.all_encoder_layers 250 | 251 | def get_embedding_output(self): 252 | """Gets output of the embedding lookup (i.e., input to the transformer). 253 | 254 | Returns: 255 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 256 | to the output of the embedding layer, after summing the word 257 | embeddings with the positional embeddings and the token type embeddings, 258 | then performing layer normalization. This is the input to the transformer. 259 | """ 260 | return self.embedding_output 261 | 262 | def get_embedding_table(self): 263 | return self.embedding_table_list, self.factorize_table_list 264 | 265 | 266 | def gelu(input_tensor): 267 | """Gaussian Error Linear Unit. 268 | 269 | This is a smoother version of the RELU. 270 | Original paper: https://arxiv.org/abs/1606.08415 271 | 272 | Args: 273 | input_tensor: float Tensor to perform activation. 274 | 275 | Returns: 276 | `input_tensor` with the GELU activation applied. 277 | """ 278 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 279 | return input_tensor * cdf 280 | 281 | 282 | def get_activation(activation_string): 283 | """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. 284 | 285 | Args: 286 | activation_string: String name of the activation function. 287 | 288 | Returns: 289 | A Python function corresponding to the activation function. If 290 | `activation_string` is None, empty, or "linear", this will return None. 291 | If `activation_string` is not a string, it will return `activation_string`. 292 | 293 | Raises: 294 | ValueError: The `activation_string` does not correspond to a known 295 | activation. 296 | """ 297 | 298 | # We assume that anything that"s not a string is already an activation 299 | # function, so we just return it. 
300 | if not isinstance(activation_string, six.string_types): 301 | return activation_string 302 | 303 | if not activation_string: 304 | return None 305 | 306 | act = activation_string.lower() 307 | if act == "linear": 308 | return None 309 | elif act == "relu": 310 | return tf.nn.relu 311 | elif act == "gelu": 312 | return gelu 313 | elif act == "tanh": 314 | return tf.tanh 315 | else: 316 | raise ValueError("Unsupported activation: %s" % act) 317 | 318 | 319 | def get_assignment_map_from_checkpoint(tvars, init_checkpoint): 320 | """Compute the union of the current variables and checkpoint variables.""" 321 | assignment_map = {} 322 | initialized_variable_names = {} 323 | 324 | name_to_variable = collections.OrderedDict() 325 | for var in tvars: 326 | name = var.name 327 | m = re.match("^(.*):\\d+$", name) 328 | if m is not None: 329 | name = m.group(1) 330 | name_to_variable[name] = var 331 | 332 | init_vars = tf.train.list_variables(init_checkpoint) 333 | 334 | assignment_map = collections.OrderedDict() 335 | for x in init_vars: 336 | (name, var) = (x[0], x[1]) 337 | if name not in name_to_variable: 338 | continue 339 | assignment_map[name] = name 340 | initialized_variable_names[name] = 1 341 | initialized_variable_names[name + ":0"] = 1 342 | 343 | return (assignment_map, initialized_variable_names) 344 | 345 | 346 | def dropout(input_tensor, dropout_prob): 347 | """Perform dropout. 348 | 349 | Args: 350 | input_tensor: float Tensor. 351 | dropout_prob: Python float. The probability of dropping out a value (NOT of 352 | *keeping* a dimension as in `tf.nn.dropout`). 353 | 354 | Returns: 355 | A version of `input_tensor` with dropout applied. 356 | """ 357 | if dropout_prob is None or dropout_prob == 0.0: 358 | return input_tensor 359 | 360 | output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) 361 | return output 362 | 363 | 364 | def layer_norm(input_tensor, name=None): 365 | """Run layer normalization on the last dimension of the tensor.""" 366 | epsilon = 1e-6 367 | filters = input_tensor.get_shape()[-1] 368 | with tf.variable_scope("layer_norm"): 369 | scale = tf.get_variable("layer_norm_scale", [filters], initializer=tf.ones_initializer()) 370 | bias = tf.get_variable("layer_norm_bias", [filters], initializer=tf.zeros_initializer()) 371 | 372 | mean = tf.reduce_mean(input_tensor, axis=-1, keep_dims=True) 373 | variance = tf.reduce_mean(tf.square(input_tensor-mean), axis=-1, keep_dims=True) 374 | input_tensor = (input_tensor - mean) * tf.rsqrt(variance + epsilon) 375 | input_tensor = input_tensor * scale + bias 376 | 377 | return input_tensor 378 | 379 | 380 | def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): 381 | """Runs layer normalization followed by dropout.""" 382 | output_tensor = layer_norm(input_tensor, name) 383 | output_tensor = dropout(output_tensor, dropout_prob) 384 | return output_tensor 385 | 386 | 387 | def create_initializer(initializer_range=0.02): 388 | """Creates a `truncated_normal_initializer` with the given range.""" 389 | return tf.truncated_normal_initializer(stddev=initializer_range) 390 | 391 | 392 | def embedding_lookup(input_ids, 393 | vocab_size, 394 | embedding_size, 395 | bucket_list, 396 | factor_list, 397 | initializer_range=0.02, 398 | word_embedding_name="word_embeddings", 399 | use_one_hot_embeddings=False): 400 | """Looks up words embeddings for id tensor. 401 | 402 | Args: 403 | input_ids: int32 Tensor of shape [batch_size, seq_length] containing word 404 | ids. 405 | vocab_size: int. Size of the embedding vocabulary. 
406 | embedding_size: int. Width of the word embeddings. 407 | initializer_range: float. Embedding initialization range. 408 | word_embedding_name: string. Name of the embedding table. 409 | use_one_hot_embeddings: bool. If True, use one-hot method for word 410 | embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better 411 | for TPUs. 412 | 413 | Returns: 414 | float Tensor of shape [batch_size, seq_length, embedding_size]. 415 | """ 416 | # This function assumes that the input is of shape [batch_size, seq_length, 417 | # num_inputs]. 418 | # 419 | # If the input is a 2D tensor of shape [batch_size, seq_length], we 420 | # reshape to [batch_size, seq_length, 1]. why? 421 | # if input_ids.shape.ndims == 2: 422 | # input_ids = tf.expand_dims(input_ids, axis=[-1]) 423 | 424 | embedding_table_list = [] 425 | factorize_table_list = [] 426 | 427 | for i in range(len(bucket_list)): 428 | bucket = bucket_list[i] 429 | lower_bound = bucket[0] 430 | upper_bound = bucket[1] 431 | 432 | factor_size = factor_list[i] 433 | 434 | embedding_table = tf.get_variable( 435 | name=word_embedding_name + "_" + str(i), 436 | shape=[upper_bound-lower_bound, factor_size], 437 | initializer=create_initializer(initializer_range)) 438 | 439 | embedding_table_list.append(embedding_table) 440 | 441 | factor_table = tf.get_variable( 442 | name="factor_table" + "_" + str(i), 443 | shape=[factor_size, embedding_size], 444 | initializer=create_initializer(initializer_range) 445 | ) 446 | factorize_table_list.append(factor_table) 447 | 448 | output = new_embedding_lookup(input_ids, bucket_list, embedding_table_list, factorize_table_list) 449 | 450 | return (output, embedding_table_list, factorize_table_list) 451 | 452 | 453 | def new_embedding_lookup(input_ids, bucket_list, embedding_table_list, factorize_table_list): 454 | 455 | embedding_size = get_shape_list(embedding_table_list[0])[-1] 456 | output_list = [] 457 | for i in range(len(bucket_list)): 458 | 459 | embedding_table = embedding_table_list[i] 460 | factorize_table = factorize_table_list[i] 461 | 462 | bucket = bucket_list[i] 463 | lower_bound = bucket[0] 464 | upper_bound = bucket[1] 465 | 466 | mask1 = tf.cast(tf.greater_equal(input_ids, lower_bound), tf.int32) 467 | mask2 = tf.cast(tf.less(input_ids, upper_bound), tf.int32) 468 | 469 | mask = mask1 * mask2 470 | mask_2d = tf.cast(tf.tile(tf.expand_dims(mask, axis=2), multiples=[1, 1, embedding_size]), tf.float32) 471 | embedding_output = tf.nn.embedding_lookup(embedding_table, (input_ids - lower_bound) * mask) 472 | 473 | if i == 0: 474 | output = embedding_output * mask_2d 475 | else: 476 | output = tf.matmul(embedding_output, factorize_table) * mask_2d 477 | 478 | output_list.append(output) 479 | 480 | final_output = tf.reduce_sum(output_list, axis=0) 481 | 482 | return final_output 483 | 484 | 485 | def feature_encoder(input_tensor, 486 | input_io_flags, 487 | input_amounts, 488 | input_counts, 489 | initializer_range=0.02): 490 | 491 | input_shape = get_shape_list(input_tensor, expected_rank=3) 492 | batch_size = input_shape[0] 493 | seq_length = input_shape[1] 494 | width = input_shape[2] 495 | 496 | io_embedding_table = tf.get_variable( 497 | name="io_embeddings", 498 | shape=[3, width], 499 | initializer=create_initializer(initializer_range)) 500 | io_embeddings = tf.nn.embedding_lookup(io_embedding_table, input_io_flags) 501 | 502 | amount_embedding_table = tf.get_variable( 503 | name="amount_embeddings", 504 | shape=[15, width], 505 | initializer=create_initializer(initializer_range)) 
506 | amount_embeddings = tf.nn.embedding_lookup(amount_embedding_table, input_amounts) 507 | 508 | count_embedding_table = tf.get_variable( 509 | name="count_embeddings", 510 | shape=[15, width], 511 | initializer=create_initializer(initializer_range)) 512 | count_embeddings = tf.nn.embedding_lookup(count_embedding_table, input_counts) 513 | 514 | output = input_tensor + io_embeddings + amount_embeddings + count_embeddings 515 | 516 | return output 517 | 518 | 519 | def embedding_postprocessor(input_tensor, 520 | input_positions=None, 521 | initializer_range=0.02, 522 | max_position_embeddings=512, 523 | dropout_prob=0.1, 524 | is_sinusoidal=False): 525 | """Performs various post-processing on a word embedding tensor. 526 | 527 | Args: 528 | input_tensor: float Tensor of shape [batch_size, seq_length, 529 | embedding_size]. 530 | use_token_type: bool. Whether to add embeddings for `token_type_ids`. 531 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 532 | Must be specified if `use_token_type` is True. 533 | token_type_vocab_size: int. The vocabulary size of `token_type_ids`. 534 | token_type_embedding_name: string. The name of the embedding table variable 535 | for token type ids. 536 | use_position_embeddings: bool. Whether to add position embeddings for the 537 | position of each token in the sequence. 538 | position_embedding_name: string. The name of the embedding table variable 539 | for positional embeddings. 540 | initializer_range: float. Range of the weight initialization. 541 | max_position_embeddings: int. Maximum sequence length that might ever be 542 | used with this model. This can be longer than the sequence length of 543 | input_tensor, but cannot be shorter. 544 | dropout_prob: float. Dropout probability applied to the final output tensor. 545 | 546 | Returns: 547 | float tensor with same shape as `input_tensor`. 548 | 549 | Raises: 550 | ValueError: One of the tensor shapes or input values is invalid. 551 | """ 552 | input_shape = get_shape_list(input_tensor, expected_rank=3) 553 | batch_size = input_shape[0] 554 | seq_length = input_shape[1] 555 | width = input_shape[2] 556 | output = input_tensor 557 | 558 | if is_sinusoidal: 559 | 560 | PE_embedding = [] 561 | for pos in range(max_position_embeddings): 562 | pos_i_tmp = [] 563 | for i in range(width): 564 | if i % 2 == 0: 565 | encoding = math.sin(pos / (pow(1000, i / width))) 566 | pos_i_tmp.append(encoding) 567 | else: 568 | encoding = math.cos(pos / (pow(1000, (i - 1) / width))) 569 | pos_i_tmp.append(encoding) 570 | PE_embedding.append(pos_i_tmp) 571 | 572 | PE_embedding = np.array(PE_embedding) 573 | position_embedding_table = tf.constant(PE_embedding, 574 | name="position_embeddings", 575 | dtype=tf.float32) 576 | print("===========Positional Embedding=============") 577 | print(position_embedding_table) 578 | 579 | else: 580 | position_embedding_table = tf.get_variable( 581 | name="position_embeddings", 582 | shape=[max_position_embeddings, width], 583 | initializer=create_initializer(initializer_range)) 584 | 585 | # This vocab will be small so we always do one-hot here, since it is always 586 | position_embeddings = tf.nn.embedding_lookup(position_embedding_table, input_positions) 587 | 588 | output += position_embeddings 589 | output = layer_norm_and_dropout(output, dropout_prob) 590 | return output 591 | 592 | def create_attention_mask_from_input_mask(from_tensor, to_mask): 593 | """Create 3D attention mask from a 2D tensor mask. 
594 | 595 | Args: 596 | from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. 597 | to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 598 | 599 | Returns: 600 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 601 | """ 602 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 603 | batch_size = from_shape[0] 604 | from_seq_length = from_shape[1] 605 | 606 | to_shape = get_shape_list(to_mask, expected_rank=2) 607 | to_seq_length = to_shape[1] 608 | 609 | to_mask = tf.cast( 610 | tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) 611 | 612 | # We don't assume that `from_tensor` is a mask (although it could be). We 613 | # don't actually care if we attend *from* padding tokens (only *to* padding) 614 | # tokens so we create a tensor of all ones. 615 | # 616 | # `broadcast_ones` = [batch_size, from_seq_length, 1] 617 | broadcast_ones = tf.ones( 618 | shape=[batch_size, from_seq_length, 1], dtype=tf.float32) 619 | 620 | # Here we broadcast along two dimensions to create the mask. 621 | mask = broadcast_ones * to_mask 622 | 623 | return mask 624 | 625 | 626 | def attention_layer(layer_idx, 627 | from_tensor, 628 | to_tensor, 629 | attention_mask=None, 630 | num_attention_heads=1, 631 | size_per_head=512, 632 | query_act=None, 633 | key_act=None, 634 | value_act=None, 635 | attention_probs_dropout_prob=0.0, 636 | initializer_range=0.02, 637 | do_return_2d_tensor=False, 638 | batch_size=None, 639 | from_seq_length=None, 640 | to_seq_length=None): 641 | """Performs multi-headed attention from `from_tensor` to `to_tensor`. 642 | 643 | This is an implementation of multi-headed attention based on "Attention 644 | is all you Need". If `from_tensor` and `to_tensor` are the same, then 645 | this is self-attention. Each timestep in `from_tensor` attends to the 646 | corresponding sequence in `to_tensor`, and returns a fixed-with vector. 647 | 648 | This function first projects `from_tensor` into a "query" tensor and 649 | `to_tensor` into "key" and "value" tensors. These are (effectively) a list 650 | of tensors of length `num_attention_heads`, where each tensor is of shape 651 | [batch_size, seq_length, size_per_head]. 652 | 653 | Then, the query and key tensors are dot-producted and scaled. These are 654 | softmaxed to obtain attention probabilities. The value tensors are then 655 | interpolated by these probabilities, then concatenated back to a single 656 | tensor and returned. 657 | 658 | In practice, the multi-headed attention are done with transposes and 659 | reshapes rather than actual separate tensors. 660 | 661 | Args: 662 | from_tensor: float Tensor of shape [batch_size, from_seq_length, 663 | from_width]. 664 | to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. 665 | attention_mask: (optional) int32 Tensor of shape [batch_size, 666 | from_seq_length, to_seq_length]. The values should be 1 or 0. The 667 | attention scores will effectively be set to -infinity for any positions in 668 | the mask that are 0, and will be unchanged for positions that are 1. 669 | num_attention_heads: int. Number of attention heads. 670 | size_per_head: int. Size of each attention head. 671 | query_act: (optional) Activation function for the query transform. 672 | key_act: (optional) Activation function for the key transform. 673 | value_act: (optional) Activation function for the value transform. 674 | attention_probs_dropout_prob: (optional) float. Dropout probability of the 675 | attention probabilities. 
676 | initializer_range: float. Range of the weight initializer. 677 | do_return_2d_tensor: bool. If True, the output will be of shape [batch_size 678 | * from_seq_length, num_attention_heads * size_per_head]. If False, the 679 | output will be of shape [batch_size, from_seq_length, num_attention_heads 680 | * size_per_head]. 681 | batch_size: (Optional) int. If the input is 2D, this might be the batch size 682 | of the 3D version of the `from_tensor` and `to_tensor`. 683 | from_seq_length: (Optional) If the input is 2D, this might be the seq length 684 | of the 3D version of the `from_tensor`. 685 | to_seq_length: (Optional) If the input is 2D, this might be the seq length 686 | of the 3D version of the `to_tensor`. 687 | 688 | Returns: 689 | float Tensor of shape [batch_size, from_seq_length, 690 | num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is 691 | true, this will be of shape [batch_size * from_seq_length, 692 | num_attention_heads * size_per_head]). 693 | 694 | Raises: 695 | ValueError: Any of the arguments or tensor shapes are invalid. 696 | """ 697 | 698 | def transpose_for_scores(input_tensor, batch_size, num_attention_heads, 699 | seq_length, width): 700 | output_tensor = tf.reshape( 701 | input_tensor, [batch_size, seq_length, num_attention_heads, width]) 702 | 703 | output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) 704 | return output_tensor 705 | 706 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 707 | to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) 708 | 709 | if len(from_shape) != len(to_shape): 710 | raise ValueError( 711 | "The rank of `from_tensor` must match the rank of `to_tensor`.") 712 | 713 | if len(from_shape) == 3: 714 | batch_size = from_shape[0] 715 | from_seq_length = from_shape[1] 716 | to_seq_length = to_shape[1] 717 | elif len(from_shape) == 2: 718 | if (batch_size is None or from_seq_length is None 719 | or to_seq_length is None): 720 | raise ValueError( 721 | "When passing in rank 2 tensors to attention_layer, the values " 722 | "for `batch_size`, `from_seq_length`, and `to_seq_length` " 723 | "must all be specified.") 724 | 725 | # Scalar dimensions referenced here: 726 | # B = batch size (number of sequences) 727 | # F = `from_tensor` sequence length 728 | # T = `to_tensor` sequence length 729 | # N = `num_attention_heads` 730 | # H = `size_per_head` 731 | 732 | from_tensor_2d = reshape_to_matrix(from_tensor) 733 | to_tensor_2d = reshape_to_matrix(to_tensor) 734 | 735 | # `query_layer` = [B*F, N*H] 736 | query_layer = tf.layers.dense( 737 | from_tensor_2d, 738 | num_attention_heads * size_per_head, 739 | activation=query_act, 740 | name="query", 741 | kernel_initializer=create_initializer(initializer_range)) 742 | 743 | # `key_layer` = [B*T, N*H] 744 | key_layer = tf.layers.dense( 745 | to_tensor_2d, 746 | num_attention_heads * size_per_head, 747 | activation=key_act, 748 | name="key", 749 | kernel_initializer=create_initializer(initializer_range)) 750 | 751 | # `value_layer` = [B*T, N*H] 752 | value_layer = tf.layers.dense( 753 | to_tensor_2d, 754 | num_attention_heads * size_per_head, 755 | activation=value_act, 756 | name="value", 757 | kernel_initializer=create_initializer(initializer_range)) 758 | 759 | # `query_layer` = [B, N, F, H] 760 | query_layer = transpose_for_scores(query_layer, batch_size, 761 | num_attention_heads, from_seq_length, 762 | size_per_head) 763 | 764 | # `key_layer` = [B, N, T, H] 765 | key_layer = transpose_for_scores(key_layer, batch_size, 766 | num_attention_heads, 
to_seq_length, 767 | size_per_head) 768 | 769 | # Take the dot product between "query" and "key" to get the raw 770 | # attention scores. 771 | # `attention_scores` = [B, N, F, T] 772 | attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) 773 | attention_scores = tf.multiply(attention_scores, 774 | 1.0 / math.sqrt(float(size_per_head))) 775 | 776 | if attention_mask is not None: 777 | # `attention_mask` = [B, 1, F, T] 778 | attention_mask = tf.expand_dims(attention_mask, axis=[1]) 779 | 780 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 781 | # masked positions, this operation will create a tensor which is 0.0 for 782 | # positions we want to attend and -10000.0 for masked positions. 783 | adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 784 | 785 | # Since we are adding it to the raw scores before the softmax, this is 786 | # effectively the same as removing these entirely. 787 | attention_scores += adder 788 | 789 | # Normalize the attention scores to probabilities. 790 | # `attention_probs` = [B, N, F, T] 791 | attention_probs = tf.nn.softmax(attention_scores) 792 | 793 | tf.add_to_collection("layer" + str(layer_idx) + "_attention_probs", attention_probs) 794 | 795 | # This is actually dropping out entire tokens to attend to, which might 796 | # seem a bit unusual, but is taken from the original Transformer paper. 797 | attention_probs = dropout(attention_probs, attention_probs_dropout_prob) 798 | 799 | # `value_layer` = [B, T, N, H] 800 | value_layer = tf.reshape( 801 | value_layer, 802 | [batch_size, to_seq_length, num_attention_heads, size_per_head]) 803 | 804 | # `value_layer` = [B, N, T, H] 805 | value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) 806 | 807 | # `context_layer` = [B, N, F, H] 808 | context_layer = tf.matmul(attention_probs, value_layer) 809 | 810 | # `context_layer` = [B, F, N, H] 811 | context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) 812 | 813 | if do_return_2d_tensor: 814 | # `context_layer` = [B*F, N*V] 815 | context_layer = tf.reshape(context_layer, [ 816 | batch_size * from_seq_length, num_attention_heads * size_per_head 817 | ]) 818 | else: 819 | # `context_layer` = [B, F, N*V] 820 | context_layer = tf.reshape( 821 | context_layer, 822 | [batch_size, from_seq_length, num_attention_heads * size_per_head]) 823 | 824 | return context_layer 825 | 826 | 827 | def transformer_model(input_tensor, 828 | attention_mask=None, 829 | hidden_size=768, 830 | num_hidden_layers=12, 831 | num_attention_heads=12, 832 | intermediate_size=3072, 833 | intermediate_act_fn=gelu, 834 | hidden_dropout_prob=0.1, 835 | attention_probs_dropout_prob=0.1, 836 | initializer_range=0.02, 837 | do_return_all_layers=False): 838 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 839 | This is almost an exact implementation of the original Transformer encoder. 840 | 841 | See the original paper: 842 | https://arxiv.org/abs/1706.03762 843 | 844 | Also see: 845 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 846 | 847 | Args: 848 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 849 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 850 | seq_length], with 1 for positions that can be attended to and 0 in 851 | positions that should not be. 852 | hidden_size: int. Hidden size of the Transformer. 853 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 854 | num_attention_heads: int. 
Number of attention heads in the Transformer. 855 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 856 | forward) layer. 857 | intermediate_act_fn: function. The non-linear activation function to apply 858 | to the output of the intermediate/feed-forward layer. 859 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 860 | attention_probs_dropout_prob: float. Dropout probability of the attention 861 | probabilities. 862 | initializer_range: float. Range of the initializer (stddev of truncated 863 | normal). 864 | do_return_all_layers: Whether to also return all layers or just the final 865 | layer. 866 | 867 | Returns: 868 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 869 | hidden layer of the Transformer. 870 | 871 | Raises: 872 | ValueError: A Tensor shape or parameter is invalid. 873 | """ 874 | if hidden_size % num_attention_heads != 0: 875 | raise ValueError( 876 | "The hidden size (%d) is not a multiple of the number of attention " 877 | "heads (%d)" % (hidden_size, num_attention_heads)) 878 | 879 | attention_head_size = int(hidden_size / num_attention_heads) 880 | input_shape = get_shape_list(input_tensor, expected_rank=3) 881 | batch_size = input_shape[0] 882 | seq_length = input_shape[1] 883 | input_width = input_shape[2] 884 | 885 | # The Transformer performs sum residuals on all layers so the input needs 886 | # to be the same as the hidden size. 887 | if input_width != hidden_size: 888 | raise ValueError( 889 | "The width of the input tensor (%d) != hidden size (%d)" % 890 | (input_width, hidden_size)) 891 | 892 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 893 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 894 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 895 | # help the optimizer. 896 | prev_output = reshape_to_matrix(input_tensor) 897 | 898 | all_layer_outputs = [] 899 | for layer_idx in range(num_hidden_layers): 900 | with tf.variable_scope("layer_%d" % layer_idx): 901 | layer_input = prev_output 902 | 903 | with tf.variable_scope("attention"): 904 | attention_heads = [] 905 | with tf.variable_scope("self"): 906 | attention_head = attention_layer( 907 | layer_idx=layer_idx, 908 | from_tensor=layer_input, 909 | to_tensor=layer_input, 910 | attention_mask=attention_mask, 911 | num_attention_heads=num_attention_heads, 912 | size_per_head=attention_head_size, 913 | attention_probs_dropout_prob= 914 | attention_probs_dropout_prob, 915 | initializer_range=initializer_range, 916 | do_return_2d_tensor=True, 917 | batch_size=batch_size, 918 | from_seq_length=seq_length, 919 | to_seq_length=seq_length) 920 | attention_heads.append(attention_head) 921 | 922 | attention_output = None 923 | if len(attention_heads) == 1: 924 | attention_output = attention_heads[0] 925 | else: 926 | # In the case where we have other sequences, we just concatenate 927 | # them to the self-attention head before the projection. 928 | attention_output = tf.concat(attention_heads, axis=-1) 929 | 930 | # Run a linear projection of `hidden_size` then add a residual 931 | # with `layer_input`. 
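                # i.e. attention_output = LayerNorm(dropout(dense(heads)) + layer_input),
                # the standard post-layer-norm residual sub-layer of the Transformer encoder.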
932 | with tf.variable_scope("output"): 933 | attention_output = tf.layers.dense( 934 | attention_output, 935 | hidden_size, 936 | kernel_initializer=create_initializer( 937 | initializer_range)) 938 | attention_output = dropout(attention_output, 939 | hidden_dropout_prob) 940 | attention_output = layer_norm(attention_output + 941 | layer_input) 942 | 943 | # The activation is only applied to the "intermediate" hidden layer. 944 | with tf.variable_scope("intermediate"): 945 | intermediate_output = tf.layers.dense( 946 | attention_output, 947 | intermediate_size, 948 | activation=intermediate_act_fn, 949 | kernel_initializer=create_initializer(initializer_range)) 950 | 951 | # Down-project back to `hidden_size` then add the residual. 952 | with tf.variable_scope("output"): 953 | layer_output = tf.layers.dense( 954 | intermediate_output, 955 | hidden_size, 956 | kernel_initializer=create_initializer(initializer_range)) 957 | layer_output = dropout(layer_output, hidden_dropout_prob) 958 | layer_output = layer_norm(layer_output + attention_output) 959 | prev_output = layer_output 960 | all_layer_outputs.append(layer_output) 961 | 962 | if do_return_all_layers: 963 | final_outputs = [] 964 | for layer_output in all_layer_outputs: 965 | final_output = reshape_from_matrix(layer_output, input_shape) 966 | final_outputs.append(final_output) 967 | return final_outputs 968 | else: 969 | final_output = reshape_from_matrix(prev_output, input_shape) 970 | return final_output 971 | 972 | 973 | def transformer_model_cross_share(input_tensor, 974 | attention_mask=None, 975 | hidden_size=768, 976 | num_hidden_layers=12, 977 | num_attention_heads=12, 978 | intermediate_size=3072, 979 | intermediate_act_fn=gelu, 980 | hidden_dropout_prob=0.1, 981 | attention_probs_dropout_prob=0.1, 982 | initializer_range=0.02, 983 | do_return_all_layers=False): 984 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 985 | This is almost an exact implementation of the original Transformer encoder. 986 | 987 | See the original paper: 988 | https://arxiv.org/abs/1706.03762 989 | 990 | Also see: 991 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 992 | 993 | Args: 994 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 995 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 996 | seq_length], with 1 for positions that can be attended to and 0 in 997 | positions that should not be. 998 | hidden_size: int. Hidden size of the Transformer. 999 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 1000 | num_attention_heads: int. Number of attention heads in the Transformer. 1001 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 1002 | forward) layer. 1003 | intermediate_act_fn: function. The non-linear activation function to apply 1004 | to the output of the intermediate/feed-forward layer. 1005 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 1006 | attention_probs_dropout_prob: float. Dropout probability of the attention 1007 | probabilities. 1008 | initializer_range: float. Range of the initializer (stddev of truncated 1009 | normal). 1010 | do_return_all_layers: Whether to also return all layers or just the final 1011 | layer. 1012 | 1013 | Returns: 1014 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 1015 | hidden layer of the Transformer. 1016 | 1017 | Raises: 1018 | ValueError: A Tensor shape or parameter is invalid. 
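  Note: unlike `transformer_model`, every one of the `num_hidden_layers` blocks
  here is built inside a single "shared_layer" variable scope with
  reuse=tf.AUTO_REUSE, so one set of attention/feed-forward weights is shared
  across all layers (cross-layer parameter sharing, as in ALBERT).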
1019 | """ 1020 | if hidden_size % num_attention_heads != 0: 1021 | raise ValueError( 1022 | "The hidden size (%d) is not a multiple of the number of attention " 1023 | "heads (%d)" % (hidden_size, num_attention_heads)) 1024 | 1025 | attention_head_size = int(hidden_size / num_attention_heads) 1026 | input_shape = get_shape_list(input_tensor, expected_rank=3) 1027 | batch_size = input_shape[0] 1028 | seq_length = input_shape[1] 1029 | input_width = input_shape[2] 1030 | 1031 | # The Transformer performs sum residuals on all layers so the input needs 1032 | # to be the same as the hidden size. 1033 | if input_width != hidden_size: 1034 | raise ValueError( 1035 | "The width of the input tensor (%d) != hidden size (%d)" % 1036 | (input_width, hidden_size)) 1037 | 1038 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 1039 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 1040 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 1041 | # help the optimizer. 1042 | prev_output = reshape_to_matrix(input_tensor) 1043 | 1044 | all_layer_outputs = [] 1045 | for layer_idx in range(num_hidden_layers): 1046 | with tf.variable_scope("shared_layer", reuse=tf.AUTO_REUSE): 1047 | layer_input = prev_output 1048 | with tf.variable_scope("attention"): 1049 | attention_heads = [] 1050 | with tf.variable_scope("self"): 1051 | attention_head = attention_layer( 1052 | layer_idx=0, 1053 | from_tensor=layer_input, 1054 | to_tensor=layer_input, 1055 | attention_mask=attention_mask, 1056 | num_attention_heads=num_attention_heads, 1057 | size_per_head=attention_head_size, 1058 | attention_probs_dropout_prob= 1059 | attention_probs_dropout_prob, 1060 | initializer_range=initializer_range, 1061 | do_return_2d_tensor=True, 1062 | batch_size=batch_size, 1063 | from_seq_length=seq_length, 1064 | to_seq_length=seq_length) 1065 | attention_heads.append(attention_head) 1066 | 1067 | attention_output = None 1068 | if len(attention_heads) == 1: 1069 | attention_output = attention_heads[0] 1070 | else: 1071 | # In the case where we have other sequences, we just concatenate 1072 | # them to the self-attention head before the projection. 1073 | attention_output = tf.concat(attention_heads, axis=-1) 1074 | 1075 | # Run a linear projection of `hidden_size` then add a residual 1076 | # with `layer_input`. 1077 | with tf.variable_scope("output"): 1078 | attention_output = tf.layers.dense( 1079 | attention_output, 1080 | hidden_size, 1081 | kernel_initializer=create_initializer( 1082 | initializer_range)) 1083 | attention_output = dropout(attention_output, 1084 | hidden_dropout_prob) 1085 | attention_output = layer_norm(attention_output + 1086 | layer_input) 1087 | 1088 | # The activation is only applied to the "intermediate" hidden layer. 1089 | with tf.variable_scope("intermediate"): 1090 | intermediate_output = tf.layers.dense( 1091 | attention_output, 1092 | intermediate_size, 1093 | activation=intermediate_act_fn, 1094 | kernel_initializer=create_initializer(initializer_range)) 1095 | 1096 | # Down-project back to `hidden_size` then add the residual. 
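                # Because the enclosing "shared_layer" scope is opened with
                # tf.AUTO_REUSE, the dense kernels below are created on the first
                # loop iteration and reused (not re-created) by every later layer.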
1097 | with tf.variable_scope("output"): 1098 | layer_output = tf.layers.dense( 1099 | intermediate_output, 1100 | hidden_size, 1101 | kernel_initializer=create_initializer(initializer_range)) 1102 | layer_output = dropout(layer_output, hidden_dropout_prob) 1103 | layer_output = layer_norm(layer_output + attention_output) 1104 | prev_output = layer_output 1105 | all_layer_outputs.append(layer_output) 1106 | 1107 | if do_return_all_layers: 1108 | final_outputs = [] 1109 | for layer_output in all_layer_outputs: 1110 | final_output = reshape_from_matrix(layer_output, input_shape) 1111 | final_outputs.append(final_output) 1112 | return final_outputs 1113 | else: 1114 | final_output = reshape_from_matrix(prev_output, input_shape) 1115 | return final_output 1116 | 1117 | 1118 | 1119 | def get_shape_list(tensor, expected_rank=None, name=None): 1120 | """Returns a list of the shape of tensor, preferring static dimensions. 1121 | 1122 | Args: 1123 | tensor: A tf.Tensor object to find the shape of. 1124 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 1125 | specified and the `tensor` has a different rank, and exception will be 1126 | thrown. 1127 | name: Optional name of the tensor for the error message. 1128 | 1129 | Returns: 1130 | A list of dimensions of the shape of tensor. All static dimensions will 1131 | be returned as python integers, and dynamic dimensions will be returned 1132 | as tf.Tensor scalars. 1133 | """ 1134 | if name is None: 1135 | name = tensor.name 1136 | 1137 | if expected_rank is not None: 1138 | assert_rank(tensor, expected_rank, name) 1139 | 1140 | shape = tensor.shape.as_list() 1141 | 1142 | non_static_indexes = [] 1143 | for (index, dim) in enumerate(shape): 1144 | if dim is None: 1145 | non_static_indexes.append(index) 1146 | 1147 | if not non_static_indexes: 1148 | return shape 1149 | 1150 | dyn_shape = tf.shape(tensor) 1151 | for index in non_static_indexes: 1152 | shape[index] = dyn_shape[index] 1153 | return shape 1154 | 1155 | 1156 | def reshape_to_matrix(input_tensor): 1157 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 1158 | ndims = input_tensor.shape.ndims 1159 | if ndims < 2: 1160 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 1161 | (input_tensor.shape)) 1162 | if ndims == 2: 1163 | return input_tensor 1164 | 1165 | width = input_tensor.shape[-1] 1166 | output_tensor = tf.reshape(input_tensor, [-1, width]) 1167 | return output_tensor 1168 | 1169 | 1170 | def reshape_from_matrix(output_tensor, orig_shape_list): 1171 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 1172 | if len(orig_shape_list) == 2: 1173 | return output_tensor 1174 | 1175 | output_shape = get_shape_list(output_tensor) 1176 | 1177 | orig_dims = orig_shape_list[0:-1] 1178 | width = output_shape[-1] 1179 | 1180 | return tf.reshape(output_tensor, orig_dims + [width]) 1181 | 1182 | 1183 | def assert_rank(tensor, expected_rank, name=None): 1184 | """Raises an exception if the tensor rank is not of the expected rank. 1185 | 1186 | Args: 1187 | tensor: A tf.Tensor to check the rank of. 1188 | expected_rank: Python integer or list of integers, expected rank. 1189 | name: Optional name of the tensor for the error message. 1190 | 1191 | Raises: 1192 | ValueError: If the expected shape doesn't match the actual shape. 
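  Example:
    `assert_rank(x, [2, 3])` passes for a [batch, seq] or [batch, seq, width]
    tensor and raises ValueError for any other rank.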
1193 | """ 1194 | if name is None: 1195 | name = tensor.name 1196 | 1197 | expected_rank_dict = {} 1198 | if isinstance(expected_rank, six.integer_types): 1199 | expected_rank_dict[expected_rank] = True 1200 | else: 1201 | for x in expected_rank: 1202 | expected_rank_dict[x] = True 1203 | 1204 | actual_rank = tensor.shape.ndims 1205 | if actual_rank not in expected_rank_dict: 1206 | scope_name = tf.get_variable_scope().name 1207 | raise ValueError( 1208 | "For the tensor `%s` in scope `%s`, the actual rank " 1209 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 1210 | (name, scope_name, actual_rank, str(tensor.shape), 1211 | str(expected_rank))) -------------------------------------------------------------------------------- /Model/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | # import tensorflow as tf 23 | import tensorflow.compat.v1 as tf 24 | tf.disable_v2_behavior() 25 | 26 | 27 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, 28 | use_tpu): 29 | """Creates an optimizer training op.""" 30 | global_step = tf.train.get_or_create_global_step() 31 | 32 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 33 | 34 | # Implements linear decay of the learning rate. 35 | learning_rate = tf.train.polynomial_decay( 36 | learning_rate, 37 | global_step, 38 | num_train_steps, 39 | end_learning_rate=0.0, 40 | power=1.0, 41 | cycle=False) 42 | 43 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 44 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 45 | if num_warmup_steps: 46 | global_steps_int = tf.cast(global_step, tf.int32) 47 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 48 | 49 | global_steps_float = tf.cast(global_steps_int, tf.float32) 50 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 51 | 52 | warmup_percent_done = global_steps_float / warmup_steps_float 53 | warmup_learning_rate = init_lr * warmup_percent_done 54 | 55 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 56 | learning_rate = ((1.0 - is_warmup) * learning_rate + 57 | is_warmup * warmup_learning_rate) 58 | 59 | # It is recommended that you use this optimizer for fine tuning, since this 60 | # is how the model was trained (note that the Adam m/v variables are NOT 61 | # loaded from init_checkpoint.) 
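    # At this point `learning_rate` implements linear warmup followed by linear
    # decay: for global_step < num_warmup_steps it equals
    # init_lr * global_step / num_warmup_steps, and afterwards it follows the
    # polynomial decay above (power=1.0), i.e.
    # init_lr * (1 - global_step / num_train_steps), reaching 0.0 at
    # num_train_steps.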
62 | optimizer = AdamWeightDecayOptimizer( 63 | learning_rate=learning_rate, 64 | weight_decay_rate=0.01, 65 | beta_1=0.9, 66 | beta_2=0.999, 67 | epsilon=1e-6, 68 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 69 | 70 | if use_tpu: 71 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 72 | 73 | tvars = tf.trainable_variables() 74 | grads = tf.gradients(loss, tvars) 75 | 76 | # This is how the model was pre-trained. 77 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=5.0) 78 | 79 | train_op = optimizer.apply_gradients( 80 | zip(grads, tvars), global_step=global_step) 81 | 82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = (tf.multiply(self.beta_1, m) + 132 | tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = (tf.multiply(self.beta_2, v) + 134 | tf.multiply(1.0 - self.beta_2, tf.square(grad))) 135 | 136 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 137 | 138 | # Just adding the square of the weights to the loss function is *not* 139 | # the correct way of using L2 regularization/weight decay with Adam, 140 | # since that will interact with the m and v parameters in strange ways. 141 | # 142 | # Instead we want ot decay the weights in a manner that doesn't interact 143 | # with the m/v parameters. This is equivalent to adding the square 144 | # of the weights to the loss with plain (non-momentum) SGD. 
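            # Decoupled (AdamW-style) weight decay: the update applied below is
            #   param <- param - lr * (m / (sqrt(v) + eps) + weight_decay_rate * param)
            # Note that no Adam bias correction is applied, which matches the
            # original BERT optimizer.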
145 | if self._do_use_weight_decay(param_name): 146 | update += self.weight_decay_rate * param 147 | 148 | update_with_lr = self.learning_rate * update 149 | 150 | next_param = param - update_with_lr 151 | 152 | assignments.extend( 153 | [param.assign(next_param), 154 | m.assign(next_m), 155 | v.assign(next_v)]) 156 | return tf.group(*assignments, name=name) 157 | 158 | def _do_use_weight_decay(self, param_name): 159 | """Whether to use L2 weight decay for `param_name`.""" 160 | if not self.weight_decay_rate: 161 | return False 162 | if self.exclude_from_weight_decay: 163 | for r in self.exclude_from_weight_decay: 164 | if re.search(r, param_name) is not None: 165 | return False 166 | return True 167 | 168 | def _get_variable_name(self, param_name): 169 | """Get the variable name from the tensor name.""" 170 | m = re.match("^(.*):\\d+$", param_name) 171 | if m is not None: 172 | param_name = m.group(1) 173 | return param_name 174 | -------------------------------------------------------------------------------- /Model/partitioning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | sys.path.append("..") 4 | import pickle as pkl 5 | 6 | n_bucket = 8 7 | 8 | with open("./data/vocab", "rb") as f: 9 | vocab = pkl.load(f) 10 | 11 | total_freq = np.sum(vocab.frequency) 12 | print(len(vocab.frequency)) 13 | unit_freq = total_freq/n_bucket 14 | 15 | offset = 0 16 | bucket_list = [] 17 | 18 | for i in range(n_bucket): 19 | lower = offset 20 | count = 0 21 | for j in range(lower, len(vocab.frequency)): 22 | count += vocab.frequency[j] 23 | if count >= unit_freq or j == len(vocab.frequency)-1: 24 | upper = j 25 | break 26 | 27 | bucket_list.append([lower, upper]) 28 | offset = upper + 1 29 | 30 | print(bucket_list) 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /Model/run_finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Run masked LM/next sentence masked_lm pre-training for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | import os 21 | import sys 22 | sys.path.append("..") 23 | import optimization 24 | import collections 25 | import re 26 | import tensorflow.compat.v1 as tf 27 | tf.disable_v2_behavior() 28 | 29 | from sklearn.metrics import roc_curve, auc, classification_report 30 | from run_pretrain import * 31 | import pandas as pd 32 | import numpy as np 33 | import pickle as pkl 34 | import time 35 | 36 | def _decode_record(record, name_to_features): 37 | """Decodes a record to a TensorFlow example.""" 38 | example = tf.parse_single_example(record, name_to_features) 39 | 40 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 
41 | # So cast all int64 to int32. 42 | for name in list(example.keys()): 43 | t = example[name] 44 | if t.dtype == tf.int64: 45 | t = tf.to_int32(t) 46 | example[name] = t 47 | return example 48 | 49 | def del_flags(FLAGS, keys_list): 50 | for keys in keys_list: 51 | FLAGS.__delattr__(keys) 52 | return 53 | 54 | def input_fn(input_files, 55 | is_training, 56 | num_cpu_threads=4): 57 | """ The actual input function""" 58 | 59 | name_to_features = { 60 | "address": 61 | tf.FixedLenFeature([1], tf.int64), 62 | "label": 63 | tf.FixedLenFeature([1], tf.float32), 64 | "input_ids": 65 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 66 | "input_positions": 67 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 68 | "input_counts": 69 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 70 | "input_mask": 71 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 72 | "input_io_flags": 73 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 74 | "input_values": 75 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64) 76 | } 77 | 78 | if is_training: 79 | d = tf.data.TFRecordDataset(input_files) 80 | d = d.repeat(FLAGS.epoch).shuffle(100) 81 | 82 | else: 83 | d = tf.data.TFRecordDataset(input_files) 84 | 85 | d = d.map(lambda record: _decode_record(record, name_to_features), num_parallel_calls=num_cpu_threads) 86 | d = d.batch(batch_size=FLAGS.batch_size) 87 | 88 | iterator = d.make_one_shot_iterator() 89 | features = iterator.get_next() 90 | 91 | return features 92 | 93 | 94 | def model_fn(features, mode, bert_config, vocab, init_checkpoint, learning_rate, 95 | num_train_steps, num_warmup_steps, load_cross, use_one_hot_embeddings): 96 | """The `model_fn` for TPUEstimator.""" 97 | 98 | tf.logging.info("*** Features ***") 99 | for name in sorted(features.keys()): 100 | tf.logging.info("name = %s, shape = %s" % (name, 101 | features[name].shape)) 102 | 103 | label = tf.squeeze(features["label"]) # squeeze is important 104 | input_ids = features["input_ids"] 105 | input_positions = features["input_positions"] 106 | input_mask = features["input_mask"] 107 | input_io_flags = features["input_io_flags"] 108 | input_values = features["input_values"] 109 | input_counts = features["input_counts"] 110 | 111 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 112 | 113 | model = modeling.BertModel( 114 | config=bert_config, 115 | is_training=is_training, 116 | input_ids=input_ids, 117 | input_positions=input_positions, 118 | input_io_flags=input_io_flags, 119 | input_amounts=input_values, 120 | input_counts=input_counts, 121 | input_mask=input_mask, 122 | token_type_ids=None, 123 | use_one_hot_embeddings=use_one_hot_embeddings, 124 | cross_share=FLAGS.cross_share) 125 | 126 | transformer_output = model.get_sequence_output() 127 | print(transformer_output) 128 | with tf.variable_scope("MLP", reuse=tf.AUTO_REUSE): 129 | 130 | # inp = tf.reduce_mean(transformer_output, 1) 131 | inp = transformer_output[:,0,:] 132 | 133 | dnn1 = tf.layers.dense(inp, FLAGS.hidden_size, activation=tf.nn.relu, name='f1') 134 | dnn2 = tf.layers.dense(dnn1, FLAGS.hidden_size, activation=tf.nn.relu, name='f2') 135 | logit = tf.squeeze(tf.layers.dense(dnn2 + dnn1, 1, activation=None, name='logit')) 136 | y_hat = tf.sigmoid(logit) 137 | 138 | # print("--------------------") 139 | # print("label:", label) 140 | # print("logit:", logit) 141 | # print("--------------------") 142 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logit)) 143 | 144 | total_loss = loss 145 | tvars = 
tf.trainable_variables() 146 | initialized_variable_names = {} 147 | scaffold_fn = None 148 | 149 | if init_checkpoint: 150 | (assignment_map, initialized_variable_names 151 | ) = modeling.get_assignment_map_from_checkpoint( 152 | tvars, init_checkpoint) 153 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 154 | if load_cross: 155 | # multi-layer parameter loading 156 | 157 | name_to_variable = collections.OrderedDict() 158 | for var in tvars: 159 | name = var.name 160 | m = re.match("^(.*):\\d+$", name) 161 | if m is not None: 162 | name = m.group(1) 163 | name_to_variable[name] = var 164 | 165 | for layer_index in range(bert_config.num_hidden_layers): 166 | 167 | assignment_map = collections.OrderedDict() 168 | for name in name_to_variable.keys(): 169 | if "layer_" + str(layer_index) in name: 170 | 171 | var_name_list = name.split("/") 172 | var_name_list[2] = "shared_layer" 173 | load_name = "/".join(var_name_list) 174 | # assignment_map[name] = new_name 175 | assignment_map[load_name] = name 176 | initialized_variable_names[name] = 1 177 | initialized_variable_names[name + ":0"] = 1 178 | 179 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 180 | 181 | tf.logging.info("**** Trainable Variables ****") 182 | for var in tvars: 183 | init_string = "" 184 | if var.name in initialized_variable_names: 185 | init_string = ", *INIT_FROM_CKPT*" 186 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 187 | init_string) 188 | 189 | if mode == tf.estimator.ModeKeys.TRAIN: 190 | train_op = optimization.create_optimizer(total_loss, learning_rate, 191 | num_train_steps, 192 | num_warmup_steps, False) 193 | 194 | return model, train_op, total_loss 195 | 196 | elif mode == tf.estimator.ModeKeys.EVAL: 197 | 198 | return model, y_hat, total_loss 199 | 200 | else: 201 | raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) 202 | 203 | 204 | def main(_): 205 | 206 | # load label 207 | phisher_account = pd.read_csv("../Data/phisher_account.txt", names=["account"]) 208 | phisher_account_set = set(phisher_account.account.values) 209 | 210 | def is_phish(address): 211 | if address in phisher_account_set: 212 | return 1.0 213 | else: 214 | return 0.0 215 | 216 | mode = tf.estimator.ModeKeys.TRAIN 217 | train_input_files = FLAGS.train_input_file 218 | train_features = input_fn(train_input_files, is_training=True) 219 | 220 | # modeling 221 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 222 | tf.gfile.MakeDirs(FLAGS.checkpointDir) 223 | 224 | # load vocab 225 | vocab_file_name = FLAGS.data_dir + FLAGS.vocab_filename 226 | with open(vocab_file_name, "rb") as f: 227 | vocab = pkl.load(f) 228 | 229 | # must have checkpoint 230 | if FLAGS.init_checkpoint==None: 231 | # raise ValueError("Must need a checkpoint for finetuning") 232 | print("No checkpoint!") 233 | 234 | train_bert_model, train_op, total_loss = model_fn(train_features, mode, bert_config, vocab, 235 | FLAGS.init_checkpoint, 236 | FLAGS.learning_rate, 237 | FLAGS.num_train_steps, FLAGS.num_warmup_steps, FLAGS.load_cross, False) 238 | 239 | # saver define 240 | tvars = tf.trainable_variables() 241 | saver = tf.train.Saver(max_to_keep=30, var_list=tvars) 242 | 243 | # start session 244 | sess = tf.Session() 245 | sess.run(tf.global_variables_initializer()) 246 | 247 | # start TRAINING 248 | losses = [] 249 | iter = 0 250 | start = time.time() 251 | while True: 252 | try: 253 | _, loss = sess.run([train_op, total_loss]) 254 | losses.append(loss) 255 | 256 | if iter % 
100 == 0: 257 | end = time.time() 258 | loss = np.mean(losses) 259 | print("iter=%d, loss=%f, time=%.2fs" % (iter, loss, end - start)) 260 | losses = [] 261 | start = time.time() 262 | 263 | iter += 1 264 | 265 | except Exception as e: 266 | print("Out of Sequence") 267 | saver.save(sess, os.path.join(FLAGS.checkpointDir, "bert_finetune")) 268 | break 269 | 270 | # Evaluation 271 | mode = tf.estimator.ModeKeys.EVAL 272 | test_input_files = FLAGS.test_input_file 273 | test_features = input_fn(test_input_files, is_training=False) 274 | # do not load checkpoint 275 | test_bert_model, y_hat, total_loss = model_fn(test_features, mode, bert_config, vocab, 276 | os.path.join(FLAGS.checkpointDir, "bert_finetune"), 277 | FLAGS.learning_rate, 278 | FLAGS.num_train_steps, FLAGS.num_warmup_steps, False, False) 279 | 280 | address_id_list = [] 281 | y_hat_list = [] 282 | label_list = [] 283 | 284 | iter = 0 285 | start = time.time() 286 | while True: 287 | try: 288 | address_id_v, y_hat_v, label_v, loss = sess.run([test_features["address"], y_hat, test_features["label"], total_loss]) 289 | address_id_list += list(np.squeeze(address_id_v)) 290 | y_hat_list += list(y_hat_v) 291 | label_list += list(label_v) 292 | losses.append(loss) 293 | 294 | if iter % 100 == 0: 295 | end = time.time() 296 | print("iter=%d, time=%.2fs" % (iter, end - start)) 297 | start = time.time() 298 | 299 | iter += 1 300 | 301 | except Exception as e: 302 | print("Out of Sequence") 303 | # save model 304 | # saver.save(sess, os.path.join(FLAGS.checkpointDir, "model_" + str(iter))) 305 | break 306 | 307 | sess.close() 308 | 309 | # generate final result 310 | address_id_list = np.array(address_id_list).reshape([-1]) 311 | y_hat_list = np.array(y_hat_list).reshape([-1]) 312 | label_list = np.array(label_list).reshape([-1]) 313 | 314 | # aggregation 315 | # group by embedding according to address 316 | address_to_pred_proba = {} 317 | # address_to_label = {} 318 | for i in range(len(address_id_list)): 319 | address = address_id_list[i] 320 | pred_proba = y_hat_list[i] 321 | # label = label_list[i] 322 | try: 323 | address_to_pred_proba[address].append(pred_proba) 324 | # address_to_label[address].append(label) 325 | except: 326 | address_to_pred_proba[address] = [pred_proba] 327 | # address_to_label[address] = [label] 328 | 329 | # group to one 330 | address_list = [] 331 | agg_y_hat_list = [] 332 | agg_label_list = [] 333 | 334 | for addr, pred_proba_list in address_to_pred_proba.items(): 335 | address_list.append(addr) 336 | if len(pred_proba_list) > 1: 337 | agg_y_hat_list.append(np.mean(pred_proba_list, axis=0)) 338 | else: 339 | agg_y_hat_list.append(pred_proba_list[0]) 340 | 341 | agg_label_list.append(is_phish(vocab.id_to_tokens[addr])) 342 | 343 | # print("================ROC Curve====================") 344 | fpr, tpr, thresholds = roc_curve(agg_label_list, agg_y_hat_list, pos_label=1) 345 | print("AUC=", auc(fpr, tpr)) 346 | 347 | print(np.sum(agg_label_list)) 348 | print(np.sum(agg_y_hat_list)) 349 | 350 | # for threshold in [0.01, 0.03, 0.05]: 351 | for threshold in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]: 352 | 353 | print("threshold =", threshold) 354 | y_pred = np.zeros_like(agg_y_hat_list) 355 | y_pred[np.where(np.array(agg_y_hat_list) >= threshold)[0]] = 1 356 | print(np.sum(y_pred)) 357 | print(classification_report(agg_label_list, y_pred, digits=4)) 358 | 359 | return 360 | 361 | if __name__ == '__main__': 362 | 363 | del_flags(FLAGS, ["do_train", 
"do_eval", "cross_share", "load_cross", "epoch", "max_seq_length", "train_input_file", "test_input_file", "init_checkpoint","learning_rate"]) 364 | flags.DEFINE_bool("do_train", False, "") 365 | flags.DEFINE_bool("do_eval", True, "") 366 | flags.DEFINE_bool("cross_share", False, "whether to share or not") 367 | flags.DEFINE_bool("load_cross", True, "whether to load from cross") 368 | flags.DEFINE_integer("epoch", 1, "Epoch for finetune") 369 | flags.DEFINE_integer("max_seq_length", 100, "") 370 | flags.DEFINE_string("train_input_file", "./data/finetune_train.tfrecord", "Input train file for finetuning") 371 | flags.DEFINE_string("test_input_file", "./data/finetune_test.tfrecord", "Input test file for finetuning") 372 | flags.DEFINE_string("init_checkpoint", None, "Initial checkpoint (usually from a pre-trained BERT model).") 373 | flags.DEFINE_integer("hidden_size", 128, "Hidden size for downside MLP.") 374 | flags.DEFINE_float("learning_rate", 3e-4, "") 375 | 376 | print("==========Parameters===========") 377 | print("cross_share:", FLAGS.cross_share) 378 | print("load_cross:", FLAGS.load_cross) 379 | print("learning_rate:", FLAGS.learning_rate) 380 | tf.app.run() -------------------------------------------------------------------------------- /Model/run_pretrain.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Run masked LM/next sentence masked_lm pre-training for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | import os 21 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 22 | import sys 23 | 24 | import modeling 25 | sys.path.append("..") 26 | import optimization 27 | import tensorflow.compat.v1 as tf 28 | tf.disable_v2_behavior() 29 | 30 | import numpy as np 31 | 32 | import pickle as pkl 33 | # import time 34 | from timeit import default_timer as timer 35 | import math 36 | 37 | flags = tf.flags 38 | FLAGS = flags.FLAGS 39 | 40 | # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 41 | # os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 42 | 43 | ## Required parameters 44 | flags.DEFINE_string( 45 | "bert_config_file", "./zipzap_config.json", 46 | "The config json file corresponding to the pre-trained BERT model. 
" 47 | "This specifies the model architecture.") 48 | 49 | flags.DEFINE_string( 50 | "train_input_file", "./data/train.tfrecord", 51 | "Input TF example files (can be a glob or comma separated).") 52 | 53 | flags.DEFINE_string( 54 | "test_input_file", "./data/test.tfrecord", 55 | "Input TF example files (can be a glob or comma separated).") 56 | 57 | flags.DEFINE_string( 58 | "checkpointDir", "ckpt_dir", 59 | "The output directory where the model checkpoints will be written.") 60 | 61 | flags.DEFINE_string("signature", 'default', "signature_name") 62 | 63 | ## Other parameters 64 | flags.DEFINE_string("init_checkpoint", None, "Initial checkpoint (usually from a pre-trained BERT model).") 65 | flags.DEFINE_integer("max_seq_length", 29, "") 66 | flags.DEFINE_float("masked_lm_prob", 0.8, "Masked LM probability.") 67 | flags.DEFINE_bool("do_train", True, "") 68 | flags.DEFINE_bool("do_eval", False, "") 69 | flags.DEFINE_integer("batch_size", 256, "") 70 | flags.DEFINE_integer("epoch", 5, "") 71 | flags.DEFINE_float("learning_rate", 1e-4, "") 72 | flags.DEFINE_integer("num_train_steps", 10000000, "Number of training steps.") 73 | flags.DEFINE_integer("num_warmup_steps", 100, "Number of warmup steps.") 74 | flags.DEFINE_integer("save_checkpoints_steps", 8000, "") 75 | flags.DEFINE_integer("iterations_per_loop", 2000, "How many steps to make in each estimator call.") 76 | flags.DEFINE_integer("max_eval_steps", 1000, "Maximum number of eval steps.") 77 | flags.DEFINE_integer("neg_sample_num", 5000, "The number of negative samples in a batch") 78 | flags.DEFINE_string("neg_strategy", "zip", "Strategy of negative sampling") 79 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 80 | flags.DEFINE_string("data_dir", './data/', "data dir.") 81 | flags.DEFINE_bool("cross_share", True, "whether to share or not") 82 | flags.DEFINE_bool("load_cross", False, "whether to load from cross") 83 | flags.DEFINE_string("vocab_filename", "vocab", "vocab filename") 84 | 85 | MAX_PREDICTIONS_PER_SEQ = math.ceil(FLAGS.max_seq_length * FLAGS.masked_lm_prob) 86 | 87 | print("MAX_SEQUENCE_LENGTH:", FLAGS.max_seq_length) 88 | print("MAX_PREDICTIONS_PER_SEQ:", MAX_PREDICTIONS_PER_SEQ) 89 | 90 | 91 | def input_fn(input_files, 92 | is_training, 93 | num_cpu_threads=4): 94 | """ The actual input function""" 95 | 96 | name_to_features = { 97 | "address": 98 | tf.FixedLenFeature([1], tf.int64), 99 | "input_ids": 100 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 101 | "input_positions": 102 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 103 | "input_counts": 104 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 105 | "input_mask": 106 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 107 | "input_io_flags": 108 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 109 | "input_values": 110 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 111 | "masked_lm_positions": 112 | tf.FixedLenFeature([MAX_PREDICTIONS_PER_SEQ], tf.int64), 113 | "masked_lm_ids": 114 | tf.FixedLenFeature([MAX_PREDICTIONS_PER_SEQ], tf.int64), 115 | "masked_lm_weights": 116 | tf.FixedLenFeature([MAX_PREDICTIONS_PER_SEQ], tf.float32) 117 | } 118 | 119 | if is_training: 120 | d = tf.data.TFRecordDataset(input_files) 121 | d = d.repeat(FLAGS.epoch).shuffle(100) 122 | 123 | else: 124 | d = tf.data.TFRecordDataset(input_files) 125 | 126 | d = d.map(lambda record: _decode_record(record, name_to_features), num_parallel_calls=num_cpu_threads) 127 | d = d.batch(batch_size=FLAGS.batch_size) 128 | 129 | iterator 
= d.make_one_shot_iterator() 130 | features = iterator.get_next() 131 | 132 | return features 133 | 134 | 135 | def model_fn(features, mode, bert_config, vocab, init_checkpoint, learning_rate, 136 | num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings): 137 | """The `model_fn` for TPUEstimator.""" 138 | 139 | # tf.logging.info("*** Features ***") 140 | # for name in sorted(features.keys()): 141 | # tf.logging.info("name = %s, shape = %s" % (name, 142 | # features[name].shape)) 143 | 144 | input_ids = features["input_ids"] 145 | input_positions = features["input_positions"] 146 | input_mask = features["input_mask"] 147 | input_io_flags = features["input_io_flags"] 148 | input_values = features["input_values"] 149 | input_counts = features["input_counts"] 150 | masked_lm_positions = features["masked_lm_positions"] 151 | masked_lm_ids = features["masked_lm_ids"] 152 | masked_lm_weights = features["masked_lm_weights"] 153 | 154 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 155 | 156 | model = modeling.BertModel( 157 | config=bert_config, 158 | is_training=is_training, 159 | input_ids=input_ids, 160 | input_positions=input_positions, 161 | input_io_flags=input_io_flags, 162 | input_amounts=input_values, 163 | input_counts=input_counts, 164 | input_mask=input_mask, 165 | token_type_ids=None, 166 | use_one_hot_embeddings=use_one_hot_embeddings, 167 | cross_share=FLAGS.cross_share) 168 | 169 | embedding_table_list, factorize_table_list = model.get_embedding_table() 170 | 171 | (masked_lm_loss, 172 | masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output_negative_sampling( 173 | bert_config, 174 | model.get_sequence_output(), 175 | embedding_table_list, 176 | factorize_table_list, 177 | masked_lm_positions, 178 | masked_lm_ids, 179 | masked_lm_weights, 180 | vocab) # model use the token embedding table as the output_weights 181 | 182 | total_loss = masked_lm_loss 183 | tvars = tf.trainable_variables() 184 | initialized_variable_names = {} 185 | scaffold_fn = None 186 | 187 | if init_checkpoint: 188 | (assignment_map, initialized_variable_names 189 | ) = modeling.get_assignment_map_from_checkpoint( 190 | tvars, init_checkpoint) 191 | if use_tpu: 192 | 193 | def tpu_scaffold(): 194 | tf.train.init_from_checkpoint(init_checkpoint, 195 | assignment_map) 196 | return tf.train.Scaffold() 197 | 198 | scaffold_fn = tpu_scaffold 199 | else: 200 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 201 | 202 | tf.logging.info("**** Trainable Variables ****") 203 | for var in tvars: 204 | init_string = "" 205 | if var.name in initialized_variable_names: 206 | init_string = ", *INIT_FROM_CKPT*" 207 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 208 | init_string) 209 | 210 | if mode == tf.estimator.ModeKeys.TRAIN: 211 | train_op = optimization.create_optimizer(total_loss, learning_rate, 212 | num_train_steps, 213 | num_warmup_steps, use_tpu) 214 | 215 | return model, train_op, total_loss 216 | # output_spec = tf.estimator.EstimatorSpec( 217 | # mode=mode, 218 | # loss=total_loss, 219 | # train_op=train_op, 220 | # scaffold=scaffold_fn) 221 | 222 | elif mode == tf.estimator.ModeKeys.EVAL: 223 | 224 | def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights): 225 | """Computes the loss and accuracy of the model.""" 226 | masked_lm_log_probs = tf.reshape(masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]]) 227 | masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) 228 | 
masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) 229 | masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) 230 | masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) 231 | masked_lm_accuracy = tf.metrics.accuracy(labels=masked_lm_ids, predictions=masked_lm_predictions, 232 | weights=masked_lm_weights) 233 | masked_lm_mean_loss = tf.metrics.mean(values=masked_lm_example_loss, weights=masked_lm_weights) 234 | 235 | return { 236 | "masked_lm_accuracy": masked_lm_accuracy, 237 | "masked_lm_loss": masked_lm_mean_loss, 238 | } 239 | 240 | tf.add_to_collection('eval_sp', masked_lm_log_probs) 241 | tf.add_to_collection('eval_sp', input_ids) 242 | tf.add_to_collection('eval_sp', masked_lm_ids) 243 | 244 | eval_metrics = metric_fn(masked_lm_example_loss, 245 | masked_lm_log_probs, 246 | masked_lm_ids, 247 | masked_lm_weights) 248 | 249 | # output_spec = tf.estimator.EstimatorSpec( 250 | # mode=mode, 251 | # loss=total_loss, 252 | # eval_metric_ops=eval_metrics, 253 | # scaffold=scaffold_fn) 254 | 255 | return model, total_loss 256 | 257 | else: 258 | raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) 259 | 260 | 261 | def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, 262 | label_ids, label_weights): 263 | """Get loss and log probs for the masked LM.""" 264 | # [batch_size*label_size, dim] 265 | input_tensor = gather_indexes(input_tensor, positions) 266 | 267 | with tf.variable_scope("cls/predictions"): 268 | # We apply one more non-linear transformation before the output layer. 269 | # This matrix is not used after pre-training. 270 | with tf.variable_scope("transform"): 271 | input_tensor = tf.layers.dense( 272 | input_tensor, 273 | units=bert_config.hidden_size, 274 | activation=modeling.get_activation(bert_config.hidden_act), 275 | kernel_initializer=modeling.create_initializer( 276 | bert_config.initializer_range)) 277 | input_tensor = modeling.layer_norm(input_tensor) 278 | 279 | # The output weights are the same as the input embeddings, but there is 280 | # an output-only bias for each token. 281 | output_bias = tf.get_variable( 282 | "output_bias", 283 | shape=[output_weights.shape[0]], 284 | initializer=tf.zeros_initializer()) 285 | logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 286 | logits = tf.nn.bias_add(logits, output_bias) 287 | # logits, (bs*label_size, vocab_size) 288 | log_probs = tf.nn.log_softmax(logits, -1) 289 | 290 | label_ids = tf.reshape(label_ids, [-1]) 291 | label_weights = tf.reshape(label_weights, [-1]) 292 | 293 | one_hot_labels = tf.one_hot( 294 | label_ids, depth=output_weights.shape[0], dtype=tf.float32) 295 | 296 | # The `positions` tensor might be zero-padded (if the sequence is too 297 | # short to have the maximum number of predictions). The `label_weights` 298 | # tensor has a value of 1.0 for every real prediction and 0.0 for the 299 | # padding predictions. 
300 | per_example_loss = -tf.reduce_sum( 301 | log_probs * one_hot_labels, axis=[-1]) 302 | numerator = tf.reduce_sum(label_weights * per_example_loss) 303 | denominator = tf.reduce_sum(label_weights) + 1e-5 304 | loss = numerator / denominator 305 | 306 | return (loss, per_example_loss, log_probs) 307 | 308 | 309 | def get_masked_lm_output_negative_sampling(bert_config, input_tensor, 310 | embedding_table_list, factorize_table_list, 311 | positions, label_ids, label_weights, vocab): 312 | """Get loss and log probs for the masked LM.""" 313 | 314 | # negative sample randomly 315 | word_num = len(vocab.vocab_words) - 3 316 | 317 | if FLAGS.neg_strategy == "uniform": 318 | neg_ids, _, _ = tf.nn.uniform_candidate_sampler(true_classes=[[len(vocab.vocab_words)]], 319 | num_true=1, 320 | num_sampled=FLAGS.neg_sample_num, 321 | unique=True, 322 | range_max=word_num) 323 | 324 | elif FLAGS.neg_strategy == "zip": 325 | neg_ids, _, _ = tf.nn.log_uniform_candidate_sampler(true_classes=[[len(vocab.vocab_words)]], 326 | num_true=1, 327 | num_sampled=FLAGS.neg_sample_num, 328 | unique=True, 329 | range_max=word_num) 330 | 331 | elif FLAGS.neg_strategy == "freq": 332 | # negative sample based on frequency 333 | neg_ids, _, _ = tf.nn.fixed_unigram_candidate_sampler(true_classes=[[len(vocab.vocab_words)]], 334 | num_true=1, 335 | num_sampled=FLAGS.neg_sample_num, 336 | unique=True, 337 | range_max=word_num, 338 | unigrams=list( 339 | map(lambda x: pow(x, 1 / 1), vocab.frequency[3:])) 340 | ) 341 | 342 | else: 343 | raise ValueError("Please select correct negative sampling strategy: uniform, zip, .") 344 | 345 | neg_ids = tf.cast(neg_ids, tf.int32) 346 | neg_ids = neg_ids + 1 + 3 # + 4 (1 padding, 2 mask, 3 not use) 347 | 348 | # [batch_size*label_size, dim] 349 | input_tensor = gather_indexes(input_tensor, positions) 350 | 351 | with tf.variable_scope("cls/predictions"): 352 | # We apply one more non-linear transformation before the output layer. 353 | # This matrix is not used after pre-training. 354 | with tf.variable_scope("transform"): 355 | input_tensor = tf.layers.dense( 356 | input_tensor, 357 | units=bert_config.hidden_size, 358 | activation=modeling.get_activation(bert_config.hidden_act), 359 | kernel_initializer=modeling.create_initializer( 360 | bert_config.initializer_range)) 361 | input_tensor = modeling.layer_norm(input_tensor) 362 | 363 | # label_ids = tf.reshape(label_ids, [-1]) 364 | label_weights = tf.reshape(label_weights, [-1]) 365 | 366 | pos_output_weights = modeling.new_embedding_lookup(label_ids, 367 | bert_config.bucket_list, 368 | embedding_table_list, 369 | factorize_table_list) 370 | 371 | pos_output_weights = tf.reshape(pos_output_weights, [-1, pos_output_weights.shape[-1]]) 372 | 373 | neg_ids = tf.expand_dims(neg_ids, axis=0) 374 | neg_output_weights = modeling.new_embedding_lookup(neg_ids, 375 | bert_config.bucket_list, 376 | embedding_table_list, 377 | factorize_table_list) 378 | 379 | neg_output_weights = tf.reshape(neg_output_weights, [-1, neg_output_weights.shape[-1]]) 380 | 381 | pos_logits = tf.reduce_sum(tf.multiply(input_tensor, pos_output_weights), axis=-1) # 768 382 | pos_logits = tf.expand_dims(pos_logits, axis=1) 383 | neg_logits = tf.matmul(input_tensor, neg_output_weights, transpose_b=True) # 768, 10000 384 | 385 | logits = tf.concat([pos_logits, neg_logits], axis=1) 386 | # The output weights are the same as the input embeddings, but there is 387 | # an output-only bias for each token. 
388 | output_bias = tf.get_variable( 389 | "output_bias", 390 | shape=[logits.shape[1]], 391 | initializer=tf.zeros_initializer()) 392 | 393 | logits = tf.nn.bias_add(logits, output_bias) 394 | log_probs = tf.nn.log_softmax(logits, -1) 395 | per_example_loss = -log_probs[:, 0] 396 | # The `positions` tensor might be zero-padded (if the sequence is too 397 | # short to have the maximum number of predictions). The `label_weights` 398 | # tensor has a value of 1.0 for every real prediction and 0.0 for the 399 | # padding predictions. 400 | numerator = tf.reduce_sum(label_weights * per_example_loss) 401 | denominator = tf.reduce_sum(label_weights) + 1e-5 402 | loss = numerator / denominator 403 | 404 | return (loss, per_example_loss, log_probs) 405 | 406 | 407 | def gather_indexes(sequence_tensor, positions): 408 | """Gathers the vectors at the specific positions over a minibatch.""" 409 | sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) 410 | batch_size = sequence_shape[0] 411 | seq_length = sequence_shape[1] 412 | width = sequence_shape[2] 413 | flat_offsets = tf.reshape( 414 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 415 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 416 | flat_sequence_tensor = tf.reshape(sequence_tensor, 417 | [batch_size * seq_length, width]) 418 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 419 | return output_tensor 420 | 421 | 422 | def _decode_record(record, name_to_features): 423 | """Decodes a record to a TensorFlow example.""" 424 | example = tf.parse_single_example(record, name_to_features) 425 | 426 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 427 | # So cast all int64 to int32. 428 | for name in list(example.keys()): 429 | t = example[name] 430 | if t.dtype == tf.int64: 431 | t = tf.to_int32(t) 432 | example[name] = t 433 | return example 434 | 435 | 436 | def main(_): 437 | if FLAGS.do_train: 438 | mode = tf.estimator.ModeKeys.TRAIN 439 | input_files = FLAGS.train_input_file 440 | # load data 441 | features = input_fn(input_files, is_training=True) 442 | 443 | elif FLAGS.do_eval: 444 | mode = tf.estimator.ModeKeys.EVAL 445 | input_files = FLAGS.test_input_file 446 | features = input_fn(input_files, is_training=False) 447 | 448 | else: 449 | raise ValueError("Only TRAIN and EVAL modes are supported.") 450 | 451 | # modeling 452 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 453 | tf.gfile.MakeDirs(FLAGS.checkpointDir) 454 | 455 | # load vocab 456 | vocab_file_name = FLAGS.data_dir + FLAGS.vocab_filename 457 | with open(vocab_file_name, "rb") as f: 458 | vocab = pkl.load(f) 459 | 460 | if FLAGS.do_train: 461 | bert_model, train_op, total_loss = model_fn(features, mode, bert_config, vocab, FLAGS.init_checkpoint, 462 | FLAGS.learning_rate, 463 | FLAGS.num_train_steps, FLAGS.num_warmup_steps, False, False) 464 | # saver define 465 | tvars = tf.trainable_variables() 466 | saver = tf.train.Saver(max_to_keep=30, var_list=tvars) 467 | 468 | # start session 469 | config = tf.ConfigProto(allow_soft_placement=True) 470 | config.gpu_options.allow_growth = True 471 | 472 | with tf.Session(config=config) as sess: 473 | sess.run(tf.global_variables_initializer()) 474 | losses = [] 475 | iter = 0 476 | # start = time.time() 477 | start = timer() 478 | while True: 479 | try: 480 | _, loss = sess.run([train_op, total_loss]) 481 | # loss = sess.run([total_loss]) 482 | losses.append(loss) 483 | 484 | if iter % 500 == 0: 485 | # end = 
time.time() 486 | end = timer() 487 | loss = np.mean(losses) 488 | print("iter=%d, loss=%f, time=%.3fs" % (iter, loss, end - start)) 489 | losses = [] 490 | # start = time.time() 491 | start = timer() 492 | 493 | if iter % FLAGS.save_checkpoints_steps == 0 and iter > 0: 494 | saver.save(sess, os.path.join(FLAGS.checkpointDir, "model_" + str(round(iter)))) 495 | 496 | iter += 1 497 | 498 | except Exception as e: 499 | # print("Out of Sequence, end of training...") 500 | print(e) 501 | # save model 502 | saver.save(sess, os.path.join(FLAGS.checkpointDir, "model_" + str(round(iter)))) 503 | break 504 | 505 | elif FLAGS.do_eval: 506 | # must have checkpoint 507 | if FLAGS.init_checkpoint == None: 508 | raise ValueError("Must need a checkpoint for evaluation") 509 | 510 | bert_model, total_loss = model_fn(features, mode, bert_config, vocab, FLAGS.init_checkpoint, 511 | FLAGS.learning_rate, 512 | FLAGS.num_train_steps, FLAGS.num_warmup_steps, False, False) 513 | 514 | # start session 515 | with tf.Session() as sess: 516 | sess.run(tf.global_variables_initializer()) 517 | losses = [] 518 | iter = 0 519 | # start = time.time() 520 | start = timer() 521 | while True: 522 | try: 523 | loss = sess.run(total_loss) 524 | losses.append(loss) 525 | 526 | if iter % 500 == 0: 527 | # end = time.time() 528 | end = timer() 529 | print("iter=%d, time=%.3fs" % (iter, end - start)) 530 | # start = time.time() 531 | start = timer() 532 | iter += 1 533 | 534 | except Exception as e: 535 | print("Out of Sequence") 536 | # save model 537 | # saver.save(sess, os.path.join(FLAGS.checkpointDir, "model_" + str(iter))) 538 | break 539 | 540 | final_loss = np.mean(losses) 541 | eval_sample_num = len(losses) 542 | 543 | print("========Evaluation Results==========") 544 | print("sample_num=%d, loss=%.2f" % (eval_sample_num, final_loss)) 545 | 546 | else: 547 | raise ValueError("Only TRAIN and EVAL modes are supported.") 548 | 549 | return 550 | 551 | 552 | if __name__ == '__main__': 553 | tf.app.run() -------------------------------------------------------------------------------- /Model/run_zipzap.sh: -------------------------------------------------------------------------------- 1 | python gen_seq.py # construct transaction sequence 2 | python gen_pretrain_data.py # generate pre-training data 3 | python gen_finetune_data.py # generate fine-tuning data 4 | 5 | python run_pretrain.py # pre-training 6 | python run_finetune.py --init_checkpoint=ckpt_dir/model_64000 # fine-tuning and evaluation 7 | -------------------------------------------------------------------------------- /Model/vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | def convert_by_vocab(vocab, tokens): 4 | """Converts a sequence of [tokens|ids] using the vocab.""" 5 | output = [] 6 | for token in tokens: 7 | output.append(vocab[token]) 8 | return output 9 | 10 | class FreqVocab(object): 11 | """Runs end-to-end tokenziation.""" 12 | 13 | def __init__(self): 14 | self.counter = Counter() 15 | self.frequency = [] 16 | 17 | def update(self, eoa2seq): 18 | for eoa in eoa2seq.keys(): 19 | seq = eoa2seq[eoa] 20 | self.counter[eoa] = len(seq) 21 | self.counter.update(map(lambda x:x[0], seq)) 22 | 23 | def generate_vocab(self): 24 | self.token_count = len(self.counter.keys()) 25 | self.special_tokens = ["[MASK]", "[pad]", '[NO_USE]'] 26 | self.token_to_ids = {} # index begin from 1 27 | 28 | # first special tokens for frequency factorization 29 | for token in self.special_tokens: 30 
|             self.token_to_ids[token] = len(self.token_to_ids) + 1
31 | 
32 |         # then normal item
33 |         for token, count in self.counter.most_common():
34 |             self.token_to_ids[token] = len(self.token_to_ids) + 1
35 | 
36 |         # add count
37 |         for token in self.special_tokens:
38 |             self.counter[token] = 0
39 | 
40 |         self.id_to_tokens = {v: k for k, v in self.token_to_ids.items()}
41 |         self.vocab_words = list(self.token_to_ids.keys())
42 | 
43 |         id_list = sorted(list(self.token_to_ids.values()))
44 |         for id in id_list:
45 |             token = self.id_to_tokens[id]
46 |             self.frequency.append(self.counter[token])  # used for negative sampling
47 | 
48 |     def convert_tokens_to_ids(self, tokens):
49 |         return convert_by_vocab(self.token_to_ids, tokens)
50 | 
51 |     def convert_ids_to_tokens(self, ids):
52 |         return convert_by_vocab(self.id_to_tokens, ids)
53 | 
54 | 
--------------------------------------------------------------------------------
/Model/zipzap_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.2,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.2,
 5 |   "hidden_size": 64,
 6 |   "bucket_list": [[0, 63], [64, 751], [752, 4242], [4243, 16800], [16801, 56218], [56219, 175121], [175122, 514653], [514654, 2300000]],
 7 |   "factor_list": [64, 41, 27, 17, 11, 7, 5, 3],
 8 |   "intermediate_size": 64,
 9 |   "initializer_range": 0.02,
10 |   "max_position_embeddings": 200,
11 |   "num_attention_heads": 2,
12 |   "num_hidden_layers": 8,
13 |   "type_vocab_size": 2,
14 |   "vocab_size": 2300000
15 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # ZipZap
 3 | 
 4 | This is the code and dataset for the paper "ZipZap: Efficient Training of Language Models for Ethereum Fraud Detection" (The Web Conference 2024).
 5 | 
 6 | ## Getting Started
 7 | ### Requirements:
 8 | * Python >= 3.6.1
 9 | * NumPy >= 1.18.1
10 | * TensorFlow >= 2.0.0
11 | 
12 | ### 1. Download dataset
13 | 
14 | #### Step 1. Download the dataset from Google Drive:
15 | * [All in one](https://drive.google.com/file/d/1EXMIWEPTuu3bN2gJOaxmEXyDG-AsDUIL/view)
16 | 
17 | #### Step 2. Unzip the dataset under the "ZipZap/Data" directory:
18 | ```
19 | tar -xvf ZipZap_Data.tar.gz
20 | ```
21 | 
22 | ### 2. Run the code
23 | 
24 | Please refer to ./Model/run_zipzap.sh, which runs the full pipeline (sequence construction, pre-training/fine-tuning data generation, pre-training, and fine-tuning with evaluation).
25 | 
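26 | As a quick reference, the same pipeline can also be driven step by step, as in the sketch below. The flag names are taken from the scripts under ./Model; the concrete values are illustrative assumptions (they mirror the flag defaults and paths shown in this repository), and the checkpoint name passed to fine-tuning depends on the step at which pre-training saved it.
27 | 
28 | ```
29 | cd Model
30 | 
31 | # construct transaction sequences and generate pre-training / fine-tuning data
32 | python gen_seq.py
33 | python gen_pretrain_data.py
34 | python gen_finetune_data.py
35 | 
36 | # pre-training; the flag values below are illustrative (they mirror the flag
37 | # defaults in run_pretrain.py), and the config path is an assumption
38 | python run_pretrain.py \
39 |     --bert_config_file=zipzap_config.json \
40 |     --data_dir=./data/ \
41 |     --vocab_filename=vocab \
42 |     --train_input_file=./data/train.tfrecord \
43 |     --checkpointDir=ckpt_dir \
44 |     --neg_strategy=zip
45 | 
46 | # fine-tuning and evaluation from a saved pre-training checkpoint
47 | python run_finetune.py --init_checkpoint=ckpt_dir/model_64000
48 | ```
49 | 
--------------------------------------------------------------------------------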