├── Data
│   └── phisher_account.txt
├── Model
│   ├── gen_finetune_data.py
│   ├── gen_pretrain_data.py
│   ├── gen_seq.py
│   ├── modeling.py
│   ├── optimization.py
│   ├── partitioning.py
│   ├── run_finetune.py
│   ├── run_pretrain.py
│   ├── run_zipzap.sh
│   ├── vocab.py
│   └── zipzap_config.json
└── README.md
/Model/gen_finetune_data.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | from vocab import FreqVocab 9 | import os 10 | import collections 11 | import random 12 | import functools 13 | # import tensorflow as tf 14 | import tensorflow.compat.v1 as tf 15 | tf.disable_v2_behavior() 16 | 17 | import numpy as np 18 | import sys 19 | import six 20 | import pickle as pkl 21 | import time 22 | 23 | flags = tf.flags 24 | FLAGS = flags.FLAGS 25 | 26 | random_seed = 12345 27 | rng = random.Random(random_seed) 28 | 29 | ## parameters 30 | flags.DEFINE_integer("max_seq_length", 100, "max sequence length.") 31 | # flags.DEFINE_integer("sliding_step", 30, "sliding window step size.") 32 | flags.DEFINE_string("data_dir", './data/', "data dir.") 33 | flags.DEFINE_string("dataset_name", 'eth',"dataset name.") 34 | flags.DEFINE_string("vocab_filename", "vocab", "vocab filename") 35 | flags.DEFINE_bool("total_drop", False, "whether to drop") 36 | 37 | SLIDING_STEP = round(FLAGS.max_seq_length * 0.6) 38 | 39 | print("MAX_SEQUENCE_LENGTH:", FLAGS.max_seq_length) 40 | print("SLIDING_STEP:", SLIDING_STEP) 41 | 42 | 43 | class FinetuneInstance(object): 44 | """A single training instance (sentence pair).""" 45 | 46 | def __init__(self, address, tokens, label): 47 | 48 | self.address = [address] 49 | self.tokens = list(map(lambda x: x[0], tokens)) 50 | self.block_timestamps = list(map(lambda x: x[2], tokens)) 51 | self.values = list(map(lambda x: x[3], tokens)) 52 | self.label = label 53 | 54 | def map_io_flag(token): 55 | flag = token[4] 56 | if flag == "OUT": 57 | return 1 58 | elif flag == "IN": 59 | return 2 60 | else: 61 | return 0 62 | 63 | self.io_flags = list(map(map_io_flag, tokens)) 64 | self.cnts = list(map(lambda x: x[5], tokens)) 65 | 66 | 67 | def __str__(self): 68 | s = "address: %s\n" % (self.address[0]) 69 | s += "tokens: %s\n" % ( 70 | " ".join([printable_text(x) for x in self.tokens])) 71 | s += "\n" 72 | return s 73 | 74 | def __repr__(self): 75 | return self.__str__() 76 | 77 | def printable_text(text): 78 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 79 | 80 | # These functions want `str` for both Python2 and Python3, but in one case 81 | # it's a Unicode string and in the other it's a byte string.
82 | if six.PY3: 83 | if isinstance(text, str): 84 | return text 85 | elif isinstance(text, bytes): 86 | return text.decode("utf-8", "ignore") 87 | else: 88 | raise ValueError("Unsupported string type: %s" % (type(text))) 89 | elif six.PY2: 90 | if isinstance(text, str): 91 | return text 92 | elif isinstance(text, unicode): 93 | return text.encode("utf-8") 94 | else: 95 | raise ValueError("Unsupported string type: %s" % (type(text))) 96 | else: 97 | raise ValueError("Not running on Python2 or Python 3?") 98 | 99 | def cmp_udf_reverse(x1, x2): 100 | time1 = int(x1[2]) 101 | time2 = int(x2[2]) 102 | 103 | if time1 < time2: 104 | return 1 105 | elif time1 > time2: 106 | return -1 107 | else: 108 | return 0 109 | 110 | 111 | def create_embedding_predictions(tokens): 112 | """Creates the predictions for the masked LM objective.""" 113 | address = tokens[0][0] 114 | output_tokens = tokens 115 | masked_lm_positions = [] 116 | masked_lm_labels = [] 117 | return (address, output_tokens, masked_lm_positions, masked_lm_labels) 118 | 119 | 120 | def gen_finetune_samples(sequences, label_list): 121 | instances = [] 122 | # create train 123 | start = time.time() 124 | for i in tqdm(range(len(sequences))): 125 | 126 | tokens = sequences[i] 127 | address = tokens[0][0] 128 | instance = FinetuneInstance( 129 | address=address, 130 | tokens=tokens, 131 | label=label_list[i]) 132 | instances.append(instance) 133 | 134 | end = time.time() 135 | print("=======Finish========") 136 | print("cost time:%.2f" % (end - start)) 137 | return instances 138 | 139 | def create_int_feature(values): 140 | feature = tf.train.Feature( 141 | int64_list=tf.train.Int64List(value=list(values))) 142 | return feature 143 | 144 | def create_float_feature(values): 145 | feature = tf.train.Feature( 146 | float_list=tf.train.FloatList(value=list(values))) 147 | return feature 148 | 149 | def convert_timestamp_to_position(block_timestamps): 150 | position = [0] 151 | if len(block_timestamps) <= 1: 152 | return position 153 | last_ts = block_timestamps[1] 154 | idx = 1 155 | for b_ts in block_timestamps[1:]: 156 | if b_ts != last_ts: 157 | last_ts = b_ts 158 | idx += 1 159 | position.append(idx) 160 | return position 161 | 162 | def write_finetune_instance_to_example_files(instances, max_seq_length, vocab, output_files): 163 | """Create TF example files from `TrainingInstance`s.""" 164 | writers = [] 165 | for output_file in output_files: 166 | writers.append(tf.python_io.TFRecordWriter(output_file)) 167 | 168 | writer_index = 0 169 | total_written = 0 170 | 171 | for inst_index in tqdm(range(len(instances))): 172 | instance = instances[inst_index] 173 | input_ids = vocab.convert_tokens_to_ids(instance.tokens) 174 | address = vocab.convert_tokens_to_ids(instance.address) 175 | counts = instance.cnts 176 | block_timestamps = instance.block_timestamps 177 | values = instance.cnts 178 | io_flags = instance.io_flags 179 | positions = convert_timestamp_to_position(block_timestamps) 180 | label = [instance.label] 181 | 182 | input_mask = [1] * len(input_ids) 183 | assert len(input_ids) <= max_seq_length 184 | assert len(counts) <= max_seq_length 185 | assert len(values) <= max_seq_length 186 | assert len(io_flags) <= max_seq_length 187 | assert len(positions) <= max_seq_length 188 | 189 | input_ids += [0] * (max_seq_length - len(input_ids)) 190 | counts += [0] * (max_seq_length - len(counts)) 191 | values += [0] * (max_seq_length - len(values)) 192 | io_flags += [0] * (max_seq_length - len(io_flags)) 193 | positions += [0] * (max_seq_length - 
len(positions)) 194 | input_mask += [0] * (max_seq_length - len(input_mask)) 195 | 196 | assert len(input_ids) == max_seq_length 197 | assert len(counts) == max_seq_length 198 | assert len(values) == max_seq_length 199 | assert len(io_flags) == max_seq_length 200 | assert len(positions) == max_seq_length 201 | assert len(input_mask) == max_seq_length 202 | 203 | features = collections.OrderedDict() 204 | features["address"] = create_int_feature(address) 205 | features["label"] = create_float_feature(label) 206 | features["input_ids"] = create_int_feature(input_ids) 207 | features["input_positions"] = create_int_feature(positions) 208 | features["input_counts"] = create_int_feature(counts) 209 | features["input_io_flags"] = create_int_feature(io_flags) 210 | features["input_values"] = create_int_feature(values) 211 | features["input_mask"] = create_int_feature(input_mask) 212 | 213 | tf_example = tf.train.Example( 214 | features=tf.train.Features(feature=features)) 215 | 216 | writers[writer_index].write(tf_example.SerializeToString()) 217 | writer_index = (writer_index + 1) % len(writers) 218 | 219 | total_written += 1 220 | 221 | if inst_index < 3: 222 | tf.logging.info("*** Example ***") 223 | tf.logging.info("tokens: %s" % " ".join( 224 | [printable_text(x) for x in instance.tokens])) 225 | 226 | for feature_name in features.keys(): 227 | feature = features[feature_name] 228 | values = [] 229 | if feature.int64_list.value: 230 | values = feature.int64_list.value 231 | elif feature.float_list.value: 232 | values = feature.float_list.value 233 | tf.logging.info("%s: %s" % (feature_name, 234 | " ".join([str(x) 235 | for x in values]))) 236 | 237 | for writer in writers: 238 | writer.close() 239 | 240 | tf.logging.info("Wrote %d total instances", total_written) 241 | 242 | 243 | def total_repeat_drop(eoa2seq): 244 | """ 245 | totally drop the repeat part. 
246 | """ 247 | new_eoa2seq = {} 248 | for eoa, seq in eoa2seq.items(): 249 | new_seq = [] 250 | exist_addr = set() 251 | for trans in seq: 252 | if trans[0] not in exist_addr: 253 | exist_addr.add(trans[0]) 254 | new_seq.append(trans) 255 | 256 | new_eoa2seq[eoa] = new_seq 257 | 258 | return new_eoa2seq 259 | 260 | 261 | def random_drop(eoa2seq, ratio=0.5): 262 | 263 | new_eoa2seq = {} 264 | for eoa, seq in eoa2seq.items(): 265 | filter_num = int(ratio * len(seq)) 266 | 267 | if len(seq) <= 2: 268 | new_eoa2seq[eoa] = seq 269 | 270 | else: 271 | remain_idx = set(np.random.choice(range(len(seq)), len(seq) - filter_num, replace=False)) 272 | new_seq = [] 273 | 274 | for id in remain_idx: 275 | new_seq.append(seq[id]) 276 | 277 | new_seq = sorted(new_seq, key=functools.cmp_to_key(cmp_udf_reverse)) 278 | new_eoa2seq[eoa] = new_seq 279 | 280 | return new_eoa2seq 281 | 282 | 283 | 284 | if __name__ == '__main__': 285 | 286 | # load label 287 | phisher_account = pd.read_csv("../Data/phisher_account.txt", names=["account"]) 288 | phisher_account_set = set(phisher_account.account.values) 289 | 290 | # load vocab 291 | vocab_file_name = FLAGS.data_dir + FLAGS.vocab_filename 292 | with open(vocab_file_name, "rb") as f: 293 | vocab = pkl.load(f) 294 | 295 | with open("./data/eoa2seq.pkl", "rb") as f: 296 | eoa2seq = pkl.load(f) 297 | 298 | print("number of target user account:", len(eoa2seq)) 299 | 300 | if FLAGS.total_drop: 301 | eoa2seq = total_repeat_drop(eoa2seq) 302 | 303 | eoa_list = list(eoa2seq.keys()) 304 | rng.shuffle(eoa_list) 305 | idx = round(len(eoa_list) * 0.7) 306 | train_eoa_list = set(eoa_list[:idx]) 307 | test_eoa_list = set(eoa_list[idx:]) 308 | print("------------------") 309 | print(len(train_eoa_list.intersection(test_eoa_list))) 310 | 311 | label_list = [] 312 | # clip and add label 313 | def is_phish(address): 314 | if address in phisher_account_set: 315 | return 1.0 316 | else: 317 | return 0.0 318 | 319 | max_num_tokens = FLAGS.max_seq_length - 1 320 | seqs = [] 321 | idx = 0 322 | for eoa, seq in eoa2seq.items(): 323 | if len(seq) <= max_num_tokens: 324 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 325 | seqs[idx] += seq 326 | idx += 1 327 | label_list.append(is_phish(eoa)) 328 | 329 | elif len(seq) > max_num_tokens: 330 | beg_idx = list(range(len(seq) - max_num_tokens, 0, -1 * SLIDING_STEP)) 331 | beg_idx.append(0) 332 | 333 | if len(beg_idx) > 500: 334 | beg_idx = list(np.random.permutation(beg_idx)[:500]) 335 | for i in beg_idx: 336 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 337 | seqs[idx] += seq[i:i + max_num_tokens] 338 | idx += 1 339 | label_list.append(is_phish(eoa)) 340 | 341 | else: 342 | for i in beg_idx[::-1]: 343 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 344 | seqs[idx] += seq[i:i + max_num_tokens] 345 | idx += 1 346 | label_list.append(is_phish(eoa)) 347 | 348 | # split into training and testing sequences 349 | train_seqs = [] 350 | test_seqs = [] 351 | train_label_list = [] 352 | test_label_list = [] 353 | print("Splitting the sequence..") 354 | for i in tqdm(range(len(seqs))): 355 | seq = seqs[i] 356 | label = label_list[i] 357 | if seq[0][0] in train_eoa_list: 358 | train_seqs.append(seq) 359 | train_label_list.append(label) 360 | elif seq[0][0] in test_eoa_list: 361 | test_seqs.append(seq) 362 | test_label_list.append(label) 363 | 364 | print("Generating training samples..") 365 | train_phish_instance = gen_finetune_samples(train_seqs, train_label_list) 366 | rng.shuffle(train_phish_instance) 367 | 368 | print("Generating testing samples..") 369 | test_phish_instance = 
gen_finetune_samples(test_seqs, test_label_list) 370 | rng.shuffle(test_phish_instance) 371 | 372 | print("*** Writing to output files ***") 373 | output_filename = FLAGS.data_dir + "finetune_train.tfrecord" 374 | print(" %s", output_filename) 375 | 376 | write_finetune_instance_to_example_files(train_phish_instance, FLAGS.max_seq_length, vocab, [output_filename]) 377 | 378 | print("*** Writing to output files ***") 379 | output_filename = FLAGS.data_dir + "finetune_test.tfrecord" 380 | print(" %s", output_filename) 381 | 382 | write_finetune_instance_to_example_files(test_phish_instance, FLAGS.max_seq_length, vocab, [output_filename]) 383 | print("Finished..") 384 | -------------------------------------------------------------------------------- /Model/gen_pretrain_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pickle as pkl 4 | from tqdm import tqdm 5 | import collections 6 | import functools 7 | import random 8 | import tensorflow.compat.v1 as tf 9 | 10 | tf.disable_v2_behavior() 11 | 12 | import six 13 | import time 14 | import math 15 | from vocab import FreqVocab 16 | 17 | tf.logging.set_verbosity(tf.logging.INFO) 18 | 19 | random_seed = 12345 20 | rng = random.Random(random_seed) 21 | 22 | short_seq_prob = 0 # Probability of creating sequences which are shorter than the maximum length。 23 | flags = tf.flags 24 | FLAGS = flags.FLAGS 25 | 26 | flags.DEFINE_integer("pool_size", 10, "multiprocesses pool size.") 27 | flags.DEFINE_integer("max_seq_length", 29, "max sequence length.") 28 | flags.DEFINE_float("masked_lm_prob", 0.8, "Masked LM probability.") 29 | flags.DEFINE_float("mask_prob", 1.0, "mask probabaility") 30 | flags.DEFINE_bool("do_eval", False, "") 31 | flags.DEFINE_bool("do_embed", True, "") 32 | flags.DEFINE_integer("dupe_factor", 10, "Number of times to duplicate the input data (with different masks).") 33 | flags.DEFINE_string("data_dir", './data/', "data dir.") 34 | flags.DEFINE_string("vocab_filename", "vocab", "vocab filename") 35 | flags.DEFINE_bool("total_drop", True, "whether to drop") 36 | flags.DEFINE_bool("drop", False, "whether to drop") 37 | 38 | HEADER = 'hash,nonce,block_hash,block_number,transaction_index,from_address,to_address,value,gas,gas_price,input,block_timestamp,max_fee_per_gas,max_priority_fee_per_gas,transaction_type'.split( 39 | ",") 40 | 41 | MaskedLmInstance = collections.namedtuple("MaskedLmInstance", 42 | ["index", "label"]) 43 | 44 | MAX_PREDICTIONS_PER_SEQ = math.ceil(FLAGS.max_seq_length * FLAGS.masked_lm_prob) 45 | SLIDING_STEP = round(FLAGS.max_seq_length * 0.6) 46 | 47 | print("MAX_SEQUENCE_LENGTH:", FLAGS.max_seq_length) 48 | print("MAX_PREDICTIONS_PER_SEQ:", MAX_PREDICTIONS_PER_SEQ) 49 | print("SLIDING_STEP:", SLIDING_STEP) 50 | 51 | class TrainingInstance(object): 52 | """A single training instance (sentence pair).""" 53 | 54 | def __init__(self, address, tokens, masked_lm_positions, masked_lm_labels): 55 | 56 | self.address = [address] 57 | self.tokens = list(map(lambda x: x[0], tokens)) 58 | self.block_timestamps = list(map(lambda x: x[2], tokens)) 59 | self.values = list(map(lambda x: x[3], tokens)) 60 | 61 | def map_io_flag(token): 62 | flag = token[4] 63 | if flag == "OUT": 64 | return 1 65 | elif flag == "IN": 66 | return 2 67 | else: 68 | return 0 69 | 70 | self.io_flags = list(map(map_io_flag, tokens)) 71 | self.cnts = list(map(lambda x: x[5], tokens)) 72 | self.masked_lm_positions = masked_lm_positions 73 | 
self.masked_lm_labels = masked_lm_labels 74 | 75 | def __str__(self): 76 | s = "address: %s\n" % (self.address[0]) 77 | s += "tokens: %s\n" % ( 78 | " ".join([printable_text(x) for x in self.tokens])) 79 | s += "masked_lm_positions: %s\n" % ( 80 | " ".join([str(x) for x in self.masked_lm_positions])) 81 | s += "masked_lm_labels: %s\n" % ( 82 | " ".join([printable_text(x) for x in self.masked_lm_labels])) 83 | s += "\n" 84 | return s 85 | 86 | def __repr__(self): 87 | return self.__str__() 88 | 89 | 90 | def printable_text(text): 91 | """Returns text encoded in a way suitable for print or `tf.logging`.""" 92 | 93 | # These functions want `str` for both Python2 and Python3, but in one case 94 | # it's a Unicode string and in the other it's a byte string. 95 | if six.PY3: 96 | if isinstance(text, str): 97 | return text 98 | elif isinstance(text, bytes): 99 | return text.decode("utf-8", "ignore") 100 | else: 101 | raise ValueError("Unsupported string type: %s" % (type(text))) 102 | elif six.PY2: 103 | if isinstance(text, str): 104 | return text 105 | elif isinstance(text, unicode): 106 | return text.encode("utf-8") 107 | else: 108 | raise ValueError("Unsupported string type: %s" % (type(text))) 109 | else: 110 | raise ValueError("Not running on Python2 or Python 3?") 111 | 112 | 113 | def create_int_feature(values): 114 | feature = tf.train.Feature( 115 | int64_list=tf.train.Int64List(value=list(values))) 116 | return feature 117 | 118 | 119 | def create_float_feature(values): 120 | feature = tf.train.Feature( 121 | float_list=tf.train.FloatList(value=list(values))) 122 | return feature 123 | 124 | 125 | def gen_samples(sequences, 126 | dupe_factor, 127 | masked_lm_prob, 128 | max_predictions_per_seq, 129 | pool_size, 130 | rng, 131 | force_head=False): 132 | instances = [] 133 | # create train 134 | if force_head: 135 | for step in range(dupe_factor): 136 | start = time.time() 137 | for tokens in sequences: 138 | (address, tokens, masked_lm_positions, 139 | masked_lm_labels) = create_masked_lm_predictions_force_head(tokens) 140 | instance = TrainingInstance( 141 | address=address, 142 | tokens=tokens, 143 | masked_lm_positions=masked_lm_positions, 144 | masked_lm_labels=masked_lm_labels) 145 | instances.append(instance) 146 | end = time.time() 147 | cost = end - start 148 | print("step=%d, time=%.2f" % (step, cost)) 149 | print("=======Finish========") 150 | 151 | else: 152 | for step in range(dupe_factor): 153 | start = time.time() 154 | for tokens in sequences: 155 | (address, tokens, masked_lm_positions, 156 | masked_lm_labels) = create_masked_lm_predictions( 157 | tokens, masked_lm_prob, max_predictions_per_seq, rng) 158 | instance = TrainingInstance( 159 | address=address, 160 | tokens=tokens, 161 | masked_lm_positions=masked_lm_positions, 162 | masked_lm_labels=masked_lm_labels) 163 | instances.append(instance) 164 | end = time.time() 165 | cost = end - start 166 | print("step=%d, time=%.2f" % (step, cost)) 167 | print("=======Finish========") 168 | return instances 169 | 170 | 171 | def create_masked_lm_predictions_force_head(tokens): 172 | """Creates the predictions for the masked LM objective.""" 173 | first_index = 0 174 | address = tokens[0][0] 175 | output_tokens = [list(i) for i in tokens] # note that change the value of output_tokens will also change tokens 176 | output_tokens[first_index] = ["[MASK]", 0, 0, 0, 0, 0] 177 | masked_lm_positions = [first_index] 178 | masked_lm_labels = [tokens[first_index][0]] 179 | 180 | return (address, output_tokens, masked_lm_positions, 
masked_lm_labels) 181 | 182 | 183 | def create_masked_lm_predictions(tokens, masked_lm_prob, 184 | max_predictions_per_seq, rng): 185 | """Creates the predictions for the masked LM objective.""" 186 | 187 | address = tokens[0][0] 188 | cand_indexes = [] 189 | for (i, token) in enumerate(tokens): 190 | cand_indexes.append(i) 191 | 192 | rng.shuffle(cand_indexes) 193 | output_tokens = [list(i) for i in tokens] # note that change the value of output_tokens will also change tokens 194 | num_to_predict = min(max_predictions_per_seq, 195 | max(1, int(len(tokens) * masked_lm_prob))) 196 | masked_lms = [] 197 | covered_indexes = set() 198 | for index in cand_indexes: 199 | if len(masked_lms) >= num_to_predict: 200 | break 201 | if index in covered_indexes: 202 | continue 203 | covered_indexes.add(index) 204 | masked_token = "[MASK]" 205 | masked_lms.append(MaskedLmInstance(index=index, label=tokens[index][0])) 206 | output_tokens[index][0] = masked_token 207 | 208 | masked_lms = sorted(masked_lms, key=lambda x: x.index) 209 | masked_lm_positions = [] 210 | masked_lm_labels = [] 211 | for p in masked_lms: 212 | masked_lm_positions.append(p.index) 213 | masked_lm_labels.append(p.label) 214 | return (address, output_tokens, masked_lm_positions, masked_lm_labels) 215 | 216 | 217 | def create_embedding_predictions(tokens): 218 | """Creates the predictions for the masked LM objective.""" 219 | address = tokens[0][0] 220 | output_tokens = tokens 221 | masked_lm_positions = [] 222 | masked_lm_labels = [] 223 | return (address, output_tokens, masked_lm_positions, masked_lm_labels) 224 | 225 | 226 | def gen_embedding_samples(sequences): 227 | instances = [] 228 | # create train 229 | start = time.time() 230 | for tokens in sequences: 231 | (address, tokens, masked_lm_positions, 232 | masked_lm_labels) = create_embedding_predictions(tokens) 233 | instance = TrainingInstance( 234 | address=address, 235 | tokens=tokens, 236 | masked_lm_positions=masked_lm_positions, 237 | masked_lm_labels=masked_lm_labels) 238 | instances.append(instance) 239 | 240 | end = time.time() 241 | print("=======Finish========") 242 | print("cost time:%.2f" % (end - start)) 243 | return instances 244 | 245 | 246 | def convert_timestamp_to_position(block_timestamps): 247 | position = [0] 248 | if len(block_timestamps) <= 1: 249 | return position 250 | last_ts = block_timestamps[1] 251 | idx = 1 252 | for b_ts in block_timestamps[1:]: 253 | if b_ts != last_ts: 254 | last_ts = b_ts 255 | idx += 1 256 | position.append(idx) 257 | return position 258 | 259 | 260 | def write_instance_to_example_files(instances, max_seq_length, 261 | max_predictions_per_seq, vocab, 262 | output_files): 263 | """Create TF example files from `TrainingInstance`s.""" 264 | writers = [] 265 | for output_file in output_files: 266 | writers.append(tf.python_io.TFRecordWriter(output_file)) 267 | 268 | writer_index = 0 269 | total_written = 0 270 | 271 | for inst_index in tqdm(range(len(instances))): 272 | instance = instances[inst_index] 273 | input_ids = vocab.convert_tokens_to_ids(instance.tokens) 274 | address = vocab.convert_tokens_to_ids(instance.address) 275 | counts = instance.cnts 276 | block_timestamps = instance.block_timestamps 277 | values = instance.values 278 | io_flags = instance.io_flags 279 | positions = convert_timestamp_to_position(block_timestamps) 280 | 281 | input_mask = [1] * len(input_ids) 282 | assert len(input_ids) <= max_seq_length 283 | assert len(counts) <= max_seq_length 284 | assert len(values) <= max_seq_length 285 | assert 
len(io_flags) <= max_seq_length 286 | assert len(positions) <= max_seq_length 287 | 288 | input_ids += [0] * (max_seq_length - len(input_ids)) 289 | counts += [0] * (max_seq_length - len(counts)) 290 | values += [0] * (max_seq_length - len(values)) 291 | io_flags += [0] * (max_seq_length - len(io_flags)) 292 | positions += [0] * (max_seq_length - len(positions)) 293 | input_mask += [0] * (max_seq_length - len(input_mask)) 294 | 295 | assert len(input_ids) == max_seq_length 296 | assert len(counts) == max_seq_length 297 | assert len(values) == max_seq_length 298 | assert len(io_flags) == max_seq_length 299 | assert len(positions) == max_seq_length 300 | assert len(input_mask) == max_seq_length 301 | 302 | masked_lm_positions = list(instance.masked_lm_positions) 303 | masked_lm_ids = vocab.convert_tokens_to_ids(instance.masked_lm_labels) 304 | masked_lm_weights = [1.0] * len(masked_lm_ids) 305 | 306 | masked_lm_positions += [0] * (max_predictions_per_seq - len(masked_lm_positions)) 307 | masked_lm_ids += [0] * (max_predictions_per_seq - len(masked_lm_ids)) 308 | masked_lm_weights += [0.0] * (max_predictions_per_seq - len(masked_lm_weights)) 309 | 310 | features = collections.OrderedDict() 311 | features["address"] = create_int_feature(address) 312 | features["input_ids"] = create_int_feature(input_ids) 313 | features["input_positions"] = create_int_feature(positions) 314 | features["input_counts"] = create_int_feature(counts) 315 | features["input_io_flags"] = create_int_feature(io_flags) 316 | features["input_values"] = create_int_feature(values) 317 | 318 | features["input_mask"] = create_int_feature(input_mask) 319 | features["masked_lm_positions"] = create_int_feature(masked_lm_positions) 320 | features["masked_lm_ids"] = create_int_feature(masked_lm_ids) 321 | features["masked_lm_weights"] = create_float_feature(masked_lm_weights) 322 | 323 | tf_example = tf.train.Example( 324 | features=tf.train.Features(feature=features)) 325 | 326 | writers[writer_index].write(tf_example.SerializeToString()) 327 | writer_index = (writer_index + 1) % len(writers) 328 | 329 | total_written += 1 330 | 331 | if inst_index < 3: 332 | tf.logging.info("*** Example ***") 333 | tf.logging.info("tokens: %s" % " ".join( 334 | [printable_text(x) for x in instance.tokens])) 335 | 336 | for feature_name in features.keys(): 337 | feature = features[feature_name] 338 | values = [] 339 | if feature.int64_list.value: 340 | values = feature.int64_list.value 341 | elif feature.float_list.value: 342 | values = feature.float_list.value 343 | tf.logging.info("%s: %s" % (feature_name, 344 | " ".join([str(x) 345 | for x in values]))) 346 | 347 | for writer in writers: 348 | writer.close() 349 | 350 | tf.logging.info("Wrote %d total instances", total_written) 351 | 352 | 353 | 354 | def total_repeat_drop(eoa2seq): 355 | """ 356 | totally drop the repeat transaction based on time. 
357 | """ 358 | new_eoa2seq = {} 359 | for eoa, seq in eoa2seq.items(): 360 | new_seq = [] 361 | exist_addr = set() 362 | for trans in seq: 363 | if trans[0] not in exist_addr: 364 | exist_addr.add(trans[0]) 365 | new_seq.append(trans) 366 | 367 | new_eoa2seq[eoa] = new_seq 368 | 369 | return new_eoa2seq 370 | 371 | 372 | def cmp_udf_reverse(x1, x2): 373 | time1 = int(x1[2]) 374 | time2 = int(x2[2]) 375 | 376 | if time1 < time2: 377 | return 1 378 | elif time1 > time2: 379 | return -1 380 | else: 381 | return 0 382 | 383 | 384 | def main(): 385 | vocab = FreqVocab() 386 | print("===========Load Sequence===========") 387 | with open("./data/eoa2seq.pkl", "rb") as f: 388 | eoa2seq = pkl.load(f) 389 | 390 | print("number of target user account:", len(eoa2seq)) 391 | vocab.update(eoa2seq) 392 | # generate mapping 393 | vocab.generate_vocab() 394 | 395 | # save vocab 396 | print("token_size:{}".format(len(vocab.vocab_words))) 397 | vocab_file_name = FLAGS.data_dir + FLAGS.vocab_filename 398 | print('vocab pickle file: ' + vocab_file_name) 399 | with open(vocab_file_name, 'wb') as output_file: 400 | pkl.dump(vocab, output_file, protocol=2) 401 | 402 | print("===========Original===========") 403 | length_list = [] 404 | for eoa in eoa2seq.keys(): 405 | seq = eoa2seq[eoa] 406 | length_list.append(len(seq)) 407 | 408 | length_list = np.array(length_list) 409 | print("Median:", np.median(length_list)) 410 | print("Mean:", np.mean(length_list)) 411 | print("Seq num:", len(length_list)) 412 | 413 | if FLAGS.total_drop: 414 | eoa2seq = total_repeat_drop(eoa2seq) 415 | 416 | print("==========After Reduce==========") 417 | length_list = [] 418 | for eoa in eoa2seq.keys(): 419 | seq = eoa2seq[eoa] 420 | length_list.append(len(seq)) 421 | 422 | length_list = np.array(length_list) 423 | print("Median:", np.median(length_list)) 424 | print("Mean:", np.mean(length_list)) 425 | print("Seq num:", len(length_list)) 426 | 427 | # clip 428 | max_num_tokens = FLAGS.max_seq_length - 1 429 | seqs = [] 430 | idx = 0 431 | for eoa, seq in eoa2seq.items(): 432 | if len(seq) <= max_num_tokens: 433 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 434 | seqs[idx] += seq 435 | idx += 1 436 | elif len(seq) > max_num_tokens: 437 | beg_idx = list(range(len(seq) - max_num_tokens, 0, -1 * SLIDING_STEP)) 438 | beg_idx.append(0) 439 | 440 | if len(beg_idx) > 500: 441 | beg_idx = list(np.random.permutation(beg_idx)[:500]) 442 | for i in beg_idx: 443 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 444 | seqs[idx] += seq[i:i + max_num_tokens] 445 | idx += 1 446 | 447 | else: 448 | for i in beg_idx[::-1]: 449 | seqs.append([[eoa, 0, 0, 0, 0, 0]]) 450 | seqs[idx] += seq[i:i + max_num_tokens] 451 | idx += 1 452 | 453 | if FLAGS.do_embed: 454 | print("===========Generate Embedding Samples==========") 455 | write_instance = gen_embedding_samples(seqs) 456 | output_filename = FLAGS.data_dir + "embed.tfrecord" 457 | tf.logging.info("*** Writing to output embedding files ***") 458 | tf.logging.info(" %s", output_filename) 459 | 460 | write_instance_to_example_files(write_instance, FLAGS.max_seq_length, 461 | MAX_PREDICTIONS_PER_SEQ, vocab, 462 | [output_filename]) 463 | 464 | seqs = np.random.permutation(seqs) 465 | 466 | if FLAGS.do_eval: # select 20% for testing 467 | print("========Generate Evaluation Samples========") 468 | eval_seqs = seqs[:round(len(seqs) * 0.2)] 469 | seqs = seqs[round(len(seqs) * 0.2):] 470 | 471 | eval_normal_instances = gen_samples(eval_seqs, 472 | dupe_factor=FLAGS.dupe_factor, 473 | masked_lm_prob=FLAGS.masked_lm_prob, 474 | 
max_predictions_per_seq=MAX_PREDICTIONS_PER_SEQ, 475 | pool_size=FLAGS.pool_size, 476 | rng=rng, 477 | force_head=False) 478 | 479 | eval_write_instance = eval_normal_instances 480 | rng.shuffle(eval_write_instance) 481 | eval_output_filename = FLAGS.data_dir + "test.tfrecord" 482 | tf.logging.info("*** Writing to Testing files ***") 483 | tf.logging.info(" %s", eval_output_filename) 484 | 485 | write_instance_to_example_files(eval_write_instance, FLAGS.max_seq_length, 486 | MAX_PREDICTIONS_PER_SEQ, vocab, 487 | [eval_output_filename]) 488 | 489 | print("========Generate Training Samples========") 490 | normal_instances = gen_samples(seqs, 491 | dupe_factor=FLAGS.dupe_factor, 492 | masked_lm_prob=FLAGS.masked_lm_prob, 493 | max_predictions_per_seq=MAX_PREDICTIONS_PER_SEQ, 494 | pool_size=FLAGS.pool_size, 495 | rng=rng, 496 | force_head=False) 497 | 498 | write_instance = normal_instances 499 | rng.shuffle(write_instance) 500 | 501 | output_filename = FLAGS.data_dir + "train.tfrecord" 502 | tf.logging.info("*** Writing to Training files ***") 503 | tf.logging.info(" %s", output_filename) 504 | 505 | write_instance_to_example_files(write_instance, FLAGS.max_seq_length, 506 | MAX_PREDICTIONS_PER_SEQ, vocab, 507 | [output_filename]) 508 | 509 | return 510 | 511 | 512 | if __name__ == '__main__': 513 | main() 514 | 515 | 516 | -------------------------------------------------------------------------------- /Model/gen_seq.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pickle as pkl 4 | import functools 5 | import os 6 | from vocab import FreqVocab 7 | import tensorflow.compat.v1 as tf 8 | tf.disable_v2_behavior() 9 | 10 | flags = tf.flags 11 | FLAGS = flags.FLAGS 12 | 13 | flags.DEFINE_bool("phisher", True, "whether to include phisher detection dataset.") 14 | flags.DEFINE_string("data_dir", "../../Data", "data directory.") 15 | flags.DEFINE_string("dataset", "1M", "which dataset to use") 16 | flags.DEFINE_bool("dup", True, "whether to do transaction duplication") 17 | 18 | 19 | HEADER = 'hash,nonce,block_hash,block_number,transaction_index,from_address,to_address,value,gas,gas_price,input,block_timestamp,max_fee_per_gas,max_priority_fee_per_gas,transaction_type'.split(",") 20 | 21 | def cmp_udf(x1, x2): 22 | time1 = int(x1[2]) 23 | time2 = int(x2[2]) 24 | if time1 < time2: 25 | return -1 26 | elif time1 > time2: 27 | return 1 28 | else: 29 | return 0 30 | 31 | def cmp_udf_reverse(x1, x2): 32 | time1 = int(x1[2]) 33 | time2 = int(x2[2]) 34 | 35 | if time1 < time2: 36 | return 1 37 | elif time1 > time2: 38 | return -1 39 | else: 40 | return 0 41 | 42 | def load_data(f_in, f_out): 43 | eoa2seq_out = {} 44 | error_trans = [] 45 | while True: 46 | trans = f_out.readline() 47 | if trans == "": 48 | break 49 | record = trans.split(",") 50 | trans_hash = record[0] 51 | block_number = int(record[3]) 52 | from_address = record[5] 53 | to_address = record[6] 54 | value = int(record[7]) / (pow(10, 12)) 55 | gas = int(record[8]) 56 | gas_price = int(record[9]) 57 | block_timestamp = int(record[11]) 58 | if from_address == "" or to_address == "": 59 | error_trans.append(trans) 60 | continue 61 | try: 62 | eoa2seq_out[from_address].append([to_address, block_number, block_timestamp, value, "OUT", 1]) 63 | except: 64 | eoa2seq_out[from_address] = [[to_address, block_number, block_timestamp, value, "OUT", 1]] 65 | 66 | eoa2seq_in = {} 67 | while True: 68 | trans = f_in.readline() 69 | if trans == "": 70 | break 71 | 
record = trans.split(",") 72 | block_number = int(record[3]) 73 | from_address = record[5] 74 | to_address = record[6] 75 | value = int(record[7]) / (pow(10, 12)) 76 | gas = int(record[8]) 77 | gas_price = int(record[9]) 78 | block_timestamp = int(record[11]) 79 | if from_address == "" or to_address == "": 80 | error_trans.append(trans) 81 | continue 82 | try: 83 | eoa2seq_in[to_address].append([from_address, block_number, block_timestamp, value, "IN", 1]) # not process trans 84 | except: 85 | eoa2seq_in[to_address] = [[from_address, block_number, block_timestamp, value, "IN", 1]] # in/out, cnt 86 | return eoa2seq_in, eoa2seq_out 87 | 88 | def seq_duplicate(eoa2seq_in, eoa2seq_out): 89 | eoa2seq_agg_in = {} 90 | for eoa in eoa2seq_in.keys(): 91 | if len(eoa2seq_in[eoa]) >= 10000: 92 | continue 93 | seq_sorted = sorted(eoa2seq_in[eoa], key=functools.cmp_to_key(cmp_udf)) 94 | seq_tmp = [e.copy() for e in seq_sorted] 95 | for i in range(len(seq_tmp) - 1, 0, -1): 96 | l_acc = seq_tmp[i][0] # latter 97 | f_acc = seq_tmp[i - 1][0] # former 98 | l_time = int(seq_tmp[i][2]) 99 | f_time = int(seq_tmp[i - 1][2]) 100 | delta_time = l_time - f_time 101 | if f_acc != l_acc or delta_time > 86400 * 3: 102 | continue 103 | # value add 104 | seq_tmp[i - 1][3] += seq_tmp[i][3] 105 | seq_tmp[i - 1][5] += seq_tmp[i][5] 106 | del seq_tmp[i] 107 | eoa2seq_agg_in[eoa] = seq_tmp 108 | 109 | eoa2seq_agg_out = {} 110 | for eoa in eoa2seq_out.keys(): 111 | if len(eoa2seq_out[eoa])>=10000: 112 | continue 113 | seq_sorted = sorted(eoa2seq_out[eoa], key=functools.cmp_to_key(cmp_udf)) 114 | seq_tmp = [e.copy() for e in seq_sorted] 115 | for i in range(len(seq_tmp) - 1, 0, -1): 116 | l_acc = seq_tmp[i][0] # latter 117 | f_acc = seq_tmp[i - 1][0] # former 118 | l_time = int(seq_tmp[i][2]) 119 | f_time = int(seq_tmp[i - 1][2]) 120 | delta_time = l_time - f_time 121 | if f_acc != l_acc or delta_time > 86400 * 3: 122 | continue 123 | # value add 124 | seq_tmp[i - 1][3] += seq_tmp[i][3] 125 | seq_tmp[i - 1][5] += seq_tmp[i][5] 126 | del seq_tmp[i] 127 | eoa2seq_agg_out[eoa] = seq_tmp 128 | 129 | eoa_list = list(eoa2seq_agg_out.keys()) # eoa_list must include eoa account only (i.e., have out transaction at least) 130 | eoa2seq_agg = {} 131 | 132 | for eoa in eoa_list: 133 | out_seq = eoa2seq_agg_out[eoa] 134 | try: 135 | in_seq = eoa2seq_agg_in[eoa] 136 | except: 137 | in_seq = [] 138 | 139 | seq_agg = sorted(out_seq + in_seq, key=functools.cmp_to_key(cmp_udf_reverse)) 140 | cnt_all = 0 141 | for trans in seq_agg: 142 | cnt_all += trans[5] 143 | if cnt_all >= 5 and cnt_all<=10000: 144 | # if cnt_all > 2 and cnt_all<=10000: 145 | eoa2seq_agg[eoa] = seq_agg 146 | break 147 | 148 | return eoa2seq_agg 149 | 150 | def seq_generation(eoa2seq_in, eoa2seq_out): 151 | 152 | eoa_list = list(eoa2seq_out.keys()) # eoa_list must include eoa account only (i.e., have out transaction at least) 153 | eoa2seq = {} 154 | for eoa in eoa_list: 155 | out_seq = eoa2seq_out[eoa] 156 | try: 157 | in_seq = eoa2seq_in[eoa] 158 | except: 159 | in_seq = [] 160 | seq_agg = sorted(out_seq + in_seq, key=functools.cmp_to_key(cmp_udf_reverse)) 161 | cnt_all = 0 162 | for trans in seq_agg: 163 | cnt_all += 1 164 | if cnt_all >= 5 and cnt_all<=10000: 165 | # if cnt_all > 2 and cnt_all<=10000: 166 | eoa2seq[eoa] = seq_agg 167 | break 168 | 169 | return eoa2seq 170 | 171 | def feature_bucketization(eoa2seq_agg): 172 | 173 | for eoa in eoa2seq_agg.keys(): 174 | seq = eoa2seq_agg[eoa] 175 | for trans in seq: 176 | amount = trans[3] 177 | cnt = trans[5] 178 | 179 | 
if amount == 0: 180 | amount_bucket = 1 181 | elif amount<= 591: 182 | amount_bucket = 2 183 | elif amount<= 6195: 184 | amount_bucket = 3 185 | elif amount <= 21255: 186 | amount_bucket = 4 187 | elif amount <= 50161: 188 | amount_bucket = 5 189 | elif amount <= 100120: 190 | amount_bucket = 6 191 | elif amount <= 208727: 192 | amount_bucket = 7 193 | elif amount <= 508961: 194 | amount_bucket = 8 195 | elif amount <= 1360574: 196 | amount_bucket = 9 197 | elif amount <= 6500000: 198 | amount_bucket = 10 199 | elif amount <= 143791433950: 200 | amount_bucket = 11 201 | else: 202 | amount_bucket = 12 203 | 204 | trans[3] = amount_bucket 205 | 206 | if cnt == 0: 207 | cnt_bucket = 0 208 | elif cnt == 1: 209 | cnt_bucket = 1 210 | elif cnt == 2: 211 | cnt_bucket = 2 212 | elif cnt == 3: 213 | cnt_bucket = 3 214 | elif cnt == 4: 215 | cnt_bucket = 4 216 | elif cnt == 5: 217 | cnt_bucket = 5 218 | elif cnt == 6: 219 | cnt_bucket = 6 220 | elif cnt == 7: 221 | cnt_bucket = 7 222 | elif 8 < cnt <= 10: 223 | cnt_bucket = 8 224 | elif 10 < cnt <= 20: 225 | cnt_bucket = 9 226 | else: 227 | cnt_bucket = 10 228 | 229 | trans[5] = cnt_bucket 230 | 231 | return eoa2seq_agg 232 | 233 | def main(): 234 | 235 | if FLAGS.dataset in ("1000K", "1M"): 236 | f_in = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_in_slice_1000K.csv"), "r") 237 | f_out = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_out_slice_1000K.csv"), "r") 238 | 239 | elif FLAGS.dataset in ("3000K", "3M"): 240 | f_in = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_in_slice_3000K.csv"), "r") 241 | f_out = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_out_slice_3000K.csv"), "r") 242 | 243 | elif FLAGS.dataset in ("10M"): 244 | f_in = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_in_slice.csv"), "r") 245 | f_out = open(os.path.join(FLAGS.data_dir, "normal_eoa_transaction_out_slice.csv"), "r") 246 | 247 | else: 248 | raise ValueError("Please choose right dataset") 249 | 250 | print("Add normal " + FLAGS.dataset) 251 | 252 | eoa2seq_in, eoa2seq_out = load_data(f_in, f_out) 253 | 254 | if FLAGS.dup: 255 | eoa2seq_agg = seq_duplicate(eoa2seq_in, eoa2seq_out) 256 | else: 257 | eoa2seq_agg = seq_generation(eoa2seq_in, eoa2seq_out) 258 | 259 | if FLAGS.phisher: 260 | print("Add phishing..") 261 | phisher_f_in = open(os.path.join(FLAGS.data_dir, "phisher_transaction_in.csv"), "r") 262 | phisher_f_out = open(os.path.join(FLAGS.data_dir, "phisher_transaction_out.csv"), "r") 263 | phisher_eoa2seq_in, phisher_eoa2seq_out = load_data(phisher_f_in, phisher_f_out) 264 | 265 | if FLAGS.dup: 266 | phisher_eoa2seq_agg = seq_duplicate(phisher_eoa2seq_in, phisher_eoa2seq_out) 267 | else: 268 | phisher_eoa2seq_agg = seq_generation(phisher_eoa2seq_in, phisher_eoa2seq_out) 269 | 270 | eoa2seq_agg.update(phisher_eoa2seq_agg) 271 | 272 | eoa2seq_agg = feature_bucketization(eoa2seq_agg) 273 | 274 | print("statistics:") 275 | length_list = [] 276 | for eoa in eoa2seq_agg.keys(): 277 | seq = eoa2seq_agg[eoa] 278 | length_list.append(len(seq)) 279 | 280 | length_list = np.array(length_list) 281 | print("Median:", np.median(length_list)) 282 | print("Mean:", np.mean(length_list)) 283 | print("Seq #:", len(length_list)) 284 | 285 | tf.gfile.MakeDirs("./data") 286 | 287 | with open("./data/eoa2seq.pkl", "wb") as f: 288 | pkl.dump(eoa2seq_agg, f) 289 | 290 | 291 | print("pause") 292 | 293 | if __name__ == '__main__': 294 | main() -------------------------------------------------------------------------------- 
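Before moving on to modeling.py, here is a minimal, self-contained sketch of the aggregation rule that seq_duplicate() above implements: consecutive transactions with the same counterparty that fall within a three-day window are merged, summing their transferred value and count. The helper name merge_repeats, its signature, and the demo rows are illustrative only and are not part of the repository.

# Token layout produced by load_data():
#   [counterparty, block_number, block_timestamp, value, "IN"/"OUT", count]

def merge_repeats(seq, window=3 * 86400):
    """Merge adjacent same-counterparty transactions that occur within `window` seconds."""
    seq = sorted((list(t) for t in seq), key=lambda t: int(t[2]))  # oldest first
    merged = []
    prev_ts = None
    for trans in seq:
        if merged and merged[-1][0] == trans[0] and int(trans[2]) - prev_ts <= window:
            merged[-1][3] += trans[3]  # accumulate transferred value
            merged[-1][5] += trans[5]  # accumulate transaction count
        else:
            merged.append(trans)
        prev_ts = int(trans[2])
    return merged

if __name__ == "__main__":
    demo = [
        ["0xabc", 100, 1600000000, 1.5, "OUT", 1],
        ["0xabc", 101, 1600050000, 2.0, "OUT", 1],  # same counterparty, well under 3 days later
        ["0xdef", 102, 1601000000, 0.7, "IN", 1],
    ]
    print(merge_repeats(demo))
    # [['0xabc', 100, 1600000000, 3.5, 'OUT', 2], ['0xdef', 102, 1601000000, 0.7, 'IN', 1]]

In the repository this merging is applied to the in- and out-sequences separately; the combined sequence is then ordered newest-first via cmp_udf_reverse, the value and count fields are discretized by feature_bucketization, and the result is pickled to ./data/eoa2seq.pkl for gen_pretrain_data.py and gen_finetune_data.py to consume.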
/Model/modeling.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """The main BERT model and related functions.""" 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import collections 8 | import copy 9 | import json 10 | import math 11 | import re 12 | import six 13 | import numpy as np 14 | # import tensorflow as tf 15 | import tensorflow.compat.v1 as tf 16 | tf.disable_v2_behavior() 17 | 18 | 19 | class BertConfig(object): 20 | """Configuration for `BertModel`.""" 21 | 22 | def __init__(self, 23 | vocab_size, 24 | hidden_size=768, 25 | num_hidden_layers=12, 26 | num_attention_heads=12, 27 | intermediate_size=3072, 28 | hidden_act="gelu", 29 | hidden_dropout_prob=0.1, 30 | attention_probs_dropout_prob=0.1, 31 | max_position_embeddings=512, 32 | type_vocab_size=16, 33 | initializer_range=0.02): 34 | """Constructs BertConfig. 35 | 36 | Args: 37 | vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. 38 | hidden_size: Size of the encoder layers and the pooler layer. 39 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 40 | num_attention_heads: Number of attention heads for each attention layer in 41 | the Transformer encoder. 42 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 43 | layer in the Transformer encoder. 44 | hidden_act: The non-linear activation function (function or string) in the 45 | encoder and pooler. 46 | hidden_dropout_prob: The dropout probability for all fully connected 47 | layers in the embeddings, encoder, and pooler. 48 | attention_probs_dropout_prob: The dropout ratio for the attention 49 | probabilities. 50 | max_position_embeddings: The maximum sequence length that this model might 51 | ever be used with. Typically set this to something large just in case 52 | (e.g., 512 or 1024 or 2048). 53 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 54 | `BertModel`. 55 | initializer_range: The stdev of the truncated_normal_initializer for 56 | initializing all weight matrices. 
57 | """ 58 | self.vocab_size = vocab_size 59 | self.hidden_size = hidden_size 60 | self.num_hidden_layers = num_hidden_layers 61 | self.num_attention_heads = num_attention_heads 62 | self.hidden_act = hidden_act 63 | self.intermediate_size = intermediate_size 64 | self.hidden_dropout_prob = hidden_dropout_prob 65 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 66 | self.max_position_embeddings = max_position_embeddings 67 | self.type_vocab_size = type_vocab_size 68 | self.initializer_range = initializer_range 69 | 70 | @classmethod 71 | def from_dict(cls, json_object): 72 | """Constructs a `BertConfig` from a Python dictionary of parameters.""" 73 | config = BertConfig(vocab_size=None) 74 | for (key, value) in six.iteritems(json_object): 75 | config.__dict__[key] = value 76 | return config 77 | 78 | @classmethod 79 | def from_json_file(cls, json_file): 80 | """Constructs a `BertConfig` from a json file of parameters.""" 81 | with tf.gfile.GFile(json_file, "r") as reader: 82 | text = reader.read() 83 | return cls.from_dict(json.loads(text)) 84 | 85 | def to_dict(self): 86 | """Serializes this instance to a Python dictionary.""" 87 | output = copy.deepcopy(self.__dict__) 88 | return output 89 | 90 | def to_json_string(self): 91 | """Serializes this instance to a JSON string.""" 92 | return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" 93 | 94 | 95 | class BertModel(object): 96 | """BERT model ("Bidirectional Embedding Representations from a Transformer"). 97 | 98 | Example usage: 99 | 100 | ```python 101 | # Already been converted into WordPiece token ids 102 | input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) 103 | input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) 104 | token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) 105 | 106 | config = modeling.BertConfig(vocab_size=32000, hidden_size=512, 107 | num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) 108 | 109 | model = modeling.BertModel(config=config, is_training=True, 110 | input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) 111 | 112 | label_embeddings = tf.get_variable(...) 113 | logits = tf.matmul(pooled_output, label_embeddings) 114 | ... 115 | ``` 116 | """ 117 | def __init__(self, 118 | config, 119 | is_training, 120 | input_ids, 121 | input_positions, 122 | input_io_flags, 123 | input_amounts, 124 | input_counts, 125 | input_mask=None, 126 | token_type_ids=None, 127 | use_one_hot_embeddings=False, 128 | cross_share=False, 129 | scope="bert"): 130 | """Constructor for BertModel. 131 | 132 | Args: 133 | config: `BertConfig` instance. 134 | is_training: bool. rue for training model, false for eval model. Controls 135 | whether dropout will be applied. 136 | input_ids: int32 Tensor of shape [batch_size, seq_length]. 137 | input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. 138 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 139 | use_one_hot_embeddings: (optional) bool. Whether to use one-hot word 140 | embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, 141 | it is must faster if this is True, on the CPU or GPU, it is faster if 142 | this is False. 143 | scope: (optional) variable scope. Defaults to "bert". 144 | 145 | Raises: 146 | ValueError: The config is invalid or one of the input tensor shapes 147 | is invalid. 
148 | """ 149 | config = copy.deepcopy(config) 150 | if not is_training: 151 | config.hidden_dropout_prob = 0.0 152 | config.attention_probs_dropout_prob = 0.0 153 | 154 | input_shape = get_shape_list(input_ids, expected_rank=2) 155 | batch_size = input_shape[0] 156 | seq_length = input_shape[1] 157 | 158 | if input_mask is None: 159 | input_mask = tf.ones( 160 | shape=[batch_size, seq_length], dtype=tf.int32) 161 | 162 | if token_type_ids is None: 163 | token_type_ids = tf.zeros( 164 | shape=[batch_size, seq_length], dtype=tf.int32) 165 | 166 | with tf.variable_scope(scope, reuse=tf.AUTO_REUSE): 167 | with tf.variable_scope("embeddings"): 168 | # Perform embedding lookup on the word ids. 169 | 170 | self.embedding_output, self.embedding_table_list, self.factorize_table_list = \ 171 | embedding_lookup(input_ids=input_ids, 172 | vocab_size=config.vocab_size, 173 | embedding_size=config.hidden_size, 174 | bucket_list=config.bucket_list, 175 | factor_list=config.factor_list, 176 | initializer_range=config.initializer_range, 177 | word_embedding_name="word_embeddings", 178 | use_one_hot_embeddings=use_one_hot_embeddings) 179 | 180 | self.embedding_output = feature_encoder(input_tensor=self.embedding_output, 181 | input_io_flags=input_io_flags, 182 | input_amounts=input_amounts, 183 | input_counts=input_counts, 184 | initializer_range=config.initializer_range) 185 | 186 | # Add positional embeddings and token type embeddings, then layer 187 | # normalize and perform dropout. 188 | self.embedding_output = embedding_postprocessor( 189 | input_tensor=self.embedding_output, 190 | input_positions=input_positions, 191 | initializer_range=config.initializer_range, 192 | max_position_embeddings=config.max_position_embeddings, 193 | dropout_prob=config.hidden_dropout_prob, 194 | is_sinusoidal=False) 195 | 196 | with tf.variable_scope("encoder"): 197 | # This converts a 2D mask of shape [batch_size, seq_length] to a 3D 198 | # mask of shape [batch_size, seq_length, seq_length] which is used 199 | # for the attention scores. 200 | attention_mask = create_attention_mask_from_input_mask( 201 | input_ids, input_mask) 202 | 203 | # Run the stacked transformer. 204 | # `sequence_output` shape = [batch_size, seq_length, hidden_size]. 
205 | if cross_share: 206 | self.all_encoder_layers = transformer_model_cross_share( 207 | input_tensor=self.embedding_output, 208 | attention_mask=attention_mask, 209 | hidden_size=config.hidden_size, 210 | num_hidden_layers=config.num_hidden_layers, 211 | num_attention_heads=config.num_attention_heads, 212 | intermediate_size=config.intermediate_size, 213 | intermediate_act_fn=get_activation(config.hidden_act), 214 | hidden_dropout_prob=config.hidden_dropout_prob, 215 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 216 | initializer_range=config.initializer_range, 217 | do_return_all_layers=True) 218 | 219 | else: 220 | self.all_encoder_layers = transformer_model( 221 | input_tensor=self.embedding_output, 222 | attention_mask=attention_mask, 223 | hidden_size=config.hidden_size, 224 | num_hidden_layers=config.num_hidden_layers, 225 | num_attention_heads=config.num_attention_heads, 226 | intermediate_size=config.intermediate_size, 227 | intermediate_act_fn=get_activation(config.hidden_act), 228 | hidden_dropout_prob=config.hidden_dropout_prob, 229 | attention_probs_dropout_prob=config.attention_probs_dropout_prob, 230 | initializer_range=config.initializer_range, 231 | do_return_all_layers=True) 232 | 233 | self.sequence_output = self.all_encoder_layers[-1] 234 | 235 | 236 | def get_pooled_output(self): 237 | return self.pooled_output 238 | 239 | def get_sequence_output(self): 240 | """Gets final hidden layer of encoder. 241 | 242 | Returns: 243 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 244 | to the final hidden of the transformer encoder. 245 | """ 246 | return self.sequence_output 247 | 248 | def get_all_encoder_layers(self): 249 | return self.all_encoder_layers 250 | 251 | def get_embedding_output(self): 252 | """Gets output of the embedding lookup (i.e., input to the transformer). 253 | 254 | Returns: 255 | float Tensor of shape [batch_size, seq_length, hidden_size] corresponding 256 | to the output of the embedding layer, after summing the word 257 | embeddings with the positional embeddings and the token type embeddings, 258 | then performing layer normalization. This is the input to the transformer. 259 | """ 260 | return self.embedding_output 261 | 262 | def get_embedding_table(self): 263 | return self.embedding_table_list, self.factorize_table_list 264 | 265 | 266 | def gelu(input_tensor): 267 | """Gaussian Error Linear Unit. 268 | 269 | This is a smoother version of the RELU. 270 | Original paper: https://arxiv.org/abs/1606.08415 271 | 272 | Args: 273 | input_tensor: float Tensor to perform activation. 274 | 275 | Returns: 276 | `input_tensor` with the GELU activation applied. 277 | """ 278 | cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0))) 279 | return input_tensor * cdf 280 | 281 | 282 | def get_activation(activation_string): 283 | """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. 284 | 285 | Args: 286 | activation_string: String name of the activation function. 287 | 288 | Returns: 289 | A Python function corresponding to the activation function. If 290 | `activation_string` is None, empty, or "linear", this will return None. 291 | If `activation_string` is not a string, it will return `activation_string`. 292 | 293 | Raises: 294 | ValueError: The `activation_string` does not correspond to a known 295 | activation. 296 | """ 297 | 298 | # We assume that anything that"s not a string is already an activation 299 | # function, so we just return it. 
300 | if not isinstance(activation_string, six.string_types): 301 | return activation_string 302 | 303 | if not activation_string: 304 | return None 305 | 306 | act = activation_string.lower() 307 | if act == "linear": 308 | return None 309 | elif act == "relu": 310 | return tf.nn.relu 311 | elif act == "gelu": 312 | return gelu 313 | elif act == "tanh": 314 | return tf.tanh 315 | else: 316 | raise ValueError("Unsupported activation: %s" % act) 317 | 318 | 319 | def get_assignment_map_from_checkpoint(tvars, init_checkpoint): 320 | """Compute the union of the current variables and checkpoint variables.""" 321 | assignment_map = {} 322 | initialized_variable_names = {} 323 | 324 | name_to_variable = collections.OrderedDict() 325 | for var in tvars: 326 | name = var.name 327 | m = re.match("^(.*):\\d+$", name) 328 | if m is not None: 329 | name = m.group(1) 330 | name_to_variable[name] = var 331 | 332 | init_vars = tf.train.list_variables(init_checkpoint) 333 | 334 | assignment_map = collections.OrderedDict() 335 | for x in init_vars: 336 | (name, var) = (x[0], x[1]) 337 | if name not in name_to_variable: 338 | continue 339 | assignment_map[name] = name 340 | initialized_variable_names[name] = 1 341 | initialized_variable_names[name + ":0"] = 1 342 | 343 | return (assignment_map, initialized_variable_names) 344 | 345 | 346 | def dropout(input_tensor, dropout_prob): 347 | """Perform dropout. 348 | 349 | Args: 350 | input_tensor: float Tensor. 351 | dropout_prob: Python float. The probability of dropping out a value (NOT of 352 | *keeping* a dimension as in `tf.nn.dropout`). 353 | 354 | Returns: 355 | A version of `input_tensor` with dropout applied. 356 | """ 357 | if dropout_prob is None or dropout_prob == 0.0: 358 | return input_tensor 359 | 360 | output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) 361 | return output 362 | 363 | 364 | def layer_norm(input_tensor, name=None): 365 | """Run layer normalization on the last dimension of the tensor.""" 366 | epsilon = 1e-6 367 | filters = input_tensor.get_shape()[-1] 368 | with tf.variable_scope("layer_norm"): 369 | scale = tf.get_variable("layer_norm_scale", [filters], initializer=tf.ones_initializer()) 370 | bias = tf.get_variable("layer_norm_bias", [filters], initializer=tf.zeros_initializer()) 371 | 372 | mean = tf.reduce_mean(input_tensor, axis=-1, keep_dims=True) 373 | variance = tf.reduce_mean(tf.square(input_tensor-mean), axis=-1, keep_dims=True) 374 | input_tensor = (input_tensor - mean) * tf.rsqrt(variance + epsilon) 375 | input_tensor = input_tensor * scale + bias 376 | 377 | return input_tensor 378 | 379 | 380 | def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): 381 | """Runs layer normalization followed by dropout.""" 382 | output_tensor = layer_norm(input_tensor, name) 383 | output_tensor = dropout(output_tensor, dropout_prob) 384 | return output_tensor 385 | 386 | 387 | def create_initializer(initializer_range=0.02): 388 | """Creates a `truncated_normal_initializer` with the given range.""" 389 | return tf.truncated_normal_initializer(stddev=initializer_range) 390 | 391 | 392 | def embedding_lookup(input_ids, 393 | vocab_size, 394 | embedding_size, 395 | bucket_list, 396 | factor_list, 397 | initializer_range=0.02, 398 | word_embedding_name="word_embeddings", 399 | use_one_hot_embeddings=False): 400 | """Looks up words embeddings for id tensor. 401 | 402 | Args: 403 | input_ids: int32 Tensor of shape [batch_size, seq_length] containing word 404 | ids. 405 | vocab_size: int. Size of the embedding vocabulary. 
406 | embedding_size: int. Width of the word embeddings. 407 | initializer_range: float. Embedding initialization range. 408 | word_embedding_name: string. Name of the embedding table. 409 | use_one_hot_embeddings: bool. If True, use one-hot method for word 410 | embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better 411 | for TPUs. 412 | 413 | Returns: 414 | float Tensor of shape [batch_size, seq_length, embedding_size]. 415 | """ 416 | # This function assumes that the input is of shape [batch_size, seq_length, 417 | # num_inputs]. 418 | # 419 | # If the input is a 2D tensor of shape [batch_size, seq_length], we 420 | # reshape to [batch_size, seq_length, 1]. why? 421 | # if input_ids.shape.ndims == 2: 422 | # input_ids = tf.expand_dims(input_ids, axis=[-1]) 423 | 424 | embedding_table_list = [] 425 | factorize_table_list = [] 426 | 427 | for i in range(len(bucket_list)): 428 | bucket = bucket_list[i] 429 | lower_bound = bucket[0] 430 | upper_bound = bucket[1] 431 | 432 | factor_size = factor_list[i] 433 | 434 | embedding_table = tf.get_variable( 435 | name=word_embedding_name + "_" + str(i), 436 | shape=[upper_bound-lower_bound, factor_size], 437 | initializer=create_initializer(initializer_range)) 438 | 439 | embedding_table_list.append(embedding_table) 440 | 441 | factor_table = tf.get_variable( 442 | name="factor_table" + "_" + str(i), 443 | shape=[factor_size, embedding_size], 444 | initializer=create_initializer(initializer_range) 445 | ) 446 | factorize_table_list.append(factor_table) 447 | 448 | output = new_embedding_lookup(input_ids, bucket_list, embedding_table_list, factorize_table_list) 449 | 450 | return (output, embedding_table_list, factorize_table_list) 451 | 452 | 453 | def new_embedding_lookup(input_ids, bucket_list, embedding_table_list, factorize_table_list): 454 | 455 | embedding_size = get_shape_list(embedding_table_list[0])[-1] 456 | output_list = [] 457 | for i in range(len(bucket_list)): 458 | 459 | embedding_table = embedding_table_list[i] 460 | factorize_table = factorize_table_list[i] 461 | 462 | bucket = bucket_list[i] 463 | lower_bound = bucket[0] 464 | upper_bound = bucket[1] 465 | 466 | mask1 = tf.cast(tf.greater_equal(input_ids, lower_bound), tf.int32) 467 | mask2 = tf.cast(tf.less(input_ids, upper_bound), tf.int32) 468 | 469 | mask = mask1 * mask2 470 | mask_2d = tf.cast(tf.tile(tf.expand_dims(mask, axis=2), multiples=[1, 1, embedding_size]), tf.float32) 471 | embedding_output = tf.nn.embedding_lookup(embedding_table, (input_ids - lower_bound) * mask) 472 | 473 | if i == 0: 474 | output = embedding_output * mask_2d 475 | else: 476 | output = tf.matmul(embedding_output, factorize_table) * mask_2d 477 | 478 | output_list.append(output) 479 | 480 | final_output = tf.reduce_sum(output_list, axis=0) 481 | 482 | return final_output 483 | 484 | 485 | def feature_encoder(input_tensor, 486 | input_io_flags, 487 | input_amounts, 488 | input_counts, 489 | initializer_range=0.02): 490 | 491 | input_shape = get_shape_list(input_tensor, expected_rank=3) 492 | batch_size = input_shape[0] 493 | seq_length = input_shape[1] 494 | width = input_shape[2] 495 | 496 | io_embedding_table = tf.get_variable( 497 | name="io_embeddings", 498 | shape=[3, width], 499 | initializer=create_initializer(initializer_range)) 500 | io_embeddings = tf.nn.embedding_lookup(io_embedding_table, input_io_flags) 501 | 502 | amount_embedding_table = tf.get_variable( 503 | name="amount_embeddings", 504 | shape=[15, width], 505 | initializer=create_initializer(initializer_range)) 
506 | amount_embeddings = tf.nn.embedding_lookup(amount_embedding_table, input_amounts) 507 | 508 | count_embedding_table = tf.get_variable( 509 | name="count_embeddings", 510 | shape=[15, width], 511 | initializer=create_initializer(initializer_range)) 512 | count_embeddings = tf.nn.embedding_lookup(count_embedding_table, input_counts) 513 | 514 | output = input_tensor + io_embeddings + amount_embeddings + count_embeddings 515 | 516 | return output 517 | 518 | 519 | def embedding_postprocessor(input_tensor, 520 | input_positions=None, 521 | initializer_range=0.02, 522 | max_position_embeddings=512, 523 | dropout_prob=0.1, 524 | is_sinusoidal=False): 525 | """Performs various post-processing on a word embedding tensor. 526 | 527 | Args: 528 | input_tensor: float Tensor of shape [batch_size, seq_length, 529 | embedding_size]. 530 | use_token_type: bool. Whether to add embeddings for `token_type_ids`. 531 | token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. 532 | Must be specified if `use_token_type` is True. 533 | token_type_vocab_size: int. The vocabulary size of `token_type_ids`. 534 | token_type_embedding_name: string. The name of the embedding table variable 535 | for token type ids. 536 | use_position_embeddings: bool. Whether to add position embeddings for the 537 | position of each token in the sequence. 538 | position_embedding_name: string. The name of the embedding table variable 539 | for positional embeddings. 540 | initializer_range: float. Range of the weight initialization. 541 | max_position_embeddings: int. Maximum sequence length that might ever be 542 | used with this model. This can be longer than the sequence length of 543 | input_tensor, but cannot be shorter. 544 | dropout_prob: float. Dropout probability applied to the final output tensor. 545 | 546 | Returns: 547 | float tensor with same shape as `input_tensor`. 548 | 549 | Raises: 550 | ValueError: One of the tensor shapes or input values is invalid. 551 | """ 552 | input_shape = get_shape_list(input_tensor, expected_rank=3) 553 | batch_size = input_shape[0] 554 | seq_length = input_shape[1] 555 | width = input_shape[2] 556 | output = input_tensor 557 | 558 | if is_sinusoidal: 559 | 560 | PE_embedding = [] 561 | for pos in range(max_position_embeddings): 562 | pos_i_tmp = [] 563 | for i in range(width): 564 | if i % 2 == 0: 565 | encoding = math.sin(pos / (pow(1000, i / width))) 566 | pos_i_tmp.append(encoding) 567 | else: 568 | encoding = math.cos(pos / (pow(1000, (i - 1) / width))) 569 | pos_i_tmp.append(encoding) 570 | PE_embedding.append(pos_i_tmp) 571 | 572 | PE_embedding = np.array(PE_embedding) 573 | position_embedding_table = tf.constant(PE_embedding, 574 | name="position_embeddings", 575 | dtype=tf.float32) 576 | print("===========Positional Embedding=============") 577 | print(position_embedding_table) 578 | 579 | else: 580 | position_embedding_table = tf.get_variable( 581 | name="position_embeddings", 582 | shape=[max_position_embeddings, width], 583 | initializer=create_initializer(initializer_range)) 584 | 585 | # This vocab will be small so we always do one-hot here, since it is always 586 | position_embeddings = tf.nn.embedding_lookup(position_embedding_table, input_positions) 587 | 588 | output += position_embeddings 589 | output = layer_norm_and_dropout(output, dropout_prob) 590 | return output 591 | 592 | def create_attention_mask_from_input_mask(from_tensor, to_mask): 593 | """Create 3D attention mask from a 2D tensor mask. 
594 | 595 | Args: 596 | from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. 597 | to_mask: int32 Tensor of shape [batch_size, to_seq_length]. 598 | 599 | Returns: 600 | float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 601 | """ 602 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 603 | batch_size = from_shape[0] 604 | from_seq_length = from_shape[1] 605 | 606 | to_shape = get_shape_list(to_mask, expected_rank=2) 607 | to_seq_length = to_shape[1] 608 | 609 | to_mask = tf.cast( 610 | tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) 611 | 612 | # We don't assume that `from_tensor` is a mask (although it could be). We 613 | # don't actually care if we attend *from* padding tokens (only *to* padding) 614 | # tokens so we create a tensor of all ones. 615 | # 616 | # `broadcast_ones` = [batch_size, from_seq_length, 1] 617 | broadcast_ones = tf.ones( 618 | shape=[batch_size, from_seq_length, 1], dtype=tf.float32) 619 | 620 | # Here we broadcast along two dimensions to create the mask. 621 | mask = broadcast_ones * to_mask 622 | 623 | return mask 624 | 625 | 626 | def attention_layer(layer_idx, 627 | from_tensor, 628 | to_tensor, 629 | attention_mask=None, 630 | num_attention_heads=1, 631 | size_per_head=512, 632 | query_act=None, 633 | key_act=None, 634 | value_act=None, 635 | attention_probs_dropout_prob=0.0, 636 | initializer_range=0.02, 637 | do_return_2d_tensor=False, 638 | batch_size=None, 639 | from_seq_length=None, 640 | to_seq_length=None): 641 | """Performs multi-headed attention from `from_tensor` to `to_tensor`. 642 | 643 | This is an implementation of multi-headed attention based on "Attention 644 | is all you Need". If `from_tensor` and `to_tensor` are the same, then 645 | this is self-attention. Each timestep in `from_tensor` attends to the 646 | corresponding sequence in `to_tensor`, and returns a fixed-with vector. 647 | 648 | This function first projects `from_tensor` into a "query" tensor and 649 | `to_tensor` into "key" and "value" tensors. These are (effectively) a list 650 | of tensors of length `num_attention_heads`, where each tensor is of shape 651 | [batch_size, seq_length, size_per_head]. 652 | 653 | Then, the query and key tensors are dot-producted and scaled. These are 654 | softmaxed to obtain attention probabilities. The value tensors are then 655 | interpolated by these probabilities, then concatenated back to a single 656 | tensor and returned. 657 | 658 | In practice, the multi-headed attention are done with transposes and 659 | reshapes rather than actual separate tensors. 660 | 661 | Args: 662 | from_tensor: float Tensor of shape [batch_size, from_seq_length, 663 | from_width]. 664 | to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. 665 | attention_mask: (optional) int32 Tensor of shape [batch_size, 666 | from_seq_length, to_seq_length]. The values should be 1 or 0. The 667 | attention scores will effectively be set to -infinity for any positions in 668 | the mask that are 0, and will be unchanged for positions that are 1. 669 | num_attention_heads: int. Number of attention heads. 670 | size_per_head: int. Size of each attention head. 671 | query_act: (optional) Activation function for the query transform. 672 | key_act: (optional) Activation function for the key transform. 673 | value_act: (optional) Activation function for the value transform. 674 | attention_probs_dropout_prob: (optional) float. Dropout probability of the 675 | attention probabilities. 
676 | initializer_range: float. Range of the weight initializer. 677 | do_return_2d_tensor: bool. If True, the output will be of shape [batch_size 678 | * from_seq_length, num_attention_heads * size_per_head]. If False, the 679 | output will be of shape [batch_size, from_seq_length, num_attention_heads 680 | * size_per_head]. 681 | batch_size: (Optional) int. If the input is 2D, this might be the batch size 682 | of the 3D version of the `from_tensor` and `to_tensor`. 683 | from_seq_length: (Optional) If the input is 2D, this might be the seq length 684 | of the 3D version of the `from_tensor`. 685 | to_seq_length: (Optional) If the input is 2D, this might be the seq length 686 | of the 3D version of the `to_tensor`. 687 | 688 | Returns: 689 | float Tensor of shape [batch_size, from_seq_length, 690 | num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is 691 | true, this will be of shape [batch_size * from_seq_length, 692 | num_attention_heads * size_per_head]). 693 | 694 | Raises: 695 | ValueError: Any of the arguments or tensor shapes are invalid. 696 | """ 697 | 698 | def transpose_for_scores(input_tensor, batch_size, num_attention_heads, 699 | seq_length, width): 700 | output_tensor = tf.reshape( 701 | input_tensor, [batch_size, seq_length, num_attention_heads, width]) 702 | 703 | output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) 704 | return output_tensor 705 | 706 | from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) 707 | to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) 708 | 709 | if len(from_shape) != len(to_shape): 710 | raise ValueError( 711 | "The rank of `from_tensor` must match the rank of `to_tensor`.") 712 | 713 | if len(from_shape) == 3: 714 | batch_size = from_shape[0] 715 | from_seq_length = from_shape[1] 716 | to_seq_length = to_shape[1] 717 | elif len(from_shape) == 2: 718 | if (batch_size is None or from_seq_length is None 719 | or to_seq_length is None): 720 | raise ValueError( 721 | "When passing in rank 2 tensors to attention_layer, the values " 722 | "for `batch_size`, `from_seq_length`, and `to_seq_length` " 723 | "must all be specified.") 724 | 725 | # Scalar dimensions referenced here: 726 | # B = batch size (number of sequences) 727 | # F = `from_tensor` sequence length 728 | # T = `to_tensor` sequence length 729 | # N = `num_attention_heads` 730 | # H = `size_per_head` 731 | 732 | from_tensor_2d = reshape_to_matrix(from_tensor) 733 | to_tensor_2d = reshape_to_matrix(to_tensor) 734 | 735 | # `query_layer` = [B*F, N*H] 736 | query_layer = tf.layers.dense( 737 | from_tensor_2d, 738 | num_attention_heads * size_per_head, 739 | activation=query_act, 740 | name="query", 741 | kernel_initializer=create_initializer(initializer_range)) 742 | 743 | # `key_layer` = [B*T, N*H] 744 | key_layer = tf.layers.dense( 745 | to_tensor_2d, 746 | num_attention_heads * size_per_head, 747 | activation=key_act, 748 | name="key", 749 | kernel_initializer=create_initializer(initializer_range)) 750 | 751 | # `value_layer` = [B*T, N*H] 752 | value_layer = tf.layers.dense( 753 | to_tensor_2d, 754 | num_attention_heads * size_per_head, 755 | activation=value_act, 756 | name="value", 757 | kernel_initializer=create_initializer(initializer_range)) 758 | 759 | # `query_layer` = [B, N, F, H] 760 | query_layer = transpose_for_scores(query_layer, batch_size, 761 | num_attention_heads, from_seq_length, 762 | size_per_head) 763 | 764 | # `key_layer` = [B, N, T, H] 765 | key_layer = transpose_for_scores(key_layer, batch_size, 766 | num_attention_heads, 
to_seq_length, 767 | size_per_head) 768 | 769 | # Take the dot product between "query" and "key" to get the raw 770 | # attention scores. 771 | # `attention_scores` = [B, N, F, T] 772 | attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) 773 | attention_scores = tf.multiply(attention_scores, 774 | 1.0 / math.sqrt(float(size_per_head))) 775 | 776 | if attention_mask is not None: 777 | # `attention_mask` = [B, 1, F, T] 778 | attention_mask = tf.expand_dims(attention_mask, axis=[1]) 779 | 780 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 781 | # masked positions, this operation will create a tensor which is 0.0 for 782 | # positions we want to attend and -10000.0 for masked positions. 783 | adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 784 | 785 | # Since we are adding it to the raw scores before the softmax, this is 786 | # effectively the same as removing these entirely. 787 | attention_scores += adder 788 | 789 | # Normalize the attention scores to probabilities. 790 | # `attention_probs` = [B, N, F, T] 791 | attention_probs = tf.nn.softmax(attention_scores) 792 | 793 | tf.add_to_collection("layer" + str(layer_idx) + "_attention_probs", attention_probs) 794 | 795 | # This is actually dropping out entire tokens to attend to, which might 796 | # seem a bit unusual, but is taken from the original Transformer paper. 797 | attention_probs = dropout(attention_probs, attention_probs_dropout_prob) 798 | 799 | # `value_layer` = [B, T, N, H] 800 | value_layer = tf.reshape( 801 | value_layer, 802 | [batch_size, to_seq_length, num_attention_heads, size_per_head]) 803 | 804 | # `value_layer` = [B, N, T, H] 805 | value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) 806 | 807 | # `context_layer` = [B, N, F, H] 808 | context_layer = tf.matmul(attention_probs, value_layer) 809 | 810 | # `context_layer` = [B, F, N, H] 811 | context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) 812 | 813 | if do_return_2d_tensor: 814 | # `context_layer` = [B*F, N*V] 815 | context_layer = tf.reshape(context_layer, [ 816 | batch_size * from_seq_length, num_attention_heads * size_per_head 817 | ]) 818 | else: 819 | # `context_layer` = [B, F, N*V] 820 | context_layer = tf.reshape( 821 | context_layer, 822 | [batch_size, from_seq_length, num_attention_heads * size_per_head]) 823 | 824 | return context_layer 825 | 826 | 827 | def transformer_model(input_tensor, 828 | attention_mask=None, 829 | hidden_size=768, 830 | num_hidden_layers=12, 831 | num_attention_heads=12, 832 | intermediate_size=3072, 833 | intermediate_act_fn=gelu, 834 | hidden_dropout_prob=0.1, 835 | attention_probs_dropout_prob=0.1, 836 | initializer_range=0.02, 837 | do_return_all_layers=False): 838 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 839 | This is almost an exact implementation of the original Transformer encoder. 840 | 841 | See the original paper: 842 | https://arxiv.org/abs/1706.03762 843 | 844 | Also see: 845 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 846 | 847 | Args: 848 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 849 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 850 | seq_length], with 1 for positions that can be attended to and 0 in 851 | positions that should not be. 852 | hidden_size: int. Hidden size of the Transformer. 853 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 854 | num_attention_heads: int. 
Number of attention heads in the Transformer. 855 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 856 | forward) layer. 857 | intermediate_act_fn: function. The non-linear activation function to apply 858 | to the output of the intermediate/feed-forward layer. 859 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 860 | attention_probs_dropout_prob: float. Dropout probability of the attention 861 | probabilities. 862 | initializer_range: float. Range of the initializer (stddev of truncated 863 | normal). 864 | do_return_all_layers: Whether to also return all layers or just the final 865 | layer. 866 | 867 | Returns: 868 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 869 | hidden layer of the Transformer. 870 | 871 | Raises: 872 | ValueError: A Tensor shape or parameter is invalid. 873 | """ 874 | if hidden_size % num_attention_heads != 0: 875 | raise ValueError( 876 | "The hidden size (%d) is not a multiple of the number of attention " 877 | "heads (%d)" % (hidden_size, num_attention_heads)) 878 | 879 | attention_head_size = int(hidden_size / num_attention_heads) 880 | input_shape = get_shape_list(input_tensor, expected_rank=3) 881 | batch_size = input_shape[0] 882 | seq_length = input_shape[1] 883 | input_width = input_shape[2] 884 | 885 | # The Transformer performs sum residuals on all layers so the input needs 886 | # to be the same as the hidden size. 887 | if input_width != hidden_size: 888 | raise ValueError( 889 | "The width of the input tensor (%d) != hidden size (%d)" % 890 | (input_width, hidden_size)) 891 | 892 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 893 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 894 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 895 | # help the optimizer. 896 | prev_output = reshape_to_matrix(input_tensor) 897 | 898 | all_layer_outputs = [] 899 | for layer_idx in range(num_hidden_layers): 900 | with tf.variable_scope("layer_%d" % layer_idx): 901 | layer_input = prev_output 902 | 903 | with tf.variable_scope("attention"): 904 | attention_heads = [] 905 | with tf.variable_scope("self"): 906 | attention_head = attention_layer( 907 | layer_idx=layer_idx, 908 | from_tensor=layer_input, 909 | to_tensor=layer_input, 910 | attention_mask=attention_mask, 911 | num_attention_heads=num_attention_heads, 912 | size_per_head=attention_head_size, 913 | attention_probs_dropout_prob= 914 | attention_probs_dropout_prob, 915 | initializer_range=initializer_range, 916 | do_return_2d_tensor=True, 917 | batch_size=batch_size, 918 | from_seq_length=seq_length, 919 | to_seq_length=seq_length) 920 | attention_heads.append(attention_head) 921 | 922 | attention_output = None 923 | if len(attention_heads) == 1: 924 | attention_output = attention_heads[0] 925 | else: 926 | # In the case where we have other sequences, we just concatenate 927 | # them to the self-attention head before the projection. 928 | attention_output = tf.concat(attention_heads, axis=-1) 929 | 930 | # Run a linear projection of `hidden_size` then add a residual 931 | # with `layer_input`. 
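                # i.e. attention_output = LayerNorm(dropout(dense(heads)) + layer_input),
                # the standard post-layer-norm residual sub-layer of the Transformer encoder.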
932 | with tf.variable_scope("output"): 933 | attention_output = tf.layers.dense( 934 | attention_output, 935 | hidden_size, 936 | kernel_initializer=create_initializer( 937 | initializer_range)) 938 | attention_output = dropout(attention_output, 939 | hidden_dropout_prob) 940 | attention_output = layer_norm(attention_output + 941 | layer_input) 942 | 943 | # The activation is only applied to the "intermediate" hidden layer. 944 | with tf.variable_scope("intermediate"): 945 | intermediate_output = tf.layers.dense( 946 | attention_output, 947 | intermediate_size, 948 | activation=intermediate_act_fn, 949 | kernel_initializer=create_initializer(initializer_range)) 950 | 951 | # Down-project back to `hidden_size` then add the residual. 952 | with tf.variable_scope("output"): 953 | layer_output = tf.layers.dense( 954 | intermediate_output, 955 | hidden_size, 956 | kernel_initializer=create_initializer(initializer_range)) 957 | layer_output = dropout(layer_output, hidden_dropout_prob) 958 | layer_output = layer_norm(layer_output + attention_output) 959 | prev_output = layer_output 960 | all_layer_outputs.append(layer_output) 961 | 962 | if do_return_all_layers: 963 | final_outputs = [] 964 | for layer_output in all_layer_outputs: 965 | final_output = reshape_from_matrix(layer_output, input_shape) 966 | final_outputs.append(final_output) 967 | return final_outputs 968 | else: 969 | final_output = reshape_from_matrix(prev_output, input_shape) 970 | return final_output 971 | 972 | 973 | def transformer_model_cross_share(input_tensor, 974 | attention_mask=None, 975 | hidden_size=768, 976 | num_hidden_layers=12, 977 | num_attention_heads=12, 978 | intermediate_size=3072, 979 | intermediate_act_fn=gelu, 980 | hidden_dropout_prob=0.1, 981 | attention_probs_dropout_prob=0.1, 982 | initializer_range=0.02, 983 | do_return_all_layers=False): 984 | """Multi-headed, multi-layer Transformer from "Attention is All You Need". 985 | This is almost an exact implementation of the original Transformer encoder. 986 | 987 | See the original paper: 988 | https://arxiv.org/abs/1706.03762 989 | 990 | Also see: 991 | https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py 992 | 993 | Args: 994 | input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. 995 | attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, 996 | seq_length], with 1 for positions that can be attended to and 0 in 997 | positions that should not be. 998 | hidden_size: int. Hidden size of the Transformer. 999 | num_hidden_layers: int. Number of layers (blocks) in the Transformer. 1000 | num_attention_heads: int. Number of attention heads in the Transformer. 1001 | intermediate_size: int. The size of the "intermediate" (a.k.a., feed 1002 | forward) layer. 1003 | intermediate_act_fn: function. The non-linear activation function to apply 1004 | to the output of the intermediate/feed-forward layer. 1005 | hidden_dropout_prob: float. Dropout probability for the hidden layers. 1006 | attention_probs_dropout_prob: float. Dropout probability of the attention 1007 | probabilities. 1008 | initializer_range: float. Range of the initializer (stddev of truncated 1009 | normal). 1010 | do_return_all_layers: Whether to also return all layers or just the final 1011 | layer. 1012 | 1013 | Returns: 1014 | float Tensor of shape [batch_size, seq_length, hidden_size], the final 1015 | hidden layer of the Transformer. 1016 | 1017 | Raises: 1018 | ValueError: A Tensor shape or parameter is invalid. 
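  Note: unlike `transformer_model`, every one of the `num_hidden_layers` blocks
  here is built inside a single "shared_layer" variable scope with
  reuse=tf.AUTO_REUSE, so one set of attention/feed-forward weights is shared
  across all layers (cross-layer parameter sharing, as in ALBERT).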
1019 | """ 1020 | if hidden_size % num_attention_heads != 0: 1021 | raise ValueError( 1022 | "The hidden size (%d) is not a multiple of the number of attention " 1023 | "heads (%d)" % (hidden_size, num_attention_heads)) 1024 | 1025 | attention_head_size = int(hidden_size / num_attention_heads) 1026 | input_shape = get_shape_list(input_tensor, expected_rank=3) 1027 | batch_size = input_shape[0] 1028 | seq_length = input_shape[1] 1029 | input_width = input_shape[2] 1030 | 1031 | # The Transformer performs sum residuals on all layers so the input needs 1032 | # to be the same as the hidden size. 1033 | if input_width != hidden_size: 1034 | raise ValueError( 1035 | "The width of the input tensor (%d) != hidden size (%d)" % 1036 | (input_width, hidden_size)) 1037 | 1038 | # We keep the representation as a 2D tensor to avoid re-shaping it back and 1039 | # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on 1040 | # the GPU/CPU but may not be free on the TPU, so we want to minimize them to 1041 | # help the optimizer. 1042 | prev_output = reshape_to_matrix(input_tensor) 1043 | 1044 | all_layer_outputs = [] 1045 | for layer_idx in range(num_hidden_layers): 1046 | with tf.variable_scope("shared_layer", reuse=tf.AUTO_REUSE): 1047 | layer_input = prev_output 1048 | with tf.variable_scope("attention"): 1049 | attention_heads = [] 1050 | with tf.variable_scope("self"): 1051 | attention_head = attention_layer( 1052 | layer_idx=0, 1053 | from_tensor=layer_input, 1054 | to_tensor=layer_input, 1055 | attention_mask=attention_mask, 1056 | num_attention_heads=num_attention_heads, 1057 | size_per_head=attention_head_size, 1058 | attention_probs_dropout_prob= 1059 | attention_probs_dropout_prob, 1060 | initializer_range=initializer_range, 1061 | do_return_2d_tensor=True, 1062 | batch_size=batch_size, 1063 | from_seq_length=seq_length, 1064 | to_seq_length=seq_length) 1065 | attention_heads.append(attention_head) 1066 | 1067 | attention_output = None 1068 | if len(attention_heads) == 1: 1069 | attention_output = attention_heads[0] 1070 | else: 1071 | # In the case where we have other sequences, we just concatenate 1072 | # them to the self-attention head before the projection. 1073 | attention_output = tf.concat(attention_heads, axis=-1) 1074 | 1075 | # Run a linear projection of `hidden_size` then add a residual 1076 | # with `layer_input`. 1077 | with tf.variable_scope("output"): 1078 | attention_output = tf.layers.dense( 1079 | attention_output, 1080 | hidden_size, 1081 | kernel_initializer=create_initializer( 1082 | initializer_range)) 1083 | attention_output = dropout(attention_output, 1084 | hidden_dropout_prob) 1085 | attention_output = layer_norm(attention_output + 1086 | layer_input) 1087 | 1088 | # The activation is only applied to the "intermediate" hidden layer. 1089 | with tf.variable_scope("intermediate"): 1090 | intermediate_output = tf.layers.dense( 1091 | attention_output, 1092 | intermediate_size, 1093 | activation=intermediate_act_fn, 1094 | kernel_initializer=create_initializer(initializer_range)) 1095 | 1096 | # Down-project back to `hidden_size` then add the residual. 
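                # Because the enclosing "shared_layer" scope is opened with
                # tf.AUTO_REUSE, the dense kernels below are created on the first
                # loop iteration and reused (not re-created) by every later layer.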
1097 | with tf.variable_scope("output"): 1098 | layer_output = tf.layers.dense( 1099 | intermediate_output, 1100 | hidden_size, 1101 | kernel_initializer=create_initializer(initializer_range)) 1102 | layer_output = dropout(layer_output, hidden_dropout_prob) 1103 | layer_output = layer_norm(layer_output + attention_output) 1104 | prev_output = layer_output 1105 | all_layer_outputs.append(layer_output) 1106 | 1107 | if do_return_all_layers: 1108 | final_outputs = [] 1109 | for layer_output in all_layer_outputs: 1110 | final_output = reshape_from_matrix(layer_output, input_shape) 1111 | final_outputs.append(final_output) 1112 | return final_outputs 1113 | else: 1114 | final_output = reshape_from_matrix(prev_output, input_shape) 1115 | return final_output 1116 | 1117 | 1118 | 1119 | def get_shape_list(tensor, expected_rank=None, name=None): 1120 | """Returns a list of the shape of tensor, preferring static dimensions. 1121 | 1122 | Args: 1123 | tensor: A tf.Tensor object to find the shape of. 1124 | expected_rank: (optional) int. The expected rank of `tensor`. If this is 1125 | specified and the `tensor` has a different rank, and exception will be 1126 | thrown. 1127 | name: Optional name of the tensor for the error message. 1128 | 1129 | Returns: 1130 | A list of dimensions of the shape of tensor. All static dimensions will 1131 | be returned as python integers, and dynamic dimensions will be returned 1132 | as tf.Tensor scalars. 1133 | """ 1134 | if name is None: 1135 | name = tensor.name 1136 | 1137 | if expected_rank is not None: 1138 | assert_rank(tensor, expected_rank, name) 1139 | 1140 | shape = tensor.shape.as_list() 1141 | 1142 | non_static_indexes = [] 1143 | for (index, dim) in enumerate(shape): 1144 | if dim is None: 1145 | non_static_indexes.append(index) 1146 | 1147 | if not non_static_indexes: 1148 | return shape 1149 | 1150 | dyn_shape = tf.shape(tensor) 1151 | for index in non_static_indexes: 1152 | shape[index] = dyn_shape[index] 1153 | return shape 1154 | 1155 | 1156 | def reshape_to_matrix(input_tensor): 1157 | """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" 1158 | ndims = input_tensor.shape.ndims 1159 | if ndims < 2: 1160 | raise ValueError("Input tensor must have at least rank 2. Shape = %s" % 1161 | (input_tensor.shape)) 1162 | if ndims == 2: 1163 | return input_tensor 1164 | 1165 | width = input_tensor.shape[-1] 1166 | output_tensor = tf.reshape(input_tensor, [-1, width]) 1167 | return output_tensor 1168 | 1169 | 1170 | def reshape_from_matrix(output_tensor, orig_shape_list): 1171 | """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" 1172 | if len(orig_shape_list) == 2: 1173 | return output_tensor 1174 | 1175 | output_shape = get_shape_list(output_tensor) 1176 | 1177 | orig_dims = orig_shape_list[0:-1] 1178 | width = output_shape[-1] 1179 | 1180 | return tf.reshape(output_tensor, orig_dims + [width]) 1181 | 1182 | 1183 | def assert_rank(tensor, expected_rank, name=None): 1184 | """Raises an exception if the tensor rank is not of the expected rank. 1185 | 1186 | Args: 1187 | tensor: A tf.Tensor to check the rank of. 1188 | expected_rank: Python integer or list of integers, expected rank. 1189 | name: Optional name of the tensor for the error message. 1190 | 1191 | Raises: 1192 | ValueError: If the expected shape doesn't match the actual shape. 
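  Example:
    `assert_rank(x, [2, 3])` passes for a [batch, seq] or [batch, seq, width]
    tensor and raises ValueError for any other rank.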
1193 | """ 1194 | if name is None: 1195 | name = tensor.name 1196 | 1197 | expected_rank_dict = {} 1198 | if isinstance(expected_rank, six.integer_types): 1199 | expected_rank_dict[expected_rank] = True 1200 | else: 1201 | for x in expected_rank: 1202 | expected_rank_dict[x] = True 1203 | 1204 | actual_rank = tensor.shape.ndims 1205 | if actual_rank not in expected_rank_dict: 1206 | scope_name = tf.get_variable_scope().name 1207 | raise ValueError( 1208 | "For the tensor `%s` in scope `%s`, the actual rank " 1209 | "`%d` (shape = %s) is not equal to the expected rank `%s`" % 1210 | (name, scope_name, actual_rank, str(tensor.shape), 1211 | str(expected_rank))) -------------------------------------------------------------------------------- /Model/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Functions and classes related to optimization (weight updates).""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import re 22 | # import tensorflow as tf 23 | import tensorflow.compat.v1 as tf 24 | tf.disable_v2_behavior() 25 | 26 | 27 | def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, 28 | use_tpu): 29 | """Creates an optimizer training op.""" 30 | global_step = tf.train.get_or_create_global_step() 31 | 32 | learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) 33 | 34 | # Implements linear decay of the learning rate. 35 | learning_rate = tf.train.polynomial_decay( 36 | learning_rate, 37 | global_step, 38 | num_train_steps, 39 | end_learning_rate=0.0, 40 | power=1.0, 41 | cycle=False) 42 | 43 | # Implements linear warmup. I.e., if global_step < num_warmup_steps, the 44 | # learning rate will be `global_step/num_warmup_steps * init_lr`. 45 | if num_warmup_steps: 46 | global_steps_int = tf.cast(global_step, tf.int32) 47 | warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) 48 | 49 | global_steps_float = tf.cast(global_steps_int, tf.float32) 50 | warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) 51 | 52 | warmup_percent_done = global_steps_float / warmup_steps_float 53 | warmup_learning_rate = init_lr * warmup_percent_done 54 | 55 | is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) 56 | learning_rate = ((1.0 - is_warmup) * learning_rate + 57 | is_warmup * warmup_learning_rate) 58 | 59 | # It is recommended that you use this optimizer for fine tuning, since this 60 | # is how the model was trained (note that the Adam m/v variables are NOT 61 | # loaded from init_checkpoint.) 
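    # At this point `learning_rate` implements linear warmup followed by linear
    # decay: for global_step < num_warmup_steps it equals
    # init_lr * global_step / num_warmup_steps, and afterwards it follows the
    # polynomial decay above (power=1.0), i.e.
    # init_lr * (1 - global_step / num_train_steps), reaching 0.0 at
    # num_train_steps.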
62 | optimizer = AdamWeightDecayOptimizer( 63 | learning_rate=learning_rate, 64 | weight_decay_rate=0.01, 65 | beta_1=0.9, 66 | beta_2=0.999, 67 | epsilon=1e-6, 68 | exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) 69 | 70 | if use_tpu: 71 | optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) 72 | 73 | tvars = tf.trainable_variables() 74 | grads = tf.gradients(loss, tvars) 75 | 76 | # This is how the model was pre-trained. 77 | (grads, _) = tf.clip_by_global_norm(grads, clip_norm=5.0) 78 | 79 | train_op = optimizer.apply_gradients( 80 | zip(grads, tvars), global_step=global_step) 81 | 82 | new_global_step = global_step + 1 83 | train_op = tf.group(train_op, [global_step.assign(new_global_step)]) 84 | return train_op 85 | 86 | 87 | class AdamWeightDecayOptimizer(tf.train.Optimizer): 88 | """A basic Adam optimizer that includes "correct" L2 weight decay.""" 89 | 90 | def __init__(self, 91 | learning_rate, 92 | weight_decay_rate=0.0, 93 | beta_1=0.9, 94 | beta_2=0.999, 95 | epsilon=1e-6, 96 | exclude_from_weight_decay=None, 97 | name="AdamWeightDecayOptimizer"): 98 | """Constructs a AdamWeightDecayOptimizer.""" 99 | super(AdamWeightDecayOptimizer, self).__init__(False, name) 100 | 101 | self.learning_rate = learning_rate 102 | self.weight_decay_rate = weight_decay_rate 103 | self.beta_1 = beta_1 104 | self.beta_2 = beta_2 105 | self.epsilon = epsilon 106 | self.exclude_from_weight_decay = exclude_from_weight_decay 107 | 108 | def apply_gradients(self, grads_and_vars, global_step=None, name=None): 109 | """See base class.""" 110 | assignments = [] 111 | for (grad, param) in grads_and_vars: 112 | if grad is None or param is None: 113 | continue 114 | 115 | param_name = self._get_variable_name(param.name) 116 | 117 | m = tf.get_variable( 118 | name=param_name + "/adam_m", 119 | shape=param.shape.as_list(), 120 | dtype=tf.float32, 121 | trainable=False, 122 | initializer=tf.zeros_initializer()) 123 | v = tf.get_variable( 124 | name=param_name + "/adam_v", 125 | shape=param.shape.as_list(), 126 | dtype=tf.float32, 127 | trainable=False, 128 | initializer=tf.zeros_initializer()) 129 | 130 | # Standard Adam update. 131 | next_m = (tf.multiply(self.beta_1, m) + 132 | tf.multiply(1.0 - self.beta_1, grad)) 133 | next_v = (tf.multiply(self.beta_2, v) + 134 | tf.multiply(1.0 - self.beta_2, tf.square(grad))) 135 | 136 | update = next_m / (tf.sqrt(next_v) + self.epsilon) 137 | 138 | # Just adding the square of the weights to the loss function is *not* 139 | # the correct way of using L2 regularization/weight decay with Adam, 140 | # since that will interact with the m and v parameters in strange ways. 141 | # 142 | # Instead we want ot decay the weights in a manner that doesn't interact 143 | # with the m/v parameters. This is equivalent to adding the square 144 | # of the weights to the loss with plain (non-momentum) SGD. 
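            # Decoupled (AdamW-style) weight decay: the update applied below is
            #   param <- param - lr * (m / (sqrt(v) + eps) + weight_decay_rate * param)
            # Note that no Adam bias correction is applied, which matches the
            # original BERT optimizer.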
145 | if self._do_use_weight_decay(param_name): 146 | update += self.weight_decay_rate * param 147 | 148 | update_with_lr = self.learning_rate * update 149 | 150 | next_param = param - update_with_lr 151 | 152 | assignments.extend( 153 | [param.assign(next_param), 154 | m.assign(next_m), 155 | v.assign(next_v)]) 156 | return tf.group(*assignments, name=name) 157 | 158 | def _do_use_weight_decay(self, param_name): 159 | """Whether to use L2 weight decay for `param_name`.""" 160 | if not self.weight_decay_rate: 161 | return False 162 | if self.exclude_from_weight_decay: 163 | for r in self.exclude_from_weight_decay: 164 | if re.search(r, param_name) is not None: 165 | return False 166 | return True 167 | 168 | def _get_variable_name(self, param_name): 169 | """Get the variable name from the tensor name.""" 170 | m = re.match("^(.*):\\d+$", param_name) 171 | if m is not None: 172 | param_name = m.group(1) 173 | return param_name 174 | -------------------------------------------------------------------------------- /Model/partitioning.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | sys.path.append("..") 4 | import pickle as pkl 5 | 6 | n_bucket = 8 7 | 8 | with open("./data/vocab", "rb") as f: 9 | vocab = pkl.load(f) 10 | 11 | total_freq = np.sum(vocab.frequency) 12 | print(len(vocab.frequency)) 13 | unit_freq = total_freq/n_bucket 14 | 15 | offset = 0 16 | bucket_list = [] 17 | 18 | for i in range(n_bucket): 19 | lower = offset 20 | count = 0 21 | for j in range(lower, len(vocab.frequency)): 22 | count += vocab.frequency[j] 23 | if count >= unit_freq or j == len(vocab.frequency)-1: 24 | upper = j 25 | break 26 | 27 | bucket_list.append([lower, upper]) 28 | offset = upper + 1 29 | 30 | print(bucket_list) 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /Model/run_finetune.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Run masked LM/next sentence masked_lm pre-training for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | import os 21 | import sys 22 | sys.path.append("..") 23 | import optimization 24 | import collections 25 | import re 26 | import tensorflow.compat.v1 as tf 27 | tf.disable_v2_behavior() 28 | 29 | from sklearn.metrics import roc_curve, auc, classification_report 30 | from run_pretrain import * 31 | import pandas as pd 32 | import numpy as np 33 | import pickle as pkl 34 | import time 35 | 36 | def _decode_record(record, name_to_features): 37 | """Decodes a record to a TensorFlow example.""" 38 | example = tf.parse_single_example(record, name_to_features) 39 | 40 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 
41 | # So cast all int64 to int32. 42 | for name in list(example.keys()): 43 | t = example[name] 44 | if t.dtype == tf.int64: 45 | t = tf.to_int32(t) 46 | example[name] = t 47 | return example 48 | 49 | def del_flags(FLAGS, keys_list): 50 | for keys in keys_list: 51 | FLAGS.__delattr__(keys) 52 | return 53 | 54 | def input_fn(input_files, 55 | is_training, 56 | num_cpu_threads=4): 57 | """ The actual input function""" 58 | 59 | name_to_features = { 60 | "address": 61 | tf.FixedLenFeature([1], tf.int64), 62 | "label": 63 | tf.FixedLenFeature([1], tf.float32), 64 | "input_ids": 65 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 66 | "input_positions": 67 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 68 | "input_counts": 69 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 70 | "input_mask": 71 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 72 | "input_io_flags": 73 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 74 | "input_values": 75 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64) 76 | } 77 | 78 | if is_training: 79 | d = tf.data.TFRecordDataset(input_files) 80 | d = d.repeat(FLAGS.epoch).shuffle(100) 81 | 82 | else: 83 | d = tf.data.TFRecordDataset(input_files) 84 | 85 | d = d.map(lambda record: _decode_record(record, name_to_features), num_parallel_calls=num_cpu_threads) 86 | d = d.batch(batch_size=FLAGS.batch_size) 87 | 88 | iterator = d.make_one_shot_iterator() 89 | features = iterator.get_next() 90 | 91 | return features 92 | 93 | 94 | def model_fn(features, mode, bert_config, vocab, init_checkpoint, learning_rate, 95 | num_train_steps, num_warmup_steps, load_cross, use_one_hot_embeddings): 96 | """The `model_fn` for TPUEstimator.""" 97 | 98 | tf.logging.info("*** Features ***") 99 | for name in sorted(features.keys()): 100 | tf.logging.info("name = %s, shape = %s" % (name, 101 | features[name].shape)) 102 | 103 | label = tf.squeeze(features["label"]) # squeeze is important 104 | input_ids = features["input_ids"] 105 | input_positions = features["input_positions"] 106 | input_mask = features["input_mask"] 107 | input_io_flags = features["input_io_flags"] 108 | input_values = features["input_values"] 109 | input_counts = features["input_counts"] 110 | 111 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 112 | 113 | model = modeling.BertModel( 114 | config=bert_config, 115 | is_training=is_training, 116 | input_ids=input_ids, 117 | input_positions=input_positions, 118 | input_io_flags=input_io_flags, 119 | input_amounts=input_values, 120 | input_counts=input_counts, 121 | input_mask=input_mask, 122 | token_type_ids=None, 123 | use_one_hot_embeddings=use_one_hot_embeddings, 124 | cross_share=FLAGS.cross_share) 125 | 126 | transformer_output = model.get_sequence_output() 127 | print(transformer_output) 128 | with tf.variable_scope("MLP", reuse=tf.AUTO_REUSE): 129 | 130 | # inp = tf.reduce_mean(transformer_output, 1) 131 | inp = transformer_output[:,0,:] 132 | 133 | dnn1 = tf.layers.dense(inp, FLAGS.hidden_size, activation=tf.nn.relu, name='f1') 134 | dnn2 = tf.layers.dense(dnn1, FLAGS.hidden_size, activation=tf.nn.relu, name='f2') 135 | logit = tf.squeeze(tf.layers.dense(dnn2 + dnn1, 1, activation=None, name='logit')) 136 | y_hat = tf.sigmoid(logit) 137 | 138 | # print("--------------------") 139 | # print("label:", label) 140 | # print("logit:", logit) 141 | # print("--------------------") 142 | loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=logit)) 143 | 144 | total_loss = loss 145 | tvars = 
tf.trainable_variables() 146 | initialized_variable_names = {} 147 | scaffold_fn = None 148 | 149 | if init_checkpoint: 150 | (assignment_map, initialized_variable_names 151 | ) = modeling.get_assignment_map_from_checkpoint( 152 | tvars, init_checkpoint) 153 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 154 | if load_cross: 155 | # multi-layer parameter loading 156 | 157 | name_to_variable = collections.OrderedDict() 158 | for var in tvars: 159 | name = var.name 160 | m = re.match("^(.*):\\d+$", name) 161 | if m is not None: 162 | name = m.group(1) 163 | name_to_variable[name] = var 164 | 165 | for layer_index in range(bert_config.num_hidden_layers): 166 | 167 | assignment_map = collections.OrderedDict() 168 | for name in name_to_variable.keys(): 169 | if "layer_" + str(layer_index) in name: 170 | 171 | var_name_list = name.split("/") 172 | var_name_list[2] = "shared_layer" 173 | load_name = "/".join(var_name_list) 174 | # assignment_map[name] = new_name 175 | assignment_map[load_name] = name 176 | initialized_variable_names[name] = 1 177 | initialized_variable_names[name + ":0"] = 1 178 | 179 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 180 | 181 | tf.logging.info("**** Trainable Variables ****") 182 | for var in tvars: 183 | init_string = "" 184 | if var.name in initialized_variable_names: 185 | init_string = ", *INIT_FROM_CKPT*" 186 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 187 | init_string) 188 | 189 | if mode == tf.estimator.ModeKeys.TRAIN: 190 | train_op = optimization.create_optimizer(total_loss, learning_rate, 191 | num_train_steps, 192 | num_warmup_steps, False) 193 | 194 | return model, train_op, total_loss 195 | 196 | elif mode == tf.estimator.ModeKeys.EVAL: 197 | 198 | return model, y_hat, total_loss 199 | 200 | else: 201 | raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) 202 | 203 | 204 | def main(_): 205 | 206 | # load label 207 | phisher_account = pd.read_csv("../Data/phisher_account.txt", names=["account"]) 208 | phisher_account_set = set(phisher_account.account.values) 209 | 210 | def is_phish(address): 211 | if address in phisher_account_set: 212 | return 1.0 213 | else: 214 | return 0.0 215 | 216 | mode = tf.estimator.ModeKeys.TRAIN 217 | train_input_files = FLAGS.train_input_file 218 | train_features = input_fn(train_input_files, is_training=True) 219 | 220 | # modeling 221 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 222 | tf.gfile.MakeDirs(FLAGS.checkpointDir) 223 | 224 | # load vocab 225 | vocab_file_name = FLAGS.data_dir + FLAGS.vocab_filename 226 | with open(vocab_file_name, "rb") as f: 227 | vocab = pkl.load(f) 228 | 229 | # must have checkpoint 230 | if FLAGS.init_checkpoint==None: 231 | # raise ValueError("Must need a checkpoint for finetuning") 232 | print("No checkpoint!") 233 | 234 | train_bert_model, train_op, total_loss = model_fn(train_features, mode, bert_config, vocab, 235 | FLAGS.init_checkpoint, 236 | FLAGS.learning_rate, 237 | FLAGS.num_train_steps, FLAGS.num_warmup_steps, FLAGS.load_cross, False) 238 | 239 | # saver define 240 | tvars = tf.trainable_variables() 241 | saver = tf.train.Saver(max_to_keep=30, var_list=tvars) 242 | 243 | # start session 244 | sess = tf.Session() 245 | sess.run(tf.global_variables_initializer()) 246 | 247 | # start TRAINING 248 | losses = [] 249 | iter = 0 250 | start = time.time() 251 | while True: 252 | try: 253 | _, loss = sess.run([train_op, total_loss]) 254 | losses.append(loss) 255 | 256 | if iter % 
100 == 0: 257 | end = time.time() 258 | loss = np.mean(losses) 259 | print("iter=%d, loss=%f, time=%.2fs" % (iter, loss, end - start)) 260 | losses = [] 261 | start = time.time() 262 | 263 | iter += 1 264 | 265 | except Exception as e: 266 | print("Out of Sequence") 267 | saver.save(sess, os.path.join(FLAGS.checkpointDir, "bert_finetune")) 268 | break 269 | 270 | # Evaluation 271 | mode = tf.estimator.ModeKeys.EVAL 272 | test_input_files = FLAGS.test_input_file 273 | test_features = input_fn(test_input_files, is_training=False) 274 | # do not load checkpoint 275 | test_bert_model, y_hat, total_loss = model_fn(test_features, mode, bert_config, vocab, 276 | os.path.join(FLAGS.checkpointDir, "bert_finetune"), 277 | FLAGS.learning_rate, 278 | FLAGS.num_train_steps, FLAGS.num_warmup_steps, False, False) 279 | 280 | address_id_list = [] 281 | y_hat_list = [] 282 | label_list = [] 283 | 284 | iter = 0 285 | start = time.time() 286 | while True: 287 | try: 288 | address_id_v, y_hat_v, label_v, loss = sess.run([test_features["address"], y_hat, test_features["label"], total_loss]) 289 | address_id_list += list(np.squeeze(address_id_v)) 290 | y_hat_list += list(y_hat_v) 291 | label_list += list(label_v) 292 | losses.append(loss) 293 | 294 | if iter % 100 == 0: 295 | end = time.time() 296 | print("iter=%d, time=%.2fs" % (iter, end - start)) 297 | start = time.time() 298 | 299 | iter += 1 300 | 301 | except Exception as e: 302 | print("Out of Sequence") 303 | # save model 304 | # saver.save(sess, os.path.join(FLAGS.checkpointDir, "model_" + str(iter))) 305 | break 306 | 307 | sess.close() 308 | 309 | # generate final result 310 | address_id_list = np.array(address_id_list).reshape([-1]) 311 | y_hat_list = np.array(y_hat_list).reshape([-1]) 312 | label_list = np.array(label_list).reshape([-1]) 313 | 314 | # aggregation 315 | # group by embedding according to address 316 | address_to_pred_proba = {} 317 | # address_to_label = {} 318 | for i in range(len(address_id_list)): 319 | address = address_id_list[i] 320 | pred_proba = y_hat_list[i] 321 | # label = label_list[i] 322 | try: 323 | address_to_pred_proba[address].append(pred_proba) 324 | # address_to_label[address].append(label) 325 | except: 326 | address_to_pred_proba[address] = [pred_proba] 327 | # address_to_label[address] = [label] 328 | 329 | # group to one 330 | address_list = [] 331 | agg_y_hat_list = [] 332 | agg_label_list = [] 333 | 334 | for addr, pred_proba_list in address_to_pred_proba.items(): 335 | address_list.append(addr) 336 | if len(pred_proba_list) > 1: 337 | agg_y_hat_list.append(np.mean(pred_proba_list, axis=0)) 338 | else: 339 | agg_y_hat_list.append(pred_proba_list[0]) 340 | 341 | agg_label_list.append(is_phish(vocab.id_to_tokens[addr])) 342 | 343 | # print("================ROC Curve====================") 344 | fpr, tpr, thresholds = roc_curve(agg_label_list, agg_y_hat_list, pos_label=1) 345 | print("AUC=", auc(fpr, tpr)) 346 | 347 | print(np.sum(agg_label_list)) 348 | print(np.sum(agg_y_hat_list)) 349 | 350 | # for threshold in [0.01, 0.03, 0.05]: 351 | for threshold in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]: 352 | 353 | print("threshold =", threshold) 354 | y_pred = np.zeros_like(agg_y_hat_list) 355 | y_pred[np.where(np.array(agg_y_hat_list) >= threshold)[0]] = 1 356 | print(np.sum(y_pred)) 357 | print(classification_report(agg_label_list, y_pred, digits=4)) 358 | 359 | return 360 | 361 | if __name__ == '__main__': 362 | 363 | del_flags(FLAGS, ["do_train", 
"do_eval", "cross_share", "load_cross", "epoch", "max_seq_length", "train_input_file", "test_input_file", "init_checkpoint","learning_rate"]) 364 | flags.DEFINE_bool("do_train", False, "") 365 | flags.DEFINE_bool("do_eval", True, "") 366 | flags.DEFINE_bool("cross_share", False, "whether to share or not") 367 | flags.DEFINE_bool("load_cross", True, "whether to load from cross") 368 | flags.DEFINE_integer("epoch", 1, "Epoch for finetune") 369 | flags.DEFINE_integer("max_seq_length", 100, "") 370 | flags.DEFINE_string("train_input_file", "./data/finetune_train.tfrecord", "Input train file for finetuning") 371 | flags.DEFINE_string("test_input_file", "./data/finetune_test.tfrecord", "Input test file for finetuning") 372 | flags.DEFINE_string("init_checkpoint", None, "Initial checkpoint (usually from a pre-trained BERT model).") 373 | flags.DEFINE_integer("hidden_size", 128, "Hidden size for downside MLP.") 374 | flags.DEFINE_float("learning_rate", 3e-4, "") 375 | 376 | print("==========Parameters===========") 377 | print("cross_share:", FLAGS.cross_share) 378 | print("load_cross:", FLAGS.load_cross) 379 | print("learning_rate:", FLAGS.learning_rate) 380 | tf.app.run() -------------------------------------------------------------------------------- /Model/run_pretrain.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Run masked LM/next sentence masked_lm pre-training for BERT.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | import os 21 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 22 | import sys 23 | 24 | import modeling 25 | sys.path.append("..") 26 | import optimization 27 | import tensorflow.compat.v1 as tf 28 | tf.disable_v2_behavior() 29 | 30 | import numpy as np 31 | 32 | import pickle as pkl 33 | # import time 34 | from timeit import default_timer as timer 35 | import math 36 | 37 | flags = tf.flags 38 | FLAGS = flags.FLAGS 39 | 40 | # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 41 | # os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 42 | 43 | ## Required parameters 44 | flags.DEFINE_string( 45 | "bert_config_file", "./zipzap_config.json", 46 | "The config json file corresponding to the pre-trained BERT model. 
" 47 | "This specifies the model architecture.") 48 | 49 | flags.DEFINE_string( 50 | "train_input_file", "./data/train.tfrecord", 51 | "Input TF example files (can be a glob or comma separated).") 52 | 53 | flags.DEFINE_string( 54 | "test_input_file", "./data/test.tfrecord", 55 | "Input TF example files (can be a glob or comma separated).") 56 | 57 | flags.DEFINE_string( 58 | "checkpointDir", "ckpt_dir", 59 | "The output directory where the model checkpoints will be written.") 60 | 61 | flags.DEFINE_string("signature", 'default', "signature_name") 62 | 63 | ## Other parameters 64 | flags.DEFINE_string("init_checkpoint", None, "Initial checkpoint (usually from a pre-trained BERT model).") 65 | flags.DEFINE_integer("max_seq_length", 29, "") 66 | flags.DEFINE_float("masked_lm_prob", 0.8, "Masked LM probability.") 67 | flags.DEFINE_bool("do_train", True, "") 68 | flags.DEFINE_bool("do_eval", False, "") 69 | flags.DEFINE_integer("batch_size", 256, "") 70 | flags.DEFINE_integer("epoch", 5, "") 71 | flags.DEFINE_float("learning_rate", 1e-4, "") 72 | flags.DEFINE_integer("num_train_steps", 10000000, "Number of training steps.") 73 | flags.DEFINE_integer("num_warmup_steps", 100, "Number of warmup steps.") 74 | flags.DEFINE_integer("save_checkpoints_steps", 8000, "") 75 | flags.DEFINE_integer("iterations_per_loop", 2000, "How many steps to make in each estimator call.") 76 | flags.DEFINE_integer("max_eval_steps", 1000, "Maximum number of eval steps.") 77 | flags.DEFINE_integer("neg_sample_num", 5000, "The number of negative samples in a batch") 78 | flags.DEFINE_string("neg_strategy", "zip", "Strategy of negative sampling") 79 | flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") 80 | flags.DEFINE_string("data_dir", './data/', "data dir.") 81 | flags.DEFINE_bool("cross_share", True, "whether to share or not") 82 | flags.DEFINE_bool("load_cross", False, "whether to load from cross") 83 | flags.DEFINE_string("vocab_filename", "vocab", "vocab filename") 84 | 85 | MAX_PREDICTIONS_PER_SEQ = math.ceil(FLAGS.max_seq_length * FLAGS.masked_lm_prob) 86 | 87 | print("MAX_SEQUENCE_LENGTH:", FLAGS.max_seq_length) 88 | print("MAX_PREDICTIONS_PER_SEQ:", MAX_PREDICTIONS_PER_SEQ) 89 | 90 | 91 | def input_fn(input_files, 92 | is_training, 93 | num_cpu_threads=4): 94 | """ The actual input function""" 95 | 96 | name_to_features = { 97 | "address": 98 | tf.FixedLenFeature([1], tf.int64), 99 | "input_ids": 100 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 101 | "input_positions": 102 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 103 | "input_counts": 104 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 105 | "input_mask": 106 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 107 | "input_io_flags": 108 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 109 | "input_values": 110 | tf.FixedLenFeature([FLAGS.max_seq_length], tf.int64), 111 | "masked_lm_positions": 112 | tf.FixedLenFeature([MAX_PREDICTIONS_PER_SEQ], tf.int64), 113 | "masked_lm_ids": 114 | tf.FixedLenFeature([MAX_PREDICTIONS_PER_SEQ], tf.int64), 115 | "masked_lm_weights": 116 | tf.FixedLenFeature([MAX_PREDICTIONS_PER_SEQ], tf.float32) 117 | } 118 | 119 | if is_training: 120 | d = tf.data.TFRecordDataset(input_files) 121 | d = d.repeat(FLAGS.epoch).shuffle(100) 122 | 123 | else: 124 | d = tf.data.TFRecordDataset(input_files) 125 | 126 | d = d.map(lambda record: _decode_record(record, name_to_features), num_parallel_calls=num_cpu_threads) 127 | d = d.batch(batch_size=FLAGS.batch_size) 128 | 129 | iterator 
= d.make_one_shot_iterator() 130 | features = iterator.get_next() 131 | 132 | return features 133 | 134 | 135 | def model_fn(features, mode, bert_config, vocab, init_checkpoint, learning_rate, 136 | num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings): 137 | """The `model_fn` for TPUEstimator.""" 138 | 139 | # tf.logging.info("*** Features ***") 140 | # for name in sorted(features.keys()): 141 | # tf.logging.info("name = %s, shape = %s" % (name, 142 | # features[name].shape)) 143 | 144 | input_ids = features["input_ids"] 145 | input_positions = features["input_positions"] 146 | input_mask = features["input_mask"] 147 | input_io_flags = features["input_io_flags"] 148 | input_values = features["input_values"] 149 | input_counts = features["input_counts"] 150 | masked_lm_positions = features["masked_lm_positions"] 151 | masked_lm_ids = features["masked_lm_ids"] 152 | masked_lm_weights = features["masked_lm_weights"] 153 | 154 | is_training = (mode == tf.estimator.ModeKeys.TRAIN) 155 | 156 | model = modeling.BertModel( 157 | config=bert_config, 158 | is_training=is_training, 159 | input_ids=input_ids, 160 | input_positions=input_positions, 161 | input_io_flags=input_io_flags, 162 | input_amounts=input_values, 163 | input_counts=input_counts, 164 | input_mask=input_mask, 165 | token_type_ids=None, 166 | use_one_hot_embeddings=use_one_hot_embeddings, 167 | cross_share=FLAGS.cross_share) 168 | 169 | embedding_table_list, factorize_table_list = model.get_embedding_table() 170 | 171 | (masked_lm_loss, 172 | masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output_negative_sampling( 173 | bert_config, 174 | model.get_sequence_output(), 175 | embedding_table_list, 176 | factorize_table_list, 177 | masked_lm_positions, 178 | masked_lm_ids, 179 | masked_lm_weights, 180 | vocab) # model use the token embedding table as the output_weights 181 | 182 | total_loss = masked_lm_loss 183 | tvars = tf.trainable_variables() 184 | initialized_variable_names = {} 185 | scaffold_fn = None 186 | 187 | if init_checkpoint: 188 | (assignment_map, initialized_variable_names 189 | ) = modeling.get_assignment_map_from_checkpoint( 190 | tvars, init_checkpoint) 191 | if use_tpu: 192 | 193 | def tpu_scaffold(): 194 | tf.train.init_from_checkpoint(init_checkpoint, 195 | assignment_map) 196 | return tf.train.Scaffold() 197 | 198 | scaffold_fn = tpu_scaffold 199 | else: 200 | tf.train.init_from_checkpoint(init_checkpoint, assignment_map) 201 | 202 | tf.logging.info("**** Trainable Variables ****") 203 | for var in tvars: 204 | init_string = "" 205 | if var.name in initialized_variable_names: 206 | init_string = ", *INIT_FROM_CKPT*" 207 | tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, 208 | init_string) 209 | 210 | if mode == tf.estimator.ModeKeys.TRAIN: 211 | train_op = optimization.create_optimizer(total_loss, learning_rate, 212 | num_train_steps, 213 | num_warmup_steps, use_tpu) 214 | 215 | return model, train_op, total_loss 216 | # output_spec = tf.estimator.EstimatorSpec( 217 | # mode=mode, 218 | # loss=total_loss, 219 | # train_op=train_op, 220 | # scaffold=scaffold_fn) 221 | 222 | elif mode == tf.estimator.ModeKeys.EVAL: 223 | 224 | def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, masked_lm_weights): 225 | """Computes the loss and accuracy of the model.""" 226 | masked_lm_log_probs = tf.reshape(masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]]) 227 | masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) 228 | 
masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) 229 | masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) 230 | masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) 231 | masked_lm_accuracy = tf.metrics.accuracy(labels=masked_lm_ids, predictions=masked_lm_predictions, 232 | weights=masked_lm_weights) 233 | masked_lm_mean_loss = tf.metrics.mean(values=masked_lm_example_loss, weights=masked_lm_weights) 234 | 235 | return { 236 | "masked_lm_accuracy": masked_lm_accuracy, 237 | "masked_lm_loss": masked_lm_mean_loss, 238 | } 239 | 240 | tf.add_to_collection('eval_sp', masked_lm_log_probs) 241 | tf.add_to_collection('eval_sp', input_ids) 242 | tf.add_to_collection('eval_sp', masked_lm_ids) 243 | 244 | eval_metrics = metric_fn(masked_lm_example_loss, 245 | masked_lm_log_probs, 246 | masked_lm_ids, 247 | masked_lm_weights) 248 | 249 | # output_spec = tf.estimator.EstimatorSpec( 250 | # mode=mode, 251 | # loss=total_loss, 252 | # eval_metric_ops=eval_metrics, 253 | # scaffold=scaffold_fn) 254 | 255 | return model, total_loss 256 | 257 | else: 258 | raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) 259 | 260 | 261 | def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, 262 | label_ids, label_weights): 263 | """Get loss and log probs for the masked LM.""" 264 | # [batch_size*label_size, dim] 265 | input_tensor = gather_indexes(input_tensor, positions) 266 | 267 | with tf.variable_scope("cls/predictions"): 268 | # We apply one more non-linear transformation before the output layer. 269 | # This matrix is not used after pre-training. 270 | with tf.variable_scope("transform"): 271 | input_tensor = tf.layers.dense( 272 | input_tensor, 273 | units=bert_config.hidden_size, 274 | activation=modeling.get_activation(bert_config.hidden_act), 275 | kernel_initializer=modeling.create_initializer( 276 | bert_config.initializer_range)) 277 | input_tensor = modeling.layer_norm(input_tensor) 278 | 279 | # The output weights are the same as the input embeddings, but there is 280 | # an output-only bias for each token. 281 | output_bias = tf.get_variable( 282 | "output_bias", 283 | shape=[output_weights.shape[0]], 284 | initializer=tf.zeros_initializer()) 285 | logits = tf.matmul(input_tensor, output_weights, transpose_b=True) 286 | logits = tf.nn.bias_add(logits, output_bias) 287 | # logits, (bs*label_size, vocab_size) 288 | log_probs = tf.nn.log_softmax(logits, -1) 289 | 290 | label_ids = tf.reshape(label_ids, [-1]) 291 | label_weights = tf.reshape(label_weights, [-1]) 292 | 293 | one_hot_labels = tf.one_hot( 294 | label_ids, depth=output_weights.shape[0], dtype=tf.float32) 295 | 296 | # The `positions` tensor might be zero-padded (if the sequence is too 297 | # short to have the maximum number of predictions). The `label_weights` 298 | # tensor has a value of 1.0 for every real prediction and 0.0 for the 299 | # padding predictions. 
300 | per_example_loss = -tf.reduce_sum( 301 | log_probs * one_hot_labels, axis=[-1]) 302 | numerator = tf.reduce_sum(label_weights * per_example_loss) 303 | denominator = tf.reduce_sum(label_weights) + 1e-5 304 | loss = numerator / denominator 305 | 306 | return (loss, per_example_loss, log_probs) 307 | 308 | 309 | def get_masked_lm_output_negative_sampling(bert_config, input_tensor, 310 | embedding_table_list, factorize_table_list, 311 | positions, label_ids, label_weights, vocab): 312 | """Get loss and log probs for the masked LM.""" 313 | 314 | # negative sample randomly 315 | word_num = len(vocab.vocab_words) - 3 316 | 317 | if FLAGS.neg_strategy == "uniform": 318 | neg_ids, _, _ = tf.nn.uniform_candidate_sampler(true_classes=[[len(vocab.vocab_words)]], 319 | num_true=1, 320 | num_sampled=FLAGS.neg_sample_num, 321 | unique=True, 322 | range_max=word_num) 323 | 324 | elif FLAGS.neg_strategy == "zip": 325 | neg_ids, _, _ = tf.nn.log_uniform_candidate_sampler(true_classes=[[len(vocab.vocab_words)]], 326 | num_true=1, 327 | num_sampled=FLAGS.neg_sample_num, 328 | unique=True, 329 | range_max=word_num) 330 | 331 | elif FLAGS.neg_strategy == "freq": 332 | # negative sample based on frequency 333 | neg_ids, _, _ = tf.nn.fixed_unigram_candidate_sampler(true_classes=[[len(vocab.vocab_words)]], 334 | num_true=1, 335 | num_sampled=FLAGS.neg_sample_num, 336 | unique=True, 337 | range_max=word_num, 338 | unigrams=list( 339 | map(lambda x: pow(x, 1 / 1), vocab.frequency[3:])) 340 | ) 341 | 342 | else: 343 | raise ValueError("Please select correct negative sampling strategy: uniform, zip, .") 344 | 345 | neg_ids = tf.cast(neg_ids, tf.int32) 346 | neg_ids = neg_ids + 1 + 3 # + 4 (1 padding, 2 mask, 3 not use) 347 | 348 | # [batch_size*label_size, dim] 349 | input_tensor = gather_indexes(input_tensor, positions) 350 | 351 | with tf.variable_scope("cls/predictions"): 352 | # We apply one more non-linear transformation before the output layer. 353 | # This matrix is not used after pre-training. 354 | with tf.variable_scope("transform"): 355 | input_tensor = tf.layers.dense( 356 | input_tensor, 357 | units=bert_config.hidden_size, 358 | activation=modeling.get_activation(bert_config.hidden_act), 359 | kernel_initializer=modeling.create_initializer( 360 | bert_config.initializer_range)) 361 | input_tensor = modeling.layer_norm(input_tensor) 362 | 363 | # label_ids = tf.reshape(label_ids, [-1]) 364 | label_weights = tf.reshape(label_weights, [-1]) 365 | 366 | pos_output_weights = modeling.new_embedding_lookup(label_ids, 367 | bert_config.bucket_list, 368 | embedding_table_list, 369 | factorize_table_list) 370 | 371 | pos_output_weights = tf.reshape(pos_output_weights, [-1, pos_output_weights.shape[-1]]) 372 | 373 | neg_ids = tf.expand_dims(neg_ids, axis=0) 374 | neg_output_weights = modeling.new_embedding_lookup(neg_ids, 375 | bert_config.bucket_list, 376 | embedding_table_list, 377 | factorize_table_list) 378 | 379 | neg_output_weights = tf.reshape(neg_output_weights, [-1, neg_output_weights.shape[-1]]) 380 | 381 | pos_logits = tf.reduce_sum(tf.multiply(input_tensor, pos_output_weights), axis=-1) # 768 382 | pos_logits = tf.expand_dims(pos_logits, axis=1) 383 | neg_logits = tf.matmul(input_tensor, neg_output_weights, transpose_b=True) # 768, 10000 384 | 385 | logits = tf.concat([pos_logits, neg_logits], axis=1) 386 | # The output weights are the same as the input embeddings, but there is 387 | # an output-only bias for each token. 
388 | output_bias = tf.get_variable( 389 | "output_bias", 390 | shape=[logits.shape[1]], 391 | initializer=tf.zeros_initializer()) 392 | 393 | logits = tf.nn.bias_add(logits, output_bias) 394 | log_probs = tf.nn.log_softmax(logits, -1) 395 | per_example_loss = -log_probs[:, 0] 396 | # The `positions` tensor might be zero-padded (if the sequence is too 397 | # short to have the maximum number of predictions). The `label_weights` 398 | # tensor has a value of 1.0 for every real prediction and 0.0 for the 399 | # padding predictions. 400 | numerator = tf.reduce_sum(label_weights * per_example_loss) 401 | denominator = tf.reduce_sum(label_weights) + 1e-5 402 | loss = numerator / denominator 403 | 404 | return (loss, per_example_loss, log_probs) 405 | 406 | 407 | def gather_indexes(sequence_tensor, positions): 408 | """Gathers the vectors at the specific positions over a minibatch.""" 409 | sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) 410 | batch_size = sequence_shape[0] 411 | seq_length = sequence_shape[1] 412 | width = sequence_shape[2] 413 | flat_offsets = tf.reshape( 414 | tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) 415 | flat_positions = tf.reshape(positions + flat_offsets, [-1]) 416 | flat_sequence_tensor = tf.reshape(sequence_tensor, 417 | [batch_size * seq_length, width]) 418 | output_tensor = tf.gather(flat_sequence_tensor, flat_positions) 419 | return output_tensor 420 | 421 | 422 | def _decode_record(record, name_to_features): 423 | """Decodes a record to a TensorFlow example.""" 424 | example = tf.parse_single_example(record, name_to_features) 425 | 426 | # tf.Example only supports tf.int64, but the TPU only supports tf.int32. 427 | # So cast all int64 to int32. 428 | for name in list(example.keys()): 429 | t = example[name] 430 | if t.dtype == tf.int64: 431 | t = tf.to_int32(t) 432 | example[name] = t 433 | return example 434 | 435 | 436 | def main(_): 437 | if FLAGS.do_train: 438 | mode = tf.estimator.ModeKeys.TRAIN 439 | input_files = FLAGS.train_input_file 440 | # load data 441 | features = input_fn(input_files, is_training=True) 442 | 443 | elif FLAGS.do_eval: 444 | mode = tf.estimator.ModeKeys.EVAL 445 | input_files = FLAGS.test_input_file 446 | features = input_fn(input_files, is_training=False) 447 | 448 | else: 449 | raise ValueError("Only TRAIN and EVAL modes are supported.") 450 | 451 | # modeling 452 | bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) 453 | tf.gfile.MakeDirs(FLAGS.checkpointDir) 454 | 455 | # load vocab 456 | vocab_file_name = FLAGS.data_dir + FLAGS.vocab_filename 457 | with open(vocab_file_name, "rb") as f: 458 | vocab = pkl.load(f) 459 | 460 | if FLAGS.do_train: 461 | bert_model, train_op, total_loss = model_fn(features, mode, bert_config, vocab, FLAGS.init_checkpoint, 462 | FLAGS.learning_rate, 463 | FLAGS.num_train_steps, FLAGS.num_warmup_steps, False, False) 464 | # saver define 465 | tvars = tf.trainable_variables() 466 | saver = tf.train.Saver(max_to_keep=30, var_list=tvars) 467 | 468 | # start session 469 | config = tf.ConfigProto(allow_soft_placement=True) 470 | config.gpu_options.allow_growth = True 471 | 472 | with tf.Session(config=config) as sess: 473 | sess.run(tf.global_variables_initializer()) 474 | losses = [] 475 | iter = 0 476 | # start = time.time() 477 | start = timer() 478 | while True: 479 | try: 480 | _, loss = sess.run([train_op, total_loss]) 481 | # loss = sess.run([total_loss]) 482 | losses.append(loss) 483 | 484 | if iter % 500 == 0: 485 | # end = 
time.time() 486 | end = timer() 487 | loss = np.mean(losses) 488 | print("iter=%d, loss=%f, time=%.3fs" % (iter, loss, end - start)) 489 | losses = [] 490 | # start = time.time() 491 | start = timer() 492 | 493 | if iter % FLAGS.save_checkpoints_steps == 0 and iter > 0: 494 | saver.save(sess, os.path.join(FLAGS.checkpointDir, "model_" + str(round(iter)))) 495 | 496 | iter += 1 497 | 498 | except Exception as e: 499 | # print("Out of Sequence, end of training...") 500 | print(e) 501 | # save model 502 | saver.save(sess, os.path.join(FLAGS.checkpointDir, "model_" + str(round(iter)))) 503 | break 504 | 505 | elif FLAGS.do_eval: 506 | # must have checkpoint 507 | if FLAGS.init_checkpoint == None: 508 | raise ValueError("Must need a checkpoint for evaluation") 509 | 510 | bert_model, total_loss = model_fn(features, mode, bert_config, vocab, FLAGS.init_checkpoint, 511 | FLAGS.learning_rate, 512 | FLAGS.num_train_steps, FLAGS.num_warmup_steps, False, False) 513 | 514 | # start session 515 | with tf.Session() as sess: 516 | sess.run(tf.global_variables_initializer()) 517 | losses = [] 518 | iter = 0 519 | # start = time.time() 520 | start = timer() 521 | while True: 522 | try: 523 | loss = sess.run(total_loss) 524 | losses.append(loss) 525 | 526 | if iter % 500 == 0: 527 | # end = time.time() 528 | end = timer() 529 | print("iter=%d, time=%.3fs" % (iter, end - start)) 530 | # start = time.time() 531 | start = timer() 532 | iter += 1 533 | 534 | except Exception as e: 535 | print("Out of Sequence") 536 | # save model 537 | # saver.save(sess, os.path.join(FLAGS.checkpointDir, "model_" + str(iter))) 538 | break 539 | 540 | final_loss = np.mean(losses) 541 | eval_sample_num = len(losses) 542 | 543 | print("========Evaluation Results==========") 544 | print("sample_num=%d, loss=%.2f" % (eval_sample_num, final_loss)) 545 | 546 | else: 547 | raise ValueError("Only TRAIN and EVAL modes are supported.") 548 | 549 | return 550 | 551 | 552 | if __name__ == '__main__': 553 | tf.app.run() -------------------------------------------------------------------------------- /Model/run_zipzap.sh: -------------------------------------------------------------------------------- 1 | python gen_seq.py # construct transaction sequence 2 | python gen_pretrain_data.py # generate pre-training data 3 | python gen_finetune_data.py # generate fine-tuning data 4 | 5 | python run_pretrain.py # pre-training 6 | python run_finetune.py --init_checkpoint=ckpt_dir/model_64000 # fine-tuning and evaluation 7 | -------------------------------------------------------------------------------- /Model/vocab.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | def convert_by_vocab(vocab, tokens): 4 | """Converts a sequence of [tokens|ids] using the vocab.""" 5 | output = [] 6 | for token in tokens: 7 | output.append(vocab[token]) 8 | return output 9 | 10 | class FreqVocab(object): 11 | """Runs end-to-end tokenziation.""" 12 | 13 | def __init__(self): 14 | self.counter = Counter() 15 | self.frequency = [] 16 | 17 | def update(self, eoa2seq): 18 | for eoa in eoa2seq.keys(): 19 | seq = eoa2seq[eoa] 20 | self.counter[eoa] = len(seq) 21 | self.counter.update(map(lambda x:x[0], seq)) 22 | 23 | def generate_vocab(self): 24 | self.token_count = len(self.counter.keys()) 25 | self.special_tokens = ["[MASK]", "[pad]", '[NO_USE]'] 26 | self.token_to_ids = {} # index begin from 1 27 | 28 | # first special tokens for frequency factorization 29 | for token in self.special_tokens: 30 
|             self.token_to_ids[token] = len(self.token_to_ids) + 1
31 | 
32 |         # then normal item
33 |         for token, count in self.counter.most_common():
34 |             self.token_to_ids[token] = len(self.token_to_ids) + 1
35 | 
36 |         # add count
37 |         for token in self.special_tokens:
38 |             self.counter[token] = 0
39 | 
40 |         self.id_to_tokens = {v: k for k, v in self.token_to_ids.items()}
41 |         self.vocab_words = list(self.token_to_ids.keys())
42 | 
43 |         id_list = sorted(list(self.token_to_ids.values()))
44 |         for id in id_list:
45 |             token = self.id_to_tokens[id]
46 |             self.frequency.append(self.counter[token])  # used for negative sampling
47 | 
48 |     def convert_tokens_to_ids(self, tokens):
49 |         return convert_by_vocab(self.token_to_ids, tokens)
50 | 
51 |     def convert_ids_to_tokens(self, ids):
52 |         return convert_by_vocab(self.id_to_tokens, ids)
53 | 
54 | 
--------------------------------------------------------------------------------
/Model/zipzap_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.2,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.2,
 5 |   "hidden_size": 64,
 6 |   "bucket_list": [[0, 63], [64, 751], [752, 4242], [4243, 16800], [16801, 56218], [56219, 175121], [175122, 514653], [514654, 2300000]],
 7 |   "factor_list": [64, 41, 27, 17, 11, 7, 5, 3],
 8 |   "intermediate_size": 64,
 9 |   "initializer_range": 0.02,
10 |   "max_position_embeddings": 200,
11 |   "num_attention_heads": 2,
12 |   "num_hidden_layers": 8,
13 |   "type_vocab_size": 2,
14 |   "vocab_size": 2300000
15 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # ZipZap
 3 | 
 4 | This is the code and dataset for the paper "ZipZap: Efficient Training of Language Models for Ethereum Fraud Detection" (The Web Conference 2024).
 5 | 
 6 | ## Getting Started
 7 | ### Requirements:
 8 | * Python >= 3.6.1
 9 | * NumPy >= 1.18.1
10 | * TensorFlow >= 2.0.0
11 | 
12 | ### 1. Download dataset
13 | 
14 | #### Step 1. Download the dataset from Google Drive:
15 | * [All in one](https://drive.google.com/file/d/1EXMIWEPTuu3bN2gJOaxmEXyDG-AsDUIL/view)
16 | 
17 | #### Step 2. Unzip the dataset under the "ZipZap/Data" directory:
18 | ```
19 | tar -xvf ZipZap_Data.tar.gz
20 | ```
21 | 
22 | ### 2. Run the code
23 | 
24 | Please refer to ./Model/run_zipzap.sh, which runs the full pipeline (sequence construction, pre-training/fine-tuning data generation, pre-training, and fine-tuning with evaluation).
25 | 
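26 | As a quick reference, the same pipeline can also be driven step by step, as in the sketch below. The flag names are taken from the scripts under ./Model; the concrete values are illustrative assumptions (they mirror the flag defaults and paths shown in this repository), and the checkpoint name passed to fine-tuning depends on the step at which pre-training saved it.
27 | 
28 | ```
29 | cd Model
30 | 
31 | # construct transaction sequences and generate pre-training / fine-tuning data
32 | python gen_seq.py
33 | python gen_pretrain_data.py
34 | python gen_finetune_data.py
35 | 
36 | # pre-training; the flag values below are illustrative (they mirror the flag
37 | # defaults in run_pretrain.py), and the config path is an assumption
38 | python run_pretrain.py \
39 |     --bert_config_file=zipzap_config.json \
40 |     --data_dir=./data/ \
41 |     --vocab_filename=vocab \
42 |     --train_input_file=./data/train.tfrecord \
43 |     --checkpointDir=ckpt_dir \
44 |     --neg_strategy=zip
45 | 
46 | # fine-tuning and evaluation from a saved pre-training checkpoint
47 | python run_finetune.py --init_checkpoint=ckpt_dir/model_64000
48 | ```
49 | 
--------------------------------------------------------------------------------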