├── algorithms ├── intent_extractors │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── attribute_node_model.cpython-36.pyc │ └── attribute_node_model.py ├── encoder │ ├── .ipynb_checkpoints │ │ └── __init__-checkpoint.py │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-36.pyc │ └── transformer │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── encoder_layer.cpython-36.pyc │ │ ├── ff_and_ln_layers.cpython-36.pyc │ │ └── transformer_block.cpython-36.pyc │ │ ├── ff_and_ln_layers.py │ │ ├── transformer_block.py │ │ └── encoder_layer.py └── util_funcs │ ├── __pycache__ │ ├── model_handler.cpython-36.pyc │ ├── data_preprocessing.cpython-36.pyc │ └── attribute_model_builder.cpython-36.pyc │ ├── model_handler.py │ ├── data_preprocessing.py │ └── attribute_model_builder.py ├── util_funcs ├── __pycache__ │ ├── trainer.cpython-36.pyc │ ├── after_training.cpython-36.pyc │ ├── prepare_for_trianing.cpython-36.pyc │ ├── prepare_for_trianning.cpython-36.pyc │ └── process_and_save_data_for_training.cpython-36.pyc ├── process_and_save_data_for_training.py ├── trainer.py ├── after_training.py └── prepare_for_trianing.py ├── modeling ├── __pycache__ │ └── build_electra_graph.cpython-36.pyc └── build_electra_graph.py ├── get_train_data.py ├── configure.yml ├── train.py └── README.md /algorithms/intent_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .attribute_node_model import nlu_model_producer,parser_producer -------------------------------------------------------------------------------- /algorithms/encoder/.ipynb_checkpoints/__init__-checkpoint.py: -------------------------------------------------------------------------------- 1 | from .transformer.encoder_layer import transformer_encoder_producer -------------------------------------------------------------------------------- /algorithms/encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformer.encoder_layer import transformer_encoder_producer,transformer_encoder_no_pe_producer,transformer_encoder -------------------------------------------------------------------------------- /util_funcs/__pycache__/trainer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/util_funcs/__pycache__/trainer.cpython-36.pyc -------------------------------------------------------------------------------- /util_funcs/__pycache__/after_training.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/util_funcs/__pycache__/after_training.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/encoder/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/encoder/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /modeling/__pycache__/build_electra_graph.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/modeling/__pycache__/build_electra_graph.cpython-36.pyc 
-------------------------------------------------------------------------------- /util_funcs/__pycache__/prepare_for_trianing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/util_funcs/__pycache__/prepare_for_trianing.cpython-36.pyc -------------------------------------------------------------------------------- /util_funcs/__pycache__/prepare_for_trianning.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/util_funcs/__pycache__/prepare_for_trianning.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/util_funcs/__pycache__/model_handler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/util_funcs/__pycache__/model_handler.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/intent_extractors/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/intent_extractors/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/encoder/transformer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/encoder/transformer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/util_funcs/__pycache__/data_preprocessing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/util_funcs/__pycache__/data_preprocessing.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/encoder/transformer/__pycache__/encoder_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/encoder/transformer/__pycache__/encoder_layer.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/util_funcs/__pycache__/attribute_model_builder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/util_funcs/__pycache__/attribute_model_builder.cpython-36.pyc -------------------------------------------------------------------------------- /util_funcs/__pycache__/process_and_save_data_for_training.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/util_funcs/__pycache__/process_and_save_data_for_training.cpython-36.pyc -------------------------------------------------------------------------------- 
/algorithms/encoder/transformer/__pycache__/ff_and_ln_layers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/encoder/transformer/__pycache__/ff_and_ln_layers.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/encoder/transformer/__pycache__/transformer_block.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/encoder/transformer/__pycache__/transformer_block.cpython-36.pyc -------------------------------------------------------------------------------- /algorithms/intent_extractors/__pycache__/attribute_node_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/XufengXufengXufeng/Electra_with_tensorflow/HEAD/algorithms/intent_extractors/__pycache__/attribute_node_model.cpython-36.pyc -------------------------------------------------------------------------------- /get_train_data.py: -------------------------------------------------------------------------------- 1 | from util_funcs.process_and_save_data_for_training import process_data_for_training 2 | 3 | import yaml 4 | with open("configure.yml") as f: 5 | configure = yaml.safe_load(f) 6 | 7 | if __name__=="__main__": 8 | process_data_for_training(configure["datafiles"],configure["char2id_loc"] 9 | ,configure["id2char_loc"],configure["train_data_loc"]) -------------------------------------------------------------------------------- /configure.yml: -------------------------------------------------------------------------------- 1 | 2 | datafiles: 3 | - "data/baike_qa_valid.json" 4 | - "data/baike_qa_train.json" 5 | char2id_loc: "data/processed_data/char2id.json" 6 | id2char_loc: "data/processed_data/id2char.json" 7 | train_data_loc: "data/processed_data/train.txt" 8 | embedding_size: 100 9 | generator_size: 50 10 | gn_blocks: 1 11 | seq_length: 512 12 | gn_heads: 4 13 | gff_filter_size: 150 14 | g_dev: "/CPU:0" 15 | dn_blocks: 3 16 | dn_heads: 6 17 | dff_filter_size: 300 18 | d_dev: "/CPU:0" 19 | d_factor: 50 20 | learning_rate: 1e-3 21 | max_len: 512 22 | -------------------------------------------------------------------------------- /algorithms/encoder/transformer/ff_and_ln_layers.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def layer_norm(hidden_size,name,dtype=tf.float32): 4 | """ 5 | 1. hidden_size is embedding size, which is also the last dim, normalize on that dim 6 | 2. 
tf.compat.v1.get_variable 7 | """ 8 | with tf.variable_scope(name, reuse=tf.AUTO_REUSE): 9 | scale = tf.get_variable("layer_norm_scale",[hidden_size], 10 | initializer=tf.ones_initializer(),dtype=dtype) 11 | bias = tf.get_variable("layer_norm_bias",[hidden_size], 12 | initializer=tf.zeros_initializer(),dtype=dtype) 13 | epsilon = 1e-6 14 | def norm(inputs): 15 | mean = tf.reduce_mean(inputs, axis=[-1], keepdims=True) 16 | variance = tf.reduce_mean(tf.square(inputs - mean), axis=[-1], keepdims=True) 17 | norm_x = (inputs - mean) * tf.rsqrt(variance + epsilon) 18 | return norm_x * scale + bias 19 | return norm 20 | def feed_forward(hidden_size,ff_filter_size,ff_dropout): 21 | filter_layer = tf.keras.layers.Dense(ff_filter_size,activation="relu") 22 | output_layer = tf.keras.layers.Dense(hidden_size) 23 | def ff(inputs): 24 | out = filter_layer(inputs) 25 | if ff_dropout: 26 | out = tf.nn.dropout(out,rate=ff_dropout) 27 | return output_layer(out) 28 | return ff -------------------------------------------------------------------------------- /algorithms/intent_extractors/attribute_node_model.py: -------------------------------------------------------------------------------- 1 | from ..util_funcs.attribute_model_builder import build_graph,train_tf_model 2 | from ..util_funcs.data_preprocessing import get_dicts,get_train_data 3 | from ..util_funcs.model_handler import get_interpreter,bot_thought 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | def nlu_model_producer(config_path="nlu_config.yml"): 9 | import yaml 10 | with open(config_path,"r") as f: 11 | config = yaml.safe_load(f) 12 | def train_model(model_id,train_data): 13 | w2id = get_dicts(train_data)[0] 14 | graph,access_dict = build_graph(**config["model_hp"],vocab_size=len(w2id),token_label_size = 1, 15 | intent_size = 2) 16 | processed_train_data = get_train_data(train_data,w2id) 17 | model_path = config["model_path"]+model_id+"/" 18 | train_tf_model(graph,access_dict,processed_train_data,w2id,model_path,**config["train_params"]) 19 | print("model {} is trained!".format(model_id)) 20 | return train_model 21 | 22 | def parser_producer(node_id,config_path="nlu_config.yml"): 23 | import yaml 24 | with open(config_path,"r") as f: 25 | config = yaml.safe_load(f) 26 | model_path = config["model_path"]+node_id+"/" 27 | interpreter,w2id = get_interpreter(model_path) 28 | return bot_thought(interpreter,w2id,config["model_hp"]["max_len"]) -------------------------------------------------------------------------------- /util_funcs/process_and_save_data_for_training.py: -------------------------------------------------------------------------------- 1 | def read_data(file): 2 | import json 3 | import re 4 | def filter_words(s): 5 | return re.sub("[^\w+?!?!:;:;.。,,@&]","",s) 6 | data = [] 7 | with open(file,"r") as f: 8 | for l in f: 9 | tmp = json.loads(l) 10 | data += re.split("(\w+[?!?!:;:;.。]+)",filter_words(tmp["title"]))[1::2] 11 | data += re.split("(\w+[?!?!:;:;.。]+)",filter_words(tmp["desc"]))[1::2] 12 | data += re.split("(\w+[?!?!:;:;.。]+)",filter_words(tmp["answer"]))[1::2] 13 | return data 14 | def process_data_for_training(datafiles,char2id_loc,id2char_loc,train_data_loc): 15 | import json 16 | import re 17 | data = [] 18 | for file in datafiles: 19 | data+=read_data(file) 20 | chars = set() 21 | for l in data: 22 | chars|=set(l) 23 | char2id = {c:i for i,c in enumerate(chars)} 24 | id2char = {i:c for i,c in enumerate(chars)} 25 | first = id2char[0] 26 | id2char[0] = " " 27 | char2id[" "] = 0 28 | 
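# id 0 is reserved for the padding character " "; the character that originally held id 0
# is moved to a fresh id at the end of the vocabulary, and a literal "mask" token is appended after it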
char2id[first]=len(id2char) 29 | id2char[char2id[first]]=first 30 | char2id["mask"]=len(id2char) 31 | id2char[char2id["mask"]]="mask" 32 | with open(char2id_loc,"w") as f: 33 | json.dump(char2id,f,ensure_ascii=False) 34 | with open(id2char_loc,"w") as f: 35 | json.dump(id2char,f,ensure_ascii=False) 36 | idified = [] 37 | for l in data: 38 | tmp = [] 39 | for w in list(l): 40 | tmp.append(str(char2id.get(w))) 41 | idified.append(",".join(tmp)) 42 | with open(train_data_loc,"w") as f: 43 | f.writelines("\n".join(idified)) 44 | print("data are ready!") -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | 4 | from util_funcs.prepare_for_trianing import get_dicts,data_batcher_producer 5 | from modeling.build_electra_graph import build_graph 6 | from util_funcs.trainer import train_tf_model 7 | parser = argparse.ArgumentParser(description='pass in some arguments') 8 | parser.add_argument("-e","--epochs",type=int) 9 | parser.add_argument("-b","--batch_size",type=int) 10 | parser.add_argument("-n","--tf_model_name",type=str,help="model name after training, includes folder") 11 | 12 | args=parser.parse_args() 13 | 14 | with open("configure.yml") as f: 15 | configure = yaml.safe_load(f) 16 | 17 | char2id_loc = "data/processed_data/char2id.json" 18 | char2id,_=get_dicts(configure["char2id_loc"],configure["id2char_loc"]) 19 | vocab_size = len(char2id) 20 | embedding_size = configure["embedding_size"] 21 | generator_size = configure["generator_size"] 22 | gn_blocks = configure["gn_blocks"] 23 | gseq_length = dseq_length = configure["seq_length"] 24 | gn_heads = configure["gn_heads"] 25 | gff_filter_size = configure["gff_filter_size"] 26 | g_dev = configure["g_dev"] 27 | dn_blocks = configure["dn_blocks"] 28 | dn_heads = configure["dn_heads"] 29 | dff_filter_size = configure["dff_filter_size"] 30 | d_dev = configure["d_dev"] 31 | d_factor = configure["d_factor"] 32 | learning_rate = float(configure["learning_rate"]) 33 | tf_model_name = args.tf_model_name 34 | epochs = args.epochs 35 | train_loc = configure["train_data_loc"] 36 | max_len = configure["max_len"] 37 | batch_size = args.batch_size 38 | mask_index = char2id["mask"] 39 | 40 | if __name__=="__main__": 41 | data_batcher = data_batcher_producer(train_loc,max_len,batch_size,mask_index) 42 | graph,access_dict = build_graph(vocab_size,embedding_size,generator_size, 43 | gn_blocks,gseq_length,gn_heads,gff_filter_size,g_dev, 44 | dn_blocks,dseq_length,dn_heads,dff_filter_size,d_dev,mask_index, 45 | d_factor,learning_rate) 46 | train_tf_model(graph,access_dict,data_batcher,tf_model_name,epochs) -------------------------------------------------------------------------------- /algorithms/util_funcs/model_handler.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | def save_models(sess,tf_model,w2id): 3 | import json 4 | import os 5 | from tensorflow.compat.v1.graph_util import convert_variables_to_constants 6 | graph = convert_variables_to_constants(sess,sess.graph_def,["input","prediction"]) 7 | try: 8 | os.mkdir("{}".format(tf_model)) 9 | except: 10 | pass 11 | logdir = "{}".format(tf_model) 12 | tf.io.write_graph(graph,logdir,"tf_model.pb",as_text=False) 13 | with open("{}/word2id.json".format(tf_model),'w') as f: 14 | json.dump(w2id,f,ensure_ascii=False) 15 | print("{}".format(tf_model)) 16 | 17 | def get_interpreter(model_path): 
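# loads the frozen tf_model.pb and word2id.json written by save_models and returns (interpret_fn, word2id);
# interpret_fn rebuilds the graph and runs the "prediction" node on each call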
18 | import json 19 | with open(model_path+"/word2id.json","r") as f: 20 | w2id = json.load(f) 21 | def interpret(inputs): 22 | with tf.Graph().as_default(): 23 | g_def = tf.GraphDef() 24 | with open(model_path+"/tf_model.pb","rb") as f: 25 | g_def.ParseFromString(f.read()) 26 | tf.import_graph_def(g_def,name="") 27 | with tf.Session() as sess: 28 | tf.global_variables_initializer().run() 29 | input_node = sess.graph.get_tensor_by_name("input:0") 30 | prediction_node = sess.graph.get_tensor_by_name("prediction:0") 31 | prediction = prediction_node.eval(feed_dict={input_node:inputs}) 32 | return prediction 33 | return interpret,w2id 34 | def bot_thought(interpreter,w2id,max_len=30): 35 | key_type = (str,int)[type(list(w2id.keys())[0]).__name__=="int"] 36 | def bot_say(inputs): 37 | sentences = [] 38 | for w in inputs: 39 | if w in w2id.keys(): 40 | sentences.append(w2id[key_type(w)]) 41 | else: 42 | sentences.append(0) 43 | #seq_len = [len(sentences)] 44 | inputs = tf.keras.preprocessing.sequence.pad_sequences([sentences],maxlen=max_len,padding="post") 45 | 46 | ans = interpreter(inputs) 47 | print(inputs) 48 | #ans = interpreter({"inputs":inputs})["predictions"] 49 | print(ans) 50 | return bot_say -------------------------------------------------------------------------------- /util_funcs/trainer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from util_funcs.after_training import save_models 5 | def train_tf_model(graph,access_dict,data_batcher,tf_model_name,epochs=5): 6 | with tf.Session(graph=graph) as sess: 7 | 8 | access_dict["init"].run() 9 | losses,g_losses,d_losses = [],[],[] 10 | #""" 11 | for e in range(epochs): 12 | db = data_batcher() 13 | tmp_losses,tmp_g_losses,tmp_d_losses,mask_counts = [],[],[],[] 14 | mask_count = 0 15 | end_flag = False 16 | i = 0 17 | while not end_flag: 18 | end_flag,padded,position_indeces,target_word_indeces,mask_values = db() 19 | if (len(position_indeces[0])==0)|(len(target_word_indeces[0])==0): 20 | continue 21 | feed = {"input":padded,"position_indeces":position_indeces, 22 | "target_word_indeces":target_word_indeces,"mask_values":mask_values, 23 | "training":True} 24 | loss,g_loss,d_loss,_ = sess.run([ 25 | access_dict["losses"], 26 | access_dict["g_loss"],access_dict["d_loss"],access_dict["training_op"]], 27 | feed_dict = {access_dict[k]:feed[k] for k in feed.keys()}) 28 | 29 | tmp_losses.append(float(loss.mean())) 30 | tmp_g_losses.append(float(g_loss.mean())) 31 | tmp_d_losses.append(float(d_loss.mean())) 32 | mask_counts.append(len(mask_values[0])) 33 | if i%52==0: 34 | losses.append(float(np.mean(tmp_losses))) 35 | g_losses.append(float(np.mean(tmp_g_losses))) 36 | d_losses.append(float(np.mean(tmp_d_losses))) 37 | mask_count = np.mean(mask_counts) 38 | tmp_losses,tmp_g_losses,tmp_d_losses,mask_counts = [],[],[],[] 39 | print("epoch {}: loss is {:.2f} g_loss is {:.2f} d_loss is {:.2f} mask count is {:.2f}" 40 | .format(e,losses[-1],g_losses[-1],d_losses[-1],mask_count)) 41 | i += 1 42 | db(True); 43 | # save_models(sess,access_dict["input"], 44 | # access_dict["prediction"],tf_model_name,w2id) 45 | save_models(sess,tf_model_name+"_epoch_{}".format(e),access_dict) 46 | #file_writer = tf.summary.FileWriter('logs', sess.graph) 47 | # pd.Series(losses).clip(None,10).plot(); 48 | # plt.show(); 49 | # pd.Series(g_losses).clip(None,5).plot(); 50 | # plt.show(); 51 | # pd.Series(d_losses).clip(None,2).plot(); 52 | # plt.show(); 
-------------------------------------------------------------------------------- /algorithms/util_funcs/data_preprocessing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import tensorflow as tf 4 | def get_all_han_words(l): 5 | results = [] 6 | for m in re.finditer(r"[\u4e00-\u9fa5\ ]+",l): 7 | results.append(m.span()) 8 | return {i:("han",i,s) for i,s in results} 9 | def get_all_other_words(l): 10 | results = [] 11 | for m in re.finditer(r"[A-Za-z0-9]+",l): 12 | results.append(m.span()) 13 | return {i:("other",i,s) for i,s in results} 14 | def get_sep_sentence(l): 15 | diction = get_all_han_words(l) 16 | diction.update(get_all_other_words(l)) 17 | results = [] 18 | for k in sorted(list(diction.keys())): 19 | flag,start,end = diction[k] 20 | if flag=="han": 21 | results+=list(l[start:end]) 22 | else: 23 | results.append(l[start:end]) 24 | return results 25 | def get_dicts(data): 26 | all_chars = set() 27 | for l in data: 28 | all_chars|=set(get_sep_sentence(l[0])) 29 | all_chars = list(all_chars) 30 | char2id = {c:i for i,c in enumerate(all_chars)} 31 | id2char = {i:c for i,c in enumerate(all_chars)} 32 | empty_pos = char2id[" "] 33 | char2id[id2char[0]]=empty_pos 34 | id2char[empty_pos]=id2char[0] 35 | id2char[0]=" " 36 | char2id[" "]=0 37 | return char2id,id2char 38 | def idify_sentences(sentence,char2id): 39 | trans = [] 40 | for s in get_sep_sentence(sentence): 41 | trans.append(char2id[s]) 42 | return trans 43 | def get_token_labels(sentence,keys): 44 | labels = np.ones(len(sentence)) 45 | for k in keys: 46 | search = re.search(k,sentence) 47 | if type(search).__name__!="NoneType": 48 | span = search.span() 49 | labels[span[0]:span[1]]=0.0 50 | return labels.tolist() 51 | def get_train_data(data,char2id): 52 | train_data = [] 53 | for l in data: 54 | seq = idify_sentences(l[0],char2id) 55 | token_labels = get_token_labels(l[0],l[2]) 56 | train_data.append((seq,token_labels,l[1])) 57 | return train_data 58 | def data_batcher(data,max_len,batch_size=32): 59 | for i in range(0,len(data),batch_size): 60 | cut = data[i:i+batch_size] 61 | seqs = [] 62 | #seq_lens = [] 63 | token_labels = [] 64 | targets = [] 65 | for s in cut: 66 | seqs.append(s[0]) 67 | #seq_lens.append(s[1]) 68 | token_labels.append(s[1]) 69 | targets.append(s[2]) 70 | seqs = tf.keras.preprocessing.sequence.pad_sequences(seqs,maxlen=max_len,padding="post") 71 | token_labels = tf.keras.preprocessing.sequence.pad_sequences(token_labels, 72 | maxlen=max_len,padding="post") 73 | yield seqs,token_labels,targets -------------------------------------------------------------------------------- /util_funcs/after_training.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import shutil 3 | def save_models(sess,tf_model,access_dict): 4 | import json 5 | import os 6 | 7 | # try: 8 | # os.mkdir("{}".format(tf_model)) 9 | # except: 10 | # pass 11 | logdir = "{}".format(tf_model) 12 | #from tensorflow.compat.v1.graph_util import convert_variables_to_constants 13 | #graph = convert_variables_to_constants(sess,sess.graph_def,["input","prediction"]) 14 | # tf.io.write_graph(graph,logdir,"tf_model.pb",as_text=False) 15 | try: 16 | tf.saved_model.simple_save(sess, 17 | logdir, 18 | inputs={"input": access_dict["input"], 19 | "training":access_dict["training"], 20 | "position_indeces":access_dict["position_indeces"], 21 | "target_word_indeces":access_dict["target_word_indeces"], 22 | 
"mask_values":access_dict["mask_values"], 23 | "peep":access_dict["peep"]}, 24 | outputs={"outputs": access_dict["outputs"]}) 25 | except: 26 | shutil.rmtree(logdir) 27 | tf.saved_model.simple_save(sess, 28 | logdir, 29 | inputs={"input": access_dict["input"], 30 | "training":access_dict["training"], 31 | "position_indeces":access_dict["position_indeces"], 32 | "target_word_indeces":access_dict["target_word_indeces"], 33 | "mask_values":access_dict["mask_values"], 34 | "peep":access_dict["peep"]}, 35 | outputs={"outputs": access_dict["outputs"]}) 36 | print("{}".format(tf_model)) 37 | def get_interpreter(model_path): 38 | parser = tf.contrib.predictor.from_saved_model(model_path) 39 | def interpret(inputs): 40 | return parser({"input":inputs,"training":False, 41 | "position_indeces":[[[0,0]]], 42 | "target_word_indeces":[[[0,0,0]]], 43 | "mask_values":[[0]]})["outputs"] 44 | return interpret 45 | def bot_thought(interpreter,tokenizer,max_len=400): 46 | def bot_say(inputs): 47 | if type(inputs).__name__=="list": 48 | sentences = [] 49 | tokenized = [tokenizer(s) for s in inputs] 50 | for t in tokenized: 51 | pad_len = max_len-len(t) 52 | sentences.append((t+[0]*pad_len)[:max_len]) 53 | 54 | ans = interpreter(sentences) 55 | 56 | #ans = interpreter({"inputs":inputs})["predictions"] 57 | #print(ans) 58 | else: 59 | tokenized = tokenizer(inputs) 60 | pad_len = max_len-len(tokenized) 61 | tokenized = (tokenized+[0]*pad_len)[:max_len] 62 | ans = interpreter([tokenized]) 63 | return ans 64 | return bot_say -------------------------------------------------------------------------------- /algorithms/encoder/transformer/transformer_block.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from algorithms.encoder.transformer.ff_and_ln_layers import layer_norm,feed_forward 4 | 5 | def transpose_and_combine(projected_v,sizes): 6 | batch_size,seq_len,hidden_size,n_heads=sizes 7 | value = tf.reshape(projected_v,[batch_size,seq_len,n_heads,hidden_size]) 8 | transposed = tf.transpose(value,[0,2,1,3]) 9 | return tf.reshape(transposed,[batch_size*n_heads,seq_len,hidden_size]) 10 | def untranspose_and_combine(scaled_values,sizes): 11 | batch_size,seq_len,hidden_size,n_heads=sizes 12 | uncombined = tf.reshape(scaled_values,[batch_size,n_heads,seq_len,hidden_size]) 13 | untransposed = tf.transpose(uncombined,[0,2,1,3]) 14 | return tf.reshape(untransposed,[batch_size,seq_len,n_heads*hidden_size]) 15 | def get_qkv(v,seq_length,hidden_size,n_heads): 16 | query_projector = tf.keras.layers.Dense(hidden_size*n_heads,use_bias=False,name="query_params") 17 | key_projector = tf.keras.layers.Dense(hidden_size*n_heads,use_bias=False,name="key_params") 18 | value_projector = tf.keras.layers.Dense(hidden_size*n_heads,use_bias=False,name="value_params") 19 | #print(v.shape) 20 | #batch_size,seq_len = tf.shape(v)[1],tf.shape(v)[2] 21 | batch_size = tf.shape(v)[0] 22 | sizes = (batch_size,seq_length,hidden_size,n_heads) 23 | query = transpose_and_combine(query_projector(v),sizes) 24 | key = transpose_and_combine(key_projector(v),sizes) 25 | value = transpose_and_combine(value_projector(v),sizes) 26 | return query,key,value,sizes 27 | def scaled_dot_product(q,k,v,sizes): 28 | _,_,_,hidden_size = sizes 29 | weights = tf.matmul(q,k,transpose_b=True) 30 | #print(weights) 31 | scaled_weights = tf.nn.softmax(weights/tf.sqrt(np.float32(hidden_size)),axis=-1) 32 | #print(v) 33 | scaled_values = tf.matmul(scaled_weights,v) 34 | #print(scaled_values) 
35 | attented = untranspose_and_combine(scaled_values,sizes)
36 | return attented
37 | def self_attention(seq_length,hidden_size,n_heads):
38 | def attention(v):
39 | q,k,v,sizes = get_qkv(v,seq_length,hidden_size,n_heads)
40 | attented = scaled_dot_product(q,k,v,sizes)
41 | #print(attented)
42 | return tf.keras.layers.Dense(hidden_size)(attented)
43 | return attention
44 | def transformer_block(seq_length,hidden_size,n_heads,ff_filter_size,name,ff_dropout):
45 | attention_block = self_attention(seq_length,hidden_size,n_heads)
46 | ln_after_attention = layer_norm(hidden_size,name)
47 | fead_forward_layer = feed_forward(hidden_size,ff_filter_size,ff_dropout)
48 | ln_after_ff = layer_norm(hidden_size,name)
49 | def transformer(inputs):
50 | after_att = attention_block(inputs)
51 | after_att_ln = ln_after_attention(after_att+inputs)
52 | after_ff = fead_forward_layer(after_att_ln)
53 | outputs = ln_after_ff(after_ff+after_att_ln)
54 | return outputs
55 | return transformer
56 | # def transformer_block(seq_length,input_size,hidden_size,n_heads,ff_filter_size,name,ff_dropout):
57 | # attention_block = self_attention(seq_length,hidden_size,n_heads)
58 | # ln_after_attention = layer_norm(hidden_size,name)
59 | # fead_forward_layer = feed_forward(input_size,hidden_size,ff_filter_size,ff_dropout)
60 | # ln_after_ff = layer_norm(input_size,name)
61 | # def transformer(inputs):
62 | # after_att = attention_block(inputs)
63 | # after_att_ln = ln_after_attention(after_att+inputs)
64 | # after_ff = fead_forward_layer(after_att_ln)
65 | # outputs = ln_after_ff(after_ff+after_att_ln)
66 | # return outputs
67 | # return transformer
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Electra_with_tensorflow
2 | This is an implementation of ELECTRA, following the paper *ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators*.
3 | 
4 | # Things to know before you read the project:
5 | 
6 | 1. This is a very raw project, too rough to use in production. It is not well organized or tested, so it is not a good fit for research either. It may just provide some ideas when you want to implement ELECTRA.
7 | 
8 | 2. There are some differences between my implementation and the original ELECTRA paper:
9 | 
10 | 2.1 I don't have any powerful computing resources, so I haven't used matrix multiplication for masking. Simply put, for each batch I draw a random sample size, so every sequence in a batch has the same number of tokens masked (a minimal sketch of this scheme is shown right after this list).
11 | 
12 | 3. As you can probably tell, this project is not polished. There may be some errors that I haven't found. I haven't used the datasets that were used in the paper; I used a Chinese dataset, so there is no reference for how well the model should work on it. All in all, it is not a well-experimented project, and I suggest readers not dive too deeply into it if you want to build a production-ready application.
13 | 
14 | 4. My TensorFlow style is not standard, as you may notice. I use a lot of functional programming. I do this because I didn't read the TensorFlow user guide thoroughly, and also because I feel comfortable writing functions. Data types more complex than the primitive types are mutable, and TensorFlow layers feel more complex than dictionaries, so I just write functions with no tests at all. That's probably why errors may occur when running my project.
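A minimal sketch of this masking scheme, mirroring `sample_positions` in `util_funcs/prepare_for_trianing.py` (the names here are illustrative, and the batch is assumed to contain non-empty lists of token ids):

```python
import numpy as np

def sample_mask_positions(batch, max_len, mask_index):
    # one mask count is drawn per batch, bounded by the shortest (truncated) sequence,
    # so every sequence in the batch gets the same number of masked tokens
    shortest = min(min(len(seq) for seq in batch), max_len)
    sample_size = np.random.choice(np.arange(shortest))
    position_indeces, mask_values = [], []
    for i, seq in enumerate(batch):
        seq = seq[:max_len]
        # pick sample_size distinct positions in this sequence to corrupt
        picks = np.random.choice(np.arange(len(seq)), size=sample_size, replace=False)
        position_indeces.append([[i, int(j)] for j in picks])
        mask_values.append([mask_index] * sample_size)
    return position_indeces, mask_values
```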
15 | 
16 | # how to run this project
17 | ## the environment
18 | I use the official TensorFlow image for version 1.14; with Docker, just
19 | > docker pull tensorflow/tensorflow:1.14.0-gpu-py3-jupyter
20 | 
21 | ## the data
22 | The entry point of the program is the data. I don't want to be cruel, but you really have to write your own functions to format your data into a **txt file with one tokenized sentence (comma-separated token ids) per line**. That's the train.txt format (an example of the expected format and of the commands to run is given at the end of this file).
23 | 
24 | ## the configure.yml
25 | datafiles:
26 | 
27 | - "data/baike_qa_valid.json" # my raw validation data location. Remember, you should use your own data formatter functions.
28 | 
29 | - "data/baike_qa_train.json" # my raw train data location.
30 | 
31 | char2id_loc: "data/processed_data/char2id.json" # this is the char2id file after formatting (data processing); it could be a word2id file, depending on how you tokenize your raw data.
32 | 
33 | id2char_loc: "data/processed_data/id2char.json" # this is the id2char file after formatting.
34 | 
35 | train_data_loc: "data/processed_data/train.txt" # this is the formatted train data. From here you can tell how rough this project is and how lazy I am, as I don't even produce validation data.
36 | 
37 | embedding_size: 100 # this is the embedding size.
38 | 
39 | generator_size: 50 # this is the generator hidden size, which is also the discriminator hidden size.
40 | 
41 | gn_blocks: 1 # this is the number of generator transformer blocks.
42 | 
43 | seq_length: 512 # this is the max sequence length.
44 | 
45 | gn_heads: 4 # this is the generator head count.
46 | 
47 | gff_filter_size: 150 # this is the generator feed-forward filter size.
48 | 
49 | g_dev: "/CPU:0" # this is the device I use; I once had a GPU, but later I lost it.
50 | 
51 | dn_blocks: 3 # this is the number of discriminator transformer blocks.
52 | 
53 | dn_heads: 6 # this is the discriminator head count.
54 | 
55 | dff_filter_size: 300 # this is the discriminator feed-forward filter size.
56 | 
57 | d_dev: "/CPU:0" # this is the same GPU-loss story.
58 | 
59 | d_factor: 50 # this is the factor used to amplify the discriminator loss.
60 | 
61 | learning_rate: 1e-3 # this is the learning rate.
62 | 
63 | max_len: 512 # this is the max sequence length again. This duplication is a result of my laziness, not a well-thought-out decision.
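## a quick example

After formatting, every line of `train.txt` is one tokenized sentence written as comma-separated token ids, e.g. `23,7,1045,88,6,301` (the ids here are made up). With `configure.yml` filled in, a typical run is just the two scripts below; the model path passed to `-n` is only an illustrative choice, while `-e` (epochs), `-b` (batch size) and `-n` (model name) are the flags defined in `train.py`:

> python get_train_data.py

> python train.py -e 5 -b 16 -n models/electra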
64 | -------------------------------------------------------------------------------- /util_funcs/prepare_for_trianing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Reader(object): 5 | def __init__(self,loc,max_len=512,batch_size=16): 6 | self.f = open(loc) 7 | self.loc = loc 8 | self.batch_size = batch_size 9 | self.result_buffer = self.f.readlines(max_len*self.batch_size) 10 | self.buffer_len = len(self.result_buffer) 11 | self.max_len = max_len 12 | def read(self): 13 | end_flag = False 14 | if self.buffer_len > self.batch_size: 15 | result = self.result_buffer[:self.batch_size] 16 | self.result_buffer = np.delete(self.result_buffer,np.arange(self.batch_size)).tolist() 17 | self.buffer_len = len(self.result_buffer) 18 | else: 19 | self.result_buffer += self.f.readlines(self.max_len*self.batch_size) 20 | if len(self.result_buffer)==self.buffer_len:# 到头了没有读到新数据 21 | result = self.result_buffer 22 | end_flag = True 23 | self.f.close() 24 | else: 25 | for _ in range(self.batch_size): 26 | self.result_buffer += self.f.readlines(self.max_len*self.batch_size) 27 | self.buffer_len = len(self.result_buffer) 28 | if self.buffer_len > self.batch_size: 29 | result = self.result_buffer[:self.batch_size] 30 | self.result_buffer = np.delete(self.result_buffer,np.arange(self.batch_size)).tolist() 31 | break 32 | if self.buffer_len < self.batch_size: 33 | result = self.result_buffer 34 | self.buffer_len = 0 35 | self.result_buffer = [] 36 | return [np.array(l.strip("\n").split(",")).astype("int").tolist() for l in result],end_flag 37 | def close(self): 38 | self.f.close() 39 | 40 | def get_dicts(char2id_loc,id2char_loc): 41 | import json 42 | with open(char2id_loc,"r") as f: 43 | char2id = json.load(f) 44 | with open(id2char_loc,"r") as f: 45 | id2char = json.load(f) 46 | return char2id,id2char 47 | def tokenizer_producer(dicts,already=True): 48 | if already: 49 | def tokenizer(text): 50 | result = [int(l) for l in text.strip("\n").split(",")] 51 | return result 52 | return tokenizer 53 | def tokenizer(text): 54 | result = [dicts.get(t,0) for t in text] 55 | return result 56 | return tokenizer 57 | def sample_positions(tokenized,max_len,mask_index): 58 | padded,position_indeces,target_word_indeces,mask_values = [],[],[],[] 59 | sample_size = np.random.choice(np.arange(min([len(s) for s in tokenized]))) 60 | for i,d in enumerate(tokenized): 61 | d = d[:max_len] 62 | row_indeces,row_targets = [],[] 63 | randomeds = np.random.choice(np.arange(len(d)),size=sample_size,replace=False) 64 | for j,randomed in enumerate(randomeds): 65 | row_indeces.append([i,randomed]) 66 | row_targets.append([i,j,d[randomed]]) 67 | position_indeces.append(row_indeces) 68 | target_word_indeces.append(row_targets) 69 | pad_len = max_len-len(d) 70 | padded.append((d+[0]*pad_len)[:max_len]) 71 | mask_values.append([mask_index]*sample_size) 72 | return padded,position_indeces,target_word_indeces,mask_values 73 | def data_batcher_producer(train_loc,max_len,batch_size,mask_index): 74 | def producer(): 75 | tx = Reader(train_loc,max_len,batch_size) 76 | def data_batcher(close=False): 77 | 78 | data_list,end_flag = tx.read() 79 | padded,position_indeces,target_word_indeces,mask_values = sample_positions( 80 | data_list,max_len,mask_index) 81 | if close: 82 | tx.close() 83 | return end_flag,padded,position_indeces,target_word_indeces,mask_values 84 | return data_batcher 85 | return producer 
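# Typical usage, mirroring train.py and util_funcs/trainer.py:
#   data_batcher = data_batcher_producer(train_loc, max_len, batch_size, mask_index)
#   db = data_batcher()   # opens a Reader over the training file and returns a batch closure
#   end_flag, padded, position_indeces, target_word_indeces, mask_values = db()
#   db(True)              # passing True on the last call closes the underlying file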
-------------------------------------------------------------------------------- /algorithms/util_funcs/attribute_model_builder.py: -------------------------------------------------------------------------------- 1 | from ..encoder import transformer_encoder_producer 2 | from .data_preprocessing import data_batcher 3 | from .model_handler import save_models 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | import pandas as pd 8 | def build_graph(max_len,vocab_size, embedding_size,n_blocks,n_heads,ff_filter_size, 9 | token_label_size,intent_size,learning_rate): 10 | g = tf.Graph() 11 | tf.reset_default_graph() 12 | with g.as_default(): 13 | access_dict = {} 14 | access_dict["input"] = tf.placeholder(tf.int32,shape=[None,max_len],name="input") 15 | access_dict["target"] = tf.placeholder(tf.int32,shape=[None],name="target") 16 | #access_dict["seq_length"] = tf.placeholder(tf.int32,shape=[None],name="seq_length") 17 | access_dict["token_labels"] = tf.placeholder(tf.float32,shape=[None,max_len],name="token_labels") 18 | embeddings = tf.keras.layers.Embedding(vocab_size, 19 | embedding_size)(access_dict["input"]) 20 | transformer_encoder = transformer_encoder_producer( 21 | n_blocks,max_len,embedding_size,n_heads,ff_filter_size 22 | ) 23 | encoded = transformer_encoder(embeddings) 24 | token_layers = tf.keras.layers.Dense(token_label_size,name="token_layers",activation="sigmoid") 25 | token_layers.trainable=False 26 | token_logits = tf.concat([tf.expand_dims(token_layers(encoded[:,i,:]),axis=1) 27 | for i in range(max_len)],axis=1) 28 | print("token_logits",token_logits) 29 | print("encoded",encoded) 30 | #print(1/0) 31 | #logits_agg = tf.keras.layers.Dense(intent_size,name="logits")(lstms[:,-1,:]) 32 | token_labels_reshaped = tf.expand_dims(access_dict["token_labels"],axis=2) 33 | print("token_labels_reshaped",token_labels_reshaped.shape) 34 | token_loss = tf.reduce_mean(tf.keras.backend.binary_crossentropy( 35 | target=token_labels_reshaped,output=token_logits 36 | )) 37 | gated_encode = encoded*token_logits 38 | print("gated_encode",gated_encode) 39 | flatten = tf.keras.layers.Flatten()(gated_encode) 40 | logits_agg = tf.keras.layers.Dense(intent_size,name="logits_agg")(flatten) 41 | target_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits( 42 | labels=access_dict["target"],logits=logits_agg 43 | )) 44 | access_dict["loss"] = token_loss+target_loss 45 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) 46 | access_dict["training_op"] = optimizer.minimize(access_dict["loss"]) 47 | correct = tf.nn.in_top_k(logits_agg,access_dict["target"],1) 48 | access_dict["accuracy"] = tf.reduce_mean(tf.cast(correct,tf.float16)) 49 | access_dict["prediction"] = tf.math.argmax(logits_agg,dimension=-1,name="prediction") 50 | access_dict["init"] = tf.global_variables_initializer() 51 | access_dict["max_len"] = max_len 52 | print("embeddings",embeddings) 53 | print("encoded",encoded) 54 | print("flatten",flatten) 55 | print("logits_agg",logits_agg) 56 | return g, access_dict 57 | def train_tf_model(graph,access_dict,ided_data,w2id,tf_model_name,batch_size,epochs=100): 58 | with tf.Session(graph=graph) as sess: 59 | 60 | access_dict["init"].run() 61 | losses = [] 62 | #""" 63 | for e in range(epochs): 64 | db = data_batcher(ided_data,access_dict["max_len"],batch_size=batch_size) 65 | tmp_losses = [] 66 | for inputs,token_labels,target in db: 67 | #loss = xentropy() 68 | feed = {"input":inputs,"target":target, 69 | "token_labels":token_labels} 70 | loss,_ = 
sess.run([access_dict["loss"],access_dict["training_op"]], 71 | feed_dict = {access_dict[k]:feed[k] for k in feed.keys()}) 72 | tmp_losses.append(float(loss.mean())) 73 | losses.append(float(np.mean(tmp_losses))) 74 | tmp_losses = [] 75 | acc = access_dict["accuracy"].eval(feed_dict={access_dict[k]:feed[k] for k in feed.keys()}) 76 | print("epoch {}: train_accuracy is {:.2f};".format(e,acc)) 77 | # save_models(sess,access_dict["input"], 78 | # access_dict["prediction"],tf_model_name,w2id) 79 | save_models(sess,tf_model_name,w2id) 80 | #file_writer = tf.summary.FileWriter('logs', sess.graph) 81 | pd.Series(losses).plot(); -------------------------------------------------------------------------------- /algorithms/encoder/transformer/encoder_layer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from algorithms.encoder.transformer.transformer_block import transformer_block 3 | 4 | def get_position_encoding(seq_length, hidden_size, min_timescale=1.0, max_timescale=1.0e4,dtype=tf.float32): 5 | """Return positional encoding. 6 | Calculates the position encoding as a mix of sine and cosine functions with 7 | geometrically increasing wavelengths. 8 | Defined and formulized in Attention is All You Need, section 3.5. 9 | Args: 10 | length: Sequence length. 11 | hidden_size: Size of the 12 | min_timescale: Minimum scale that will be applied at each position 13 | max_timescale: Maximum scale that will be applied at each position 14 | Returns: 15 | Tensor with shape [length, hidden_size] 16 | """ 17 | # We compute the positional encoding in float32 even if the model uses 18 | # float16, as many of the ops used, like log and exp, are numerically unstable 19 | # in float16. 20 | position = tf.cast(tf.range(seq_length), dtype) 21 | num_timescales = hidden_size // 2 22 | log_timescale_increment = ( 23 | tf.math.log(tf.cast(max_timescale,dtype) / tf.cast(min_timescale,dtype)) / 24 | (tf.cast(num_timescales, dtype) - 1)) 25 | inv_timescales = min_timescale * tf.exp( 26 | tf.cast(tf.range(num_timescales), dtype) * -log_timescale_increment) 27 | scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0) 28 | signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1) 29 | return signal 30 | 31 | def transformer_encoder_producer(n_blocks,seq_length,hidden_size,n_heads,ff_filter_size,name="trans",ff_dropout=None): 32 | """ 33 | arguments: 34 | n_blocks: number of transformer blocks 35 | seq_length: sequence_length, it must be specified, because it would be calculated every time when the tensor 36 | went through the network. 37 | hidden_size: embedding_size or hidden_size 38 | n_heads: number of attention heads 39 | ff_filter_size: number of fead_forward_filters. it's just the linear projector with out dim as 40 | ff_filter_size. 41 | ff_dropout: fead forward filter layer dropout rate, default is None. when None, no dropout. 
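returns: an encode(inputs) closure that adds the sinusoidal positional encoding to inputs and then applies the n_blocks transformer blocks in sequence.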
42 | """ 43 | positional_encoding = get_position_encoding(seq_length, hidden_size) 44 | transformer_blocks = [ 45 | transformer_block(seq_length,hidden_size,n_heads,ff_filter_size,name,ff_dropout) for _ in range(n_blocks) 46 | ] 47 | def encode(inputs): 48 | outputs = inputs + positional_encoding 49 | for tb in transformer_blocks: 50 | outputs = tb(outputs) 51 | return outputs 52 | return encode 53 | def transformer_encoder(n_blocks,seq_length,hidden_size,n_heads,ff_filter_size,name="trans",ff_dropout=None): 54 | """ 55 | arguments: 56 | n_blocks: number of transformer blocks 57 | seq_length: sequence_length, it must be specified, because it would be calculated every time when the tensor 58 | went through the network. 59 | hidden_size: embedding_size or hidden_size 60 | n_heads: number of attention heads 61 | ff_filter_size: number of fead_forward_filters. it's just the linear projector with out dim as 62 | ff_filter_size. 63 | ff_dropout: fead forward filter layer dropout rate, default is None. when None, no dropout. 64 | """ 65 | positional_encoding = get_position_encoding(seq_length, hidden_size) 66 | transformer_blocks = [ 67 | transformer_block(seq_length,hidden_size,n_heads,ff_filter_size,name,ff_dropout) for _ in range(n_blocks) 68 | ] 69 | def encode(inputs): 70 | outputs = inputs + positional_encoding 71 | for tb in transformer_blocks: 72 | outputs = tb(outputs) 73 | return outputs 74 | return encode 75 | def transformer_encoder_no_pe_producer(n_blocks,seq_length,hidden_size,n_heads,ff_filter_size,name="trans",ff_dropout=None): 76 | """ 77 | arguments: 78 | n_blocks: number of transformer blocks 79 | seq_length: sequence_length, it must be specified, because it would be calculated every time when the tensor 80 | went through the network. 81 | hidden_size: embedding_size or hidden_size 82 | n_heads: number of attention heads 83 | ff_filter_size: number of fead_forward_filters. it's just the linear projector with out dim as 84 | ff_filter_size. 85 | ff_dropout: fead forward filter layer dropout rate, default is None. when None, no dropout. 
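returns: an encode(inputs) closure that applies the n_blocks transformer blocks in sequence; no positional encoding is added in this variant.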
86 | """ 87 | transformer_blocks = [ 88 | transformer_block(seq_length,hidden_size,n_heads,ff_filter_size,name,ff_dropout) for _ in range(n_blocks) 89 | ] 90 | def encode(inputs): 91 | outputs = inputs 92 | for tb in transformer_blocks: 93 | outputs = tb(outputs) 94 | return outputs 95 | return encode -------------------------------------------------------------------------------- /modeling/build_electra_graph.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from algorithms.encoder import transformer_encoder 4 | def generator_encoder_producer(embedding_layer,projector,g_transformer): 5 | def encode(inputs): 6 | projected = projector(embedding_layer.embeddings) 7 | embedded = tf.nn.embedding_lookup(projected,inputs) 8 | encoded = g_transformer(embedded) 9 | return projected,encoded 10 | return encode 11 | def discriminitor_encoder_producer(embedding_layer,d_transformer): 12 | def encode(inputs): 13 | embeded = embedding_layer(inputs) 14 | encoded = d_transformer(embeded) 15 | return encoded 16 | return encode 17 | 18 | 19 | 20 | def build_graph(vocab_size,embedding_size,generator_size, 21 | gn_blocks,gseq_length,gn_heads,gff_filter_size,g_dev, 22 | dn_blocks,dseq_length,dn_heads,dff_filter_size,d_dev,mask_index, 23 | d_factor,learning_rate): 24 | g = tf.Graph() 25 | tf.reset_default_graph() 26 | with g.as_default(): 27 | access_dict = {} 28 | access_dict["input"] = tf.placeholder(tf.int32,shape=[None,gseq_length],name="input") 29 | access_dict["training"] = tf.placeholder(tf.bool,shape=[],name="training") 30 | access_dict["peep"] = tf.placeholder(tf.bool,shape=[],name="peep") 31 | access_dict["mask_values"] = tf.placeholder(tf.int32,shape=[None,None],name="mask_values") 32 | access_dict["position_indeces"] = tf.placeholder(tf.int32,shape=[None,None,None],name="position_indeces") 33 | access_dict["target_word_indeces"] = tf.placeholder(tf.int32,shape=[None,None,None],name="target_word_indeces") 34 | 35 | with tf.device("/CPU:0"): 36 | embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_size) 37 | embedding_layer(tf.constant([0])) 38 | embedding_projector = tf.keras.layers.Dense(generator_size) 39 | with tf.device(g_dev): 40 | g_transformer = transformer_encoder(gn_blocks,gseq_length,generator_size, 41 | gn_heads,gff_filter_size,name="gt") 42 | generator_encoder = generator_encoder_producer(embedding_layer,embedding_projector,g_transformer) 43 | with tf.device(d_dev): 44 | d_transformer = transformer_encoder(dn_blocks,dseq_length,embedding_size, 45 | dn_heads,dff_filter_size,name="dt") 46 | discriminitor_encoder = discriminitor_encoder_producer(embedding_layer,d_transformer) 47 | output_layer = tf.keras.layers.Dense(1,activation="sigmoid") 48 | losses,layers = {"generator_loss":0,"discriminitor_loss":0},{} 49 | 50 | if access_dict["training"]==False: 51 | d_encoded = discriminitor_encoder(access_dict["input"]) 52 | layers["d_encoded"] = d_encoded 53 | else: 54 | corrupted = tf.tensor_scatter_nd_update( 55 | access_dict["input"],access_dict["position_indeces"], 56 | access_dict["mask_values"]) 57 | if access_dict["peep"]==False: 58 | g_projected,g_encoded = generator_encoder(corrupted) 59 | else: 60 | g_projected,g_encoded = generator_encoder(access_dict["input"]) 61 | masked_g_encoded = tf.gather_nd(g_encoded,access_dict["position_indeces"]) 62 | generated = tf.nn.softmax( 63 | tf.transpose( 64 | tf.matmul(g_projected,masked_g_encoded,transpose_b=True),[0,2,1] 65 | ) 66 | ) 67 | print(generated.shape) 68 | 
# generator loss: negative log probability of the true tokens at the masked positions
69 | losses["generator_loss"] = tf.reduce_sum(-tf.math.log(
70 | tf.gather_nd(generated,access_dict["target_word_indeces"])+1e-6))
71 | 
72 | replaced = tf.tensor_scatter_nd_update(
73 | access_dict["input"],
74 | access_dict["position_indeces"],tf.cast(tf.math.argmax(generated,axis=-1),tf.int32)
75 | )
76 | labels = tf.cast(tf.clip_by_value(tf.abs(access_dict["input"]-replaced),0,1),tf.float32) # 1 where the sampled token differs from the original, 0 elsewhere
77 | target_signs = labels*-2+1 # +1 where the token is unchanged, -1 where it was replaced
78 | 
79 | d_encoded = discriminitor_encoder(replaced)
80 | layers["d_encoded"] = d_encoded
81 | d_out = output_layer(d_encoded)
82 | pre_d_loss = (tf.squeeze(d_out)-labels)*target_signs
83 | losses["discriminitor_loss"] = tf.reduce_sum(tf.math.log1p(pre_d_loss+1e-6))
84 | access_dict["outputs"] = layers["d_encoded"]
85 | access_dict["losses"] = losses["generator_loss"]+d_factor*losses["discriminitor_loss"]
86 | access_dict["g_loss"] = losses["generator_loss"]
87 | access_dict["d_loss"] = losses["discriminitor_loss"]
88 | optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
89 | access_dict["training_op"] = optimizer.minimize(access_dict["losses"])
90 | access_dict["init"] = tf.global_variables_initializer()
91 | print("encoded",access_dict["outputs"].shape)
92 | return g, access_dict
--------------------------------------------------------------------------------