├── Chatbot-tensowflow2.0
    ├── Distribute_seq2seqchatbot
    │   ├── config
    │   │   ├── getConfig.py
    │   │   └── seq2seq.ini
    │   ├── data_util.py
    │   ├── execute.py
    │   ├── seq2seqModel.py
    │   ├── static
    │   │   ├── css
    │   │   │   ├── normalize.css
    │   │   │   └── style.css
    │   │   ├── js
    │   │   │   ├── index.js
    │   │   │   ├── jquery-latest.js
    │   │   │   ├── jquery.mCustomScrollbar.concat.min.js
    │   │   │   ├── jquery.mCustomScrollbar.min.css
    │   │   │   └── jquery.min.js
    │   │   ├── res
    │   │   │   ├── botim.png
    │   │   │   └── easybot.png
    │   │   └── scss
    │   │   │   └── style.scss
    │   ├── templates
    │   │   └── index.html
    │   ├── train_data
    │   │   ├── inp.vocab
    │   │   ├── seq.data
    │   │   └── tar.vocab
    │   └── web
    │   │   ├── app.py
    │   │   ├── static
    │   │   ├── css
    │   │   │   ├── normalize.css
    │   │   │   └── style.css
    │   │   ├── js
    │   │   │   ├── index.js
    │   │   │   ├── jquery-latest.js
    │   │   │   ├── jquery.mCustomScrollbar.concat.min.js
    │   │   │   ├── jquery.mCustomScrollbar.min.css
    │   │   │   └── jquery.min.js
    │   │   ├── res
    │   │   │   ├── botim.png
    │   │   │   └── easybot.png
    │   │   └── scss
    │   │   │   └── style.scss
    │   │   └── templates
    │   │   └── index.html
    ├── Seq2seqchatbot
    │   ├── config
    │   │   ├── getConfig.py
    │   │   └── seq2seq.ini
    │   ├── data_util.py
    │   ├── execute.py
    │   ├── seq2seqModel.py
    │   ├── static
    │   │   ├── css
    │   │   │   ├── normalize.css
    │   │   │   └── style.css
    │   │   ├── js
    │   │   │   ├── index.js
    │   │   │   ├── jquery-latest.js
    │   │   │   ├── jquery.mCustomScrollbar.concat.min.js
    │   │   │   ├── jquery.mCustomScrollbar.min.css
    │   │   │   └── jquery.min.js
    │   │   ├── res
    │   │   │   ├── botim.png
    │   │   │   └── easybot.png
    │   │   └── scss
    │   │   │   └── style.scss
    │   ├── templates
    │   │   └── index.html
    │   ├── train_data
    │   │   └── xiaohuangji50w.conv.zip
    │   └── web
    │   │   ├── app.py
    │   │   ├── static
    │   │   ├── css
    │   │   │   ├── normalize.css
    │   │   │   └── style.css
    │   │   ├── js
    │   │   │   ├── index.js
    │   │   │   ├── jquery-latest.js
    │   │   │   ├── jquery.mCustomScrollbar.concat.min.js
    │   │   │   ├── jquery.mCustomScrollbar.min.css
    │   │   │   └── jquery.min.js
    │   │   ├── res
    │   │   │   ├── botim.png
    │   │   │   └── easybot.png
    │   │   └── scss
    │   │   │   └── style.scss
    │   │   └── templates
    │   │   └── index.html
    └── SeqGANchatbot
    │   └── README.md
├── Chatbot_pytorch
    ├── Distribute_seq2seqchatbot
    │   ├── README.md
    │   ├── data_util.py
    │   ├── execute.py
    │   └── seq2seqModel.py
    ├── Seq2seqchatbot
    │   ├── config
    │   │   ├── getConfig.py
    │   │   └── seq2seq.ini
    │   ├── data_util.py
    │   ├── execute.py
    │   ├── seq2seqModel.py
    │   ├── train_data
    │   │   └── xiaohuangji50w.conv.zip
    │   └── web
    │   │   ├── app.py
    │   │   ├── static
    │   │   ├── css
    │   │   │   ├── normalize.css
    │   │   │   └── style.css
    │   │   ├── js
    │   │   │   ├── index.js
    │   │   │   ├── jquery-latest.js
    │   │   │   ├── jquery.mCustomScrollbar.concat.min.js
    │   │   │   ├── jquery.mCustomScrollbar.min.css
    │   │   │   └── jquery.min.js
    │   │   ├── res
    │   │   │   ├── botim.png
    │   │   │   └── easybot.png
    │   │   └── scss
    │   │   │   └── style.scss
    │   │   └── templates
    │   │   └── index.html
    └── SeqGANchatbot
    │   └── README.md
├── README.md
├── image.jpg
├── img_1.png
└── img_2.png

/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/getConfig.py:
--------------------------------------------------------------------------------
# coding=utf-8
import os
from configparser import ConfigParser

# Locate seq2seq.ini relative to the working directory
config_file = os.getcwd() + '/config/seq2seq.ini'
if not os.path.exists(config_file):
    config_file = os.path.dirname(os.getcwd()) + '/config/seq2seq.ini'
print(config_file)

def get_config():
    parser = ConfigParser()
    parser.read(config_file, encoding='utf-8')
    # get the ints, floats and strings
    _conf_ints = [(key, int(value)) for key, value in parser.items('ints')]
    _conf_floats = [(key, float(value)) for key, value in parser.items('floats')]
    _conf_strings = [(key, str(value)) for key, value in parser.items('strings')]
    return dict(_conf_ints + _conf_floats + _conf_strings)
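
# Minimal usage sketch: get_config() flattens the [ints], [floats] and [strings]
# sections of seq2seq.ini into a single dict keyed by option name, so the rest of
# the project can look hyperparameters up directly (the values below come from the
# ini file included in this directory).
if __name__ == '__main__':
    gConfig = get_config()
    print(gConfig['mode'], gConfig['batch_size'], gConfig['min_loss'])  # train 64 0.2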

--------------------------------------------------------------------------------
/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/config/seq2seq.ini:
--------------------------------------------------------------------------------
[strings]
# Mode : train, test, serve
mode = train
train_data = train_data
seq_data = train_data/seq.data
vocab_inp_path = train_data/inp.vocab
vocab_tar_path = train_data/tar.vocab
# Raw training corpus file
resource_data = train_data/xiaohuangji50w.conv
# Training sample files produced by splitting the corpus
split_train_data = train_data/seq_data_
# Markers identifying conversation blocks (E) and utterance lines (M) in the raw corpus
e = E
m = M
model_data = model_data
log_dir = log_dir

[ints]
# vocabulary size
# 20,000 is a reasonable size
vocab_inp_size = 20000
vocab_tar_size = 20000
embedding_dim = 128
train_epoch = 10
# typical options : 128, 256, 512, 1024
layer_size = 512
batch_size = 64
# Maximum sentence length
max_length = 20
number_work = 2

[floats]
# Minimum loss: training stops once the model loss reaches this level
min_loss = 0.2

--------------------------------------------------------------------------------
/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/data_util.py:
--------------------------------------------------------------------------------
# coding=utf-8
import json
import os
import re
import jieba
from zhon.hanzi import punctuation
from config import getConfig
import io
import tensorflow as tf

# Load the hyperparameter configuration
gConfig = {}
gConfig = getConfig.get_config()
conv_path = gConfig['resource_data']
vocab_inp_path = gConfig['vocab_inp_path']
vocab_tar_path = gConfig['vocab_tar_path']
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
seq_train = gConfig['seq_data']

def predata_util():
    # Check that the raw corpus file exists and warn if it does not
    if not os.path.exists(conv_path):
        print("Cannot find the corpus file to process; please confirm it exists in the train_data directory")
        exit()
    # Create a new file to hold the processed conversation corpus
    seq_train = open(gConfig['seq_data'], 'w', encoding='utf-8')
    # Open the raw corpus and process it line by line
    with open(conv_path, encoding='utf-8') as f:
        one_conv = ""  # Holds one complete conversation
        i = 0
        # Loop over the corpus
        for line in f:
            line = line.strip('\n')
            line = re.sub(r"[%s]+" % punctuation, "", line)  # Strip punctuation
            if line == '':
                continue
            # A new conversation starts here, so save the one just assembled
            if line[0] == gConfig['e']:
                if one_conv:
                    seq_train.write(one_conv[:-1] + '\n')
                    i = i + 1
                    if i % 1000 == 0:
                        print('Processing progress:', i)
                one_conv = ""
            # An utterance line: segment it with jieba and append it to the current conversation
            elif line[0] == gConfig['m']:
                one_conv = one_conv + str(" ".join(jieba.cut(line.split(' ')[1]))) + '\t'  # Store one question or answer
    # Processing finished; close the output file
    seq_train.close()
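
# Format of the generated seq.data (a sketch inferred from predata_util above):
# one conversation per line, question and answer separated by a tab, and words
# separated by spaces after jieba segmentation, e.g. (illustrative line, not
# taken from the corpus):
#   今天 天气 不错\t是 呀 挺 好 的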

def create_vocab(lang, vocab_path, vocab_size):
    # Fit a Keras tokenizer on the corpus and dump its config (plus the full
    # word/index maps) to a JSON vocabulary file
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token=3)
    tokenizer.fit_on_texts(lang)
    vocab = json.loads(tokenizer.to_json(ensure_ascii=False))
    vocab['index_word'] = tokenizer.index_word
    vocab['word_index'] = tokenizer.word_index
    vocab['document_count'] = tokenizer.document_count
    vocab = json.dumps(vocab, ensure_ascii=False)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write(vocab)
    print("Vocabulary saved to: {}".format(vocab_path))

def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# Generate seq.data first, then read it back to build the input and target vocabularies
predata_util()
lines = io.open(seq_train, encoding='UTF-8').readlines()
word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
input_lang, target_lang = zip(*word_pairs)
create_vocab(input_lang, vocab_inp_path, vocab_inp_size)
create_vocab(target_lang, vocab_tar_path, vocab_tar_size)

--------------------------------------------------------------------------------
/Chatbot-tensowflow2.0/Distribute_seq2seqchatbot/execute.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Import dependencies
import json
import os
import sys
import time
import tensorflow as tf
import horovod.tensorflow as hvd
import seq2seqModel
from config import getConfig
import io

hvd.init()
# Initialize the hyperparameter dictionary and assign the corresponding values
gConfig = {}
gConfig = getConfig.get_config()
vocab_inp_size = gConfig['vocab_inp_size']
vocab_tar_size = gConfig['vocab_tar_size']
embedding_dim = gConfig['embedding_dim']
units = gConfig['layer_size']
BATCH_SIZE = gConfig['batch_size']

max_length_inp = gConfig['max_length']
max_length_tar = gConfig['max_length']

log_dir = gConfig['log_dir']
writer = tf.summary.create_file_writer(log_dir)

# Preprocess training sentences: wrap each one with start/end markers
def preprocess_sentence(w):
    w = 'start ' + w + ' end'
    return w

# Read the training corpus, convert words to ids (word2number) with the
# pre-generated vocabularies, and pad the sequences
def read_data(path):
    path = os.getcwd() + '/' + path
    if not os.path.exists(path):
        path = os.path.dirname(os.getcwd()) + '/' + path
    lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
    word_pairs = [[preprocess_sentence(w) for w in l.split('\t')] for l in lines]
    input_lang, target_lang = zip(*word_pairs)
    input_tokenizer = tokenize(gConfig['vocab_inp_path'])
    target_tokenizer = tokenize(gConfig['vocab_tar_path'])
    input_tensor = input_tokenizer.texts_to_sequences(input_lang)
    target_tensor = target_tokenizer.texts_to_sequences(target_lang)
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, maxlen=max_length_inp,
                                                                 padding='post')
    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, maxlen=max_length_tar,
                                                                  padding='post')
    return input_tensor, input_tokenizer, target_tensor, target_tokenizer

# Rebuild a tokenizer from a pre-generated vocabulary file for word2number conversion
def tokenize(vocab_file):
    # Read the pre-generated tokenizer config from the vocabulary file and rebuild the tokenizer
    with open(vocab_file, 'r', encoding='utf-8') as f:
        tokenize_config = json.dumps(json.load(f), ensure_ascii=False)
        lang_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenize_config)
    # The tokenizer handles the word2number conversion; padding is done by the caller
    return lang_tokenizer

input_tensor, input_token, target_tensor, target_token = read_data(gConfig['seq_data'])
steps_per_epoch = len(input_tensor) // (gConfig['batch_size'] * hvd.size())
BUFFER_SIZE = len(input_tensor)
dataset = tf.data.Dataset.from_tensor_slices((input_tensor, target_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
enc_hidden = seq2seqModel.encoder.initialize_hidden_state()
dataset = dataset.shard(hvd.size(), hvd.rank())
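
# Data-parallel note (Horovod): every worker builds the full dataset, and
# dataset.shard(hvd.size(), hvd.rank()) then keeps every n-th element for this
# rank. Because shard() is applied after batch(), each rank receives whole
# batches, which is why steps_per_epoch above is divided by hvd.size().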

# Training function
def train():
    # Read the training data and convert it using the pre-generated vocabularies
    print("Preparing data in %s" % gConfig['train_data'])
    print('Training steps per epoch: {}'.format(steps_per_epoch))
    # If a pre-trained model already exists, restore it and continue training
    checkpoint_dir = gConfig['model_data']
    ckpt = tf.io.gfile.listdir(checkpoint_dir)
    if ckpt:
        print("reload pretrained model")
        seq2seqModel.checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # Loading the training data through tf.data.Dataset speeds up concurrent reads and improves training efficiency
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    start_time = time.time()
    # current_loss = 2
    # min_loss = gConfig['min_loss']
    epoch = 0
    train_epoch = gConfig['train_epoch']
    # Start the training loop; it is set up to stop once the loss falls below the min_loss hyperparameter
    while epoch