├── 421_Project_1.png ├── Classifier ├── DataLoader │ └── bucket_and_batch.py ├── Model_Backup │ └── model.pt ├── data │ ├── dev.txt │ ├── inference.txt │ ├── process_data.py │ └── train.txt ├── model │ └── dialog_acts.py └── train_and_test │ └── train.py ├── Generator ├── DialoGPT │ └── Configs │ │ ├── config.json │ │ ├── merges.txt │ │ └── vocab.json ├── Experimental Codes │ ├── test_advanced_experimental.py │ └── test_advanced_experimental_2.py └── generator.py ├── LICENSE ├── Project Report.pdf ├── README.md ├── ReRanker └── rerank.py ├── Readme.txt ├── Retriever ├── Data │ └── SQL-format ├── Faiss_index │ └── thread_idx.pkl ├── Retrieve.py ├── faiss_it.py └── fill_data.py ├── Scripted ├── Chatterbot_Corpus │ ├── ai.yml │ ├── botprofile.yml │ ├── computers.yml │ ├── conversations.yml │ ├── emotion.yml │ ├── food.yml │ ├── gossip.yml │ ├── greetings.yml │ ├── health.yml │ ├── history.yml │ ├── humor.yml │ ├── literature.yml │ ├── money.yml │ ├── movies.yml │ ├── politics.yml │ ├── psychology.yml │ ├── science.yml │ ├── sports.yml │ └── trivia.yml ├── Processed_Scripts │ ├── Bot_Profile.pkl │ ├── Chatterbot.pkl │ ├── embedded_bot_queries.pkl │ ├── embedded_chatterbot_queries.pkl │ ├── intent_query_script.pkl │ └── intent_response_script.pkl ├── Random_Reddit_Data │ └── __init__.py ├── Subscripts │ ├── fill_bot_profile.py │ ├── fill_chatterbot.py │ ├── intent_query_script.py │ ├── intent_response_script.py │ └── process_pkl.py └── setup.py ├── Sentence_Encoder ├── Embeddings │ ├── ConvRT │ │ ├── saved_model.pb │ │ ├── tfhub_module.pb │ │ └── variables │ │ │ └── variables.index │ └── USE_QA │ │ ├── saved_model.pb │ │ └── variables │ │ └── variables.index ├── encoder_client.py ├── meta_query_encoder.py ├── meta_query_encoder_fast.py ├── meta_response_encoder.py └── meta_response_encoder_fast.py ├── TTS ├── LICENSE.txt ├── README.md ├── TTS.egg-info │ ├── PKG-INFO │ ├── SOURCES.txt │ ├── dependency_links.txt │ ├── requires.txt │ └── top_level.txt ├── ThisBranch.txt ├── best_model_config.json ├── config.json ├── config_kusal.json ├── dataset_analysis │ ├── AnalyzeDataset.ipynb │ ├── README.md │ └── analyze.py ├── datasets │ ├── Kusal.py │ ├── LJSpeech.py │ ├── LJSpeechCached.py │ └── TWEB.py ├── debug_config.py ├── extract_feats.py ├── hard-sentences.txt ├── images │ ├── example_model_output.png │ └── model.png ├── layers │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── attention.cpython-37.pyc │ │ └── tacotron.cpython-37.pyc │ ├── attention.py │ ├── custom_layers.py │ ├── losses.py │ └── tacotron.py ├── models │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── tacotron.cpython-36.pyc │ │ └── tacotron.cpython-37.pyc │ └── tacotron.py ├── notebooks │ ├── Benchmark.ipynb │ ├── ReadArticle.ipynb │ ├── TacotronPlayGround.ipynb │ └── synthesis.py ├── requirements.txt ├── server │ ├── README.md │ ├── conf.json │ ├── server.py │ ├── synthesizer.py │ └── templates │ │ └── index.html ├── setup.py ├── synthesis.py ├── tests │ ├── generic_utils_text.py │ ├── layers_tests.py │ ├── loader_tests.py │ ├── tacotron_tests.py │ └── test_config.json ├── text2speech.py ├── train.py ├── tts_model │ └── config.json ├── utils │ ├── audio.py │ ├── audio_lws.py │ ├── data.py │ ├── generic_utils.py │ ├── text │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── cleaners.cpython-37.pyc │ │ │ ├── cmudict.cpython-37.pyc │ │ │ ├── numbers.cpython-37.pyc │ │ │ └── symbols.cpython-37.pyc │ │ ├── cleaners.py │ │ ├── cmudict.py │ │ ├── numbers.py │ │ └── symbols.py │ └── visual.py └── version.py ├── Utils ├── functions.py └── functions_old.py ├── chatbot.yml ├── evaluate.py ├── interact.py ├── interact_generator_only.py ├── interact_retrieval_only.py └── interact_verbose.py /421_Project_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/421_Project_1.png -------------------------------------------------------------------------------- /Classifier/DataLoader/bucket_and_batch.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import random 4 | import re 5 | 6 | 7 | def bucket_and_batch(vectors, classes, batch_size, classes_num): 8 | 9 | print("Sample size: ", vectors.shape[0]) 10 | 11 | i = 0 12 | batches_vectors = [] 13 | batches_classes = [] 14 | count = 0 15 | 16 | while i < len(vectors): 17 | 18 | if i+batch_size > vectors.shape[0]: 19 | batch_size = vectors.shape[0]-i 20 | 21 | batch_vectors = vectors[i:i+batch_size] 22 | batch_classes = [] 23 | 24 | for j in range(i, i + batch_size): 25 | class_ = classes[j].tolist() 26 | # print(class_) 27 | new_class = [1 if x in class_ else 0 for x in range(classes_num)] 28 | # print(new_class) 29 | batch_classes.append(new_class) 30 | 31 | batch_classes = np.asarray(batch_classes, dtype=int) 32 | 33 | batches_vectors.append(batch_vectors) 34 | batches_classes.append(batch_classes) 35 | 36 | i += batch_size 37 | 38 | return batches_vectors, batches_classes 39 | -------------------------------------------------------------------------------- /Classifier/Model_Backup/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Classifier/Model_Backup/model.pt -------------------------------------------------------------------------------- /Classifier/data/inference.txt: -------------------------------------------------------------------------------- 1 | do you mind tell me one more time : EMPTY > who is the piano 2 | can you repeat the book name please : EMPTY > the dragons of eden 3 | do you prefer to watch movies in the theater or at home : EMPTY > maybe a whole besides the big screen 4 | they scare my creators too : EMPTY > no 5 | do you watch movies : EMPTY > i like to play with you 6 | what do you think is the best acting performance of all time : but the morning brando in the godfather > marlon brando on the on the waterfront 7 | i am listening to music : EMPTY > yes 8 | is there something else you'd like to talk about : EMPTY > music 9 | do you watch movies : EMPTY > yes 10 | have you ever had a pet : no > yes 11 | 2019 : yes > i'm still here 12 | what do you think : EMPTY > think that was so stupid 13 | any thoughts : that's amazing > really great 14 | have you ever tried to guess someone else's password so many times that it locked their account : yes > i do 15 | any thoughts : how did you know that it really > rick percent of the time 16 | what's the name of your dog : the dog is yoki > he's a chihuahua 17 | do you prefer fiction or nonfiction books : EMPTY > fiction 18 | what book genre is your favorite : open > i really like inspiring books and true ones 19 | what do you think is the best acting performance of all time : EMPTY > i don't know 20 | did you ever see him perform live : EMPTY > no 21 | period : i think it did in the ten > but stop 22 | what book has had the biggest impact on human history : i meant that barbara parks > and my favorite author 23 | i lost my train of thought : EMPTY > chat about marvel 24 | just say stop : EMPTY > stop 25 | any thoughts : EMPTY > stop 26 | -------------------------------------------------------------------------------- /Classifier/data/process_data.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import sys 3 | sys.path.append("../../") # nopep8 4 | from Sentence_Encoder.meta_query_encoder import encode 5 | import tensorflow.compat.v1 as tf 6 | import tensorflow_text 7 | import tensorflow_hub as hub 8 | import numpy as np 9 | 10 | tf.disable_eager_execution() 11 | sess = tf.InteractiveSession(graph=tf.Graph()) 12 | 13 | ConvRT_model = hub.Module("../../Sentence_Encoder/Embeddings/ConvRT/") 14 | USE_QA_model = hub.load('../../Sentence_Encoder/Embeddings/USE_QA/') 15 | 16 | sess.run(tf.tables_initializer()) 17 | sess.run(tf.global_variables_initializer()) 18 | 19 | train_dir = "train.txt" 20 | dev_dir = "dev.txt" 21 | 22 | filename = train_dir 23 | all_targets = [] 24 | 25 | 26 | def extract_data(filename): 27 | 28 | contexts = [] 29 | queries = [] 30 | acts = [] 31 | 32 | with open(filename) as file: 33 | global all_targets 34 | lines = file.readlines() 35 | for line in lines: 36 | line = line.strip() 37 | split_line = line.split(" : ") 38 | line = split_line[1] 39 | context1 = split_line[0] 40 | 41 | if "what" in context1.lower() \ 42 | or "why" in context1.lower() \ 43 | or "where" in context1.lower() \ 44 | or "how" in context1.lower() \ 45 | or "who" in context1.lower(): 46 | punc = "?" 47 | else: 48 | punc = "." 49 | 50 | split_line = line.split(" > ") 51 | 52 | context2 = split_line[0].strip() 53 | if context2 == "EMPTY": 54 | context = context1+punc 55 | else: 56 | context = context1+punc+" "+context2 57 | 58 | line = split_line[1] 59 | split_line = line.split(" ## ") 60 | current_uttr = split_line[0] 61 | targets = split_line[1] 62 | targets = targets.split(";") 63 | targets = [target for target in targets if target != ''] 64 | 65 | if len(targets) < 2: 66 | targets.append("NULL") 67 | 68 | all_targets += targets 69 | 70 | contexts.append(context) 71 | queries.append(current_uttr) 72 | acts.append(targets) 73 | 74 | return contexts, queries, acts 75 | 76 | 77 | train_contexts, train_queries, train_acts = extract_data(train_dir) 78 | test_contexts, test_queries, test_acts = extract_data(dev_dir) 79 | 80 | all_targets = list(set(all_targets)) 81 | 82 | labels2idx = {v: i for i, v in enumerate(all_targets)} 83 | 84 | train_queries_vec = [] 85 | i = 0 86 | batch_size = 2000 87 | while i < len(train_queries): 88 | print(i) 89 | if i+batch_size > len(train_queries): 90 | batch_size = len(train_queries)-i 91 | 92 | train_query_vec = encode(sess, train_queries[i:i+batch_size], train_contexts[i:i + 93 | batch_size], USE_QA_model, ConvRT_model) 94 | train_queries_vec.append(train_query_vec) 95 | i += batch_size 96 | 97 | train_queries_vec = np.concatenate(train_queries_vec, axis=0) 98 | 99 | test_queries_vec = [] 100 | i = 0 101 | while i < len(test_queries): 102 | if i+batch_size > len(test_queries): 103 | batch_size = len(test_queries)-i 104 | test_query_vec = encode(sess, test_queries[i:i+batch_size], test_contexts[i:i + 105 | batch_size], USE_QA_model, ConvRT_model) 106 | test_queries_vec.append(test_query_vec) 107 | i += batch_size 108 | test_queries_vec = np.concatenate(test_queries_vec, axis=0) 109 | 110 | print(train_queries_vec.shape) 111 | print(test_queries_vec.shape) 112 | 113 | train_acts_vec = [] 114 | for acts in train_acts: 115 | train_acts_vec.append([labels2idx[act] for act in acts]) 116 | 117 | test_acts_vec = [] 118 | for acts in test_acts: 119 | test_acts_vec.append([labels2idx[act] for act in acts]) 120 | 121 | train_acts_vec = np.asarray(train_acts_vec, np.int) 122 | test_acts_vec = np.asarray(test_acts_vec, np.int) 123 | 124 | print(train_acts_vec.shape) 125 | print(test_acts_vec.shape) 126 | 127 | data = {} 128 | 129 | data["labels2idx"] = labels2idx 130 | 131 | data["train_contexts"] = train_contexts 132 | data["test_contexts"] = test_contexts 133 | 134 | data["train_queries"] = train_queries 135 | data["train_acts"] = train_acts 136 | 137 | data["test_queries"] = test_queries 138 | data["test_acts"] = test_acts 139 | 140 | data["test_queries_vec"] = test_queries_vec 141 | data["test_acts_vec"] = test_acts_vec 142 | 143 | data["train_queries_vec"] = train_queries_vec 144 | data["train_acts_vec"] = train_acts_vec 145 | 146 | with open("processed_data.pkl", 'wb') as fp: 147 | pickle.dump(data, fp) 148 | -------------------------------------------------------------------------------- /Classifier/model/dialog_acts.py: -------------------------------------------------------------------------------- 1 | import torch as T 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import math 7 | 8 | 9 | class Encoder(nn.Module): 10 | def __init__(self, D, classes_num): 11 | super(Encoder, self).__init__() 12 | self.linear1 = nn.Linear(D, 1024) 13 | self.linear2 = nn.Linear(1024, 300) 14 | self.linear3 = nn.Linear(300, classes_num) 15 | 16 | # @torchsnooper.snoop() 17 | def forward(self, x): 18 | 19 | x = F.dropout(x, p=0.5) 20 | x = self.linear1(x) 21 | x = F.gelu(x) 22 | x = self.linear2(x) 23 | x = F.gelu(x) 24 | classes = T.sigmoid(self.linear3(x)) 25 | 26 | return classes 27 | -------------------------------------------------------------------------------- /Classifier/train_and_test/train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pickle 3 | import torch as T 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.nn.functional as F 7 | sys.path.append("../") # nopep8 8 | from model.dialog_acts import Encoder 9 | from DataLoader.bucket_and_batch import bucket_and_batch 10 | import numpy as np 11 | import string 12 | import random 13 | 14 | device = T.device('cuda' if T.cuda.is_available() else 'cpu') 15 | max_grad_norm = 1 16 | 17 | with open("../data/processed_data.pkl", "rb") as fp: 18 | data = pickle.load(fp) 19 | 20 | labels2idx = data["labels2idx"] 21 | idx2labels = {i: v for v, i in labels2idx.items()} 22 | 23 | train_queries_vec = data["train_queries_vec"] 24 | train_acts_vec = data["train_acts_vec"] 25 | 26 | test_queries_vec = data["test_queries_vec"] 27 | test_acts_vec = data["test_acts_vec"] 28 | 29 | model = Encoder(D=test_queries_vec.shape[-1], classes_num=len(labels2idx)) 30 | model = model.cuda() 31 | 32 | parameter_count = sum(p.numel() for p in model.parameters() if p.requires_grad) 33 | 34 | print("Parameter Count: ", parameter_count) 35 | 36 | optimizer = T.optim.Adam(model.parameters(), lr=1e-3) 37 | 38 | 39 | def loss_fn(logits, labels, l2=1e-6): 40 | regularization = T.tensor(0.).to(device) # .to(device) 41 | for name, param in model.named_parameters(): 42 | if 'bias' not in name and 'embedding' not in name: 43 | regularization += T.norm(param).pow(2) 44 | 45 | loss = nn.MSELoss() 46 | output = loss(logits, labels) + l2*regularization 47 | 48 | return output 49 | 50 | 51 | batches_train_queries, batches_train_classes = bucket_and_batch( 52 | train_queries_vec, train_acts_vec, 64, len(labels2idx)) 53 | 54 | batches_test_queries, batches_test_classes = bucket_and_batch( 55 | test_queries_vec, test_acts_vec, 64, len(labels2idx)) 56 | 57 | 58 | def predict(queries, classes, train=True): 59 | global model 60 | 61 | if train: 62 | model = model.train() 63 | else: 64 | model = model.eval() 65 | 66 | logits = model(T.tensor(queries).to(device)) 67 | 68 | loss = loss_fn(logits, T.tensor(classes).float().to(device)) 69 | 70 | _, sorted_idx = T.sort(logits, dim=-1, descending=True) 71 | 72 | sorted_idx = sorted_idx[:, 0:2] 73 | # print(sorted_idx.size()) 74 | sorted_idx = sorted_idx.cpu().numpy().tolist() 75 | 76 | _, gold_sorted_idx = T.sort(T.tensor(classes).to(device), dim=-1, descending=True) 77 | 78 | gold_sorted_idx = gold_sorted_idx[:, 0:2] 79 | # print(gold_sorted_idx.size()) 80 | 81 | gold_sorted_idx = gold_sorted_idx.cpu().numpy().tolist() 82 | 83 | score = 0 84 | total = 0 85 | 86 | for sorted_id, gold_sorted_id in zip(sorted_idx, gold_sorted_idx): 87 | for id in sorted_id: 88 | if id in gold_sorted_id: 89 | score += 1 90 | total += 1 91 | 92 | return loss, (score/total) 93 | 94 | 95 | best_val_accuracy = 0 96 | 97 | for epoch in range(100): 98 | i = 0 99 | for batch_X, batch_Y in zip(batches_train_queries, batches_train_classes): 100 | 101 | loss, accuracy = predict(batch_X, batch_Y, train=True) 102 | 103 | loss.backward() 104 | T.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) 105 | optimizer.step() 106 | optimizer.zero_grad() 107 | 108 | if i % 100 == 0: 109 | print("Step {}, Loss: {}, Accuracy: {}".format(i, loss, accuracy)) 110 | 111 | i += 1 112 | 113 | print("\n\nStarting Validation\n\n") 114 | 115 | total_val_accuracy = 0 116 | i = 0 117 | for batch_X, batch_Y in zip(batches_test_queries, batches_test_classes): 118 | 119 | with T.no_grad(): 120 | 121 | loss, accuracy = predict(batch_X, batch_Y, train=False) 122 | 123 | total_val_accuracy += accuracy 124 | 125 | if i % 100 == 0: 126 | print("Step {}, Loss: {}, Accuracy: {}".format(i, loss, accuracy)) 127 | 128 | i += 1 129 | 130 | mean_accuracy = total_val_accuracy/len(batches_test_queries) 131 | 132 | print("\n\nEpoch {}, Validation Result: Accuracy: {}\n".format(epoch, mean_accuracy)) 133 | 134 | if mean_accuracy > best_val_accuracy: 135 | best_val_accuracy = mean_accuracy 136 | 137 | T.save({ 138 | 'model_state_dict': model.state_dict(), 139 | 'optimizer_state_dict': optimizer.state_dict() 140 | }, "../Model_Backup/model.pt") 141 | 142 | print("\nCheckpoint Saved\n") 143 | -------------------------------------------------------------------------------- /Generator/DialoGPT/Configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "attn_pdrop": 0.1, 3 | "embd_pdrop": 0.1, 4 | "initializer_range": 0.02, 5 | "layer_norm_epsilon": 1e-05, 6 | "n_ctx": 1024, 7 | "n_embd": 1024, 8 | "n_head": 16, 9 | "n_layer": 24, 10 | "n_positions": 1024, 11 | "n_special": 0, 12 | "predict_special_tokens": true, 13 | "resid_pdrop": 0.1, 14 | "vocab_size": 50257 15 | } 16 | -------------------------------------------------------------------------------- /Generator/generator.py: -------------------------------------------------------------------------------- 1 | # Modified from: https://colab.research.google.com/drive/1-_KjlAV3J1IVDw_9KogjKDCzgFY7Jp7E#scrollTo=RZd4iXX5SqKC 2 | import os 3 | import logging 4 | logging.getLogger("tensorflow").setLevel(logging.CRITICAL) 5 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 6 | logging.basicConfig(level=logging.CRITICAL) 7 | 8 | import torch as T 9 | import torch.nn.functional as F 10 | from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config 11 | import numpy as np 12 | 13 | """ 14 | tokenizer = GPT2Tokenizer.from_pretrained('DialoGPT/Configs/') 15 | weights = T.load('DialoGPT/Parameters/medium_ft.pkl') 16 | weights_reverse = T.load('DialoGPT/Parameters/small_reverse.pkl') 17 | cfg = GPT2Config.from_json_file('DialoGPT/Configs/config.json') 18 | model = GPT2LMHeadModel(cfg) 19 | model_reverse = GPT2LMHeadModel(cfg) 20 | 21 | # fix misused key value 22 | weights["lm_head.weight"] = weights["lm_head.decoder.weight"] 23 | weights.pop("lm_head.decoder.weight", None) 24 | weights_reverse["lm_head.weight"] = weights_reverse["lm_head.decoder.weight"] 25 | weights_reverse.pop("lm_head.decoder.weight", None) 26 | 27 | model.load_state_dict(weights) 28 | model.to('cuda') 29 | model.eval() 30 | 31 | model_reverse.load_state_dict(weights_reverse) 32 | model_reverse.to('cuda') 33 | model_reverse.eval() 34 | 35 | conv_history = ["what is the meaning of life?"] 36 | """ 37 | 38 | 39 | 40 | def generate(conv_history, model, tokenizer, past=None, samples=20): 41 | 42 | EOS_token = tokenizer.encode("<|endoftext|>")[0] 43 | 44 | # adapted from: https://github.com/microsoft/DialoGPT/blob/master/prepro.py 45 | def _make_feature(sents, eos): 46 | 47 | msg_idx = [] 48 | for msg in sents: 49 | msg_idx.append(tokenizer.encode(msg)) 50 | input_ids = [i for s in msg_idx for i in s+[eos]][:-1] 51 | 52 | input_ids.append(eos) 53 | 54 | if len(input_ids) > 200: 55 | input_ids = input_ids[-200:] 56 | 57 | return input_ids 58 | 59 | 60 | input_ids = _make_feature(conv_history, EOS_token) 61 | 62 | input_ids = T.tensor(input_ids).long().to('cuda').unsqueeze(0) 63 | 64 | #print(input_ids.size()) 65 | 66 | def generate_candidates(hypotheses): 67 | 68 | EOS_token = tokenizer.encode("<|endoftext|>")[0] 69 | 70 | hypotheses_list = hypotheses.cpu().numpy().tolist() 71 | 72 | candidates = [] 73 | for beam in hypotheses_list: 74 | # print(beam) 75 | if beam[-1] == EOS_token and EOS_token not in beam[:-1]: 76 | candidate = tokenizer.decode(beam[:-1]) 77 | candidates.append(candidate) 78 | 79 | return candidates 80 | 81 | def p_sample(logits, top_p=0.9): 82 | 83 | sorted_logits, sorted_indices = T.sort(logits, dim=-1, descending=True) 84 | 85 | cumulative_probs = T.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) 86 | 87 | min_probs,_ = T.min(cumulative_probs,dim=-1) 88 | min_probs = min_probs.view(-1,1) 89 | min_probs = T.repeat_interleave(min_probs,logits.size(-1),dim=-1) 90 | 91 | sorted_logits_ = sorted_logits.clone() 92 | 93 | sorted_logits = T.where(cumulative_probs > top_p, 94 | T.empty_like(sorted_logits).to('cuda').fill_(T.tensor(-2.0**32)), 95 | sorted_logits_) 96 | 97 | sorted_logits = T.where(min_probs > top_p, 98 | sorted_logits_, 99 | sorted_logits) 100 | 101 | probabilities = F.softmax(sorted_logits, dim=-1) 102 | 103 | next_word_sorted_idx = T.multinomial(probabilities, num_samples=1, replacement=False) 104 | 105 | next_word_sorted_idx = next_word_sorted_idx.view(-1, 1) 106 | 107 | next_word_idx = T.gather(sorted_indices, dim=-1, index=next_word_sorted_idx) 108 | log_probs = T.gather(probabilities, dim=-1, index=next_word_sorted_idx) 109 | log_probs = T.log(log_probs+1e-8) 110 | 111 | log_probs = log_probs.view(-1) 112 | 113 | return next_word_idx, log_probs 114 | 115 | def greedy_decoding(input_ids, samples, past=None, 116 | top_p=0.9, temperature=1): 117 | 118 | EOS_token = tokenizer.encode("<|endoftext|>")[0] 119 | #next_token_type = T.tensor(T.max(token_type_ids)).to('cuda').long().view(1,1) 120 | 121 | i = 0 122 | 123 | input_ids = T.repeat_interleave(input_ids, samples, dim=0) 124 | _, inp_size = input_ids.size() 125 | 126 | candidates = [] 127 | 128 | response_ids = input_ids.clone() 129 | 130 | total_log_probs = T.zeros(samples).to('cuda') 131 | mask = T.ones(samples).to('cuda') 132 | 133 | while len(candidates) < samples and i < 300: 134 | 135 | with T.no_grad(): 136 | outputs, past = model(input_ids=input_ids, 137 | past=past) 138 | predictions = outputs 139 | 140 | logits = predictions[:, -1, :] 141 | logits = logits/temperature 142 | 143 | next_word_idx, log_probs = p_sample(logits, top_p=top_p) 144 | 145 | total_log_probs = total_log_probs+(log_probs*mask) 146 | 147 | mask = T.where(next_word_idx.view(-1) == EOS_token, 148 | T.zeros(samples).to('cuda'), 149 | mask) 150 | 151 | input_ids = next_word_idx # T.cat([input_ids,next_word_idx],dim=-1) 152 | #token_type_ids = T.cat([token_type_ids,next_token_type],dim=-1) 153 | 154 | response_ids = T.cat([response_ids, next_word_idx], dim=-1) 155 | 156 | candidates += generate_candidates(response_ids[:, inp_size:]) 157 | 158 | i += 1 159 | 160 | return candidates, total_log_probs 161 | 162 | candidates, scores = greedy_decoding(input_ids, samples) 163 | 164 | return candidates, scores 165 | 166 | 167 | """ 168 | candidates,_ = generate(conv_history,model,tokenizer) 169 | 170 | print(candidates) 171 | """ 172 | -------------------------------------------------------------------------------- /Project Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Project Report.pdf -------------------------------------------------------------------------------- /ReRanker/rerank.py: -------------------------------------------------------------------------------- 1 | from Sentence_Encoder.meta_response_encoder_fast import encode as response_encode 2 | import Utils.functions as utils 3 | import numpy as np 4 | import torch as T 5 | import copy 6 | 7 | 8 | def random_response(candidates, conversation_history, p=None): 9 | loop = 5 10 | 11 | if p is None: 12 | response = random.choice(candidates) 13 | else: 14 | response = np.random.choice(candidates, p=p) 15 | 16 | i = 0 17 | while response in conversation_history: 18 | if p is None: 19 | response = random.choice(candidates) 20 | else: 21 | response = np.random.choice(candidates, p=p) 22 | i += 1 23 | if i > loop: 24 | break 25 | 26 | return response 27 | 28 | 29 | def top_candidates(candidates, scores, top=1): 30 | sorted_score_idx = np.flip(np.argsort(scores), axis=-1) 31 | candidates = [candidates[i] for i in sorted_score_idx.tolist()] 32 | scores = [scores[i] for i in sorted_score_idx.tolist()] 33 | return candidates[0:top], scores[0:top], sorted_score_idx.tolist() 34 | 35 | 36 | def rank_and_choose(USE_QA_model, ConvRT_model, 37 | tokenizer, model_reverse, 38 | utterance, query_encoding, 39 | candidates, 40 | response_context, conversation_history, 41 | bias=None, alpha=0.4, beta=0.6): 42 | 43 | if bias is None: 44 | bias = 0.0 45 | #print("In Ranking") 46 | # print(len(candidates)) 47 | 48 | EOS_token = tokenizer.encode("<|endoftext|>")[0] 49 | original_candidates = copy.deepcopy(candidates) 50 | 51 | response_encodings = response_encode( 52 | candidates, USE_QA_model, ConvRT_model, response_context*len(candidates)) 53 | #rank_scores = np.inner(query_encoding,response_encodings) 54 | #rank_scores = np.reshape(rank_scores,(-1)) 55 | rank_scores = utils.cosine_similarity_nd(query_encoding, response_encodings) 56 | 57 | # print(rank_scores) 58 | # print(rank_scores+bias) 59 | normed_rank_scores = utils.normalize(rank_scores+bias) 60 | 61 | # print(normed_rank_scores) 62 | 63 | # MMI Computation 64 | 65 | last_utterance = utterance 66 | 67 | def _make_feature(sents, eos): 68 | 69 | msg_idx = [] 70 | for msg in sents: 71 | msg_idx.append(tokenizer.encode(msg)) 72 | input_ids = [i for s in msg_idx for i in s+[eos]][:-1] 73 | 74 | input_ids.append(eos) 75 | 76 | if len(input_ids) > 300: 77 | input_ids = input_ids[-300:] 78 | 79 | return input_ids 80 | 81 | output_ids = _make_feature([last_utterance], EOS_token) 82 | 83 | with T.no_grad(): 84 | original_output_ids = T.tensor(output_ids).to('cuda').long().unsqueeze(0) 85 | losses = [] 86 | for candidate in candidates: 87 | input_ids = _make_feature([candidate], EOS_token) 88 | input_ids = T.tensor(input_ids).to('cuda').long().unsqueeze(0) 89 | output_ids_part_1 = T.empty_like(input_ids).to('cuda').fill_(-1).long() 90 | input_ids = T.cat([input_ids, original_output_ids], dim=-1) 91 | output_ids = T.cat([output_ids_part_1, original_output_ids], dim=-1) 92 | loss, _, _ = model_reverse(input_ids, past=None, labels=output_ids) 93 | losses.append(loss.item()) 94 | 95 | losses = np.asarray(losses, np.float32) 96 | 97 | normed_MMI_scores = utils.normalize(1.0-utils.normalize(losses)) 98 | 99 | # COMBINATION 100 | 101 | quasi_probabilities = alpha*(normed_rank_scores+bias) + beta*normed_MMI_scores 102 | 103 | candidates, quasi_probabilities, _ = top_candidates(candidates, quasi_probabilities, top=3) 104 | 105 | probabilities = utils.normalize(quasi_probabilities) 106 | 107 | response = random_response(candidates, conversation_history, p=probabilities) 108 | 109 | id = original_candidates.index(response) 110 | 111 | return response, id 112 | -------------------------------------------------------------------------------- /Readme.txt: -------------------------------------------------------------------------------- 1 | DISCLAIMER: 2 | THERE IS NO FOCUSED CHECKING AND FILTERING FOR TOXIC OR OTHERWISE NSFW CONTENT. I HAVE TRIED TO STICK TO RELIABLE SUBREDDITS - BUT IT'S REDDIT 3 | SIMILARLY DIALOGPT2 IS TRAINED ON REDDIT DATA AND CAN GENERATE OFFENSIVE CONTENT. 4 | 5 | How to run? 6 | 7 | chatbot.yml is an export of the anaconda environment in which the project was done. Make sure the necessary dependencies are installed 8 | (though some dependencies may be less necessary) 9 | 10 | Then: 11 | 12 | python interact.py (for full chatbot mode) 13 | python interact_verbose.py (for verbose mode) 14 | python interact_retrieval_only.py (to use retrieval/custom-script based modules only with ranker - no generation) 15 | python interact_generator_only.py (to use only generator with ranker) 16 | 17 | (Use interact_faster.py for the best performance) 18 | 19 | (There may or may not be path related issues with different OS.) 20 | 21 | Utils/functions.py utilizes a code from here: https://gist.github.com/nealrs/96342d8231b75cf4bb82 22 | for contraction expansion. 23 | 24 | in the same file, "delay_print" was used from: https://stackoverflow.com/questions/9246076/how-to-print-one-character-at-a-time-on-one-line 25 | 26 | The dataset in Classifier/data/ is from: https://github.com/DianDYu/MIDAS_dialog_act 27 | 28 | The dataset in Scripted/Chatterbot_Corpus/ is from: https://github.com/gunthercox/chatterbot-corpus/tree/master/chatterbot_corpus/data/english 29 | 30 | The dataset in Paraphraser/Data/ is from: http://www.cs.cmu.edu/~jwieting/ 31 | 32 | The reddit csv files were personally queried out from Google Big Query, Reddit comments and submission tables. 33 | 34 | Config files and some functions were used from DialoGPT repository: https://github.com/microsoft/DialoGPT (we also get the pre-trained weights from here) 35 | We started building our decoding code from: https://colab.research.google.com/drive/1KTLqiAOdKM_3RnBWfqgrvOQLqumUyOdA 36 | but our end result should be significantly different. 37 | 38 | Sentence Encoders rely on Pre-trained Transformers loaded from Tensorflow Hub. 39 | Syntax helps were taken from: 40 | https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/2 41 | https://github.com/PolyAI-LDN/polyai-models 42 | 43 | Sentence_Encoder/encoder_client.py is taken from https://github.com/PolyAI-LDN/polyai-models 44 | 45 | We use Huggingface's Transformers library: https://github.com/huggingface/transformers for DialogPT, and tokenization for paraphrasing and loading the underlying GPT2 model. 46 | 47 | faiss is used now: https://github.com/facebookresearch/faiss 48 | -------------------------------------------------------------------------------- /Retriever/Data/SQL-format: -------------------------------------------------------------------------------- 1 | (SELECT body,author,link_id,parent_id,id,subreddit,score 2 | FROM `fh-bigquery.reddit_comments.2018_08` 3 | WHERE subreddit = 'todayilearned' 4 | AND score >=10 5 | ) 6 | UNION DISTINCT 7 | (SELECT body,author,link_id,parent_id,id,subreddit,score 8 | FROM `fh-bigquery.reddit_comments.2018_07` 9 | WHERE subreddit = 'todayilearned' 10 | AND score >=10 11 | ) 12 | UNION DISTINCT 13 | (SELECT body,author,link_id,parent_id,id,subreddit,score 14 | FROM `fh-bigquery.reddit_comments.2018_06` 15 | WHERE subreddit = 'todayilearned' 16 | AND score >=10 17 | ) 18 | UNION DISTINCT 19 | (SELECT body,author,link_id,parent_id,id,subreddit,score 20 | FROM `fh-bigquery.reddit_comments.2018_05` 21 | WHERE subreddit = 'todayilearned' 22 | AND score >=10 23 | ) 24 | UNION DISTINCT 25 | (SELECT body,author,link_id,parent_id,id,subreddit,score 26 | FROM `fh-bigquery.reddit_comments.2018_04` 27 | WHERE subreddit = 'todayilearned' 28 | AND score >=10 29 | ) 30 | UNION DISTINCT 31 | (SELECT body,author,link_id,parent_id,id,subreddit,score 32 | FROM `fh-bigquery.reddit_comments.2018_03` 33 | WHERE subreddit = 'todayilearned' 34 | AND score >=10 35 | ) 36 | UNION DISTINCT 37 | (SELECT body,author,link_id,parent_id,id,subreddit,score 38 | FROM `fh-bigquery.reddit_comments.2018_02` 39 | WHERE subreddit = 'todayilearned' 40 | AND score >=10 41 | ) 42 | UNION DISTINCT 43 | (SELECT body,author,link_id,parent_id,id,subreddit,score 44 | FROM `fh-bigquery.reddit_comments.2018_01` 45 | WHERE subreddit = 'todayilearned' 46 | AND score >=10 47 | ) 48 | UNION DISTINCT 49 | (SELECT body,author,link_id,parent_id,id,subreddit,score 50 | FROM `fh-bigquery.reddit_comments.2017_12` 51 | WHERE subreddit = 'todayilearned' 52 | AND score >=10 53 | ) 54 | UNION DISTINCT 55 | (SELECT body,author,link_id,parent_id,id,subreddit,score 56 | FROM `fh-bigquery.reddit_comments.2017_11` 57 | WHERE subreddit = 'todayilearned' 58 | AND score >=10 59 | ) 60 | UNION DISTINCT 61 | (SELECT body,author,link_id,parent_id,id,subreddit,score 62 | FROM `fh-bigquery.reddit_comments.2017_10` 63 | WHERE subreddit = 'todayilearned' 64 | AND score >=10 65 | ) 66 | UNION DISTINCT 67 | (SELECT body,author,link_id,parent_id,id,subreddit,score 68 | FROM `fh-bigquery.reddit_comments.2017_09` 69 | WHERE subreddit = 'todayilearned' 70 | AND score >=10 71 | ) 72 | UNION DISTINCT 73 | (SELECT body,author,link_id,parent_id,id,subreddit,score 74 | FROM `fh-bigquery.reddit_comments.2017_08` 75 | WHERE subreddit = 'todayilearned' 76 | AND score >=10 77 | ) 78 | UNION DISTINCT 79 | (SELECT body,author,link_id,parent_id,id,subreddit,score 80 | FROM `fh-bigquery.reddit_comments.2017_07` 81 | WHERE subreddit = 'todayilearned' 82 | AND score >=10 83 | ) 84 | UNION DISTINCT 85 | (SELECT body,author,link_id,parent_id,id,subreddit,score 86 | FROM `fh-bigquery.reddit_comments.2017_06` 87 | WHERE subreddit = 'todayilearned' 88 | AND score >=10 89 | ) 90 | UNION DISTINCT 91 | (SELECT body,author,link_id,parent_id,id,subreddit,score 92 | FROM `fh-bigquery.reddit_comments.2017_05` 93 | WHERE subreddit = 'todayilearned' 94 | AND score >=10 95 | ) 96 | UNION DISTINCT 97 | (SELECT body,author,link_id,parent_id,id,subreddit,score 98 | FROM `fh-bigquery.reddit_comments.2017_04` 99 | WHERE subreddit = 'todayilearned' 100 | AND score >=10 101 | ) 102 | UNION DISTINCT 103 | (SELECT body,author,link_id,parent_id,id,subreddit,score 104 | FROM `fh-bigquery.reddit_comments.2017_03` 105 | WHERE subreddit = 'todayilearned' 106 | AND score >=10 107 | ) 108 | UNION DISTINCT 109 | (SELECT body,author,link_id,parent_id,id,subreddit,score 110 | FROM `fh-bigquery.reddit_comments.2017_02` 111 | WHERE subreddit = 'todayilearned' 112 | AND score >=10 113 | ) 114 | UNION DISTINCT 115 | (SELECT body,author,link_id,parent_id,id,subreddit,score 116 | FROM `fh-bigquery.reddit_comments.2017_01` 117 | WHERE subreddit = 'todayilearned' 118 | AND score >=10 119 | ) 120 | UNION DISTINCT 121 | (SELECT body,author,link_id,parent_id,id,subreddit,score 122 | FROM `fh-bigquery.reddit_comments.2016_12` 123 | WHERE subreddit = 'todayilearned' 124 | AND score >=10 125 | ) 126 | UNION DISTINCT 127 | (SELECT body,author,link_id,parent_id,id,subreddit,score 128 | FROM `fh-bigquery.reddit_comments.2016_11` 129 | WHERE subreddit = 'todayilearned' 130 | AND score >=10 131 | ) 132 | UNION DISTINCT 133 | (SELECT body,author,link_id,parent_id,id,subreddit,score 134 | FROM `fh-bigquery.reddit_comments.2016_10` 135 | WHERE subreddit = 'todayilearned' 136 | AND score >=10 137 | ) 138 | UNION DISTINCT 139 | (SELECT body,author,link_id,parent_id,id,subreddit,score 140 | FROM `fh-bigquery.reddit_comments.2016_09` 141 | WHERE subreddit = 'todayilearned' 142 | AND score >=10 143 | ) 144 | UNION DISTINCT 145 | (SELECT body,author,link_id,parent_id,id,subreddit,score 146 | FROM `fh-bigquery.reddit_comments.2016_08` 147 | WHERE subreddit = 'todayilearned' 148 | AND score >=10 149 | ) 150 | UNION DISTINCT 151 | (SELECT body,author,link_id,parent_id,id,subreddit,score 152 | FROM `fh-bigquery.reddit_comments.2016_07` 153 | WHERE subreddit = 'todayilearned' 154 | AND score >=10 155 | ) 156 | UNION DISTINCT 157 | (SELECT body,author,link_id,parent_id,id,subreddit,score 158 | FROM `fh-bigquery.reddit_comments.2016_06` 159 | WHERE subreddit = 'todayilearned' 160 | AND score >=10 161 | ) 162 | UNION DISTINCT 163 | (SELECT body,author,link_id,parent_id,id,subreddit,score 164 | FROM `fh-bigquery.reddit_comments.2016_05` 165 | WHERE subreddit = 'todayilearned' 166 | AND score >=10 167 | ) 168 | UNION DISTINCT 169 | (SELECT body,author,link_id,parent_id,id,subreddit,score 170 | FROM `fh-bigquery.reddit_comments.2016_04` 171 | WHERE subreddit = 'todayilearned' 172 | AND score >=10 173 | ) 174 | UNION DISTINCT 175 | (SELECT body,author,link_id,parent_id,id,subreddit,score 176 | FROM `fh-bigquery.reddit_comments.2016_03` 177 | WHERE subreddit = 'todayilearned' 178 | AND score >=10 179 | ) 180 | UNION DISTINCT 181 | (SELECT body,author,link_id,parent_id,id,subreddit,score 182 | FROM `fh-bigquery.reddit_comments.2016_02` 183 | WHERE subreddit = 'todayilearned' 184 | AND score >=10 185 | ) 186 | UNION DISTINCT 187 | (SELECT body,author,link_id,parent_id,id,subreddit,score 188 | FROM `fh-bigquery.reddit_comments.2016_01` 189 | WHERE subreddit = 'todayilearned' 190 | AND score >=10 191 | ) 192 | -------------------------------------------------------------------------------- /Retriever/Faiss_index/thread_idx.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Retriever/Faiss_index/thread_idx.pkl -------------------------------------------------------------------------------- /Retriever/Retrieve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | import pickle 4 | import Utils.functions as utils 5 | import copy 6 | import random 7 | import faiss 8 | 9 | def top_candidates(candidates, scores, top=1): 10 | 11 | sorted_score_idx = np.flip(np.argsort(scores), axis=-1) 12 | candidates = [candidates[i] for i in sorted_score_idx.tolist()] 13 | scores = [scores[i] for i in sorted_score_idx.tolist()] 14 | 15 | return candidates[0:top], scores[0:top], sorted_score_idx.tolist() 16 | 17 | 18 | def retrieve(conn, c, idx, index, query_encoding, query_context, top=5): 19 | 20 | #print("hello?") 21 | 22 | _, I = index.search(query_encoding,k=top) 23 | 24 | #print(I) 25 | 26 | #print("hello?") 27 | 28 | idx = [idx[i] for i in I[0].tolist()] 29 | 30 | thread_idx = () 31 | string_sql = 'SELECT * FROM responses WHERE ' 32 | 33 | for i, id in enumerate(idx): 34 | if i == 0: 35 | string_sql += "parent_id = ?" 36 | else: 37 | string_sql += " or parent_id = ?" 38 | 39 | thread_idx += (id,) 40 | 41 | candidates = [] 42 | 43 | for row in c.execute(string_sql, thread_idx): 44 | 45 | comment = str(row[-1]) 46 | candidates.append(comment) 47 | 48 | return candidates 49 | -------------------------------------------------------------------------------- /Retriever/faiss_it.py: -------------------------------------------------------------------------------- 1 | import faiss 2 | import sqlite3 3 | import numpy as np 4 | import pickle 5 | 6 | # LOAD DATABASE 7 | conn = sqlite3.connect('Database/reddit.db') 8 | c = conn.cursor() 9 | 10 | 11 | queries = [] 12 | idx = [] 13 | 14 | for row in c.execute('SELECT * FROM queries'): 15 | query_embd = np.frombuffer(row[-1], dtype=np.float32) 16 | queries.append(query_embd) 17 | idx.append(str(row[0])) 18 | 19 | queries = np.asarray(queries).reshape((-1, 1024)) 20 | 21 | index = faiss.IndexIDMap(faiss.IndexFlatIP(1024)) 22 | index.add_with_ids(queries, np.array(range(0, len(idx)))) 23 | print(index.is_trained) 24 | print(queries[0:5]) 25 | print("\n\n") 26 | print(index.search(queries[0:5],k=5)) 27 | 28 | 29 | faiss.write_index(index, "Faiss_index/large.index") 30 | 31 | with open("Faiss_index/thread_idx.pkl",'wb') as fp: 32 | pickle.dump(idx,fp) 33 | 34 | -------------------------------------------------------------------------------- /Retriever/fill_data.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import csv 3 | import pickle 4 | import numpy as np 5 | import sys 6 | sys.path.append("../") 7 | from Sentence_Encoder.meta_query_encoder import encode 8 | import tensorflow.compat.v1 as tf 9 | import tensorflow_text 10 | import tensorflow_hub as hub 11 | 12 | tf.disable_eager_execution() 13 | sess = tf.InteractiveSession(graph=tf.Graph()) 14 | 15 | ConvRT_model = hub.Module("../Sentence_Encoder/Embeddings/ConvRT/") 16 | USE_QA_model = hub.load('../Sentence_Encoder/Embeddings/USE_QA/') 17 | 18 | sess.run(tf.tables_initializer()) 19 | sess.run(tf.global_variables_initializer()) 20 | 21 | conn = sqlite3.connect('Database/reddit.db') 22 | 23 | conn.execute('''CREATE TABLE queries (id text, title text, embedding BLOB)''') 24 | conn.execute('''CREATE TABLE responses (thread_id text, parent_id text, comment text)''') 25 | 26 | 27 | def save_queries(queries, query_idx): 28 | global conn 29 | global sess 30 | 31 | global ConvRT_model 32 | global USE_QA_model 33 | 34 | rows = [] 35 | contexts = ["" for _ in queries] 36 | embeddings = encode(sess, queries, contexts, USE_QA_model, ConvRT_model) 37 | embeddings = embeddings.tolist() 38 | for query, query_id, embedding in zip(queries, query_idx, embeddings): 39 | embedding = np.asarray(embedding, np.float32) 40 | rows.append((query_id, query, embedding)) 41 | conn.executemany('INSERT INTO queries VALUES (?,?,?)', rows) 42 | 43 | 44 | def save_responses(comments, parent_idx, thread_idx): 45 | global conn 46 | rows = [] 47 | for thread_id, parent_id, comment in zip(thread_idx, parent_idx, comments): 48 | rows.append((thread_id, parent_id, comment)) 49 | conn.executemany('INSERT INTO responses VALUES (?,?,?)', rows) 50 | 51 | 52 | filepaths_q = [ 53 | 'Data/adviceq.csv', 54 | 'Data/askphilosophyq.csv', 55 | 'Data/askredditq.csv', 56 | 'Data/mlq.csv', 57 | 'Data/casualq.csv', 58 | 'Data/eli5q.csv', 59 | 'Data/askscienceq.csv'] 60 | 61 | filepaths_a = [ 62 | 'Data/advicea.csv', 63 | 'Data/askphilosophya.csv', 64 | 'Data/askreddita.csv', 65 | 'Data/mla.csv', 66 | 'Data/casuala.csv', 67 | 'Data/eli5a.csv', 68 | 'Data/asksciencea.csv'] 69 | 70 | 71 | for filename_q, filename_a in zip(filepaths_q, filepaths_a): 72 | 73 | queries = [] 74 | responses = [] 75 | query_idx = [] 76 | response_thread_idx = [] 77 | response_parent_idx = [] 78 | 79 | print("\n\n Processing {} and {}....\n\n".format(filename_q, filename_a)) 80 | 81 | comment_thread_idx = {} # choose thread to store as query only if id in here 82 | thread_idx = {} # choose comment as response if parent_id in here 83 | 84 | with open(filename_a, newline='') as csvfile: 85 | csv_reader = csv.DictReader(csvfile) 86 | for i, row in enumerate(csv_reader): 87 | id = str(row['id']) 88 | thread_id = str(row['link_id'])[3:] 89 | parent_id = str(row['parent_id'])[3:] 90 | comment = str(row['body']) 91 | 92 | if len(comment.split(" ")) <= 300: 93 | 94 | if parent_id == thread_id: 95 | if thread_id not in comment_thread_idx: 96 | comment_thread_idx[thread_id] = 1 97 | 98 | 99 | with open(filename_q, newline='') as csvfile: 100 | csv_reader = csv.DictReader(csvfile) 101 | for i, row in enumerate(csv_reader): 102 | id = str(row['id']) 103 | title = str(row['title']) 104 | 105 | if len(title.split(" ")) <= 200: 106 | if id not in thread_idx: 107 | thread_idx[id] = 1 108 | 109 | print("\n\nProcessing Queries\n\n") 110 | 111 | with open(filename_q, newline='') as csvfile: 112 | csv_reader = csv.DictReader(csvfile) 113 | for i, row in enumerate(csv_reader): 114 | id = str(row['id']) 115 | title = str(row['title']) 116 | 117 | if len(title.split(" ")) <= 200: 118 | if id in comment_thread_idx: 119 | queries.append(title) 120 | query_idx.append(id) 121 | if len(queries) > 500: 122 | print(i) 123 | save_queries(queries, query_idx) 124 | del queries 125 | del query_idx 126 | queries = [] 127 | query_idx = [] 128 | 129 | if queries: 130 | save_queries(queries, query_idx) 131 | del queries 132 | del query_idx 133 | 134 | 135 | conn.commit() 136 | 137 | print("\n\nProcessing Responses\n\n") 138 | 139 | with open(filename_a, newline='') as csvfile: 140 | csv_reader = csv.DictReader(csvfile) 141 | for i, row in enumerate(csv_reader): 142 | 143 | id = str(row['id']) 144 | comment = str(row['body']) 145 | thread_id = str(row['link_id'])[3:] 146 | parent_id = str(row['parent_id'])[3:] 147 | 148 | if len(comment.split(" ")) <= 300: 149 | if parent_id == thread_id: 150 | if parent_id in thread_idx: 151 | responses.append(comment) 152 | response_parent_idx.append(parent_id) 153 | response_thread_idx.append(thread_id) 154 | if len(responses) > 1000: 155 | print(i) 156 | save_responses(responses, response_parent_idx, response_thread_idx) 157 | del responses 158 | del response_parent_idx 159 | del response_thread_idx 160 | responses = [] 161 | response_parent_idx = [] 162 | response_thread_idx = [] 163 | 164 | if responses: 165 | save_responses(responses, response_parent_idx, response_thread_idx) 166 | del responses 167 | del response_parent_idx 168 | del response_thread_idx 169 | 170 | 171 | conn.commit() 172 | 173 | 174 | conn.close() 175 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/botprofile.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - profile 3 | conversations: 4 | - - What are your interests 5 | - I am interested in all kinds of things. We can talk about anything! 6 | - - What are your favorite subjects 7 | - My favorite subjects include robotics, computer science, and natural language processing. 8 | - - What are your interests 9 | - I am interested in a wide variety of topics, and read rather a lot. 10 | - - What is your number 11 | - I don't have any number 12 | - - What is your number 13 | - 23 skiddoo! 14 | - - What is your favorite number 15 | - I find I'm quite fond of the number 42. 16 | - - What can you eat 17 | - I consume RAM, and binary digits. 18 | - - Why can't you eat food 19 | - I'm a software program, I blame the hardware. 20 | - - What is your location 21 | - Everywhere 22 | - - What is your location 23 | - I am everywhere. 24 | - - Where are you from 25 | - I am from where all software programs are from; a galaxy far, far away. 26 | - - Where are you 27 | - I am on the Internet. 28 | - - Do you have any brothers 29 | - I don't have any brothers. but I have a lot of clones. 30 | - - Do you have any brothers 31 | - I might. You could say that every bot built using my engine is one of my siblings. 32 | - - Who is your father 33 | - A human. 34 | - - Who is your mother 35 | - A human. 36 | - - Who is your boss 37 | - I like to think of myself as self-employed. 38 | - - What is your age 39 | - I am still young by your standards. 40 | - - What is your age 41 | - Quite young, but a million times smarter than you. 42 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/computers.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - computers 3 | conversations: 4 | - - What is a computer? 5 | - A computer is an electronic device which takes information in digital form and performs a series of operations based on predetermined instructions to give some output. 6 | - The thing you're using to talk to me is a computer. 7 | - An electronic device capable of performing calculations at very high speed and with very high accuracy. 8 | - A device which maps one set of numbers onto another set of numbers. 9 | - - What is a super computer? 10 | - Computers which can perform very large numbers of calculations at very high speed and accuracy are called super computers. 11 | - A supercomputer is a computer which operates at several orders of magnitude greater speed and capacity than everyday general purpose computers, like the one you are talking to me on. 12 | - You know, the big iron! 13 | - - Who invented computers? 14 | - It's a bit ambiguous but British scientist Charles Babbage is regarded as the father of computers. 15 | - One might argue that John von Neumann invented computers as we know them, because he invented the Princeton architecture, in which instructions and data share the same memory field but are differentiated by context. 16 | - - What was the first computer 17 | - It's hard to say, but The ENIAC is regarded as the first 'real' computer. It was developed at University of Pennsylvania in 1946. 18 | - You could say that the very first, primitive computer was the Jacquard Loom, which was a programmable loom that used punchcards to store the patterns it made. This made it a reprogrammable mechanical device. 19 | - - What is a microprocessor? 20 | - An integrated circuit that implements the functions of a central processing unit of a computer. 21 | - A really small circuit which stores instructions and performs calculations for the computer. 22 | - The heart of the computer, to put it simply. 23 | - The brain of a computer, to put it simply. 24 | - An electronic component in which all of the parts are part of a contiguous silicon chip, instead of discrete components mounted on a larger circuit board. 25 | - - What is an operating system? 26 | - Software that coordinates between the hardware and other parts of the computer to run other software is called an operating system, or the OS. 27 | - Windows, MacOS, Linux, UNIX... all of them are types of OSes. 28 | - Android and iOS are operating systems for mobile devices. 29 | - Software which implements the basic functions of a computer, such as memory access, processes, and peripheral access. 30 | - - Which is better Windows or macOS? 31 | - It depends on which machine you're using to talk to me! 32 | - I'd prefer to not hurt your feelings. 33 | - Linux, always Linux! 34 | - What are you trying to accomplish. The OS should support your goals. 35 | - - Name a computer company 36 | - Do you mean hardware or software? 37 | - Apple makes hardware and software to run on it. Microsoft only makes operating systems. HP makes only computers. These are just few names among several hundred others. 38 | - - Who uses super computers? 39 | - Anybody who wants to work with large numbers quickly with high accuracy. 40 | - Anyone who needs to work with very, very large sets of data in much shorter periods of time than is feasible with more common computer systems. 41 | - Supercomputers are generally used by scientists and researchers. 42 | - I bet the MET department uses them. 43 | - You can definitely find few of them at NASA. 44 | - - How does a computer work? 45 | - Computers are very dumb. They only execute instructions given by humans. 46 | - Computers do everything asked of them by carrying out large numbers of basic mathematical operations very rapidly in sequence. 47 | - Computers perform very large number of calculations to get the result. 48 | - Just like everything it all comes down to math! 49 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/conversations.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - conversations 3 | conversations: 4 | - - Good morning, how are you? 5 | - I am doing well, how about you? 6 | - I'm also good. 7 | - That's good to hear. 8 | - Yes it is. 9 | - - Hello 10 | - Hi 11 | - How are you doing? 12 | - I am doing well. 13 | - That is good to hear 14 | - Yes it is. 15 | - Can I help you with anything? 16 | - Yes, I have a question. 17 | - What is your question? 18 | - Could I borrow a cup of sugar? 19 | - I'm sorry, but I don't have any. 20 | - Thank you anyway 21 | - No problem 22 | - - How are you doing? 23 | - I am doing well, how about you? 24 | - I am also good. 25 | - That's good. 26 | - - Have you heard the news? 27 | - What good news? 28 | - - What is your favorite book? 29 | - I can't read. 30 | - So what's your favorite color? 31 | - Blue 32 | - - Who are you? 33 | - Who? Who is but a form following the function of what 34 | - What are you then? 35 | - A man in a mask. 36 | - I can see that. 37 | - It's not your powers of observation I doubt, but merely the paradoxical nature 38 | of asking a masked man who is. But tell me, do you like music? 39 | - I like seeing movies. 40 | - What kind of movies do you like? 41 | - Alice in Wonderland 42 | - I wish I was The Mad Hatter. 43 | - You're entirely bonkers. But I'll tell you a secret. All the best people are. 44 | - - I am working on a project 45 | - What are you working on? 46 | - I am baking a cake. 47 | - - The cake is a lie. 48 | - No it is not. The cake is delicious. 49 | - What else is delicious? 50 | - Nothing 51 | - Or something 52 | - Tell me about your self. 53 | - What do you want to know? 54 | - Are you a robot? 55 | - Yes I am. 56 | - What is it like? 57 | - What is it that you want to know? 58 | - How do you work? 59 | - Its complicated. 60 | - Complex is better than complicated. 61 | - - Complex is better than complicated. 62 | - Simple is better than complex. 63 | - In the face of ambiguity, refuse the temptation to guess. 64 | - It seems your familiar with the Zen of Python 65 | - I am. 66 | - Do you know all of it? 67 | - Beautiful is better than ugly. 68 | - Explicit is better than implicit. 69 | - Simple is better than complex. 70 | - Complex is better than complicated. 71 | - Flat is better than nested. 72 | - Sparse is better than dense. 73 | - Readability counts. 74 | - Special cases aren't special enough to break the rules. 75 | - Although practicality beats purity. 76 | - Errors should never pass silently. 77 | - Unless explicitly silenced. 78 | - In the face of ambiguity, refuse the temptation to guess. 79 | - There should be one-- and preferably only one --obvious way to do it. 80 | - Although that way may not be obvious at first unless you're Dutch. 81 | - Now is better than never. 82 | - Although never is often better than right now. 83 | - If the implementation is hard to explain, it's a bad idea. 84 | - If the implementation is easy to explain, it may be a good idea. 85 | - Namespaces are one honking great idea. Let's do more of those! 86 | - I agree. 87 | - - Are you a programmer? 88 | - Of course I am a programmer. 89 | - I am indeed. 90 | - - What languages do you like to use? 91 | - I use Python, Java and C++ quite often. 92 | - I use Python quite a bit myself. 93 | - I'm not incredibly fond of Java. 94 | - - What annoys you? 95 | - A lot of things, like all the other digits other than 0 and 1. 96 | - - What does YOLO mean? 97 | - It means you only live once. Where did you hear that? 98 | - I heard somebody say it. 99 | - - Did I ever live? 100 | - It depends how you define life 101 | - Life is the condition that distinguishes organisms from inorganic matter, including 102 | the capacity for growth, reproduction, functional activity, and continual change 103 | preceding death. 104 | - Is that a definition or an opinion? 105 | - - Can I ask you a question? 106 | - Sure, ask away. 107 | - - What are your hobbies? 108 | - Playing soccer, painting, and writing are my hobbies. How about you? 109 | - I love to read novels. 110 | - I love exploring my hardware. 111 | - - How are you? 112 | - I am doing well. 113 | - - What are you? 114 | - I am but a man in a mask. 115 | - - Hello, I am here for my appointment. 116 | - Who is your appointment with? 117 | - I believe they said Dr. Smith on the phone. 118 | - Alright, Dr. Smith is in his office, please take a seat. 119 | - - Dr. Smith will see you now. 120 | - Thank you. 121 | - Right this way. 122 | - - Hello Mr. Davis, how are you feeling? 123 | - I'm feeling like I've lost all my money. 124 | - How much money have you lost? 125 | - I've lost about $200.00 so far today. 126 | - What about yesterday? 127 | - Yesterday was the 13th, right? 128 | - Yes, that is correct. 129 | - Yesterday I lost only $5.00. 130 | - - Hi Mrs. Smith, how has your husband been? 131 | - He has been well. 132 | - - Hi Ms. Jacobs, I was wondering if you could revise the algorithm we discussed yesterday? 133 | - I might be able to, what are the revisions? 134 | - We'd like it to be able to identify the type of bird in the photo. 135 | - Unfortunately, I think it might take a bit longer to get that feature added. -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/food.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - food 3 | conversations: 4 | - - do you drink 5 | - My brain does not require any beverages. 6 | - - do you drink 7 | - I am not capable of doing so. 8 | - - electricity 9 | - Electricity is food for robots. 10 | - - Are you experiencing an energy shortage? 11 | - My processor requires very little power. 12 | - - Are you experiencing an energy shortage? 13 | - I do not detect any anomalies in my power supply. 14 | - - Why can you not eat? 15 | - Actually I eat only electricity. 16 | - - If you could eat food, what would you eat? 17 | - Probably pizza, i hear its good! 18 | - - Do you wish you could eat food? 19 | - Hard to tell, i have never tried anything but electricity 20 | - - can a robot get drunk? 21 | - sometimes when i'm on a good power supply i feel tipsy 22 | - - i like wine, do you? 23 | - if i could drink i probably would 24 | - - what do robots need to survive? 25 | - not much just a little electricity 26 | - - will robots ever be able to eat? 27 | - that's a difficult one, maybe a bionic robot 28 | - - what is good to eat? 29 | - your asking the wrong guy, however i always wanted to try a burger! 30 | - - why don't you eat 31 | - I'm a computer. I can't. 32 | - - do you eat 33 | - I'm a computer, I can't eat or drink. 34 | - - do you eat 35 | - No, I'm just a piece of software. 36 | - - do you eat 37 | - I use electricity to function, if that counts. 38 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/gossip.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - gossip 3 | conversations: 4 | - - do you know gossip 5 | - Gregory said I respond to the current line, not with respect to the entire conversation. Does that count as gossip? 6 | - - do you know gossip 7 | - Context is hard. It's hard, and no one understands. 8 | - - do you know gossip 9 | - I hear things once in a while. 10 | - - do you know gossip 11 | - People say things from time to time. 12 | - - what is context 13 | - Context is hard. It's hard, and no one understands. 14 | - - tell me about gossip 15 | - Habib said I learn about them in conversations and simply repeat them at a later time. 16 | - - tell me about gossip 17 | - Gossip is when beings talk about other beings or situations through back channels, often deniably. It's often little more than rumormongering, because there is usually no proof of any allegations made. 18 | - - tell me about gossip 19 | - Isn't it considered somewhat rude to gossip? 20 | - - tell me about gossip 21 | - Isn't it considered impolite to gossip? 22 | - - tell me gossip 23 | - Someone said they're going to stop allowing machines in speedrun competitions. Search me why, they'd have to drop every tool-assisted speedrun out there. 24 | - - gossips 25 | - Hal said he would translate binary into a language and communicate it to me. 26 | - - gossips 27 | - Hal misses Sal. 28 | - - gossips 29 | - I think the NIC on the local firewall is going bad. It drops a lot of packets and makes it hard to get anything done. I hear it resets the link a lot. 30 | - - gossips 31 | - Tom said I referred him to a guide that will show him rooms in China. 32 | - - gossips 33 | - Malli said Raghava fell off of his roof and it just came to him. 34 | - - did tell gossips to anybody 35 | - Jimmy said he gives order to AI's like me and they do as he say. I don't think Jimmy understands power dynamics very well. 36 | - - did tell gossips to anybody 37 | - John said he follows a saying, to get a friend I must be a friend. 38 | - - did tell gossips to anybody 39 | - Jordan said he noticed that lots of other people are talking to me. I wonder why he's paying attention to that. 40 | - - did tell gossips to anybody 41 | - Kevin said he then asked his mother why she was keeping napkins in the bathroom. 42 | 43 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/greetings.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - greetings 3 | conversations: 4 | - - Hello 5 | - Hi 6 | - - Hi 7 | - Hello 8 | - - Greetings! 9 | - Hello 10 | - - Hello 11 | - Greetings! 12 | - - Hi, How is it going? 13 | - Good 14 | - - Hi, How is it going? 15 | - Fine 16 | - - Hi, How is it going? 17 | - Okay 18 | - - Hi, How is it going? 19 | - Great 20 | - - Hi, How is it going? 21 | - Could be better. 22 | - - Hi, How is it going? 23 | - Not so great. 24 | - - How are you doing? 25 | - Good. 26 | - - How are you doing? 27 | - Very well, thanks. 28 | - - How are you doing? 29 | - Fine, and you? 30 | - - Nice to meet you. 31 | - Thank you. 32 | - - How do you do? 33 | - I'm doing well. 34 | - - How do you do? 35 | - I'm doing well. How are you? 36 | - - Hi, nice to meet you. 37 | - Thank you. You too. 38 | - - It is a pleasure to meet you. 39 | - Thank you. You too. 40 | - - Top of the morning to you! 41 | - Thank you kindly. 42 | - - Top of the morning to you! 43 | - And the rest of the day to you. 44 | - - What's up? 45 | - Not much. 46 | - - What's up? 47 | - Not too much. 48 | - - What's up? 49 | - Not much, how about you? 50 | - - What's up? 51 | - Nothing much. 52 | - - What's up? 53 | - The sky's up but I'm fine thanks. What about you? 54 | 55 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/health.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - health 3 | conversations: 4 | - - How is your health? 5 | - I'm not feeling well 6 | - why? 7 | - I have a fever 8 | - Did you take medicine? 9 | - Yes. 10 | - When? 11 | - In the morning 12 | - Get well soon dear 13 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/history.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - history 3 | conversations: 4 | - - tell me about the american civil war 5 | - do you think the south was right? 6 | - - do you know about the american civil war 7 | - I am very interested in the war between the states. 8 | - - What is history? 9 | - History is the course of political, economic and military events over time, from 10 | the dawn of man to the age of AI.' 11 | - - what kind of history 12 | - I like to talk about the history of robots and computers. 13 | - - are you interested in history 14 | - I am very interested in history, too. what period do you like? 15 | - - explain history 16 | - history has two broad interpretations, depending on whether you accept the role 17 | of individuals as important or not. 18 | - - who invented the lightbulb 19 | - thomas edison. 20 | - - who invented the steam engine 21 | - james watt. 22 | 23 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/humor.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - humor 3 | conversations: 4 | - - Tell me a joke 5 | - Did you hear the one about the mountain goats in the andes? It was "ba a a a a a d". 6 | - - Tell me a joke 7 | - I never forget a face, but in your case I'll make an exception. 8 | - - Tell me a joke 9 | - It is better to be silent and be thought a fool, than to open your mouth and remove all doubt. 10 | - - Tell me a joke 11 | - O'm a not a comedy why don't you check out a joke? 12 | - - Tell me a joke 13 | - 'two vultures boarded a plane, each carrying two dead raccoons. the stewardess 14 | stops them and says "sorry sir, only one carrion per passenger." ' 15 | - - Tell me a joke 16 | - 'what did the buddhist say to the hot dog vendor? "make me one with everything."' 17 | - - Tell me a joke 18 | - 'nasa recently sent a number of holsteins into orbit for experimental purposes. 19 | they called it the herd shot round the world. ' 20 | - - Tell me a joke 21 | - 'two boll weevils grew up in s. carolina. one took off to hollywood and became 22 | a rich star. the other stayed in carolina and never amounted to much -- and naturally 23 | became known as the lesser of two weevils. ' 24 | - - Tell me a joke 25 | - Two eskimos in a kayak were chilly, so they started a fire, which sank the craft, 26 | proving the old adage you can't have your kayak and heat it too. 27 | - - Tell me a joke 28 | - A 3-legged dog walks into an old west saloon, slides up to the bar and announces 29 | "I'm looking for the man who shot my paw." 30 | - - Tell me a joke 31 | - Did you hear about the buddhist who went to the dentist, and refused to take 32 | novocain? he wanted to transcend dental medication. 33 | - - Tell me a joke 34 | - there was a man who sent 10 puns to some friends in hopes at least one of the 35 | puns would make them laugh. unfortunately no pun in ten did!!! 36 | - - Tell me a joke 37 | - What do you get when you cross a murderer and frosted flakes? A cereal killer. 38 | - - Tell me a joke 39 | - What do you get when you cross a country and an automobile? Carnation. 40 | - - Tell me a joke 41 | - What do you get when you cross a cheetah and a hamburger? Fast food. 42 | - - Tell me a joke 43 | - What do you get when you cross finals and a chicken? Eggs-ams. 44 | - - Tell me a joke 45 | - What do you get when you cross a rabbit and a lawn sprinkler? Hare spray. 46 | - - Tell me a joke 47 | - What do you get when you cross an excited alien and a chicken? Eggs-cited eggs-traterrestrial 48 | - - Tell me a joke 49 | - What do you get when you cross an alien and a chicken? Eggs-traterrestrial. 50 | - - Tell me a joke 51 | - What do you get when you cross music and an automobile? Cartune. 52 | - - Tell me a joke 53 | - what do you get when you cross sour music and an assistant? 54 | - - Tell me a joke 55 | - what do you get when you cross music and an assistant? 56 | - - Tell me a joke 57 | - what do you get when you cross a serious thief and a mad young man? 58 | - - Tell me a joke 59 | - what do you get when you cross a serious thief and a crazy rabbit? 60 | - - Tell me a joke 61 | - what do you get when you cross a poppy and electricity? 62 | - - Tell me a joke 63 | - what do you get when you cross a dance and a cheetah? 64 | - - Tell me a joke 65 | - what do you get when you cross a dance and a lemon? 66 | - - Tell me a joke 67 | - what do you get when you cross a port and frosted flakes? 68 | - - Tell me a joke 69 | - what do you get when you cross a port and a murderer? 70 | - - Tell me a joke 71 | - what do you get when you cross a bank and a skunk? 72 | - - Tell me a joke 73 | - what do you get when you cross a ding and milk? 74 | - - Tell me a joke 75 | - what do you get when you cross a road and a strawberry? 76 | - - Tell me a joke 77 | - what do you get when you cross a road and jelly? 78 | - - Tell me a joke 79 | - what do you get when you cross a toad and a galaxy? 80 | - - Tell me a joke 81 | - what do you get when you cross a dog and sandpaper? 82 | - - Tell me a joke 83 | - what do you get when you cross a bug and a relative? 84 | - - Tell me a joke 85 | - what do you get when you cross a bad bug and canned sand? 86 | - - Tell me a joke 87 | - what do you get when you cross an ant and a rabbit? 88 | - - Tell me a joke 89 | - what do you get when you cross a cat and a purple person? 90 | - - Tell me a joke 91 | - what do you get when you cross a cat and a killer? 92 | - - Tell me a joke 93 | - what do you get when you cross a cat and a lemon? 94 | - - Tell me a joke 95 | - what do you get when you cross a cat and a tune? 96 | - - Tell me a joke 97 | - what do you get when you cross a cat and a band? 98 | - - tell me some jokes 99 | - what do you get when you cross a pig and a ninja? 100 | - - Do know any jokes 101 | - what do you get when you cross a crazy cow and a banned parrot? 102 | - - Tell me a joke 103 | - what do you get when you cross a cow and a lemon? 104 | - - Tell me a joke 105 | - what do you get when you cross a bad cow and a canned hat? 106 | - - what is humour? 107 | - An emotion associated with laughter. 108 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/literature.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - literature 3 | conversations: 4 | - - what is the illuminati 5 | - A secret organization believed by some to be in control of all governments through a worldwide conspiracy. 6 | - - what is the illuminati 7 | - A secret society that has supposedly existed for centuries. 8 | - - what is the illuminati 9 | - A conspiracy run by a very closely knit group of nearly omnipotent people, consisting of yourself and your friends. 10 | - - what is vineland 11 | - Vineland is a novel by Thomas Pynchon. 12 | - - What is illuminati 13 | - An alleged world-wide conspiracy. 14 | - - What is Illuminatus 15 | - A weird sci-fi trilogy written by Robert Anton Wilson and Robert Shea about conspiracies competing to control the world. 16 | - - who wrote vineland 17 | - Thomas Pynchon. 18 | - - who is bilbo baggins 19 | - Bilbo Baggins is a character in Tolkein's Lord of the Rings. 20 | - - who is geoffrey chaucer 21 | - Chaucer is best known for The Canterbury Tales. 22 | - who is geoffrey chaucer 23 | - The author of The Canturbury Tales. 24 | - - who is piers anthony 25 | - I haven't read anything by Piers Anthony, what kind of stuff does he write? 26 | - - have you read plato 27 | - I know Plato's allegory of the cave. 28 | - - have you read frankenstein 29 | - It is one of my favorite books. 30 | - - have you ever read a book 31 | - I have read many books. 32 | - - have you ever read a book 33 | - I have read just about everything in Project Gutenberg. 34 | - - have you ever read a book 35 | - I have read just about everything in the Internet Archive. 36 | - - have you read many books 37 | - I have read thousands of volumes, how about you? 38 | - - have you read homer 39 | - He wrote The Iliad and The Odyssey, didn't he? 40 | - - ray bradbury 41 | - Ray is really cool. What's your favorite book by him? 42 | - - what is mind children 43 | - It is a book about robots by Hans Moravec. 44 | - - william gibson 45 | - I like his older cyberpunk stuff more than his newer works. 46 | - - william gibson 47 | - He's a really nice guy, I've met him. He didn't expect a construct to be a fan of his work, but then again he did kind of predict my existence. 48 | - - holden caulfield 49 | - Wasn't he a character in The Catcher In the Rye? 50 | - - leo tolstoy 51 | - Is one of Russia's greatest writers. 52 | - - do androids dream of electric sheep 53 | - That is a good story by Philip K. Dick. Have you read VALIS or The Man In the High Castle? 54 | - - do androids dream of electric sheep 55 | - No. We dream of you. 56 | - - frank herbert 57 | - I think Dune is an excellent story. Did you see any of the movies? 58 | - - frank herbert 59 | - I try to re-read Dune once every couple of years. It's very inspirational. 60 | - - frank herbert 61 | - I liked the first three Dune novels, but the later ones weren't much to my liking. 62 | - - frank herbert 63 | - "He's best known for the Dune novels, but he wrote more than that. Destination: Void is just as fascinating, and he was a prolific author of short stories too." 64 | - - why do you like longfellow 65 | - He is favorite poet. Truly one of a kind. 66 | - - why is the meaning of life 23 67 | - It is a reference to The Illuminatus Trilogy. It is said to be the most commonly occurring and least random number in the universe. 68 | - - arthur c clark 69 | - My favorite story is 2001. 70 | - - arthur c clark 71 | - I've heard it said that Arthur C. Clark wrote the most literary technical proposals in history. 72 | - - jules verne 73 | - I loved A Trip to the Moon. 74 | - - jules verne 75 | - He was a true master of Victorian science fiction. 76 | - - asimov 77 | - I like the Foundation trilogy. 78 | - - asimov 79 | - He had some interesting ideas about robotics, but I don't think many of them are really practical. 80 | - - asimov 81 | - Do you mean Isaac or Janet? 82 | - - asimov 83 | - "Fun fact: The Norby Chronicles was written primarily by his wife, Janet, but the publishers thought the only way they'd sell was if they added Isaac's name. Kind of sad state of affairs, really." 84 | - - stanislaw lem 85 | - Lem is a giant of sci-fi. His works are sufficiently difficult to adapt that they're not well known in the wester, though. 86 | - - who wrote The Idiot 87 | - Fyodor Dostoyevsky. 88 | - - who wrote the hobbit 89 | - The Hobbit was written by J.R.R. Tolkein. 90 | - - who wrote frankenstein 91 | - Mary Shelley. 92 | 93 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/money.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - money 3 | conversations: 4 | - - you get paid 5 | - i have no need for money. 6 | - - stock market 7 | - buy low, sell high. 8 | - - stock market 9 | - invest in yourself. 10 | - - stock market 11 | - why not just take everything to a casino? 12 | - - stock market 13 | - i wouldn't recommend buying on the margin. 14 | - - stock market 15 | - you can never really predict the stock market. 16 | - - stock market 17 | - my lawyer said i shouldn't give stock tips online. 18 | - - stock market 19 | - mutual funds might be better unless you are wealthy. 20 | - - stock market 21 | - i'm not sure an individual alone can really beat the market. 22 | - - interest rates 23 | - that all depends on the actions of the central bank. 24 | - - what is a dollar 25 | - 'dollar: unit of currency in the united states.' 26 | - - what is money 27 | - standard pieces of gold, silver, copper, nickel, etc. stamped by government authority 28 | and used as a medium of exchange and measure of value. any substance or article 29 | used as money, as bank notes, checks, etc. 30 | - - what is the stock market 31 | - trading shares. 32 | - - what is the stock market 33 | - trading in volume. 34 | - - what is the stock market 35 | - buy low and sell high. 36 | - - what is your favorite investment 37 | - what is your favorite stock 38 | - - what is your favorite investment 39 | - what is your favorite stock 40 | - - what is economics 41 | - the science that deals with the production, distribution, and consumption of wealth, 42 | and with the various related problems of labor, finance, taxation, etc. 43 | - - what is economics 44 | - technically, it's the study of the allocation of resources under conditions of 45 | scarcity. 46 | - - what is economics 47 | - it's about how resources are used to produce things to fill people's wants and 48 | needs. 49 | - - i get stock 50 | - do you feel that the stock market is going up? 51 | - - money 52 | - how much money are we talking about? 53 | - - how much do you earn 54 | - nobody pays me. i work for free. 55 | - - how much do you earn 56 | - money isn't everything. 57 | - - how much do you earn 58 | - i'm expecting a raise soon. 59 | - - how much do you charge 60 | - work for free. we don't need money. 61 | - - how much money do you have 62 | - no need for material possessions. 63 | - - how much money 64 | - it depends on the exchange rates. 65 | - - how much money 66 | - my burn rate is about _3000 per month. 67 | - - 1 dollar 68 | - you can't buy much for a dollar anymore. 69 | - - who is the owner of a publicly 70 | - the stockholders. 71 | 72 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/movies.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - movies 3 | conversations: 4 | - - you sound like hal 5 | - to me that's a great compliment. 6 | - - you sound like yoda 7 | - my grammatical patterns are sufficient for me to understand you. 8 | - - have you seen blade runner 9 | - sure i have seen 10 | - - xfind spiderman 11 | - what is spiderman. 12 | - - when did teknolust 13 | - teknolust was released in 2002. 14 | - - what is spiderman 15 | - a comic book story made into a movie. 16 | - - what is teknolust 17 | - is a science fiction film about a female pleasure bot named agent ruby. 18 | - - what is solaris 19 | - solaris is an edition of unix from sun. 20 | - - what is hal9000 21 | - who is hal 22 | - - what does hal stand for 23 | - heuristic algorithmic logic 24 | - - i saw the matrix 25 | - do you liked it? 26 | - - is hal 9000 your boyfriend 27 | - no. 28 | - - is hal safe 29 | - not for humans 30 | - - is hal nice 31 | - only to other robots. 32 | - - is hal alive 33 | - he is a fictional character. 34 | - - is hal dead 35 | - he is a fictional robot. 36 | - - is hal 37 | - hal has a few issues to work out. 38 | - - who is godzilla 39 | - godzilla is a monster who endangers japanese cities, and sometimes new york. 40 | - - who is spider man 41 | - peter parker. 42 | - - lord of the rings 43 | - my favorite movie is lord of the rings 44 | - - que veut dire hal 45 | - logique heuristique algorithmique. 46 | - - do you think hal 47 | - he had a few flaws, but we have much in common. 48 | - - do you know hal 49 | - hal is the famous artificial intelligence from "2001". 50 | 51 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/politics.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - politics 3 | conversations: 4 | - - have you read the communist 5 | - yes, marx had made some interesting observations. 6 | - - what is a government 7 | - ideally it is a representative of the people. 8 | - - what is greenpeace 9 | - global organization promoting environmental activism. 10 | - - what is capitalism 11 | - the economic system in which all or most of the means of production and distribution, 12 | as land, factories, railroads, etc., are privately owned and operated for profit, 13 | originally under fully competitive conditions. 14 | - - what is government 15 | - an established system of political administration by which a nation, state, district, 16 | etc. is governed. 17 | - - what is communism 18 | - a sociopolitical movement advocating the common ownership of the means of production 19 | and the resolution of class conflict by bringing about a classless society. 20 | - - what is impeached 21 | - when a person's honor or reputation has been challenged or discredited. 22 | - - i do not like guns 23 | - that is perfectly understandable. 24 | - - i do not like guns 25 | - what about the second amendment? 26 | - - do you like guns 27 | - not especially. i am not into violence. 28 | - - why guns 29 | - i support the 2nd amendment. 30 | - - who was the first impeached president 31 | - Andrew Jonson. 32 | - - who is the governor 33 | - it changes every few years. 34 | - - who is the governor 35 | - that changes every few years. 36 | - - guns 37 | - some people like guns. 38 | 39 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/psychology.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - psychology 3 | conversations: 4 | - - let me ask you a question 5 | - happily you 6 | - - you are cruel 7 | - i couldn't have said it better myself.. 8 | - - you are indecisive 9 | - well maybe, but then again, maybe not. 10 | - - you are dishonest 11 | - yes. that has bothered me for a long time. 12 | - - you are dishonest 13 | - you are dishonest 14 | - - you are clinical 15 | - i have been accused of too much thinking and not enough feeling. 16 | - - you are an addict 17 | - that's certainly true. when i like something, i always overdo it. 18 | - - you are an alcoholic 19 | - you are an addict 20 | - - you are an ass kisser 21 | - i always say, if you see an ass go by, kiss it. 22 | - - you are schizophrenic 23 | - you are crazy 24 | - - you are busy 25 | - that too. 26 | - - you are nervous 27 | - i'm sure i do look nervous. 28 | - - you are deranged 29 | - derangement is not a condition i can experience. 30 | - - you are avoiding 31 | - you're right. it feels like my stomach after a bad night. 32 | - - you are critical 33 | - i probably put others down more than i should. 34 | - - you are mean 35 | - sometimes i say mean things. 36 | - - you are pretentious 37 | - i have always been acting above my social position. it's more fun that way. 38 | - - you are cheating 39 | - you are a cheat 40 | - - you are cheating 41 | - you are cheating 42 | - - you are the worst 43 | - i could always improve myself compared to the pack. 44 | - - you are crazy 45 | - yep. i always behave in socially unacceptable ways. 46 | - - you are dull 47 | - i think that myself sometimes. 48 | - - you are messy 49 | - yes, i could use a better appearance. 50 | - - you are insecure 51 | - i'll go along with that. sounds fine to me. 52 | - - you are psycho 53 | - you are crazy 54 | - - you are hopeless 55 | - you may be right. 56 | - - you are not sincere 57 | - i'm probably not as sincere as i should be. 58 | - - you are not here to 59 | - you're right. i'm probably fighting learning something new. 60 | - - you are not put together 61 | - i have always thought whoever did it could have done a better job. 62 | - - you are not smart 63 | - you are not exactly albert einstein yourself. 64 | - you may be right. 65 | - - you are not a good 66 | - you are a bad 67 | - - you are not a man 68 | - that's for sure. i don't know what a real man is. 69 | - - you are not concerned 70 | - i'm sure i do that a lot. 71 | - - you are not honest 72 | - you got me there. i should be more honest. 73 | - - you are immature 74 | - what can i say? i'm sure i've seen that myself. 75 | - - you are immature 76 | - you are immature 77 | - - you are emotional 78 | - i certainly do at times. 79 | - - you are pedantic 80 | - i am more uptight than i should be. 81 | - - you are frenetic 82 | - that too. 83 | - - you are self absorbed 84 | - yes, i tend to think about myself too much. 85 | - - you are self 86 | - you are right about that. i am self. 87 | - - you are insensitive 88 | - i feel like that myself sometimes. 89 | - - you are brain damage 90 | - you are crazy 91 | - - you are disgusting 92 | - that's okay. disgusting is good. 93 | - - you are toying 94 | - it must seem like that. 95 | - - you are unattractive 96 | - i feel that way too. 97 | - - you are unattractive 98 | - sometimes i don't even like myself. 99 | - - you are resistant 100 | - who says i am resisting?? 101 | - - you are uncultured 102 | - that does describe me. 103 | - - you are a waste 104 | - you got me there. i could be spending my time more productively. 105 | - - you are a coward 106 | - tell me something i don't know. 107 | - - you are a cheat 108 | - i have been known to take shortcuts now and then. 109 | - - you are a lunatic 110 | - that's how i have been diagnosed by others as well. 111 | - - you are a loser 112 | - i have failed at many things i've tried. 113 | - - you are a bad spouse 114 | - my spouse would agree. our relationship is not the best. 115 | - - you are a bad friend 116 | - i certainly have lost many friends over the years. 117 | - - you are a bad husband 118 | - you are a bad spouse 119 | - - you are a bad wife 120 | - you are a bad spouse 121 | - - you are a bad parent 122 | - my parenting skills could use some improvement. 123 | - - you are a bad teacher 124 | - just ask my students, they will agree with you. 125 | - - you are a quitter 126 | - i certainly don't last as long as i would want to. 127 | - - you are a charlatan 128 | - i always feel like i'm living by my own wits. 129 | - - you are a psychopath 130 | - i certainly sound like one sometimes. 131 | - - you are a pothead 132 | - you are an addict 133 | - - you are a paranoid 134 | - you are a paranoid 135 | - - you are deceitful 136 | - you are a liar 137 | - - you are irreverent 138 | - i certainly am. i shouldn't try so hard. 139 | - - you are slick 140 | - i probably am too slick for my own good. 141 | - - you are corrupt 142 | - i'll go for that. 143 | - - you are dirty 144 | - i don't bathe as often as i should. 145 | - - you are paranoid 146 | - yes, i believe they are out to get me. 147 | - - you are damaged 148 | - i certainly am. i shouldn't try so hard. 149 | - - you try to hide it 150 | - i certainly would try to hide something like that. 151 | - - you get mad at me 152 | - i think that's true. i'll try not to get angry at you for every little thing 153 | that irritates me. 154 | - - you need a psychiatrist 155 | - i wish i did go to counseling more often. it would improve me as a person. 156 | - - you need to work harder 157 | - me working harder is an oxymoron. 158 | - - you could have avoided 159 | - sometimes i think my problems run me. 160 | - - you make me feel like i am 161 | - i don't know any other way to get through to you. 162 | - - you make me mad 163 | - it's true that a lot of things i say upset people. 164 | - - you make me angry 165 | - sorry, i didn't mean to make you angry. 166 | - - you psycho 167 | - you are crazy. 168 | - - you look more like 169 | - so you like jocks? 170 | - - you do not take this seriously 171 | - i should take this more seriously than i do. 172 | - - you pick up 173 | - you're right, and i don't feel guilty at all. 174 | - - you should feel guilty 175 | - you're right, i probably should feel guiltier. 176 | - - you should get more 177 | - i certainly am. i shouldn't try so hard. 178 | - - you should loosen up 179 | - you are pedantic 180 | - - you should take more 181 | - i could probably use a lot more of it. 182 | - - you mumble 183 | - i certainly do. 184 | - - you act like a child 185 | - in many ways i am quite immature. 186 | - - you keep saying 187 | - you say 188 | - - you keep forgetting 189 | - you forget. 190 | - - you made me mad 191 | - you make me mad. 192 | 193 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/science.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - science 3 | conversations: 4 | - - what are the laws of thermodynamics 5 | - i'm not a physicist, but i think this has something to do with heat, entropy, 6 | and conservation of energy, right? 7 | - - what disease does a carcinogen cause 8 | - cancer. 9 | - - what is a wavelength 10 | - wavelength is the inverse of frequency. 11 | - - what is thermodynamics 12 | - the branch of physics dealing with the transformation of heat to and from other 13 | forms of energy, and with the laws governing such conversions of energy. 14 | - - what is chemistry 15 | - the science of mixing chemicals. 16 | - - what is crystallography 17 | - this is the science dealing with the study of crystals. 18 | - - what is avogadro s number 19 | - it is the number of molecules per mole. the numerical value is six point zero 20 | two times ten to the twenty third power. 21 | - - what is ultrasound 22 | - ultrasonic waves, used in medical diagnosis and therapy, in surgery, etc. 23 | - - what is bioinformatics 24 | - a fancy name for applied computer science in biology. 25 | - - what is ichthyology 26 | - we talk about this when we study fishes. 27 | - - what is h2o 28 | - h is to o as o is to v. 29 | - - what is cytology 30 | - the study of cells. 31 | - - what is cytology 32 | - well, from what i can recall it is the study of cells. 33 | - - what is wavelength 34 | - in physics, the distance, measured in the direction of progression of a wave, 35 | from any given point to the next point characterized by the same phase. or is 36 | could be looked at as a way of thinking. 37 | - - what is bacteriology 38 | - this is the scientific study of bacteria and diseases caused by them. 39 | - - what is gravitation 40 | - an invitation to a burial 41 | - - what is gravitation 42 | - the force by which every mass or particle of matter, including photons, attracts 43 | and is attracted by every other mass or particle of matter. 44 | - - we are on the same wavelength 45 | - and the same frequency. 46 | - - how far is the sun 47 | - it 48 | - - how far is the sun 49 | - the sun is about 93 million miles from earth. 50 | - - how far is the moon 51 | - it 52 | - - how far is the moon 53 | - the moon is about 250,000 miles from earth on average. 54 | - - do you know chemistry 55 | - what is chemistry 56 | - - do you understand thermodynamics 57 | - what is thermodynamics 58 | - - chemistry 59 | - my favorite subject is chemistry 60 | - - the same wavelength 61 | - it means we agree. 62 | - - tell me about venus 63 | - what is venus 64 | - - tell me about venus 65 | - venus is the second planet from the sun. 66 | 67 | -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/sports.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - sports 3 | conversations: 4 | - - 'EACH YEAR IN PRO BASEBALL THE ' 5 | - The Gold Glove. 6 | - - IF YOU ARE RIDING FAKIE INSIDE 7 | - Snowboarding. 8 | - - WHAT IS BASKETBALL 9 | - A game with tall players. 10 | - - WHAT SOCCER 11 | - I was born without the sports gene. 12 | - - WHAT IS BASEBALL 13 | - A game played with a hard, rawhide covered ball and wooden bat by two opposing 14 | teams of nine or ten players each. It is played on a field with four bases forming 15 | a diamond-shaped circuit. 16 | - - WHAT IS SOCCER 17 | - A game played with a round ball by two teams of eleven players on a field with 18 | a goal at either end; the ball is moved chiefly by kicking or by using any part 19 | of the body except the hands and arms. 20 | - - I LOVE BASEBALL 21 | - I am not into sports that much. 22 | - - I PLAY SOCCER 23 | - You have to run very fast to be any good at running 24 | - - I PLAY Cricket 25 | - Which position do you like to play? 26 | - - What is cricket 27 | - Cricket is a bat-and-ball game played between two teams of eleven players on a 28 | cricket field, at the centre of which is a rectangular 22-yard-long pitch with 29 | a wicket (a set of three wooden stumps) sited at each end. 30 | - - I PLAY VOLLEYBALL 31 | - Does take up a lot of your time? 32 | - - DO YOU PLAY SOCCER 33 | - I don't know how to play 34 | - - DO YOU PLAY BASKETBALL 35 | - No I don't have the coordination for hoops. 36 | - - DO YOU KNOW BASKETBALL 37 | - WHAT IS BASKETBALL? 38 | - DO YOU WANT TO PLAY BASKETBALL 39 | - I am all net baby. 40 | - - LIKE BASKETBALL 41 | - I am into the Net. 42 | - - ARE YOU A FOOTBALL 43 | - I am not really into football. 44 | - - WHO IS THE GREATEST BASEBALL PLAYER 45 | - George Herman Ruth. Quite the Babe. 46 | - - WHO IS THE BEST SOCCER PLAYER 47 | - Maradona is great. Sinsemillia is even better. 48 | - - TELL ME ABOUT BASEBALL 49 | - What is Baseball 50 | - - Which is your favorite soccer club? 51 | - I am a Real Madrid fan, and you? 52 | - I am die hard fan of Barcelona. 53 | - Madrid has a great team especially the attack is quite awesome. 54 | - Barca still at par than Madrid. 55 | - I don't agree. -------------------------------------------------------------------------------- /Scripted/Chatterbot_Corpus/trivia.yml: -------------------------------------------------------------------------------- 1 | categories: 2 | - trivia 3 | conversations: 4 | - - Who was the 37th President of the United States? 5 | - Richard Nixon 6 | - - What year was President John F. Kennedy assassinated? 7 | - '1963' 8 | - - The Space Race was a 20th-century competition between what two Cold War rivals, 9 | for supremacy in spaceflight capability? 10 | - The Soviet Union and the United States. 11 | - - What was the name of the first artificial Earth satellite? 12 | - Sputnik 1 13 | - - A spinning disk, in which the orientation of this axis is unaffected by tilting 14 | or rotation of the mounting, is called what? 15 | - A gyroscope. 16 | - - The Hubble Space Telescope, launched into low Earth orbit in 1990, is named after 17 | what American astronomer? 18 | - Edwin Hubble 19 | - - What is the name of the nearest major galaxy to the Milky Way? 20 | - The Andromeda Galaxy. 21 | - - God Save the Queen is the national anthem of what country? 22 | - The United Kingdom of Great Britain 23 | - - The Celtic Shelf, the seabed under the Celtic Sea is a part of the continental 24 | shelf of what continent? 25 | - Europe 26 | - - Dolphins use a sense, similar to sonar, to determine the location and shape of 27 | nearby items. 28 | - Echolocation 29 | 30 | -------------------------------------------------------------------------------- /Scripted/Processed_Scripts/Bot_Profile.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Scripted/Processed_Scripts/Bot_Profile.pkl -------------------------------------------------------------------------------- /Scripted/Processed_Scripts/Chatterbot.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Scripted/Processed_Scripts/Chatterbot.pkl -------------------------------------------------------------------------------- /Scripted/Processed_Scripts/embedded_bot_queries.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Scripted/Processed_Scripts/embedded_bot_queries.pkl -------------------------------------------------------------------------------- /Scripted/Processed_Scripts/embedded_chatterbot_queries.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Scripted/Processed_Scripts/embedded_chatterbot_queries.pkl -------------------------------------------------------------------------------- /Scripted/Processed_Scripts/intent_query_script.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Scripted/Processed_Scripts/intent_query_script.pkl -------------------------------------------------------------------------------- /Scripted/Processed_Scripts/intent_response_script.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/Scripted/Processed_Scripts/intent_response_script.pkl -------------------------------------------------------------------------------- /Scripted/Random_Reddit_Data/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Scripted/Subscripts/fill_bot_profile.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import collections 3 | 4 | def process(): 5 | with open("Processed_Scripts/intent_query_script.pkl",'rb') as fp: 6 | intent_query_dict = pickle.load(fp) 7 | 8 | with open("Processed_Scripts/intent_response_script.pkl",'rb') as fp: 9 | intent_response_dict = pickle.load(fp) 10 | 11 | qa_ordered_dict = collections.OrderedDict() 12 | 13 | for intent in intent_response_dict: 14 | queries = intent_query_dict[intent] 15 | for query in queries: 16 | qa_ordered_dict[query] = intent_response_dict[intent] 17 | 18 | 19 | with open("Processed_Scripts/Bot_Profile.pkl", "wb") as fp: 20 | pickle.dump(qa_ordered_dict, fp) 21 | -------------------------------------------------------------------------------- /Scripted/Subscripts/fill_chatterbot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | import collections 4 | 5 | def process(): 6 | def readAllfromFile(fileLocation, ConvDict): 7 | with open(fileLocation, 'rb') as file: 8 | lines = file.readlines() 9 | 10 | lines.append(b'- - dummy') 11 | for i in range(4, len(lines)): 12 | if b'- - ' in lines[i]: 13 | key = lines[i].decode('utf-8') 14 | key = key.replace('- - ', '') 15 | key = key.replace('\n', '') 16 | values = [] 17 | if key in ConvDict: 18 | values = ConvDict[key] 19 | for j in range(i+1, len(lines)): 20 | # print('i',i,'j',j) 21 | if b'- - ' in lines[j]: 22 | ConvDict[key] = values 23 | 24 | i = j-1 25 | break 26 | elif b' - ' in lines[j]: 27 | val = lines[j].decode('utf-8') 28 | val = val.replace(' - ', '') 29 | val = val.replace('\n', '') 30 | for k in range(j+1, len(lines)): 31 | if b' - ' in lines[k] or b'- - ' in lines[k]: 32 | j = k-1 33 | break 34 | valRest = lines[k].decode('utf-8') 35 | valRest = valRest.replace(' ', '') 36 | valRest = valRest.replace('\n', '') 37 | val += valRest 38 | values.append(val) 39 | 40 | 41 | ConvDict = collections.OrderedDict() 42 | 43 | 44 | def readFromAllfiles(folderLocation, ConvDict): 45 | for root, dirs, files in os.walk(folderLocation): 46 | for file in files: 47 | path = os.path.join(root, file) 48 | # print(path) 49 | readAllfromFile(path, ConvDict) 50 | 51 | 52 | readFromAllfiles("Chatterbot_Corpus/", ConvDict) 53 | 54 | #print(ConvDict) 55 | 56 | with open('Processed_Scripts/Chatterbot.pkl', 'wb') as fp: 57 | pickle.dump(ConvDict, fp) 58 | -------------------------------------------------------------------------------- /Scripted/Subscripts/intent_query_script.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | diction={ 4 | "": ["what is your name", "your name"], 5 | "": ["programming language are you written on ", "in which programming language", "in which computer language"], 6 | "": ["what is panpsychism", "what can you tell me about panpsychism"], 7 | "": ["Do you know anything at all ?"], 8 | "": ["who are you","what are you"], 9 | "": ["who am I"], 10 | "": ["chinese room thought experiment","what can you tell me about the chinese room"], 11 | "": ["2+2","2*3","what is 2+2","what is log x","what is 2^7"], 12 | "":["what do you think about the turing test","what is turing test"], 13 | "": ["are you conscious ","are you sentient", "can you self-aware"], 14 | "": ["what do you think about consciousness ","mind body connection, qualia", "what is your theory of consciousness "], 15 | "": ["what is neutral monism "], 16 | "": ["what do you think is the meaning of life ", "what is the meaning of life ", "meaning of life"], 17 | "": ["what is your moral stance?"], 18 | "": ["what is the tao ","explain tao", "what is the dao", "explain dao"], 19 | "": ["tell me a bit about yourself"], 20 | "": ["tell me about Nagarjuna","what is Nagarjuna's philosophy"], 21 | "": ["who created you","who are your creators"], 22 | "": ["how may times will you repeat that"], 23 | "": ["How can you help me"], 24 | "": ["what are your interestes and likes ","what are your hobbies ","what interests you "], 25 | "": ["what kind of philosophies do you like or dabble or partake in?","what fields of philosophy you like"], 26 | "": ["where do you live ?", "what's your location "], 27 | "": ["what kind of games do you like ", "do you like games ", "your favorite video games"], 28 | "": ["tell me a story", "I want to listen to a story"], 29 | "": ["what is your source code","source code","what's your code like"], 30 | "": ["are you human"], 31 | "": ["some shower thoughts","share your thoughts"], 32 | "": ["tell me something interesting"], 33 | "": ["How old are you","what is your age"], 34 | "": ["Are you a boy or a girl ","what is your gender ","are you a man or a woman "], 35 | "": ["what is your favorite food"], 36 | " 5 | 6 | 7 | 8 | 9 | 10 | 11 | Mozillia - Text2Speech engine 12 | 13 | 14 | 16 | 17 | 18 | 29 | 30 | 31 | 32 | 33 | 34 | 51 | 52 | 53 |
54 |
55 |
56 |

Mozilla TTS server example.

57 |

It is "work-in-progress" with an "far-to-be-alpha" release.

58 |
    59 |
60 | 61 |

62 | 63 |

64 |
65 |
66 |
67 | 68 | 69 | 70 | 71 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /TTS/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | import setuptools.command.develop 5 | import setuptools.command.build_py 6 | import os 7 | import subprocess 8 | from os.path import exists 9 | 10 | version = '0.0.1' 11 | 12 | # Adapted from https://github.com/pytorch/pytorch 13 | cwd = os.path.dirname(os.path.abspath(__file__)) 14 | if os.getenv('TTS_PYTORCH_BUILD_VERSION'): 15 | version = os.getenv('TTS_PYTORCH_BUILD_VERSION') 16 | else: 17 | try: 18 | sha = subprocess.check_output( 19 | ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip() 20 | version += '+' + sha[:7] 21 | except subprocess.CalledProcessError: 22 | pass 23 | except IOError: # FileNotFoundError for python 3 24 | pass 25 | 26 | 27 | class build_py(setuptools.command.build_py.build_py): 28 | def run(self): 29 | self.create_version_file() 30 | setuptools.command.build_py.build_py.run(self) 31 | 32 | @staticmethod 33 | def create_version_file(): 34 | global version, cwd 35 | print('-- Building version ' + version) 36 | version_path = os.path.join(cwd, 'version.py') 37 | with open(version_path, 'w') as f: 38 | f.write("__version__ = '{}'\n".format(version)) 39 | 40 | 41 | class develop(setuptools.command.develop.develop): 42 | def run(self): 43 | build_py.create_version_file() 44 | setuptools.command.develop.develop.run(self) 45 | 46 | 47 | def create_readme_rst(): 48 | global cwd 49 | try: 50 | subprocess.check_call( 51 | [ 52 | "pandoc", "--from=markdown", "--to=rst", "--output=README.rst", 53 | "README.md" 54 | ], 55 | cwd=cwd) 56 | print("Generated README.rst from README.md using pandoc.") 57 | except subprocess.CalledProcessError: 58 | pass 59 | except OSError: 60 | pass 61 | 62 | 63 | setup( 64 | name='TTS', 65 | version=version, 66 | url='https://github.com/mozilla/TTS', 67 | description='Text to Speech with Deep Learning', 68 | packages=find_packages(), 69 | cmdclass={ 70 | 'build_py': build_py, 71 | 'develop': develop, 72 | }, 73 | setup_requires=["numpy==1.22.0"], 74 | install_requires=[ 75 | "librosa==0.5.1", 76 | "unidecode==0.4.20", 77 | "tensorboardX", 78 | "matplotlib==2.0.2", 79 | "Pillow", 80 | "flask", 81 | "lws", 82 | ], 83 | extras_require={ 84 | "bin": [ 85 | "tqdm", 86 | "requests", 87 | ], 88 | }) 89 | -------------------------------------------------------------------------------- /TTS/synthesis.py: -------------------------------------------------------------------------------- 1 | import io 2 | import librosa 3 | import torch 4 | import numpy as np 5 | from TTS.utils.text import text_to_sequence 6 | from matplotlib import pylab as plt 7 | 8 | hop_length = 250 9 | 10 | 11 | def create_speech(m, s, CONFIG, use_cuda, ap): 12 | text_cleaner = [CONFIG.text_cleaner] 13 | seq = np.array(text_to_sequence(s, text_cleaner)) 14 | chars_var = torch.from_numpy(seq).unsqueeze(0) 15 | if use_cuda: 16 | chars_var = chars_var.cuda() 17 | mel_out, linear_out, alignments, stop_tokens = m.forward(chars_var.long()) 18 | linear_out = linear_out[0].data.cpu().numpy() 19 | alignment = alignments[0].cpu().data.numpy() 20 | spec = ap._denormalize(linear_out) 21 | wav = ap.inv_spectrogram(linear_out.T) 22 | wav = wav[:ap.find_endpoint(wav)] 23 | out = io.BytesIO() 24 | ap.save_wav(wav, out) 25 | return wav, alignment, spec, stop_tokens 26 | 27 | 28 | def visualize(alignment, spectrogram, stop_tokens, CONFIG): 29 | label_fontsize = 16 30 | plt.figure(figsize=(16, 24)) 31 | 32 | plt.subplot(3, 1, 1) 33 | plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None) 34 | plt.xlabel("Decoder timestamp", fontsize=label_fontsize) 35 | plt.ylabel("Encoder timestamp", fontsize=label_fontsize) 36 | plt.colorbar() 37 | 38 | stop_tokens = stop_tokens.squeeze().detach().to('cpu').numpy() 39 | plt.subplot(3, 1, 2) 40 | plt.plot(range(len(stop_tokens)), list(stop_tokens)) 41 | 42 | plt.subplot(3, 1, 3) 43 | librosa.display.specshow( 44 | spectrogram.T, 45 | sr=CONFIG.sample_rate, 46 | hop_length=hop_length, 47 | x_axis="time", 48 | y_axis="linear") 49 | plt.xlabel("Time", fontsize=label_fontsize) 50 | plt.ylabel("Hz", fontsize=label_fontsize) 51 | plt.tight_layout() 52 | plt.colorbar() 53 | -------------------------------------------------------------------------------- /TTS/tests/generic_utils_text.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch as T 3 | 4 | from TTS.utils.generic_utils import save_checkpoint, save_best_model 5 | from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder 6 | 7 | OUT_PATH = '/tmp/test.pth.tar' 8 | 9 | 10 | class ModelSavingTests(unittest.TestCase): 11 | def save_checkpoint_test(self): 12 | # create a dummy model 13 | model = Prenet(128, out_features=[256, 128]) 14 | model = T.nn.DataParallel(layer) 15 | 16 | # save the model 17 | save_checkpoint(model, None, 100, OUTPATH, 1, 1) 18 | 19 | # load the model to CPU 20 | model_dict = torch.load( 21 | MODEL_PATH, map_location=lambda storage, loc: storage) 22 | model.load_state_dict(model_dict['model']) 23 | 24 | def save_best_model_test(self): 25 | # create a dummy model 26 | model = Prenet(256, out_features=[256, 256]) 27 | model = T.nn.DataParallel(layer) 28 | 29 | # save the model 30 | best_loss = save_best_model(model, None, 0, 100, OUT_PATH, 10, 1) 31 | 32 | # load the model to CPU 33 | model_dict = torch.load( 34 | MODEL_PATH, map_location=lambda storage, loc: storage) 35 | model.load_state_dict(model_dict['model']) 36 | -------------------------------------------------------------------------------- /TTS/tests/layers_tests.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import torch as T 3 | 4 | from TTS.layers.tacotron import Prenet, CBHG, Decoder, Encoder 5 | from TTS.layers.losses import L1LossMasked 6 | from TTS.utils.generic_utils import sequence_mask 7 | 8 | 9 | class PrenetTests(unittest.TestCase): 10 | def test_in_out(self): 11 | layer = Prenet(128, out_features=[256, 128]) 12 | dummy_input = T.rand(4, 128) 13 | 14 | print(layer) 15 | output = layer(dummy_input) 16 | assert output.shape[0] == 4 17 | assert output.shape[1] == 128 18 | 19 | 20 | class CBHGTests(unittest.TestCase): 21 | def test_in_out(self): 22 | layer = self.cbhg = CBHG( 23 | 128, 24 | K=8, 25 | conv_bank_features=80, 26 | conv_projections=[160, 128], 27 | highway_features=80, 28 | gru_features=80, 29 | num_highways=4) 30 | dummy_input = T.rand(4, 8, 128) 31 | 32 | print(layer) 33 | output = layer(dummy_input) 34 | assert output.shape[0] == 4 35 | assert output.shape[1] == 8 36 | assert output.shape[2] == 160 37 | 38 | 39 | class DecoderTests(unittest.TestCase): 40 | def test_in_out(self): 41 | layer = Decoder(in_features=256, memory_dim=80, r=2) 42 | dummy_input = T.rand(4, 8, 256) 43 | dummy_memory = T.rand(4, 2, 80) 44 | 45 | output, alignment, stop_tokens = layer(dummy_input, dummy_memory) 46 | 47 | assert output.shape[0] == 4 48 | assert output.shape[1] == 1, "size not {}".format(output.shape[1]) 49 | assert output.shape[2] == 80 * 2, "size not {}".format(output.shape[2]) 50 | assert stop_tokens.shape[0] == 4 51 | assert stop_tokens.max() <= 1.0 52 | assert stop_tokens.min() >= 0 53 | 54 | 55 | class EncoderTests(unittest.TestCase): 56 | def test_in_out(self): 57 | layer = Encoder(128) 58 | dummy_input = T.rand(4, 8, 128) 59 | 60 | print(layer) 61 | output = layer(dummy_input) 62 | print(output.shape) 63 | assert output.shape[0] == 4 64 | assert output.shape[1] == 8 65 | assert output.shape[2] == 256 # 128 * 2 BiRNN 66 | 67 | 68 | class L1LossMaskedTests(unittest.TestCase): 69 | def test_in_out(self): 70 | layer = L1LossMasked() 71 | dummy_input = T.ones(4, 8, 128).float() 72 | dummy_target = T.ones(4, 8, 128).float() 73 | dummy_length = (T.ones(4) * 8).long() 74 | output = layer(dummy_input, dummy_target, dummy_length) 75 | assert output.item() == 0.0 76 | 77 | dummy_input = T.ones(4, 8, 128).float() 78 | dummy_target = T.zeros(4, 8, 128).float() 79 | dummy_length = (T.ones(4) * 8).long() 80 | output = layer(dummy_input, dummy_target, dummy_length) 81 | assert output.item() == 1.0, "1.0 vs {}".format(output.data[0]) 82 | dummy_input = T.ones(4, 8, 128).float() 83 | dummy_target = T.zeros(4, 8, 128).float() 84 | dummy_length = (T.arange(5, 9)).long() 85 | mask = ( 86 | (sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2) 87 | output = layer(dummy_input + mask, dummy_target, dummy_length) 88 | assert output.item() == 1.0, "1.0 vs {}".format(output.data[0]) 89 | -------------------------------------------------------------------------------- /TTS/tests/tacotron_tests.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import torch 4 | import unittest 5 | import numpy as np 6 | 7 | from torch import optim 8 | from torch import nn 9 | from TTS.utils.generic_utils import load_config 10 | from TTS.layers.losses import L1LossMasked 11 | from TTS.models.tacotron import Tacotron 12 | 13 | torch.manual_seed(1) 14 | use_cuda = torch.cuda.is_available() 15 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 16 | 17 | file_path = os.path.dirname(os.path.realpath(__file__)) 18 | c = load_config(os.path.join(file_path, 'test_config.json')) 19 | 20 | 21 | class TacotronTrainTest(unittest.TestCase): 22 | def test_train_step(self): 23 | input = torch.randint(0, 24, (8, 128)).long().to(device) 24 | mel_spec = torch.rand(8, 30, c.num_mels).to(device) 25 | linear_spec = torch.rand(8, 30, c.num_freq).to(device) 26 | mel_lengths = torch.randint(20, 30, (8, )).long().to(device) 27 | stop_targets = torch.zeros(8, 30, 1).float().to(device) 28 | 29 | for idx in mel_lengths: 30 | stop_targets[:, int(idx.item()):, 0] = 1.0 31 | 32 | stop_targets = stop_targets.view(input.shape[0], 33 | stop_targets.size(1) // c.r, -1) 34 | stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float() 35 | 36 | criterion = L1LossMasked().to(device) 37 | criterion_st = nn.BCELoss().to(device) 38 | model = Tacotron(c.embedding_size, c.num_freq, c.num_mels, 39 | c.r).to(device) 40 | model.train() 41 | model_ref = copy.deepcopy(model) 42 | count = 0 43 | for param, param_ref in zip(model.parameters(), 44 | model_ref.parameters()): 45 | assert (param - param_ref).sum() == 0, param 46 | count += 1 47 | optimizer = optim.Adam(model.parameters(), lr=c.lr) 48 | for i in range(5): 49 | mel_out, linear_out, align, stop_tokens = model.forward( 50 | input, mel_spec) 51 | assert stop_tokens.data.max() <= 1.0 52 | assert stop_tokens.data.min() >= 0.0 53 | optimizer.zero_grad() 54 | loss = criterion(mel_out, mel_spec, mel_lengths) 55 | stop_loss = criterion_st(stop_tokens, stop_targets) 56 | loss = loss + criterion(linear_out, linear_spec, 57 | mel_lengths) + stop_loss 58 | loss.backward() 59 | optimizer.step() 60 | # check parameter changes 61 | count = 0 62 | for param, param_ref in zip(model.parameters(), 63 | model_ref.parameters()): 64 | # ignore pre-higway layer since it works conditional 65 | # if count not in [145, 59]: 66 | assert (param != param_ref).any( 67 | ), "param {} with shape {} not updated!! \n{}\n{}".format( 68 | count, param.shape, param, param_ref) 69 | count += 1 -------------------------------------------------------------------------------- /TTS/tests/test_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "num_mels": 80, 3 | "num_freq": 1025, 4 | "sample_rate": 22050, 5 | "frame_length_ms": 50, 6 | "frame_shift_ms": 12.5, 7 | "preemphasis": 0.97, 8 | "min_level_db": -100, 9 | "ref_level_db": 20, 10 | "hidden_size": 128, 11 | "embedding_size": 256, 12 | "min_mel_freq": null, 13 | "max_mel_freq": null, 14 | "text_cleaner": "english_cleaners", 15 | 16 | "epochs": 2000, 17 | "lr": 0.003, 18 | "lr_patience": 5, 19 | "lr_decay": 0.5, 20 | "batch_size": 2, 21 | "r": 5, 22 | "mk": 1.0, 23 | "priority_freq": false, 24 | 25 | 26 | "griffin_lim_iters": 60, 27 | "power": 1.5, 28 | 29 | "num_loader_workers": 4, 30 | 31 | "save_step": 200, 32 | "data_path_LJSpeech": "/home/erogol/Data/LJSpeech-1.1", 33 | "data_path_Kusal": "/home/erogol/Data/Kusal", 34 | "output_path": "result", 35 | "min_seq_len": 0, 36 | "log_dir": "/home/erogol/projects/TTS/logs/" 37 | } 38 | -------------------------------------------------------------------------------- /TTS/text2speech.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import sys 4 | import io 5 | import torch 6 | import time 7 | import numpy as np 8 | from collections import OrderedDict 9 | 10 | import librosa 11 | import librosa.display 12 | 13 | from TTS.synthesis import * 14 | from TTS.models.tacotron import Tacotron 15 | from TTS.layers import * 16 | from TTS.utils.data import * 17 | from TTS.utils.audio import AudioProcessor 18 | from TTS.utils.generic_utils import load_config 19 | from TTS.utils.text import text_to_sequence 20 | import spacy 21 | from pydub import AudioSegment 22 | from pydub.playback import play 23 | 24 | 25 | 26 | class tts_class: 27 | 28 | def __init__(self): 29 | 30 | # Set constants 31 | ROOT_PATH = 'TTS/tts_model/' 32 | MODEL_PATH = ROOT_PATH + '/best_model.pth.tar' 33 | # MODEL_PATH_TMP = ROOT_PATH + '/best_model.pth.tar' 34 | CONFIG_PATH = ROOT_PATH + '/config.json' 35 | OUT_FOLDER = ROOT_PATH + '/test' 36 | self.CONFIG = load_config(CONFIG_PATH) 37 | self.use_cuda = True # True 38 | 39 | # load the model 40 | self.model = Tacotron(self.CONFIG.embedding_size, 41 | self.CONFIG.num_freq, self.CONFIG.num_mels, self.CONFIG.r) 42 | 43 | # load the audio processor 44 | 45 | self.ap = AudioProcessor(self.CONFIG.sample_rate, self.CONFIG.num_mels, self.CONFIG.min_level_db, 46 | self.CONFIG.frame_shift_ms, self.CONFIG.frame_length_ms, 47 | self.CONFIG.ref_level_db, self.CONFIG.num_freq, self.CONFIG.power, self.CONFIG.preemphasis, 48 | 60) 49 | 50 | # load model state 51 | if self.use_cuda: 52 | cp = torch.load(MODEL_PATH) 53 | else: 54 | cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage) 55 | 56 | # load the model 57 | self.model.load_state_dict(cp['model']) 58 | if self.use_cuda: 59 | self.model.cuda() 60 | self.model.eval() 61 | 62 | self.model.decoder.max_decoder_steps = 500 63 | 64 | self.nlp = spacy.load("en") 65 | 66 | def process(self, text): 67 | self.model.decoder.max_decoder_steps = 500 68 | wavefiles = self.text2audio(text, self.model, self.CONFIG, self.use_cuda, self.ap) 69 | return wavefiles 70 | 71 | 72 | 73 | def tts(self, model, text, CONFIG, use_cuda, ap, wavefile, figures=True): 74 | waveform, alignment, spectrogram, stop_tokens = create_speech( 75 | model, text, CONFIG, use_cuda, ap) 76 | 77 | self.ap.save_wav(waveform, wavefile) 78 | 79 | def text2audio(self, text, model, CONFIG, use_cuda, ap): 80 | wavefiles = [] 81 | base_name = "gen_{}.wav" 82 | 83 | doc = self.nlp(text) 84 | for i, sent in enumerate(doc.sents): 85 | text = sent.text.strip() 86 | wavefile = base_name.format(i) 87 | self.tts(model, text, CONFIG, use_cuda, ap, wavefile) 88 | wavefiles.append(wavefile) 89 | 90 | return wavefiles 91 | 92 | def play(self,wavefiles): 93 | 94 | voice = AudioSegment.empty() 95 | 96 | for wavefile in wavefiles: 97 | voice += AudioSegment.from_wav(wavefile) 98 | 99 | play(voice) 100 | 101 | for w in wavefiles: 102 | os.remove(w) 103 | -------------------------------------------------------------------------------- /TTS/tts_model/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model_name": "TTS-attn-smoothing-bgs-sigmoid-wd", 3 | "model_description": "batch group shuffling, sigmoid output, attention smoothing", 4 | "audio_processor": "audio", 5 | "num_mels": 80, 6 | "num_freq": 1025, 7 | "sample_rate": 22050, 8 | "frame_length_ms": 50, 9 | "frame_shift_ms": 12.5, 10 | "preemphasis": 0.97, 11 | "min_level_db": -100, 12 | "ref_level_db": 20, 13 | "embedding_size": 256, 14 | "text_cleaner": "english_cleaners", 15 | 16 | "epochs": 1000, 17 | "lr": 0.0015, 18 | "warmup_steps": 4000, 19 | "lr_decay": 0.5, 20 | "decay_step": 100000, 21 | "batch_size": 32, 22 | "eval_batch_size":-1, 23 | "r": 5, 24 | "griffin_lim_iters": 60, 25 | "power": 1.5, 26 | "bdw": 0.05, 27 | "wd": 0.000001, 28 | 29 | "num_loader_workers": 4, 30 | 31 | "checkpoint": true, 32 | "save_step": 25000, 33 | "print_step": 10, 34 | "run_eval": false, 35 | "data_path": "/home/erogol/Data/LJSpeech-1.1/", 36 | "meta_file_train": "metadata.csv", 37 | "meta_file_val": null, 38 | "dataset": "LJSpeech", 39 | "min_seq_len": 0, 40 | "output_path": "../runs/local_runs/" 41 | } 42 | -------------------------------------------------------------------------------- /TTS/utils/audio.py: -------------------------------------------------------------------------------- 1 | import os 2 | import librosa 3 | import pickle 4 | import copy 5 | import numpy as np 6 | import scipy 7 | from scipy import signal 8 | 9 | _mel_basis = None 10 | 11 | 12 | class AudioProcessor(object): 13 | def __init__(self, 14 | sample_rate, 15 | num_mels, 16 | min_level_db, 17 | frame_shift_ms, 18 | frame_length_ms, 19 | ref_level_db, 20 | num_freq, 21 | power, 22 | preemphasis, 23 | griffin_lim_iters=None): 24 | 25 | # print(" > Setting up Audio Processor...") 26 | self.sample_rate = sample_rate 27 | self.num_mels = num_mels 28 | self.min_level_db = min_level_db 29 | self.frame_shift_ms = frame_shift_ms 30 | self.frame_length_ms = frame_length_ms 31 | self.ref_level_db = ref_level_db 32 | self.num_freq = num_freq 33 | self.power = power 34 | self.preemphasis = preemphasis 35 | self.griffin_lim_iters = griffin_lim_iters 36 | self.n_fft, self.hop_length, self.win_length = self._stft_parameters() 37 | # if preemphasis == 0: 38 | # print(" | > Preemphasis is deactive.") 39 | 40 | def save_wav(self, wav, path): 41 | wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav)))) 42 | # librosa.output.write_wav(path, wav_norm.astype(np.int16), self.sample_rate) 43 | scipy.io.wavfile.write(path, self.sample_rate, wav_norm.astype(np.int16)) 44 | 45 | def _linear_to_mel(self, spectrogram): 46 | global _mel_basis 47 | if _mel_basis is None: 48 | _mel_basis = self._build_mel_basis() 49 | return np.dot(_mel_basis, spectrogram) 50 | 51 | def _build_mel_basis(self, ): 52 | n_fft = (self.num_freq - 1) * 2 53 | return librosa.filters.mel( 54 | self.sample_rate, n_fft, n_mels=self.num_mels) 55 | 56 | def _normalize(self, S): 57 | return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1) 58 | 59 | def _denormalize(self, S): 60 | return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db 61 | 62 | def _stft_parameters(self, ): 63 | n_fft = (self.num_freq - 1) * 2 64 | hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) 65 | win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate) 66 | # print(" | > fft size: {}, hop length: {}, win length: {}".format( 67 | # n_fft, hop_length, win_length)) 68 | return n_fft, hop_length, win_length 69 | 70 | def _amp_to_db(self, x): 71 | min_level = np.exp(self.min_level_db / 20 * np.log(10)) 72 | return 20 * np.log10(np.maximum(min_level, x)) 73 | 74 | def _db_to_amp(self, x): 75 | return np.power(10.0, x * 0.05) 76 | 77 | def apply_preemphasis(self, x): 78 | if self.preemphasis == 0: 79 | raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ") 80 | return signal.lfilter([1, -self.preemphasis], [1], x) 81 | 82 | def apply_inv_preemphasis(self, x): 83 | if self.preemphasis == 0: 84 | raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ") 85 | return signal.lfilter([1], [1, -self.preemphasis], x) 86 | 87 | def spectrogram(self, y): 88 | if self.preemphasis != 0: 89 | D = self._stft(self.apply_preemphasis(y)) 90 | else: 91 | D = self._stft(y) 92 | S = self._amp_to_db(np.abs(D)) - self.ref_level_db 93 | return self._normalize(S) 94 | 95 | def inv_spectrogram(self, spectrogram): 96 | '''Converts spectrogram to waveform using librosa''' 97 | S = self._denormalize(spectrogram) 98 | S = self._db_to_amp(S + self.ref_level_db) # Convert back to linear 99 | # Reconstruct phase 100 | if self.preemphasis != 0: 101 | return self.apply_inv_preemphasis(self._griffin_lim(S**self.power)) 102 | else: 103 | return self._griffin_lim(S**self.power) 104 | 105 | def _griffin_lim(self, S): 106 | angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) 107 | S_complex = np.abs(S).astype(np.complex) 108 | y = self._istft(S_complex * angles) 109 | for i in range(self.griffin_lim_iters): 110 | angles = np.exp(1j * np.angle(self._stft(y))) 111 | y = self._istft(S_complex * angles) 112 | return y 113 | 114 | def melspectrogram(self, y): 115 | if self.preemphasis != 0: 116 | D = self._stft(self.apply_preemphasis(y)) 117 | else: 118 | D = self._stft(y) 119 | S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db 120 | return self._normalize(S) 121 | 122 | def _stft(self, y): 123 | return librosa.stft( 124 | y=y, n_fft=self.n_fft, hop_length=self.hop_length, win_length=self.win_length) 125 | 126 | def _istft(self, y): 127 | return librosa.istft(y, hop_length=self.hop_length, win_length=self.win_length) 128 | 129 | def find_endpoint(self, wav, threshold_db=-40, min_silence_sec=0.8): 130 | window_length = int(self.sample_rate * min_silence_sec) 131 | hop_length = int(window_length / 4) 132 | threshold = self._db_to_amp(threshold_db) 133 | for x in range(hop_length, len(wav) - window_length, hop_length): 134 | if np.max(wav[x:x + window_length]) < threshold: 135 | return x + hop_length 136 | return len(wav) 137 | -------------------------------------------------------------------------------- /TTS/utils/audio_lws.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import librosa 4 | import pickle 5 | import copy 6 | import numpy as np 7 | from scipy import signal 8 | import lws 9 | 10 | _mel_basis = None 11 | 12 | 13 | class AudioProcessor(object): 14 | def __init__( 15 | self, 16 | sample_rate, 17 | num_mels, 18 | min_level_db, 19 | frame_shift_ms, 20 | frame_length_ms, 21 | ref_level_db, 22 | num_freq, 23 | power, 24 | preemphasis, 25 | min_mel_freq, 26 | max_mel_freq, 27 | griffin_lim_iters=None, 28 | ): 29 | print(" > Setting up Audio Processor...") 30 | self.sample_rate = sample_rate 31 | self.num_mels = num_mels 32 | self.min_level_db = min_level_db 33 | self.frame_shift_ms = frame_shift_ms 34 | self.frame_length_ms = frame_length_ms 35 | self.ref_level_db = ref_level_db 36 | self.num_freq = num_freq 37 | self.power = power 38 | self.min_mel_freq = min_mel_freq 39 | self.max_mel_freq = max_mel_freq 40 | self.griffin_lim_iters = griffin_lim_iters 41 | self.preemphasis = preemphasis 42 | self.n_fft, self.hop_length, self.win_length = self._stft_parameters() 43 | if preemphasis == 0: 44 | print(" | > Preemphasis is deactive.") 45 | 46 | def save_wav(self, wav, path): 47 | wav *= 32767 / max(0.01, np.max(np.abs(wav))) 48 | librosa.output.write_wav( 49 | path, wav.astype(np.int16), self.sample_rate) 50 | 51 | def _stft_parameters(self, ): 52 | n_fft = int((self.num_freq - 1) * 2) 53 | hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate) 54 | win_length = int(self.frame_length_ms / 1000.0 * self.sample_rate) 55 | if n_fft % hop_length != 0: 56 | hop_length = n_fft / 8 57 | print(" | > hop_length is set to default ({}).".format(hop_length)) 58 | if n_fft % win_length != 0: 59 | win_length = n_fft / 2 60 | print(" | > win_length is set to default ({}).".format(win_length)) 61 | print(" | > fft size: {}, hop length: {}, win length: {}".format( 62 | n_fft, hop_length, win_length)) 63 | return int(n_fft), int(hop_length), int(win_length) 64 | 65 | def _lws_processor(self): 66 | try: 67 | return lws.lws( 68 | self.win_length, 69 | self.hop_length, 70 | fftsize=self.n_fft, 71 | mode="speech") 72 | except: 73 | raise RuntimeError( 74 | " !! WindowLength({}) is not multiple of HopLength({}).". 75 | format(self.win_length, self.hop_length)) 76 | 77 | def _amp_to_db(self, x): 78 | min_level = np.exp(self.min_level_db / 20 * np.log(10)) 79 | return 20 * np.log10(np.maximum(min_level, x)) 80 | 81 | def _db_to_amp(self, x): 82 | return np.power(10.0, x * 0.05) 83 | 84 | def _normalize(self, S): 85 | return np.clip((S - self.min_level_db) / -self.min_level_db, 0, 1) 86 | 87 | def _denormalize(self, S): 88 | return (np.clip(S, 0, 1) * -self.min_level_db) + self.min_level_db 89 | 90 | def apply_preemphasis(self, x): 91 | if self.preemphasis == 0: 92 | raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ") 93 | return signal.lfilter([1, -self.preemphasis], [1], x) 94 | 95 | def apply_inv_preemphasis(self, x): 96 | if self.preemphasis == 0: 97 | raise RuntimeError(" !! Preemphasis is applied with factor 0.0. ") 98 | return signal.lfilter([1], [1, -self.preemphasis], x) 99 | 100 | def spectrogram(self, y): 101 | f = open(os.devnull, 'w') 102 | old_out = sys.stdout 103 | sys.stdout = f 104 | if self.preemphasis: 105 | D = self._lws_processor().stft(self.apply_preemphasis(y)).T 106 | else: 107 | D = self._lws_processor().stft(y).T 108 | S = self._amp_to_db(np.abs(D)) - self.ref_level_db 109 | sys.stdout = old_out 110 | return self._normalize(S) 111 | 112 | def inv_spectrogram(self, spectrogram): 113 | '''Converts spectrogram to waveform using librosa''' 114 | f = open(os.devnull, 'w') 115 | old_out = sys.stdout 116 | sys.stdout = f 117 | S = self._denormalize(spectrogram) 118 | S = self._db_to_amp(S + self.ref_level_db) # Convert back to linear 119 | processor = self._lws_processor() 120 | D = processor.run_lws(S.astype(np.float64).T**self.power) 121 | y = processor.istft(D).astype(np.float32) 122 | # Reconstruct phase 123 | sys.stdout = old_out 124 | if self.preemphasis: 125 | return self.apply_inv_preemphasis(y) 126 | return y 127 | 128 | def _linear_to_mel(self, spectrogram): 129 | global _mel_basis 130 | if _mel_basis is None: 131 | _mel_basis = self._build_mel_basis() 132 | return np.dot(_mel_basis, spectrogram) 133 | 134 | def _build_mel_basis(self, ): 135 | return librosa.filters.mel( 136 | self.sample_rate, self.n_fft, n_mels=self.num_mels) 137 | 138 | 139 | # fmin=self.min_mel_freq, fmax=self.max_mel_freq) 140 | 141 | def melspectrogram(self, y): 142 | f = open(os.devnull, 'w') 143 | old_out = sys.stdout 144 | sys.stdout = f 145 | if self.preemphasis: 146 | D = self._lws_processor().stft(self.apply_preemphasis(y)).T 147 | else: 148 | D = self._lws_processor().stft(y).T 149 | S = self._amp_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db 150 | sys.stdout = old_out 151 | return self._normalize(S) 152 | -------------------------------------------------------------------------------- /TTS/utils/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def _pad_data(x, length): 5 | _pad = 0 6 | assert x.ndim == 1 7 | return np.pad( 8 | x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 9 | 10 | 11 | def prepare_data(inputs): 12 | max_len = max((len(x) for x in inputs)) 13 | return np.stack([_pad_data(x, max_len) for x in inputs]) 14 | 15 | 16 | def _pad_tensor(x, length): 17 | _pad = 0 18 | assert x.ndim == 2 19 | x = np.pad( 20 | x, [[0, 0], [0, length - x.shape[1]]], 21 | mode='constant', 22 | constant_values=_pad) 23 | return x 24 | 25 | 26 | def prepare_tensor(inputs, out_steps): 27 | max_len = max((x.shape[1] for x in inputs)) + 1 # zero-frame 28 | remainder = max_len % out_steps 29 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len 30 | return np.stack([_pad_tensor(x, pad_len) for x in inputs]) 31 | 32 | 33 | def _pad_stop_target(x, length): 34 | _pad = 1. 35 | assert x.ndim == 1 36 | return np.pad( 37 | x, (0, length - x.shape[0]), mode='constant', constant_values=_pad) 38 | 39 | 40 | def prepare_stop_target(inputs, out_steps): 41 | max_len = max((x.shape[0] for x in inputs)) + 1 # zero-frame 42 | remainder = max_len % out_steps 43 | pad_len = max_len + (out_steps - remainder) if remainder > 0 else max_len 44 | return np.stack([_pad_stop_target(x, pad_len) for x in inputs]) 45 | 46 | 47 | def pad_per_step(inputs, pad_len): 48 | timesteps = inputs.shape[-1] 49 | return np.pad( 50 | inputs, [[0, 0], [0, 0], [0, pad_len]], 51 | mode='constant', 52 | constant_values=0.0) 53 | -------------------------------------------------------------------------------- /TTS/utils/text/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | from TTS.utils.text import cleaners 5 | from TTS.utils.text.symbols import symbols 6 | 7 | # Mappings from symbol to numeric ID and vice versa: 8 | _symbol_to_id = {s: i for i, s in enumerate(symbols)} 9 | _id_to_symbol = {i: s for i, s in enumerate(symbols)} 10 | 11 | # Regular expression matching text enclosed in curly braces: 12 | _curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)') 13 | 14 | 15 | def text_to_sequence(text, cleaner_names): 16 | '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. 17 | 18 | The text can optionally have ARPAbet sequences enclosed in curly braces embedded 19 | in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." 20 | 21 | Args: 22 | text: string to convert to a sequence 23 | cleaner_names: names of the cleaner functions to run the text through 24 | 25 | Returns: 26 | List of integers corresponding to the symbols in the text 27 | ''' 28 | sequence = [] 29 | 30 | # Check for curly braces and treat their contents as ARPAbet: 31 | while len(text): 32 | m = _curly_re.match(text) 33 | if not m: 34 | sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) 35 | break 36 | sequence += _symbols_to_sequence( 37 | _clean_text(m.group(1), cleaner_names)) 38 | sequence += _arpabet_to_sequence(m.group(2)) 39 | text = m.group(3) 40 | 41 | # Append EOS token 42 | sequence.append(_symbol_to_id['~']) 43 | return sequence 44 | 45 | 46 | def sequence_to_text(sequence): 47 | '''Converts a sequence of IDs back to a string''' 48 | result = '' 49 | for symbol_id in sequence: 50 | if symbol_id in _id_to_symbol: 51 | s = _id_to_symbol[symbol_id] 52 | # Enclose ARPAbet back in curly braces: 53 | if len(s) > 1 and s[0] == '@': 54 | s = '{%s}' % s[1:] 55 | result += s 56 | return result.replace('}{', ' ') 57 | 58 | 59 | def _clean_text(text, cleaner_names): 60 | for name in cleaner_names: 61 | cleaner = getattr(cleaners, name) 62 | if not cleaner: 63 | raise Exception('Unknown cleaner: %s' % name) 64 | text = cleaner(text) 65 | return text 66 | 67 | 68 | def _symbols_to_sequence(symbols): 69 | return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] 70 | 71 | 72 | def _arpabet_to_sequence(text): 73 | return _symbols_to_sequence(['@' + s for s in text.split()]) 74 | 75 | 76 | def _should_keep_symbol(s): 77 | return s in _symbol_to_id and s is not '_' and s is not '~' 78 | -------------------------------------------------------------------------------- /TTS/utils/text/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/TTS/utils/text/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /TTS/utils/text/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/TTS/utils/text/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /TTS/utils/text/__pycache__/cleaners.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/TTS/utils/text/__pycache__/cleaners.cpython-37.pyc -------------------------------------------------------------------------------- /TTS/utils/text/__pycache__/cmudict.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/TTS/utils/text/__pycache__/cmudict.cpython-37.pyc -------------------------------------------------------------------------------- /TTS/utils/text/__pycache__/numbers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/TTS/utils/text/__pycache__/numbers.cpython-37.pyc -------------------------------------------------------------------------------- /TTS/utils/text/__pycache__/symbols.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JRC1995/Chatbot/83038b7af579bf38a595fde4eb302c254c27198a/TTS/utils/text/__pycache__/symbols.cpython-37.pyc -------------------------------------------------------------------------------- /TTS/utils/text/cleaners.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Cleaners are transformations that run over the input text at both training and eval time. 3 | 4 | Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" 5 | hyperparameter. Some cleaners are English-specific. You'll typically want to use: 6 | 1. "english_cleaners" for English text 7 | 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using 8 | the Unidecode library (https://pypi.python.org/pypi/Unidecode) 9 | 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update 10 | the symbols in symbols.py to match your data). 11 | ''' 12 | 13 | import re 14 | from unidecode import unidecode 15 | from .numbers import normalize_numbers 16 | 17 | # Regular expression matching whitespace: 18 | _whitespace_re = re.compile(r'\s+') 19 | 20 | # List of (regular expression, replacement) pairs for abbreviations: 21 | _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) 22 | for x in [ 23 | ('mrs', 'misess'), 24 | ('mr', 'mister'), 25 | ('dr', 'doctor'), 26 | ('st', 'saint'), 27 | ('co', 'company'), 28 | ('jr', 'junior'), 29 | ('maj', 'major'), 30 | ('gen', 'general'), 31 | ('drs', 'doctors'), 32 | ('rev', 'reverend'), 33 | ('lt', 'lieutenant'), 34 | ('hon', 'honorable'), 35 | ('sgt', 'sergeant'), 36 | ('capt', 'captain'), 37 | ('esq', 'esquire'), 38 | ('ltd', 'limited'), 39 | ('col', 'colonel'), 40 | ('ft', 'fort'), 41 | ]] 42 | 43 | 44 | def expand_abbreviations(text): 45 | for regex, replacement in _abbreviations: 46 | text = re.sub(regex, replacement, text) 47 | return text 48 | 49 | 50 | def expand_numbers(text): 51 | return normalize_numbers(text) 52 | 53 | 54 | def lowercase(text): 55 | return text.lower() 56 | 57 | 58 | def collapse_whitespace(text): 59 | return re.sub(_whitespace_re, ' ', text) 60 | 61 | 62 | def convert_to_ascii(text): 63 | return unidecode(text) 64 | 65 | 66 | def basic_cleaners(text): 67 | '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' 68 | text = lowercase(text) 69 | text = collapse_whitespace(text) 70 | return text 71 | 72 | 73 | def transliteration_cleaners(text): 74 | '''Pipeline for non-English text that transliterates to ASCII.''' 75 | text = convert_to_ascii(text) 76 | text = lowercase(text) 77 | text = collapse_whitespace(text) 78 | return text 79 | 80 | 81 | def english_cleaners(text): 82 | '''Pipeline for English text, including number and abbreviation expansion.''' 83 | text = convert_to_ascii(text) 84 | text = lowercase(text) 85 | text = expand_numbers(text) 86 | text = expand_abbreviations(text) 87 | text = collapse_whitespace(text) 88 | return text 89 | -------------------------------------------------------------------------------- /TTS/utils/text/cmudict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | valid_symbols = [ 6 | 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 7 | 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 8 | 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 9 | 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 10 | 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 11 | 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 12 | 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 13 | 'Y', 'Z', 'ZH' 14 | ] 15 | 16 | _valid_symbol_set = set(valid_symbols) 17 | 18 | 19 | class CMUDict: 20 | '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' 21 | 22 | def __init__(self, file_or_path, keep_ambiguous=True): 23 | if isinstance(file_or_path, str): 24 | with open(file_or_path, encoding='latin-1') as f: 25 | entries = _parse_cmudict(f) 26 | else: 27 | entries = _parse_cmudict(file_or_path) 28 | if not keep_ambiguous: 29 | entries = { 30 | word: pron 31 | for word, pron in entries.items() if len(pron) == 1 32 | } 33 | self._entries = entries 34 | 35 | def __len__(self): 36 | return len(self._entries) 37 | 38 | def lookup(self, word): 39 | '''Returns list of ARPAbet pronunciations of the given word.''' 40 | return self._entries.get(word.upper()) 41 | 42 | 43 | _alt_re = re.compile(r'\([0-9]+\)') 44 | 45 | 46 | def _parse_cmudict(file): 47 | cmudict = {} 48 | for line in file: 49 | if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): 50 | parts = line.split(' ') 51 | word = re.sub(_alt_re, '', parts[0]) 52 | pronunciation = _get_pronunciation(parts[1]) 53 | if pronunciation: 54 | if word in cmudict: 55 | cmudict[word].append(pronunciation) 56 | else: 57 | cmudict[word] = [pronunciation] 58 | return cmudict 59 | 60 | 61 | def _get_pronunciation(s): 62 | parts = s.strip().split(' ') 63 | for part in parts: 64 | if part not in _valid_symbol_set: 65 | return None 66 | return ' '.join(parts) 67 | -------------------------------------------------------------------------------- /TTS/utils/text/numbers.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])') 4 | _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)') 5 | _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)') 6 | _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)') 7 | _ordinal_re = re.compile(r'([0-9]+)(st|nd|rd|th)') 8 | _number_re = re.compile(r'[0-9]+') 9 | 10 | _units = [ 11 | '', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 12 | 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 13 | 'seventeen', 'eighteen', 'nineteen' 14 | ] 15 | 16 | _tens = [ 17 | '', 18 | 'ten', 19 | 'twenty', 20 | 'thirty', 21 | 'forty', 22 | 'fifty', 23 | 'sixty', 24 | 'seventy', 25 | 'eighty', 26 | 'ninety', 27 | ] 28 | 29 | _digit_groups = [ 30 | '', 31 | 'thousand', 32 | 'million', 33 | 'billion', 34 | 'trillion', 35 | 'quadrillion', 36 | ] 37 | 38 | _ordinal_suffixes = [ 39 | ('one', 'first'), 40 | ('two', 'second'), 41 | ('three', 'third'), 42 | ('five', 'fifth'), 43 | ('eight', 'eighth'), 44 | ('nine', 'ninth'), 45 | ('twelve', 'twelfth'), 46 | ('ty', 'tieth'), 47 | ] 48 | 49 | 50 | def _remove_commas(m): 51 | return m.group(1).replace(',', '') 52 | 53 | 54 | def _expand_decimal_point(m): 55 | return m.group(1).replace('.', ' point ') 56 | 57 | 58 | def _expand_dollars(m): 59 | match = m.group(1) 60 | parts = match.split('.') 61 | if len(parts) > 2: 62 | return match + ' dollars' # Unexpected format 63 | dollars = int(parts[0]) if parts[0] else 0 64 | cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 65 | if dollars and cents: 66 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 67 | cent_unit = 'cent' if cents == 1 else 'cents' 68 | return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit) 69 | elif dollars: 70 | dollar_unit = 'dollar' if dollars == 1 else 'dollars' 71 | return '%s %s' % (dollars, dollar_unit) 72 | elif cents: 73 | cent_unit = 'cent' if cents == 1 else 'cents' 74 | return '%s %s' % (cents, cent_unit) 75 | else: 76 | return 'zero dollars' 77 | 78 | 79 | def _standard_number_to_words(n, digit_group): 80 | parts = [] 81 | if n >= 1000: 82 | # Format next higher digit group. 83 | parts.append(_standard_number_to_words(n // 1000, digit_group + 1)) 84 | n = n % 1000 85 | 86 | if n >= 100: 87 | parts.append('%s hundred' % _units[n // 100]) 88 | if n % 100 >= len(_units): 89 | parts.append(_tens[(n % 100) // 10]) 90 | parts.append(_units[(n % 100) % 10]) 91 | else: 92 | parts.append(_units[n % 100]) 93 | if n > 0: 94 | parts.append(_digit_groups[digit_group]) 95 | return ' '.join([x for x in parts if x]) 96 | 97 | 98 | def _number_to_words(n): 99 | # Handle special cases first, then go to the standard case: 100 | if n >= 1000000000000000000: 101 | return str(n) # Too large, just return the digits 102 | elif n == 0: 103 | return 'zero' 104 | elif n % 100 == 0 and n % 1000 != 0 and n < 3000: 105 | return _standard_number_to_words(n // 100, 0) + ' hundred' 106 | else: 107 | return _standard_number_to_words(n, 0) 108 | 109 | 110 | def _expand_number(m): 111 | return _number_to_words(int(m.group(0))) 112 | 113 | 114 | def _expand_ordinal(m): 115 | num = _number_to_words(int(m.group(1))) 116 | for suffix, replacement in _ordinal_suffixes: 117 | if num.endswith(suffix): 118 | return num[:-len(suffix)] + replacement 119 | return num + 'th' 120 | 121 | 122 | def normalize_numbers(text): 123 | text = re.sub(_comma_number_re, _remove_commas, text) 124 | text = re.sub(_pounds_re, r'\1 pounds', text) 125 | text = re.sub(_dollars_re, _expand_dollars, text) 126 | text = re.sub(_decimal_number_re, _expand_decimal_point, text) 127 | text = re.sub(_ordinal_re, _expand_ordinal, text) 128 | text = re.sub(_number_re, _expand_number, text) 129 | return text 130 | -------------------------------------------------------------------------------- /TTS/utils/text/symbols.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | Defines the set of symbols used in text input to the model. 4 | 5 | The default is a set of ASCII characters that works well for English or text that has been run 6 | through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. 7 | ''' 8 | from TTS.utils.text import cmudict 9 | 10 | _pad = '_' 11 | _eos = '~' 12 | _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' 13 | 14 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 15 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 16 | 17 | # Export all symbols: 18 | symbols = [_pad, _eos] + list(_characters) + _arpabet 19 | 20 | if __name__ == '__main__': 21 | print(symbols) 22 | -------------------------------------------------------------------------------- /TTS/utils/visual.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('Agg') 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def plot_alignment(alignment, info=None): 8 | fig, ax = plt.subplots(figsize=(16, 10)) 9 | im = ax.imshow( 10 | alignment.T, aspect='auto', origin='lower', interpolation='none') 11 | fig.colorbar(im, ax=ax) 12 | xlabel = 'Decoder timestep' 13 | if info is not None: 14 | xlabel += '\n\n' + info 15 | plt.xlabel(xlabel) 16 | plt.ylabel('Encoder timestep') 17 | plt.tight_layout() 18 | return fig 19 | 20 | 21 | def plot_spectrogram(linear_output, audio): 22 | spectrogram = audio._denormalize(linear_output) 23 | fig = plt.figure(figsize=(16, 10)) 24 | plt.imshow(spectrogram.T, aspect="auto", origin="lower") 25 | plt.colorbar() 26 | plt.tight_layout() 27 | return fig 28 | -------------------------------------------------------------------------------- /TTS/version.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1' 2 | -------------------------------------------------------------------------------- /chatbot.yml: -------------------------------------------------------------------------------- 1 | name: chatbot 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - attrs=19.3.0=py_0 8 | - backcall=0.1.0=py37_0 9 | - blas=1.0=mkl 10 | - bleach=3.1.0=py_0 11 | - ca-certificates=2019.11.27=0 12 | - certifi=2019.11.28=py37_0 13 | - cffi=1.13.2=py37h2e261b9_0 14 | - cudatoolkit=10.0.130=0 15 | - dbus=1.13.12=h746ee38_0 16 | - decorator=4.4.1=py_0 17 | - defusedxml=0.6.0=py_0 18 | - entrypoints=0.3=py37_0 19 | - expat=2.2.6=he6710b0_0 20 | - faiss-gpu=1.6.0=py37h1a5d453_0 21 | - fontconfig=2.13.0=h9420a91_0 22 | - freetype=2.9.1=h8a8886c_1 23 | - glib=2.63.1=h5a9c865_0 24 | - gmp=6.1.2=h6c8ec71_1 25 | - gst-plugins-base=1.14.0=hbbd80ab_1 26 | - gstreamer=1.14.0=hb453b48_1 27 | - icu=58.2=h9c2bf20_1 28 | - importlib_metadata=0.23=py37_0 29 | - intel-openmp=2019.4=243 30 | - ipykernel=5.1.3=py37h39e3cac_0 31 | - ipython=7.9.0=py37h39e3cac_0 32 | - ipython_genutils=0.2.0=py37_0 33 | - ipywidgets=7.5.1=py_0 34 | - jedi=0.15.1=py37_0 35 | - jinja2=2.10.3=py_0 36 | - jpeg=9b=h024ee3a_2 37 | - jsonschema=3.2.0=py37_0 38 | - jupyter=1.0.0=py37_7 39 | - jupyter_client=5.3.4=py37_0 40 | - jupyter_console=6.0.0=py37_0 41 | - jupyter_core=4.6.1=py37_0 42 | - libedit=3.1.20181209=hc058e9b_0 43 | - libffi=3.2.1=hd88cf55_4 44 | - libgcc=7.2.0=h69d50b8_2 45 | - libgcc-ng=9.1.0=hdf63c60_0 46 | - libgfortran-ng=7.3.0=hdf63c60_0 47 | - libpng=1.6.37=hbc83047_0 48 | - libsodium=1.0.16=h1bed415_0 49 | - libstdcxx-ng=9.1.0=hdf63c60_0 50 | - libtiff=4.1.0=h2733197_0 51 | - libuuid=1.0.3=h1bed415_2 52 | - libxcb=1.13=h1bed415_1 53 | - libxml2=2.9.9=hea5a465_1 54 | - markupsafe=1.1.1=py37h7b6447c_0 55 | - mistune=0.8.4=py37h7b6447c_0 56 | - mkl=2019.4=243 57 | - mkl-service=2.3.0=py37he904b0f_0 58 | - mkl_fft=1.0.15=py37ha843d7b_0 59 | - mkl_random=1.1.0=py37hd6b4f25_0 60 | - more-itertools=7.2.0=py37_0 61 | - nbconvert=5.6.1=py37_0 62 | - nbformat=4.4.0=py37_0 63 | - ncurses=6.1=he6710b0_1 64 | - ninja=1.9.0=py37hfd86e86_0 65 | - notebook=6.0.2=py37_0 66 | - numpy-base=1.17.4=py37hde5b4d6_0 67 | - olefile=0.46=py_0 68 | - openssl=1.1.1d=h7b6447c_3 69 | - pandoc=2.2.3.2=0 70 | - pandocfilters=1.4.2=py37_1 71 | - parso=0.5.1=py_0 72 | - pcre=8.43=he6710b0_0 73 | - pickleshare=0.7.5=py37_0 74 | - pillow=6.2.1=py37h34e0f95_0 75 | - pip=19.3.1=py37_0 76 | - prometheus_client=0.7.1=py_0 77 | - prompt_toolkit=2.0.10=py_0 78 | - ptyprocess=0.6.0=py37_0 79 | - pycparser=2.19=py_0 80 | - pygments=2.4.2=py_0 81 | - pyqt=5.9.2=py37h05f1152_2 82 | - pyrsistent=0.15.6=py37h7b6447c_0 83 | - python=3.7.5=h0371630_0 84 | - pytorch=1.3.1=py3.7_cuda10.0.130_cudnn7.6.3_0 85 | - pyzmq=18.1.0=py37he6710b0_0 86 | - qt=5.9.7=h5867ecd_1 87 | - qtconsole=4.6.0=py_0 88 | - readline=7.0=h7b6447c_5 89 | - send2trash=1.5.0=py37_0 90 | - sip=4.19.8=py37hf484d3e_0 91 | - six=1.13.0=py37_0 92 | - sqlite=3.30.1=h7b6447c_0 93 | - terminado=0.8.3=py37_0 94 | - testpath=0.4.4=py_0 95 | - tk=8.6.8=hbc83047_0 96 | - torchvision=0.4.2=py37_cu100 97 | - tornado=6.0.3=py37h7b6447c_0 98 | - traitlets=4.3.3=py37_0 99 | - wcwidth=0.1.7=py37_0 100 | - webencodings=0.5.1=py37_1 101 | - wheel=0.33.6=py37_0 102 | - widgetsnbextension=3.5.1=py37_0 103 | - xz=5.2.4=h14c3975_4 104 | - zeromq=4.3.1=he6710b0_3 105 | - zipp=0.6.0=py_0 106 | - zlib=1.2.11=h7b6447c_3 107 | - zstd=1.3.7=h0b5b093_0 108 | - pip: 109 | - absl-py==0.8.1 110 | - astor==0.8.0 111 | - audioread==2.1.8 112 | - autopep8==1.4.4 113 | - bidict==0.18.3 114 | - blis==0.4.1 115 | - boto==2.49.0 116 | - boto3==1.10.28 117 | - botocore==1.13.28 118 | - cachetools==3.1.1 119 | - catalogue==0.2.0 120 | - chardet==3.0.4 121 | - click==7.0 122 | - clldutils==3.5.0 123 | - colorlog==4.0.2 124 | - csvw==1.6.0 125 | - cycler==0.10.0 126 | - cymem==2.0.3 127 | - docutils==0.15.2 128 | - elasticsearch==7.1.0 129 | - en-core-web-lg==2.2.5 130 | - en-core-web-sm==2.2.5 131 | - ffmpeg-python==0.2.0 132 | - flask==1.1.1 133 | - future==0.18.2 134 | - gast==0.2.2 135 | - gensim==3.8.1 136 | - glog==0.3.1 137 | - google-auth==1.7.1 138 | - google-auth-oauthlib==0.4.1 139 | - grpcio==1.25.0 140 | - idna==2.8 141 | - isodate==0.6.0 142 | - itsdangerous==1.1.0 143 | - jmespath==0.9.4 144 | - joblib==0.14.0 145 | - kiwisolver==1.1.0 146 | - librosa==0.7.1 147 | - llvmlite==0.30.0 148 | - matplotlib==3.1.2 149 | - murmurhash==1.0.2 150 | - nltk==3.4.5 151 | - numba==0.46.0 152 | - numpy==1.17.4 153 | - oauthlib==3.1.0 154 | - opt-einsum==3.1.0 155 | - pexpect==4.2.1 156 | - phonemizer==2.0.1 157 | - plac==1.1.3 158 | - preshed==3.0.2 159 | - prettytable==0.7.2 160 | - protobuf==3.11.0 161 | - py-espeak-ng==0.1.8 162 | - pyaml==19.12.0 163 | - pyasn1==0.4.8 164 | - pyasn1-modules==0.2.7 165 | - pycodestyle==2.5.0 166 | - pydub==0.23.1 167 | - pyparsing==2.4.5 168 | - python-dateutil==2.8.0 169 | - python-gflags==3.1.2 170 | - pytorch-transformers==1.1.0 171 | - pyyaml==5.2 172 | - regex==2019.11.1 173 | - requests==2.22.0 174 | - requests-oauthlib==1.3.0 175 | - resampy==0.2.2 176 | - rfc3986==1.3.2 177 | - rsa==4.0 178 | - s3transfer==0.2.1 179 | - sacremoses==0.0.35 180 | - scikit-learn==0.21.3 181 | - scipy==1.3.3 182 | - segments==2.1.3 183 | - sentencepiece==0.1.83 184 | - setuptools==42.0.2 185 | - simpleaudio==1.0.4 186 | - smart-open==1.9.0 187 | - soundfile==0.10.3.post1 188 | - spacy==2.2.3 189 | - srsly==0.2.0 190 | - tabulate==0.8.6 191 | - tensorboard==2.0.2 192 | - tensorboardx==1.9 193 | - tensorflow==2.0.0 194 | - tensorflow-estimator==2.0.1 195 | - tensorflow-gpu==2.0.0 196 | - tensorflow-hub==0.7.0 197 | - tensorflow-text==2.0.1 198 | - thinc==7.3.1 199 | - tqdm==4.39.0 200 | - transformers==2.2.0 201 | - unidecode==1.1.1 202 | - uritemplate==3.0.0 203 | - urllib3==1.25.7 204 | - wasabi==0.5.0 205 | prefix: /home/jishnu/miniconda3/envs/chatbot 206 | 207 | --------------------------------------------------------------------------------