├── __init__.py
├── config.py
├── .floydexpt
├── save
│   ├── training_data
│   │   └── instructions.txt
│   └── model
│       └── instructions.txt
├── __pycache__
│   ├── audio.cpython-36.pyc
│   ├── load.cpython-36.pyc
│   ├── model.cpython-36.pyc
│   ├── train.cpython-36.pyc
│   ├── config.cpython-36.pyc
│   ├── evaluate.cpython-36.pyc
│   └── pairlist.cpython-36.pyc
├── .ipynb_checkpoints
│   ├── Untitled1-checkpoint.ipynb
│   ├── Untitled2-checkpoint.ipynb
│   └── Audiotesting-checkpoint.ipynb
├── .floydignore
├── data
│   └── instruction.txt
├── pairlist.py
├── .gitignore
├── floyd.yml
├── command.txt
├── LICENSE
├── audio.py
├── Data Preprocessing
│   ├── createCORPUS.py
│   ├── createDB.py
│   └── CreateVocabularyNPair.py
├── README.md
├── load.py
├── main.py
├── temp.txt
├── model.py
├── train.py
├── evaluate.py
└── Audiotesting.ipynb

/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | MAX_LENGTH = 15
2 | teacher_forcing_ratio = 1.0
3 | save_dir = './save'
4 |
--------------------------------------------------------------------------------
/.floydexpt:
--------------------------------------------------------------------------------
1 | {"family_id": "prj_W674BZVpgc8xejVF", "namespace": "aryanc55", "name": "chatbot"}
--------------------------------------------------------------------------------
/save/training_data/instructions.txt:
--------------------------------------------------------------------------------
1 | Training data is saved here. It includes the pairs, the vocabulary, and the batches used for training.
--------------------------------------------------------------------------------
/__pycache__/audio.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataSenseiAryan/TS3000_TheChatBOT/HEAD/__pycache__/audio.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/load.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataSenseiAryan/TS3000_TheChatBOT/HEAD/__pycache__/load.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataSenseiAryan/TS3000_TheChatBOT/HEAD/__pycache__/model.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/train.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataSenseiAryan/TS3000_TheChatBOT/HEAD/__pycache__/train.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataSenseiAryan/TS3000_TheChatBOT/HEAD/__pycache__/config.cpython-36.pyc
--------------------------------------------------------------------------------
/.ipynb_checkpoints/Untitled1-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/Untitled2-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/__pycache__/evaluate.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataSenseiAryan/TS3000_TheChatBOT/HEAD/__pycache__/evaluate.cpython-36.pyc
--------------------------------------------------------------------------------
/__pycache__/pairlist.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataSenseiAryan/TS3000_TheChatBOT/HEAD/__pycache__/pairlist.cpython-36.pyc
--------------------------------------------------------------------------------
/save/model/instructions.txt:
--------------------------------------------------------------------------------
1 | Trained models are saved here, in folders named after their network layout (layers and hidden size), with .tar files inside named after the iteration at which they were saved.
--------------------------------------------------------------------------------
/.floydignore:
--------------------------------------------------------------------------------
1 |
2 | # Directories and files to ignore when uploading code to floyd
3 |
4 | .git
5 | .eggs
6 | eggs
7 | lib
8 | lib64
9 | parts
10 | sdist
11 | var
12 | *.pyc
13 | *.swp
14 | .DS_Store
--------------------------------------------------------------------------------
/data/instruction.txt:
--------------------------------------------------------------------------------
1 | Move your generated text corpus here.
2 | For example, I processed the text corpus 2011-08small.txt in the Data Preprocessing folder and moved it here for further training.
3 |
4 | 2011-08small.txt is not being ignored by git.
--------------------------------------------------------------------------------
/pairlist.py:
--------------------------------------------------------------------------------
1 | # This contains some predefined pair outputs.
2 | def fixedpair(inp):
3 |     if inp == "who?":
4 |         pair = "I am TS3000."
5 |     elif inp == "who ?":
6 |         pair = "I m bitch"
7 |     else:
8 |         pair = "I am TS3000."  # default, so 'pair' is always bound
9 |
10 |     return pair
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # ignore folder Data Preprocessing
2 | #Data Preprocessing
3 | Data\ Preprocessing/2011-08.db
4 | Data\ Preprocessing/2011-08.txt
5 | Data\ Preprocessing/2013-09.db
6 |
7 | # 2013-09small.txt , instruction.txt is visible
8 |
9 | data/movie_subtitles.txt
10 | data/2013-09.txt
11 | data/2011-08.txt
12 |
13 |
14 | #save/
15 |
16 | save/model/2013-09small/1-1_512/*
17 |
18 | save/model/movie_subtitles/1-1_512/*
19 |
20 | save/training_data/2013-09small/*
21 |
22 | save/training_data/movie_subtitles/*
23 |
--------------------------------------------------------------------------------
/floyd.yml:
--------------------------------------------------------------------------------
1 | # see: https://docs.floydhub.com/floyd_config
2 | # All supported configs:
3 | #
4 | #machine: cpu
5 | #env: tensorflow-1.8
6 | #input:
7 | #  - destination: input
8 | #    source: foo/datasets/yelp-food/1
9 | #  - foo/datasets/yelp-food-test/1:test
10 | #description: this is a test
11 | #max_runtime: 3600
12 | #command: python train.py
13 |
14 | # You can also define multiple tasks to use with --task argument:
15 | #
16 | #task:
17 | #  evaluate:
18 | #    machine: gpu
19 | #    command: python evaluate.py
20 | #
21 | #  serve:
22 | #    machine: cpu
23 | #    mode: serve
24 |
--------------------------------------------------------------------------------
/command.txt:
--------------------------------------------------------------------------------
1 | #### testing interactive mode
2 |
3 | python3 main.py -te save/model/2013-09small/1-1_512/3000_backup_bidir_model.tar -c data/2013-09small.txt -i
4 |
5 | # in voice command mode
6 | python3 main.py -te save/model/2013-09small/1-1_512/3000_backup_bidir_model.tar -c data/2013-09small.txt -i -v
7 |
8 |
9 | ### do training
10 | python3 main.py -tr data/2013-09small.txt -la 1 -hi 512 -lr 0.0001 -it 50000 -b 64 -p 500 -s 1000
11 |
12 | python main.py -tr data/2013-09small.txt -la 1 -hi 512 -lr 0.0001 -it 50000 -b 64 -p 500 -s 1000
13 |
14 | ### start training from where you left off
15 |
16 | python main.py -tr data/2013-09small.txt -l save/model/2013-09small/1-1_512/3000_backup_bidir_model.tar -lr 0.0001 -it 50000 -b 64 -p 500 -s 1000
17 |
18 | python3 main.py -tr data/2013-09small.txt -l save/model/2013-09small/1-1_512/3000_backup_bidir_model.tar -lr 0.0001 -it 50000 -b 64 -p 500 -s 1000
19 |
20 | ## for more options
21 |
22 | python3 main.py -h
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Aryan Chaudhary
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/audio.py:
--------------------------------------------------------------------------------
1 | import pyttsx3                   # pip install pyttsx3
2 | import speech_recognition as sr  # pip install speechRecognition
3 | #import datetime
4 | #import wikipedia #pip install wikipedia
5 | #import webbrowser
6 | #import os
7 | #import smtplib
8 |
9 | engine = pyttsx3.init()
10 | rate = engine.getProperty('rate')
11 | voices = engine.getProperty('voices')
12 | volume = engine.getProperty('volume')
13 |
14 | engine.setProperty('rate', 120)  # 120 words per minute
15 |
16 |
17 | def speak(audio):
18 |     engine.say(audio)
19 |     engine.runAndWait()
20 |
21 |
22 | def takeCommand():
23 |     # It takes microphone input from the user and returns string output
24 |     r = sr.Recognizer()
25 |     with sr.Microphone() as source:
26 |         print("Listening...")
27 |         r.pause_threshold = 1
28 |         audio = r.listen(source)
29 |
30 |     try:
31 |         print("Recognizing...")
32 |         query = r.recognize_google(audio, language='en-in')
33 |         print(f"User said: {query}\n")
34 |
35 |     except Exception as e:
36 |         # print(e)
37 |         print("Say that again please...")
38 |         return "None"
39 |     return query
--------------------------------------------------------------------------------
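A quick way to sanity-check the two helpers above is to wire them into a tiny echo loop. This is an illustrative sketch, not a file from the repository: it assumes pyttsx3 and SpeechRecognition are installed and that a working microphone and speakers are available.

# Hypothetical smoke test for audio.py (not part of the repo)
from audio import speak, takeCommand

if __name__ == '__main__':
    while True:
        heard = takeCommand().lower()   # returns the string "None" on recognition failure
        if heard in ('quit', 'none'):
            break
        speak('You said: ' + heard)     # echo the transcript back through TTS

Note that takeCommand() signals failure with the literal string "None", so the loop treats 'none' as an exit condition rather than retrying.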
/Data Preprocessing/createCORPUS.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import pandas as pd
3 |
4 | timeframes = ['2011-08']
5 |
6 | for timeframe in timeframes:
7 |     connection = sqlite3.connect('{}.db'.format(timeframe))
8 |     c = connection.cursor()
9 |     limit = 5000
10 |     last_unix = 0
11 |     cur_length = limit
12 |     counter = 0
13 |     test_done = False
14 |
15 |     while cur_length == limit:
16 |
17 |         df = pd.read_sql(
18 |             "SELECT * FROM parent_reply WHERE unix > {} and parent NOT NULL and score > 0 ORDER BY unix ASC LIMIT {}".format(last_unix, limit), connection)
19 |         last_unix = df.tail(1)['unix'].values[0]
20 |         cur_length = len(df)
21 |
22 |         if not test_done:
23 |             # the first chunk is written to the extensionless file 'data1';
24 |             # all later chunks go to 'data1.txt'
25 |             with open('data1', 'a', encoding='utf8') as f:
26 |                 for i in df.index:
27 |                     f.write(df.loc[i, 'parent'] + '\t' + df.loc[i, 'comment'] + '\n')
28 |
29 |             test_done = True
30 |
31 |         else:
32 |             with open('data1.txt', 'a', encoding='utf8') as f:
33 |                 for i in df.index:
34 |                     f.write(df.loc[i, 'parent'] + '\t' + df.loc[i, 'comment'] + '\n')
35 |
36 |         counter += 1
37 |         if counter % 20 == 0:
38 |             print(counter * limit, 'rows completed so far')
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # TS3000_TheChatBot
4 | Tony Stark 3000 - The Chat Bot. It's a very basic-level conversational AI.
5 |
6 | It's a social-networking chatbot trained on a Reddit dataset. It supports open-ended queries, developed on the concept of Neural Machine Translation. Beware: it can be sarcastic, like its creator :stuck_out_tongue_closed_eyes: :trollface: BTW, it uses the PyTorch framework and Python 3.
7 | ## Data Preprocessing :
8 | [Download Reddit Data From Here](http://files.pushshift.io/reddit/comments/)
9 |
10 | ### Follow These :
11 | - Put the downloaded data in the Data Preprocessing directory.
12 | - Unzip the .bz2 file: **bzip2 -dk filename.bz2**
13 |   * Install bzip2 on Ubuntu:
14 |     - sudo apt-get update
15 |     - sudo apt-get install bzip2
16 |   - Useful Links :
17 |     - [Installing Bzip2](https://www.techwalla.com/articles/how-to-install-bzip2-on-ubuntu)
18 |     - [Unzipping Error](https://superuser.com/questions/480950/how-to-decompress-a-bz2-file)
19 | - Run createDB.py
20 | > python3 createDB.py
21 |
22 |
23 | This will create a database from the raw JSON text file you unzipped earlier.
24 | - Run createCORPUS.py
25 | > python3 createCORPUS.py
26 |
27 |
28 | This will create the corpus. For example, I created 2011-08small.txt.
29 |
30 | - Move the created corpus to the data directory.
31 |
32 | ___
33 |
34 | ## Training Model :
35 |
36 | - Start training the model using this command :
37 | > python3 main.py -tr data/2013-09small.txt -lr 0.0001 -it 50000 -b 64 -p 500 -s 1000
38 |
39 |
40 | - To resume training from where you left off :
41 | > python3 main.py -tr data/2013-09small.txt -l save/model/2013-09small/1-1_512/3000_backup_bidir_model.tar -lr 0.0001 -it 50000 -b 64 -p 500 -s 1000
42 |
43 | ___
44 |
45 | ## Testing Model :
46 |
47 | - To test the model in interactive mode :
48 | > python3 main.py -te save/model/2013-09small/1-1_512/3000_backup_bidir_model.tar -c data/2013-09small.txt -i
49 |
50 | ___
51 |
52 | ## Acknowledgements :
53 |
54 | - [Pytorch-Tutorial-NMT](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)
55 | - [Pytorch-Tutorial-Chatbot](https://pytorch.org/tutorials/beginner/chatbot_tutorial.html)
56 | - [Python Tensorflow Chatbot](https://pythonprogramming.net/chatbot-deep-learning-python-tensorflow/)
57 | - [Sentdex Git-REPO](https://github.com/daniel-kukiela/nmt-chatbot)
58 | - [Reddit Data](http://files.pushshift.io/reddit/comments/)
59 | - [Great Coursera-SeqtoSeq Tutorial](https://www.coursera.org/learn/nlp-sequence-models)
60 |
61 | ___
62 | ## License :
63 | MIT License
64 |
65 | Copyright (c) 2019 Aryan Chaudhary
66 |
67 | [LICENSE](https://github.com/aryanc55/TS3000_TheChatBOT/blob/master/LICENSE)
68 |
69 |
70 | [![HitCount](http://hits.dwyl.io/aryanc55/https://github.com/aryanc55/TS3000_TheChatBOT.svg)](http://hits.dwyl.io/aryanc55/https://github.com/aryanc55/TS3000_TheChatBOT)
71 |
72 |
--------------------------------------------------------------------------------
/load.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import re
3 | import os
4 | import unicodedata
5 |
6 | from config import MAX_LENGTH, save_dir
7 |
8 | SOS_token = 0
9 | EOS_token = 1
10 | PAD_token = 2
11 |
12 | class Voc:
13 |     def __init__(self, name):
14 |         self.name = name
15 |         self.word2index = {}
16 |         self.word2count = {}
17 |         self.index2word = {0: "SOS", 1: "EOS", 2: "PAD"}
18 |         self.n_words = 3  # Count SOS, EOS, and PAD
19 |
20 |     def addSentence(self, sentence):
21 |         for word in sentence.split(' '):
22 |             self.addWord(word)
23 |
24 |     def addWord(self, word):
25 |         if word not in self.word2index:
26 |             self.word2index[word] = self.n_words
27 |             self.word2count[word] = 1
28 |             self.index2word[self.n_words] = word
29 |             self.n_words += 1
30 |         else:
31 |             self.word2count[word] += 1
32 |
33 | # Turn a Unicode string to plain ASCII, thanks to
34 | # http://stackoverflow.com/a/518232/2809427
35 | def unicodeToAscii(s):
36 |     return ''.join(
37 |         c for c in unicodedata.normalize('NFD', s)
38 |         if unicodedata.category(c) != 'Mn'
39 |     )
40 |
41 | # Lowercase, trim, and remove non-letter characters
42 | def normalizeString(s):
43 |     s = unicodeToAscii(s.lower().strip())
44 |     s = re.sub(r"([.!?])", r" \1", s)
45 |     s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
46 |     s = re.sub(r"\s+", r" ", s).strip()
47 |     return s
48 |
49 | def readVocs(corpus, corpus_name):
50 |     print("Reading lines...")
51 |
52 |     # combine every two lines into pairs and normalize
53 |     with open(corpus) as f:
54 |         content = f.readlines()
55 |     # import gzip
56 |     # content = gzip.open(corpus, 'rt')
57 |     lines = [x.strip() for x in content]
58 |     it = iter(lines)
59 |     # pairs = [[normalizeString(x), normalizeString(next(it))] for x in it]
60 |     pairs = [[x, next(it)] for x in it]
61 |
62 |     voc = Voc(corpus_name)
63 |     return voc, pairs
64 |
65 | def filterPair(p):
66 |     # input sequences need to preserve the last word for EOS_token
67 |     return len(p[0].split(' ')) < MAX_LENGTH and \
68 |         len(p[1].split(' ')) < MAX_LENGTH
69 |
70 | def filterPairs(pairs):
71 |     return [pair for pair in pairs if filterPair(pair)]
72 |
73 | def prepareData(corpus, corpus_name):
74 |     voc, pairs = readVocs(corpus, corpus_name)
75 |     print("Read {!s} sentence pairs".format(len(pairs)))
76 |     pairs = filterPairs(pairs)
77 |     print("Trimmed to {!s} sentence pairs".format(len(pairs)))
78 |     print("Counting words...")
79 |     for pair in pairs:
80 |         voc.addSentence(pair[0])
81 |         voc.addSentence(pair[1])
82 |     print("Counted words:", voc.n_words)
83 |     directory = os.path.join(save_dir, 'training_data', corpus_name)
84 |     if not os.path.exists(directory):
85 |         os.makedirs(directory)
86 |     torch.save(voc, os.path.join(directory, '{!s}.tar'.format('voc')))
87 |     torch.save(pairs, os.path.join(directory, '{!s}.tar'.format('pairs')))
88 |     return voc, pairs
89 |
90 | def loadPrepareData(corpus):
91 |     corpus_name = corpus.split('/')[-1].split('.')[0]
92 |     try:
93 |         print("Start loading training data ...")
94 |         voc = torch.load(os.path.join(save_dir, 'training_data', corpus_name, 'voc.tar'))
95 |         pairs = torch.load(os.path.join(save_dir, 'training_data', corpus_name, 'pairs.tar'))
96 |     except FileNotFoundError:
97 |         print("Saved data not found, start preparing training data ...")
98 |         voc, pairs = prepareData(corpus, corpus_name)
99 |     return voc, pairs
100 |
--------------------------------------------------------------------------------
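To make the normalization and indexing pipeline above concrete, here is a hedged, standalone example (the input string is invented, and it assumes the script is run from the repo root so load.py is importable; indices 0-2 are reserved for SOS/EOS/PAD, so the first real word lands at index 3):

# Illustrative only -- not a file from the repo
from load import normalizeString, Voc

s = normalizeString("Héllo!!  How's it going???")
print(s)                         # -> "hello ! ! how s it going ? ? ?"

voc = Voc('demo')
voc.addSentence(s)
print(voc.word2index['hello'])   # -> 3   (0, 1, 2 are SOS, EOS, PAD)
print(voc.word2count['!'])       # -> 2   (punctuation becomes standalone tokens)

Accents are stripped by unicodeToAscii, punctuation is split off into its own tokens, and anything that is not a letter or .!? is collapsed to a space -- which is why the apostrophe in "How's" disappears.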
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from train import trainIters
3 | from evaluate import runTest
4 |
5 | def parse():
6 |     parser = argparse.ArgumentParser(description='Attention Seq2Seq Chatbot')
7 |     parser.add_argument('-tr', '--train', help='Train the model with corpus')
8 |     parser.add_argument('-te', '--test', help='Test the saved model')
9 |     parser.add_argument('-l', '--load', help='Load the model and train')
10 |     parser.add_argument('-c', '--corpus', help='Test the saved model with vocabulary of the corpus')
11 |     parser.add_argument('-r', '--reverse', action='store_true', help='Reverse the input sequence')
12 |     parser.add_argument('-f', '--filter', action='store_true', help='Filter to small training data set')
13 |     parser.add_argument('-i', '--input', action='store_true', help='Test the model by inputting sentences')
14 |     parser.add_argument('-it', '--iteration', type=int, default=10000, help='Number of iterations to train for')
15 |     parser.add_argument('-p', '--print', type=int, default=100, help='Print every p iterations')
16 |     parser.add_argument('-b', '--batch_size', type=int, default=64, help='Batch size')
17 |     parser.add_argument('-la', '--layer', type=int, default=1, help='Number of layers in encoder and decoder')
18 |     parser.add_argument('-hi', '--hidden', type=int, default=256, help='Hidden size in encoder and decoder')
19 |     parser.add_argument('-be', '--beam', type=int, default=1, help='Beam size for beam-search decoding')
20 |     parser.add_argument('-s', '--save', type=int, default=500, help='Save every s iterations')
21 |     parser.add_argument('-lr', '--learning_rate', type=float, default=0.01, help='Learning rate')
22 |     parser.add_argument('-d', '--dropout', type=float, default=0.1, help='Dropout probability for rnn and dropout layers')
23 |     parser.add_argument('-v', '--voice_mode', action='store_true', help='Interact with the bot in voice command mode')
24 |
25 |     args = parser.parse_args()
26 |     return args
27 |
28 | def parseFilename(filename, test=False):
29 |     filename = filename.split('/')
30 |     dataType = filename[-1][:-4]  # remove '.tar'
31 |     parse = dataType.split('_')
32 |     reverse = 'reverse' in parse
33 |     layers, hidden = filename[-2].split('_')
34 |     n_layers = int(layers.split('-')[0])
35 |     hidden_size = int(hidden)
36 |     return n_layers, hidden_size, reverse
37 |
38 | def run(args):
39 |     reverse, fil, n_iteration, print_every, save_every, learning_rate, \
40 |         n_layers, hidden_size, batch_size, beam_size, inp, dropout = \
41 |         args.reverse, args.filter, args.iteration, args.print, args.save, args.learning_rate, \
42 |         args.layer, args.hidden, args.batch_size, args.beam, args.input, args.dropout
43 |     if args.train and not args.load:
44 |         trainIters(args.train, reverse, n_iteration, learning_rate, batch_size,
45 |                    n_layers, hidden_size, print_every, save_every, dropout)
46 |     elif args.load:
47 |         n_layers, hidden_size, reverse = parseFilename(args.load)
48 |         trainIters(args.train, reverse, n_iteration, learning_rate, batch_size,
49 |                    n_layers, hidden_size, print_every, save_every, dropout, loadFilename=args.load)
50 |
51 |     elif args.test and args.voice_mode:
52 |         n_layers, hidden_size, reverse = parseFilename(args.test, True)
53 |         runTest(n_layers, hidden_size, reverse, args.test, beam_size, inp, args.corpus, audio=True)
54 |
55 |     elif args.test:
56 |         n_layers, hidden_size, reverse = parseFilename(args.test, True)
57 |         runTest(n_layers, hidden_size, reverse, args.test, beam_size, inp, args.corpus, audio=False)
58 |
59 | if __name__ == '__main__':
60 |     args = parse()
61 |     run(args)
62 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/Audiotesting-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "import pyttsx3\n",
10 |     "import speech_recognition as sr"
11 |    ]
12 |   },
13 |   {
14 |    "cell_type": "code",
15 |    "execution_count": 5,
16 |    "metadata": {},
17 |    "outputs": [],
18 |    "source": [
19 |     "engine = pyttsx3.init()\n",
20 |     "rate = engine.getProperty('rate')\n",
21 |     "voices = engine.getProperty('voices')\n",
22 |     "volume = engine.getProperty('volume') \n",
23 |     "\n",
24 |     "engine.setProperty('rate',120) #120 words per minute\n",
25 |     "#engine.setProperty('volume',0.9) # setting up volume level between 0 
and 1\n", 26 | "#engine.setProperty('voice', voices[0].id) #changing index, changes voices. 1 for female\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 6, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "engine.say('My current speaking rate is ' + str(rate))\n", 43 | "engine.runAndWait()\n", 44 | "engine.stop()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import pyttsx3\n", 54 | "engine = pyttsx3.init()\n", 55 | "engine.say(\"I will speak this text\")\n", 56 | "engine.runAndWait()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import pyttsx3 #pip install pyttsx3\n", 66 | "import speech_recognition as sr #pip install speechRecognition\n", 67 | "#import datetime\n", 68 | "#import wikipedia #pip install wikipedia\n", 69 | "#import webbrowser\n", 70 | "#import os\n", 71 | "#import smtplib\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "def speak(audio):\n", 76 | " engine.say(audio)\n", 77 | " engine.runAndWait()\n", 78 | "\n", 79 | "\n", 80 | "def takeCommand():\n", 81 | " #It takes microphone input from the user and returns string output\n", 82 | "\n", 83 | " r = sr.Recognizer()\n", 84 | " with sr.Microphone() as source:\n", 85 | " print(\"Listening...\")\n", 86 | " r.pause_threshold = 0.1\n", 87 | " audio = r.listen(source)\n", 88 | "\n", 89 | " try:\n", 90 | " print(\"Recognizing...\") \n", 91 | " query = r.recognize_google(audio, language='en-in')\n", 92 | " print(f\"User said: {query}\\n\")\n", 93 | "\n", 94 | " except Exception as e:\n", 95 | " # print(e) \n", 96 | " print(\"Say that again please...\") \n", 97 | " return \"None\"\n", 98 | " return query" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Listening...\n", 111 | "Recognizing...\n", 112 | "User said: hello hello hello hello hello hello hello hello\n", 113 | "\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "pair = takeCommand().lower()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.6.8" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 2 150 | } 151 | -------------------------------------------------------------------------------- /temp.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.7.1 2 | asn1crypto==0.24.0 3 | astor==0.8.0 4 | autobahn==18.12.1 5 | autopep8==1.3.4 6 | backports-abc==0.5 7 | backports.functools-lru-cache==1.5 8 | backports.shutil-get-terminal-size==1.0.0 9 | backports.ssl-match-hostname==3.5.0.1 10 | backports.weakref==1.0.post1 11 | beautifulsoup4==4.8.0 12 | bleach==1.5.0 13 | boto==2.49.0 14 | 
boto3==1.7.61 15 | botocore==1.10.61 16 | bs4==0.0.1 17 | bz2file==0.98 18 | certifi==2019.6.16 19 | chardet==3.0.4 20 | Click==7.0 21 | cloudpickle==0.5.3 22 | colorama==0.4.1 23 | configparser==3.5.0 24 | cryptography==2.1.4 25 | cycler==0.10.0 26 | cymem==1.31.2 27 | cytoolz==0.8.2 28 | dask==1.2.2 29 | decorator==4.3.0 30 | dill==0.2.8.2 31 | docker-py==1.10.3 32 | docker-pycreds==0.2.1 33 | docutils==0.14 34 | dtrx==7.1 35 | duplicity==0.7.17 36 | EasyProcess==0.2.3 37 | emoji==0.5.3 38 | en-core-web-md==2.0.0 39 | entrypoints==0.2.3 40 | enum34==1.1.6 41 | fasteners==0.12.0 42 | fastzbarlight==0.0.14 43 | ffmpeg-normalize==1.3.10 44 | Flask==1.0.2 45 | funcsigs==1.0.2 46 | functools32==3.2.3.post2 47 | future==0.16.0 48 | futures==3.2.0 49 | gast==0.2.2 50 | gensim==3.5.0 51 | google-pasta==0.1.7 52 | grpcio==1.21.1 53 | gTTS==2.0.3 54 | gTTS-token==1.1.3 55 | gym==0.9.5 56 | h5py==2.9.0 57 | html5lib==0.9999999 58 | idna==2.8 59 | IMDbPY==6.6 60 | ipaddress==1.0.17 61 | ipykernel==4.8.2 62 | ipython==5.7.0 63 | ipython-genutils==0.2.0 64 | ipywidgets==7.2.1 65 | itsdangerous==0.24 66 | Jinja2==2.10 67 | jmespath==0.9.3 68 | joblib==0.13.2 69 | jsonschema==2.6.0 70 | jupyter==1.0.0 71 | jupyter-client==5.2.3 72 | jupyter-console==5.2.0 73 | jupyter-core==4.4.0 74 | kafka-python==1.4.3 75 | Keras==2.2.4 76 | Keras-Applications==1.0.8 77 | Keras-Preprocessing==1.1.0 78 | keyring==10.6.0 79 | keyrings.alt==3.0 80 | kiwisolver==1.0.1 81 | lockfile==0.12.2 82 | lxml==4.3.2 83 | Markdown==3.1.1 84 | MarkupSafe==1.0 85 | matplotlib==2.2.2 86 | mistune==0.8.3 87 | mock==3.0.5 88 | monotonic==1.0 89 | MouseInfo==0.0.4 90 | msgpack-numpy==0.4.1 91 | msgpack-python==0.5.6 92 | mss==3.2.1 93 | murmurhash==0.28.0 94 | mysql-connector==2.1.6 95 | nbconvert==5.3.1 96 | nbformat==4.4.0 97 | networkx==2.1 98 | notebook==5.5.0 99 | numpy==1.16.4 100 | oauthlib==2.1.0 101 | opencv-python==3.4.1.15 102 | pandas==0.23.4 103 | pandocfilters==1.4.2 104 | pathlib==1.0.1 105 | pathlib2==2.3.2 106 | patsy==0.5.0 107 | pbr==4.2.0 108 | pep8==1.7.1 109 | pexpect==4.6.0 110 | pickleshare==0.7.4 111 | Pillow==5.2.0 112 | pip-autoremove==0.9.1 113 | plac==0.9.6 114 | preshed==1.0.0 115 | prompt-toolkit==1.0.15 116 | protobuf==3.8.0 117 | ptyprocess==0.6.0 118 | PyAudio==0.2.11 119 | PyAutoGUI==0.9.47 120 | pycodestyle==2.3.1 121 | pycrypto==2.6.1 122 | pygame==1.9.6 123 | PyGetWindow==0.0.7 124 | pyglet==1.3.2 125 | Pygments==2.2.0 126 | pygobject==3.26.1 127 | PyMsgBox==1.0.7 128 | pynput==1.4.2 129 | pyparsing==2.2.0 130 | pyperclip==1.7.0 131 | PyRect==0.1.4 132 | pyscreenshot==0.4.2 133 | PyScreeze==0.1.22 134 | PySocks==1.6.8 135 | python-dateutil==2.7.3 136 | python-twitter==3.4.2 137 | python-xlib==0.25 138 | pyttsx3==2.7 139 | PyTweening==1.0.3 140 | pytz==2018.5 141 | PyVirtualDisplay==0.2.1 142 | PyWavelets==0.5.2 143 | pyxdg==0.25 144 | PyYAML==5.1.1 145 | pyzmq==17.0.0 146 | qtconsole==4.3.1 147 | regex==2017.4.5 148 | requests==2.22.0 149 | requests-oauthlib==1.0.0 150 | s3transfer==0.1.13 151 | scandir==1.7 152 | scikit-image==0.14.0 153 | scikit-learn==0.19.1 154 | scipy==1.2.2 155 | seaborn==0.8.1 156 | SecretStorage==2.3.1 157 | selenium==3.13.0 158 | Send2Trash==1.5.0 159 | simplegeneric==0.8.1 160 | singledispatch==3.4.0.3 161 | six==1.12.0 162 | sklearn==0.0 163 | smart-open==1.6.0 164 | soupsieve==1.9.2 165 | spacy==2.0.11 166 | SpeechRecognition==3.8.1 167 | SQLAlchemy==1.3.0 168 | statsmodels==0.9.0 169 | subprocess32==3.5.2 170 | tensorboard==1.14.0 171 | tensorflow==1.4.1 172 | 
tensorflow-estimator==1.14.0rc1
173 | tensorflow-gpu==1.4.1
174 | tensorflow-tensorboard==0.4.0
175 | termcolor==1.1.0
176 | terminado==0.8.1
177 | testpath==0.3.1
178 | thinc==6.10.2
179 | toolz==0.9.0
180 | torch==0.4.0
181 | torchvision==0.2.1
182 | tornado==5.1
183 | tqdm==4.31.1
184 | traitlets==4.3.2
185 | tweepy==3.6.0
186 | txaio==18.8.1
187 | ujson==1.35
188 | urllib3==1.25.3
189 | wcwidth==0.1.7
190 | webencodings==0.5.1
191 | websocket-client==0.54.0
192 | Werkzeug==0.15.4
193 | widgetsnbextension==3.2.1
194 | wikipedia==1.4.0
195 | wrapt==1.11.2
196 | xgboost==0.80
197 | xlib==0.21
198 |
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | USE_CUDA = torch.cuda.is_available()
6 | device = torch.device("cuda" if USE_CUDA else "cpu")
7 |
8 | class EncoderRNN(nn.Module):
9 |     def __init__(self, input_size, hidden_size, embedding, n_layers=1, dropout=0):
10 |         super(EncoderRNN, self).__init__()
11 |         self.n_layers = n_layers
12 |         self.hidden_size = hidden_size
13 |         self.embedding = embedding
14 |
15 |         self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
16 |                           dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
17 |
18 |     def forward(self, input_seq, input_lengths, hidden=None):
19 |         embedded = self.embedding(input_seq)
20 |         packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
21 |         outputs, hidden = self.gru(packed, hidden)  # output: (seq_len, batch, hidden*n_dir)
22 |         outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
23 |         outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]  # Sum bidirectional outputs (seq_len, batch, hidden)
24 |         return outputs, hidden
25 |
26 | class Attn(nn.Module):
27 |     def __init__(self, method, hidden_size):
28 |         super(Attn, self).__init__()
29 |
30 |         self.method = method
31 |         self.hidden_size = hidden_size
32 |
33 |         if self.method == 'general':
34 |             self.attn = nn.Linear(self.hidden_size, hidden_size)
35 |
36 |         elif self.method == 'concat':
37 |             self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
38 |             self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))
39 |
40 |     def forward(self, hidden, encoder_outputs):
41 |         # hidden [1, 64, 512], encoder_outputs [14, 64, 512]
42 |         max_len = encoder_outputs.size(0)
43 |         batch_size = encoder_outputs.size(1)
44 |
45 |         # Create variable to store attention energies
46 |         attn_energies = torch.zeros(batch_size, max_len)  # B x S
47 |         attn_energies = attn_energies.to(device)
48 |
49 |         # For each batch of encoder outputs
50 |         for b in range(batch_size):
51 |             # Calculate energy for each encoder output
52 |             for i in range(max_len):
53 |                 attn_energies[b, i] = self.score(hidden[:, b], encoder_outputs[i, b].unsqueeze(0))
54 |
55 |         # Normalize energies to weights in range 0 to 1, resize to 1 x B x S
56 |         return F.softmax(attn_energies, dim=1).unsqueeze(1)
57 |
58 |     def score(self, hidden, encoder_output):
59 |         # hidden [1, 512], encoder_output [1, 512]
60 |         if self.method == 'dot':
61 |             energy = hidden.squeeze(0).dot(encoder_output.squeeze(0))
62 |             return energy
63 |
64 |         elif self.method == 'general':
65 |             energy = self.attn(encoder_output)
66 |             energy = hidden.squeeze(0).dot(energy.squeeze(0))
67 |             return energy
68 |
69 |         elif self.method == 'concat':
70 |             energy = self.attn(torch.cat((hidden, encoder_output), 1))
71 |             energy = 
self.v.squeeze(0).dot(energy.squeeze(0)) 72 | return energy 73 | 74 | class LuongAttnDecoderRNN(nn.Module): 75 | def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1): 76 | super(LuongAttnDecoderRNN, self).__init__() 77 | 78 | # Keep for reference 79 | self.attn_model = attn_model 80 | self.hidden_size = hidden_size 81 | self.output_size = output_size 82 | self.n_layers = n_layers 83 | self.dropout = dropout 84 | 85 | # Define layers 86 | self.embedding = embedding 87 | self.embedding_dropout = nn.Dropout(dropout) 88 | self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout)) 89 | self.concat = nn.Linear(hidden_size * 2, hidden_size) 90 | self.out = nn.Linear(hidden_size, output_size) 91 | 92 | # Choose attention model 93 | if attn_model != 'none': 94 | self.attn = Attn(attn_model, hidden_size) 95 | 96 | def forward(self, input_seq, last_hidden, encoder_outputs): 97 | # Note: we run this one step at a time 98 | 99 | # Get the embedding of the current input word (last output word) 100 | embedded = self.embedding(input_seq) 101 | embedded = self.embedding_dropout(embedded) #[1, 64, 512] 102 | if(embedded.size(0) != 1): 103 | raise ValueError('Decoder input sequence length should be 1') 104 | 105 | # Get current hidden state from input word and last hidden state 106 | rnn_output, hidden = self.gru(embedded, last_hidden) 107 | 108 | # Calculate attention from current RNN state and all encoder outputs; 109 | # apply to encoder outputs to get weighted average 110 | attn_weights = self.attn(rnn_output, encoder_outputs) #[64, 1, 14] 111 | # encoder_outputs [14, 64, 512] 112 | context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) #[64, 1, 512] 113 | 114 | # Attentional vector using the RNN hidden state and context vector 115 | # concatenated together (Luong eq. 5) 116 | rnn_output = rnn_output.squeeze(0) #[64, 512] 117 | context = context.squeeze(1) #[64, 512] 118 | concat_input = torch.cat((rnn_output, context), 1) #[64, 1024] 119 | concat_output = torch.tanh(self.concat(concat_input)) #[64, 512] 120 | 121 | # Finally predict next token (Luong eq. 
6, without softmax)
122 |         output = self.out(concat_output)  # [64, output_size]
123 |
124 |         # Return final output, hidden state, and attention weights (for visualization)
125 |         return output, hidden, attn_weights
126 |
--------------------------------------------------------------------------------
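As a quick, hedged sanity check on the shapes claimed in the comments above: nothing below is part of the repo; the dimensions simply mirror the [14, 64, 512] annotations, and on a CUDA machine the random tensors would need to be moved to `device` first.

# Illustrative shape check for Attn in 'dot' mode (CPU assumed)
import torch
from model import Attn

seq_len, batch, hidden_size = 14, 64, 512
attn = Attn('dot', hidden_size)
decoder_state = torch.randn(1, batch, hidden_size)          # current decoder hidden state
encoder_outputs = torch.randn(seq_len, batch, hidden_size)

weights = attn(decoder_state, encoder_outputs)
print(weights.shape)           # torch.Size([64, 1, 14]) -- one distribution per batch row
print(weights.sum(dim=2)[0])   # tensor([1.]) -- softmax over the 14 source positions

The [64, 1, 14] layout is what LuongAttnDecoderRNN's `attn_weights.bmm(encoder_outputs.transpose(0, 1))` expects when forming the context vector.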
/Data Preprocessing/createDB.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import json
3 | from datetime import datetime
4 | import time
5 |
6 | timeframe = '2011-08'
7 |
8 | sql_transaction = []
9 | start_row = 0
10 | cleanup = 1000000
11 |
12 | connection = sqlite3.connect('{}.db'.format(timeframe))
13 | c = connection.cursor()
14 |
15 | def create_table():
16 |     c.execute("CREATE TABLE IF NOT EXISTS parent_reply(parent_id TEXT PRIMARY KEY, comment_id TEXT UNIQUE, parent TEXT, comment TEXT, subreddit TEXT, unix INT, score INT)")
17 |
18 | def format_data(data):
19 |     data = data.replace('\n', ' newlinechar ').replace('\r', ' newlinechar ').replace('"', "'")
20 |     return data
21 |
22 | def transaction_bldr(sql):
23 |     global sql_transaction
24 |     sql_transaction.append(sql)
25 |     if len(sql_transaction) > 1000:
26 |         c.execute('BEGIN TRANSACTION')
27 |         for s in sql_transaction:
28 |             try:
29 |                 c.execute(s)
30 |             except:
31 |                 pass
32 |         connection.commit()
33 |         sql_transaction = []
34 |
35 | def sql_insert_replace_comment(commentid, parentid, parent, comment, subreddit, time, score):
36 |     try:
37 |         sql = """UPDATE parent_reply SET parent_id = "{}", comment_id = "{}", parent = "{}", comment = "{}", subreddit = "{}", unix = {}, score = {} WHERE parent_id = "{}";""".format(parentid, commentid, parent, comment, subreddit, int(time), score, parentid)
38 |         transaction_bldr(sql)
39 |     except Exception as e:
40 |         print('s0 insertion', str(e))
41 |
42 | def sql_insert_has_parent(commentid, parentid, parent, comment, subreddit, time, score):
43 |     try:
44 |         sql = """INSERT INTO parent_reply (parent_id, comment_id, parent, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}","{}",{},{});""".format(parentid, commentid, parent, comment, subreddit, int(time), score)
45 |         transaction_bldr(sql)
46 |     except Exception as e:
47 |         print('s0 insertion', str(e))
48 |
49 | def sql_insert_no_parent(commentid, parentid, comment, subreddit, time, score):
50 |     try:
51 |         sql = """INSERT INTO parent_reply (parent_id, comment_id, comment, subreddit, unix, score) VALUES ("{}","{}","{}","{}",{},{});""".format(parentid, commentid, comment, subreddit, int(time), score)
52 |         transaction_bldr(sql)
53 |     except Exception as e:
54 |         print('s0 insertion', str(e))
55 |
56 | def acceptable(data):
57 |     if len(data.split(' ')) > 1000 or len(data) < 1:
58 |         return False
59 |     elif len(data) > 32000:
60 |         return False
61 |     elif data == '[deleted]':
62 |         return False
63 |     elif data == '[removed]':
64 |         return False
65 |     else:
66 |         return True
67 |
68 | def find_parent(pid):
69 |     try:
70 |         sql = "SELECT comment FROM parent_reply WHERE comment_id = '{}' LIMIT 1".format(pid)
71 |         c.execute(sql)
72 |         result = c.fetchone()
73 |         if result != None:
74 |             return result[0]
75 |         else: return False
76 |     except Exception as e:
77 |         #print(str(e))
78 |         return False
79 |
80 | def find_existing_score(pid):
81 |     try:
82 |         sql = "SELECT score FROM parent_reply WHERE parent_id = '{}' LIMIT 1".format(pid)
83 |         c.execute(sql)
84 |         result = c.fetchone()
85 |         if result != None:
86 |             return result[0]
87 |         else: return False
88 |     except Exception as e:
89 |         #print(str(e))
90 |         return False
91 |
92 | if __name__ == '__main__':
93 |     create_table()
94 |     row_counter = 0
95 |     paired_rows = 0
96 |
97 |     #with open('J:/chatdata/reddit_data/{}/RC_{}'.format(timeframe.split('-')[0],timeframe), buffering=1000) as f:
98 |     with open('/home/ryan/stark/TS3000_TheChatBOT/Data Preprocessing/RC_{}'.format(timeframe), buffering=1000) as f:
99 |         for row in f:
100 |             #print(row)
101 |             #time.sleep(555)
102 |             row_counter += 1
103 |
104 |             if row_counter > start_row:
105 |                 try:
106 |                     row = json.loads(row)
107 |                     parent_id = row['parent_id'].split('_')[1]
108 |                     body = format_data(row['body'])
109 |                     created_utc = row['created_utc']
110 |                     score = row['score']
111 |
112 |                     comment_id = row['id']
113 |
114 |                     subreddit = row['subreddit']
115 |                     parent_data = find_parent(parent_id)
116 |
117 |                     existing_comment_score = find_existing_score(parent_id)
118 |                     if existing_comment_score:
119 |                         if score > existing_comment_score:
120 |                             if acceptable(body):
121 |                                 sql_insert_replace_comment(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
122 |
123 |                     else:
124 |                         if acceptable(body):
125 |                             if parent_data:
126 |                                 if score >= 2:
127 |                                     sql_insert_has_parent(comment_id, parent_id, parent_data, body, subreddit, created_utc, score)
128 |                                     paired_rows += 1
129 |                             else:
130 |                                 sql_insert_no_parent(comment_id, parent_id, body, subreddit, created_utc, score)
131 |                 except Exception as e:
132 |                     print(str(e))
133 |
134 |             if row_counter % 100000 == 0:
135 |                 print('Total Rows Read: {}, Paired Rows: {}, Time: {}'.format(row_counter, paired_rows, str(datetime.now())))
136 |
137 |             if row_counter > start_row:
138 |                 if row_counter % cleanup == 0:
139 |                     print("Cleaning up!")
140 |                     sql = "DELETE FROM parent_reply WHERE parent IS NULL"
141 |                     c.execute(sql)
142 |                     connection.commit()
143 |                     c.execute("VACUUM")
144 |                     connection.commit()
145 |
--------------------------------------------------------------------------------
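The helper functions above splice values straight into SQL with str.format, which breaks on stray quote characters (format_data only half-protects against this), and the bare `except: pass` in transaction_bldr then swallows any failure silently. A hedged alternative sketch using sqlite3's parameter binding -- note this changes transaction_bldr's contract, which would need to queue (sql, params) tuples instead of plain strings:

# Sketch only: a parameterized variant of sql_insert_has_parent
def sql_insert_has_parent_safe(c, commentid, parentid, parent, comment, subreddit, time, score):
    # '?' placeholders let sqlite3 quote/escape the values itself
    c.execute(
        "INSERT INTO parent_reply "
        "(parent_id, comment_id, parent, comment, subreddit, unix, score) "
        "VALUES (?, ?, ?, ?, ?, ?, ?)",
        (parentid, commentid, parent, comment, subreddit, int(time), score),
    )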
/Data Preprocessing/CreateVocabularyNPair.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | from __future__ import unicode_literals
5 |
6 | import torch
7 | from torch.jit import script, trace
8 | import torch.nn as nn
9 | from torch import optim
10 | import torch.nn.functional as F
11 | import csv
12 | import random
13 | import re
14 | import os
15 | import unicodedata
16 | import codecs
17 | from io import open
18 | import itertools
19 | import math
20 | import pickle
21 |
22 | ################ defining call variables ##################
23 | corpus_name = "redit_corpus"
24 | corpus = os.path.join("data", corpus_name)
25 | datafile = os.path.join(corpus, "test1.txt")
26 | save_dir = os.path.join("data", "save")
27 | ##########################################################
28 |
29 | # Default word tokens
30 | PAD_token = 0  # Used for padding short sentences
31 | SOS_token = 1  # Start-of-sentence token
32 | EOS_token = 2  # End-of-sentence token
33 |
34 | class Voc:
35 |     def __init__(self, name):
36 |         self.name = name
37 |         self.trimmed = False
38 |         self.word2index = {}
39 |         self.word2count = {}
40 |         self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
41 |         self.num_words = 3  # Count SOS, EOS, PAD
42 |
43 |     def addSentence(self, sentence):
44 |         for word in sentence.split(' '):
45 |             self.addWord(word)
46 |
47 |     def addWord(self, word):
48 |         if word not in self.word2index:
49 |             self.word2index[word] = self.num_words
50 |             self.word2count[word] = 1
51 |             self.index2word[self.num_words] = word
52 |             self.num_words += 1
53 |         else:
54 |             self.word2count[word] += 1
55 |
56 |     # Remove words below a certain count threshold
57 |     def trim(self, min_count):
58 |         if self.trimmed:
59 |             return
60 |         self.trimmed = True
61 |
62 |         keep_words = []
63 |
64 |         for k, v in self.word2count.items():
65 |             if v >= min_count:
66 |                 keep_words.append(k)
67 |
68 |         print('keep_words {} / {} = {:.4f}'.format(
69 |             len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
70 |         ))
71 |
72 |         # Reinitialize dictionaries
73 |         self.word2index = {}
74 |         self.word2count = {}
75 |         self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
76 |         self.num_words = 3  # Count default tokens
77 |
78 |         for word in keep_words:
79 |             self.addWord(word)
80 |
81 | MAX_LENGTH = 15  # Maximum sentence length to consider
82 |
83 | def unicodeToAscii(s):
84 |     return ''.join(
85 |         c for c in unicodedata.normalize('NFD', s)
86 |         if unicodedata.category(c) != 'Mn'
87 |     )
88 |
89 | # Lowercase, trim, and remove non-letter characters
90 | def normalizeString(s):
91 |     s = unicodeToAscii(s.lower().strip())
92 |     s = re.sub(r"([.!?])", r" \1", s)
93 |     s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
94 |     s = re.sub(r"\s+", r" ", s).strip()
95 |     return s
96 |
97 | # Read query/response pairs and return a voc object
98 | def readVocs(datafile, corpus_name):
99 |     print("Reading lines...")
100 |     # Read the file and split into lines (NOTE: the path below is hardcoded; the datafile argument is ignored)
101 |     print(datafile)
102 |     lines = open('data/redit_corpus/test1.txt', encoding='utf-8').\
103 |         read().strip().split('\n')
104 |     # Split every line into pairs and normalize
105 |     pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
106 |     voc = Voc(corpus_name)
107 |     return voc, pairs
108 |
109 | # Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
110 | def filterPair(p):
111 |     # Input sequences need to preserve the last word for EOS token
112 |     return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH
113 |
114 | # Filter pairs using filterPair condition
115 | def filterPairs(pairs):
116 |     return [pair for pair in pairs if filterPair(pair)]
117 |
118 |
119 |
120 | MIN_COUNT = 2  # Minimum word count threshold for trimming
121 |
122 | def trimRareWords(voc, pairs, MIN_COUNT):
123 |     # Trim words used under the MIN_COUNT from the voc
124 |     voc.trim(MIN_COUNT)
125 |     # Filter out pairs with trimmed words
126 |     keep_pairs = []
127 |     for pair in pairs:
128 |         input_sentence = pair[0]
129 |         output_sentence = pair[1]
130 |         keep_input = True
131 |         keep_output = True
132 |         # Check input sentence
133 |         for word in input_sentence.split(' '):
134 |             if word not in voc.word2index:
135 |                 keep_input = False
136 |                 break
137 |         # Check output sentence
138 |         for word in output_sentence.split(' '):
139 |             if word not in voc.word2index:
140 |                 keep_output = False
141 |                 break
142 |
143 |         # Only keep pairs that do not contain trimmed word(s) in their input or output sentence
144 |         if keep_input and keep_output:
145 |             keep_pairs.append(pair)
146 |
147 |     print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))
148 |     return keep_pairs
149 |
150 |
151 |
152 | def prepareData(corpus, corpus_name):
153 |     voc, pairs = readVocs(corpus, corpus_name)
154 |     print("Read {!s} sentence pairs".format(len(pairs)))
155 |     pairs = filterPairs(pairs)
156 |     print("Trimmed to {!s} sentence pairs".format(len(pairs)))
157 |     print("Counting words...")
158 |     for pair in pairs:
159 |         voc.addSentence(pair[0])
160 |         voc.addSentence(pair[1])
print("Counted words:", voc.num_words) 162 | 163 | print("further trimming less used words \n ") 164 | pairs = trimRareWords(voc, pairs, MIN_COUNT) 165 | 166 | 167 | print("saving voc now \n ") 168 | 169 | directory = '/home/ryan/stark/chatbot/data/save' 170 | if not os.path.exists(directory): 171 | os.makedirs(directory) 172 | torch.save(voc, os.path.join(directory, '{!s}.tar'.format('voc'))) 173 | torch.save(pairs, os.path.join(directory, '{!s}.tar'.format('pairs'))) 174 | 175 | 176 | return voc, pairs 177 | 178 | 179 | ############################################################################################# 180 | def loadPrepareData(): 181 | #corpus_name = corpus.split('/')[-1].split('.')[0] 182 | try: 183 | print("Start loading training data ...") 184 | #voc = torch.load(os.path.join('/home/ryan/stark/chatbot/data/save/voc.tar')) 185 | #pairs = torch.load(os.path.join('/home/ryan/stark/chatbot/data/save/pairs.tar')) 186 | voc = torch.load(os.path.join('/home/ryan/stark/chatbot/data/save/voc.tar')) 187 | pairs = torch.load(os.path.join('/home/ryan/stark/chatbot/data/save/pairs.tar')) 188 | 189 | except FileNotFoundError: 190 | print("Saved data not found, start preparing trianing data ...") 191 | voc, pairs = prepareData(corpus, corpus_name) 192 | return voc, pairs 193 | 194 | ###################################################################### 195 | # Load/Assemble voc and pairs 196 | 197 | voc, pairs = loadPrepareData() 198 | 199 | 200 | # Print some pairs to validate 201 | print("\npairs:") 202 | for pair in pairs[:10]: 203 | print(pair) 204 | 205 | #################################trimminggdta################## 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import optim 5 | import torch.backends.cudnn as cudnn 6 | 7 | import itertools 8 | import random 9 | import math 10 | import os 11 | from tqdm import tqdm 12 | from load import loadPrepareData 13 | from load import SOS_token, EOS_token, PAD_token 14 | from model import EncoderRNN, LuongAttnDecoderRNN 15 | from config import MAX_LENGTH, teacher_forcing_ratio, save_dir 16 | 17 | USE_CUDA = torch.cuda.is_available() 18 | device = torch.device("cuda" if USE_CUDA else "cpu") 19 | 20 | cudnn.benchmark = True 21 | ############################################# 22 | # generate file name for saving parameters 23 | ############################################# 24 | def filename(reverse, obj): 25 | filename = '' 26 | if reverse: 27 | filename += 'reverse_' 28 | filename += obj 29 | return filename 30 | 31 | 32 | ############################################# 33 | # Prepare Training Data 34 | ############################################# 35 | def indexesFromSentence(voc, sentence): 36 | return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token] 37 | 38 | # batch_first: true -> false, i.e. 
/train.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch import optim
5 | import torch.backends.cudnn as cudnn
6 |
7 | import itertools
8 | import random
9 | import math
10 | import os
11 | from tqdm import tqdm
12 | from load import loadPrepareData
13 | from load import SOS_token, EOS_token, PAD_token
14 | from model import EncoderRNN, LuongAttnDecoderRNN
15 | from config import MAX_LENGTH, teacher_forcing_ratio, save_dir
16 |
17 | USE_CUDA = torch.cuda.is_available()
18 | device = torch.device("cuda" if USE_CUDA else "cpu")
19 |
20 | cudnn.benchmark = True
21 | #############################################
22 | # generate file name for saving parameters
23 | #############################################
24 | def filename(reverse, obj):
25 |     filename = ''
26 |     if reverse:
27 |         filename += 'reverse_'
28 |     filename += obj
29 |     return filename
30 |
31 |
32 | #############################################
33 | # Prepare Training Data
34 | #############################################
35 | def indexesFromSentence(voc, sentence):
36 |     return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]
37 |
38 | # batch_first: true -> false, i.e. shape: seq_len * batch
39 | def zeroPadding(l, fillvalue=PAD_token):
40 |     return list(itertools.zip_longest(*l, fillvalue=fillvalue))
41 |
42 | def binaryMatrix(l, value=PAD_token):
43 |     m = []
44 |     for i, seq in enumerate(l):
45 |         m.append([])
46 |         for token in seq:
47 |             if token == PAD_token:
48 |                 m[i].append(0)
49 |             else:
50 |                 m[i].append(1)
51 |     return m
52 |
53 | # convert to index, add EOS
54 | # return input pack_padded_sequence
55 | def inputVar(l, voc):
56 |     indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
57 |     lengths = [len(indexes) for indexes in indexes_batch]
58 |     padList = zeroPadding(indexes_batch)
59 |     padVar = torch.LongTensor(padList)
60 |     return padVar, lengths
61 |
62 | # convert to index, add EOS, zero padding
63 | # return output variable, mask, max length of the sentences in batch
64 | def outputVar(l, voc):
65 |     indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
66 |     max_target_len = max([len(indexes) for indexes in indexes_batch])
67 |     padList = zeroPadding(indexes_batch)
68 |     mask = binaryMatrix(padList)
69 |     mask = torch.ByteTensor(mask)
70 |     padVar = torch.LongTensor(padList)
71 |     return padVar, mask, max_target_len
72 |
73 | # pair_batch is a list of (input, output) with length batch_size
74 | # sort list of (input, output) pairs by input length, reverse input
75 | # return input, lengths for pack_padded_sequence, output_variable, mask
76 | def batch2TrainData(voc, pair_batch, reverse):
77 |     if reverse:
78 |         pair_batch = [pair[::-1] for pair in pair_batch]
79 |     pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
80 |     input_batch, output_batch = [], []
81 |     for pair in pair_batch:
82 |         input_batch.append(pair[0])
83 |         output_batch.append(pair[1])
84 |     inp, lengths = inputVar(input_batch, voc)
85 |     output, mask, max_target_len = outputVar(output_batch, voc)
86 |     return inp, lengths, output, mask, max_target_len
87 |
88 | #############################################
89 | # Training
90 | #############################################
91 |
92 | def train(input_variable, lengths, target_variable, mask, max_target_len, encoder, decoder, embedding,
93 |           encoder_optimizer, decoder_optimizer, batch_size, max_length=MAX_LENGTH):
94 |
95 |     encoder_optimizer.zero_grad()
96 |     decoder_optimizer.zero_grad()
97 |
98 |     input_variable = input_variable.to(device)
99 |     target_variable = target_variable.to(device)
100 |     mask = mask.to(device)
101 |
102 |     loss = 0
103 |     print_losses = []
104 |     n_totals = 0
105 |
106 |     encoder_outputs, encoder_hidden = encoder(input_variable, lengths, None)
107 |
108 |     decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
109 |     decoder_input = decoder_input.to(device)
110 |
111 |     decoder_hidden = encoder_hidden[:decoder.n_layers]
112 |
113 |     use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
114 |
115 |     # Run through decoder one time step at a time
116 |     if use_teacher_forcing:
117 |         for t in range(max_target_len):
118 |             decoder_output, decoder_hidden, _ = decoder(
119 |                 decoder_input, decoder_hidden, encoder_outputs
120 |             )
121 |             decoder_input = target_variable[t].view(1, -1)  # Next input is current target
122 |             loss += F.cross_entropy(decoder_output, target_variable[t], ignore_index=PAD_token)  # skip padded positions
123 |     else:
124 |         for t in range(max_target_len):
125 |             decoder_output, decoder_hidden, decoder_attn = decoder(
126 |                 decoder_input, decoder_hidden, encoder_outputs
127 |             )
128 |             _, topi = decoder_output.topk(1)  # [64, 1]
129 |
130 |             decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
131 |             decoder_input = decoder_input.to(device)
132 |             loss += F.cross_entropy(decoder_output, target_variable[t], ignore_index=PAD_token)  # skip padded positions
133 |
134 |     loss.backward()
135 |
136 |     clip = 50.0
137 |     _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
138 |     _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)
139 |
140 |     encoder_optimizer.step()
141 |     decoder_optimizer.step()
142 |
143 |     return loss.item() / max_target_len
144 |
145 |
146 | def trainIters(corpus, reverse, n_iteration, learning_rate, batch_size, n_layers, hidden_size,
147 |                print_every, save_every, dropout, loadFilename=None, attn_model='dot', decoder_learning_ratio=5.0):
148 |
149 |     voc, pairs = loadPrepareData(corpus)
150 |
151 |     # training data
152 |     corpus_name = os.path.split(corpus)[-1].split('.')[0]
153 |     training_batches = None
154 |     try:
155 |         training_batches = torch.load(os.path.join(save_dir, 'training_data', corpus_name,
156 |                                                    '{}_{}_{}.tar'.format(n_iteration, \
157 |                                                    filename(reverse, 'training_batches'), \
158 |                                                    batch_size)))
159 |     except FileNotFoundError:
160 |         print('Training pairs not found, generating ...')
161 |         training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)], reverse)
162 |                             for _ in range(n_iteration)]
163 |         torch.save(training_batches, os.path.join(save_dir, 'training_data', corpus_name,
164 |                                                   '{}_{}_{}.tar'.format(n_iteration, \
165 |                                                   filename(reverse, 'training_batches'), \
166 |                                                   batch_size)))
167 |     # model
168 |     checkpoint = None
169 |     print('Building encoder and decoder ...')
170 |     embedding = nn.Embedding(voc.n_words, hidden_size)
171 |     encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers, dropout)
172 |     attn_model = 'dot'
173 |     decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.n_words, n_layers, dropout)
174 |     if loadFilename:
175 |         checkpoint = torch.load(loadFilename)
176 |         encoder.load_state_dict(checkpoint['en'])
177 |         decoder.load_state_dict(checkpoint['de'])
178 |     # use cuda
179 |     encoder = encoder.to(device)
180 |     decoder = decoder.to(device)
181 |
182 |     # optimizer
183 |     print('Building optimizers ...')
184 |     encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
185 |     decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
186 |     if loadFilename:
187 |         encoder_optimizer.load_state_dict(checkpoint['en_opt'])
188 |         decoder_optimizer.load_state_dict(checkpoint['de_opt'])
189 |
190 |     # initialize
191 |     print('Initializing ...')
192 |     start_iteration = 1
193 |     perplexity = []
194 |     print_loss = 0
195 |     if loadFilename:
196 |         start_iteration = checkpoint['iteration'] + 1
197 |         perplexity = checkpoint['plt']
198 |
199 |     for iteration in tqdm(range(start_iteration, n_iteration + 1)):
200 |         training_batch = training_batches[iteration - 1]
201 |         input_variable, lengths, target_variable, mask, max_target_len = training_batch
202 |
203 |         loss = train(input_variable, lengths, target_variable, mask, max_target_len, encoder,
204 |                      decoder, embedding, encoder_optimizer, decoder_optimizer, batch_size)
205 |         print_loss += loss
206 |         perplexity.append(loss)
207 |
208 |         if iteration % print_every == 0:
209 |             print_loss_avg = math.exp(print_loss / print_every)  # exp(mean loss) = perplexity
210 |             print('%d %d%% %.4f' % (iteration, iteration / n_iteration * 100, print_loss_avg))
211 |             print_loss = 0
212 |
213 |         if (iteration % save_every == 0):
215 |             if not os.path.exists(directory):
216 |                 os.makedirs(directory)
217 |             torch.save({
218 |                 'iteration': iteration,
219 |                 'en': encoder.state_dict(),
220 |                 'de': decoder.state_dict(),
221 |                 'en_opt': encoder_optimizer.state_dict(),
222 |                 'de_opt': decoder_optimizer.state_dict(),
223 |                 'loss': loss,
224 |                 'plt': perplexity
225 |             }, os.path.join(directory, '{}_{}.tar'.format(iteration, filename(reverse, 'backup_bidir_model'))))
226 | 
--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import random
3 | from train import indexesFromSentence
4 | from load import SOS_token, EOS_token
5 | from load import MAX_LENGTH, loadPrepareData, Voc
6 | from model import *
7 | from pairlist import fixedpair
8 | import emoji
9 | from audio import *
10 | 
11 | USE_CUDA = torch.cuda.is_available()
12 | device = torch.device("cuda" if USE_CUDA else "cpu")
13 | 
14 | class Sentence:
15 |     def __init__(self, decoder_hidden, last_idx=SOS_token, sentence_idxes=[], sentence_scores=[]):
16 |         if len(sentence_idxes) != len(sentence_scores):
17 |             raise ValueError("length of indexes and scores should be the same")
18 |         self.decoder_hidden = decoder_hidden
19 |         self.last_idx = last_idx
20 |         self.sentence_idxes = sentence_idxes
21 |         self.sentence_scores = sentence_scores
22 | 
23 |     def avgScore(self):
24 |         if len(self.sentence_scores) == 0:
25 |             raise ValueError("Cannot average the score of a sentence with no words")
26 |         # return mean of sentence_scores
27 |         return sum(self.sentence_scores) / len(self.sentence_scores)
28 | 
29 |     def addTopk(self, topi, topv, decoder_hidden, beam_size, voc):
30 |         topv = torch.log(topv)
31 |         terminates, sentences = [], []
32 |         for i in range(beam_size):
33 |             if topi[0][i] == EOS_token:
34 |                 terminates.append(([voc.index2word[idx.item()] for idx in self.sentence_idxes] + ['<EOS>'],
35 |                                    self.avgScore()))  # tuple(word_list, score_float)
36 |                 continue
37 |             idxes = self.sentence_idxes[:]  # copy, so each beam keeps its own history
38 |             scores = self.sentence_scores[:]  # copy, so each beam keeps its own history
39 |             idxes.append(topi[0][i])
40 |             scores.append(topv[0][i])
41 |             sentences.append(Sentence(decoder_hidden, topi[0][i], idxes, scores))
42 |         return terminates, sentences
43 | 
44 |     def toWordScore(self, voc):
45 |         words = []
46 |         for i in range(len(self.sentence_idxes)):
47 |             if self.sentence_idxes[i] == EOS_token:
48 |                 words.append('<EOS>')
49 |             else:
50 |                 words.append(voc.index2word[self.sentence_idxes[i].item()])
51 |         if self.sentence_idxes[-1] != EOS_token:
52 |             words.append('<EOS>')
53 |         return (words, self.avgScore())
54 | 
55 | def beam_decode(decoder, decoder_hidden, encoder_outputs, voc, beam_size, max_length=MAX_LENGTH):
56 |     terminal_sentences, prev_top_sentences, next_top_sentences = [], [], []
57 |     prev_top_sentences.append(Sentence(decoder_hidden))
58 |     for i in range(max_length):
59 |         for sentence in prev_top_sentences:
60 |             decoder_input = torch.LongTensor([[sentence.last_idx]])
61 |             decoder_input = decoder_input.to(device)
62 | 
63 |             decoder_hidden = sentence.decoder_hidden
64 |             decoder_output, decoder_hidden, _ = decoder(
65 |                 decoder_input, decoder_hidden, encoder_outputs
66 |             )
67 |             topv, topi = decoder_output.topk(beam_size)
68 |             term, top = sentence.addTopk(topi, topv, decoder_hidden, beam_size, voc)
69 |             terminal_sentences.extend(term)
70 |             next_top_sentences.extend(top)
71 | 
72 |         next_top_sentences.sort(key=lambda s: s.avgScore(), reverse=True)
73 |         prev_top_sentences = next_top_sentences[:beam_size]
74 |         next_top_sentences = []
75 | 
76 |     terminal_sentences += [sentence.toWordScore(voc) for sentence in prev_top_sentences]
77 |     terminal_sentences.sort(key=lambda x: x[1], reverse=True)
78 | 
79 |     n = min(len(terminal_sentences), 15)
80 |     return terminal_sentences[:n]
81 | 
82 | def decode(decoder, decoder_hidden, encoder_outputs, voc, max_length=MAX_LENGTH):
83 | 
84 |     decoder_input = torch.LongTensor([[SOS_token]])
85 |     decoder_input = decoder_input.to(device)
86 | 
87 |     decoded_words = []
88 |     decoder_attentions = torch.zeros(max_length, max_length) #TODO: or (MAX_LEN+1, MAX_LEN+1)
89 | 
90 |     for di in range(max_length):
91 |         decoder_output, decoder_hidden, decoder_attn = decoder(
92 |             decoder_input, decoder_hidden, encoder_outputs
93 |         )
94 |         _, topi = decoder_output.topk(1)  # greedy decoding: only the single best token is used
95 |         ni = topi[0][0]
96 |         if ni == EOS_token:
97 |             decoded_words.append('<EOS>')
98 |             break
99 |         else:
100 |             decoded_words.append(voc.index2word[ni.item()])
101 | 
102 |         decoder_input = torch.LongTensor([[ni]])
103 |         decoder_input = decoder_input.to(device)
104 | 
105 |     return decoded_words, decoder_attentions[:di + 1]
106 | 
107 | 
108 | def evaluate(encoder, decoder, voc, sentence, beam_size, max_length=MAX_LENGTH):
109 |     indexes_batch = [indexesFromSentence(voc, sentence)]  # [1, seq_len]
110 |     lengths = [len(indexes) for indexes in indexes_batch]
111 |     input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
112 |     input_batch = input_batch.to(device)
113 | 
114 |     encoder_outputs, encoder_hidden = encoder(input_batch, lengths, None)
115 | 
116 |     decoder_hidden = encoder_hidden[:decoder.n_layers]
117 | 
118 |     if beam_size == 1:
119 |         return decode(decoder, decoder_hidden, encoder_outputs, voc)
120 |     else:
121 |         return beam_decode(decoder, decoder_hidden, encoder_outputs, voc, beam_size)
122 | 
123 | 
124 | def evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, n=10):
125 |     for _ in range(n):
126 |         pair = random.choice(pairs)
127 |         print("=============================================================")
128 |         if reverse:
129 |             print('>', " ".join(reversed(pair[0].split())))
130 |         else:
131 |             print('>', pair[0])
132 |         if beam_size == 1:
133 |             output_words, _ = evaluate(encoder, decoder, voc, pair[0], beam_size)
134 |             output_sentence = ' '.join(output_words)
135 |             print('<', output_sentence)
136 |         else:
137 |             output_words_list = evaluate(encoder, decoder, voc, pair[0], beam_size)
138 |             for output_words, score in output_words_list:
139 |                 output_sentence = ' '.join(output_words)
140 |                 print("{:.3f} < {}".format(score, output_sentence))
141 | 
142 | def evaluateInput(encoder, decoder, voc, beam_size, audio):
143 |     if audio:
144 |         #print ("audio should be true and is " + str(audio))
145 |         pair = ''
146 |         while True:
147 |             try:
148 |                 pair = takeCommand().lower()
149 |                 print('>' + pair)
150 |                 if pair == 'quit': break
151 |                 # elif pair == fixedpair("who?"):
152 | 
153 |                 #     output_sentence = fixedpair(pair)
154 | 
155 |                 elif pair == 'who?':
156 |                     output_sentence = 'I am TS3000.'
157 | 
158 |                 elif beam_size == 1:
159 |                     output_words, _ = evaluate(encoder, decoder, voc, pair, beam_size)
160 |                     output_sentence = ' '.join(output_words)
161 | 
162 |                 print('<', output_sentence)
163 |                 speak(output_sentence)
164 |             except KeyError:
165 |                 #"+ emoji.emojize(':thumbs_up:')"
166 |                 print("I don't know what to say. I was not trained for this, perhaps " + emoji.emojize(':thumbs_up:'))
167 |                 speak("I don't know what to say. I was not trained for this, perhaps")
168 |     else:
169 |         #print ("audio should be true and is " + str(audio))
170 |         pair = ''
171 |         while True:
172 |             try:
173 | 
174 |                 pair = input('> ')
175 |                 print('>' + pair)
176 |                 if pair == 'q': break
177 |                 # elif pair == fixedpair("who?"):
178 | 
179 |                 #     output_sentence = fixedpair(pair)
180 | 
181 |                 elif pair == 'who?':
182 |                     output_sentence = 'I am TS3000.'
183 | 
184 |                 elif beam_size == 1:
185 |                     output_words, _ = evaluate(encoder, decoder, voc, pair, beam_size)
186 |                     output_sentence = ' '.join(output_words)
187 | 
188 |                 print('<', output_sentence)
189 | 
190 | 
191 |                 """else:
192 |                     output_words_list = evaluate(encoder, decoder, voc, pair, beam_size)
193 |                     for output_words, score in output_words_list:
194 |                         output_sentence = ' '.join(output_words)
195 |                         print("{:.3f} < {}".format(score, output_sentence))
196 |                 except KeyError:
197 |                     print("Incorrect spelling.") """
198 |             except KeyError:
199 | 
200 |                 print("I don't know what to say. I was not trained for this, perhaps " + emoji.emojize(':thumbs_up:'))
201 | 
202 | 
203 | def runTest(n_layers, hidden_size, reverse, modelFile, beam_size, inp, corpus, audio):
204 |     torch.set_grad_enabled(False)
205 | 
206 |     voc, pairs = loadPrepareData(corpus)
207 |     embedding = nn.Embedding(voc.n_words, hidden_size)
208 |     encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
209 |     attn_model = 'dot'
210 |     decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.n_words, n_layers)
211 | 
212 |     checkpoint = torch.load(modelFile)
213 |     encoder.load_state_dict(checkpoint['en'])
214 |     decoder.load_state_dict(checkpoint['de'])
215 | 
216 |     # train mode set to False; affects only dropout and batch norm layers
217 |     encoder.train(False)
218 |     decoder.train(False)
219 | 
220 |     encoder = encoder.to(device)
221 |     decoder = decoder.to(device)
222 | 
223 |     if inp:
224 |         # audio=True makes evaluateInput use the microphone/speech path
225 |         evaluateInput(encoder, decoder, voc, beam_size, audio)
226 |     else:
227 |         evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 20)
228 | 
--------------------------------------------------------------------------------
/Audiotesting.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyttsx3\n", 10 | "import speech_recognition as sr" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "engine = pyttsx3.init()\n", 20 | "rate = engine.getProperty('rate')\n", 21 | "voices = engine.getProperty('voices')\n", 22 | "volume = engine.getProperty('volume') \n", 23 | "\n", 24 | "engine.setProperty('rate',120) #120 words per minute\n", 25 | "#engine.setProperty('volume',0.9) # setting up volume level between 0 and 1\n", 26 | "#engine.setProperty('voice', voices[0].id) #changing index, changes voices. 
1 for female\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "engine.say('My current speaking rate is ' + str(rate))\n", 43 | "engine.runAndWait()\n", 44 | "engine.stop()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import pyttsx3\n", 54 | "engine = pyttsx3.init()\n", 55 | "engine.say(\"I will speak this text\")\n", 56 | "engine.runAndWait()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import pyttsx3 #pip install pyttsx3\n", 66 | "import speech_recognition as sr #pip install speechRecognition\n", 67 | "#import datetime\n", 68 | "#import wikipedia #pip install wikipedia\n", 69 | "#import webbrowser\n", 70 | "#import os\n", 71 | "#import smtplib\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "def speak(audio):\n", 76 | " engine.say(audio)\n", 77 | " engine.runAndWait()\n", 78 | "\n", 79 | "\n", 80 | "def takeCommand():\n", 81 | " #It takes microphone input from the user and returns string output\n", 82 | "\n", 83 | " r = sr.Recognizer()\n", 84 | " with sr.Microphone() as source:\n", 85 | " print(\"Listening...\")\n", 86 | " r.pause_threshold = 1\n", 87 | " audio = r.listen(source)\n", 88 | "\n", 89 | " try:\n", 90 | " print(\"Recognizing...\") \n", 91 | " query = r.recognize_google(audio, language='en-in')\n", 92 | " print(f\"User said: {query}\\n\")\n", 93 | "\n", 94 | " except Exception as e:\n", 95 | " # print(e) \n", 96 | " print(\"Say that again please...\") \n", 97 | " return \"None\"\n", 98 | " return query" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 7, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Listening...\n" 111 | ] 112 | }, 113 | { 114 | "ename": "KeyboardInterrupt", 115 | "evalue": "", 116 | "output_type": "error", 117 | "traceback": [ 118 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 119 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 120 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpair\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtakeCommand\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 121 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtakeCommand\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Listening...\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpause_threshold\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 23\u001b[0;31m \u001b[0maudio\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlisten\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 24\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 122 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/speech_recognition/__init__.py\u001b[0m in \u001b[0;36mlisten\u001b[0;34m(self, source, timeout, phrase_time_limit, snowboy_configuration)\u001b[0m\n\u001b[1;32m 650\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 651\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 652\u001b[0;31m \u001b[0mbuffer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msource\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msource\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mCHUNK\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 653\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;32mbreak\u001b[0m \u001b[0;31m# reached end of the stream\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 654\u001b[0m \u001b[0mframes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 123 | "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/speech_recognition/__init__.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, size)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msize\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 161\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyaudio_stream\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexception_on_overflow\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 124 | "\u001b[0;32m/usr/lib/python3/dist-packages/pyaudio.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, num_frames, exception_on_overflow)\u001b[0m\n\u001b[1;32m 606\u001b[0m paCanNotReadFromAnOutputOnlyStream)\n\u001b[1;32m 607\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 608\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mpa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_stream\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_stream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum_frames\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexception_on_overflow\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 609\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0mget_read_available\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 125 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "pair = takeCommand().lower()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 13, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "" 142 | ] 143 | }, 144 | "execution_count": 13, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "from gtts import gTTS\n", 151 | "import os\n", 152 | "gTTS(text='Good morning', lang='en')\n", 153 | "#tts.save(\"good.mp3\")\n", 154 | "#os.system(\"mpg321 good.mp3\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 14, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "" 166 | ] 167 | }, 168 | "execution_count": 14, 169 | "metadata": {}, 170 | "output_type": "execute_result" 171 | } 172 | ], 173 | "source": [ 174 | "tts" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "Python 3", 188 | "language": "python", 189 | "name": "python3" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.6.8" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 2 206 | } 207 | --------------------------------------------------------------------------------
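Usage sketch (training). As a minimal illustration of how trainIters in train.py above could be driven from Python, the snippet below is not part of the repository, and every value in it (the corpus path ./data/corpus.txt, the single layer, 512 hidden units, 10000 iterations, and the optimizer settings) is an assumption chosen for demonstration only.

# Illustrative only: assumes train.py above is importable and that
# ./data/corpus.txt is a processed text corpus (hypothetical path).
from train import trainIters

trainIters(corpus='./data/corpus.txt', reverse=False, n_iteration=10000,
           learning_rate=0.0001, batch_size=64, n_layers=1, hidden_size=512,
           print_every=100, save_every=1000, dropout=0.1)
# With these assumed values, trainIters builds checkpoints under
# ./save/model/corpus/1-1_512/, following the '{n_layers}-{n_layers}_{hidden_size}'
# directory pattern in the save block above.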
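Usage sketch (inference). Similarly, a hedged example of invoking runTest from evaluate.py: beam_size=1 selects greedy decode, larger values select beam_decode; inp=True reads prompts from stdin, and audio=True switches to the microphone/speaker loop. The checkpoint path below is hypothetical; the exact file name comes from filename(reverse, 'backup_bidir_model') in train.py, which is defined elsewhere in the project.

# Illustrative only: the model file path is an assumption, not a file this
# repository ships; train to produce a real checkpoint first.
from evaluate import runTest

runTest(n_layers=1, hidden_size=512, reverse=False,
        modelFile='./save/model/corpus/1-1_512/10000_backup_bidir_model.tar',
        beam_size=1, inp=True, corpus='./data/corpus.txt', audio=False)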