├── model
│   ├── model.tflearn.index
│   ├── model.tflearn.meta
│   ├── model.tflearn.data-00000-of-00001
│   └── checkpoint
├── checkpoint
├── requirements.txt
├── readme.md
├── data_import.py
├── allocator.py
└── neuralnet.py

/model/model.tflearn.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/department-for-transport/lab-PQNeuralNet/master/model/model.tflearn.index
--------------------------------------------------------------------------------
/model/model.tflearn.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/department-for-transport/lab-PQNeuralNet/master/model/model.tflearn.meta
--------------------------------------------------------------------------------
/model/model.tflearn.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/department-for-transport/lab-PQNeuralNet/master/model/model.tflearn.data-00000-of-00001
--------------------------------------------------------------------------------
/checkpoint:
--------------------------------------------------------------------------------
model_checkpoint_path: "/home/zach/chapter2/chapter2/application/model.tflearn"
all_model_checkpoint_paths: "/home/zach/chapter2/chapter2/application/model.tflearn"
--------------------------------------------------------------------------------
/model/checkpoint:
--------------------------------------------------------------------------------
model_checkpoint_path: "/home/zach/chapter2/chapter2/application/model/model.tflearn"
all_model_checkpoint_paths: "/home/zach/chapter2/chapter2/application/model/model.tflearn"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
bleach==1.5.0
enum34==1.1.6
html5lib==0.9999999
Markdown==2.6.11
numpy==1.14.0
protobuf==3.5.1
six==1.11.0
tensorflow==1.12.1
tensorflow-tensorboard==0.4.0rc3
Werkzeug==0.14.1
nltk
tflearn
h5py
scipy
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# PQ Allocation Neural Net #

## neuralnet.py ##

This application trains the neural net, using the TensorFlow library.

The training data is contained within the application folder, in a file named json2.txt.

json2.txt is a dictionary of lists, with each Unit in DfT as a key and, as the corresponding list, every PQ that Unit has answered in the last two years.
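
For illustration, its structure is assumed to look something like this (the unit names and questions below are invented examples, not taken from the real data):

    {
        "Roads": [
            "what estimate he has made of spending on pothole repairs in each of the last two years",
            "how many miles of motorway were resurfaced in 2017"
        ],
        "Rail": [
            "what recent discussions he has had with train operators about fares on the southern network"
        ]
    }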

The #SORT OUT DATA# section of neuralnet.py gets the data into a format TensorFlow can use, i.e. converting each question into a one-dimensional array of 0s and 1s (a bag of words).

The #DESIGN MODEL# section defines the neural net.

The #RUN MODEL# section instantiates the model, provides a location for TensorBoard logs, trains it, and saves the state of the model on completion (to the model/ directory).

The #TESTING# section runs a handful of example questions through the model immediately after training.

## allocator.py ##

This application re-instantiates the trained model, lets the user type in questions, and outputs the suggested unit for each one.
--------------------------------------------------------------------------------
/data_import.py:
--------------------------------------------------------------------------------
import pandas as pd
from textacy import preprocess_text


def import_data(filepath):
    df = pd.read_excel(filepath, header=0, sheet_name='FullExtract')

    df['year'] = df.DateOfReceipt.astype('str').str[0:4]

    # We will subset the df to only include key values
    df = df[['ItemID', 'Summary', 'ORG_UNIT_NAME', 'year']].copy()
    df = df[df.Summary.str.match(
        'To ask the Secretary of State for Transport')].copy()

    # Remove the leading sentence
    df['Summary'] = df.Summary.str.replace(
        '^To ask the Secretary of State for Transport, ', '')
    return (df)


def clean_text(df):
    '''
    Func to clean up textual data to remove items that aren't likely to be
    useful in machine learning.
    '''

    def preprocess_text_settings(string_in):
        string_in = preprocess_text(
            string_in,
            fix_unicode=True,
            lowercase=True,
            no_urls=True,
            no_emails=True,
            no_numbers=True,
            no_accents=True,
            no_punct=True)
        return (string_in)

    df['summary_cleaned'] = df['Summary'].apply(preprocess_text_settings)
    return (df)


def import_and_clean(filepath='data/Extract.xlsx'):
    df = import_data(filepath)
    df = clean_text(df)
    return (df)
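

# Example usage (illustrative only - assumes a data/Extract.xlsx whose 'FullExtract'
# sheet has ItemID, Summary, ORG_UNIT_NAME and DateOfReceipt columns):
#
#   df = import_and_clean('data/Extract.xlsx')
#   print(df[['ORG_UNIT_NAME', 'summary_cleaned']].head())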
--------------------------------------------------------------------------------
/allocator.py:
--------------------------------------------------------------------------------
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import random
import json
import string
import unicodedata
import sys
from nltk.corpus import stopwords

#punctuation remover (keys in the table map to None, so translate() just deletes the punctuation in the sentence)
def remove_punctuation(sentence, tbl):
    return sentence.translate(tbl)


def prepare_input_and_output_data(pq_data):
    """takes json data as per json2.txt, processes and returns a list of
    all the words used in each question and a list of categories"""

    #create table (a dictionary) of punctuation characters to pass to remove_punctuation
    #unicodedata.category for punctuation always starts with a 'P'
    table = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

    #list of categories (TODO: this needs to be the same as the category list created when the model is trained!)
    categories = sorted(list(pq_data.keys()))
    #list of words that holds all unique stemmed words
    words = []

    for each_category in pq_data.keys():
        for each_sentence in pq_data[each_category]:
            #remove punctuation
            each_sentence = remove_punctuation(each_sentence, table)
            #extract words from sentence and add to list 'words'
            w = nltk.word_tokenize(each_sentence)
            words.extend(w)

    #stem and lower each word
    words = [stemmer.stem(w.lower()) for w in words]
    #remove duplicates (the set does this)
    words = sorted(list(set(words)))

    return words, categories


def rebuild_neural_net(word_list, category_list, model_location):
    """Takes expected input and output data lists plus location of saved model,
    returns a complete model for use"""
    #input layer - net is expecting a shape the size of the bag of words
    net = tflearn.input_data(shape=[None, len(word_list)])
    #hidden layer 1 of size 8
    net = tflearn.fully_connected(net, 8)
    #hidden layer 2 of size 8
    net = tflearn.fully_connected(net, 8)
    #output layer
    net = tflearn.fully_connected(net, len(category_list), activation='softmax')
    net = tflearn.regression(net)

    model = tflearn.DNN(net, tensorboard_dir='../tflearn_logs')
    model.load('{}'.format(model_location))
    return model


def create_bagofwords(sentence, word_list):
    """Takes an input string and the list of all words, returns a BoW NumPy array ready for insertion into the model"""
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    BoW = [0] * len(word_list)
    for s in sentence_words:
        for i, w in enumerate(word_list):
            if w == s:
                BoW[i] = 1
    return (np.array(BoW))


if __name__ == '__main__':

    stemmer = LancasterStemmer()

    #load json training data into 'data'
    with open('json2.txt', 'r') as json_data:
        data = json.load(json_data)

    wordlist, categories = prepare_input_and_output_data(data)

    net = rebuild_neural_net(wordlist, categories, 'model/model.tflearn')

    while True:
        sent1 = input("Enter question: ")
        print("{}: {}".format(sent1, categories[np.argmax(net.predict([create_bagofwords(sent1, wordlist)]))]))
--------------------------------------------------------------------------------
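
The TODO inside prepare_input_and_output_data flags that the word and category lists rebuilt here have to match the ones the model was trained on. One way to guarantee that (a sketch only - neither script currently does this, and the vocab.json filename is just an invented example) would be for neuralnet.py to save the lists it trained with and for allocator.py to load them rather than recomputing them from json2.txt:

    # at the end of neuralnet.py, after model.save(...):
    with open('model/vocab.json', 'w') as f:
        json.dump({'words': words, 'categories': categories}, f)

    # in allocator.py, in place of prepare_input_and_output_data(data):
    with open('model/vocab.json', 'r') as f:
        vocab = json.load(f)
    wordlist, categories = vocab['words'], vocab['categories']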
/neuralnet.py:
--------------------------------------------------------------------------------
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import random
import json
import string
import unicodedata
import sys
from nltk.corpus import stopwords

#SORT OUT DATA#

#create table (a dictionary) that holds all punctuation characters for removal
#unicodedata.category for a punctuation character always starts with a 'P' (e.g. 'Pc' = punctuation, connector)
tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

#punctuation remover (because keys in tbl map to None, translate() just deletes the punctuation)
def remove_punctuation(sentence):
    return sentence.translate(tbl)


#initialise LancasterStemmer (stems words quite harshly)
stemmer = LancasterStemmer()

#initialise pointer to data file
data = None

#load json training data into 'data'
with open('json2.txt', 'r') as json_data:
    data = json.load(json_data)


#get categories from json (ie business units)
categories = sorted(list(data.keys()))
print(categories)

#set of stop words:
stopWords = set(stopwords.words('english'))

#list of words that holds all unique stemmed words
words = []

#list of tuples of words and units
docs = []

for each_category in data.keys():
    for each_sentence in data[each_category]:
        #remove punctuation
        each_sentence = remove_punctuation(each_sentence)
        #strip the standard lead-in phrases
        each_sentence = each_sentence.replace('To ask the Secretary of State for Transport ', '')
        each_sentence = each_sentence.replace('To ask To ask Her Majesty\'s Government ', '')
        #extract words from sentence and add to list 'words'
        w = nltk.word_tokenize(each_sentence)
        words.extend(w)

        #for word in tokenized_words:
            #if word not in stopWords:
                #words.append(word)
                #w.append(word)

        #add list of words from sentence and category as a tuple to docs
        docs.append((w, each_category))

#stem and lower each word
words = [stemmer.stem(w.lower()) for w in words]
#remove duplicates (the set does this)
words = sorted(list(set(words)))

#list to hold training data
training = []
# create an array of 0s for our output
output_empty = [0] * len(categories)


for doc in docs:
    # initialize our bag of words (bow) for each document in the list
    bow = []
    # list of tokenized words for the pattern
    token_words = doc[0]
    # stem each word
    token_words = [stemmer.stem(word.lower()) for word in token_words]
    # create our bag of words array
    for w in words:
        bow.append(1) if w in token_words else bow.append(0)

    output_row = list(output_empty)
    output_row[categories.index(doc[1])] = 1

    # our training set contains the bag of words and the output row that tells us which category that bow belongs to
    training.append([bow, output_row])
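
# As a purely hypothetical illustration of the encoding above: if words were
# ['acc', 'cycl', 'rail'] and a question's stemmed tokens were ['cycl'], its
# bow would be [0, 1, 0]; if that question belonged to the second of three
# categories, its output_row would also be [0, 1, 0].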

# shuffle our features and turn into np.array as tensorflow takes in numpy arrays
random.shuffle(training)
training = np.array(training)

# train_x contains the bag of words and train_y contains the label/category
train_x = list(training[:, 0])
train_y = list(training[:, 1])

#DESIGN MODEL#

#reset underlying graph data
tf.reset_default_graph()
#Build neural network
#input layer - size is the same as the input matrix (ie length of the BoW)
net = tflearn.input_data(shape=[None, len(train_x[0])])
#Hidden layer1 of size 8
net = tflearn.fully_connected(net, 8)
#Hidden layer2 of size 8
net = tflearn.fully_connected(net, 8)
#output layer - size is dictated by the number of units we're allocating cases to
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
#regression layer - defines the loss (categorical cross-entropy) and optimiser (Adam by default) used in training
net = tflearn.regression(net)

#RUN MODEL#

#define model and set up tensorboard
model = tflearn.DNN(net, tensorboard_dir='../tflearn_logs')
#start training
model.fit(train_x, train_y, n_epoch=100, batch_size=10, show_metric=True)
#save model once training is complete
model.save('model/model.tflearn')

#TESTING#

#testing questions
sent1 = "How much money is spent on cycling"
sent2 = "How many accidents were there last year"
sent3 = "operation stack"
sent4 = "vnuk european court of justice"
sent5 = "exiting the european union"
sent6 = "rail fares southern rail"


#convert testing questions into a bag of words numpy array
def get_tf_record(sentence):
    global words
    sentence_words = nltk.word_tokenize(sentence)
    #sentence_words = []
    #for word in tokenized_words:
        #if word not in stopWords:
            #sentence_words.append(word)

    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]

    BoW = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                BoW[i] = 1
    return (np.array(BoW))


#print results of testing
print(categories[np.argmax(model.predict([get_tf_record(sent1)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent2)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent3)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent4)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent5)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent6)]))])
--------------------------------------------------------------------------------
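
The #TESTING# block above only eyeballs a few hand-written questions. If a rough accuracy figure were wanted, one option (a sketch, relying on tflearn's validation_set argument, which accepts a fraction of the training data to hold out) would be to change the fit call to something like:

    model.fit(train_x, train_y, n_epoch=100, batch_size=10, validation_set=0.1, show_metric=True)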