├── model
│   ├── model.tflearn.index
│   ├── model.tflearn.meta
│   ├── model.tflearn.data-00000-of-00001
│   └── checkpoint
├── checkpoint
├── requirements.txt
├── readme.md
├── data_import.py
├── allocator.py
└── neuralnet.py

/model/model.tflearn.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/department-for-transport/lab-PQNeuralNet/master/model/model.tflearn.index
--------------------------------------------------------------------------------
/model/model.tflearn.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/department-for-transport/lab-PQNeuralNet/master/model/model.tflearn.meta
--------------------------------------------------------------------------------
/model/model.tflearn.data-00000-of-00001:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/department-for-transport/lab-PQNeuralNet/master/model/model.tflearn.data-00000-of-00001
--------------------------------------------------------------------------------
/checkpoint:
--------------------------------------------------------------------------------
model_checkpoint_path: "/home/zach/chapter2/chapter2/application/model.tflearn"
all_model_checkpoint_paths: "/home/zach/chapter2/chapter2/application/model.tflearn"
--------------------------------------------------------------------------------
/model/checkpoint:
--------------------------------------------------------------------------------
model_checkpoint_path: "/home/zach/chapter2/chapter2/application/model/model.tflearn"
all_model_checkpoint_paths: "/home/zach/chapter2/chapter2/application/model/model.tflearn"
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
bleach==1.5.0
enum34==1.1.6
html5lib==0.9999999
Markdown==2.6.11
numpy==1.14.0
protobuf==3.5.1
six==1.11.0
tensorflow==1.12.1
tensorflow-tensorboard==0.4.0rc3
Werkzeug==0.14.1
nltk
tflearn
h5py
scipy
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# PQ Allocation Neural Net #

## neuralnet.py ##

This application trains the neural net, using the TensorFlow library.

The training data is contained within the application folder, in a file named json2.txt.

json2.txt is a dictionary of lists, with each Unit in DfT as a key and, as the corresponding list, every PQ that Unit has answered in the last two years.
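
For illustration, its structure is assumed to look something like this (the unit names and questions below are invented examples, not taken from the real data):

    {
        "Roads": [
            "what estimate he has made of spending on pothole repairs in each of the last two years",
            "how many miles of motorway were resurfaced in 2017"
        ],
        "Rail": [
            "what recent discussions he has had with train operators about fares on the southern network"
        ]
    }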

The #SORT OUT DATA# section of neuralnet.py gets the data into a format TensorFlow can use, i.e. converting each question into a one-dimensional array of 0s and 1s (a bag of words).

The #DESIGN MODEL# section defines the neural net.

The #RUN MODEL# section instantiates the model, provides a location for TensorBoard logs, trains it, and saves the state of the model on completion (to the model/ directory).

The #TESTING# section runs a handful of example questions through the model immediately after training.

## allocator.py ##

This application re-instantiates the trained model, lets the user type in questions, and outputs the suggested unit for each one.
--------------------------------------------------------------------------------
/data_import.py:
--------------------------------------------------------------------------------
import pandas as pd
from textacy import preprocess_text


def import_data(filepath):
    df = pd.read_excel(filepath, header=0, sheet_name='FullExtract')

    df['year'] = df.DateOfReceipt.astype('str').str[0:4]

    # We will subset the df to only include key values
    df = df[['ItemID', 'Summary', 'ORG_UNIT_NAME', 'year']].copy()
    df = df[df.Summary.str.match(
        'To ask the Secretary of State for Transport')].copy()

    # Remove the leading sentence
    df['Summary'] = df.Summary.str.replace(
        '^To ask the Secretary of State for Transport, ', '')
    return (df)


def clean_text(df):
    '''
    Func to clean up textual data to remove items that aren't likely to be
    useful in machine learning.
    '''

    def preprocess_text_settings(string_in):
        string_in = preprocess_text(
            string_in,
            fix_unicode=True,
            lowercase=True,
            no_urls=True,
            no_emails=True,
            no_numbers=True,
            no_accents=True,
            no_punct=True)
        return (string_in)

    df['summary_cleaned'] = df['Summary'].apply(preprocess_text_settings)
    return (df)


def import_and_clean(filepath='data/Extract.xlsx'):
    df = import_data(filepath)
    df = clean_text(df)
    return (df)
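

# Example usage (illustrative only - assumes a data/Extract.xlsx whose 'FullExtract'
# sheet has ItemID, Summary, ORG_UNIT_NAME and DateOfReceipt columns):
#
#   df = import_and_clean('data/Extract.xlsx')
#   print(df[['ORG_UNIT_NAME', 'summary_cleaned']].head())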
--------------------------------------------------------------------------------
/allocator.py:
--------------------------------------------------------------------------------
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import random
import json
import string
import unicodedata
import sys
from nltk.corpus import stopwords

#punctuation remover (keys in the table map to None, so translate() just deletes the punctuation in the sentence)
def remove_punctuation(sentence, tbl):
    return sentence.translate(tbl)


def prepare_input_and_output_data(pq_data):
    """takes json data as per json2.txt, processes and returns a list of
    all the words used in each question and a list of categories"""

    #create table (a dictionary) of punctuation characters to pass to remove_punctuation
    #unicodedata.category for punctuation always starts with a 'P'
    table = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

    #list of categories (TODO: this needs to be the same as the category list created when the model is trained!)
    categories = sorted(list(pq_data.keys()))
    #list of words that holds all unique stemmed words
    words = []

    for each_category in pq_data.keys():
        for each_sentence in pq_data[each_category]:
            #remove punctuation
            each_sentence = remove_punctuation(each_sentence, table)
            #extract words from sentence and add to list 'words'
            w = nltk.word_tokenize(each_sentence)
            words.extend(w)

    #stem and lower each word
    words = [stemmer.stem(w.lower()) for w in words]
    #remove duplicates (the set does this)
    words = sorted(list(set(words)))

    return words, categories


def rebuild_neural_net(word_list, category_list, model_location):
    """Takes expected input and output data lists plus location of saved model,
    returns a complete model for use"""
    #input layer - net is expecting a shape the size of the bag of words
    net = tflearn.input_data(shape=[None, len(word_list)])
    #hidden layer 1 of size 8
    net = tflearn.fully_connected(net, 8)
    #hidden layer 2 of size 8
    net = tflearn.fully_connected(net, 8)
    #output layer
    net = tflearn.fully_connected(net, len(category_list), activation='softmax')
    net = tflearn.regression(net)

    model = tflearn.DNN(net, tensorboard_dir='../tflearn_logs')
    model.load('{}'.format(model_location))
    return model


def create_bagofwords(sentence, word_list):
    """Takes an input string and the list of all words, returns a BoW NumPy array ready for insertion into the model"""
    sentence_words = nltk.word_tokenize(sentence)
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    BoW = [0] * len(word_list)
    for s in sentence_words:
        for i, w in enumerate(word_list):
            if w == s:
                BoW[i] = 1
    return (np.array(BoW))


if __name__ == '__main__':

    stemmer = LancasterStemmer()

    #load json training data into 'data'
    with open('json2.txt', 'r') as json_data:
        data = json.load(json_data)

    wordlist, categories = prepare_input_and_output_data(data)

    net = rebuild_neural_net(wordlist, categories, 'model/model.tflearn')

    while True:
        sent1 = input("Enter question: ")
        print("{}: {}".format(sent1, categories[np.argmax(net.predict([create_bagofwords(sent1, wordlist)]))]))
--------------------------------------------------------------------------------
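
The TODO inside prepare_input_and_output_data flags that the word and category lists rebuilt here have to match the ones the model was trained on. One way to guarantee that (a sketch only - neither script currently does this, and the vocab.json filename is just an invented example) would be for neuralnet.py to save the lists it trained with and for allocator.py to load them rather than recomputing them from json2.txt:

    # at the end of neuralnet.py, after model.save(...):
    with open('model/vocab.json', 'w') as f:
        json.dump({'words': words, 'categories': categories}, f)

    # in allocator.py, in place of prepare_input_and_output_data(data):
    with open('model/vocab.json', 'r') as f:
        vocab = json.load(f)
    wordlist, categories = vocab['words'], vocab['categories']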
/neuralnet.py:
--------------------------------------------------------------------------------
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import random
import json
import string
import unicodedata
import sys
from nltk.corpus import stopwords

#SORT OUT DATA#

#create table (a dictionary) that holds all punctuation characters for removal
#unicodedata.category for a punctuation character always starts with a 'P' (e.g. 'Pc' = punctuation, connector)
tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))

#punctuation remover (because keys in tbl map to None, translate() just deletes the punctuation)
def remove_punctuation(sentence):
    return sentence.translate(tbl)


#initialise LancasterStemmer (stems words quite harshly)
stemmer = LancasterStemmer()

#initialise pointer to data file
data = None

#load json training data into 'data'
with open('json2.txt', 'r') as json_data:
    data = json.load(json_data)


#get categories from json (ie business units)
categories = sorted(list(data.keys()))
print(categories)

#set of stop words:
stopWords = set(stopwords.words('english'))

#list of words that holds all unique stemmed words
words = []

#list of tuples of words and units
docs = []

for each_category in data.keys():
    for each_sentence in data[each_category]:
        #remove punctuation
        each_sentence = remove_punctuation(each_sentence)
        #strip the standard lead-in phrases
        each_sentence = each_sentence.replace('To ask the Secretary of State for Transport ', '')
        each_sentence = each_sentence.replace('To ask To ask Her Majesty\'s Government ', '')
        #extract words from sentence and add to list 'words'
        w = nltk.word_tokenize(each_sentence)
        words.extend(w)

        #for word in tokenized_words:
            #if word not in stopWords:
                #words.append(word)
                #w.append(word)

        #add list of words from sentence and category as a tuple to docs
        docs.append((w, each_category))

#stem and lower each word
words = [stemmer.stem(w.lower()) for w in words]
#remove duplicates (the set does this)
words = sorted(list(set(words)))

#list to hold training data
training = []
# create an array of 0s for our output
output_empty = [0] * len(categories)


for doc in docs:
    # initialize our bag of words (bow) for each document in the list
    bow = []
    # list of tokenized words for the pattern
    token_words = doc[0]
    # stem each word
    token_words = [stemmer.stem(word.lower()) for word in token_words]
    # create our bag of words array
    for w in words:
        bow.append(1) if w in token_words else bow.append(0)

    output_row = list(output_empty)
    output_row[categories.index(doc[1])] = 1

    # our training set contains the bag of words and the output row that tells us which category that bow belongs to
    training.append([bow, output_row])
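
# As a purely hypothetical illustration of the encoding above: if words were
# ['acc', 'cycl', 'rail'] and a question's stemmed tokens were ['cycl'], its
# bow would be [0, 1, 0]; if that question belonged to the second of three
# categories, its output_row would also be [0, 1, 0].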

# shuffle our features and turn into np.array as tensorflow takes in numpy arrays
random.shuffle(training)
training = np.array(training)

# train_x contains the bag of words and train_y contains the label/category
train_x = list(training[:, 0])
train_y = list(training[:, 1])

#DESIGN MODEL#

#reset underlying graph data
tf.reset_default_graph()
#Build neural network
#input layer - size is the same as the input matrix (ie length of the BoW)
net = tflearn.input_data(shape=[None, len(train_x[0])])
#Hidden layer1 of size 8
net = tflearn.fully_connected(net, 8)
#Hidden layer2 of size 8
net = tflearn.fully_connected(net, 8)
#output layer - size is dictated by the number of units we're allocating cases to
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
#regression layer - defines the loss (categorical cross-entropy) and optimiser (Adam by default) used in training
net = tflearn.regression(net)

#RUN MODEL#

#define model and set up tensorboard
model = tflearn.DNN(net, tensorboard_dir='../tflearn_logs')
#start training
model.fit(train_x, train_y, n_epoch=100, batch_size=10, show_metric=True)
#save model once training is complete
model.save('model/model.tflearn')

#TESTING#

#testing questions
sent1 = "How much money is spent on cycling"
sent2 = "How many accidents were there last year"
sent3 = "operation stack"
sent4 = "vnuk european court of justice"
sent5 = "exiting the european union"
sent6 = "rail fares southern rail"


#convert testing questions into a bag of words numpy array
def get_tf_record(sentence):
    global words
    sentence_words = nltk.word_tokenize(sentence)
    #sentence_words = []
    #for word in tokenized_words:
        #if word not in stopWords:
            #sentence_words.append(word)

    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]

    BoW = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                BoW[i] = 1
    return (np.array(BoW))


#print results of testing
print(categories[np.argmax(model.predict([get_tf_record(sent1)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent2)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent3)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent4)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent5)]))])
print(categories[np.argmax(model.predict([get_tf_record(sent6)]))])
--------------------------------------------------------------------------------
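
The #TESTING# block above only eyeballs a few hand-written questions. If a rough accuracy figure were wanted, one option (a sketch, relying on tflearn's validation_set argument, which accepts a fraction of the training data to hold out) would be to change the fit call to something like:

    model.fit(train_x, train_y, n_epoch=100, batch_size=10, validation_set=0.1, show_metric=True)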