├── Mariam_Garba_NLP_HW1_Report.pdf
├── README.md
├── code
    ├── model.py
    ├── predict.py
    ├── preprocess.py
    ├── score.py
    └── train.py
└── resources
    └── .gitkeep


/Mariam_Garba_NLP_HW1_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokeam/Chinese-Word-Segmentation-in-NLP/93f0bae947152e3885adba966cdd67f3ae5ffc1e/Mariam_Garba_NLP_HW1_Report.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Chinese Word Segmentation 
 2 | 
 3 | State of the art Chinese Word Segmentation with Bi-LSTMs (Ji Ma, Kuzman Ganchev and David Weiss, EMNLP 2018) - (https://aclweb.org/anthology/D18-1529)
 4 | 
 5 | ## Compatibility
 6 | 
 7 | Python3.6.X,&nbsp;&nbsp;Tensorflow 1.12.0
 8 | 
 9 | ## Notes
10 | 
11 | In this project, four chinese datasets (AS,CITYU,MSR and PKU) were used to train the deep learning model for chinese word segmentation task. These datasets can be gotten from: http://sighan.cs.uchicago.edu/bakeoff2005/
12 | 
13 | 
14 | 
15 | ## For Training
16 | 
17 | ```bash
18 | Run: python3 train.py
19 | ```
20 | input_file_path is the path that contains no-space chinese sequence. &nbsp;
21 | 
22 | label_file_path is the path that contains the chinese sequence labels in BIES format.
23 | 
24 | ## For Preprocessing
25 | 
26 | ```bash
27 | Run: python3 preprocess.py original_file_path input_file_path output_file_path 
28 | ```
29 | original_file_path is the file that contains the chinese sequence. &nbsp;
30 | 
31 | input_file_path is the path to save the no-space chinese sequence. &nbsp;
32 | 
33 | label_file_path is the path to save the chinese sequence labels in BIES format.
34 | 
35 | ## For Prediction
36 | 
37 | ```bash
38 | Run: python3 predict.py input_path output_path resources_path
39 | ```
40 | input_path is the file that contains the no-space chinese sequence. &nbsp;
41 | 
42 | output_path is the path to save the predictions in BIES format. &nbsp;
43 | 
44 | resources_path is the path to the saved model. &nbsp;
45 | 
46 | The saved model and extras can be downloaded from http://bit.ly/2PKGZBg and placed in the resources folder.
47 | 
48 | ## For Scoring
49 | 
50 | ```bash
51 | Run: python3 score.py predicition_file gold_file
52 | ```
53 | prediction_file is the file that contains the predicitions in BIES format from previous step. &nbsp;
54 | 
55 | gold_file is the path to the gold file in BIES format.
56 | 


--------------------------------------------------------------------------------
/code/model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import collections
  3 | import tensorflow as tf
  4 | import numpy as np
  5 | import pickle
  6 | from typing import Tuple, List, Dict
  7 | from tensorflow.keras.callbacks import TensorBoard,ModelCheckpoint
  8 | from collections import Counter
  9 | from tensorflow.keras.layers import Dense, Input,Masking,LSTM, Embedding,Reshape, Dropout, Activation,TimeDistributed,Bidirectional
 10 | from tensorflow.keras.models import Model,Sequential,load_model
 11 | from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
 12 | from tensorflow.keras.optimizers import SGD
 13 | from tensorflow.keras.preprocessing.sequence import pad_sequences
 14 | from tensorflow.keras.utils import to_categorical 
 15 | from tensorflow.keras import backend as K
 16 | 
 17 | unigram_path = '../resources/as_cityu_msr_pku_unigram.utf8'
 18 | X_test_path = 'pku_input.utf8'
 19 | unigram_vocab = dict()
 20 | unigram_word_to_id = dict()
 21 | X_test_uni = []
 22 | 
 23 | def vocabulary(unigram_path):
 24 | 	"""
 25 | 	This is the function to build the vocabulary of the dataset.
 26 | 	
 27 | 	:param unigram_path: The path to the file that contains the unigrams
 28 | 	:return: None
 29 | 	"""
 30 | 	with open(unigram_path, 'r', encoding='utf8') as f:
 31 | 	  original_lines = f.readlines()
 32 | 	  for line in original_lines:
 33 | 	  	words = line.split()
 34 | 	  	for word in words:
 35 | 	  		if word not in unigram_vocab:
 36 | 	  			unigram_vocab[word] = 1
 37 | 	  		else:
 38 | 	  			unigram_vocab[word] += 1
 39 | 
 40 | def word2index():
 41 | 	"""
 42 | 	Converts each character to its index in the vocabulary
 43 | 	
 44 | 	:return: None
 45 | 	"""
 46 | 	vocabulary(unigram_path)
 47 | 	unigram_word_to_id["<PAD>"] = 0 #zero is not casual!
 48 | 	unigram_word_to_id["<UNK>"] = 1 #OOV are mapped as <UNK>
 49 | 	unigram_word_to_id.update({k:v+len(unigram_word_to_id) for k, v in unigram_vocab.items()})
 50 | 
 51 | def tokenize_dataset(X_test_path):
 52 | 	word2index()
 53 | 	with open(X_test_path, 'r', encoding='utf8') as f:
 54 | 	  original_lines = f.readlines()
 55 | 	  original_lines = [line.replace("\u3000","")for line in original_lines]
 56 | 	  for line in original_lines:
 57 | 	  	words = line.split()
 58 | 	  	for word in words:
 59 | 	  		char = []
 60 | 	  		for c in word:
 61 | 	  			try:
 62 | 	  				char.append(unigram_word_to_id[c])
 63 | 	  			except KeyError:
 64 | 	  				char.append(unigram_word_to_id["<UNK>"])
 65 | 	  	X_test_uni.append(char)
 66 | 
 67 | def precision(y_true, y_pred):
 68 | 	"""Precision metric.
 69 | 	Only computes a batch-wise average of precision.
 70 | 	Computes the precision, a metric for multi-label classification of
 71 | 	how many selected items are relevant.
 72 | 	"""
 73 | 	true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
 74 | 	predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
 75 | 	precision = true_positives / (predicted_positives + K.epsilon())
 76 | 	return precision
 77 | 
 78 | def convert_integer_to_label(string):
 79 | 	"""
 80 | 	Converts the encoded integer labels from integers to BIES format
 81 | 	:param string: integer to be converted to BIES format
 82 | 	:return: Array of labels in BIES format+
 83 | 	"""
 84 | 	tags = []
 85 | 	for word in string:
 86 | 		if word == '3':
 87 | 			tags.append('S')  # 'S', a Single character
 88 | 		if word == '0':
 89 | 				tags.append('B')  # 'B', Begin of a word
 90 | 		if word == '1':
 91 | 			tags.append('I')  # 'I', Middle of a word
 92 | 		if word == '2':
 93 | 			tags.append('E')  # 'E',  End of a word
 94 | 	return tags
 95 | 
 96 | def getlabel(array):
 97 | 	"""
 98 | 	Get the BIES format of an array
 99 | 	:param array: The encoded integer labels to be converted to BIES format
100 | 	"""
101 | 	result = []
102 | 	for i in array:
103 | 		string = ""
104 | 		for digit in i:
105 | 			string += str(digit)
106 | 		result.append(convert_integer_to_label(string))
107 | 	return result
108 | 
109 | def predict_model(input_path,output_path,model_path):
110 | 	tokenize_dataset(input_path)
111 | 	model = load_model(model_path,custom_objects={"precision": precision})
112 | 
113 | 	y_pred = [None]*len(X_test_uni)
114 | 	for i in range(len(X_test_uni)):
115 | 		this_pred = model.predict(X_test_uni[i])    
116 | 		y_pred[i] = this_pred
117 | 
118 | 	Y = [None]*len(y_pred)
119 | 	for i in range(len(y_pred)):
120 | 	  Y[i] = y_pred[i].argmax(axis=-1)
121 | 
122 | 	A = getlabel(Y)
123 | 
124 | 	with open(output_path, 'w') as f:
125 | 		for item in A:
126 | 			line = ("".join("%s"%a for a in item))
127 | 			f.write("%s\n" %line)
128 | 		print("BIES Predictions Saved at "+output_path)


--------------------------------------------------------------------------------
/code/predict.py:
--------------------------------------------------------------------------------
 1 | from argparse import ArgumentParser
 2 | from model import predict_model
 3 | 
 4 | def parse_args():
 5 |     parser = ArgumentParser()
 6 |     parser.add_argument("input_path", help="The path of the input file")
 7 |     parser.add_argument("output_path", help="The path of the output file")
 8 |     parser.add_argument("resources_path", help="The path of the resources needed to load your model")
 9 | 
10 |     return parser.parse_args()
11 | 
12 | 
13 | def predict(input_path, output_path, resources_path):
14 |     """
15 |     This is the skeleton of the prediction function.
16 |     The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
17 |     with your predictions in the BIES format.
18 |     
19 |     The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.
20 |     
21 |     N.B. DO NOT HARD CODE PATHS IN HERE. Use resource_path instead, otherwise we will not be able to run the code.
22 | 
23 |     :param input_path: the path of the input file to predict.
24 |     :param output_path: the path of the output file (where you save your predictions)
25 |     :param resources_path: the path of the resources folder containing your model and stuff you might need.
26 |     :return: None
27 |     """
28 |     print("Loading......")
29 |     predict_model(input_path,output_path,resources_path)
30 |     print("Done!")
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     args = parse_args()
35 |     predict(args.input_path, args.output_path, args.resources_path)
36 | 


--------------------------------------------------------------------------------
/code/preprocess.py:
--------------------------------------------------------------------------------
  1 | #!/usr/local/bin/python
  2 | # -*- coding: utf8 -*-
  3 | import re
  4 | import numpy as np
  5 | import six
  6 | import sys
  7 | import getopt
  8 | import locale
  9 | from argparse import ArgumentParser
 10 | 
 11 | def parse_args():
 12 | 	parser = ArgumentParser()
 13 | 	parser.add_argument("original_file_path", help="The path of the original file")
 14 | 	parser.add_argument("input_file_path", help="The path of the input file with no spaces")
 15 | 	parser.add_argument("label_file_path", help="The path of the label file in BIES format")
 16 | 	return parser.parse_args()
 17 | 
 18 | def generate_ngram(string,n):
 19 | 	"""
 20 | 	This is the function to generate n-grams.
 21 | 	
 22 | 	:param string: The string to be splitted.
 23 | 	:param n: The number of gram
 24 | 	:return: A string of ngrams
 25 | 	:usage: ("ABCD",1) -> "A B C D" or ("ABCD",2) -> "AB BC CD"
 26 | 	"""
 27 | 
 28 | 	ans = ''
 29 | 	for i in range(len(string) - n + 1):
 30 | 		ans += string[i:i+n]
 31 | 		ans += '  '
 32 | 	return ans
 33 | 
 34 | def get_ngrams(input_file_path):
 35 | 	"""
 36 | 	This is the function that writes ngrams to file.
 37 | 	
 38 | 	:param input_file_path: The path to file that contains strings to be converted to ngrams
 39 | 	:return: unigram file
 40 | 	"""
 41 | 	
 42 | 	corpora = open(input_file_path, 'r', encoding='utf8')
 43 | 	unigram_input = open('unigram.utf8', 'w', encoding='utf8')
 44 | 	all_lines = corpora.readlines()
 45 | 	all_lines = [line.replace(' ', '')[0:-1] for line in all_lines]
 46 | 	for line in all_lines:
 47 | 		unigram_input.write(generate_ngram(line,1))
 48 | 		if line != all_lines[-1]:
 49 | 			unigram_input.write('\n')
 50 | 		else:
 51 | 			pass
 52 | 	corpora.close()
 53 | 	unigram_input.close()
 54 | 	print("Unigram Generated!")
 55 | 
 56 | def convert_to_bies(string):
 57 | 	"""
 58 | 	This is the function to encode labels in BIES format.
 59 | 	
 60 | 	:param string: The labels to be encoded
 61 | 	:return: Encoded labels in BIES format
 62 | 	:usage: ("共同 创造 美好 的 新 世纪 ——") -> "BEBEBESSBEBE"
 63 | 	"""
 64 | 
 65 | 	features = []
 66 | 	for word in string.split():
 67 | 		for c in word:
 68 | 			feature = ""
 69 | 			len_word = len(word)
 70 | 			if len_word == 1:
 71 | 				feature += "S"
 72 | 			else:
 73 | 				feature += "B"
 74 | 				for i_ in range(len_word - 2):
 75 | 					feature += "I"
 76 | 				feature += "E"
 77 | 		features.append(feature)
 78 | 	results = ''.join(str(e) for e in features)
 79 | 	return results
 80 | 
 81 | def preprocess(original_file,input_file,label_file):
 82 | 	"""
 83 | 	This is the function that read the original training file.
 84 |  
 85 | 	:param string: The labels to be encoded
 86 | 	:return: Input file with no spaces and label file in BIES format
 87 | 	"""
 88 | 
 89 | 	with open(original_file, 'r', encoding='utf8') as f:
 90 | 		original_lines = f.readlines()
 91 | 		original_lines = list(filter(lambda x: x.strip(),original_lines))
 92 | 	# remove spacess
 93 | 	lines = [re.sub(r'\s(?=[^A-z0-9])','',line) for line in original_lines]
 94 | 	lines = [line.replace(" ","")for line in lines]
 95 | 	lines = [line.replace("\u3000","")for line in lines]
 96 | 	
 97 | 	# finally, write lines in the file
 98 | 	with open(input_file, 'w') as f:
 99 | 		f.writelines(lines)
100 | 
101 | 	# finally, write labels in the file
102 | 	label_lines = [convert_to_bies(label) for label in original_lines]
103 | 	with open(label_file, 'w') as f:
104 | 		for item in label_lines:
105 | 			f.write("%s" % item)
106 | 			if item != label_lines[-1]:
107 | 				f.write("\n")
108 | 	print("Input and Label files generated!")
109 | 
110 | if __name__ == '__main__':
111 | 	args = parse_args()
112 | 	preprocess(args.original_file_path, args.input_file_path, args.label_file_path)
113 | 	get_ngrams(args.input_file_path)


--------------------------------------------------------------------------------
/code/score.py:
--------------------------------------------------------------------------------
 1 | from argparse import ArgumentParser
 2 | 
 3 | ALL_TAGS = {"B", "I", "E", "S"}
 4 | 
 5 | 
 6 | def parse_args():
 7 |     parser = ArgumentParser()
 8 |     parser.add_argument("prediction_file", help="The path to the prediction file (in BIES format)")
 9 |     parser.add_argument("gold_file", help="The path to the gold file (in BIES format)")
10 | 
11 |     return parser.parse_args()
12 | 
13 | 
14 | def is_valid_prediction(prediction_iter, gold_iter):
15 |     assert len(prediction_iter) == len(gold_iter), "Prediction and gold have different lengths"
16 | 
17 |     prediction_tags = set()
18 |     gold_tags = set()
19 |     nr_line = 1
20 |     for preds, gold in zip(prediction_iter, gold_iter):
21 |         assert len(preds) == len(gold), "Line " + str(nr_line) + ": lengths mismatch"
22 |         prediction_tags.update(preds)
23 |         gold_tags.update(gold)
24 |         nr_line += 1
25 |     
26 |     prediction_tags = {t.upper() for t in prediction_tags}
27 |     gold_tags = {t.upper() for t in gold_tags}
28 |     
29 |     assert len(gold_tags.difference(ALL_TAGS)) == 0, "Unknown tag detected in gold"
30 |     assert len(prediction_tags.difference(ALL_TAGS)) == 0, "Unknown tag detected in predictions"
31 | 
32 | 
33 | def score(prediction_iter, gold_iter, verbose=False):
34 |     """
35 |     Returns the precision of the model's predictions w.r.t. the gold standard (i.e. the tags of the
36 |     correct word segmentation).
37 | 
38 |     :param prediction_iter: List of strings in the BIES format representing the model's predictions.
39 |     :param gold_iter: List of strings in the BIES format representing the gold standard.
40 | 
41 |     :return: precision [0.0, 1.0]
42 |     
43 |     Ex. predictions_iter = ["BEBESBIIE",
44 |                             "BIIIEBEBESS"]
45 |         gold_iter = ["BEBIEBIES",
46 |                      "BIIESBEBESS"]
47 |         output: 0.7
48 |     
49 |     The same result can be obtain by passing list of lists
50 |     Ex. predictions_iter = [["B", "E", "B", "E", "S", "B", "I", "I", "E"],
51 |                             ["B", "I", "I", "I", "E", "B", "E", "B", "E", "S", "S"]]
52 |         gold_iter = [["B", "E", "B", "I", "E", "B", "I", "E", "S"],
53 |                      ["B", "I", "I", "E", "S", "B", "E", "B", "E", "S", "S"]]
54 |         output: 0.7
55 | 
56 |     
57 |     """
58 |     
59 |     is_valid_prediction(prediction_iter, gold_iter)
60 | 
61 |     right_predictions = 0
62 |     wrong_predictions = 0
63 | 
64 |     for prediction_sentence, gold_sentence in zip(prediction_iter, gold_iter):
65 |         for prediction_tag, gold_tag in zip(prediction_sentence, gold_sentence):
66 |             if prediction_tag == gold_tag:
67 |                 right_predictions += 1
68 |             else:
69 |                 wrong_predictions += 1
70 | 
71 |     precision = right_predictions / (right_predictions + wrong_predictions)
72 |     if verbose:
73 |         print("Precision:\t", precision)
74 | 
75 |     return precision
76 | 
77 | 
78 | def label_text_to_iter(file_path):
79 |     iter_ = []
80 |     with open(file_path) as f:
81 |         for line in f:
82 |             line = line.strip().upper()
83 |             iter_.append(line)
84 |     return iter_
85 | 
86 | 
87 | if __name__ == '__main__':
88 |     args = parse_args()
89 |     prediction_iter = []
90 |     score(label_text_to_iter(args.prediction_file), label_text_to_iter(args.gold_file), verbose=True)
91 | 
92 | 


--------------------------------------------------------------------------------
/code/train.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from typing import Tuple, List, Dict
  3 | import tensorflow as tf
  4 | import numpy as np
  5 | from tensorflow.keras.callbacks import TensorBoard,ModelCheckpoint,EarlyStopping
  6 | from collections import Counter
  7 | from tensorflow.keras.layers import Dense, Input,Masking,LSTM, Embedding,Reshape, Dropout, Activation,TimeDistributed,Bidirectional,concatenate, GlobalMaxPool1D
  8 | from tensorflow.keras.models import Model,Sequential,load_model
  9 | from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
 10 | import pickle
 11 | from tensorflow.keras.optimizers import SGD
 12 | from  tensorflow.keras.regularizers import l2
 13 | from tensorflow.keras.preprocessing.text import Tokenizer
 14 | from tensorflow.keras.preprocessing.sequence import pad_sequences
 15 | from tensorflow.keras.utils import to_categorical 
 16 | from sklearn.model_selection import GridSearchCV
 17 | from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
 18 | import collections
 19 | from sklearn.model_selection import train_test_split
 20 | import matplotlib.pyplot as plt
 21 | from tensorflow.keras import backend as K
 22 | 
 23 | unigram_path = '../resources/as_cityu_msr_pku_unigram.utf8'
 24 | X_train_path = '../resources/as_cityu_msr_pku_input.utf8'
 25 | Y_train_path = '../resources/as_cityu_msr_pku_label.utf8'
 26 | 
 27 | unigram_vocab = dict()
 28 | unigram_word_to_id = dict()
 29 | X_train_uni = []
 30 | Y_train = []
 31 | 
 32 | 
 33 | def vocabulary(unigram_path=unigram_path ):
 34 | 	"""
 35 | 	This is the function to build the vocabulary of the dataset.
 36 | 	
 37 | 	:param unigram_path: The path to the file that contains the unigrams
 38 | 	:return: None
 39 | 	"""
 40 | 	with open(unigram_path, 'r', encoding='utf8') as f:
 41 | 	  original_lines = f.readlines()
 42 | 	  for line in original_lines:
 43 | 	  	words = line.split()
 44 | 	  	for word in words:
 45 | 	  		if word not in unigram_vocab:
 46 | 	  			unigram_vocab[word] = 1
 47 | 	  		else:
 48 | 	  			unigram_vocab[word] += 1
 49 | 
 50 | def word2index():
 51 | 	"""
 52 | 	Converts each character to its index in the vocabulary
 53 | 	
 54 | 	:return: None
 55 | 	"""
 56 | 	vocabulary()
 57 | 	unigram_word_to_id["<PAD>"] = 0 #zero is not casual!
 58 | 	unigram_word_to_id["<UNK>"] = 1 #OOV are mapped as <UNK>
 59 | 	unigram_word_to_id.update({k:v+len(unigram_word_to_id) for k, v in unigram_vocab.items()})
 60 | 
 61 | def tokenize_dataset(X_train_path=X_train_path):
 62 | 	"""
 63 | 	Converts each character to its index in the vocabulary
 64 | 
 65 | 	:param X_train_path: path to the trainig set with no spaces 
 66 | 	:return: encoded X  training set
 67 | 	"""
 68 | 	word2index()
 69 | 	with open(X_train_path, 'r', encoding='utf8') as f:
 70 | 	  original_lines = f.readlines()
 71 | 	  original_lines = [line.replace("\u3000","")for line in original_lines]
 72 | 	  for line in original_lines:
 73 | 	  	words = line.split()
 74 | 	  	for word in words:
 75 | 	  		char = []
 76 | 	  		for c in word:
 77 | 	  			try:
 78 | 	  				char.append(unigram_word_to_id[c])
 79 | 	  			except KeyError:
 80 | 	  				char.append(unigram_word_to_id["<UNK>"])
 81 | 	  	X_train_uni.append(char)
 82 | 	return X_train_uni
 83 | 
 84 | def convert_labels_to_integer(string):
 85 | 	"""
 86 | 	Converts the labels from BIES format to integer
 87 | 	:param string: integer to be converted to BIES format
 88 | 	:return: Array of labels in Integer
 89 | 	"""
 90 | 	tags = []
 91 | 	for words in string.split():
 92 | 		for word in words:
 93 | 			if word == 'S':
 94 | 				tags.append(3)  # 'S', a Single character
 95 | 			if word == 'B':
 96 | 					tags.append(0)  # 'B', Begin of a word
 97 | 			if word == 'I':
 98 | 				tags.append(1)  # 'I', Middle of a word
 99 | 			if word == 'E':
100 | 				tags.append(2)  # 'E',  End of a word
101 | 	return tags
102 | 
103 | 
104 | def encode_y(Y_train_path=Y_train_path):
105 | 	"""
106 | 	Encodes the labels
107 | 	:param Y_train_path: Path to labels in BIES format
108 | 	:return: Array of one hot encoded training labels
109 | 	"""
110 | 	#Training Labels
111 | 	with open(Y_train_path, 'r', encoding='utf8') as f:
112 | 			label_original_lines = f.readlines()
113 | 	Y_tra = [convert_labels_to_integer(label) for label in label_original_lines]
114 | 
115 | 	#One Hot Encoding of Training Labels
116 | 	for y in Y_tra:
117 | 		Y_train.append(to_categorical(y,num_classes=4))
118 | 	return Y_train
119 | 
120 | def pad_data(X_train_path,Y_train_path):
121 | 	"""
122 | 	Pad training set sequences
123 | 	:param X_train_path: Path to X encoded
124 | 	:param Y_train_path: Path to Y encoded
125 | 	:return: padded training sets
126 | 	"""
127 | 	max_len = (sum(len(line) for line in X_train_uni) / len(X_train_uni))
128 | 	MAX_LEN = round(max_len)+1
129 | 
130 | 	train_x_uni_padded = pad_sequences(X_train_uni,padding='post', maxlen=MAX_LEN)
131 | 	train_y_padded = pad_sequences(Y_train,padding='post', maxlen=MAX_LEN)
132 | 
133 | 	return train_x_uni_padded,train_y_padded
134 | 
135 | 
136 | def precision(y_true, y_pred):
137 | 	"""Precision metric.
138 | 	Only computes a batch-wise average of precision.
139 | 	Computes the precision, a metric for multi-label classification of
140 | 	how many selected items are relevant.
141 | 	"""
142 | 	true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
143 | 	predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
144 | 	precision = true_positives / (predicted_positives + K.epsilon())
145 | 
146 | 	return precision
147 | 
148 | def bilstm_model():
149 | 	"""
150 | 	Bilstm model
151 | 	:return: model
152 | 	"""
153 | 	LEN = 2000000
154 | 	visible = Input(shape=(None,))
155 | 	em = Embedding(LEN,64,input_length=None,mask_zero=True)(visible)
156 | 	hidden = Bidirectional(LSTM(256,return_sequences=True,dropout=0.6,recurrent_dropout=0.4),merge_mode='sum')(em)
157 | 	output = TimeDistributed(Dense(4,activation='softmax'))(hidden)
158 | 	model = Model(inputs=visible, outputs=output)
159 | 	model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.035, momentum=0.95), metrics=['accuracy',precision])
160 | 	return model
161 | 
162 | 
163 | if __name__ == '__main__':
164 | 	word2index()
165 | 	X_train_uni = tokenize_dataset()
166 | 	Y_train = encode_y()
167 | 	train_x_uni_padded,train_y_padded = pad_data(X_train_uni,Y_train)
168 | 	model = bilstm_model()
169 | 	filepath = "weights.{epoch:02d}-{val_loss:.2f}.hdf5"
170 | 	mc = ModelCheckpoint(filepath, monitor='val_precision', verbose=1, save_best_only=True, mode='max')
171 | 	print("Training")
172 | 	history = model.fit(train_x_uni_padded,train_y_padded,batch_size=256, epochs=20, verbose=1,shuffle=True,validation_split=0.2,callbacks=[mc])
173 | 
174 | 	# Plot training & validation precision values
175 | 	plt.plot(history.history['precision'])
176 | 	plt.plot(history.history['val_precision'])
177 | 	plt.title('Model Precision')
178 | 	plt.ylabel('Precision')
179 | 	plt.xlabel('Epoch')
180 | 	plt.legend(['Train', 'Test'], loc='upper left')
181 | 	plt.show()
182 | 
183 | 	# Plot training & validation accuracy values
184 | 	plt.plot(history.history['acc'])
185 | 	plt.plot(history.history['val_acc'])
186 | 	plt.title('Model accuracy')
187 | 	plt.ylabel('Accuracy')
188 | 	plt.xlabel('Epoch')
189 | 	plt.legend(['Train', 'Test'], loc='upper left')
190 | 	plt.show()
191 | 
192 | 	# Plot training & validation loss values
193 | 	plt.plot(history.history['loss'])
194 | 	plt.plot(history.history['val_loss'])
195 | 	plt.title('Model loss')
196 | 	plt.ylabel('Loss')
197 | 	plt.xlabel('Epoch')
198 | 	plt.legend(['Train', 'Test'], loc='upper left')
199 | 	plt.show()


--------------------------------------------------------------------------------
/resources/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokeam/Chinese-Word-Segmentation-in-NLP/93f0bae947152e3885adba966cdd67f3ae5ffc1e/resources/.gitkeep


--------------------------------------------------------------------------------