├── Mariam_Garba_NLP_HW1_Report.pdf
├── README.md
├── code
│   ├── model.py
│   ├── predict.py
│   ├── preprocess.py
│   ├── score.py
│   └── train.py
└── resources
    └── .gitkeep
--------------------------------------------------------------------------------
/Mariam_Garba_NLP_HW1_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokeam/Chinese-Word-Segmentation-in-NLP/93f0bae947152e3885adba966cdd67f3ae5ffc1e/Mariam_Garba_NLP_HW1_Report.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Chinese Word Segmentation

State-of-the-art Chinese word segmentation with Bi-LSTMs (Ji Ma, Kuzman Ganchev and David Weiss, EMNLP 2018) - (https://aclweb.org/anthology/D18-1529)

## Compatibility

Python 3.6.x, TensorFlow 1.12.0

## Notes

In this project, four Chinese datasets (AS, CITYU, MSR and PKU) were used to train the deep learning model for the Chinese word segmentation task. The datasets are available at: http://sighan.cs.uchicago.edu/bakeoff2005/

## For Training

```bash
python3 train.py
```

`X_train_path` (set at the top of `train.py`) is the path to the file that contains the no-space Chinese sequences.

`Y_train_path` is the path to the file that contains the Chinese sequence labels in BIES format.

Both files are produced by the preprocessing step below.

## For Preprocessing

```bash
python3 preprocess.py original_file_path input_file_path label_file_path
```

original_file_path is the file that contains the space-segmented Chinese sequences.

input_file_path is the path where the no-space Chinese sequences are saved.

label_file_path is the path where the Chinese sequence labels in BIES format are saved.

## For Prediction

```bash
python3 predict.py input_path output_path resources_path
```

input_path is the file that contains the no-space Chinese sequences.

output_path is the path where the predictions in BIES format are saved.

resources_path is the path to the saved model.

The saved model and extras can be downloaded from http://bit.ly/2PKGZBg and placed in the resources folder.

## For Scoring

```bash
python3 score.py prediction_file gold_file
```

prediction_file is the file that contains the predictions in BIES format from the previous step.

gold_file is the path to the gold file in BIES format.
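
## Example

As a small illustration of the BIES scheme used throughout this project (taken from the `convert_to_bies` docstring in `code/preprocess.py`), a segmented sentence, its no-space form, and its character-level labels line up as follows:

```
Segmented: 共同 创造 美好 的 新 世纪 ——
No spaces: 共同创造美好的新世纪——
BIES tags: BEBEBESSBEBE
```

Each multi-character word contributes `B` (begin) and `E` (end), with `I` (inside) for any middle characters; single-character words are tagged `S`.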
--------------------------------------------------------------------------------
/code/model.py:
--------------------------------------------------------------------------------
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras import backend as K

unigram_path = '../resources/as_cityu_msr_pku_unigram.utf8'
unigram_vocab = dict()
unigram_word_to_id = dict()
X_test_uni = []


def vocabulary(unigram_path):
    """
    Builds the character vocabulary of the dataset.

    :param unigram_path: The path to the file that contains the unigrams
    :return: None
    """
    with open(unigram_path, 'r', encoding='utf8') as f:
        original_lines = f.readlines()
    for line in original_lines:
        words = line.split()
        for word in words:
            if word not in unigram_vocab:
                unigram_vocab[word] = 1
            else:
                unigram_vocab[word] += 1


def word2index():
    """
    Maps each character to its index in the vocabulary.

    :return: None
    """
    vocabulary(unigram_path)
    unigram_word_to_id["<PAD>"] = 0  # index zero is reserved for padding
    unigram_word_to_id["<UNK>"] = 1  # OOV characters are mapped to <UNK>
    # enumerate the vocabulary so every character gets a unique index
    # (offset by 2 for the <PAD> and <UNK> entries)
    unigram_word_to_id.update({k: i + 2 for i, k in enumerate(unigram_vocab)})


def tokenize_dataset(X_test_path):
    """
    Converts each character of the test file to its vocabulary index.

    :param X_test_path: path to the no-space test file
    :return: None (fills X_test_uni)
    """
    word2index()
    with open(X_test_path, 'r', encoding='utf8') as f:
        original_lines = f.readlines()
    original_lines = [line.replace("\u3000", "") for line in original_lines]
    for line in original_lines:
        words = line.split()
        for word in words:
            char = []
            for c in word:
                try:
                    char.append(unigram_word_to_id[c])
                except KeyError:
                    char.append(unigram_word_to_id["<UNK>"])
            X_test_uni.append(char)


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def convert_integer_to_label(string):
    """
    Converts the encoded integer labels to BIES format.

    :param string: string of integer labels to be converted to BIES format
    :return: Array of labels in BIES format
    """
    tags = []
    for word in string:
        if word == '3':
            tags.append('S')  # 'S', a single character
        if word == '0':
            tags.append('B')  # 'B', beginning of a word
        if word == '1':
            tags.append('I')  # 'I', middle of a word
        if word == '2':
            tags.append('E')  # 'E', end of a word
    return tags


def getlabel(array):
    """
    Gets the BIES tags of an array of integer label sequences.

    :param array: The encoded integer labels to be converted to BIES format
    :return: List of BIES tag lists
    """
    result = []
    for i in array:
        string = ""
        for digit in i:
            string += str(digit)
        result.append(convert_integer_to_label(string))
    return result


def predict_model(input_path, output_path, model_path):
    """
    Loads the trained model and writes BIES predictions for the input file.

    :param input_path: path to the no-space input file
    :param output_path: path where the BIES predictions are written
    :param model_path: path to the saved model
    :return: None
    """
    tokenize_dataset(input_path)
    model = load_model(model_path, custom_objects={"precision": precision})

    y_pred = [None] * len(X_test_uni)
    for i in range(len(X_test_uni)):
        # predict expects a batch, so wrap the sequence and unwrap the result
        this_pred = model.predict(np.array([X_test_uni[i]]))[0]
        y_pred[i] = this_pred

    Y = [None] * len(y_pred)
    for i in range(len(y_pred)):
        Y[i] = y_pred[i].argmax(axis=-1)

    A = getlabel(Y)

    with open(output_path, 'w') as f:
        for item in A:
            line = "".join("%s" % a for a in item)
            f.write("%s\n" % line)
    print("BIES Predictions Saved at " + output_path)
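

# --- Hedged sanity check (not part of the original pipeline): a quick, runnable
# look at what the batch-wise precision metric above computes. With one-hot
# y_true and softmax-like y_pred, K.round() keeps only the confident
# predictions, so the metric reduces to confident-and-correct / all-confident.
# The toy numbers below are made up for demonstration.
if __name__ == "__main__":
    y_true = K.constant([[0., 1., 0., 0.],
                         [1., 0., 0., 0.]])
    y_pred = K.constant([[0.1, 0.8, 0.05, 0.05],   # confident and correct
                         [0.2, 0.6, 0.10, 0.10]])  # confident but wrong
    print(K.eval(precision(y_true, y_pred)))       # -> 0.5 (1 correct of 2 confident)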
--------------------------------------------------------------------------------
/code/predict.py:
--------------------------------------------------------------------------------
from argparse import ArgumentParser
from model import predict_model


def parse_args():
    parser = ArgumentParser()
    parser.add_argument("input_path", help="The path of the input file")
    parser.add_argument("output_path", help="The path of the output file")
    parser.add_argument("resources_path", help="The path of the resources needed to load your model")
    return parser.parse_args()


def predict(input_path, output_path, resources_path):
    """
    This is the skeleton of the prediction function.
    The predict function will build your model, load the weights from the checkpoint and write a new file (output_path)
    with your predictions in the BIES format.

    The resources folder should contain everything you need to make the predictions. It is the "resources" folder in your submission.

    N.B. DO NOT HARD CODE PATHS IN HERE. Use resources_path instead, otherwise we will not be able to run the code.

    :param input_path: the path of the input file to predict.
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :return: None
    """
    print("Loading......")
    predict_model(input_path, output_path, resources_path)
    print("Done!")


if __name__ == '__main__':
    args = parse_args()
    predict(args.input_path, args.output_path, args.resources_path)
--------------------------------------------------------------------------------
/code/preprocess.py:
--------------------------------------------------------------------------------
#!/usr/local/bin/python
# -*- coding: utf8 -*-
import re
from argparse import ArgumentParser


def parse_args():
    parser = ArgumentParser()
    parser.add_argument("original_file_path", help="The path of the original file")
    parser.add_argument("input_file_path", help="The path of the input file with no spaces")
    parser.add_argument("label_file_path", help="The path of the label file in BIES format")
    return parser.parse_args()


def generate_ngram(string, n):
    """
    Generates n-grams from a string.

    :param string: The string to be split.
    :param n: The size of each gram
    :return: A string of n-grams
    :usage: ("ABCD", 1) -> "A B C D" or ("ABCD", 2) -> "AB BC CD"
    """
    ans = ''
    for i in range(len(string) - n + 1):
        ans += string[i:i + n]
        ans += ' '
    return ans


def get_ngrams(input_file_path):
    """
    Writes the unigrams of each line to file.

    :param input_file_path: The path to the file that contains strings to be converted to n-grams
    :return: None (writes unigram.utf8)
    """
    corpora = open(input_file_path, 'r', encoding='utf8')
    unigram_input = open('unigram.utf8', 'w', encoding='utf8')
    all_lines = corpora.readlines()
    all_lines = [line.replace(' ', '')[0:-1] for line in all_lines]  # drop spaces and the trailing newline
    for line in all_lines:
        unigram_input.write(generate_ngram(line, 1))
        if line != all_lines[-1]:
            unigram_input.write('\n')
    corpora.close()
    unigram_input.close()
    print("Unigram Generated!")


def convert_to_bies(string):
    """
    Encodes a segmented sentence as character labels in BIES format.

    :param string: The segmented sentence to be encoded
    :return: Encoded labels in BIES format
    :usage: ("共同 创造 美好 的 新 世纪 ——") -> "BEBEBESSBEBE"
    """
    features = []
    for word in string.split():
        feature = ""
        len_word = len(word)
        if len_word == 1:
            feature += "S"
        else:
            feature += "B"
            for i_ in range(len_word - 2):
                feature += "I"
            feature += "E"
        features.append(feature)
    results = ''.join(str(e) for e in features)
    return results


def preprocess(original_file, input_file, label_file):
    """
    Reads the original training file and writes the no-space input file
    and the label file in BIES format.

    :param original_file: path to the original, space-segmented corpus
    :param input_file: path where the no-space sequences are written
    :param label_file: path where the BIES labels are written
    :return: None
    """
    with open(original_file, 'r', encoding='utf8') as f:
        original_lines = f.readlines()
    original_lines = list(filter(lambda x: x.strip(), original_lines))
    # remove whitespace, except where the next character is an ASCII letter or digit
    lines = [re.sub(r'\s(?=[^A-Za-z0-9])', '', line) for line in original_lines]
    # then strip any remaining plain and ideographic spaces
    lines = [line.replace(" ", "") for line in lines]
    lines = [line.replace("\u3000", "") for line in lines]

    # write the no-space lines to the input file
    with open(input_file, 'w') as f:
        f.writelines(lines)

    # write the BIES labels to the label file
    label_lines = [convert_to_bies(label) for label in original_lines]
    with open(label_file, 'w') as f:
        for item in label_lines:
            f.write("%s" % item)
            if item != label_lines[-1]:
                f.write("\n")
    print("Input and Label files generated!")


if __name__ == '__main__':
    args = parse_args()
    preprocess(args.original_file_path, args.input_file_path, args.label_file_path)
    get_ngrams(args.input_file_path)
--------------------------------------------------------------------------------
/code/score.py:
--------------------------------------------------------------------------------
from argparse import ArgumentParser

ALL_TAGS = {"B", "I", "E", "S"}


def parse_args():
    parser = ArgumentParser()
    parser.add_argument("prediction_file", help="The path to the prediction file (in BIES format)")
    parser.add_argument("gold_file", help="The path to the gold file (in BIES format)")
    return parser.parse_args()


def is_valid_prediction(prediction_iter, gold_iter):
    assert len(prediction_iter) == len(gold_iter), "Prediction and gold have different lengths"

    prediction_tags = set()
    gold_tags = set()
    nr_line = 1
    for preds, gold in zip(prediction_iter, gold_iter):
        assert len(preds) == len(gold), "Line " + str(nr_line) + ": lengths mismatch"
        prediction_tags.update(preds)
        gold_tags.update(gold)
        nr_line += 1

    prediction_tags = {t.upper() for t in prediction_tags}
    gold_tags = {t.upper() for t in gold_tags}

    assert len(gold_tags.difference(ALL_TAGS)) == 0, "Unknown tag detected in gold"
    assert len(prediction_tags.difference(ALL_TAGS)) == 0, "Unknown tag detected in predictions"


def score(prediction_iter, gold_iter, verbose=False):
    """
    Returns the precision of the model's predictions w.r.t. the gold standard (i.e. the tags of the
    correct word segmentation).

    :param prediction_iter: List of strings in the BIES format representing the model's predictions.
    :param gold_iter: List of strings in the BIES format representing the gold standard.

    :return: precision [0.0, 1.0]

    Ex. predictions_iter = ["BEBESBIIE",
                            "BIIIEBEBESS"]
        gold_iter = ["BEBIEBIES",
                     "BIIESBEBESS"]
        output: 0.7

    The same result can be obtained by passing lists of lists.
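
# --- Hedged illustration (not part of the original file): how the space-removal
# regex in preprocess() behaves on its own. Whitespace is removed only when the
# next character is NOT an ASCII letter or digit; the replace() calls that
# follow it then strip any remaining plain and ideographic spaces.
#
#   re.sub(r'\s(?=[^A-Za-z0-9])', '', '共同 创造 美好')  ->  '共同创造美好'
#   re.sub(r'\s(?=[^A-Za-z0-9])', '', 'NLP 2018 共同')   ->  'NLP 2018共同'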
    Ex. predictions_iter = [["B", "E", "B", "E", "S", "B", "I", "I", "E"],
                            ["B", "I", "I", "I", "E", "B", "E", "B", "E", "S", "S"]]
        gold_iter = [["B", "E", "B", "I", "E", "B", "I", "E", "S"],
                     ["B", "I", "I", "E", "S", "B", "E", "B", "E", "S", "S"]]
        output: 0.7
    """
    is_valid_prediction(prediction_iter, gold_iter)

    right_predictions = 0
    wrong_predictions = 0

    for prediction_sentence, gold_sentence in zip(prediction_iter, gold_iter):
        for prediction_tag, gold_tag in zip(prediction_sentence, gold_sentence):
            if prediction_tag == gold_tag:
                right_predictions += 1
            else:
                wrong_predictions += 1

    precision = right_predictions / (right_predictions + wrong_predictions)
    if verbose:
        print("Precision:\t", precision)

    return precision


def label_text_to_iter(file_path):
    iter_ = []
    with open(file_path) as f:
        for line in f:
            line = line.strip().upper()
            iter_.append(line)
    return iter_


if __name__ == '__main__':
    args = parse_args()
    score(label_text_to_iter(args.prediction_file), label_text_to_iter(args.gold_file), verbose=True)
--------------------------------------------------------------------------------
/code/train.py:
--------------------------------------------------------------------------------
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K

unigram_path = '../resources/as_cityu_msr_pku_unigram.utf8'
X_train_path = '../resources/as_cityu_msr_pku_input.utf8'
Y_train_path = '../resources/as_cityu_msr_pku_label.utf8'

unigram_vocab = dict()
unigram_word_to_id = dict()
X_train_uni = []
Y_train = []


def vocabulary(unigram_path=unigram_path):
    """
    Builds the character vocabulary of the dataset.
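
# --- Hedged usage sketch (not part of the original file): besides the CLI
# above, score() can be called directly with in-memory tag sequences. On the
# example from its docstring, 14 of the 20 tags match, hence 0.7:
#
#   score(["BEBESBIIE", "BIIIEBEBESS"],
#         ["BEBIEBIES", "BIIESBEBESS"], verbose=True)   # -> 0.7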

    :param unigram_path: The path to the file that contains the unigrams
    :return: None
    """
    with open(unigram_path, 'r', encoding='utf8') as f:
        original_lines = f.readlines()
    for line in original_lines:
        words = line.split()
        for word in words:
            if word not in unigram_vocab:
                unigram_vocab[word] = 1
            else:
                unigram_vocab[word] += 1


def word2index():
    """
    Maps each character to its index in the vocabulary.

    :return: None
    """
    vocabulary()
    unigram_word_to_id["<PAD>"] = 0  # index zero is reserved for padding
    unigram_word_to_id["<UNK>"] = 1  # OOV characters are mapped to <UNK>
    # enumerate the vocabulary so every character gets a unique index
    # (offset by 2 for the <PAD> and <UNK> entries)
    unigram_word_to_id.update({k: i + 2 for i, k in enumerate(unigram_vocab)})


def tokenize_dataset(X_train_path=X_train_path):
    """
    Converts each character to its index in the vocabulary.

    :param X_train_path: path to the training set with no spaces
    :return: encoded X training set
    """
    word2index()
    with open(X_train_path, 'r', encoding='utf8') as f:
        original_lines = f.readlines()
    original_lines = [line.replace("\u3000", "") for line in original_lines]
    for line in original_lines:
        words = line.split()
        for word in words:
            char = []
            for c in word:
                try:
                    char.append(unigram_word_to_id[c])
                except KeyError:
                    char.append(unigram_word_to_id["<UNK>"])
            X_train_uni.append(char)
    return X_train_uni


def convert_labels_to_integer(string):
    """
    Converts the labels from BIES format to integers.

    :param string: labels in BIES format to be converted to integers
    :return: Array of integer labels
    """
    tags = []
    for words in string.split():
        for word in words:
            if word == 'S':
                tags.append(3)  # 'S', a single character
            if word == 'B':
                tags.append(0)  # 'B', beginning of a word
            if word == 'I':
                tags.append(1)  # 'I', middle of a word
            if word == 'E':
                tags.append(2)  # 'E', end of a word
    return tags


def encode_y(Y_train_path=Y_train_path):
    """
    Encodes the labels.

    :param Y_train_path: Path to labels in BIES format
    :return: Array of one-hot encoded training labels
    """
    # Training labels
    with open(Y_train_path, 'r', encoding='utf8') as f:
        label_original_lines = f.readlines()
    Y_tra = [convert_labels_to_integer(label) for label in label_original_lines]

    # One-hot encoding of training labels
    for y in Y_tra:
        Y_train.append(to_categorical(y, num_classes=4))
    return Y_train


def pad_data(X_train_uni, Y_train):
    """
    Pads (or truncates) the training sequences to a common length.

    :param X_train_uni: encoded training sequences
    :param Y_train: one-hot encoded training labels
    :return: padded training inputs and labels
    """
    # use the average sequence length (+1) as the fixed length
    avg_len = sum(len(line) for line in X_train_uni) / len(X_train_uni)
    MAX_LEN = round(avg_len) + 1

    train_x_uni_padded = pad_sequences(X_train_uni, padding='post', maxlen=MAX_LEN)
    train_y_padded = pad_sequences(Y_train, padding='post', maxlen=MAX_LEN)

    return train_x_uni_padded, train_y_padded


def precision(y_true, y_pred):
    """Precision metric.

    Only computes a batch-wise average of precision.
    Computes the precision, a metric for multi-label classification of
    how many selected items are relevant.
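
# --- Hedged illustration (not part of the original file): what the post-padding
# in pad_data() above does. Keras pad_sequences appends the PAD index 0 to short
# sequences (which the Embedding layer below masks via mask_zero=True) and, by
# its default truncating='pre', drops the leading elements of long ones. The
# toy numbers are made up for demonstration.
#
#   pad_sequences([[5, 2], [7, 1, 3, 9]], padding='post', maxlen=3)
#       ->  [[5, 2, 0],
#            [1, 3, 9]]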
141 | """ 142 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 143 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 144 | precision = true_positives / (predicted_positives + K.epsilon()) 145 | 146 | return precision 147 | 148 | def bilstm_model(): 149 | """ 150 | Bilstm model 151 | :return: model 152 | """ 153 | LEN = 2000000 154 | visible = Input(shape=(None,)) 155 | em = Embedding(LEN,64,input_length=None,mask_zero=True)(visible) 156 | hidden = Bidirectional(LSTM(256,return_sequences=True,dropout=0.6,recurrent_dropout=0.4),merge_mode='sum')(em) 157 | output = TimeDistributed(Dense(4,activation='softmax'))(hidden) 158 | model = Model(inputs=visible, outputs=output) 159 | model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.035, momentum=0.95), metrics=['accuracy',precision]) 160 | return model 161 | 162 | 163 | if __name__ == '__main__': 164 | word2index() 165 | X_train_uni = tokenize_dataset() 166 | Y_train = encode_y() 167 | train_x_uni_padded,train_y_padded = pad_data(X_train_uni,Y_train) 168 | model = bilstm_model() 169 | filepath = "weights.{epoch:02d}-{val_loss:.2f}.hdf5" 170 | mc = ModelCheckpoint(filepath, monitor='val_precision', verbose=1, save_best_only=True, mode='max') 171 | print("Training") 172 | history = model.fit(train_x_uni_padded,train_y_padded,batch_size=256, epochs=20, verbose=1,shuffle=True,validation_split=0.2,callbacks=[mc]) 173 | 174 | # Plot training & validation precision values 175 | plt.plot(history.history['precision']) 176 | plt.plot(history.history['val_precision']) 177 | plt.title('Model Precision') 178 | plt.ylabel('Precision') 179 | plt.xlabel('Epoch') 180 | plt.legend(['Train', 'Test'], loc='upper left') 181 | plt.show() 182 | 183 | # Plot training & validation accuracy values 184 | plt.plot(history.history['acc']) 185 | plt.plot(history.history['val_acc']) 186 | plt.title('Model accuracy') 187 | plt.ylabel('Accuracy') 188 | plt.xlabel('Epoch') 189 | plt.legend(['Train', 'Test'], loc='upper left') 190 | plt.show() 191 | 192 | # Plot training & validation loss values 193 | plt.plot(history.history['loss']) 194 | plt.plot(history.history['val_loss']) 195 | plt.title('Model loss') 196 | plt.ylabel('Loss') 197 | plt.xlabel('Epoch') 198 | plt.legend(['Train', 'Test'], loc='upper left') 199 | plt.show() -------------------------------------------------------------------------------- /resources/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokeam/Chinese-Word-Segmentation-in-NLP/93f0bae947152e3885adba966cdd67f3ae5ffc1e/resources/.gitkeep --------------------------------------------------------------------------------