├── Attention.py ├── README.md ├── __init__.py ├── ceshi.py ├── conlleval ├── conlleval.py ├── data1 ├── 1.test ├── 1.train ├── laptop_test_POS_DEP_BIO_data.csv └── laptop_train_POS_DEP_BIO_data.csv ├── data_utils.py ├── lexcion ├── laptop14_dict.csv ├── restaurant14_dict.csv ├── restaurant15_dict.csv └── restaurant16_dict.csv ├── loader.py ├── main.py ├── model.py ├── rnncell.py └── utils.py /Attention.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.contrib.layers.python.layers import initializers 3 | from tensorflow.python.ops import rnn_cell 4 | from tensorflow.python.ops import rnn 5 | 6 | INF = 1e30 7 | 8 | def attention_layer(att_inputs, num_step, max_len, attention_size, gru_hidden, lengths, name=None): 9 | #att_inputs, att的输入, [batch_size, max_len, dim], 在建图是batch_size和max_len都是None 10 | #attention_size, att之后,输出的维度 11 | #max_len是通过placeholder传入的句子最长长度,应为每个batch的句子最长长度不同,所以这个max_len以feed——dict传入 12 | print("attinput-----", att_inputs) 13 | print("maxlen-----", max_len) 14 | # hidden_size = att_inputs.shape[-1] 15 | # max_len1 = att_inputs.shape[1] 16 | # print(max_len1) 17 | 18 | #att_inputs_tranpose = tf.transpose(att_inputs, [1, 0, 2]) 19 | #Ct = [] 20 | with tf.variable_scope("Attention_layer1" if not name else name): 21 | hidden_size = att_inputs.shape[-1] 22 | att_inputs_tranpose = tf.transpose(att_inputs, [1, 0, 2]) 23 | Ct = [] 24 | # St = [] 25 | with tf.variable_scope("Attention_compute1"): 26 | w1 = tf.get_variable("w1", shape=[hidden_size, attention_size], 27 | dtype=tf.float32, initializer=initializers.xavier_initializer()) 28 | w2 = tf.get_variable("w2", shape=[hidden_size, attention_size], 29 | dtype=tf.float32, initializer=initializers.xavier_initializer()) 30 | b1 = tf.get_variable("b1", shape=[attention_size], 31 | dtype=tf.float32, initializer=tf.zeros_initializer()) 32 | b2 = tf.get_variable("b2", shape=[attention_size], 33 | dtype=tf.float32, initializer=tf.zeros_initializer()) 34 | u = tf.get_variable("u", shape=[attention_size, 1], 35 | dtype=tf.float32, initializer=initializers.xavier_initializer()) 36 | 37 | input_w1 = tf.reshape(tf.tensordot(att_inputs, w1, axes=1) + b1, [-1, num_step, attention_size]) 38 | for t in range(max_len): 39 | slice_w2 = tf.matmul(att_inputs_tranpose[t], w2) + b2 40 | add_input_jt = tf.tanh(tf.expand_dims(slice_w2, 1) + input_w1) 41 | score_a_step = tf.reshape(tf.tensordot(add_input_jt, u, axes=1), [-1, 1, num_step]) 42 | normalized_score = tf.nn.softmax(score_a_step) 43 | # St.append(normalized_score) 44 | # Ct.append(tf.matmul(normalized_score, att_inputs)) 45 | normalize_s_a_step = tf.reshape(normalized_score, [-1, num_step, 1]) 46 | Ct.append(tf.reduce_sum(att_inputs * normalize_s_a_step, 1)) 47 | C = tf.transpose(Ct, [1, 0, 2]) 48 | # S = tf.transpose(St, [0, 1, 2, 3]) 49 | # C = tf.concat(Ct, axis=1) 50 | print(C) 51 | with tf.variable_scope("gate"): 52 | concat_C_att_input = tf.concat([att_inputs, C], axis=-1) 53 | g_dim = concat_C_att_input.get_shape().as_list()[-1] 54 | w = tf.get_variable("w", shape=[g_dim, g_dim], 55 | dtype=tf.float32, initializer=initializers.xavier_initializer()) 56 | gate = tf.nn.sigmoid(tf.reshape(tf.tensordot(concat_C_att_input, w, axes=1), [-1, num_step, g_dim])) 57 | gated = gate * concat_C_att_input 58 | # gru_cell = rnn_cell.GRUCell(gru_hidden) 59 | # gru_cell = rnn_cell.LSTMCell(gru_hidden) 60 | gru_cell = {} 61 | for direction in ["forward", "backword"]: 62 | with tf.variable_scope(direction): 63 | gru_cell[direction] = rnn_cell.GRUCell(gru_hidden) 64 | outputs, state = rnn.bidirectional_dynamic_rnn(gru_cell["forward"], gru_cell["backword"], gated, dtype=tf.float32, sequence_length=lengths) 65 | outputs = tf.concat(outputs, axis=2) 66 | # outputs, state = rnn.dynamic_rnn(gru_cell, gated, dtype=tf.float32, sequence_length=lengths) 67 | print("attouts---", outputs) 68 | return outputs, state 69 | ''' 70 | def update_attention_outputs_layer1(att_outputs, hidden_units, lengths, name=None): 71 | with tf.variable_scope("update_attention_outputs1" if not name else name): 72 | with tf.variable_scope("gate"): 73 | gru_cell = rnn_cell.GRUCell(hidden_units) 74 | outputs, state = rnn.dynamic_rnn(gru_cell, att_outputs, dtype=tf.float32, sequence_length=lengths) 75 | return outputs 76 | ''' 77 | 78 | def softmax_mask(val, mask): 79 | return -INF * (1 - tf.cast(mask, tf.float32)) + val 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DA-CRF 2 | * Framework: Bi-lstm-Attention-CRF 3 | Gated dynamic Attention 4 | # installation 5 | * python >= 3.5.2 6 | tensorflow >= 1.2.0 7 | numpy >=1.11.1 8 | 9 | # Training, developmenting and testing simultaneously 10 | * python main.py 11 | 12 | # Other 13 | * If you want some other parameter settings, you can change them on main.py 14 | This project also implements a static attention, see model.py, if you wanna use it, please manually modify on model.py 15 | You can change the implemented CoupledInputForgetGateLSTMCell to tensorflow's LSTM_Cell manually in model.py 16 | This implementation also suports Chinese, you just feed data like the data on directery "data1" 17 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DCdream/DA-CRF/96f0f64e24a03afa75ec4799e6463d77e716d4bf/__init__.py -------------------------------------------------------------------------------- /ceshi.py: -------------------------------------------------------------------------------- 1 | ''' 2 | import codecs 3 | from data_utils import create_dico, create_mapping, zero_digits 4 | from loader import char_mapping, tag_mapping 5 | from loader import load_sentences, update_tag_scheme 6 | from loader import augment_with_pretrained, prepare_dataset 7 | import itertools 8 | train_sentences = load_sentences(r"G:\pyworkspace\AS_select_features\ChineseNER-master1\ChineseNER-master\data1\1.train", True, False) 9 | test_sentences = load_sentences(r"G:\pyworkspace\AS_select_features\ChineseNER-master1\ChineseNER-master\data1\1.test", True, False) 10 | dico_chars_train = char_mapping(train_sentences, True)[0] 11 | #训练数据统计的词典 12 | print(dico_chars_train) 13 | dico_chars, char_to_id, id_to_char = augment_with_pretrained( 14 | dico_chars_train.copy(), 15 | r"G:\pyworkspace\AS_select_features\ChineseNER-master1\ChineseNER-master\glove.6B.100d.txt", 16 | list(itertools.chain.from_iterable( 17 | [[w[0] for w in s] for s in test_sentences]) 18 | ) 19 | ) 20 | n_words = len(id_to_char) 21 | for i in range(n_words): 22 | print(id_to_char[i]) 23 | ''' 24 | ''' 25 | import random 26 | from data_utils import split_train_dev 27 | l = [[['s', 'pos', 3], ['a', 'tn', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 28 | [['v', 'pos', 3], ['a', 'yn', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 29 | [['t', 'pos', 3], ['a', 'un', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 30 | [['s', 'pos', 3], ['a', 'tn', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 31 | [['v', 'pos', 3], ['a', 'yn', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 32 | [['t', 'pos', 3], ['a', 'un', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 33 | [['s', 'pos', 3], ['a', 'tn', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 34 | [['v', 'pos', 3], ['a', 'yn', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 35 | [['t', 'pos', 3], ['a', 'un', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 36 | [['s', 'pos', 3], ['a', 'tn', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 37 | [['v', 'pos', 3], ['a', 'yn', 8], ['k', 'nl', 6], ['c', 'nt', 10]], 38 | [['t', 'pos', 3], ['a', 'un', 8], ['k', 'nl', 6], ['c', 'nt', 10]], [['t', 'pos', 3], ['a', 'un', 8], ['k', 'nl', 6], ['c', 'nt', 10]]] 39 | random.shuffle(l) 40 | print(l) 41 | print(int(11/10*8+1)) 42 | train, dev = split_train_dev(l) 43 | print(len(train)) 44 | print(len(dev)) 45 | import tensorflow as tf 46 | a = tf.constant([[[], []], [[], []], [[], []]]) 47 | import numpy as np 48 | a = np.array([[["1", "2", "3", "9"], [1, 2, 3]], [["4", "5", "6"], [4, 5, 6]], [["1"], [1]]]) 49 | sorted_data = sorted(a, key=lambda x: len(x[0])) 50 | print(sorted_data) 51 | 52 | import tensorflow as tf 53 | import numpy as np 54 | 55 | c = np.random.random([10, 1]) 56 | b = tf.nn.embedding_lookup(c, [[1, 3], [2, 4]]) 57 | 58 | with tf.Session() as sess: 59 | sess.run(tf.initialize_all_variables()) 60 | print(sess.run(b)) 61 | print(c) 62 | ''' 63 | ''' 64 | import tensorflow as tf 65 | print(":------------") 66 | a = tf.constant([[[1., 1.], [2., 1.], [2., 3.]], [[1., 3.], [3., 4.], [1., 2.]], [[3., 4.],[2., 2.], [3., 1.]]]) 67 | b = tf.constant([[[1., 1.], [2., 1.], [2., 3.]], [[1., 3.], [3., 4.], [1., 2.]], [[3., 4.],[2., 2.], [3., 1.]]]) 68 | s = tf.constant([[1., 2., 3.], [2., 1., 1.]]) 69 | c = tf.tensordot(a, s, axes=1) 70 | u = tf.constant([2., 3.]) 71 | aten = tf.tensordot(a, u, axes=1) 72 | a_b = tf.concat([a, b], axis=-1) 73 | 74 | c_1 = tf.expand_dims(c, -1) 75 | 76 | char_inputs = tf.constant([[1, 2], [3, 4], [5, 6], [7, 8]]) 77 | enbedding = tf.nn.embedding_lookup(char_inputs, [[1, 2, 3], [2, 0, 1], [0, 0, 1]]) 78 | used = tf.sign(tf.abs(char_inputs)) 79 | length = tf.reduce_sum(used, reduction_indices=1) 80 | lengths = tf.cast(length, tf.int32) 81 | with tf.Session() as sess: 82 | sess.run(tf.initialize_all_variables()) 83 | print(sess.run(c)) 84 | print(sess.run(aten)) 85 | print("a_b---", sess.run(a_b)) 86 | 87 | print(sess.run(tf.reshape(a, [-1, 2]))) 88 | print(sess.run(tf.reshape(tf.nn.xw_plus_b(tf.reshape(a, [-1, 2]), s, [0., 0.]), [-1, 3, 2]))) 89 | print(sess.run(tf.nn.softmax(c))) 90 | print(sess.run(c_1)) 91 | print(sess.run(a*c_1)) 92 | print(a.shape[0]) 93 | print(used) 94 | print(sess.run(length)) 95 | print(sess.run(lengths)) 96 | print(sess.run(enbedding)) 97 | print(sess.run(tf.concat(enbedding, axis=-1))) 98 | ''' 99 | 100 | import tensorflow as tf 101 | from tensorflow.python.ops import rnn_cell 102 | a = tf.constant([[[1., 1.], [2., 1.], [2., 3.]], [[1., 3.], [3., 4.], [1., 2.]]]) 103 | w1 = tf.constant([[1., 1.], [2., 2.]]) 104 | w2 = tf.constant([[2., 2.], [2., 2.]]) 105 | v = tf.constant([[1.], [2.]]) 106 | a_trans = tf.transpose(a, [1, 0, 2]) 107 | num_step = a.shape[1] 108 | ai = [] 109 | ci = [] 110 | for i in range(num_step): 111 | b = tf.matmul(a_trans[i], w1) 112 | score = [] 113 | for j in range(num_step): 114 | c = tf.matmul(a_trans[j], w2) 115 | d = b + c 116 | s = tf.matmul(d, v) 117 | score.append(s) 118 | score = tf.transpose(score, [1, 0, 2]) 119 | ci_1 = a * score 120 | ci.append(tf.reduce_sum(ci_1, 1)) 121 | ai.append(score) 122 | ci = tf.transpose(ci, [1, 0, 2]) 123 | ai = tf.transpose(ai, [0, 1, 2, 3]) 124 | # ai.append(s) 125 | # ai = tf.transpose(ai, [1, 0, 2]) 126 | 127 | 128 | 129 | with tf.Session() as sess: 130 | sess.run(tf.global_variables_initializer()) 131 | print("---a:\n", sess.run(a)) 132 | print("a_trains:\n", sess.run(a_trans)) 133 | # print(sess.run(a_trans[0])) 134 | # print(sess.run(w2 + w1)) 135 | print(sess.run(ai)) 136 | print("---ci_1\n", sess.run(ci_1)) 137 | 138 | print(sess.run(ci)) -------------------------------------------------------------------------------- /conlleval: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = $features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 265 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 266 | 267 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 269 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 270 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 271 | 272 | if ($prevTag ne "O" and $prevTag ne "." and $prevType ne $type) { 273 | $chunkEnd = $true; 274 | } 275 | 276 | # corrected 1998-12-22: these chunks are assumed to have length 1 277 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 279 | 280 | return($chunkEnd); 281 | } 282 | 283 | # startOfChunk: checks if a chunk started between the previous and current word 284 | # arguments: previous and current chunk tags, previous and current types 285 | # note: this code is capable of handling other chunk representations 286 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 287 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 288 | 289 | sub startOfChunk { 290 | my $prevTag = shift(@_); 291 | my $tag = shift(@_); 292 | my $prevType = shift(@_); 293 | my $type = shift(@_); 294 | my $chunkStart = $false; 295 | 296 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 297 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 298 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 299 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 300 | 301 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 302 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 303 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 304 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 305 | 306 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 307 | $chunkStart = $true; 308 | } 309 | 310 | # corrected 1998-12-22: these chunks are assumed to have length 1 311 | if ( $tag eq "[" ) { $chunkStart = $true; } 312 | if ( $tag eq "]" ) { $chunkStart = $true; } 313 | 314 | return($chunkStart); 315 | } 316 | -------------------------------------------------------------------------------- /conlleval.py: -------------------------------------------------------------------------------- 1 | # Python version of the evaluation script from CoNLL'00- 2 | # Originates from: https://github.com/spyysalo/conlleval.py 3 | 4 | 5 | # Intentional differences: 6 | # - accept any space as delimiter by default 7 | # - optional file argument (default STDIN) 8 | # - option to set boundary (-b argument) 9 | # - LaTeX output (-l argument) not supported 10 | # - raw tags (-r argument) not supported 11 | 12 | import sys 13 | import re 14 | import codecs 15 | from collections import defaultdict, namedtuple 16 | 17 | ANY_SPACE = '' 18 | 19 | 20 | class FormatError(Exception): 21 | pass 22 | 23 | Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore') 24 | 25 | 26 | class EvalCounts(object): 27 | def __init__(self): 28 | self.correct_chunk = 0 # number of correctly identified chunks 29 | self.correct_tags = 0 # number of correct chunk tags 30 | self.found_correct = 0 # number of chunks in corpus 31 | self.found_guessed = 0 # number of identified chunks 32 | self.token_counter = 0 # token counter (ignores sentence breaks) 33 | 34 | # counts by type 35 | self.t_correct_chunk = defaultdict(int) 36 | self.t_found_correct = defaultdict(int) 37 | self.t_found_guessed = defaultdict(int) 38 | 39 | 40 | def parse_args(argv): 41 | import argparse 42 | parser = argparse.ArgumentParser( 43 | description='evaluate tagging results using CoNLL criteria', 44 | formatter_class=argparse.ArgumentDefaultsHelpFormatter 45 | ) 46 | arg = parser.add_argument 47 | arg('-b', '--boundary', metavar='STR', default='-X-', 48 | help='sentence boundary') 49 | arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE, 50 | help='character delimiting items in input') 51 | arg('-o', '--otag', metavar='CHAR', default='O', 52 | help='alternative outside tag') 53 | arg('file', nargs='?', default=None) 54 | return parser.parse_args(argv) 55 | 56 | 57 | def parse_tag(t): 58 | m = re.match(r'^([^-]*)-(.*)$', t) 59 | return m.groups() if m else (t, '') 60 | 61 | 62 | def evaluate(iterable, options=None): 63 | if options is None: 64 | options = parse_args([]) # use defaults 65 | 66 | counts = EvalCounts() 67 | num_features = None # number of features per line 68 | in_correct = False # currently processed chunks is correct until now 69 | last_correct = 'O' # previous chunk tag in corpus 70 | last_correct_type = '' # type of previously identified chunk tag 71 | last_guessed = 'O' # previously identified chunk tag 72 | last_guessed_type = '' # type of previous chunk tag in corpus 73 | 74 | for line in iterable: 75 | line = line.rstrip('\r\n') 76 | 77 | if options.delimiter == ANY_SPACE: 78 | features = line.split() 79 | else: 80 | features = line.split(options.delimiter) 81 | 82 | if num_features is None: 83 | num_features = len(features) 84 | elif num_features != len(features) and len(features) != 0: 85 | raise FormatError('unexpected number of features: %d (%d)' % 86 | (len(features), num_features)) 87 | 88 | if len(features) == 0 or features[0] == options.boundary: 89 | features = [options.boundary, 'O', 'O'] 90 | if len(features) < 3: 91 | raise FormatError('unexpected number of features in line %s' % line) 92 | 93 | guessed, guessed_type = parse_tag(features.pop()) 94 | correct, correct_type = parse_tag(features.pop()) 95 | first_item = features.pop(0) 96 | 97 | if first_item == options.boundary: 98 | guessed = 'O' 99 | 100 | end_correct = end_of_chunk(last_correct, correct, 101 | last_correct_type, correct_type) 102 | end_guessed = end_of_chunk(last_guessed, guessed, 103 | last_guessed_type, guessed_type) 104 | start_correct = start_of_chunk(last_correct, correct, 105 | last_correct_type, correct_type) 106 | start_guessed = start_of_chunk(last_guessed, guessed, 107 | last_guessed_type, guessed_type) 108 | 109 | if in_correct: 110 | if (end_correct and end_guessed and 111 | last_guessed_type == last_correct_type): 112 | in_correct = False 113 | counts.correct_chunk += 1 114 | counts.t_correct_chunk[last_correct_type] += 1 115 | elif (end_correct != end_guessed or guessed_type != correct_type): 116 | in_correct = False 117 | 118 | if start_correct and start_guessed and guessed_type == correct_type: 119 | in_correct = True 120 | 121 | if start_correct: 122 | counts.found_correct += 1 123 | counts.t_found_correct[correct_type] += 1 124 | if start_guessed: 125 | counts.found_guessed += 1 126 | counts.t_found_guessed[guessed_type] += 1 127 | if first_item != options.boundary: 128 | if correct == guessed and guessed_type == correct_type: 129 | counts.correct_tags += 1 130 | counts.token_counter += 1 131 | 132 | last_guessed = guessed 133 | last_correct = correct 134 | last_guessed_type = guessed_type 135 | last_correct_type = correct_type 136 | 137 | if in_correct: 138 | counts.correct_chunk += 1 139 | counts.t_correct_chunk[last_correct_type] += 1 140 | 141 | return counts 142 | 143 | 144 | def uniq(iterable): 145 | seen = set() 146 | return [i for i in iterable if not (i in seen or seen.add(i))] 147 | 148 | 149 | def calculate_metrics(correct, guessed, total): 150 | tp, fp, fn = correct, guessed-correct, total-correct 151 | p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) 152 | r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) 153 | f = 0 if p + r == 0 else 2 * p * r / (p + r) 154 | return Metrics(tp, fp, fn, p, r, f) 155 | 156 | 157 | def metrics(counts): 158 | c = counts 159 | overall = calculate_metrics( 160 | c.correct_chunk, c.found_guessed, c.found_correct 161 | ) 162 | by_type = {} 163 | for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)): 164 | by_type[t] = calculate_metrics( 165 | c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] 166 | ) 167 | return overall, by_type 168 | 169 | 170 | def report(counts, out=None): 171 | if out is None: 172 | out = sys.stdout 173 | 174 | overall, by_type = metrics(counts) 175 | 176 | c = counts 177 | out.write('processed %d tokens with %d phrases; ' % 178 | (c.token_counter, c.found_correct)) 179 | out.write('found: %d phrases; correct: %d.\n' % 180 | (c.found_guessed, c.correct_chunk)) 181 | 182 | if c.token_counter > 0: 183 | out.write('accuracy: %6.2f%%; ' % 184 | (100.*c.correct_tags/c.token_counter)) 185 | out.write('precision: %6.2f%%; ' % (100.*overall.prec)) 186 | out.write('recall: %6.2f%%; ' % (100.*overall.rec)) 187 | out.write('FB1: %6.2f\n' % (100.*overall.fscore)) 188 | 189 | for i, m in sorted(by_type.items()): 190 | out.write('%17s: ' % i) 191 | out.write('precision: %6.2f%%; ' % (100.*m.prec)) 192 | out.write('recall: %6.2f%%; ' % (100.*m.rec)) 193 | out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) 194 | 195 | 196 | def report_notprint(counts, out=None): 197 | if out is None: 198 | out = sys.stdout 199 | 200 | overall, by_type = metrics(counts) 201 | 202 | c = counts 203 | final_report = [] 204 | line = [] 205 | line.append('processed %d tokens with %d phrases; ' % 206 | (c.token_counter, c.found_correct)) 207 | line.append('found: %d phrases; correct: %d.\n' % 208 | (c.found_guessed, c.correct_chunk)) 209 | final_report.append("".join(line)) 210 | 211 | if c.token_counter > 0: 212 | line = [] 213 | line.append('accuracy: %6.2f%%; ' % 214 | (100.*c.correct_tags/c.token_counter)) 215 | line.append('precision: %6.2f%%; ' % (100.*overall.prec)) 216 | line.append('recall: %6.2f%%; ' % (100.*overall.rec)) 217 | line.append('FB1: %6.2f\n' % (100.*overall.fscore)) 218 | final_report.append("".join(line)) 219 | 220 | for i, m in sorted(by_type.items()): 221 | line = [] 222 | line.append('%17s: ' % i) 223 | line.append('precision: %6.2f%%; ' % (100.*m.prec)) 224 | line.append('recall: %6.2f%%; ' % (100.*m.rec)) 225 | line.append('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) 226 | final_report.append("".join(line)) 227 | return final_report 228 | 229 | 230 | def end_of_chunk(prev_tag, tag, prev_type, type_): 231 | # check if a chunk ended between the previous and current word 232 | # arguments: previous and current chunk tags, previous and current types 233 | chunk_end = False 234 | 235 | if prev_tag == 'E': chunk_end = True 236 | if prev_tag == 'S': chunk_end = True 237 | 238 | if prev_tag == 'B' and tag == 'B': chunk_end = True 239 | if prev_tag == 'B' and tag == 'S': chunk_end = True 240 | if prev_tag == 'B' and tag == 'O': chunk_end = True 241 | if prev_tag == 'I' and tag == 'B': chunk_end = True 242 | if prev_tag == 'I' and tag == 'S': chunk_end = True 243 | if prev_tag == 'I' and tag == 'O': chunk_end = True 244 | 245 | if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: 246 | chunk_end = True 247 | 248 | # these chunks are assumed to have length 1 249 | if prev_tag == ']': chunk_end = True 250 | if prev_tag == '[': chunk_end = True 251 | 252 | return chunk_end 253 | 254 | 255 | def start_of_chunk(prev_tag, tag, prev_type, type_): 256 | # check if a chunk started between the previous and current word 257 | # arguments: previous and current chunk tags, previous and current types 258 | chunk_start = False 259 | 260 | if tag == 'B': chunk_start = True 261 | if tag == 'S': chunk_start = True 262 | 263 | if prev_tag == 'E' and tag == 'E': chunk_start = True 264 | if prev_tag == 'E' and tag == 'I': chunk_start = True 265 | if prev_tag == 'S' and tag == 'E': chunk_start = True 266 | if prev_tag == 'S' and tag == 'I': chunk_start = True 267 | if prev_tag == 'O' and tag == 'E': chunk_start = True 268 | if prev_tag == 'O' and tag == 'I': chunk_start = True 269 | 270 | if tag != 'O' and tag != '.' and prev_type != type_: 271 | chunk_start = True 272 | 273 | # these chunks are assumed to have length 1 274 | if tag == '[': chunk_start = True 275 | if tag == ']': chunk_start = True 276 | 277 | return chunk_start 278 | 279 | 280 | def return_report(input_file): 281 | with codecs.open(input_file, "r", "utf8") as f: 282 | counts = evaluate(f) 283 | return report_notprint(counts) 284 | 285 | 286 | def main(argv): 287 | args = parse_args(argv[1:]) 288 | 289 | if args.file is None: 290 | counts = evaluate(sys.stdin, args) 291 | else: 292 | with open(args.file) as f: 293 | counts = evaluate(f, args) 294 | report(counts) 295 | 296 | if __name__ == '__main__': 297 | sys.exit(main(sys.argv)) -------------------------------------------------------------------------------- /data1/1.test: -------------------------------------------------------------------------------- 1 | Boot NN compound time B-AS 2 | time NN nsubj super I-AS 3 | is VBZ cop super O 4 | super JJ ROOT ROOT O 5 | fast RB advmod super O 6 | , , punct super O 7 | around RB advmod anywhere O 8 | anywhere RB advmod super O 9 | from IN case seconds O 10 | 35 CD nummod seconds O 11 | seconds NNS nmod anywhere O 12 | to TO case minute O 13 | 1 CD nummod minute O 14 | minute NN nmod seconds O 15 | . . punct super O 16 | 17 | tech NN compound support B-AS 18 | support NN nsubj fix I-AS 19 | would MD aux fix O 20 | not RB neg fix O 21 | fix VB ROOT ROOT O 22 | the DT det problem O 23 | problem NN dobj fix O 24 | unless IN mark bought O 25 | I PRP nsubj bought O 26 | bought VBD advcl fix O 27 | your PRP$ nmod:poss plan O 28 | plan NN dobj bought O 29 | for IN case 150 O 30 | $ $ dep 150 O 31 | 150 CD nmod bought O 32 | plus CC advmod 150 O 33 | . . punct fix O 34 | 35 | but CC ROOT ROOT O 36 | in IN mark resume O 37 | resume VB root but O 38 | this DT det rocks O 39 | computer NN compound rocks O 40 | rocks NNS dobj resume O 41 | ! . punct resume O 42 | 43 | Set VB csubj easy B-AS 44 | up RP compound:prt Set I-AS 45 | was VBD cop easy O 46 | easy JJ ROOT ROOT O 47 | . . punct easy O 48 | 49 | Did VBD aux enjoy O 50 | not RB neg enjoy O 51 | enjoy VB ROOT ROOT O 52 | the DT det Windows O 53 | new JJ amod Windows O 54 | Windows NNP dobj enjoy B-AS 55 | 8 CD nummod Windows I-AS 56 | and CC cc Windows O 57 | touchscreen NN compound functions B-AS 58 | functions NNS conj Windows I-AS 59 | . . punct enjoy O 60 | 61 | I PRP nsubj expected O 62 | expected VBD ROOT ROOT O 63 | so RB advmod product O 64 | as IN mark product O 65 | it PRP nsubj product O 66 | 's VBZ cop product O 67 | an DT det product O 68 | Apple NNP compound product O 69 | product NN dobj expected O 70 | , , punct product O 71 | but CC cc product O 72 | I PRP nsubj glad O 73 | was VBD cop glad O 74 | glad JJ conj product O 75 | to TO mark see O 76 | see VB xcomp glad O 77 | my PRP$ nmod:poss expectations O 78 | expectations NNS nsubj exceeded O 79 | exceeded VBD ccomp see O 80 | , , punct product O 81 | this DT nsubj laptop O 82 | is VBZ cop laptop O 83 | THE DT det laptop O 84 | laptop NN acl:relcl product O 85 | to TO mark buy O 86 | buy VB acl laptop O 87 | right RB advmod now O 88 | now RB advmod buy O 89 | . . punct expected O 90 | 91 | Other JJ ccomp hard O 92 | than IN mark fan O 93 | not RB neg fan O 94 | being VBG cop fan O 95 | a DT det fan O 96 | fan NN dep Other O 97 | of IN case pads O 98 | click VB compound pads B-AS 99 | pads NNS nmod fan I-AS 100 | -LRB- -LRB- punct standard O 101 | industry NN compound standard O 102 | standard NN dep pads O 103 | these DT det days O 104 | days NNS nmod:tmod standard O 105 | -RRB- -RRB- punct standard O 106 | and CC cc fan O 107 | the DT det speakers O 108 | lousy JJ amod speakers O 109 | internal JJ amod speakers B-AS 110 | speakers NNS conj fan I-AS 111 | , , punct hard O 112 | it PRP nsubj hard O 113 | 's VBZ cop hard O 114 | hard JJ ROOT ROOT O 115 | for IN mark find O 116 | me PRP nsubj find O 117 | to TO mark find O 118 | find VB advcl hard O 119 | things NNS dobj find O 120 | about IN case notebook O 121 | this DT det notebook O 122 | notebook NN nmod things O 123 | I PRP nsubj like O 124 | do VBP aux like O 125 | n't RB neg like O 126 | like VB acl:relcl notebook O 127 | , , punct find O 128 | especially RB advmod considering O 129 | considering VBG advcl find O 130 | the DT det tag O 131 | $ $ amod tag O 132 | 350 CD compound $ O 133 | price NN compound tag B-AS 134 | tag NN dobj considering I-AS 135 | . . punct hard O 136 | 137 | excellent JJ ROOT ROOT O 138 | in IN case way O 139 | every DT det way O 140 | way NN nmod excellent O 141 | . . punct excellent O 142 | 143 | No DT neg disk O 144 | installation NN compound disk B-AS 145 | disk NN nsubjpass included I-AS 146 | -LRB- -LRB- punct DVD I-AS 147 | DVD NN appos disk I-AS 148 | -RRB- -RRB- punct DVD I-AS 149 | is VBZ auxpass included O 150 | included VBN ROOT ROOT O 151 | . . punct included O 152 | 153 | It PRP nsubj light O 154 | 's VBZ cop light O 155 | fast RB advmod light O 156 | , , punct light O 157 | light NN ROOT ROOT O 158 | , , punct light O 159 | and CC cc light O 160 | simple JJ conj light O 161 | to TO mark use O 162 | use VB xcomp simple B-AS 163 | . . punct light O 164 | 165 | Works NNP ROOT ROOT B-AS 166 | well RB advmod Works O 167 | , , punct Works O 168 | and CC cc Works O 169 | I PRP nsubj happy O 170 | am VBP cop happy O 171 | extremely RB advmod happy O 172 | happy JJ conj Works O 173 | to TO dep happy O 174 | be VB dep to O 175 | back RB advmod be O 176 | to TO case OS O 177 | an DT det OS O 178 | apple NN compound OS B-AS 179 | OS NN nmod back I-AS 180 | . . punct Works O 181 | 182 | This DT det mac O 183 | mac NN nsubj problem O 184 | has VBZ aux problem O 185 | been VBN cop problem O 186 | a DT det problem O 187 | problem NN ROOT ROOT O 188 | since IN mark got O 189 | we PRP nsubj got O 190 | got VBD advcl problem O 191 | it PRP dobj got O 192 | . . punct problem O 193 | 194 | Sure JJ ROOT ROOT O 195 | it PRP nsubj light O 196 | 's VBZ cop light O 197 | not RB neg light O 198 | light JJ dep Sure O 199 | and CC cc light O 200 | slim JJ conj light O 201 | but CC cc light O 202 | the DT det features O 203 | features NNS nsubj make B-AS 204 | make VBP conj light O 205 | up RP compound:prt make O 206 | for IN case it O 207 | it PRP nmod make O 208 | 100 CD nummod % O 209 | % NN dobj make O 210 | . . punct Sure O 211 | -------------------------------------------------------------------------------- /data1/1.train: -------------------------------------------------------------------------------- 1 | I PRP nsubj charge O 2 | charge VBP ROOT ROOT O 3 | it PRP dobj charge O 4 | at IN case night O 5 | night NN nmod charge O 6 | and CC cc charge O 7 | skip VB conj charge O 8 | taking VBG xcomp skip O 9 | the DT det cord O 10 | cord NN dobj taking B-AS 11 | with IN case me O 12 | me PRP nmod taking O 13 | because IN case life O 14 | of IN mwe because O 15 | the DT det life O 16 | good JJ amod life O 17 | battery NN compound life B-AS 18 | life NN nmod taking I-AS 19 | . . punct charge O 20 | 21 | I PRP nsubj bought O 22 | bought VBD ROOT ROOT O 23 | a DT det laptop O 24 | HP NNP compound laptop O 25 | Pavilion NNP compound laptop O 26 | DV4-1222nr NN compound laptop O 27 | laptop NN dobj bought O 28 | and CC cc bought O 29 | have VB aux had O 30 | had VBN conj bought O 31 | so RB advmod many O 32 | many JJ amod problems O 33 | problems NNS dobj had O 34 | with IN case computer O 35 | the DT det computer O 36 | computer NN nmod problems O 37 | . . punct bought O 38 | 39 | The DT det guy O 40 | tech NN compound guy B-AS 41 | guy NN nsubj said I-AS 42 | then RB advmod said O 43 | said VBD ROOT ROOT O 44 | the DT det center O 45 | service NN compound center B-AS 46 | center NN nsubj do I-AS 47 | does VBZ aux do O 48 | not RB neg do O 49 | do VB ccomp said O 50 | 1-to-1 JJ amod exchange O 51 | exchange NN dobj do O 52 | and CC cc do O 53 | I PRP nsubj have O 54 | have VBP conj do O 55 | to TO mark direct O 56 | direct VB xcomp have O 57 | my PRP$ nmod:poss concern O 58 | concern NN dobj direct O 59 | to TO case team O 60 | the DT det team O 61 | `` `` punct team B-AS 62 | sales NNS compound team I-AS 63 | '' '' punct team I-AS 64 | team NN nmod concern I-AS 65 | , , punct team O 66 | which WDT nsubj shop O 67 | is VBZ cop shop O 68 | the DT det shop O 69 | retail JJ amod shop O 70 | shop NN acl:relcl team O 71 | which WDT dobj bought O 72 | I PRP nsubj bought O 73 | bought VBD acl:relcl shop O 74 | my PRP$ nmod:poss netbook O 75 | netbook NN dobj bought O 76 | from IN nmod bought O 77 | . . punct said O 78 | 79 | I PRP nsubj investigated O 80 | investigated VBD ROOT ROOT O 81 | netbooks NNS dobj investigated O 82 | and CC cc investigated O 83 | saw VBD conj investigated O 84 | the DT det NB305-N410BL O 85 | Toshiba NNP compound NB305-N410BL O 86 | NB305-N410BL NN dobj saw O 87 | . . punct investigated O 88 | 89 | The DT det day O 90 | other JJ amod day O 91 | day NN nsubj had O 92 | I CD nummod day O 93 | had VBD ROOT ROOT O 94 | a DT det presentation O 95 | presentation NN dobj had O 96 | to TO mark do O 97 | do VB acl presentation O 98 | for IN case seminar O 99 | a DT det seminar O 100 | seminar NN nmod do O 101 | at IN case conference O 102 | a DT det conference O 103 | large JJ amod conference O 104 | conference NN nmod do O 105 | in IN case town O 106 | town NN nmod conference O 107 | - : punct presentation O 108 | lots NNS dep presentation O 109 | of IN case people O 110 | people NNS nmod lots O 111 | , , punct lots O 112 | little JJ amod time O 113 | time NN appos lots O 114 | to TO case prep O 115 | prep NN nmod time O 116 | and CC cc had O 117 | have VBP conj had O 118 | to TO mark set O 119 | set VB xcomp have O 120 | up RP compound:prt set O 121 | a DT det computer O 122 | computer NN dobj set O 123 | to TO case projector O 124 | a DT det projector O 125 | projector NN nmod set O 126 | , , punct projector O 127 | etc. FW appos projector O 128 | . . punct had O 129 | 130 | it PRP nsubj quality O 131 | is VBZ cop quality O 132 | of IN case quality O 133 | high JJ amod quality O 134 | quality NN ROOT ROOT B-AS 135 | , , punct quality O 136 | has VBZ conj quality O 137 | a DT det GUI O 138 | killer NN compound GUI O 139 | GUI NNP dobj has B-AS 140 | , , punct quality O 141 | is VBZ cop stable O 142 | extremely RB advmod stable O 143 | stable JJ conj quality O 144 | , , punct quality O 145 | is VBZ cop expandable O 146 | highly RB advmod expandable O 147 | expandable JJ conj quality O 148 | , , punct quality O 149 | is VBZ auxpass bundled O 150 | bundled VBN acl quality O 151 | with IN case lots O 152 | lots NNS nmod bundled O 153 | of IN case applications O 154 | very RB advmod good O 155 | good JJ amod applications O 156 | applications NNS nmod lots B-AS 157 | , , punct quality O 158 | is VBZ cop easy O 159 | easy JJ conj quality O 160 | to TO mark use O 161 | use VB xcomp easy B-AS 162 | , , punct quality O 163 | and CC cc quality O 164 | is VBZ cop gorgeous O 165 | absolutely RB advmod gorgeous O 166 | gorgeous JJ conj quality O 167 | . . punct quality O 168 | 169 | Easy NNP nsubj start O 170 | to TO mark start O 171 | start VB ROOT ROOT B-AS 172 | up RP compound:prt start I-AS 173 | and CC cc start O 174 | does VBZ aux overheat O 175 | not RB neg overheat O 176 | overheat VB conj start O 177 | as RB advmod much O 178 | much JJ dobj overheat O 179 | as IN case laptops O 180 | other JJ amod laptops O 181 | laptops NNS nmod much O 182 | . . punct start O 183 | 184 | Sad JJ ROOT ROOT O 185 | very RB advmod SAD O 186 | SAD JJ dep Sad O 187 | . . punct Sad O 188 | 189 | I PRP nsubj got O 190 | even RB advmod got O 191 | got VBD ROOT ROOT O 192 | my PRP$ nmod:poss son O 193 | teenage JJ amod son O 194 | son NN dobj got O 195 | one CD nummod son O 196 | , , punct got O 197 | because IN case features O 198 | of IN mwe because O 199 | the DT det features O 200 | features NNS nmod got B-AS 201 | that IN dobj offers O 202 | it PRP nsubj offers O 203 | offers VBZ acl:relcl features O 204 | , , punct features O 205 | like IN case iChat O 206 | , , punct iChat O 207 | iChat NNP nmod features B-AS 208 | , , punct iChat O 209 | Photobooth NNP conj iChat B-AS 210 | , , punct iChat O 211 | garage NN compound band B-AS 212 | band NN conj iChat I-AS 213 | and CC cc iChat O 214 | more JJR conj iChat O 215 | ! . punct got O 216 | 217 | Needless JJ ROOT ROOT O 218 | to TO mark say O 219 | say VB xcomp Needless O 220 | a DT det PC O 221 | PC NN nsubj less O 222 | that WDT nsubj support O 223 | ca MD aux support O 224 | n't RB neg support O 225 | support VB acl:relcl PC O 226 | a DT det phone O 227 | cell NN compound phone O 228 | phone NN dobj support O 229 | is VBZ cop less O 230 | less JJR ccomp say O 231 | than IN case useless O 232 | useless JJ advcl less O 233 | ! . punct Needless O 234 | 235 | Great JJ amod laptop O 236 | laptop NN ROOT ROOT O 237 | that WDT nsubj offers O 238 | offers VBZ acl:relcl laptop O 239 | many JJ amod features O 240 | great JJ amod features O 241 | features NNS dobj offers B-AS 242 | ! . punct laptop O 243 | 244 | they PRP nsubj done O 245 | have VBP aux done O 246 | done VBN ROOT ROOT O 247 | absolutely RB advmod nothing O 248 | nothing NN dobj done O 249 | to TO mark fix O 250 | fix VB acl nothing O 251 | the DT det problem O 252 | computer NN compound problem O 253 | problem NN dobj fix O 254 | . . punct done O 255 | 256 | One CD nummod night O 257 | night NN nmod:tmod turned O 258 | I PRP nsubj turned O 259 | turned VBD ROOT ROOT O 260 | the DT det thing O 261 | freaking VBG amod thing O 262 | thing NN dobj turned O 263 | off IN compound:prt turned O 264 | after IN mark using O 265 | using VBG advcl turned O 266 | it PRP dobj using O 267 | , , punct turned O 268 | the DT det day O 269 | next JJ amod day O 270 | day NN nsubj turn O 271 | I CD nummod day O 272 | turn VBP parataxis turned O 273 | it PRP dobj turn O 274 | on IN compound:prt turn O 275 | , , punct turn O 276 | no DT neg GUI O 277 | GUI NNP dobj turn B-AS 278 | , , punct GUI O 279 | screen NN appos GUI B-AS 280 | all DT advmod screen O 281 | dark JJ amod light O 282 | , , punct light O 283 | power NN compound light B-AS 284 | light NN dep all I-AS 285 | steady JJ amod light O 286 | , , punct light O 287 | hard JJ amod light B-AS 288 | drive NN compound light I-AS 289 | light NN dep light I-AS 290 | steady JJ amod light O 291 | and CC cc steady O 292 | not RB neg flashing O 293 | flashing VBG conj steady O 294 | as IN mark does O 295 | it PRP nsubj does O 296 | usually RB advmod does O 297 | does VBZ dep steady O 298 | . . punct turned O 299 | 300 | Still RB advmod pricey O 301 | pretty RB advmod pricey O 302 | pricey JJ ROOT ROOT O 303 | , , punct pricey O 304 | but CC cc pricey O 305 | I PRP nsubj putting O 306 | 've VBP aux putting O 307 | been VBN aux putting O 308 | putting VBG conj pricey O 309 | off RP compound:prt putting O 310 | money NN dobj putting O 311 | for IN case while O 312 | a DT det while O 313 | while NN nmod putting O 314 | as IN case Fund O 315 | a DT det Fund O 316 | little JJ amod Fund O 317 | Macbook NNP compound Fund O 318 | Fund NNP nmod while O 319 | , , punct pricey O 320 | and CC cc pricey O 321 | finally RB advmod got O 322 | got VBD conj pricey O 323 | to TO mark use O 324 | use VB xcomp got O 325 | it PRP dobj use O 326 | . . punct pricey O 327 | 328 | I PRP nsubj took O 329 | took VBD ROOT ROOT O 330 | it PRP dobj took O 331 | back RP compound:prt took O 332 | for IN case thing O 333 | an DT det thing O 334 | Asus NNP amod thing O 335 | and CC cc Asus O 336 | same JJ conj Asus O 337 | thing NN nmod took O 338 | - : punct thing O 339 | blue JJ amod screen O 340 | screen NN dep thing O 341 | which WDT nsubj required O 342 | required VBD acl:relcl screen O 343 | me PRP dobj required O 344 | to TO mark remove O 345 | remove VB xcomp required O 346 | the DT det battery O 347 | battery NN dobj remove B-AS 348 | to TO mark reset O 349 | reset VB advcl remove O 350 | . . punct took O 351 | 352 | In IN case shop O 353 | the DT det shop O 354 | shop NN nmod encased O 355 | , , punct encased O 356 | these DT det MacBooks O 357 | MacBooks NNS nsubjpass encased O 358 | are VBP auxpass encased O 359 | encased VBN ROOT ROOT O 360 | in IN case enclosure O 361 | a DT det enclosure O 362 | soft JJ amod enclosure O 363 | rubber NN compound enclosure B-AS 364 | enclosure NN nmod encased I-AS 365 | - : punct encased O 366 | so IN mark know O 367 | you PRP nsubj know O 368 | will MD aux know O 369 | never RB neg know O 370 | know VB advcl encased O 371 | about IN case edge O 372 | the DT det edge O 373 | razor NN compound edge O 374 | edge NN nmod know B-AS 375 | until IN mark buy O 376 | you PRP nsubj buy O 377 | buy VBP advcl know O 378 | it PRP dobj buy O 379 | , , punct know O 380 | get VB dep know O 381 | it PRP dobj get O 382 | home RB advmod get O 383 | , , punct know O 384 | break VB conj know O 385 | the DT det seal O 386 | seal NN dobj break O 387 | and CC cc know O 388 | use VB conj know O 389 | it PRP dobj use O 390 | -LRB- -LRB- punct con O 391 | very RB advmod clever O 392 | clever JJ amod con O 393 | con NN dep use O 394 | -RRB- -RRB- punct con O 395 | . . punct encased O 396 | 397 | However RB advmod make O 398 | , , punct make O 399 | the DT det gestures O 400 | multi-touch JJ amod gestures B-AS 401 | gestures NNS nsubj make I-AS 402 | and CC cc gestures O 403 | large JJ amod area O 404 | tracking NN compound area B-AS 405 | area NN conj gestures I-AS 406 | make VBP ROOT ROOT O 407 | having VBG xcomp make O 408 | an DT det mouse O 409 | external JJ amod mouse B-AS 410 | mouse NN dobj having I-AS 411 | unnecessary JJ amod mouse O 412 | -LRB- -LRB- punct gaming O 413 | unless IN mark gaming O 414 | you PRP nsubj gaming O 415 | ' '' punct gaming O 416 | re JJ amod gaming O 417 | gaming NN dep having B-AS 418 | -RRB- -RRB- punct gaming O 419 | . . punct make O 420 | 421 | Plus CC cc small O 422 | it PRP nsubj small O 423 | is VBZ cop small O 424 | small JJ ROOT ROOT O 425 | and CC cc small O 426 | reasonably RB advmod light O 427 | light JJ conj small O 428 | so IN mark take O 429 | I PRP nsubj take O 430 | can MD aux take O 431 | take VB advcl small O 432 | it PRP dobj take O 433 | with IN case me O 434 | me PRP nmod take O 435 | to TO case work O 436 | and CC cc to O 437 | from IN conj to O 438 | work NN nmod take O 439 | . . punct small O 440 | 441 | I PRP nsubj HATE O 442 | HATE VBP ROOT ROOT O 443 | this DT det one O 444 | one CD dobj HATE O 445 | . . punct HATE O 446 | 447 | They PRP nsubj unconcerned O 448 | were VBD cop unconcerned O 449 | totally RB advmod unconcerned O 450 | unconcerned JJ ROOT ROOT O 451 | that IN mark repaired O 452 | the DT det computer O 453 | computer NN nsubjpass repaired O 454 | was VBD auxpass repaired O 455 | not RB neg repaired O 456 | correctly RB advmod repaired O 457 | repaired VBN ccomp unconcerned O 458 | in IN case place O 459 | the DT det place O 460 | first JJ amod place O 461 | place NN nmod repaired O 462 | . . punct unconcerned O 463 | 464 | Toshiba NNP nsubj send O 465 | does VBZ aux send O 466 | not RB neg send O 467 | send VB ROOT ROOT O 468 | any DT det one O 469 | one CD dobj send O 470 | out IN compound:prt send O 471 | unless IN mark paid O 472 | you PRP nsubj paid O 473 | have VBP aux paid O 474 | paid VBN advcl send O 475 | extra JJ dobj paid O 476 | to TO mark have O 477 | have VB advcl paid O 478 | the DT dobj have O 479 | on IN case repairround O 480 | site NN compound repairround O 481 | repairround NN nmod the O 482 | and CC cc repairround O 483 | based VBN amod o O 484 | o NN conj repairround O 485 | done VBN acl repairround O 486 | . . punct send O 487 | 488 | Oh UH discourse boy O 489 | , , punct boy O 490 | boy NN ROOT ROOT O 491 | ! . punct boy O 492 | 493 | But CC cc are O 494 | while IN mark advantage O 495 | this DT nsubj advantage O 496 | is VBZ cop advantage O 497 | one CD nummod advantage O 498 | big JJ amod advantage O 499 | advantage NN advcl are O 500 | -LRB- -LRB- punct know O 501 | as IN mark know O 502 | you PRP nsubj know O 503 | may MD aux know O 504 | know VB dep are O 505 | from IN case company O 506 | the DT det company O 507 | company NN nmod know O 508 | s VBZ compound commercials O 509 | recent JJ amod commercials O 510 | commercials NNS dobj know O 511 | -RRB- -RRB- punct know O 512 | there EX expl are O 513 | are VBP ROOT ROOT O 514 | other JJ amod things O 515 | things NNS nsubj are O 516 | to TO mark consider O 517 | consider VB acl things O 518 | before IN mark going O 519 | going VBG advcl consider O 520 | with IN case Apple O 521 | Apple NNP nmod going O 522 | . . punct are O 523 | 524 | I PRP nsubj love O 525 | love VBP ROOT ROOT O 526 | the DT det way O 527 | way NN dobj love O 528 | the DT det suite O 529 | entire JJ amod suite O 530 | suite NN nsubj works B-AS 531 | of IN case software I-AS 532 | software NN nmod suite I-AS 533 | works VBZ acl:relcl way O 534 | together RB advmod works O 535 | . . punct love O 536 | 537 | The DT det speed O 538 | speed NN nsubj incredible B-AS 539 | is VBZ cop incredible O 540 | incredible JJ ROOT ROOT O 541 | and CC cc incredible O 542 | I PRP conj incredible O 543 | am VBP dep I O 544 | more JJR advmod am O 545 | than IN case satisfied O 546 | satisfied JJ advcl more O 547 | . . punct incredible O 548 | 549 | on IN case side O 550 | the DT det side O 551 | bright JJ amod side O 552 | side NN ROOT ROOT O 553 | at IN advmod laptop O 554 | least JJS mwe at O 555 | I PRP nsubj laptop O 556 | was VBD cop laptop O 557 | n't RB neg laptop O 558 | without IN case laptop O 559 | my PRP$ nmod:poss laptop O 560 | laptop NN nmod side O 561 | for IN case time O 562 | long RB advmod time O 563 | this DT det time O 564 | time NN nmod laptop O 565 | ! . punct side O 566 | 567 | This DT det laptop O 568 | laptop NN nsubj meets O 569 | meets VBZ csubj great O 570 | every DT det expectation O 571 | expectation NN dobj meets O 572 | and CC cc expectation O 573 | Windows NNP conj expectation B-AS 574 | 7 CD nummod Windows I-AS 575 | is VBZ cop great O 576 | great JJ ROOT ROOT O 577 | ! . punct great O 578 | 579 | That DT nsubj 's O 580 | 's VBZ ROOT ROOT O 581 | how WRB advmod frustrating O 582 | frustrating VBG dep was O 583 | it PRP nsubj was O 584 | was VBD ccomp 's O 585 | . . punct 's O 586 | 587 | I PRP nsubj use O 588 | can MD aux use O 589 | barely RB advmod use O 590 | use VB ROOT ROOT O 591 | any DT det devices O 592 | usb NN compound devices B-AS 593 | devices NNS dobj use I-AS 594 | because IN mark stay O 595 | they PRP nsubj stay O 596 | will MD aux stay O 597 | not RB neg stay O 598 | stay VB advcl use O 599 | connected JJ xcomp stay O 600 | properly RB advmod connected O 601 | . . punct use O 602 | -------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | # encoding = utf8 2 | import re 3 | import math 4 | import codecs 5 | import random 6 | 7 | import numpy as np 8 | #import jieba 9 | #jieba.initialize() 10 | #jieba.load_userdict('D:\DC\pyproject\data_utils\jieba_dict1.txt') 11 | 12 | 13 | def create_dico(item_list): 14 | """ 15 | Create a dictionary of items from a list of list of items. 16 | """ 17 | assert type(item_list) is list 18 | dico = {} 19 | for items in item_list: 20 | for item in items: 21 | if item not in dico: 22 | dico[item] = 1 23 | else: 24 | dico[item] += 1 25 | #print(dico) 26 | return dico 27 | 28 | 29 | def create_mapping(dico): 30 | """ 31 | Create a mapping (item to ID / ID to item) from a dictionary. 32 | Items are ordered by decreasing frequency. 33 | """ 34 | sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0])) 35 | #print("sorted", sorted_items) 36 | id_to_item = {i: v[0] for i, v in enumerate(sorted_items)} 37 | #print("id_to_item", id_to_item) 38 | item_to_id = {v: k for k, v in id_to_item.items()} 39 | #print("item_to_id", item_to_id) 40 | return item_to_id, id_to_item 41 | 42 | 43 | def zero_digits(s): 44 | """ 45 | Replace every digit in a string by a zero. 46 | """ 47 | return re.sub('\d', '0', s) 48 | 49 | 50 | def iob2(tags): 51 | """ 52 | Check that tags have a valid IOB format. 53 | Tags in IOB1 format are converted to IOB2. 54 | """ 55 | for i, tag in enumerate(tags): 56 | if tag == 'O': 57 | continue 58 | split = tag.split('-') 59 | if len(split) != 2 or split[0] not in ['I', 'B']: 60 | return False 61 | if split[0] == 'B': 62 | continue 63 | elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2 64 | tags[i] = 'B' + tag[1:] 65 | elif tags[i - 1][1:] == tag[1:]: 66 | continue 67 | else: # conversion IOB1 to IOB2 68 | tags[i] = 'B' + tag[1:] 69 | return True 70 | 71 | 72 | def iob_iobes(tags): 73 | """ 74 | IOB -> IOBES 75 | """ 76 | new_tags = [] 77 | for i, tag in enumerate(tags): 78 | if tag == 'O': 79 | new_tags.append(tag) 80 | elif tag.split('-')[0] == 'B': 81 | if i + 1 != len(tags) and \ 82 | tags[i + 1].split('-')[0] == 'I': 83 | new_tags.append(tag) 84 | else: 85 | new_tags.append(tag.replace('B-', 'S-')) 86 | elif tag.split('-')[0] == 'I': 87 | if i + 1 < len(tags) and \ 88 | tags[i + 1].split('-')[0] == 'I': 89 | new_tags.append(tag) 90 | else: 91 | new_tags.append(tag.replace('I-', 'E-')) 92 | else: 93 | raise Exception('Invalid IOB format!') 94 | return new_tags 95 | 96 | 97 | def iobes_iob(tags): 98 | """ 99 | IOBES -> IOB 100 | """ 101 | new_tags = [] 102 | for i, tag in enumerate(tags): 103 | if tag.split('-')[0] == 'B': 104 | new_tags.append(tag) 105 | elif tag.split('-')[0] == 'I': 106 | new_tags.append(tag) 107 | elif tag.split('-')[0] == 'S': 108 | new_tags.append(tag.replace('S-', 'B-')) 109 | elif tag.split('-')[0] == 'E': 110 | new_tags.append(tag.replace('E-', 'I-')) 111 | elif tag.split('-')[0] == 'O': 112 | new_tags.append(tag) 113 | else: 114 | raise Exception('Invalid format!') 115 | return new_tags 116 | 117 | 118 | def insert_singletons(words, singletons, p=0.5): 119 | """ 120 | Replace singletons by the unknown word with a probability p. 121 | """ 122 | new_words = [] 123 | for word in words: 124 | if word in singletons and np.random.uniform() < p: 125 | new_words.append(0) 126 | else: 127 | new_words.append(word) 128 | return new_words 129 | 130 | def split_train_dev(train_sentences): 131 | random.seed(7) 132 | random.shuffle(train_sentences) 133 | l = len(train_sentences) 134 | split_line = int(l / 10 * 8) 135 | real_train_sentences = train_sentences[0:split_line] 136 | dev_sentences = train_sentences[split_line:] 137 | return real_train_sentences, dev_sentences 138 | ''' 139 | def get_seg_features(string): 140 | """ 141 | Segment text with jieba 142 | features are represented in bies format 143 | s donates single word 144 | """ 145 | seg_feature = [] 146 | 147 | for word in jieba.cut(string): 148 | if len(word) == 1: 149 | seg_feature.append(0) 150 | else: 151 | tmp = [2] * len(word) 152 | tmp[0] = 1 153 | tmp[-1] = 3 154 | seg_feature.extend(tmp) 155 | return seg_feature 156 | ''' 157 | def load_lexcion(lexcion_path, nlp): 158 | print("loading lexcion from {}......".format(lexcion_path)) 159 | l_lexcion = [] 160 | for line in codecs.open(lexcion_path): 161 | l_lexcion.append(line.strip()) 162 | ll_lexcion = [] 163 | for a_l in l_lexcion: 164 | ll_lexcion.append(nlp.word_tokenize(a_l)) 165 | l_sorted_lexcion = sorted(ll_lexcion, key=lambda i: len(i), reverse=True) 166 | print("loading done!") 167 | return l_sorted_lexcion 168 | 169 | 170 | def get_lexcion_features(list_strings, l_sorted_lexcion): 171 | #print("loading lexcion......") 172 | #nlp = StanfordCoreNLP(r'E:\DC\dataset\泰一指尚评测数据\stanford-corenlp-full-2017-06-09') 173 | len_list_strings = len(list_strings) 174 | #l_lexcion = [] 175 | l_lexcion_features = [0] * len_list_strings 176 | # for line in codecs.open(lexcion_path): 177 | # l_lexcion.append(line.strip()) 178 | # l_sorted_lexcion = sorted(l_lexcion, key=lambda i: len(nlp.word_tokenize(i)), reverse=True) 179 | for a_lex in l_sorted_lexcion: 180 | #print(type(a_lex)) 181 | if " ".join(a_lex) in " ".join(list_strings) \ 182 | or " ".join(a_lex).lower() in " ".join(list_strings).lower(): 183 | # list_a_lex = nlp.word_tokenize(a_lex) 184 | len_a_lex = len(a_lex) 185 | if len_list_strings >= len_a_lex: 186 | for i in range(len_list_strings): 187 | if i <= len_list_strings - len_a_lex: 188 | if a_lex == list_strings[i:i + len_a_lex] \ 189 | or [j.lower() for j in a_lex] == [j.lower() for j in list_strings[i:i + len_a_lex]]: 190 | if l_lexcion_features[i:i + len_a_lex] == [0] * len_a_lex: 191 | if len_a_lex == 1: 192 | l_lexcion_features[i:i + len_a_lex] = [1] * len_a_lex 193 | elif len_a_lex == 2: 194 | l_lexcion_features[i] = 1 195 | l_lexcion_features[i + len_a_lex - 1] = 1 196 | elif len_a_lex > 2: 197 | l_lexcion_features[i:i + len_a_lex] = [1] * len_a_lex 198 | l_lexcion_features[i] = 1 199 | l_lexcion_features[i + len_a_lex - 1] = 1 200 | return l_lexcion_features 201 | 202 | 203 | 204 | 205 | 206 | def get_pos_ids(poses): 207 | # jieba共有53种不同的词性 208 | # all_pos = ['a', 'ad', 'ag', 'an', 'b', 'c', 'd', 'df', 'dg', 209 | # 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'mg', 210 | # 'mq', 'n', 'ng', 'nr', 'nrfg', 'nrt', 'ns', 'nt', 211 | # 'nz', 'o', 'p', 'q', 'r', 'rg', 'rr', 'rz', 's', 212 | # 't', 'tg', 'u', 'ud', 'ug', 'uj', 'ul', 'uv', 'uz', 213 | # 'v', 'vd', 'vi', 'vn', 'vq', 'x', 'y', 'z', 'zg', 'eng'] 214 | #stanford共有47种 215 | all_pos = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 216 | 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 217 | 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 218 | 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 219 | 'WP', 'WP$', 'WRB', '#', '$', ',', '``', "''", '.', ':', 220 | '-RRB-', '-LRB-', '(', ')', '"'] 221 | pos_ids = [] 222 | for a_pos in poses: 223 | if a_pos in all_pos: 224 | pos_ids.append(all_pos.index(a_pos)) 225 | else: 226 | pos_ids.append(47) 227 | return pos_ids 228 | 229 | def get_dep_ids(dep_name): 230 | #由语料统计出来,共41个 231 | all_dep_name = ['nsubj', 'ROOT', 'dobj', 'case', 'nmod', 'cc', 232 | 'conj', 'xcomp', 'det', 'mwe', 'amod', 'compound', 233 | 'punct', 'aux', 'advmod', 'neg', 'ccomp', 'mark', 234 | 'nmod:poss', 'cop', 'acl:relcl', 'nummod', 'acl', 235 | 'dep', 'appos', 'compound:prt', 'auxpass', 'advcl', 236 | 'nmod:tmod', 'parataxis', 'nsubjpass', 'discourse', 237 | 'expl', 'csubj', 'root', 'det:predet', 'nmod:npmod', 238 | 'iobj', 'cc:preconj', 'CD', 'csubjpass'] 239 | dep_ids = [] 240 | for a_dep in dep_name: 241 | if a_dep in all_dep_name: 242 | dep_ids.append(all_dep_name.index(a_dep)) 243 | else: 244 | dep_ids.append(42) 245 | return dep_ids 246 | 247 | def create_input(data): 248 | """ 249 | Take sentence data and return an input for 250 | the training or the evaluation function. 251 | """ 252 | inputs = list() 253 | inputs.append(data['chars']) 254 | inputs.append(data["segs"]) 255 | inputs.append(data['tags']) 256 | return inputs 257 | 258 | 259 | def load_word2vec(emb_path, id_to_word, word_dim, old_weights): 260 | """ 261 | Load word embedding from pre-trained file 262 | embedding size must match 263 | """ 264 | new_weights = old_weights 265 | print('Loading pretrained embeddings from {}...'.format(emb_path)) 266 | pre_trained = {} 267 | emb_invalid = 0 268 | for i, line in enumerate(codecs.open(emb_path, 'r', 'utf-8')): 269 | line = line.rstrip().split() 270 | if len(line) == word_dim + 1: 271 | pre_trained[line[0]] = np.array( 272 | [float(x) for x in line[1:]] 273 | ).astype(np.float32) 274 | else: 275 | emb_invalid += 1 276 | if emb_invalid > 0: 277 | print('WARNING: %i invalid lines' % emb_invalid) 278 | c_found = 0 279 | c_lower = 0 280 | c_zeros = 0 281 | n_words = len(id_to_word) 282 | # Lookup table initialization 283 | for i in range(n_words): 284 | word = id_to_word[i] 285 | if word in pre_trained: 286 | new_weights[i] = pre_trained[word] 287 | c_found += 1 288 | elif word.lower() in pre_trained: 289 | new_weights[i] = pre_trained[word.lower()] 290 | c_lower += 1 291 | elif re.sub('\d', '0', word.lower()) in pre_trained: 292 | new_weights[i] = pre_trained[ 293 | re.sub('\d', '0', word.lower()) 294 | ] 295 | c_zeros += 1 296 | print('Loaded %i pretrained embeddings.' % len(pre_trained)) 297 | print('%i / %i (%.4f%%) words have been initialized with ' 298 | 'pretrained embeddings.' % ( 299 | c_found + c_lower + c_zeros, n_words, 300 | 100. * (c_found + c_lower + c_zeros) / n_words) 301 | ) 302 | print('%i found directly, %i after lowercasing, ' 303 | '%i after lowercasing + zero.' % ( 304 | c_found, c_lower, c_zeros 305 | )) 306 | #print(len(new_weights[0])) 307 | return new_weights 308 | 309 | 310 | def full_to_half(s): 311 | """ 312 | Convert full-width character to half-width one 313 | """ 314 | n = [] 315 | for char in s: 316 | num = ord(char) 317 | if num == 0x3000: 318 | num = 32 319 | elif 0xFF01 <= num <= 0xFF5E: 320 | num -= 0xfee0 321 | char = chr(num) 322 | n.append(char) 323 | return ''.join(n) 324 | 325 | 326 | def cut_to_sentence(text): 327 | """ 328 | Cut text to sentences 329 | """ 330 | sentence = [] 331 | sentences = [] 332 | len_p = len(text) 333 | pre_cut = False 334 | for idx, word in enumerate(text): 335 | sentence.append(word) 336 | cut = False 337 | if pre_cut: 338 | cut=True 339 | pre_cut=False 340 | if word in u"。;!?\n": 341 | cut = True 342 | if len_p > idx+1: 343 | if text[idx+1] in ".。”\"\'“”‘’?!": 344 | cut = False 345 | pre_cut=True 346 | 347 | if cut: 348 | sentences.append(sentence) 349 | sentence = [] 350 | if sentence: 351 | sentences.append("".join(list(sentence))) 352 | return sentences 353 | 354 | 355 | def replace_html(s): 356 | s = s.replace('"','"') 357 | s = s.replace('&','&') 358 | s = s.replace('<','<') 359 | s = s.replace('>','>') 360 | s = s.replace(' ',' ') 361 | s = s.replace("“", "“") 362 | s = s.replace("”", "”") 363 | s = s.replace("—","") 364 | s = s.replace("\xa0", " ") 365 | return(s) 366 | 367 | 368 | def input_from_line(line, char_to_id): 369 | """ 370 | Take sentence data and return an input for 371 | the training or the evaluation function. 372 | """ 373 | line = full_to_half(line) 374 | line = replace_html(line) 375 | inputs = list() 376 | inputs.append([line]) 377 | line.replace(" ", "$") 378 | inputs.append([[char_to_id[char] if char in char_to_id else char_to_id[""] 379 | for char in line]]) 380 | inputs.append([get_seg_features(line)]) 381 | inputs.append([[]]) 382 | return inputs 383 | 384 | 385 | class BatchManager(object): 386 | 387 | def __init__(self, data, batch_size, max_len): 388 | self.batch_data = self.sort_and_pad(data, batch_size, max_len) 389 | self.len_data = len(self.batch_data) 390 | 391 | def sort_and_pad(self, data, batch_size, max_len): 392 | num_batch = int(math.ceil(len(data) /batch_size)) 393 | sorted_data = sorted(data, key=lambda x: len(x[0])) 394 | batch_data = list() 395 | for i in range(num_batch): 396 | batch_data.append(self.pad_data(sorted_data[i*batch_size : (i+1)*batch_size], max_len)) 397 | return batch_data 398 | 399 | @staticmethod 400 | def pad_data(data, max_length): 401 | strings = [] 402 | chars = [] 403 | #segs = [] 404 | lexcion_teatures = [] 405 | pos_ids = [] 406 | dep_ids = [] 407 | head_ids = [] 408 | targets = [] 409 | # max_length = max([len(sentence[0]) for sentence in data]) 410 | for line in data: 411 | string, char, lexcion_feature, pos_id, dep_id, head_id, target = line 412 | padding = [0] * (max_length - len(string)) 413 | strings.append(string + padding) 414 | chars.append(char + padding) 415 | lexcion_teatures.append(lexcion_feature + padding) 416 | pos_ids.append(pos_id + padding) 417 | dep_ids.append(dep_id + padding) 418 | head_ids.append(head_id + padding) 419 | #segs.append(seg + padding) 420 | targets.append(target + padding) 421 | #return [strings, chars, segs, targets] 422 | return [strings, chars, lexcion_teatures, pos_ids, dep_ids, head_ids, targets] 423 | 424 | def iter_batch(self, shuffle=False): 425 | if shuffle: 426 | random.shuffle(self.batch_data) 427 | for idx in range(self.len_data): 428 | yield self.batch_data[idx] 429 | 430 | 431 | 432 | def pad_data(data, max_length): 433 | strings = [] 434 | chars = [] 435 | lexcion_teatures = [] 436 | pos_ids = [] 437 | dep_ids = [] 438 | head_ids = [] 439 | targets = [] 440 | # paded_data = [] 441 | # max_length = max([len(sentence[0]) for sentence in data]) 442 | for line in data: 443 | # a_line = [] 444 | string, char, lexcion_feature, pos_id, dep_id, head_id, target = line 445 | padding = [0] * (max_length - len(string)) 446 | # a_line.append(string + padding) 447 | # a_line.append(char + padding) 448 | # a_line.append(lexcion_feature + padding) 449 | # a_line.append(pos_id + padding) 450 | # a_line.append(dep_id + padding) 451 | # a_line.append(head_id + padding) 452 | # a_line.append(target + padding) 453 | # paded_data.append(a_line) 454 | 455 | strings.append(string + padding) 456 | chars.append(char + padding) 457 | lexcion_teatures.append(lexcion_feature + padding) 458 | pos_ids.append(pos_id + padding) 459 | dep_ids.append(dep_id + padding) 460 | head_ids.append(head_id + padding) 461 | targets.append(target + padding) 462 | return [strings, chars, lexcion_teatures, pos_ids, dep_ids, head_ids, targets] 463 | # return paded_data 464 | ''' 465 | def feed_format_data(data): 466 | strings = [] 467 | chars = [] 468 | lexcion_teatures = [] 469 | pos_ids = [] 470 | dep_ids = [] 471 | head_ids = [] 472 | targets = [] 473 | for line in data: 474 | string, char, lexcion_feature, pos_id, dep_id, head_id, target = line 475 | strings.append(string) 476 | chars.append(char) 477 | lexcion_teatures.append(lexcion_feature) 478 | pos_ids.append(pos_id) 479 | dep_ids.append(dep_id) 480 | head_ids.append(head_id) 481 | targets.append(target) 482 | return [strings, chars, lexcion_teatures, pos_ids, dep_ids, head_ids, targets] 483 | ''' 484 | -------------------------------------------------------------------------------- /lexcion/laptop14_dict.csv: -------------------------------------------------------------------------------- 1 | cord 2 | battery life 3 | service center 4 | "sales" team 5 | tech guy 6 | quality 7 | GUI 8 | applications 9 | use 10 | start up 11 | features 12 | iChat 13 | Photobooth 14 | garage band 15 | screen 16 | power light 17 | hard drive light 18 | battery 19 | rubber enclosure 20 | edge 21 | multi-touch gestures 22 | tracking area 23 | external mouse 24 | gaming 25 | suite of software 26 | speed 27 | Windows 7 28 | usb devices 29 | keyboard 30 | software 31 | system 32 | Microsoft office for the mac 33 | syncing 34 | 30" HD Monitor 35 | boot up 36 | service 37 | operating system 38 | preloaded software 39 | price 40 | clock in BIOS setup 41 | WARRANTY SERVICE 42 | brand 43 | warranty 44 | fan 45 | Customer Service number 46 | talking to a technician 47 | hard disc 48 | windows 49 | drivers 50 | Drivers 51 | BIOS update 52 | HP Technical Support 53 | browser 54 | virus scan 55 | 9 punds 56 | program 57 | warrenty 58 | Toshiba Warranty 59 | Quality 60 | webcam 61 | value 62 | internet 63 | managing personal files 64 | motherboard 65 | charger 66 | hardware 67 | force quit 68 | works 69 | VMWare program 70 | runs 71 | navigate 72 | find files 73 | extended warranty 74 | memory 75 | performace 76 | opening my Documents folder 77 | operating systems 78 | glass touchpad 79 | hard drive 80 | mother board 81 | external speaker 82 | sound 83 | shipping 84 | Mac software 85 | Microsoft software 86 | delete key 87 | editing 88 | phone assistance 89 | genius bar 90 | movie playing 91 | web browsing 92 | word editing 93 | recovery discs 94 | Office Max's "Max Assurance" 95 | iMac backup disc 96 | USB 97 | battery timer 98 | Temperatures 99 | Core Processing Unit temperatures 100 | virus protection for Mac 101 | bluetooth 102 | LG notebook service center 103 | hinge 104 | games 105 | responds 106 | representive at Microsoft 107 | XP 108 | WIndows 7 109 | Vista 110 | Keyboard 111 | Dell's customer disservice 112 | cooling system 113 | quad core I7 114 | sales associate 115 | Windows applications 116 | virus program 117 | weighed 118 | seven pounds 119 | bag 120 | color 121 | keys 122 | designed 123 | style 124 | mousepad 125 | sales 126 | DELL SUPPORT 127 | construction quality 128 | longevity 129 | cost 130 | configuration of "extra key" 131 | touch pad 132 | security 133 | pricetag 134 | work 135 | vga port 136 | volume 137 | mouse 138 | touchpad 139 | boots up 140 | look and feel standpoint 141 | safari 142 | Mozzilla firfox 143 | Bluetooth 144 | fingerprint reader driver 145 | display 146 | delivery 147 | performance 148 | Microsoft office 149 | Resolution 150 | appearance 151 | graphics card 152 | built-in webcam 153 | built-in mic 154 | cam 155 | mic 156 | Windows Vista 157 | board 158 | connector 159 | power supply 160 | ethernet 161 | drag and drop feature 162 | sales tax 163 | software packages 164 | iWork 165 | GarageBand 166 | iMovie 167 | colors 168 | resolution 169 | monitor 170 | iTunes 171 | Size 172 | external monitor 173 | base installation 174 | manuf 175 | camera 176 | AMD Turin processor 177 | Intel 178 | start menu 179 | bluray read/write drive 180 | service tech 181 | tech store 182 | surface 183 | iLife 184 | Snow Leopard X 185 | price tag 186 | Mac OS 187 | sound output quality 188 | techie 189 | Toshiba online help 190 | service life 191 | ports 192 | processor 193 | look 194 | stock screen 195 | gray color 196 | volume buttons 197 | computing 198 | protector 199 | key pad 200 | magnetic plug-in power charging power cord 201 | graphics 202 | support 203 | Geek Squad 204 | graphics quality 205 | Appleworks 206 | paint 207 | service department 208 | warrentys 209 | hard disk 210 | CD drive 211 | running 212 | screen size 213 | loads 214 | trackpad 215 | human interface 216 | inbuilt applications 217 | Software 218 | body 219 | i5 220 | antivirus software 221 | firewall 222 | Apple team 223 | video chat 224 | Fan 225 | cooling pad 226 | feature 227 | installation time 228 | power 229 | storage 230 | abilitiy 231 | spotlight search 232 | wireless system 233 | OS 234 | Apple support 235 | techs 236 | adding the bluetooth 237 | graphic power 238 | Adobe Creative apps 239 | programs 240 | materials 241 | Tech support 242 | POWER SUPPLY 243 | size 244 | Windows XP SP2 245 | Windows 7 Ultimate 246 | Fingerprint reader 247 | techs at HP 248 | video card 249 | audio 250 | set up 251 | word processing 252 | spec 253 | HD 254 | graphics chip 255 | components 256 | DVD burner 257 | USB ports 258 | ran 259 | operation 260 | ground loop isolator 261 | price range 262 | leading edge 263 | bloatware 264 | Maximum sound 265 | windows 7 266 | Bootcamp 267 | action pack games 268 | twin packing 269 | functions 270 | after sales support 271 | display screen 272 | acer arcade 273 | customer support 274 | cd drive 275 | design 276 | life 277 | pad 278 | system performance 279 | linux based os 280 | batteries 281 | shipping carton 282 | virus protection 283 | OSX 16 284 | Intel Core processors 285 | HDMI port 286 | Internet Explorer 287 | staff 288 | price premium 289 | product and help aftermarket 290 | Microsoft word 291 | usb ports 292 | sd memory card reader 293 | sd memory car expansion 294 | LaCie 2Big external drive 295 | firewire 800 interface 296 | Time Machine 297 | extra features 298 | windows 7 home premium 299 | Acer screen 300 | USB port 301 | boot 302 | fires up 303 | online tutorial videos 304 | image 305 | soud 306 | shut down 307 | Windows 7 Starter 308 | Windows 7 Home Premium 309 | Windows 7 Professional 310 | specifications 311 | power cords 312 | multi-touch features 313 | clicking buttons 314 | Windows 315 | wired lan 316 | hard reboot 317 | Apple care 318 | font 319 | Features 320 | run 321 | plastic pieces 322 | 2GB RAM stick 323 | Microsoft Word for Mac 324 | WARRANTY COMPANY 325 | cusromer service center 326 | weight 327 | aluminum body 328 | watching movies 329 | playing 330 | build quality 331 | dock 332 | working 333 | costing 334 | i7 335 | Boots up 336 | driver 337 | tech support 338 | BIOS 339 | tech issues 340 | space 341 | priced 342 | internet speed 343 | Garageband 344 | Photo Booth 345 | iPhoto 346 | video-editing 347 | movie-making 348 | photo management 349 | music 350 | Pages 351 | Numbers 352 | Keynote 353 | school or office use 354 | web cam 355 | burn cd's 356 | 17 inch screen 357 | Drivers/Applications DVD 358 | Launch Manager 359 | glass screen 360 | switchable graphic card 361 | technical support 362 | BOOT MGR 363 | recovery cd 364 | Skype 365 | mouse buttons 366 | HDD 367 | rails 368 | Games 369 | home use 370 | business use 371 | updates 372 | MS Office apps 373 | Nortons virus scan 374 | frame 375 | loaded 376 | Windows XP 377 | key broad 378 | repair "depot" 379 | school use 380 | Wireless 381 | navigating 382 | speakers 383 | subwoofer 384 | built in camera 385 | left mouse key 386 | costed 387 | hard drive space 388 | picture 389 | extended warranties 390 | Screen 391 | technical service for dell 392 | case 393 | Photoshop 394 | system clock 395 | RAM 396 | securitysoftware 397 | driver updates 398 | 15 inch 399 | Product support 400 | casing of the power cord 401 | internet signals 402 | right speaker 403 | multiple page viewer 404 | shipped 405 | recovery disk 406 | Office 407 | Office programs 408 | wifi 409 | TYPING 410 | temp 411 | Windows operating system 412 | processor speed 413 | thunderbolt port 414 | lcd screen 415 | Images 416 | windows vista 417 | windows xp 418 | Adobe Flash player 419 | Internet Explore 420 | Windows update 421 | update 422 | switch 423 | 4GB stick of RAM 424 | hardrive 425 | touch control buttons 426 | iLife software 427 | iWeb 428 | system board 429 | navigation 430 | LCD 431 | DELL Customer Service 432 | RMA service 433 | Design 434 | screen/video resolution 435 | Dreamweaver 436 | Final Cut Pro 7 437 | Safari 438 | Firefox 439 | MSN Messenger 440 | Apple applications 441 | gigs 442 | 17-inch screen 443 | DVD 444 | CD burners 445 | uploading photos 446 | creating presentations 447 | Quality Display 448 | 12 cell battery 449 | power adapter plug 450 | TAB 451 | extended warrenty 452 | TFT panel 453 | Navigation 454 | Windows operating systems 455 | bulk 456 | music software 457 | video 458 | combined touch pad and clicker 459 | Unibody construction 460 | screen hinges 461 | BATTERY 462 | STORAGE LIFE 463 | SERVICE 464 | ram 465 | Nvidia chipset 466 | Programs 467 | CHARGE TIME 468 | consistancy 469 | key bindings 470 | 10-key 471 | 500gb hard drive 472 | USB output 473 | wireless mouse 474 | Linux 475 | windows disc 476 | disk throughput 477 | spinning beachball 478 | wireless internet access 479 | company 480 | screen resolutions 481 | carry 482 | edges 483 | ILife 484 | iPhotos 485 | built 486 | screen resolution 487 | Snow Leopard 488 | CPU 489 | Office Mac applications 490 | Word 491 | Excel 492 | spinning wheel 493 | 18.4" screen 494 | regular layout keyboard 495 | Runs 496 | Support 497 | 3 year warranty 498 | online chat service 499 | VHS 500 | Mac version of Microsoft Office 501 | starting-up time 502 | operates 503 | pictures 504 | transporting 505 | network capability 506 | wifi card 507 | printer 508 | OS X 509 | repair service 510 | customer service agents 511 | AfterEffects programs 512 | dependability 513 | mouse button 514 | buttons 515 | load 516 | webpages 517 | OpenOffice 518 | PRODUCT KEY 519 | specs 520 | one touch keys 521 | Delivery 522 | stability 523 | SATA controller 524 | video editing 525 | mobile video editing 526 | imovie program 527 | bluetooth mouse 528 | control buttons 529 | Processor 530 | patches 531 | system memory 532 | radeon 5850 533 | DDR5 534 | DDR3 535 | OS (Vista) 536 | pop up windows 537 | web programming software 538 | pop ups 539 | Memory 540 | 3G network 541 | Web access 542 | design based programs 543 | Adobe Creative Suite 544 | companies 545 | customer service 546 | plug 547 | HDD cover 548 | iphoto 549 | Spy ware 550 | Nortell 551 | videocard 552 | Setting 553 | pixel sizes 554 | pre installed software update 555 | customer services 556 | toshiba customer services 557 | space bar 558 | performing 559 | booting 560 | imail 561 | imovie 562 | transport 563 | repair depot 564 | hard disk capacity 565 | Keys 566 | gaming look 567 | gaming performance 568 | volume control 569 | internet connectivity 570 | USB wireless card 571 | discharges 572 | compatibility 573 | charging 574 | stand 575 | heat sink 576 | Webcam 577 | trend micro 578 | antiviral program 579 | plastic piece 580 | usb port wires 581 | case design 582 | connection 583 | pricing 584 | built-in wireless 585 | processing power 586 | install 587 | uninstall  588 | noise 589 | IWORKS 590 | Itunes 591 | MS Office 592 | application 593 | Boot Camp 594 | Applecare 595 | cad programs 596 | bootcamp 597 | tutorials 598 | mouse pad 599 | right click key 600 | full charge 601 | cordless mouse 602 | Toshiba tech support 603 | charged 604 | online chat 605 | speaker grill 606 | full service 607 | letter A 608 | after sales service 609 | recovey disk 610 | capacity 611 | build 612 | portability 613 | warranty service to Toshiba 614 | screens 615 | downloads 616 | LED backlit display 617 | aesthetics 618 | connect quality 619 | headphone 620 | mic jack 621 | touch-pad 622 | headphones 623 | connection with the internet 624 | booting up 625 | shutting down 626 | OSX 627 | KEYS 628 | KEYBOARD 629 | Dashboard 630 | return policy 631 | affordability 632 | hook to my wireless network 633 | aluminum style 634 | nvidia 9800 635 | arm piece 636 | 1-year-warranty 637 | word 638 | word processer 639 | performance specs 640 | quality control 641 | touch-mouse 642 | numeric pad 643 | cover for the DVD drive 644 | arm velcro 645 | 2 GB of RAM 646 | versitility 647 | hinge design 648 | browsers 649 | Charger 650 | screen graphics 651 | clarity 652 | sharpness 653 | safety feature 654 | unibody design 655 | BOOTING UP 656 | one-year warranty 657 | charge 658 | services 659 | Bluetooth 3 660 | fan blade 661 | internet connection 662 | external harddrives 663 | functionality 664 | simplicity 665 | looking 666 | mouse keys 667 | mute 668 | color reproduction 669 | Sony 'Certified' technician 670 | proprietary software 671 | Apple Care plan 672 | video games 673 | trial software 674 | Graphics 675 | internet interfaces 676 | black keyboard 677 | software options 678 | functioning 679 | port 680 | grafics card 681 | LED backlit screen 682 | 8GB RAM 683 | instructions 684 | cursor 685 | operate 686 | gadgets 687 | disk image 688 | track pad 689 | disc drive 690 | drive 691 | word processing program 692 | Battery 693 | usage 694 | MacOSX 695 | dual-core 696 | quad-core 697 | 22" Monitor 698 | default background 699 | island backlit keyboard 700 | multi-touch mouse 701 | Microsoft Office 702 | spreadsheets 703 | presentations 704 | built it web cam 705 | performed 706 | graphics cards 707 | bluray player 708 | book 709 | systems 710 | media 711 | thermal paste 712 | internals 713 | facial recognition 714 | windows logon 715 | online service 716 | recovery DVDs 717 | fans 718 | preformed 719 | windows OS 720 | user interface 721 | iDVD 722 | surfing 723 | play 724 | Intel i processors 725 | sound quality via USB 726 | extended life battery 727 | Windows XP drivers 728 | sound card 729 | budget 730 | Google Chrome 731 | PRICE 732 | art aspect 733 | pre-loaded Norton Firewall/Security program 734 | USB connect 735 | expense 736 | multi-touch trackpad 737 | System 738 | Apple keyboard 739 | PC's keyboard 740 | power cord 741 | techies 742 | Sony Sonic Stage software 743 | Docking port 744 | bluetooth enabled 745 | Powerpoint program 746 | customer service rep 747 | genius bar staff 748 | non-dedicated graphics card 749 | Mouse Cable 750 | RAM slots 751 | HDD Bays 752 | 16GB RAM support 753 | ASUS TECH SUPPORT 754 | mouse on the pad 755 | left button 756 | built in tools 757 | Leopard running system 758 | LG service center 759 | Material 760 | Specs 761 | performs 762 | photo booth 763 | multi-touch track pad 764 | Hard disk 765 | hard disk space 766 | internal hard disk 767 | connects to WIFI 768 | power adapter 769 | mainboard 770 | 500gb external hard drive 771 | wireless card 772 | accessories 773 | playing games 774 | equipment 775 | wt 776 | Apple navigation 777 | apps 778 | hard drives 779 | Sound card 780 | sized 781 | desktop keyboard 782 | drivers/applications DVD 783 | Operating System 784 | motherboards 785 | AC plug 786 | Peformance 787 | flexibility 788 | S-video port 789 | surfing the web 790 | Iphoto 791 | Senior Tech 792 | volume wheel 793 | sound quality 794 | manual 795 | sensitivity 796 | core applications 797 | hook up to other wireless networks 798 | shift key 799 | rep 800 | standard os cd 801 | proprietary hardware drivers 802 | programm 803 | response 804 | Internet focused activity 805 | charges 806 | iBook backup 807 | firewire connection 808 | keyboard shortcuts 809 | shortcuts 810 | windows media 811 | replacement charger 812 | windows system 813 | printer software 814 | seventeen inch screen 815 | technician 816 | DC jack 817 | power brick 818 | screen quality 819 | 17" inch screen 820 | price-point 821 | MOTHERBOARD 822 | retail price 823 | Microsoft Student Edition 824 | every day computing 825 | LED monitor 826 | backlit keys 827 | Safari internet browser 828 | firefox 829 | support line 830 | Windows 7 starter 831 | KEYBOARD FUNCTION 832 | DVD drive 833 | microphone 834 | security-prone OS 835 | firewire cable system 836 | iBook 837 | shipment 838 | quicklook 839 | browsing 840 | PhotoBooth 841 | built-in camera 842 | Chrome 843 | prices 844 | external dvd drive 845 | external drive 846 | PORTABILITY 847 | PROCESSING 848 | Supplied software 849 | apple applications 850 | bios 851 | kernal 852 | upgraded memory 853 | call center 854 | cords 855 | starts 856 | Keynotes 857 | dvd drive 858 | key board 859 | ease 860 | Touchpad 861 | connection card 862 | 4GB of RAM 863 | 8GB of RAM 864 | game 865 | Browsing 866 | itunes 867 | setup 868 | Toshiba customer services 869 | internet capabilities 870 | High definition quality 871 | Speakers 872 | Beast graphics 873 | wall charger 874 | windows vista system 875 | Function keys 876 | Screen size 877 | push button 878 | lid 879 | Pentium 4 880 | 1 GB ram 881 | word processor 882 | interface device 883 | included program 884 | expese 885 | Microsoft Office apps 886 | internet use 887 | aero 888 | machined aluminum frame 889 | parallels type program 890 | ATI graphics card 891 | Intel built-in card 892 | handle 893 | power supply cord 894 | Applecare warranty plan 895 | 13 inch 896 | Final Cut Pro 897 | Mac Snow Leopard O/S 898 | Win XP 899 | Visa 900 | Win7 901 | Startup 902 | shutdown 903 | resume from sleep 904 | fit 905 | external keyboard 906 | windows 7 system 907 | wheel 908 | flatline keyboard 909 | ergonomics 910 | connections 911 | Applecare tech support 912 | power plug 913 | resolution on the screen 914 | AC power port 915 | running system 916 | force 917 | durability 918 | Vista Business 919 | surf the web 920 | size of the screen 921 | tote 922 | Win 7 923 | 2GB stick of memory 924 | Norton 925 | command prompt 926 | programming 927 | Battery life 928 | icon list 929 | desktop icons 930 | apple care 931 | delivery service 932 | picture quality 933 | i3 processor 934 | casing 935 | resolution of the screen 936 | screen brightness 937 | backlit keyboard 938 | motherboard chip 939 | Windows Vista Home Premium 940 | 1GB of RAM 941 | brightness 942 | electronic fuzz sound 943 | headphone jack 944 | graphics editing 945 | complex data analysis 946 | costs 947 | hardcopy manuel 948 | visual 949 | plate 950 | mouse command buttons 951 | speeds 952 | lighted keyboard 953 | operation system 954 | Applications 955 | Warrenty 956 | agents 957 | three year warranty 958 | Nvidia grafics card 959 | left "mouse" button 960 | type 961 | 15" 962 | 17" 963 | feel 964 | virus protection programs for a Mac 965 | HDMI 966 | wireless switch 967 | technical person 968 | driver/application DVD 969 | photo detection software 970 | mouse pointer 971 | engineering design 972 | hinges 973 | Customer Service 974 | pointer 975 | life span 976 | looks 977 | my toshiba feature 978 | layout 979 | one of the programs 980 | iwork 981 | office 982 | document creation 983 | memory stick 984 | button below the mouse pad 985 | lightscribe 986 | photo application 987 | 17 ince screen 988 | warranty period 989 | Performance 990 | Battery Life 991 | Price 992 | Value 993 | WiFi 994 | update programs 995 | MS applications 996 | Internet tabs 997 | noises 998 | bottom of the computer 999 | repair technician 1000 | Cords 1001 | network connection 1002 | Core2 Quad 1003 | charger unit 1004 | Looks 1005 | signals 1006 | signal 1007 | Win 7 Home 1008 | incase shells 1009 | repair center 1010 | keyboard functions 1011 | SERVICE FACILITY 1012 | using the internet 1013 | safe mode 1014 | 18-inch 1015 | universal charger 1016 | powerpoint 1017 | desktop background 1018 | window's 7 starter 1019 | function 1020 | IT support technicians 1021 | customer service center 1022 | shaped 1023 | satellite card 1024 | depot 1025 | zooming 1026 | service rep 1027 | windows movie maker 1028 | beauty 1029 | Garmin GPS software 1030 | Microsoft Office 2003 1031 | fan noise 1032 | apple associates 1033 | mousepad sensitivity 1034 | mac osx 1035 | HDD bay 1036 | leather carrying case 1037 | commodity hardware 1038 | Paralles 1039 | Windows XP Professional 1040 | Windows Server Enterprise 2003 1041 | Windows Server 2008 Enterprise 1042 | repair 1043 | -------------------------------------------------------------------------------- /lexcion/restaurant14_dict.csv: -------------------------------------------------------------------------------- 1 | staff 2 | food 3 | kitchen 4 | menu 5 | perks 6 | orrechiete with sausage and chicken 7 | waiters 8 | meats 9 | dish 10 | Bagels 11 | toast 12 | mayonnaise 13 | bacon 14 | cheese 15 | ingredients 16 | plate 17 | omelet 18 | drinks 19 | check 20 | design 21 | atmosphere 22 | cuisine 23 | pizza 24 | thin crusted pizza 25 | interior decoration 26 | chefs 27 | seats 28 | seltzer with lime 29 | pickles 30 | selection of meats and seafoods 31 | dishes 32 | eat family style 33 | vibe 34 | owner 35 | service 36 | delivery 37 | prices 38 | interior decor 39 | wine 40 | price 41 | quantity 42 | sushi 43 | sushi bar 44 | fried rice 45 | mussels 46 | puff pastry goat cheese 47 | salad with a delicious dressing 48 | hanger steak au poivre 49 | courses 50 | indian food 51 | place 52 | broth with noodles 53 | meal 54 | money 55 | wine list 56 | Thai food 57 | glass of wine 58 | specials 59 | lunch 60 | dinner 61 | dine 62 | desert 63 | bar 64 | table 65 | environment 66 | lasagnette appetizer 67 | beer selection 68 | setting 69 | dining 70 | Dosa 71 | Wine list selection 72 | wine-by-the-glass 73 | entrees 74 | sake martini 75 | bagels 76 | spreads 77 | chopsticks 78 | garden terrace 79 | crowded 80 | grilled branzino 81 | wait staff 82 | French food 83 | packed 84 | pre-theater menu 85 | order 86 | server 87 | scents 88 | lobster sandwich 89 | Spider Roll 90 | shell crab 91 | Deep Fried Skewers 92 | reservations 93 | hostess 94 | rice dishes 95 | congee (rice porridge) 96 | tuna tartar appetizer 97 | Food 98 | chef 99 | fish dishes 100 | soups 101 | french fare 102 | Indian food 103 | baby pizzas 104 | waiting 105 | lava cake dessert 106 | managers 107 | selection of wines 108 | Italian decor 109 | dining experience 110 | wine selection 111 | reservation 112 | Service 113 | maitre 114 | panang duck 115 | raw vegatables 116 | Indian 117 | Ambiance 118 | BBQ Salmon 119 | Sea Bass 120 | Crispy Duck 121 | crust 122 | kababs 123 | Dal Bukhara 124 | eating 125 | taste 126 | glass of prosecco 127 | glass of Leaping Lizard 128 | creme brulee 129 | sugar 130 | served 131 | Pizza 132 | garlic knots 133 | chinese food 134 | teapot 135 | fresh mozzarella 136 | getting a table 137 | bistro-type vibe 138 | waitress 139 | tables 140 | fish 141 | variety of fish 142 | pesto pizza 143 | house salad 144 | bottle of wine 145 | bartender 146 | beverage manager 147 | music 148 | refleshment 149 | ambiance 150 | foods 151 | tastes 152 | sommlier 153 | captain 154 | back waiters 155 | tandoori salmon 156 | space 157 | atmorphere 158 | menu items 159 | three course meal 160 | crawfish boiled 161 | Pad Thai 162 | noodles 163 | delivered 164 | values for your money 165 | wait building 166 | corridor 167 | counters 168 | counter 169 | steaks 170 | salads 171 | sides 172 | Lunch 173 | pickels and slaw 174 | wine by the glass 175 | Change Mojito 176 | soup for the udon 177 | soy sauce 178 | water 179 | anti-pasta 180 | calamari 181 | filling pasta mains 182 | glasses of wine 183 | ceviche mix (special) 184 | crab dumplings 185 | assorted sashimi 186 | rolls 187 | sake 188 | banana tempura 189 | plain pizza 190 | garlic 191 | eggplant 192 | servers 193 | takeout menu 194 | italian dishes 195 | bottles of wine 196 | beers 197 | pumkin tortelini 198 | dinner special 199 | crabmeat lasagna 200 | chocolate bread pudding 201 | dessert 202 | pasta dishes 203 | tiramisu 204 | bagel 205 | plate of dumplings 206 | Filet Mignon 207 | sangria 208 | sake menu 209 | beef cubes 210 | portions 211 | traffic noise 212 | ambience 213 | dinner meeting 214 | Cantonese 215 | cooking 216 | Prix Fixe menu 217 | quality 218 | eggs benedict 219 | hollondaise sauce 220 | brunch 221 | oatmeal 222 | Chilean Sea Bass 223 | onion soup 224 | Dim Sum 225 | roll 226 | rice 227 | gin and tonic 228 | tonic 229 | dim sum 230 | little dishes 231 | Thai 232 | Grilled Chicken special with Edamame Puree 233 | roast duck 234 | pork 235 | chicken on rice with ginger 236 | Staff 237 | Sake 238 | portion 239 | roti rolls 240 | tip 241 | meat 242 | hot dogs 243 | Delivery 244 | noise level 245 | Personal pans 246 | Spicy Fried Clam Rolls 247 | Spider Rolls 248 | FOOD 249 | food quality 250 | man 251 | cheesecake 252 | employees 253 | containers for condiments 254 | containers 255 | manager 256 | Decor 257 | salmon dish 258 | flavor 259 | tuna melt 260 | tuna sandwich 261 | chicken tikka 262 | naan 263 | dals 264 | bill 265 | lawns 266 | thai food 267 | ravioli 268 | price tag 269 | appetizers 270 | mozzarella 271 | pie 272 | Fish 273 | main dining room 274 | ceiling 275 | menu selections 276 | establishment 277 | seafood dishes 278 | grapes 279 | Prices 280 | waiter 281 | burger 282 | pastas 283 | staples 284 | seating 285 | wait 286 | seafood 287 | Salmon 288 | beverages 289 | Asian appetizers 290 | view 291 | value 292 | attitude 293 | striped bass 294 | corriander 295 | black white shakes 296 | chu chu curry 297 | pad thai chicken 298 | prix fixe 299 | pastrami 300 | bagel with lox spread 301 | bagles 302 | quality of food 303 | lobby area 304 | seated 305 | proprietor 306 | lunch meetings 307 | decor 308 | food's presentation 309 | Waitstaff 310 | swordfish 311 | management 312 | desserts 313 | Italian food 314 | Moules 315 | lobster ravioli 316 | dosa 317 | eat 318 | indian 319 | wines 320 | Pastrami or corned beef 321 | rye bread 322 | glasses of champagne 323 | after dinner drinks 324 | seasoning 325 | owners 326 | people 327 | main dishes 328 | location 329 | stuff 330 | eats 331 | gourmet food 332 | bruschetta 333 | panini 334 | fried mini buns with the condensed milk and the assorted fruits on beancurd 335 | starter 336 | take-out pizza 337 | green curry with vegetables 338 | door 339 | priced 340 | portioins 341 | lunch specials 342 | pizzas 343 | pot-stickers 344 | tempura dish 345 | entree range 346 | deliveries 347 | smoked salmon and roe appetizer 348 | garlic shrimp 349 | lamb 350 | okra (bindi) 351 | steak 352 | escargot 353 | casual dinner 354 | chicken tikka marsala 355 | styles of pizza 356 | look 357 | coconut rice 358 | appetizer 359 | beers on tap 360 | bottle minimun 361 | spot 362 | Quality of food 363 | chicken 364 | vegetables 365 | appetizing 366 | bun 367 | fusion of French and Indian cooking 368 | turnip cake 369 | roast pork buns 370 | egg custards 371 | seating in the garden 372 | food options 373 | cost 374 | pre-theatre 3-course dinner 375 | wine flight 376 | calzones 377 | drink 378 | bar scene 379 | beverage selections 380 | secret back room 381 | gulab jamun 382 | diner 383 | room 384 | pig feet ginger simmered in black vinegar 385 | desserts with frog jelly 386 | thai cuisine 387 | crew 388 | host 389 | round tables 390 | Taiwanese 391 | french fries 392 | regular menu-fare 393 | fresh tomato sauce 394 | fresh mozz cheese 395 | basil 396 | dough 397 | flour 398 | Paneer Roll 399 | plates 400 | fresh mozzerella slices 401 | Plain Cheese slice 402 | Filet Mignon with garlic mash 403 | roasted tomato soup with chevre 404 | steak frites 405 | waitstaff 406 | bread 407 | margarite pizza with cold prosciutto and baby arugula on top 408 | salad 409 | Peter's Favourite pizza with prosciutto and baby arugula 410 | bruschettas 411 | paninis 412 | tramezzinis 413 | diners 414 | pre-theater prix-fixe 415 | fried chicken 416 | pork chop 417 | noodle dishes 418 | privacy 419 | corner booth table 420 | open faced cheese sandwich 421 | breads 422 | squid 423 | half-price Saturday night option 424 | homemade pasta 425 | value ofr money 426 | soup 427 | mushrooms 428 | beginning appetizers 429 | scallops 430 | chocolate souffle with rasberry mint sorbet 431 | Japanese Tapas 432 | seaweed 433 | lamb sausages 434 | sardines with biscuits 435 | large whole shrimp 436 | pistachio ice cream 437 | Chef 438 | fresh tomatoes 439 | sauce on the pizza 440 | outdoor seating 441 | mesclun 442 | salmon 443 | ice cream 444 | bistro fare 445 | chilaquiles 446 | jazz brunch 447 | live jazz 448 | measures of liquers 449 | table service 450 | pork belly 451 | fat 452 | lamb chop 453 | lemon 454 | extra virgnin olive oil 455 | Pastrami sandwich 456 | beer 457 | truffle oil 458 | wild mushroom(third generation-Fornini) pizza 459 | omelletes 460 | Sushi fix 461 | reservation sigh 462 | food art 463 | outdoor atmosphere 464 | toppings 465 | bills 466 | scallion pancakes 467 | fried dumplings 468 | pasta 469 | dumpling 470 | fondue appetizer 471 | spice 472 | Octopus salad 473 | MSG cooking 474 | Tom Kha soup 475 | Bombay beer 476 | dining hall 477 | semi-private boths 478 | office lunch 479 | sweet lassi 480 | lamb chettinad 481 | garlic naan 482 | rasamalai 483 | lambchops 484 | spicy tuna 485 | hosts 486 | quality value 487 | Meat dishes 488 | vegetarian-friendly choices 489 | amuse bouche 490 | crunchy tuna 491 | live entertainment 492 | special effects 493 | fountain drinks 494 | Steak Tartare 495 | Crab Croquette apt 496 | house wine 497 | dim sum servings 498 | glass front 499 | Barbecued codfish 500 | texture 501 | spice rub 502 | herb mix 503 | sauce 504 | dance floor 505 | platter 506 | in sandwiches 507 | frying 508 | bacos 509 | American Chinese food 510 | beef carpaachio 511 | kalbi 512 | nebbiolo 513 | Located 514 | clubhouse 515 | trays of Dim Sum 516 | bartenders 517 | cheff 518 | price range 519 | svc 520 | whole grilled fish 521 | tuna tartare 522 | mushroom ravioli 523 | pinot noir 524 | chocolate sampler 525 | dessert wine 526 | lines 527 | crowds 528 | shows 529 | characters 530 | Sauce 531 | Sangria 532 | indian cuisine 533 | pasta dish 534 | main course 535 | neighborhood 536 | hosting staff 537 | thai 538 | apetizers 539 | noodle and rices dishes 540 | fried clams 541 | sauces 542 | lunch menu 543 | entree 544 | cooked 545 | champagne 546 | caviar 547 | shrimp scampi 548 | antipasti 549 | dim sum combo 550 | martini 551 | Roast Chicken 552 | insde table 553 | chicken vindaloo 554 | spicy food 555 | bar food 556 | hand-crafted beers 557 | capex 558 | back garden 559 | nigiri 560 | architecture 561 | mushroom pizza 562 | cod with paella 563 | fish and chips 564 | serving 565 | fries 566 | porcini mushroom pasta special 567 | seafood tagliatelle 568 | spicy wontons 569 | salt pepper shrimps 570 | FOOD PORTIONS 571 | bar service 572 | Taiwanese food 573 | homemade lasagna 574 | dessert menu 575 | in-house lady DJ 576 | buns 577 | Beef noodle soup 578 | Delivery service 579 | Thai cuisine 580 | flavors 581 | herbs 582 | tomatoes 583 | shredded cheese 584 | root vegetables 585 | mushroom consomme 586 | Shabu-Shabu dinner 587 | beef 588 | comfort 589 | meals 590 | pho 591 | vegtables 592 | presentaion 593 | choice 594 | oil 595 | wait-staff 596 | sea urchin 597 | bruscetta 598 | mix of greens 599 | iceberg 600 | General Tao chicken 601 | Chinese food 602 | shrimp appetizers 603 | Waiters 604 | food portions 605 | cold appetizer dishes 606 | parmesean porcini souffle 607 | lamb glazed with balsamic vinegar 608 | 'gourmet' Indian cuisine 609 | pub 610 | corned beef sandwich 611 | Priced 612 | amount 613 | olive cream cheese 614 | lox spread 615 | rice to fish ration 616 | selecion of wines 617 | Godmother pizza 618 | crab cocktail 619 | lime juice concoction 620 | lime 621 | corned beef 622 | Chicken pad tai 623 | dumplings 624 | crowd 625 | turkey burger 626 | Margheritta 627 | eel 628 | lox 629 | cod 630 | trout 631 | crispy chicken 632 | meatballs 633 | caprese salad 634 | beans on toast 635 | lobby 636 | tapas 637 | pita bread 638 | waitresses 639 | variety 640 | seat 641 | Dinner 642 | sandwiches 643 | Price 644 | serves 645 | front doors 646 | back garden sitting area 647 | personal herb garden 648 | pastrami sandwiches 649 | Mexican food 650 | glass of water 651 | french food 652 | live music 653 | jazz nights 654 | lemon salad 655 | pepper 656 | Pad thai 657 | lad nar 658 | places 659 | front door 660 | Guizhou chicken 661 | fish with hot bean source 662 | fish fillet in spicy source 663 | special menu 664 | open kitchen 665 | bhelpuri 666 | sevpuri 667 | samosa chaats 668 | indian appetizers 669 | bombay style chaat 670 | Singapore Mai Fun 671 | curry flavor 672 | dinner location 673 | Indian restaurant food 674 | AT MOSHPHERE 675 | tips 676 | sea bass 677 | sushi places 678 | Sushi 679 | round of drinks 680 | toaster 681 | menu choices 682 | main courses 683 | exotic food 684 | mediterranean salad 685 | People 686 | cream cheeses 687 | coffee 688 | beef version 689 | club soda, filled with ice, no lime 690 | house varities 691 | pudding dessert 692 | vegetable samosa 693 | malai tikka wrap 694 | goat cheese 695 | panchetta 696 | raddichio 697 | kinds of beer 698 | turnip soup with pureed basil 699 | Thai spiced curry noodles with shrimp 700 | nightcap 701 | food suggestions 702 | sommelier 703 | fromager 704 | Japanese food 705 | price category 706 | entertainment 707 | seved 708 | signs 709 | specials menus 710 | scallop roll 711 | sesame chicken 712 | nori-wrapped tuna 713 | fast food 714 | all you can eat deal 715 | meat patties in steamed buns 716 | dining room 717 | variety of dishes 718 | white bean brushetta 719 | lay out 720 | cart attendant 721 | lotus leaf wrapped rice 722 | feel 723 | interior 724 | sashimi plate 725 | servants 726 | noise 727 | chicken dish 728 | Bartender 729 | prix fixe lunch 730 | buffet 731 | hot bagel 732 | pepper powder 733 | glass of Sangria 734 | table by the window 735 | jalapeno 736 | pad see ew 737 | back patio 738 | Guacamole+shrimp appetizer 739 | filet 740 | frites 741 | lunch special 742 | The chicken pot pie 743 | cheeseburger 744 | spicy ethnic foods 745 | snacking 746 | SEASONAL beer 747 | course 748 | cook 749 | Yellowtail 750 | clams oreganta 751 | salad with perfectly marinated cucumbers and tomatoes with lots of shrimp and basil 752 | lentil dish 753 | basmati rice dish 754 | cigar bar 755 | ambient 756 | cheescake 757 | Bill 758 | entertaining 759 | Fluke sashimi 760 | jalapeno-lime olive oil 761 | fruit of the oil 762 | steak au poivre 763 | congee 764 | donut like deep fried dough they call Ow Ley Soh 765 | candle-light 766 | food-quality 767 | ethnic food 768 | Ambience 769 | AC 770 | cooks 771 | sushi rolls 772 | dinner menu to sit 773 | take out 774 | Scallion Pancake 775 | vegetable juice 776 | Guizhou Chicken 777 | Shredded Squid Family Style 778 | Sichuan Spicy Soft Shell Crab 779 | Shuizhu Fish 780 | Sichuan food 781 | spinach mushroom calzone 782 | canned vegetables 783 | Blue Point oysters 784 | Green Curry dish 785 | atmoshere 786 | spicy tuna roll 787 | rock shrimp tempura 788 | White Chocolate Bread Pudding with Gelato and hot chocolate 789 | banana tower 790 | icing on the cake 791 | Wine list 792 | french cuisine 793 | Drinks 794 | vegetarian entree 795 | Abby's treasure 796 | wintermelon 797 | assortment of fresh mushrooms and vegetables 798 | Steak au Poivre 799 | Onglet 800 | reputation 801 | spinach and corn dumplings 802 | bathroom 803 | massamman curry 804 | chow fun and chow see 805 | pita 806 | oysters 807 | grilled cheese 808 | doors 809 | good 810 | terrace 811 | Japanese cuisine 812 | Jazz bands 813 | halibut special 814 | japanese comfort food 815 | half price sushi deal 816 | appetizer platter 817 | dress codes 818 | attitudes 819 | pieces of sushi 820 | jewish deli food 821 | Neapolitan pizza 822 | zucchini blossoms 823 | Wait staff 824 | slice 825 | Vietnamese classics 826 | wait time 827 | fish tanks 828 | noodle soup dishes 829 | spaghetti with Scallops and Shrimp 830 | spinach ravioli in a light oil and garlic sauce 831 | Chicken Teriyaki dish 832 | Pam's special fried fish 833 | New England Chowder 834 | Lobster Bisque 835 | wines by the glass 836 | duck breast special 837 | french indian fusion 838 | fried shrimp 839 | cheese fondue 840 | green curry 841 | currys (masaman, green, red) 842 | Gigondas 843 | house champagne 844 | yellowfun tuna 845 | mussel selection 846 | scenery 847 | dinner reservations 848 | Cheese plate 849 | glasses of water 850 | Godmother pizza (a sort of traditional flat pizza with an olive oil-brushed crust and less tomato sauce than usual) 851 | sandwich 852 | plastic forks 853 | hummus platter 854 | cocktails 855 | apple tarte tatin 856 | selection 857 | chocolate 858 | SHOWS 859 | ACTORS 860 | New York Bagel 861 | miso soup 862 | filets 863 | Prix Fixe 864 | quail 865 | zucchini 866 | mashed potatoes 867 | butter 868 | Spicy Scallop roll 869 | Indian dining experience 870 | Pub atmosphere 871 | workers 872 | mixed drinks 873 | snack foods 874 | chocolate mud cake (warmed) 875 | dulce de leche gelato 876 | stauff 877 | classical furniture 878 | cole slaw 879 | knish 880 | blond wood decor 881 | eggs 882 | chicken tikka-masala 883 | menu description 884 | Pad Se-Ew 885 | Chicken with Cashew Nuts 886 | French bistro fare 887 | Shabu-Shabu 888 | people with carts of food 889 | crackling calamari salad 890 | sushi place 891 | group dinner 892 | Thai ice tea 893 | guy 894 | roofdeck 895 | sesame crusted Salmon 896 | chicken with chili and lemon grass 897 | lamb chops 898 | Chicken Tikka Masala 899 | plain slice 900 | takeout 901 | live jazz band 902 | Lobster Cobb Salad 903 | tea room 904 | apppetizers 905 | chips 906 | side 907 | dim sum atmosphere 908 | selection of wine 909 | Salads 910 | tamarind duck 911 | noodles with ground beef 912 | Thai flavors 913 | black cod with yuzu sauce 914 | sicilian 915 | Downstairs lounge 916 | menu prices 917 | portion size 918 | busboy 919 | water and wine glasses 920 | prix fixe meal 921 | night scene 922 | expresso 923 | actors 924 | hot sauce 925 | thai popcorn 926 | parathas 927 | kebabs 928 | tuna 929 | wasabe potatoes 930 | coat check girls 931 | cannoli 932 | after dinner drink 933 | prix fixe pricing 934 | area 935 | tanks 936 | Halibut 937 | cream cheese 938 | chicken casserole 939 | mezzanine 940 | Valentines Day dinner 941 | jelly fish 942 | drunken chicken 943 | soupy dumplings 944 | stir fry blue crab 945 | dumpling menu 946 | aesthetics 947 | box wine 948 | sushi chef 949 | atmoshpere 950 | sashimi 951 | Italian cheese 952 | thin-crust pizza 953 | shrimp appetizer 954 | lamb vindaloo 955 | dinners 956 | spices 957 | onions 958 | roti 959 | potato pancakes 960 | fried dumpling 961 | primi 962 | secondi 963 | dining area 964 | crab cakes 965 | Sichuan chef 966 | nori 967 | patio 968 | Pastrami 969 | Butter 970 | white wine 971 | private room 972 | aisle 973 | pre-fixe menu 974 | choices per course 975 | ordering a la carte 976 | Pakistani food 977 | eggplant parmesan 978 | baked ziti with meatsauce 979 | hot pot with seafood 980 | shrimp 981 | crabmeat 982 | kielbasa 983 | apples 984 | curry 985 | white sauce 986 | paratha bread 987 | stuffing 988 | tandoori 989 | delivery time 990 | Thai noodles with shrimp and chicken and coconut juice 991 | portion sizes 992 | Kosher dills 993 | Kamikaze 994 | Rolls 995 | mayonaisse 996 | food runners 997 | braised lamb shank in red wine 998 | special 999 | iced tea 1000 | servings 1001 | lettuce 1002 | Obv caviar 1003 | dining experiences 1004 | santa fe chopped salad 1005 | Tuna roll 1006 | comfort food 1007 | technique 1008 | pastrami sandwich 1009 | atomosphere 1010 | Indian Fast Food 1011 | italian food 1012 | main entree 1013 | cold udon 1014 | snapple 1015 | apps 1016 | dosas 1017 | popcorn topping 1018 | scene 1019 | customers 1020 | ladies 1021 | people serving 1022 | fillings 1023 | dosa batter 1024 | pre-theatre or after-theatre drinks 1025 | mussaman curry 1026 | fried tofu 1027 | potato 1028 | Gnochi 1029 | lunch buffet 1030 | serve 1031 | cokes 1032 | drumsticks over rice 1033 | sour spicy soup 1034 | whitefish salad 1035 | whitefish 1036 | mayo 1037 | chicken parm 1038 | chicken with portobello mushrooms 1039 | tomato sauce 1040 | counter service 1041 | staff member 1042 | upstairs 1043 | red curry 1044 | pad thai 1045 | Saul 1046 | Quality 1047 | pad penang 1048 | walls 1049 | hanger steak 1050 | lunch food 1051 | quasi-thai 1052 | barebecued salmon 1053 | chai tea 1054 | Spicy Tuna hand rolls 1055 | meat dishes 1056 | brassiere food 1057 | Manager 1058 | prix fix 1059 | Thali 1060 | chef app 1061 | delicate butternut squash ravioli in a delicious truffle sauce 1062 | buttery and tender langostine entree 1063 | Cakebread Cabernet 1064 | pastries 1065 | makhani 1066 | korma 1067 | foie gras terrine with figs 1068 | duck confit 1069 | Gulab Jamun (dessert) 1070 | freshness 1071 | brioche and lollies 1072 | 2-person table 1073 | mussels in spicy tomato sauce 1074 | case of snapple 1075 | lamb dishes 1076 | care 1077 | dimsum 1078 | sushimi cucumber roll 1079 | salt 1080 | potato chips 1081 | pasta entre'es 1082 | customer service 1083 | oyster 1084 | maitre d' 1085 | ala carte 1086 | Pad See Ew 1087 | Pork Chops 1088 | Tofu plates 1089 | steak with portobello mushrooms 1090 | surroundings 1091 | dinner menu 1092 | CUISINE 1093 | rush 1094 | duck noodles 1095 | martinis 1096 | avocado 1097 | drink refills 1098 | business dinner 1099 | Meal 1100 | LOBSTER TAILS 1101 | espresso cup filled with chocolate mousse 1102 | Pho 1103 | Lemon grass chicken 1104 | Beef Cube on rice 1105 | portraits 1106 | appitizers 1107 | curry sauce 1108 | deli 1109 | pork buns 1110 | Maine Lobster 1111 | Dessert 1112 | Ingredients 1113 | ingrediants 1114 | backyard dining area 1115 | RICE 1116 | chocolate cake 1117 | vanilla gelato (with espresso) 1118 | dining rooms 1119 | tiramisu chocolate cake 1120 | decore 1121 | cater 1122 | mango chicken 1123 | presentation 1124 | garlic mashed potatoes 1125 | lamb meat 1126 | red wine 1127 | hot tea 1128 | outside 1129 | Lassi 1130 | Taxan 1131 | Southern Indian cuisine 1132 | Staffs 1133 | jukebox 1134 | Margarita 1135 | sake list 1136 | Purple Haze 1137 | Steak 1138 | Japanese classic cuisine 1139 | spinach 1140 | Shanghai low mein 1141 | Owner 1142 | dim sum orders 1143 | wine menu 1144 | horedevous 1145 | hong-kong styled milk 1146 | tea with tapioca pearls (hot) 1147 | assortment of fish 1148 | Fatty Yellow Tail 1149 | Boton Shrimp 1150 | Sea Eel 1151 | Sea Urchin 1152 | Blue Fin Torro (Fatty Tuna) 1153 | waterbugs 1154 | veal in carozza chicken saltimbocca 1155 | chutneys 1156 | mixed drink special 1157 | waiting area 1158 | BBQ ribs 1159 | Argentinian Pizza 1160 | Pelligrino 1161 | blue cheese 1162 | kamasutra 1163 | bombay cosmopolitan 1164 | date spot 1165 | Seating 1166 | botle of wine 1167 | steak dish 1168 | food spot 1169 | somosas 1170 | chai 1171 | chole 1172 | dhosas 1173 | dhal 1174 | palak paneer 1175 | malai kofta 1176 | spring rolls 1177 | cod with pineapple tempura 1178 | bottle 1179 | topping 1180 | qualities 1181 | dress code 1182 | candles 1183 | surrounding 1184 | pictures 1185 | Reuben sandwich 1186 | strawberry daiquiries 1187 | wait service 1188 | Frites 1189 | zen feel 1190 | egg noodles in the beef broth with shrimp dumplings and slices of BBQ roast pork 1191 | rice congee soup 1192 | summer rolls 1193 | Spreads 1194 | Indoor 1195 | mimosas 1196 | cafe 1197 | outdoor chairs 1198 | walnuts 1199 | tom yum soup 1200 | Tuscan cuisine 1201 | Neapolitan fare 1202 | lobster teriyaki 1203 | rose special roll 1204 | glass of beer 1205 | service staff 1206 | delivery guys 1207 | Tamarind Margaritas 1208 | filet mignon dish 1209 | pastrami on challah sandwich 1210 | knishes 1211 | mahi mahi 1212 | saffron risotto 1213 | Go Go Hamburgers 1214 | sandwhiches 1215 | Thia food 1216 | bars 1217 | pre-theatre menu 1218 | stomach 1219 | wallet 1220 | SERVICE 1221 | FOOD QUALITY 1222 | PRICES 1223 | line 1224 | dinner specials 1225 | fondue 1226 | table grilling 1227 | exotic salad 1228 | green salad 1229 | sweet basil fried tofu 1230 | peanut sauce 1231 | TOASTING 1232 | garden 1233 | lamb sandwhich 1234 | dim sum dish 1235 | Traditional French decour 1236 | hall 1237 | wasabi 1238 | sea salt 1239 | burgers 1240 | cheeseburgers 1241 | Unda (Egg) rolls 1242 | wine choices 1243 | Jazz 1244 | lazy susans 1245 | pork loin 1246 | discount 1247 | Edamame pureed 1248 | vegetarian dishes 1249 | non-veg selections 1250 | shrimp fritters 1251 | spot lights 1252 | Times Square cocktail 1253 | ginger lemonade with vodka 1254 | Veal Parmigana 1255 | air conditioning 1256 | sichuan cooking 1257 | chongqing hotpot 1258 | waiter traffic 1259 | dinner plates 1260 | Sake collection 1261 | Nanbu Bijin 1262 | chicken in curry sauce 1263 | salmon caserole 1264 | Angry Lobster 1265 | cold lobster salad 1266 | crab-cake eggs benedict 1267 | bathrooms 1268 | Deliveries 1269 | chicken with garlic sauce 1270 | chicken with black bean sauce 1271 | hunan chicken 1272 | take-out pies 1273 | bannan fritter 1274 | Appetizers 1275 | mascarpone with chocolate chips 1276 | caprese salad appetizer 1277 | Half-Price Saturday Night Special 1278 | basic dishes 1279 | nosh (pastrami sandwich) 1280 | Christmas dinner 1281 | dinosaur rolls 1282 | white tuna sashimi 1283 | Gnocchi 1284 | bottle of sake 1285 | edamames 1286 | sushi plate 1287 | cheese sticks 1288 | pot of boiling water 1289 | glass noodles 1290 | -------------------------------------------------------------------------------- /lexcion/restaurant15_dict.csv: -------------------------------------------------------------------------------- 1 | place 2 | staff 3 | NULL 4 | food 5 | portions 6 | Saul 7 | foie gras terrine with figs 8 | duck confit 9 | wine list 10 | restaurant 11 | cart attendant 12 | Food 13 | Chow fun 14 | pork shu mai 15 | Fish 16 | Service 17 | oysters 18 | Restaurant Saul 19 | service 20 | decor 21 | duck breast special 22 | Thai fusion stuff 23 | Grilled Chicken special with Edamame Puree 24 | Edamame pureed 25 | vent 26 | sake list 27 | spicy tuna roll 28 | rock shrimp tempura 29 | pink pony 30 | spot 31 | Ambiance 32 | waiter 33 | sea urchin 34 | sushi 35 | rice to fish ration 36 | half price sushi deal 37 | crowd 38 | Prix Fixe menu 39 | dishes 40 | somosas 41 | chai 42 | chole 43 | dhosas 44 | dhal 45 | kitchen 46 | ambience 47 | view 48 | lava cake dessert 49 | Cosette 50 | French Onion soup 51 | desserts 52 | pizza 53 | cheese 54 | ingredients 55 | crust 56 | meals 57 | seafood 58 | menu 59 | Pastrami 60 | portion 61 | fried shrimp 62 | signs 63 | specials menus 64 | waitstaff 65 | Leon 66 | specials 67 | atmosphere 68 | French bistro fare 69 | wine 70 | Zucchero Pomodori 71 | Gnocchi 72 | hostess 73 | wait 74 | glass of wine 75 | people 76 | atmoshpere 77 | lunch 78 | Sauce 79 | waitress 80 | tuna of gari 81 | thai food 82 | Planet Thailand 83 | rolls 84 | sashimi 85 | crunchy tuna 86 | garden terrace 87 | Steak Tartare 88 | open kitchen 89 | wine selection 90 | Gigondas 91 | house champagne 92 | vibe 93 | French food 94 | VT's 95 | neighborhood 96 | setting 97 | Mizu 98 | Spicy Scallop roll 99 | Moules 100 | lobster ravioli 101 | maitre d' 102 | dessert 103 | drinks 104 | chef's specials 105 | Downstairs lounge 106 | Raga's 107 | exotic food 108 | bar 109 | beers 110 | wines 111 | Jekyll and Hyde 112 | shows 113 | actors 114 | server 115 | tuna 116 | wasabe potatoes 117 | outdoor atmosphere 118 | fresh mozzarella 119 | appetizer selection 120 | Wait staff 121 | pie 122 | salad 123 | dining 124 | chicken pot pie 125 | cheeseburger 126 | bagels 127 | 1st Ave spot 128 | Uni Hand roll 129 | sake menu 130 | lobster teriyaki 131 | rose special roll 132 | pork belly 133 | Wine list selection 134 | wine-by-the-glass 135 | Traditional French decour 136 | hall 137 | Cafe Spice 138 | Seating 139 | raw vegatables in side orders 140 | semi-private boths 141 | Red Eye 142 | live jazz band 143 | meal 144 | Red Eye Grill 145 | balance of herbs and tomatoes 146 | Jekyll and hyde Pub 147 | hidden bathrooms 148 | Jekyll and Hyde Pub 149 | pumkin tortelini 150 | entertainment 151 | bagel 152 | lobster roll 153 | lobster 154 | santa fe chopped salad 155 | fish and chips 156 | chow fun and chow see 157 | scallion pancakes 158 | fried dumplings 159 | pad penang 160 | fresh restaurant 161 | chef 162 | salads 163 | Pam's special fried fish 164 | Ambience 165 | Salads 166 | Ingredients 167 | spicy Tuna roll 168 | Yellowtail 169 | all you can eat deal 170 | Big Wong 171 | waiters 172 | congee 173 | noodles 174 | rice dishes 175 | takeout 176 | Caesar Salad 177 | arugula and goat cheese 178 | pasta dish 179 | tiramisu chocolate cake 180 | Manager 181 | good 182 | Patis 183 | Pastis 184 | raddichio 185 | mushroom pizza 186 | sangria 187 | Bombay beer 188 | Pizza 189 | homemade pasta 190 | hanger steak 191 | filet mignon dish 192 | ambient 193 | beef and noodle soup dishes 194 | backyard dining area 195 | outdoor restaurants 196 | tables 197 | Teodora 198 | rosemary or orange flavoring 199 | wait staff 200 | expresso 201 | Myagi 202 | fish 203 | tuna tartar appetizer 204 | dining room 205 | ambiance 206 | Lobster Bisque 207 | New England Chowder 208 | Prime Rib 209 | bottles of Korbett 210 | chicken vindaloo 211 | selection of wines 212 | Chef's tasting menu 213 | prixe fixe tasting menu 214 | lemon salad 215 | grilled branzino 216 | Prune 217 | characters 218 | Jeckll and Hydes 219 | Drinks 220 | Delivery 221 | seating 222 | cream cheeses 223 | lox 224 | resturant 225 | Shabu-Shabu Restaurant 226 | feel 227 | Shabu-Shabu 228 | owner 229 | Taxan 230 | location 231 | green curry with vegetables 232 | quantity 233 | ravioli 234 | trattoria 235 | regular menu-fare 236 | parmesean porcini souffle 237 | lamb glazed with balsamic vinegar 238 | candle-light 239 | marinara/arrabiatta sauce 240 | mozzarella en Carozza 241 | back room 242 | Emilio 243 | wine choices 244 | pepperoni 245 | family style salad 246 | Amma 247 | vegetarian dishes 248 | non-veg selections 249 | Decor 250 | sea bass 251 | Dal Bukhara 252 | kababs 253 | Haru on Park S 254 | rice 255 | all-u-can-eat sushi 256 | soy sauce 257 | waitstaffs 258 | Roth's 259 | dinner 260 | eggs benedict 261 | room 262 | Planet Thai 263 | svc 264 | Pad Thai 265 | sandwiches 266 | Chennai Garden 267 | Indian 268 | MEAT dishes 269 | seats 270 | cigar bar 271 | portion sizes 272 | PLACE 273 | Waitstaff 274 | YUKA 275 | Mermaid Inn 276 | lobster sandwich 277 | spaghetti with Scallops and Shrimp 278 | halibut special 279 | steak 280 | foods 281 | jelly fish 282 | drunken chicken 283 | soupy dumplings 284 | stir fry blue crab 285 | Cheese plate 286 | bruschettas 287 | paninis 288 | tramezzinis 289 | asparagus, truffle oil, parmesan bruschetta 290 | Wine list 291 | Cafe Noir 292 | manager 293 | calamari 294 | thai cuisine 295 | customer service 296 | gentleman 297 | mileau 298 | Casimir 299 | outside table 300 | unisex bathroom 301 | caviar 302 | salmon dish 303 | Change Mojito 304 | dim sum 305 | back patio 306 | music 307 | buffet 308 | cheesecake 309 | pastries 310 | cheeseburgers 311 | burgers 312 | pastrami sandwich on a roll 313 | spice 314 | Tom Kha soup 315 | Thai 316 | pesto pizza 317 | spicy Italian cheese 318 | back garden sitting area 319 | french fries 320 | scallops 321 | sauce 322 | martinis 323 | japanese comfort food 324 | lamb sausages 325 | sardines with biscuits 326 | large whole shrimp 327 | pistachio ice cream 328 | delivery 329 | ceviche mix (special) 330 | crab dumplings 331 | assorted sashimi 332 | two types of sake 333 | banana tempura 334 | Thai food 335 | Thai restaurant 336 | Gulab Jamun (dessert) 337 | Guacamole+shrimp appetizer 338 | filet 339 | frites 340 | pizza place 341 | mare 342 | pizzeria 343 | pizzas 344 | Sophia pizza 345 | blond wood decor 346 | premium sake 347 | kitchen food 348 | Sushi 349 | cuisine 350 | Thalia 351 | smoked salmon and roe appetizer 352 | entree 353 | strawberry daiquiries 354 | Taiwanese food 355 | cold appetizer dishes 356 | mahi mahi 357 | saffron risotto 358 | chicken and mashed potatos 359 | crab cakes 360 | garden 361 | joint 362 | selection of thin crust pizza 363 | Basil slice 364 | calzones 365 | counter service 366 | PIZZA 33 367 | dosas 368 | clerks 369 | Italian food 370 | basic dishes 371 | apppetizers 372 | sushimi cucumber roll 373 | spreads 374 | beverage selections 375 | Pizza 33 376 | Williamsburg spot 377 | proprietor 378 | coffee 379 | Bagels 380 | turkey burgers 381 | soup for the udon 382 | Japanese cuisine 383 | bottle 384 | Margheritta slice 385 | atmoshere 386 | Winnie 387 | appetizer menu 388 | brioche and lollies 389 | salmon 390 | crab salad 391 | Lucky Strike 392 | mussels in spicy tomato sauce 393 | fries 394 | late night atmosphere 395 | martini 396 | Vanilla Shanty 397 | in-house lady DJ 398 | Suan 399 | noodles with shrimp and chicken and coconut juice 400 | terrace 401 | Indian food 402 | balsamic vinegar over icecream 403 | Go Go Hamburgers 404 | turnip cake 405 | roast pork buns 406 | egg custards 407 | braised lamb shank in red wine 408 | interior decor 409 | pad se ew chicken 410 | pad thai 411 | Ginger House 412 | Chinese restaurant 413 | management 414 | Spreads 415 | toppings 416 | Rao 417 | indian cuisine 418 | shrimp appetizers 419 | eats 420 | indian food 421 | cheff 422 | baked clams octopus 423 | lamb 424 | bar scene 425 | Appetizers 426 | potato stuff kanish 427 | chicken 428 | servings for main entree 429 | Dessert 430 | veal 431 | anti-pasta 432 | pasta mains 433 | measures of liquers 434 | SEASONAL beer 435 | Heartland Brewery 436 | beer 437 | shrimp scampi 438 | antipasti 439 | Corona 440 | atomosphere 441 | porcini mushroom pasta special 442 | seafood tagliatelle 443 | tiramisu 444 | BBQ ribs 445 | congee (rice porridge) 446 | main dining room 447 | ceiling 448 | patio 449 | hot sauce 450 | drink 451 | cheescake 452 | outdoor seating 453 | chicken casserole 454 | beef 455 | lamb dishes 456 | Reuben sandwich 457 | Bloom's 458 | sauces 459 | Ravioli 460 | wines by the glass 461 | setting/atmosphere 462 | Pakistani food 463 | People 464 | Faan 465 | design 466 | bottles of wine 467 | mussles 468 | seabass 469 | goat cheese salad 470 | penne w/ chicken 471 | desert 472 | jukebox 473 | pastas 474 | Usha 475 | sassy lassi 476 | roti rolls 477 | Unda (Egg) rolls 478 | spices 479 | onions 480 | eggs 481 | roti 482 | Toons 483 | drumsticks over rice 484 | sour spicy soup 485 | Beef noodle soup 486 | Rao's 487 | Indoor 488 | Staff 489 | dumplings 490 | $10 10-piece dim sum combo 491 | crabmeat lasagna 492 | chocolate bread pudding 493 | egg noodles in the beef broth with shrimp dumplings and slices of BBQ roast pork 494 | dish 495 | Ow Ley Soh 496 | Chinese food 497 | jazz duo 498 | wine by the glass 499 | Japanese Tapas 500 | Atmosphere 501 | Yakitori (bbq meats) 502 | sushi chef 503 | Rice Avenue 504 | wait-staff 505 | Baluchi's 506 | nigiri 507 | all you can eat sushi 508 | Areo 509 | servers 510 | view of the new york city skiline 511 | table by the window 512 | Personal pans 513 | delivery guys 514 | scene 515 | penne a la vodka 516 | pasta penne 517 | La Rosa 518 | selection 519 | mussels 520 | host 521 | crew 522 | Dining Garden 523 | Jazz Bar 524 | Thin Crust Pizzas 525 | Lasagna Menu 526 | BBQ Salmon 527 | Sea Bass 528 | Crispy Duck 529 | -------------------------------------------------------------------------------- /lexcion/restaurant16_dict.csv: -------------------------------------------------------------------------------- 1 | place 2 | staff 3 | NULL 4 | food 5 | portions 6 | Saul 7 | foie gras terrine with figs 8 | duck confit 9 | wine list 10 | restaurant 11 | cart attendant 12 | Food 13 | Chow fun 14 | pork shu mai 15 | Fish 16 | Service 17 | oysters 18 | Restaurant Saul 19 | service 20 | decor 21 | duck breast special 22 | Thai fusion stuff 23 | Grilled Chicken special with Edamame Puree 24 | Edamame pureed 25 | vent 26 | sake list 27 | spicy tuna roll 28 | rock shrimp tempura 29 | pink pony 30 | spot 31 | Ambiance 32 | waiter 33 | sea urchin 34 | sushi 35 | rice to fish ration 36 | half price sushi deal 37 | crowd 38 | Prix Fixe menu 39 | dishes 40 | somosas 41 | chai 42 | chole 43 | dhosas 44 | dhal 45 | kitchen 46 | ambience 47 | view 48 | lava cake dessert 49 | Cosette 50 | French Onion soup 51 | desserts 52 | pizza 53 | cheese 54 | ingredients 55 | crust 56 | meals 57 | seafood 58 | menu 59 | Pastrami 60 | portion 61 | fried shrimp 62 | signs 63 | specials menus 64 | waitstaff 65 | Leon 66 | specials 67 | atmosphere 68 | French bistro fare 69 | wine 70 | Zucchero Pomodori 71 | Gnocchi 72 | hostess 73 | wait 74 | glass of wine 75 | people 76 | atmoshpere 77 | toppings 78 | lunch 79 | Sauce 80 | waitress 81 | block 82 | tuna of gari 83 | thai food 84 | Planet Thailand 85 | rolls 86 | sashimi 87 | crunchy tuna 88 | garden terrace 89 | Steak Tartare 90 | open kitchen 91 | wine selection 92 | Gigondas 93 | house champagne 94 | vibe 95 | French food 96 | VT's 97 | neighborhood 98 | setting 99 | Mizu 100 | Spicy Scallop roll 101 | Moules 102 | lobster ravioli 103 | maitre d' 104 | dessert 105 | drinks 106 | chef's specials 107 | Downstairs lounge 108 | Raga's 109 | exotic food 110 | bar 111 | beers 112 | wines 113 | Jekyll and Hyde 114 | shows 115 | actors 116 | server 117 | tuna 118 | wasabe potatoes 119 | outdoor atmosphere 120 | fresh mozzarella 121 | appetizer selection 122 | Wait staff 123 | pie 124 | salad 125 | dining 126 | chicken pot pie 127 | cheeseburger 128 | bagels 129 | 1st Ave spot 130 | Uni Hand roll 131 | sake menu 132 | lobster teriyaki 133 | rose special roll 134 | pork belly 135 | Wine list selection 136 | wine-by-the-glass 137 | Traditional French decour 138 | hall 139 | Cafe Spice 140 | Seating 141 | raw vegatables in side orders 142 | semi-private boths 143 | Red Eye 144 | live jazz band 145 | meal 146 | Red Eye Grill 147 | balance of herbs and tomatoes 148 | Jekyll and hyde Pub 149 | hidden bathrooms 150 | Jekyll and Hyde Pub 151 | pumkin tortelini 152 | bagel 153 | lobster roll 154 | lobster 155 | santa fe chopped salad 156 | fish and chips 157 | chow fun and chow see 158 | scallion pancakes 159 | fried dumplings 160 | pad penang 161 | fresh restaurant 162 | chef 163 | salads 164 | Pam's special fried fish 165 | Ambience 166 | Salads 167 | Ingredients 168 | spicy Tuna roll 169 | Yellowtail 170 | all you can eat deal 171 | Big Wong 172 | waiters 173 | congee 174 | noodles 175 | rice dishes 176 | takeout 177 | Caesar Salad 178 | arugula and goat cheese 179 | pasta dish 180 | tiramisu 181 | chocolate cake 182 | Manager 183 | good 184 | Patis 185 | Pastis 186 | raddichio 187 | mushroom pizza 188 | sangria 189 | Bombay beer 190 | Pizza 191 | homemade pasta 192 | hanger steak 193 | filet mignon dish 194 | ambient 195 | beef and noodle soup dishes 196 | backyard dining area 197 | tables 198 | Teodora 199 | rosemary or orange flavoring 200 | wait staff 201 | expresso 202 | Myagi 203 | fish 204 | tuna tartar appetizer 205 | dining room 206 | ambiance 207 | Lobster Bisque 208 | New England Chowder 209 | Prime Rib 210 | bottles of Korbett 211 | chicken vindaloo 212 | servers 213 | selection of wines 214 | Chef's tasting menu 215 | prixe fixe tasting menu 216 | lemon salad 217 | grilled branzino 218 | Prune 219 | characters 220 | Jeckll and Hydes 221 | Drinks 222 | iced tea 223 | Delivery 224 | seating 225 | cream cheeses 226 | lox 227 | resturant 228 | Shabu-Shabu Restaurant 229 | feel 230 | Shabu-Shabu 231 | owner 232 | Taxan 233 | location 234 | green curry with vegetables 235 | ravioli 236 | trattoria 237 | regular menu-fare 238 | parmesean porcini souffle 239 | lamb glazed with balsamic vinegar 240 | candle-light 241 | marinara/arrabiatta sauce 242 | mozzarella en Carozza 243 | back room 244 | Emilio 245 | wine choices 246 | pepperoni 247 | family style salad 248 | Amma 249 | vegetarian dishes 250 | non-veg selections 251 | Decor 252 | sea bass 253 | Dal Bukhara 254 | kababs 255 | Haru on Park S 256 | rice 257 | all-u-can-eat sushi 258 | soy sauce 259 | waitstaffs 260 | Roth's 261 | dinner 262 | eggs benedict 263 | room 264 | Planet Thai 265 | svc 266 | Pad Thai 267 | sandwiches 268 | Chennai Garden 269 | Indian 270 | seats 271 | cigar bar 272 | portion sizes 273 | PLACE 274 | Waitstaff 275 | YUKA 276 | Mermaid Inn 277 | lobster sandwich 278 | spaghetti with Scallops and Shrimp 279 | halibut special 280 | steak 281 | foods 282 | jelly fish 283 | drunken chicken 284 | soupy dumplings 285 | stir fry blue crab 286 | Cheese plate 287 | bruschettas 288 | paninis 289 | tramezzinis 290 | asparagus, truffle oil, parmesan bruschetta 291 | Wine list 292 | Cafe Noir 293 | manager 294 | calamari 295 | thai cuisine 296 | customer service 297 | gentleman 298 | mileau 299 | Casimir 300 | outside table 301 | unisex bathroom 302 | caviar 303 | salmon dish 304 | Change Mojito 305 | dim sum 306 | back patio 307 | music 308 | buffet 309 | cheesecake 310 | pastries 311 | cheeseburgers 312 | burgers 313 | pastrami sandwich on a roll 314 | spice 315 | Tom Kha soup 316 | Thai 317 | pesto pizza 318 | spicy Italian cheese 319 | back garden sitting area 320 | french fries 321 | scallops 322 | sauce 323 | martinis 324 | japanese comfort food 325 | lamb sausages 326 | sardines with biscuits 327 | large whole shrimp 328 | pistachio ice cream 329 | delivery 330 | ceviche mix (special) 331 | crab dumplings 332 | assorted sashimi 333 | sake 334 | banana tempura 335 | Thai food 336 | Thai restaurant 337 | Gulab Jamun (dessert) 338 | Guacamole+shrimp appetizer 339 | filet 340 | frites 341 | pizza place 342 | mare 343 | pizzeria 344 | pizzas 345 | Sophia pizza 346 | blond wood decor 347 | premium sake 348 | kitchen food 349 | Sushi 350 | cuisine 351 | Thalia 352 | smoked salmon and roe appetizer 353 | entree 354 | strawberry daiquiries 355 | Taiwanese food 356 | cold appetizer dishes 357 | mahi mahi 358 | saffron risotto 359 | chicken and mashed potatos 360 | crab cakes 361 | garden 362 | joint 363 | selection of thin crust pizza 364 | Basil slice 365 | calzones 366 | counter service 367 | PIZZA 33 368 | dosas 369 | clerks 370 | Italian food 371 | basic dishes 372 | apppetizers 373 | sushimi cucumber roll 374 | spreads 375 | beverage selections 376 | Pizza 33 377 | Williamsburg spot 378 | proprietor 379 | coffee 380 | Bagels 381 | turkey burgers 382 | Japanese food 383 | soup for the udon 384 | Japanese cuisine 385 | bottle minimun 386 | Margheritta slice 387 | atmoshere 388 | Winnie 389 | appetizer menu 390 | brioche and lollies 391 | salmon 392 | crab salad 393 | Lucky Strike 394 | mussels in spicy tomato sauce 395 | fries 396 | late night atmosphere 397 | martini 398 | Vanilla Shanty 399 | in-house lady DJ 400 | Suan 401 | noodles with shrimp and chicken and coconut juice 402 | terrace 403 | Indian food 404 | balsamic vinegar over icecream 405 | Go Go Hamburgers 406 | turnip cake 407 | roast pork buns 408 | egg custards 409 | braised lamb shank in red wine 410 | interior decor 411 | pad se ew chicken 412 | pad thai 413 | Ginger House 414 | Chinese restaurant 415 | management 416 | Spreads 417 | Rao 418 | indian cuisine 419 | shrimp appetizers 420 | eats 421 | indian food 422 | cheff 423 | baked clams octopus 424 | lamb 425 | bar scene 426 | Appetizers 427 | potato stuff kanish 428 | chicken 429 | servings for main entree 430 | Dessert 431 | veal 432 | anti-pasta 433 | pasta mains 434 | measures of liquers 435 | SEASONAL beer 436 | Heartland Brewery 437 | beer 438 | shrimp scampi 439 | antipasti 440 | Corona 441 | atomosphere 442 | porcini mushroom pasta special 443 | seafood tagliatelle 444 | BBQ ribs 445 | congee (rice porridge) 446 | main dining room 447 | ceiling 448 | patio 449 | hot sauce 450 | drink 451 | cheescake 452 | outdoor seating 453 | selecion of wines 454 | chicken casserole 455 | beef 456 | lamb dishes 457 | Reuben sandwich 458 | Bloom's 459 | sauces 460 | Ravioli 461 | wines by the glass 462 | setting/atmosphere 463 | Pakistani food 464 | People 465 | Faan 466 | design 467 | bottles of wine 468 | mussles 469 | seabass 470 | goat cheese salad 471 | penne w/ chicken 472 | desert 473 | jukebox 474 | pastas 475 | Usha 476 | sassy lassi 477 | roti rolls 478 | Unda (Egg) rolls 479 | spices 480 | onions 481 | eggs 482 | roti 483 | Toons 484 | drumsticks over rice 485 | sour spicy soup 486 | Beef noodle soup 487 | Rao's 488 | Indoor 489 | Staff 490 | dumplings 491 | $10 10-piece dim sum combo 492 | crabmeat lasagna 493 | chocolate bread pudding 494 | egg noodles in the beef broth with shrimp dumplings and slices of BBQ roast pork 495 | dish 496 | Ow Ley Soh 497 | Chinese food 498 | jazz duo 499 | wine by the glass 500 | Japanese Tapas 501 | Atmosphere 502 | Yakitori (bbq meats) 503 | sushi chef 504 | Rice Avenue 505 | wait-staff 506 | Baluchi's 507 | nigiri 508 | all you can eat sushi 509 | Areo 510 | view of the new york city skiline 511 | Personal pans 512 | delivery guys 513 | scene 514 | penne a la vodka 515 | pasta penne 516 | La Rosa 517 | selection 518 | mussels 519 | Vittorio 520 | crew 521 | Dining Garden 522 | Jazz Bar 523 | Thin Crust Pizzas 524 | Lasagna Menu 525 | BBQ Salmon 526 | Sea Bass 527 | Crispy Duck 528 | Al Di La 529 | risottos 530 | sepia 531 | braised rabbit 532 | Dog 533 | dog 534 | establishment 535 | Cypriot restaurant 536 | pork souvlaki 537 | eggplant pizza 538 | half/half pizza 539 | interior 540 | space 541 | Delivery guy 542 | millennium roll 543 | views of the city 544 | Place 545 | seafood spaghetti 546 | indo-chinese food 547 | chicken pasta 548 | restaraunt 549 | vitello alla marsala 550 | mushrooms 551 | potato balls 552 | bartender 553 | waitresses 554 | japanese food 555 | Chicken teriyaki 556 | The Four Seasons 557 | Red Dragon Roll 558 | Yamato 559 | Seafood Dynamite 560 | back garden area 561 | Dancing, White River and Millenium rolls 562 | quesadilla 563 | guacamole 564 | margaritas 565 | Indian Food 566 | indian place 567 | drink menu 568 | Voss bottles of water 569 | view of river and NYC 570 | survice 571 | Egyptian restaurant 572 | baba ganoush 573 | belly dancers 574 | hookah 575 | Raymond 576 | Pacifico 577 | omlette for brunch 578 | spinach 579 | quacamole 580 | wings with chimmichuri 581 | chicken in the salads 582 | portobello and asparagus mole 583 | Indian Restaurant 584 | gyros 585 | gyro meat 586 | sausages 587 | Greek and Cypriot dishes 588 | gyro 589 | booths 590 | Water's Edge 591 | rooms 592 | views 593 | Paul 594 | bathroom 595 | bar drinks 596 | stuff tilapia 597 | mens bathroom 598 | bread 599 | appetizer of olives 600 | main course 601 | pear torte 602 | boths 603 | service button 604 | looks 605 | Greg 606 | runner 607 | bev 608 | dogs 609 | trimmings 610 | hot dog 611 | Casa La Femme 612 | cocktail with Citrus Vodka and lemon and lime juice and mint leaves 613 | belly dancing show 614 | mushroom sauce 615 | triple color and norwegetan rolls 616 | special roll 617 | regular roll 618 | banana chocolate dessert 619 | green tea tempura 620 | appetizers 621 | modern Japanese 622 | scheme of mirrors 623 | modern Japanese food 624 | mirrors 625 | Indo Chinese food 626 | Chinese style Indian food 627 | chicken lollipop 628 | Chilli Chicken 629 | vegetarian dish 630 | Patsy's Pizza 631 | hot dogs 632 | indian chinese food 633 | Indian Chinese 634 | Village Underground 635 | SERVICE 636 | DJ 637 | dinner for two 638 | environment 639 | Vanison 640 | Bison 641 | dessserts 642 | selection of bottled beer 643 | fried oysters and clams 644 | lobster knuckles 645 | "salt encrusted shrimp" appetizer 646 | Thai style Fried Sea Bass 647 | grilled Mahi Mahi 648 | chicken tikka masala 649 | Bukhara Grill 650 | lunch buffet 651 | Bukhara 652 | kimchee 653 | Korean fair 654 | four course prix fix menu 655 | white organza tent 656 | bibimbap 657 | stone bowl 658 | nakgi-bokum 659 | stir-fried squid 660 | side dishes 661 | risotto 662 | farro salad 663 | mashed yukon potatoes 664 | east village pizza 665 | margherita pizza 666 | slice of NYC pizza 667 | Restaurant 668 | modern Japanese brasserie 669 | unmarked wooden doors 670 | décor 671 | private booths 672 | glass ceilings 673 | BBE $29 fixe prix menu 674 | sashimi amuse bouche 675 | Grilled Black Cod 676 | Grilled Salmon dish 677 | sake’s 678 | frozen black sesame mousse 679 | matcha (powdered green tea) and blueberry cheesecake 680 | brasserie 681 | Zenkichi 682 | pita bread 683 | Shabu Shabu 684 | meat 685 | kimchi 686 | Korean food 687 | fusion twists 688 | pork belly tacos 689 | pork croquette sandwich 690 | bun 691 | family seafood entree 692 | main entree 693 | mirrored walls 694 | japanese tapas 695 | sitting space 696 | appetizer 697 | fit-out 698 | furnishings 699 | palce 700 | STAFF 701 | pita 702 | hummus 703 | grilled octopus 704 | eggplant 705 | Hot Dogs 706 | Bark 707 | Slamwich 708 | front of house staff 709 | fish dishes 710 | BFC 711 | girl 712 | Maitre-D 713 | maitre-D 714 | bottle of wine 715 | frontman 716 | Mussles 717 | Lamb special 718 | flank steak 719 | The Four Seasons restaurant 720 | fish tacos 721 | fire place 722 | Casa la Femme 723 | pasta 724 | -------------------------------------------------------------------------------- /loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import codecs 4 | 5 | 6 | from data_utils import create_dico, create_mapping, zero_digits 7 | from data_utils import iob2, iob_iobes, get_pos_ids, get_dep_ids, get_lexcion_features, load_lexcion 8 | 9 | 10 | def load_sentences(path, lower, zeros): 11 | """ 12 | Load sentences. A line must contain at least a word and its tag. 13 | Sentences are separated by empty lines. 14 | """ 15 | sentences = [] 16 | sentence = [] 17 | num = 0 18 | for line in codecs.open(path, 'r', 'utf8'): 19 | num+=1 20 | line = zero_digits(line.rstrip()) if zeros else line.rstrip() 21 | # print(list(line)) 22 | if not line: 23 | if len(sentence) > 0: 24 | if 'DOCSTART' not in sentence[0][0]: 25 | sentences.append(sentence) 26 | sentence = [] 27 | else: 28 | if line[0] == " ": 29 | line = "$" + line[1:] 30 | word = line.split() 31 | # word[0] = " " 32 | else: 33 | word= line.split() 34 | assert len(word) >= 2, print([word[0]]) 35 | sentence.append(word) 36 | if len(sentence) > 0: 37 | if 'DOCSTART' not in sentence[0][0]: 38 | sentences.append(sentence) 39 | return sentences 40 | 41 | 42 | def update_tag_scheme(sentences, tag_scheme): 43 | """ 44 | Check and update sentences tagging scheme to IOB2. 45 | Only IOB1 and IOB2 schemes are accepted. 46 | """ 47 | 48 | for i, s in enumerate(sentences): 49 | tags = [w[-1] for w in s] 50 | # Check that tags are given in the IOB format 51 | if not iob2(tags): 52 | s_str = '\n'.join(' '.join(w) for w in s) 53 | raise Exception('Sentences should be given in IOB format! ' + 54 | 'Please check sentence %i:\n%s' % (i, s_str)) 55 | if tag_scheme == 'iob': 56 | # If format was IOB1, we convert to IOB2 57 | for word, new_tag in zip(s, tags): 58 | word[-1] = new_tag 59 | elif tag_scheme == 'iobes': 60 | new_tags = iob_iobes(tags) 61 | for word, new_tag in zip(s, new_tags): 62 | word[-1] = new_tag 63 | else: 64 | raise Exception('Unknown tagging scheme!') 65 | 66 | 67 | def char_mapping(sentences, lower): 68 | """ 69 | Create a dictionary and a mapping of words, sorted by frequency. 70 | """ 71 | chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences] 72 | #print(chars) 73 | dico = create_dico(chars) 74 | dico[""] = 10000001 75 | dico[''] = 10000000 76 | #print(dico) 77 | char_to_id, id_to_char = create_mapping(dico) 78 | #print("id_to_char", id_to_char) 79 | # print("char_to_id", char_to_id) 80 | print("Found %i unique words (%i in total)" % ( 81 | len(dico), sum(len(x) for x in chars) 82 | )) 83 | return dico, char_to_id, id_to_char 84 | 85 | 86 | def tag_mapping(sentences): 87 | """ 88 | Create a dictionary and a mapping of tags, sorted by frequency. 89 | """ 90 | tags = [[char[-1] for char in s] for s in sentences] 91 | dico = create_dico(tags) 92 | tag_to_id, id_to_tag = create_mapping(dico) 93 | print("Found %i unique named entity tags" % len(dico)) 94 | return dico, tag_to_id, id_to_tag 95 | 96 | 97 | def prepare_dataset(sentences, char_to_id, tag_to_id, l_sorted_lexcion, lower=False, train=True): 98 | """ 99 | Prepare the dataset. Return a list of lists of dictionaries containing: 100 | - word indexes 101 | - word char indexes 102 | - tag indexes 103 | """ 104 | l = lower 105 | 106 | none_index = tag_to_id["O"] 107 | 108 | def f(x): 109 | return x.lower() if lower else x 110 | data = [] 111 | for s in sentences: 112 | string = [w[0] for w in s] 113 | chars = [char_to_id[f(w) if f(w) in char_to_id else ''] 114 | for w in string] 115 | #segs = get_seg_features(" ".join(string)) 116 | lexcion_features = get_lexcion_features(string, l_sorted_lexcion) 117 | poses = [w[1] for w in s] 118 | pos_ids = get_pos_ids(poses) 119 | dep_name = [w[2] for w in s] 120 | dep_ids = get_dep_ids(dep_name) 121 | string_heads = [w[3] for w in s] 122 | head_ids = [char_to_id[f(w) if f(w) in char_to_id else ''] 123 | for w in string_heads] 124 | if train: 125 | tags = [tag_to_id[w[-1]] for w in s] 126 | else: 127 | tags = [none_index for _ in chars] 128 | # data.append([string, chars, segs, tags]) 129 | data.append([string, chars, lexcion_features, pos_ids, dep_ids, head_ids, tags]) 130 | 131 | return data 132 | 133 | 134 | def augment_with_pretrained(dictionary, ext_emb_path, chars): 135 | """ 136 | Augment the dictionary with words that have a pretrained embedding. 137 | If `words` is None, we add every word that has a pretrained embedding 138 | to the dictionary, otherwise, we only add the words that are given by 139 | `words` (typically the words in the development and test sets.) 140 | """ 141 | print('Loading pretrained embeddings from %s...' % ext_emb_path) 142 | assert os.path.isfile(ext_emb_path) 143 | 144 | # Load pretrained embeddings from file 145 | pretrained = set([ 146 | line.rstrip().split()[0].strip() 147 | for line in codecs.open(ext_emb_path, 'r', 'utf-8') 148 | if len(ext_emb_path) > 0 149 | ]) 150 | #print(pretrained) 151 | 152 | # We either add every word in the pretrained file, 153 | # or only words given in the `words` list to which 154 | # we can assign a pretrained embedding 155 | if chars is None: 156 | print("char is none") 157 | for char in pretrained: 158 | if char not in dictionary: 159 | dictionary[char] = 0 160 | else: 161 | #print("___________") 162 | for char in chars: 163 | if any(x in pretrained for x in [ 164 | char, 165 | char.lower(), 166 | re.sub('\d', '0', char.lower()) 167 | ]) and char not in dictionary: 168 | dictionary[char] = 0 169 | 170 | word_to_id, id_to_word = create_mapping(dictionary) 171 | #print(dictionary) 172 | return dictionary, word_to_id, id_to_word 173 | 174 | 175 | def save_maps(save_path, *params): 176 | """ 177 | Save mappings and invert mappings 178 | """ 179 | pass 180 | # with codecs.open(save_path, "w", encoding="utf8") as f: 181 | # pickle.dump(params, f) 182 | 183 | 184 | def load_maps(save_path): 185 | """ 186 | Load mappings from the file 187 | """ 188 | pass 189 | # with codecs.open(save_path, "r", encoding="utf8") as f: 190 | # pickle.load(save_path, f) 191 | 192 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import os 3 | import codecs 4 | import csv 5 | import pickle 6 | import itertools 7 | from collections import OrderedDict 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | from model import Model 12 | from loader import load_sentences, update_tag_scheme 13 | from loader import char_mapping, tag_mapping 14 | from loader import augment_with_pretrained, prepare_dataset 15 | from utils import get_logger, make_path, clean, create_model, save_model 16 | from utils import print_config, save_config, load_config, test_as 17 | from data_utils import load_word2vec, create_input, input_from_line, BatchManager, load_lexcion, split_train_dev, pad_data 18 | #from stanfordcorenlp import StanfordCoreNLP 19 | import random 20 | 21 | flags = tf.app.flags 22 | flags.DEFINE_boolean("clean", True, "clean train folder") 23 | flags.DEFINE_boolean("train", True, "Wither train the model") 24 | # configurations for the model 25 | #flags.DEFINE_integer("seg_dim", 0, "Embedding size for segmentation, 0 if not used") 26 | flags.DEFINE_integer("char_dim", 100, "Embedding size for characters") 27 | flags.DEFINE_integer("pos_dim", 0, "Embedding size of pos, 0 if not used") 28 | flags.DEFINE_integer("dep_name_dim", 0, "Embedding size of dep_name, 0 if not used") 29 | flags.DEFINE_integer("dependency_dim", 0, "Embedding size of dep, 0 if not used") 30 | flags.DEFINE_integer("lexcion_dim", 0, "Embedding size of lexcion, 0 if not used") 31 | flags.DEFINE_integer("lstm_dim", 100, "Num of hidden units in LSTM") 32 | flags.DEFINE_integer("attention_dim", 200, "Attention_dim") 33 | flags.DEFINE_integer("gru_dim", 100, "Gru hidden units") 34 | flags.DEFINE_string("tag_schema", "iobes", "tagging schema iobes or iob") 35 | 36 | # configurations for training 37 | flags.DEFINE_float("clip", 5, "Gradient clip") 38 | flags.DEFINE_float("dropout", 0.5, "Dropout rate") 39 | flags.DEFINE_float("batch_size", 20, "batch size") 40 | flags.DEFINE_float("lr", 0.001, "Initial learning rate") 41 | flags.DEFINE_string("optimizer", "adam", "Optimizer for training") 42 | flags.DEFINE_boolean("pre_emb", True, "Wither use pre-trained embedding") 43 | flags.DEFINE_boolean("zeros", False, "Wither replace digits with zero") 44 | flags.DEFINE_boolean("lower", True, "Wither lower case") 45 | 46 | flags.DEFINE_integer("max_epoch", 100, "maximum training epochs") 47 | flags.DEFINE_integer("steps_check", 30, "steps per checkpoint") 48 | flags.DEFINE_string("ckpt_path", "ckpt", "Path to save model") 49 | flags.DEFINE_string("summary_path", "summary", "Path to store summaries") 50 | flags.DEFINE_string("log_file", "train.log", "File for log") 51 | flags.DEFINE_string("map_file", "maps.pkl", "file for maps") 52 | flags.DEFINE_string("vocab_file", "vocab.json", "File for vocab") 53 | flags.DEFINE_string("config_file", "config_file", "File for config") 54 | flags.DEFINE_string("script", "conlleval", "evaluation script") 55 | flags.DEFINE_string("result_path", "result", "Path for results") 56 | flags.DEFINE_string("emb_file", "glove.6B.100d.txt", "Path for pre_trained embedding") 57 | flags.DEFINE_string("lexcion_file", os.path.join("lexcion", "restaurant15_dict.csv"), "Path for lexcion file") 58 | flags.DEFINE_string("train_file", os.path.join("data1", "laptop_train_POS_DEP_BIO_data.csv"), "Path for train data") 59 | flags.DEFINE_string("dev_file", os.path.join("data1", "laptop_test_POS_DEP_BIO_data.csv"), "Path for dev data") 60 | flags.DEFINE_string("test_file", os.path.join("data1", "laptop_test_POS_DEP_BIO_data.csv"), "Path for test data") 61 | 62 | 63 | FLAGS = tf.app.flags.FLAGS 64 | assert FLAGS.clip < 5.1, "gradient clip should't be too much" 65 | assert 0 <= FLAGS.dropout < 1, "dropout rate between 0 and 1" 66 | assert FLAGS.lr > 0, "learning rate must larger than zero" 67 | assert FLAGS.optimizer in ["adam", "sgd", "adagrad"] 68 | 69 | 70 | # config for the model 71 | def config_model(char_to_id, tag_to_id, max_len): 72 | config = OrderedDict() 73 | config["num_chars"] = len(char_to_id) 74 | config["char_dim"] = FLAGS.char_dim 75 | config["pos_dim"] = FLAGS.pos_dim 76 | config["dep_name_dim"] = FLAGS.dep_name_dim 77 | config["dependency_dim"] = FLAGS.dependency_dim 78 | config["lexcion_dim"] = FLAGS.lexcion_dim 79 | config["num_tags"] = len(tag_to_id) 80 | # config["seg_dim"] = FLAGS.seg_dim 81 | config["lstm_dim"] = FLAGS.lstm_dim 82 | config["attention_dim"] = FLAGS.attention_dim 83 | config["gru_dim"] = FLAGS.gru_dim 84 | config["batch_size"] = FLAGS.batch_size 85 | 86 | config["lexcion_file"] = FLAGS.lexcion_file 87 | 88 | config["emb_file"] = FLAGS.emb_file 89 | config["clip"] = FLAGS.clip 90 | config["dropout_keep"] = 1.0 - FLAGS.dropout 91 | config["optimizer"] = FLAGS.optimizer 92 | config["lr"] = FLAGS.lr 93 | config["tag_schema"] = FLAGS.tag_schema 94 | config["pre_emb"] = FLAGS.pre_emb 95 | config["zeros"] = FLAGS.zeros 96 | config["lower"] = FLAGS.lower 97 | 98 | config["max_epoch"] = FLAGS.max_epoch 99 | config["max_len"] = max_len 100 | return config 101 | 102 | 103 | def evaluate(sess, model, name, data, id_to_tag, logger): 104 | logger.info("evaluate:{}".format(name)) 105 | as_results = model.evaluate(sess, data, id_to_tag) 106 | # logger.info(att_scores) 107 | eval_lines = test_as(as_results, FLAGS.result_path) 108 | for line in eval_lines: 109 | logger.info(line) 110 | f1 = float(eval_lines[1].strip().split()[-1]) 111 | 112 | if name == "dev": 113 | best_test_f1 = model.best_dev_f1.eval() 114 | if f1 > best_test_f1: 115 | tf.assign(model.best_dev_f1, f1).eval() 116 | logger.info("new best dev f1 score:{:>.3f}".format(f1)) 117 | return f1 > best_test_f1 118 | elif name == "test": 119 | best_test_f1 = model.best_test_f1.eval() 120 | if f1 > best_test_f1: 121 | tf.assign(model.best_test_f1, f1).eval() 122 | logger.info("new best test f1 score:{:>.3f}".format(f1)) 123 | return f1 > best_test_f1 124 | 125 | 126 | def train(): 127 | # load data sets 128 | # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) 129 | # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) 130 | all_train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) 131 | train_sentences, dev_sentences = split_train_dev(all_train_sentences) 132 | test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) 133 | 134 | # Use selected tagging scheme (IOB / IOBES) 135 | update_tag_scheme(train_sentences, FLAGS.tag_schema) 136 | update_tag_scheme(test_sentences, FLAGS.tag_schema) 137 | 138 | # update_tag_scheme(dev_sentences, FLAGS.tag_schema) 139 | 140 | # create maps if not exist 141 | if not os.path.isfile(FLAGS.map_file): 142 | # create dictionary for word 143 | if FLAGS.pre_emb: 144 | # dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] 145 | dico_chars_train = char_mapping(all_train_sentences, FLAGS.lower)[0] 146 | dico_chars, char_to_id, id_to_char = augment_with_pretrained( 147 | dico_chars_train.copy(), 148 | FLAGS.emb_file, 149 | list(itertools.chain.from_iterable( 150 | [[w[0] for w in s] for s in test_sentences]) 151 | ) 152 | ) 153 | else: 154 | _c, char_to_id, id_to_char = char_mapping(all_train_sentences, FLAGS.lower) 155 | # _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) 156 | 157 | # Create a dictionary and a mapping for tags 158 | _t, tag_to_id, id_to_tag = tag_mapping(all_train_sentences) 159 | # _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) 160 | with open(FLAGS.map_file, "wb") as f: 161 | pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) 162 | else: 163 | with open(FLAGS.map_file, "rb") as f: 164 | char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) 165 | 166 | # nlp = StanfordCoreNLP(r'E:\DC\dataset\泰一指尚评测数据\stanford-corenlp-full-2017-06-09') 167 | #l_sorted_lexcion = load_lexcion(FLAGS.lexcion_file, nlp) 168 | l_sorted_lexcion = [] 169 | # prepare data, get a collection of list containing index 170 | train_data = prepare_dataset( 171 | train_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower 172 | ) 173 | dev_data = prepare_dataset( 174 | dev_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower 175 | ) 176 | test_data = prepare_dataset( 177 | test_sentences, char_to_id, tag_to_id, l_sorted_lexcion, FLAGS.lower 178 | ) 179 | print("%i / %i / %i sentences in train / dev / test." % ( 180 | len(train_data), len(dev_data), len(test_data))) 181 | 182 | max_len = max([len(sentence[0]) for sentence in train_data + test_data + dev_data]) 183 | 184 | train_manager = BatchManager(train_data, FLAGS.batch_size, max_len) 185 | dev_manager = BatchManager(dev_data, 800, max_len) 186 | test_manager = BatchManager(test_data, 800, max_len) 187 | 188 | # random.shuffle(train_data) 189 | 190 | 191 | # pad_test_data = pad_data(test_data) 192 | # pad_dev_data = pad_data(dev_data) 193 | 194 | # make path for store log and model if not exist 195 | make_path(FLAGS) 196 | if os.path.isfile(FLAGS.config_file): 197 | config = load_config(FLAGS.config_file) 198 | else: 199 | config = config_model(char_to_id, tag_to_id, max_len) 200 | save_config(config, FLAGS.config_file) 201 | make_path(FLAGS) 202 | 203 | log_path = os.path.join("log", FLAGS.log_file) 204 | logger = get_logger(log_path) 205 | print_config(config, logger) 206 | 207 | # limit GPU memory 208 | tf_config = tf.ConfigProto() 209 | tf_config.gpu_options.allow_growth = True 210 | steps_per_epoch = train_manager.len_data 211 | with tf.Session(config=tf_config) as sess: 212 | model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) 213 | logger.info("start training") 214 | loss = [] 215 | for i in range(FLAGS.max_epoch): 216 | random.shuffle(train_data) 217 | pad_train_data = pad_data(train_data, max_len) 218 | strings, chars, lexcion_teatures, pos_ids, dep_ids, head_ids, targets = pad_train_data 219 | for j in range(0, len(strings), FLAGS.batch_size): 220 | batch = [strings[j: j + FLAGS.batch_size], 221 | chars[j: j + FLAGS.batch_size], 222 | lexcion_teatures[j: j + FLAGS.batch_size], 223 | pos_ids[j: j + FLAGS.batch_size], 224 | dep_ids[j: j + FLAGS.batch_size], 225 | head_ids[j: j + FLAGS.batch_size], 226 | targets[j: j + FLAGS.batch_size]] 227 | step, batch_loss = model.run_step(sess, True, batch) 228 | loss.append(batch_loss) 229 | if step % FLAGS.steps_check == 0: 230 | iteration = step // steps_per_epoch + 1 231 | logger.info("iteration:{} step:{}/{}, " 232 | "AS loss:{:>9.6f}".format( 233 | iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss))) 234 | loss = [] 235 | 236 | best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) 237 | if best: 238 | save_model(sess, model, FLAGS.ckpt_path, logger, i) 239 | evaluate(sess, model, "test", test_manager, id_to_tag, logger) 240 | evaluate(sess, model, "test", test_manager, id_to_tag, logger) 241 | 242 | def evaluate_line(): 243 | config = load_config(FLAGS.config_file) 244 | logger = get_logger(FLAGS.log_file) 245 | # limit GPU memory 246 | tf_config = tf.ConfigProto() 247 | tf_config.gpu_options.allow_growth = True 248 | with open(FLAGS.map_file, "rb") as f: 249 | char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) 250 | with tf.Session(config=tf_config) as sess: 251 | model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) 252 | while True: 253 | # try: 254 | # line = input("请输入测试句子:") 255 | # result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag) 256 | # print(result) 257 | # except Exception as e: 258 | # logger.info(e) 259 | 260 | line = input("请输入测试句子: 未完待续") 261 | result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag) 262 | print(result) 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | def main(_): 271 | 272 | if FLAGS.train: 273 | if FLAGS.clean: 274 | clean(FLAGS) 275 | train() 276 | else: 277 | evaluate_line() 278 | # clean(FLAGS) 279 | # train() 280 | 281 | 282 | if __name__ == "__main__": 283 | tf.app.run(main) 284 | 285 | 286 | 287 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | # encoding = utf8 2 | import numpy as np 3 | import tensorflow as tf 4 | from tensorflow.contrib.crf import crf_log_likelihood 5 | from tensorflow.contrib.crf import viterbi_decode 6 | from tensorflow.contrib.layers.python.layers import initializers 7 | 8 | import rnncell as rnn1 9 | from utils import result_to_json 10 | from data_utils import create_input, iobes_iob 11 | 12 | from tensorflow.python.ops import rnn_cell 13 | from tensorflow.python.ops import rnn 14 | 15 | from Attention import attention_layer 16 | #from Attention1 import attention_layer1 17 | 18 | class Model(object): 19 | def __init__(self, config): 20 | 21 | self.config = config 22 | self.lr = config["lr"] 23 | self.char_dim = config["char_dim"] 24 | self.lstm_dim = config["lstm_dim"] 25 | self.attention_dim = config["attention_dim"] 26 | self.gru_dim = config["gru_dim"] 27 | # self.seg_dim = config["seg_dim"] 28 | self.pos_dim = config["pos_dim"] 29 | self.dep_name_dim = config["dep_name_dim"] 30 | self.dependency_dim = config["dependency_dim"] 31 | self.lexcion_dim = config["lexcion_dim"] 32 | 33 | self.num_tags = config["num_tags"] 34 | self.num_chars = config["num_chars"] 35 | 36 | self.max_len = config["max_len"] 37 | 38 | #self.num_segs = 4 39 | self.num_lexcion_features = 5 40 | self.num_poses = 48 41 | self.num_deps = 42 42 | 43 | self.global_step = tf.Variable(0, trainable=False) 44 | self.best_dev_f1 = tf.Variable(0.0, trainable=False) 45 | self.best_test_f1 = tf.Variable(0.0, trainable=False) 46 | self.initializer = initializers.xavier_initializer() 47 | 48 | # add placeholders for the model 49 | #shape = [batch_size, max_len] 50 | self.char_inputs = tf.placeholder(dtype=tf.int32, 51 | shape=[None, None], 52 | name="ChatInputs") 53 | self.lexcion_feature_inputs = tf.placeholder(dtype=tf.int32, 54 | shape=[None, None], 55 | name="LexcionFeatureInputs") 56 | # self.seg_inputs = tf.placeholder(dtype=tf.int32, 57 | 58 | # name="SegInputs") 59 | self.pos_id_inputs = tf.placeholder(dtype=tf.int32, 60 | shape=[None, None], 61 | name="PosIdInputs") 62 | self.dep_id_inputs = tf.placeholder(dtype=tf.int32, 63 | shape=[None, None], 64 | name="DepIdInputs") 65 | self.head_id_inputs = tf.placeholder(dtype=tf.int32, 66 | shape=[None, None], 67 | name="HeadIdInputs") 68 | 69 | self.targets = tf.placeholder(dtype=tf.int32, 70 | shape=[None, None], 71 | name="Targets") 72 | # dropout keep prob 73 | self.dropout = tf.placeholder(dtype=tf.float32, 74 | name="Dropout") 75 | 76 | used = tf.sign(tf.abs(self.char_inputs)) 77 | length = tf.reduce_sum(used, reduction_indices=1) 78 | self.lengths = tf.cast(length, tf.int32) 79 | self.batch_size = tf.shape(self.char_inputs)[0] 80 | self.num_steps = tf.shape(self.char_inputs)[-1] 81 | 82 | self.mask = tf.cast(self.char_inputs, tf.bool) 83 | # print("lengths-----", self.lengths) 84 | 85 | # print("maxlen-----", self.max_len) 86 | # print("dropout-----", self.dropout.eval()) 87 | # print("num_step---", self.num_steps) 88 | 89 | # embeddings for chinese character and segmentation representation 90 | # embedding = self.embedding_layer(self.char_inputs, self.seg_inputs, config) 91 | embedding = self.embedding_layer(self.char_inputs, self.lexcion_feature_inputs, self.pos_id_inputs, self.dep_id_inputs, self.head_id_inputs, config) 92 | 93 | # apply dropout before feed to lstm layer 94 | lstm_inputs = tf.nn.dropout(embedding, self.dropout) 95 | # dep_inputs = tf.nn.dropout(dep_embedding, self.dropout) 96 | print(lstm_inputs) 97 | # bi-directional lstm layer 98 | lstm_outputs = self.biLSTM_layer(lstm_inputs, self.lstm_dim, self.lengths) 99 | 100 | attention1_outputs,_ = attention_layer(lstm_outputs, self.num_steps, self.max_len, self.attention_dim, self.gru_dim, self.lengths) 101 | 102 | 103 | #attention1_outputs, _, SCORES = attention_layer1(lstm_outputs, self.num_steps, self.max_len, self.attention_dim, self.gru_dim, self.lengths) 104 | # attention1_outputs = attention_layer1_with_dep(lstm_outputs, dep_inputs, self.mask, self.num_steps, self.max_len, self.attention_dim, self.gru_dim, self.lengths) 105 | 106 | # update_att_outputs1 = update_attention_outputs_layer1(attention1_outputs, self.gru_dim, self.lengths) 107 | # print(lstm_outputs) 108 | # lstm_outputs = tf.nn.dropout(lstm_outputs, self.dropout) 109 | 110 | # attention1_outputs = self.attention_layer1(lstm_outputs, self.attention_dim) 111 | # print("attention_outputs", attention_outputs) 112 | # attention1_outputs = tf.nn.dropout(attention1_outputs, self.dropout) 113 | # updated_attention1_outputs = self.update_attention_outputs_layer1(attention1_outputs, self.gru_dim, self.lengths) 114 | # updated_attention1_outputs = tf.nn.dropout(updated_attention1_outputs, self.dropout) 115 | # attention2_outputs = self.attention_layer2(lstm_outputs, updated_attention1_outputs, self.attention_dim) 116 | # attention2_outputs = tf.nn.dropout(attention2_outputs, self.dropout) 117 | # updated_attention2_outputs = self.update_attention_outputs_layer2(attention2_outputs, self.gru_dim, self.lengths) 118 | # attention_outputs = tf.nn.dropout(attention_outputs, self.dropout) 119 | # attention3_outputs = self.attention_layer2(lstm_outputs, updated_attention2_outputs, self.attention_dim, name="attention3") 120 | # updated_attention3_outputs = self.update_attention_outputs_layer2(attention3_outputs, self.gru_dim, self.lengths, name="update3") 121 | # attention4_outputs = self.attention_layer2(lstm_outputs, updated_attention3_outputs, self.attention_dim, name="attention4") 122 | # updated_attention4_outputs = self.update_attention_outputs_layer2(attention4_outputs, self.gru_dim, self.lengths, name="update4") 123 | 124 | # self.att_s = tf.slice(SCORES, [0, 0, 0, 0], [20, 1, 1, 20]) 125 | 126 | # logits for tags 127 | self.logits = self.project_layer(attention1_outputs) 128 | 129 | # loss of the model 130 | self.loss = self.loss_layer(self.logits, self.lengths) 131 | 132 | with tf.variable_scope("optimizer"): 133 | optimizer = self.config["optimizer"] 134 | if optimizer == "sgd": 135 | self.opt = tf.train.GradientDescentOptimizer(self.lr) 136 | elif optimizer == "adam": 137 | self.opt = tf.train.AdamOptimizer(self.lr) 138 | elif optimizer == "adgrad": 139 | self.opt = tf.train.AdagradOptimizer(self.lr) 140 | else: 141 | raise KeyError 142 | 143 | # apply grad clip to avoid gradient explosion 144 | grads_vars = self.opt.compute_gradients(self.loss) 145 | capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v] 146 | for g, v in grads_vars] 147 | self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step) 148 | 149 | # saver of the model 150 | self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5) 151 | 152 | def embedding_layer(self, char_inputs, lexcion_feature_inputs, pos_id_inputs, dep_id_inputs, head_id_inputs, config, name=None): 153 | """ 154 | :param char_inputs: one-hot encoding of sentence 155 | :param seg_inputs: segmentation feature 156 | :param config: wither use segmentation feature 157 | :return: [1, num_steps, embedding size], 158 | """ 159 | 160 | embedding = [] 161 | # dep_embedding = [] 162 | # shape = [batch_size, max_len, embedding_dim] 163 | with tf.variable_scope("char_embedding" if not name else name), tf.device('/cpu:0'): 164 | self.char_lookup = tf.get_variable( 165 | name="char_embedding", 166 | shape=[self.num_chars, self.char_dim], 167 | initializer=self.initializer) 168 | embedding.append(tf.nn.embedding_lookup(self.char_lookup, char_inputs)) 169 | if config["lexcion_dim"]: 170 | print("Using Lexcion......") 171 | # shape = [batch_size, max_len, lex_dim] 172 | with tf.variable_scope("lexcion_embedding"), tf.device('/cpu:0'): 173 | self.lexcion_loookup = tf.get_variable( 174 | name="lexcion_embedding", 175 | shape=[self.num_lexcion_features,self.lexcion_dim], 176 | initializer=self.initializer) 177 | embedding.append(tf.nn.embedding_lookup(self.lexcion_loookup, lexcion_feature_inputs)) 178 | if config["pos_dim"]: 179 | print("Using PosTags.......") 180 | #shape = [batch_size, max_len, pos_dim] 181 | with tf.variable_scope("pos_embedding"), tf.device('/cpu:0'): 182 | self.pos_id_lookup = tf.get_variable( 183 | name="pos_embedding", 184 | shape=[self.num_poses, self.pos_dim], 185 | initializer=self.initializer) 186 | embedding.append(tf.nn.embedding_lookup(self.pos_id_lookup, pos_id_inputs)) 187 | if config["dep_name_dim"]: 188 | print("Using dep_name......") 189 | # shape = [batch_size, max_len, dep_name_dim] 190 | with tf.variable_scope("dep_name_embedding"), tf.device('/cpu:0'): 191 | self.dep_id_lookup = tf.get_variable( 192 | name="dep_name_embedding", 193 | shape=[self.num_deps, self.dep_name_dim], 194 | initializer=self.initializer) 195 | embedding.append(tf.nn.embedding_lookup(self.dep_id_lookup, dep_id_inputs)) 196 | if config["dependency_dim"]: 197 | print("Using dep........") 198 | # dep_embedding.append(tf.nn.embedding_lookup(self.char_lookup, head_id_inputs)) 199 | # shape = [batch_size, max_len, dep_dim] 200 | #print(self.dependency_dim) 201 | embedding.append(tf.nn.embedding_lookup(self.char_lookup, head_id_inputs)) 202 | #print(len(embedding)) 203 | embed = tf.concat(embedding, axis=-1) 204 | # dep_embeded = tf.concat(dep_embedding, axis=-1) 205 | print(embed) 206 | # print("depembedding----", dep_embeded) 207 | # shape = [batch_size, max_len, (embedding+pos_dim_dep_name_dim+dep_dim组合)] 208 | #return embed, dep_embeded 209 | return embed 210 | 211 | def biLSTM_layer(self, lstm_inputs, lstm_dim, lengths, name=None): 212 | """ 213 | :param lstm_inputs: [batch_size, num_steps, emb_size] 214 | :return: [batch_size, num_steps, 2*lstm_dim] 215 | """ 216 | with tf.variable_scope("char_BiLSTM" if not name else name): 217 | lstm_cell = {} 218 | for direction in ["forward", "backward"]: 219 | with tf.variable_scope(direction): 220 | lstm_cell[direction] = rnn1.CoupledInputForgetGateLSTMCell( 221 | lstm_dim, 222 | use_peepholes=True, 223 | initializer=self.initializer, 224 | state_is_tuple=True) 225 | outputs, final_states = tf.nn.bidirectional_dynamic_rnn( 226 | lstm_cell["forward"], 227 | lstm_cell["backward"], 228 | lstm_inputs, 229 | dtype=tf.float32, 230 | sequence_length=lengths) 231 | # shape = [batch_size, max_len, 2*lstm_dim] 232 | #这里之后可以加attention层 233 | print(tf.concat(outputs, axis=2)) 234 | return tf.concat(outputs, axis=2) 235 | 236 | def attention_layer1(self, lstm_output, attention_size, name=None): 237 | #lstm_output, shape=[batch_size, max_len, 2*lstm_dim] 238 | #attention_size 239 | hidden_size = lstm_output.shape[-1] 240 | print("hidden_size----", hidden_size) 241 | with tf.variable_scope("Attention1" if not name else name): 242 | with tf.variable_scope("Attention_matrix1"): 243 | W = tf.get_variable("W", shape=[hidden_size, attention_size], 244 | dtype=tf.float32, initializer=self.initializer) 245 | b = tf.get_variable("b", shape=[attention_size], 246 | dtype=tf.float32, initializer=tf.zeros_initializer()) 247 | u = tf.get_variable("u", shape=[attention_size], 248 | dtype=tf.float32, initializer=self.initializer) 249 | #shape = [batch, max_len, attention_size] 250 | attentioned = tf.reshape(tf.tanh(tf.tensordot(lstm_output, W, axes=1) + b), [-1, self.num_steps, attention_size]) 251 | print("attentioned---", attentioned) 252 | #shape= [batch, max_len] 253 | attention_score = tf.reshape(tf.tensordot(attentioned, u, axes=1), [-1, self.num_steps]) 254 | print("attention_score---", attention_score) 255 | # shape= [batch, max_len] 256 | normalized_attention_score = tf.nn.softmax(attention_score) 257 | # shape= [batch, max_len, 2*lstm_dim] 258 | attention_output = lstm_output * tf.expand_dims(normalized_attention_score, -1) 259 | print("attention_output---",attention_output) 260 | return attention_output 261 | 262 | def update_attention_outputs_layer1(self, attention_outputs, hidden_units, lengths, name=None): 263 | with tf.variable_scope("update_attention_outputs1" if not name else name): 264 | gru_cell = rnn_cell.GRUCell(hidden_units) 265 | outputs, state = rnn.dynamic_rnn(gru_cell, attention_outputs, dtype=tf.float32, sequence_length=lengths) 266 | #shape = [b, t, 2d] 267 | return outputs 268 | 269 | def attention_layer2(self, lstm_output, attention1_outs, attention_size, name=None): 270 | attention2_input = tf.concat([lstm_output, attention1_outs], axis=-1) 271 | hidden_size = attention2_input.shape[-1] 272 | with tf.variable_scope("Attention2" if not name else name): 273 | with tf.variable_scope("Attention_matrix2"): 274 | W = tf.get_variable("W", shape=[hidden_size, attention_size], 275 | dtype=tf.float32, initializer=self.initializer) 276 | b = tf.get_variable("b", shape=[attention_size], 277 | dtype=tf.float32, initializer=tf.zeros_initializer()) 278 | u = tf.get_variable("u", shape=[attention_size], 279 | dtype=tf.float32, initializer=self.initializer) 280 | # shape = [batch, max_len, attention_size] 281 | attentioned = tf.reshape(tf.tanh(tf.tensordot(attention2_input, W, axes=1) + b), 282 | [-1, self.num_steps, attention_size]) 283 | # print("attentioned---", attentioned) 284 | # shape= [batch, max_len] 285 | attention_score = tf.reshape(tf.tensordot(attentioned, u, axes=1), [-1, self.num_steps]) 286 | # print("attention_score---", attention_score) 287 | # shape= [batch, max_len] 288 | normalized_attention_score = tf.nn.softmax(attention_score) 289 | # shape= [batch, max_len, 2*lstm_dim] 290 | attention_output = lstm_output * tf.expand_dims(normalized_attention_score, -1) 291 | # print("attention_output---", attention_output) 292 | return attention_output 293 | 294 | def update_attention_outputs_layer2(self, attention_outputs, hidden_units, lengths, name=None): 295 | with tf.variable_scope("update_attention_outputs2" if not name else name): 296 | gru_cell = rnn_cell.GRUCell(hidden_units) 297 | outputs, state = rnn.dynamic_rnn(gru_cell, attention_outputs, dtype=tf.float32, sequence_length=lengths) 298 | #shape = [b, t, 2d] 299 | return outputs 300 | 301 | def project_layer(self, last_layer_outputs, name=None): 302 | """ 303 | hidden layer between lstm layer and logits 304 | :param last_layer_outputs: [batch_size, num_steps, emb_size] 305 | :return: [batch_size, num_steps, num_tags] 306 | """ 307 | with tf.variable_scope("project" if not name else name): 308 | with tf.variable_scope("hidden"): 309 | W = tf.get_variable("W", shape=[self.lstm_dim*2, self.lstm_dim], 310 | dtype=tf.float32, initializer=self.initializer) 311 | 312 | b = tf.get_variable("b", shape=[self.lstm_dim], dtype=tf.float32, 313 | initializer=tf.zeros_initializer()) 314 | #shape = [batch_size*max_len, 2embedding_dim] 315 | output = tf.reshape(last_layer_outputs, shape=[-1, self.lstm_dim*2]) 316 | print("project_out----", output) 317 | #这边对bilstm的输出做了一个XW + b, shape = [batch_size*max_len, embedding_dim] 318 | hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b)) 319 | print("hidden--", hidden) 320 | # project to score of tags 321 | with tf.variable_scope("logits"): 322 | W = tf.get_variable("W", shape=[self.lstm_dim, self.num_tags], 323 | dtype=tf.float32, initializer=self.initializer) 324 | 325 | b = tf.get_variable("b", shape=[self.num_tags], dtype=tf.float32, 326 | initializer=tf.zeros_initializer()) 327 | #这边能否用softmax,这边用softmax是否比用 328 | #shape = [batch_size*max_len, num_tags(BIO为3, BEMSO为5)] 329 | pred = tf.nn.xw_plus_b(hidden, W, b) 330 | print("pre----", pred) 331 | print(tf.reshape(pred, [-1, self.num_steps, self.num_tags])) 332 | #shape = [batch_size, max_len, num_tags] 333 | return tf.reshape(pred, [-1, self.num_steps, self.num_tags]) 334 | 335 | def loss_layer(self, project_logits, lengths, name=None): 336 | """ 337 | calculate crf loss 338 | :param project_logits: [1, num_steps, num_tags] 339 | :return: scalar loss 340 | """ 341 | with tf.variable_scope("crf_loss" if not name else name): 342 | small = -1000.0 343 | # pad logits for crf loss 344 | start_logits = tf.concat( 345 | [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]), tf.zeros(shape=[self.batch_size, 1, 1]), 346 | small * tf.ones(shape=[self.batch_size, 1, 1])], axis=-1) 347 | end_logits = tf.concat( 348 | [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]), 349 | small * tf.ones(shape=[self.batch_size, 1, 1]), tf.zeros(shape=[self.batch_size, 1, 1])], axis=-1) 350 | 351 | pad_logits = tf.cast(small * tf.ones([self.batch_size, self.num_steps, 2]), tf.float32) 352 | logits = tf.concat([project_logits, pad_logits], axis=-1) 353 | logits = tf.concat([start_logits, logits, end_logits], axis=1) 354 | targets = tf.concat( 355 | [tf.cast(self.num_tags * tf.ones([self.batch_size, 1]), tf.int32), self.targets, 356 | tf.cast((self.num_tags + 1) * tf.ones([self.batch_size, 1]), tf.int32)], axis=-1) 357 | log_likelihood, self.trans = crf_log_likelihood( 358 | inputs=logits, 359 | tag_indices=targets, 360 | # transition_params=self.trans, 361 | sequence_lengths=lengths + 2) 362 | return tf.reduce_mean(-log_likelihood) 363 | 364 | def create_feed_dict(self, is_train, batch): 365 | """ 366 | :param is_train: Flag, True for train batch 367 | :param batch: list train/evaluate data 368 | :return: structured data to feed 369 | """ 370 | # _, chars, segs, tags = batch 371 | # feed_dict = { 372 | # self.char_inputs: np.asarray(chars), 373 | # self.seg_inputs: np.asarray(segs), 374 | # self.dropout: 1.0, 375 | # } 376 | _, chars, lexcion_features, pos_ids, dep_ids, head_ids, tags = batch 377 | # print(type(len(chars[0]))) 378 | feed_dict = { 379 | self.char_inputs: np.asarray(chars), 380 | self.lexcion_feature_inputs: np.asarray(lexcion_features), 381 | self.pos_id_inputs: np.asarray(pos_ids), 382 | self.dep_id_inputs: np.asarray(dep_ids), 383 | self.head_id_inputs: np.asarray(head_ids), 384 | self.dropout: 1.0, 385 | } 386 | if is_train: 387 | feed_dict[self.targets] = np.asarray(tags) 388 | feed_dict[self.dropout] = self.config["dropout_keep"] 389 | return feed_dict 390 | 391 | def run_step(self, sess, is_train, batch): 392 | """ 393 | :param sess: session to run the batch 394 | :param is_train: a flag indicate if it is a train batch 395 | :param batch: a dict containing batch data 396 | :return: batch result, loss of the batch or logits 397 | """ 398 | feed_dict = self.create_feed_dict(is_train, batch) 399 | if is_train: 400 | global_step, loss, _ = sess.run( 401 | [self.global_step, self.loss, self.train_op], 402 | feed_dict) 403 | return global_step, loss 404 | else: 405 | lengths, logits = sess.run([self.lengths, self.logits], feed_dict) 406 | return lengths, logits 407 | 408 | 409 | def decode(self, logits, lengths, matrix): 410 | """ 411 | :param logits: [batch_size, num_steps, num_tags]float32, logits 412 | :param lengths: [batch_size]int32, real length of each sequence 413 | :param matrix: transaction matrix for inference 414 | :return: 415 | """ 416 | # inference final labels usa viterbi Algorithm 417 | paths = [] 418 | small = -1000.0 419 | start = np.asarray([[small] * self.num_tags + [0, small]]) 420 | end = np.asarray([[small] * self.num_tags + [small, 0]]) 421 | for score, length in zip(logits, lengths): 422 | score = score[:length] 423 | pad = small * np.ones([length, 2]) 424 | logits = np.concatenate([score, pad], axis=1) 425 | logits = np.concatenate([start, logits, end], axis=0) 426 | # print('logits shape:', logits.shape) 427 | # print('matrix shape:', matrix.shape) 428 | path, _ = viterbi_decode(logits, matrix) 429 | 430 | paths.append(path[1:len(path) - 1]) 431 | return paths 432 | 433 | def evaluate(self, sess, data_manager, id_to_tag): 434 | """ 435 | :param sess: session to run the model 436 | :param data: list of data 437 | :param id_to_tag: index to tag name 438 | :return: evaluate result 439 | """ 440 | results = [] 441 | trans = self.trans.eval() 442 | for batch in data_manager.iter_batch(): 443 | strings = batch[0] 444 | tags = batch[-1] 445 | lengths, scores = self.run_step(sess, False, batch) 446 | batch_paths = self.decode(scores, lengths, trans) 447 | for i in range(len(strings)): 448 | result = [] 449 | string = strings[i][:lengths[i]] 450 | gold = iobes_iob([id_to_tag[int(x)] for x in tags[i][:lengths[i]]]) 451 | pred = iobes_iob([id_to_tag[int(x)] for x in batch_paths[i][:lengths[i]]]) 452 | for char, gold, pred in zip(string, gold, pred): 453 | result.append(" ".join([char, gold, pred])) 454 | results.append(result) 455 | return results 456 | 457 | #result = model.evaluate_line(sess, input_from_line(line, char_to_id), id_to_tag) 458 | def evaluate_line(self, sess, inputs, id_to_tag): 459 | trans = self.trans.eval() 460 | lengths, scores = self.run_step(sess, False, inputs) 461 | batch_paths = self.decode(scores, lengths, trans) 462 | tags = [id_to_tag[idx] for idx in batch_paths[0]] 463 | #return tags 464 | #print(inputs[0][0]) 465 | return result_to_json(inputs[0][0], tags) 466 | -------------------------------------------------------------------------------- /rnncell.py: -------------------------------------------------------------------------------- 1 | """Module for constructing RNN Cells.""" 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | import collections 7 | import math 8 | import tensorflow as tf 9 | from tensorflow.contrib.compiler import jit 10 | from tensorflow.contrib.layers.python.layers import layers 11 | from tensorflow.python.framework import dtypes 12 | from tensorflow.python.framework import op_def_registry 13 | from tensorflow.python.framework import ops 14 | from tensorflow.python.ops import array_ops 15 | from tensorflow.python.ops import clip_ops 16 | from tensorflow.python.ops import init_ops 17 | from tensorflow.python.ops import math_ops 18 | from tensorflow.python.ops import nn_ops 19 | from tensorflow.python.ops import random_ops 20 | from tensorflow.python.ops import rnn_cell_impl 21 | from tensorflow.python.ops import variable_scope as vs 22 | from tensorflow.python.platform import tf_logging as logging 23 | from tensorflow.python.util import nest 24 | 25 | 26 | def _get_concat_variable(name, shape, dtype, num_shards): 27 | """Get a sharded variable concatenated into one tensor.""" 28 | sharded_variable = _get_sharded_variable(name, shape, dtype, num_shards) 29 | if len(sharded_variable) == 1: 30 | return sharded_variable[0] 31 | 32 | concat_name = name + "/concat" 33 | concat_full_name = vs.get_variable_scope().name + "/" + concat_name + ":0" 34 | for value in ops.get_collection(ops.GraphKeys.CONCATENATED_VARIABLES): 35 | if value.name == concat_full_name: 36 | return value 37 | 38 | concat_variable = array_ops.concat(sharded_variable, 0, name=concat_name) 39 | ops.add_to_collection(ops.GraphKeys.CONCATENATED_VARIABLES, 40 | concat_variable) 41 | return concat_variable 42 | 43 | 44 | def _get_sharded_variable(name, shape, dtype, num_shards): 45 | """Get a list of sharded variables with the given dtype.""" 46 | if num_shards > shape[0]: 47 | raise ValueError("Too many shards: shape=%s, num_shards=%d" % 48 | (shape, num_shards)) 49 | unit_shard_size = int(math.floor(shape[0] / num_shards)) 50 | remaining_rows = shape[0] - unit_shard_size * num_shards 51 | 52 | shards = [] 53 | for i in range(num_shards): 54 | current_size = unit_shard_size 55 | if i < remaining_rows: 56 | current_size += 1 57 | shards.append(vs.get_variable(name + "_%d" % i, [current_size] + shape[1:], 58 | dtype=dtype)) 59 | return shards 60 | 61 | 62 | class CoupledInputForgetGateLSTMCell(rnn_cell_impl.RNNCell): 63 | """Long short-term memory unit (LSTM) recurrent network cell. 64 | 65 | The default non-peephole implementation is based on: 66 | 67 | http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf 68 | 69 | S. Hochreiter and J. Schmidhuber. 70 | "Long Short-Term Memory". Neural Computation, 9(8):1735-1780, 1997. 71 | 72 | The peephole implementation is based on: 73 | 74 | https://research.google.com/pubs/archive/43905.pdf 75 | 76 | Hasim Sak, Andrew Senior, and Francoise Beaufays. 77 | "Long short-term memory recurrent neural network architectures for 78 | large scale acoustic modeling." INTERSPEECH, 2014. 79 | 80 | The coupling of input and forget gate is based on: 81 | 82 | http://arxiv.org/pdf/1503.04069.pdf 83 | 84 | Greff et al. "LSTM: A Search Space Odyssey" 85 | 86 | The class uses optional peep-hole connections, and an optional projection 87 | layer. 88 | """ 89 | 90 | def __init__(self, num_units, use_peepholes=False, 91 | initializer=None, num_proj=None, proj_clip=None, 92 | num_unit_shards=1, num_proj_shards=1, 93 | forget_bias=1.0, state_is_tuple=True, 94 | activation=math_ops.tanh, reuse=None): 95 | """Initialize the parameters for an LSTM cell. 96 | 97 | Args: 98 | num_units: int, The number of units in the LSTM cell 99 | use_peepholes: bool, set True to enable diagonal/peephole connections. 100 | initializer: (optional) The initializer to use for the weight and 101 | projection matrices. 102 | num_proj: (optional) int, The output dimensionality for the projection 103 | matrices. If None, no projection is performed. 104 | proj_clip: (optional) A float value. If `num_proj > 0` and `proj_clip` is 105 | provided, then the projected values are clipped elementwise to within 106 | `[-proj_clip, proj_clip]`. 107 | num_unit_shards: How to split the weight matrix. If >1, the weight 108 | matrix is stored across num_unit_shards. 109 | num_proj_shards: How to split the projection matrix. If >1, the 110 | projection matrix is stored across num_proj_shards. 111 | forget_bias: Biases of the forget gate are initialized by default to 1 112 | in order to reduce the scale of forgetting at the beginning of 113 | the training. 114 | state_is_tuple: If True, accepted and returned states are 2-tuples of 115 | the `c_state` and `m_state`. By default (False), they are concatenated 116 | along the column axis. This default behavior will soon be deprecated. 117 | activation: Activation function of the inner states. 118 | reuse: (optional) Python boolean describing whether to reuse variables 119 | in an existing scope. If not `True`, and the existing scope already has 120 | the given variables, an error is raised. 121 | """ 122 | super(CoupledInputForgetGateLSTMCell, self).__init__(_reuse=reuse) 123 | if not state_is_tuple: 124 | logging.warn( 125 | "%s: Using a concatenated state is slower and will soon be " 126 | "deprecated. Use state_is_tuple=True.", self) 127 | self._num_units = num_units 128 | self._use_peepholes = use_peepholes 129 | self._initializer = initializer 130 | self._num_proj = num_proj 131 | self._proj_clip = proj_clip 132 | self._num_unit_shards = num_unit_shards 133 | self._num_proj_shards = num_proj_shards 134 | self._forget_bias = forget_bias 135 | self._state_is_tuple = state_is_tuple 136 | self._activation = activation 137 | self._reuse = reuse 138 | 139 | if num_proj: 140 | self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_proj) 141 | if state_is_tuple else num_units + num_proj) 142 | self._output_size = num_proj 143 | else: 144 | self._state_size = (rnn_cell_impl.LSTMStateTuple(num_units, num_units) 145 | if state_is_tuple else 2 * num_units) 146 | self._output_size = num_units 147 | 148 | @property 149 | def state_size(self): 150 | return self._state_size 151 | 152 | @property 153 | def output_size(self): 154 | return self._output_size 155 | 156 | def call(self, inputs, state): 157 | """Run one step of LSTM. 158 | 159 | Args: 160 | inputs: input Tensor, 2D, batch x num_units. 161 | state: if `state_is_tuple` is False, this must be a state Tensor, 162 | `2-D, batch x state_size`. If `state_is_tuple` is True, this must be a 163 | tuple of state Tensors, both `2-D`, with column sizes `c_state` and 164 | `m_state`. 165 | scope: VariableScope for the created subgraph; defaults to "LSTMCell". 166 | 167 | Returns: 168 | A tuple containing: 169 | - A `2-D, [batch x output_dim]`, Tensor representing the output of the 170 | LSTM after reading `inputs` when previous state was `state`. 171 | Here output_dim is: 172 | num_proj if num_proj was set, 173 | num_units otherwise. 174 | - Tensor(s) representing the new state of LSTM after reading `inputs` when 175 | the previous state was `state`. Same type and shape(s) as `state`. 176 | 177 | Raises: 178 | ValueError: If input size cannot be inferred from inputs via 179 | static shape inference. 180 | """ 181 | sigmoid = math_ops.sigmoid 182 | 183 | num_proj = self._num_units if self._num_proj is None else self._num_proj 184 | 185 | if self._state_is_tuple: 186 | (c_prev, m_prev) = state 187 | else: 188 | c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units]) 189 | m_prev = array_ops.slice(state, [0, self._num_units], [-1, num_proj]) 190 | 191 | dtype = inputs.dtype 192 | input_size = inputs.get_shape().with_rank(2)[1] 193 | 194 | if input_size.value is None: 195 | raise ValueError("Could not infer input size from inputs.get_shape()[-1]") 196 | 197 | # Input gate weights 198 | self.w_xi = tf.get_variable("_w_xi", [input_size.value, self._num_units]) 199 | self.w_hi = tf.get_variable("_w_hi", [self._num_units, self._num_units]) 200 | self.w_ci = tf.get_variable("_w_ci", [self._num_units, self._num_units]) 201 | # Output gate weights 202 | self.w_xo = tf.get_variable("_w_xo", [input_size.value, self._num_units]) 203 | self.w_ho = tf.get_variable("_w_ho", [self._num_units, self._num_units]) 204 | self.w_co = tf.get_variable("_w_co", [self._num_units, self._num_units]) 205 | 206 | # Cell weights 207 | self.w_xc = tf.get_variable("_w_xc", [input_size.value, self._num_units]) 208 | self.w_hc = tf.get_variable("_w_hc", [self._num_units, self._num_units]) 209 | 210 | # Initialize the bias vectors 211 | self.b_i = tf.get_variable("_b_i", [self._num_units], initializer=init_ops.zeros_initializer()) 212 | self.b_c = tf.get_variable("_b_c", [self._num_units], initializer=init_ops.zeros_initializer()) 213 | self.b_o = tf.get_variable("_b_o", [self._num_units], initializer=init_ops.zeros_initializer()) 214 | 215 | i_t = sigmoid(math_ops.matmul(inputs, self.w_xi) + 216 | math_ops.matmul(m_prev, self.w_hi) + 217 | math_ops.matmul(c_prev, self.w_ci) + 218 | self.b_i) 219 | c_t = ((1 - i_t) * c_prev + i_t * self._activation(math_ops.matmul(inputs, self.w_xc) + 220 | math_ops.matmul(m_prev, self.w_hc) + self.b_c)) 221 | 222 | o_t = sigmoid(math_ops.matmul(inputs, self.w_xo) + 223 | math_ops.matmul(m_prev, self.w_ho) + 224 | math_ops.matmul(c_t, self.w_co) + 225 | self.b_o) 226 | 227 | h_t = o_t * self._activation(c_t) 228 | 229 | new_state = (rnn_cell_impl.LSTMStateTuple(c_t, h_t) if self._state_is_tuple else 230 | array_ops.concat([c_t, h_t], 1)) 231 | return h_t, new_state -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | import logging 5 | 6 | import tensorflow as tf 7 | from conlleval import return_report 8 | 9 | models_path = "./models" 10 | eval_path = "./evaluation" 11 | eval_temp = os.path.join(eval_path, "temp") 12 | eval_script = os.path.join(eval_path, "conlleval") 13 | 14 | 15 | def get_logger(log_file): 16 | logger = logging.getLogger(log_file) 17 | logger.setLevel(logging.DEBUG) 18 | fh = logging.FileHandler(log_file) 19 | fh.setLevel(logging.DEBUG) 20 | ch = logging.StreamHandler() 21 | ch.setLevel(logging.INFO) 22 | formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") 23 | ch.setFormatter(formatter) 24 | fh.setFormatter(formatter) 25 | logger.addHandler(ch) 26 | logger.addHandler(fh) 27 | return logger 28 | 29 | def test_as(results, path): 30 | """ 31 | Run perl script to evaluate model 32 | """ 33 | output_file = os.path.join(path, "as_predict.utf8") 34 | with open(output_file, "w", encoding='utf8') as f: 35 | to_write = [] 36 | for block in results: 37 | for line in block: 38 | to_write.append(line + "\n") 39 | to_write.append("\n") 40 | 41 | f.writelines(to_write) 42 | eval_lines = return_report(output_file) 43 | return eval_lines 44 | 45 | 46 | def print_config(config, logger): 47 | """ 48 | Print configuration of the model 49 | """ 50 | for k, v in config.items(): 51 | logger.info("{}:\t{}".format(k.ljust(15), v)) 52 | 53 | 54 | def make_path(params): 55 | """ 56 | Make folders for training and evaluation 57 | """ 58 | if not os.path.isdir(params.result_path): 59 | os.makedirs(params.result_path) 60 | if not os.path.isdir(params.ckpt_path): 61 | os.makedirs(params.ckpt_path) 62 | if not os.path.isdir("log"): 63 | os.makedirs("log") 64 | 65 | 66 | def clean(params): 67 | """ 68 | Clean current folder 69 | remove saved model and training log 70 | """ 71 | if os.path.isfile(params.vocab_file): 72 | os.remove(params.vocab_file) 73 | 74 | if os.path.isfile(params.map_file): 75 | os.remove(params.map_file) 76 | 77 | if os.path.isdir(params.ckpt_path): 78 | shutil.rmtree(params.ckpt_path) 79 | 80 | if os.path.isdir(params.summary_path): 81 | shutil.rmtree(params.summary_path) 82 | 83 | if os.path.isdir(params.result_path): 84 | shutil.rmtree(params.result_path) 85 | 86 | if os.path.isdir("log"): 87 | shutil.rmtree("log") 88 | 89 | if os.path.isdir("__pycache__"): 90 | shutil.rmtree("__pycache__") 91 | 92 | if os.path.isfile(params.config_file): 93 | os.remove(params.config_file) 94 | 95 | if os.path.isfile(params.vocab_file): 96 | os.remove(params.vocab_file) 97 | 98 | 99 | def save_config(config, config_file): 100 | """ 101 | Save configuration of the model 102 | parameters are stored in json format 103 | """ 104 | with open(config_file, "w", encoding="utf8") as f: 105 | json.dump(config, f, ensure_ascii=False, indent=4) 106 | 107 | 108 | def load_config(config_file): 109 | """ 110 | Load configuration of the model 111 | parameters are stored in json format 112 | """ 113 | with open(config_file, encoding="utf8") as f: 114 | return json.load(f) 115 | 116 | 117 | def convert_to_text(line): 118 | """ 119 | Convert conll data to text 120 | """ 121 | to_print = [] 122 | for item in line: 123 | 124 | try: 125 | if item[0] == " ": 126 | to_print.append(" ") 127 | continue 128 | word, gold, tag = item.split(" ") 129 | if tag[0] in "SB": 130 | to_print.append("[") 131 | to_print.append(word) 132 | if tag[0] in "SE": 133 | to_print.append("@" + tag.split("-")[-1]) 134 | to_print.append("]") 135 | except: 136 | print(list(item)) 137 | return "".join(to_print) 138 | 139 | 140 | def save_model(sess, model, path, logger, step): 141 | checkpoint_path = os.path.join(path, "as.ckpt") 142 | model.saver.save(sess, checkpoint_path, global_step=step) 143 | logger.info("model saved") 144 | 145 | 146 | def create_model(session, Model_class, path, load_vec, config, id_to_char, logger): 147 | # create model, reuse parameters if exists 148 | model = Model_class(config) 149 | 150 | ckpt = tf.train.get_checkpoint_state(path) 151 | if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path): 152 | logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path) 153 | model.saver.restore(session, ckpt.model_checkpoint_path) 154 | else: 155 | logger.info("Created model with fresh parameters.") 156 | session.run(tf.global_variables_initializer()) 157 | if config["pre_emb"]: 158 | emb_weights = session.run(model.char_lookup.read_value()) 159 | emb_weights = load_vec(config["emb_file"],id_to_char, config["char_dim"], emb_weights) 160 | session.run(model.char_lookup.assign(emb_weights)) 161 | logger.info("Load pre-trained embedding.") 162 | return model 163 | 164 | 165 | def result_to_json(string, tags): 166 | item = {"string": string, "entities": []} 167 | entity_name = "" 168 | entity_start = 0 169 | idx = 0 170 | for char, tag in zip(string, tags): 171 | if tag[0] == "S": 172 | item["entities"].append({"word": char, "start": idx, "end": idx+1, "type":tag[2:]}) 173 | elif tag[0] == "B": 174 | entity_name += char 175 | entity_start = idx 176 | elif tag[0] == "I": 177 | entity_name += char 178 | elif tag[0] == "E": 179 | entity_name += char 180 | item["entities"].append({"word": entity_name, "start": entity_start, "end": idx + 1, "type": tag[2:]}) 181 | entity_name = "" 182 | else: 183 | entity_name = "" 184 | entity_start = idx 185 | idx += 1 186 | return item 187 | 188 | 189 | 190 | 191 | --------------------------------------------------------------------------------