├── .gitignore
├── README.md
├── __init__.py
├── data
│   ├── __init__.py
│   └── load.py
├── examples
│   ├── elman-forward.py
│   ├── elman-keras.py
│   └── jordan-forward.py
├── metrics
│   ├── __init__.py
│   └── accuracy.py
├── rnn
│   ├── __init__.py
│   ├── elman.py
│   └── jordan.py
└── utils
    ├── __init__.py
    └── tools.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pkl
*.gz
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
**Note:** I don't provide personal support for custom changes to the code, only
for the release as-is. For people just starting out, I recommend
[Treehouse](http://referrals.trhou.se/grgoiremesnil) for online learning.

Investigation of Recurrent Neural Network Architectures and Learning Methods for Spoken Language Understanding
==============================================================================================================

### Code for RNN and Spoken Language Understanding

Based on the Interspeech '13 paper:

[Grégoire Mesnil, Xiaodong He, Li Deng and Yoshua Bengio - **Investigation of Recurrent Neural Network Architectures and Learning Methods for Spoken Language Understanding**](http://www.iro.umontreal.ca/~lisa/pointeurs/RNNSpokenLanguage2013.pdf)

We also have a follow-up IEEE paper:

[Grégoire Mesnil, Yann Dauphin, Kaisheng Yao, Yoshua Bengio, Li Deng, Dilek Hakkani-Tur, Xiaodong He, Larry Heck, Gokhan Tur, Dong Yu and Geoffrey Zweig - **Using Recurrent Neural Networks for Slot Filling in Spoken Language Understanding**](http://www.iro.umontreal.ca/~lisa/pointeurs/taslp_RNNSLU_final_doubleColumn.pdf)

## Code

This code achieves state-of-the-art results, a significant improvement
(+1% in F1 score) over the results presented in the paper.

To reproduce the results, make sure Theano is installed and the
repository is in your `PYTHONPATH`, e.g. run the command
`export PYTHONPATH=/path/where/is13/is:$PYTHONPATH`. Then, run the following
commands:

```
git clone git@github.com:mesnilgr/is13.git
python is13/examples/elman-forward.py
```

To run the Jordan architecture:

```
python is13/examples/jordan-forward.py
```

## ATIS Data

Download the ATIS dataset here: [split 0](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold0.pkl.gz) [split 1](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold1.pkl.gz) [split 2](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold2.pkl.gz) [split 3](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold3.pkl.gz) [split 4](http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/atis.fold4.pkl.gz)

```
import cPickle
train, test, dicts = cPickle.load(open("atis.pkl"))
```

`dicts` is a Python dictionary that contains the mappings from the labels, the
named entities (if present) and the words to the indexes used in the `train` and
`test` lists. Refer to this [tutorial](http://deeplearning.net/tutorial/rnnslu.html) for more details.
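For instance, you can decode a sentence back into words and labels by inverting
the mappings — a minimal sketch, assuming `atis.pkl` sits in the working
directory:

```
import cPickle

train, test, dicts = cPickle.load(open("atis.pkl"))
idx2word  = dict((v, k) for k, v in dicts['words2idx'].iteritems())
idx2label = dict((v, k) for k, v in dicts['labels2idx'].iteritems())

train_x, train_ne, train_label = train
print ' '.join(idx2word[w] for w in train_x[0])       # first training sentence
print ' '.join(idx2label[l] for l in train_label[0])  # its slot labels
```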
Running the following command gives you an idea of how the data has been preprocessed:

```
python data/load.py
```

To download the intent labels, you may be interested in this [notebook](https://github.com/kpe/notebooks/blob/master/ms_cntk_atis_dataset_reader.ipynb).

## License

Creative Commons License
Recurrent Neural Network Architectures for Spoken Language Understanding by Grégoire Mesnil is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.
Based on a work at https://github.com/mesnilgr/is13.
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mesnilgr/is13/f25db1026d3b30d329992c803f793347164c371c/__init__.py
--------------------------------------------------------------------------------
/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mesnilgr/is13/f25db1026d3b30d329992c803f793347164c371c/data/__init__.py
--------------------------------------------------------------------------------
/data/load.py:
--------------------------------------------------------------------------------
import gzip
import cPickle
import urllib
import os
import random

from os.path import isfile

PREFIX = os.getenv('ATISDATA', '')

def download(origin):
    '''
    download the corresponding atis file
    from http://www-etud.iro.umontreal.ca/~mesnilgr/atis/
    '''
    print 'Downloading data from %s' % origin
    name = origin.split('/')[-1]
    urllib.urlretrieve(origin, name)

def download_dropbox(fold):
    '''
    download from Dropbox in the meantime
    '''
    assert fold in [1, 3], ('only folds 1 and 3 are available since UdeM '
                            'deleted my server account without notice. apologies.')
    if fold == 1:
        os.system('wget -O atis.fold1.pkl.gz https://www.dropbox.com/s/81ar3d2ck3yavic/atis.fold1.pkl.gz?dl=0')
    elif fold == 3:
        os.system('wget -O atis.fold3.pkl.gz https://www.dropbox.com/s/tinjzm1b22tozn8/atis.fold3.pkl.gz?dl=0')


def load_dropbox(filename, fold):
    if not isfile(filename):
        #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/'+filename)
        download_dropbox(fold)
    f = gzip.open(filename, 'rb')
    return f

def load_udem(filename):
    if not isfile(filename):
        download('http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/'+filename)
    f = gzip.open(filename, 'rb')
    return f


def atisfull():
    # load_dropbox() needs a fold number; the full set only lives on the UdeM mirror
    f = load_udem(PREFIX + 'atis.pkl')
    train_set, test_set, dicts = cPickle.load(f)
    return train_set, test_set, dicts

def atisfold(fold):
    assert fold in range(5)
    f = load_dropbox(PREFIX + 'atis.fold' + str(fold) + '.pkl.gz', fold)
    train_set, valid_set, test_set, dicts = cPickle.load(f)
    return train_set, valid_set, test_set, dicts

if __name__ == '__main__':

    ''' visualize a few sentences '''

    import pdb
    pdb.set_trace()
    train, _, test, dic = atisfold(3)

    w2idx, ne2idx, labels2idx = dic['words2idx'], dic['tables2idx'], dic['labels2idx']

    idx2w  = dict((v, k) for k, v in w2idx.iteritems())
    idx2ne = dict((v, k) for k, v in ne2idx.iteritems())
    idx2la = dict((v, k) for k, v in labels2idx.iteritems())

    test_x,  test_ne,  test_label  = test
    train_x, train_ne, train_label = train
    wlength = 35

    for e in ['train', 'test']:
        for sw, se, sl in zip(eval(e+'_x'), eval(e+'_ne'), eval(e+'_label')):
            print 'WORD'.rjust(wlength), 'LABEL'.rjust(wlength)
            for wx, la in zip(sw, sl):
                print idx2w[wx].rjust(wlength), idx2la[la].rjust(wlength)
            print '\n' + '**'*30 + '\n'
            pdb.set_trace()
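A quick sketch of what `atisfold` returns — structure inferred from how the
example scripts below consume the tuples; fold 3 is fetched on first call:

```
from is13.data import load

train, valid, test, dic = load.atisfold(3)  # downloads atis.fold3.pkl.gz if absent
train_x, train_ne, train_label = train      # one entry per sentence

print len(train_x)      # number of training sentences
print train_x[0]        # word indexes of the first sentence
print train_label[0]    # one label index per word (slot tags)
print dic.keys()        # the index maps: 'words2idx', 'tables2idx', 'labels2idx'
```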
--------------------------------------------------------------------------------
/examples/elman-forward.py:
--------------------------------------------------------------------------------
import numpy
import time
import sys
import subprocess
import os
import random

from is13.data import load
from is13.rnn.elman import model
from is13.metrics.accuracy import conlleval
from is13.utils.tools import shuffle, minibatch, contextwin

if __name__ == '__main__':

    s = {'fold':3, # 5 folds 0,1,2,3,4
         'lr':0.0627142536696559,
         'verbose':1,
         'decay':False, # decay on the learning rate if improvement stops
         'win':7, # number of words in the context window
         'bs':9, # number of backprop through time steps
         'nhidden':100, # number of hidden units
         'seed':345,
         'emb_dimension':100, # dimension of word embedding
         'nepochs':50}

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder): os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(s['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word  = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex,  test_ne,  test_y  = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model(    nh = s['nhidden'],
                    nc = nclasses,
                    ne = vocsize,
                    de = s['emb_dimension'],
                    cs = s['win'] )

    # train with early stopping on the validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(cwords, s['bs']))
            labels = train_y[i]
            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()
            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./nsentences), 'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test  = conlleval(predictions_test,  groundtruth_test,  words_test,  folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if s['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20
            s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            s['tf1'], s['tp'], s['tr'] = res_test['f1'],  res_test['p'],  res_test['r']
            s['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt',  folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        if s['decay'] and abs(s['be']-s['ce']) >= 10: s['clr'] *= 0.5
        if s['clr'] < 1e-5: break

    print 'BEST RESULT: epoch', s['be'], 'valid F1', s['vf1'], 'best test F1', s['tf1'], 'with the model', folder

--------------------------------------------------------------------------------
/examples/elman-keras.py:
--------------------------------------------------------------------------------
import numpy as np
import time
import sys
import subprocess
import os
import random

from is13.data import load
from is13.metrics.accuracy import conlleval
from is13.utils.tools import shuffle

from keras.models import Sequential
from keras.layers import (Embedding, SimpleRNN, Dense, Activation,
                          TimeDistributed)
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

if __name__ == '__main__':

    s = {'fold':3, # 5 folds 0,1,2,3,4
         'lr':0.1,
         'verbose':1,
         'nhidden':100, # number of hidden units
         'seed':345,
         'emb_dimension':100, # dimension of word embedding
         'nepochs':50}

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder): os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(s['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word  = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex,  test_ne,  test_y  = test_set

    vocsize = len(dic['words2idx'])
    nclasses = len(dic['labels2idx'])
    nsentences = len(train_lex)

    # instantiate the model
    np.random.seed(s['seed'])
    random.seed(s['seed'])

    model = Sequential()
    model.add(Embedding(vocsize, s['emb_dimension']))
    model.add(SimpleRNN(s['nhidden'], activation='sigmoid',
                        return_sequences=True))
    model.add(TimeDistributed(Dense(output_dim=nclasses)))
    model.add(Activation("softmax"))

    sgd = SGD(lr=s['lr'], momentum=0.0, decay=0.0, nesterov=False)
    model.compile(loss='categorical_crossentropy', optimizer=sgd,
                  metrics=['accuracy'])

    # train with early stopping on the validation set
    best_f1 = -np.inf
    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in xrange(nsentences):
            X = np.asarray([train_lex[i]])
            Y = to_categorical(np.asarray(train_y[i])[:, np.newaxis],
                               nclasses)[np.newaxis, :, :]
            if X.shape[1] == 1:
                continue # skip length-1 sentences: train_on_batch fails on them
            model.train_on_batch(X, Y)

            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./nsentences), 'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                model.predict_on_batch(
                                    np.asarray([x])).argmax(2)[0])
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [map(lambda x: idx2label[x],
                                 model.predict_on_batch(
                                     np.asarray([x])).argmax(2)[0])
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test  = conlleval(predictions_test,  groundtruth_test,  words_test,  folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            model.save_weights(folder + '/best_model.h5', overwrite=True)
            best_f1 = res_valid['f1']
            if s['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20
            s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            s['tf1'], s['tp'], s['tr'] = res_test['f1'],  res_test['p'],  res_test['r']
            s['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt',  folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

    print 'BEST RESULT: epoch', s['be'], 'valid F1', s['vf1'], 'best test F1', s['tf1'], 'with the model', folder

--------------------------------------------------------------------------------
/examples/jordan-forward.py:
--------------------------------------------------------------------------------
import numpy
import time
import sys
import subprocess
import os
import random

from is13.data import load
from is13.rnn.jordan import model
from is13.metrics.accuracy import conlleval
from is13.utils.tools import shuffle, minibatch, contextwin

if __name__ == '__main__':

    s = {'fold':3, # 5 folds 0,1,2,3,4
         'lr':0.0627142536696559,
         'verbose':1,
         'decay':False, # decay on the learning rate if improvement stops
         'win':7, # number of words in the context window
         'bs':9, # number of backprop through time steps
         'nhidden':100, # number of hidden units
         'seed':345,
         'emb_dimension':100, # dimension of word embedding
         'nepochs':50}

    folder = os.path.basename(__file__).split('.')[0]
    if not os.path.exists(folder): os.mkdir(folder)

    # load the dataset
    train_set, valid_set, test_set, dic = load.atisfold(s['fold'])
    idx2label = dict((k, v) for v, k in dic['labels2idx'].iteritems())
    idx2word  = dict((k, v) for v, k in dic['words2idx'].iteritems())

    train_lex, train_ne, train_y = train_set
    valid_lex, valid_ne, valid_y = valid_set
    test_lex,  test_ne,  test_y  = test_set

    vocsize = len(set(reduce(lambda x, y: list(x) + list(y),
                             train_lex + valid_lex + test_lex)))

    nclasses = len(set(reduce(lambda x, y: list(x) + list(y),
                              train_y + test_y + valid_y)))

    nsentences = len(train_lex)

    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model(    nh = s['nhidden'],
                    nc = nclasses,
                    ne = vocsize,
                    de = s['emb_dimension'],
                    cs = s['win'] )
    # train with early stopping on the validation set
    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in xrange(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words = map(lambda x: numpy.asarray(x).astype('int32'),
                        minibatch(cwords, s['bs']))
            labels = train_y[i]

            for word_batch, label_last_word in zip(words, labels):
                rnn.train(word_batch, label_last_word, s['clr'])
                rnn.normalize()

            if s['verbose']:
                print '[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./nsentences), 'completed in %.2f (sec) <<\r' % (time.time()-tic),
                sys.stdout.flush()

        # evaluation // back into the real world : idx -> words
        predictions_test = [map(lambda x: idx2label[x],
                                rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))
                            for x in test_lex]
        groundtruth_test = [map(lambda x: idx2label[x], y) for y in test_y]
        words_test = [map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [map(lambda x: idx2label[x],
                                 rnn.classify(numpy.asarray(contextwin(x, s['win'])).astype('int32')))
                             for x in valid_lex]
        groundtruth_valid = [map(lambda x: idx2label[x], y) for y in valid_y]
        words_valid = [map(lambda x: idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test  = conlleval(predictions_test,  groundtruth_test,  words_test,  folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            rnn.save(folder)
            best_f1 = res_valid['f1']
            if s['verbose']:
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20
            s['vf1'], s['vp'], s['vr'] = res_valid['f1'], res_valid['p'], res_valid['r']
            s['tf1'], s['tp'], s['tr'] = res_test['f1'],  res_test['p'],  res_test['r']
            s['be'] = e
            subprocess.call(['mv', folder + '/current.test.txt',  folder + '/best.test.txt'])
            subprocess.call(['mv', folder + '/current.valid.txt', folder + '/best.valid.txt'])
        else:
            print ''

        # learning rate decay if no improvement in 10 epochs
        if s['decay'] and abs(s['be']-s['ce']) >= 10: s['clr'] *= 0.5
        if s['clr'] < 1e-5: break

    print 'BEST RESULT: epoch', s['be'], 'valid F1', s['vf1'], 'best test F1', s['tf1'], 'with the model', folder

--------------------------------------------------------------------------------
/metrics/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mesnilgr/is13/f25db1026d3b30d329992c803f793347164c371c/metrics/__init__.py
--------------------------------------------------------------------------------
/metrics/accuracy.py:
--------------------------------------------------------------------------------
import numpy
import random
import os
import stat
import subprocess
from os.path import isfile
from os import chmod
from is13.data.load import download

PREFIX = os.getenv('ATISDATA', '')

def conlleval(p, g, w, filename):
    '''
    INPUT:
    p :: predictions
    g :: groundtruth
    w :: corresponding words

    OUTPUT:
    filename :: name of the file where the predictions
    are written. it will be the input of the conlleval.pl script
    for computing the performance in terms of precision,
    recall and F1 score
    '''
    out = ''
    for sl, sp, sw in zip(g, p, w):
        out += 'BOS O O\n'
        for wl, wp, word in zip(sl, sp, sw):
            out += word + ' ' + wl + ' ' + wp + '\n'
        out += 'EOS O O\n\n'

    f = open(filename, 'w')
    f.writelines(out)
    f.close()

    return get_perf(filename)

def get_perf(filename):
    ''' run the conlleval.pl perl script to obtain
    precision/recall and F1 score '''
    _conlleval = PREFIX + 'conlleval.pl'
    if not isfile(_conlleval):
        #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl')
        os.system('wget https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl')
        chmod('conlleval.pl', stat.S_IRWXU) # give the execute permissions

    proc = subprocess.Popen(["perl", _conlleval],
                            stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate(open(filename).read())
    for line in stdout.split('\n'):
        if 'accuracy' in line:
            out = line.split()
            break

    # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00']
    precision = float(out[3][:-2])
    recall    = float(out[5][:-2])
    f1score   = float(out[7])

    return {'p':precision, 'r':recall, 'f1':f1score}

def get_perfo(filename):
    '''
    workaround for calling a Perl script from Python;
    dirty, but it still works.
    '''
    tempfile = str(random.randint(1, numpy.iinfo('i').max)) + '.txt'
    if not isfile(PREFIX + 'conlleval.pl'):
        os.system('wget https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl')
        #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl')
        chmod('conlleval.pl', stat.S_IRWXU) # give the execute permissions
    if len(PREFIX) > 0:
        chmod(PREFIX + 'conlleval.pl', stat.S_IRWXU) # give the execute permissions
        cmd = PREFIX + 'conlleval.pl < %s | grep accuracy > %s' % (filename, tempfile)
    else:
        cmd = './conlleval.pl < %s | grep accuracy > %s' % (filename, tempfile)
    print cmd
    out = os.system(cmd)
    out = open(tempfile).readlines()[0].split()
    os.system('rm %s' % tempfile)
    precision = float(out[6][:-2])
    recall    = float(out[8][:-2])
    f1score   = float(out[10])
    return {'p':precision, 'r':recall, 'f1':f1score}

if __name__ == '__main__':
    print get_perf('valid.txt')

--------------------------------------------------------------------------------
/rnn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mesnilgr/is13/f25db1026d3b30d329992c803f793347164c371c/rnn/__init__.py
--------------------------------------------------------------------------------
/rnn/elman.py:
--------------------------------------------------------------------------------
import theano
import numpy
import os

from theano import tensor as T
from collections import OrderedDict

class model(object):

    def __init__(self, nh, nc, ne, de, cs):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        '''
        # parameters of the model
        self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,
                   (ne+1, de)).astype(theano.config.floatX)) # add one for PADDING at the end
        self.Wx = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,
                   (de * cs, nh)).astype(theano.config.floatX))
        self.Wh = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,
                   (nh, nh)).astype(theano.config.floatX))
        self.W = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,
                   (nh, nc)).astype(theano.config.floatX))
        self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))
        self.b  = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))
        self.h0 = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))

        # bundle
        self.params = [ self.emb, self.Wx, self.Wh, self.W, self.bh, self.b, self.h0 ]
        self.names  = ['embeddings', 'Wx', 'Wh', 'W', 'bh', 'b', 'h0']
        idxs = T.imatrix() # as many rows as words in the sentence, as many columns as the context window size
        x = self.emb[idxs].reshape((idxs.shape[0], de*cs))
        y = T.iscalar('y') # label of the last word

        def recurrence(x_t, h_tm1):
            h_t = T.nnet.sigmoid(T.dot(x_t, self.Wx) + T.dot(h_tm1, self.Wh) + self.bh)
            s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x, outputs_info=[self.h0, None],
                                n_steps=x.shape[0])

        p_y_given_x_lastword = s[-1, 0, :]
        p_y_given_x_sentence = s[:, 0, :]
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.log(p_y_given_x_lastword)[y]
        gradients = T.grad(nll, self.params)
        updates = OrderedDict((p, p - lr*g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)

        self.train = theano.function(inputs=[idxs, y, lr],
                                     outputs=nll,
                                     updates=updates)

        self.normalize = theano.function(inputs=[],
                                         updates={self.emb:
                                                  self.emb / T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0, 'x')})

    def save(self, folder):
        for param, name in zip(self.params, self.names):
            numpy.save(os.path.join(folder, name + '.npy'), param.get_value())

--------------------------------------------------------------------------------
/rnn/jordan.py:
--------------------------------------------------------------------------------
import theano
import numpy
import os

from theano import tensor as T
from collections import OrderedDict

class model(object):

    def __init__(self, nh, nc, ne, de, cs):
        '''
        nh :: dimension of the hidden layer
        nc :: number of classes
        ne :: number of word embeddings in the vocabulary
        de :: dimension of the word embeddings
        cs :: word window context size
        '''
        # parameters of the model
        self.emb = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,
                   (ne+1, de)).astype(theano.config.floatX)) # add one for PADDING at the end
        self.Wx = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,
                   (de * cs, nh)).astype(theano.config.floatX))
        self.Ws = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,
                   (nc, nh)).astype(theano.config.floatX))
        self.W = theano.shared(0.2 * numpy.random.uniform(-1.0, 1.0,
                   (nh, nc)).astype(theano.config.floatX))
        self.bh = theano.shared(numpy.zeros(nh, dtype=theano.config.floatX))
        self.b  = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))
        self.s0 = theano.shared(numpy.zeros(nc, dtype=theano.config.floatX))

        # bundle
        self.params = [ self.emb, self.Wx, self.Ws, self.W, self.bh, self.b, self.s0 ]
        self.names  = ['embeddings', 'Wx', 'Ws', 'W', 'bh', 'b', 's0']
        idxs = T.imatrix() # as many rows as words in the sentence, as many columns as the context window size
        x = self.emb[idxs].reshape((idxs.shape[0], de*cs))
        y = T.iscalar('y') # label of the last word

        def recurrence(x_t, s_tm1):
            h_t = T.nnet.sigmoid(T.dot(x_t, self.Wx) +
                                 T.dot(s_tm1, self.Ws) + self.bh)
            s_t = T.nnet.softmax(T.dot(h_t, self.W) + self.b)[0]
            return [h_t, s_t]

        [h, s], _ = theano.scan(fn=recurrence,
                                sequences=x, outputs_info=[None, self.s0],
                                n_steps=x.shape[0])

        p_y_given_x_lastword = s[-1, :]
        p_y_given_x_sentence = s
        y_pred = T.argmax(p_y_given_x_sentence, axis=1)

        # cost and gradients and learning rate
        lr = T.scalar('lr')
        nll = -T.log(p_y_given_x_lastword)[y]
        gradients = T.grad(nll, self.params)
        updates = OrderedDict((p, p - lr*g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(inputs=[idxs], outputs=y_pred)

        self.train = theano.function(inputs=[idxs, y, lr],
                                     outputs=nll,
                                     updates=updates)

        self.normalize = theano.function(inputs=[],
                                         updates={self.emb:
                                                  self.emb / T.sqrt((self.emb**2).sum(axis=1)).dimshuffle(0, 'x')})

    def save(self, folder):
        for param, name in zip(self.params, self.names):
            numpy.save(os.path.join(folder, name + '.npy'), param.get_value())

--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mesnilgr/is13/f25db1026d3b30d329992c803f793347164c371c/utils/__init__.py
--------------------------------------------------------------------------------
/utils/tools.py:
--------------------------------------------------------------------------------
import random

def shuffle(lol, seed):
    '''
    lol :: list of lists
    seed :: seed for the shuffling

    shuffle each list in place, all in the same order
    '''
    for l in lol:
        random.seed(seed)
        random.shuffle(l)

def minibatch(l, bs):
    '''
    l :: list of word idxs

    return a list of minibatches of indexes
    whose maximum size is bs
    border cases are treated as follows:
    e.g. [0,1,2,3] and bs = 3
    will output:
    [[0],[0,1],[0,1,2],[1,2,3]]
    '''
    out  = [l[:i] for i in xrange(1, min(bs, len(l)+1))]
    out += [l[i-bs:i] for i in xrange(bs, len(l)+1)]
    assert len(l) == len(out)
    return out

def contextwin(l, win):
    '''
    win :: int corresponding to the size of the window

    given a list of indexes composing a sentence,
    it will return a list of lists of indexes corresponding
    to the context window surrounding each word in the sentence
    (padded with -1 on both sides)
    '''
    assert (win % 2) == 1
    assert win >= 1
    l = list(l)

    lpadded = win/2 * [-1] + l + win/2 * [-1]
    out = [lpadded[i:i+win] for i in range(len(l))]

    assert len(out) == len(l)
    return out

--------------------------------------------------------------------------------
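As a quick sanity check of the two helpers in `utils/tools.py`, here is what
they produce on a toy sentence — the values follow directly from the
definitions above:

```
from is13.utils.tools import contextwin, minibatch

print contextwin([0, 1, 2, 3, 4], 3)
# [[-1, 0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, -1]]

print minibatch([0, 1, 2, 3, 4], 3)
# [[0], [0, 1], [0, 1, 2], [1, 2, 3], [2, 3, 4]]
```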