├── data └── coNLL │ ├── .DS_Store │ └── tmp │ ├── itol.pkl │ ├── itos.pkl │ ├── lbl_trn.npy │ ├── lbl_val.npy │ ├── trn_ids.npy │ ├── val_ids.npy │ ├── lbl_test.npy │ ├── test_ids.npy │ ├── trn_lm_ids.npy │ └── val_lm_ids.npy ├── .gitignore ├── LICENSE ├── sebastian ├── eval.py ├── evaluate_seq.py ├── create_toks_conll.py └── train_seq.py ├── README.md └── coNLL_three_layer.ipynb /data/coNLL/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/.DS_Store -------------------------------------------------------------------------------- /data/coNLL/tmp/itol.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/itol.pkl -------------------------------------------------------------------------------- /data/coNLL/tmp/itos.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/itos.pkl -------------------------------------------------------------------------------- /data/coNLL/tmp/lbl_trn.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/lbl_trn.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/lbl_val.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/lbl_val.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/trn_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/trn_ids.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/val_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/val_ids.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/lbl_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/lbl_test.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/test_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/test_ids.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/trn_lm_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/trn_lm_ids.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/val_lm_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/val_lm_ids.npy 
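The tmp/ arrays and pickles listed above are the serialized CoNLL splits written by sebastian/create_toks_conll.py (included further down): itos.pkl and itol.pkl map integer ids back to tokens and NER labels, and each .npy file stores one list of ids per sentence. A minimal inspection sketch, assuming the relative paths from the tree above and that allow_pickle=True is needed because the per-sentence arrays have object dtype on recent NumPy versions:

```python
import pickle
import numpy as np

# id-to-token and id-to-label mappings written by create_toks_conll.py
itos = pickle.load(open('data/coNLL/tmp/itos.pkl', 'rb'))
itol = pickle.load(open('data/coNLL/tmp/itol.pkl', 'rb'))

# each entry is one sentence stored as a list of ids, hence the object dtype
trn_ids = np.load('data/coNLL/tmp/trn_ids.npy', allow_pickle=True)
trn_lbls = np.load('data/coNLL/tmp/lbl_trn.npy', allow_pickle=True)

# decode the first training sentence and its labels back to strings
print([itos[i] for i in trn_ids[0]])
print([itol[i] for i in trn_lbls[0]])
```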
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | coNLL_two_layer.ipynb 3 | data/ 4 | imdb-coNLL-Copy1.ipynb 5 | imdb-coNLL-copy2.ipynb 6 | imdb-coNLL-copy3.ipynb 7 | model/ 8 | testing_cases.md 9 | .DS_Store 10 | .ipynb_checkpoints/ 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Hong Pengfei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /sebastian/eval.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from sklearn.metrics import accuracy_score 5 | from .create_toks_conll import PAD, BOS_LABEL 6 | 7 | from allennlp.training.metrics import SpanBasedF1Measure 8 | from allennlp.data.vocabulary import Vocabulary 9 | 10 | 11 | def get_acc(preds1, preds2, gold, weight=0.5): 12 | preds = np.exp(preds1) * weight + np.exp(preds2) * (1 - weight) 13 | preds = np.array([np.argmax(p) for p in preds]) 14 | return accuracy_score(gold, preds) 15 | 16 | 17 | def eval_ner(learner, id2label, is_test=False): 18 | # set up AllenNLP evaluation metric 19 | mode = 'Test' if is_test else 'Validation' 20 | id2label = [f'B-{l}' if l in [PAD, BOS_LABEL] else l for l in id2label] 21 | namespace = 'ner_labels' 22 | label_vocab = Vocabulary( 23 | non_padded_namespaces=(namespace,), 24 | tokens_to_add={namespace: id2label}) # create the tag vocabulary 25 | f1_metric = SpanBasedF1Measure(label_vocab, 26 | tag_namespace=namespace, 27 | ignore_classes=[PAD, BOS_LABEL]) 28 | preds, y = learner.predict_with_targs(is_test=is_test) 29 | # convert to tensors, add a batch dimension 30 | preds_tensor = torch.from_numpy(preds).unsqueeze(0) 31 | y_tensor = torch.from_numpy(y).unsqueeze(0) 32 | f1_metric(preds_tensor, y_tensor) 33 | all_metrics = f1_metric.get_metric(reset=True) 34 | print(f'{mode} f1 measure overall:', all_metrics['f1-measure-overall']) 35 | print(all_metrics) 36 | preds_fwd_ids = [np.argmax(p) for p in preds] 37 | acc_fwd = accuracy_score(y, preds_fwd_ids) 38 | print(f'{mode} token-level accuracy of NER model: %.4f.' 
% acc_fwd) 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastai_sequence_tagging 2 | sequence tagging for NER with ULMFiT 3 | 4 | ## data 5 | to replicate the results: 6 | download the ```data/``` folder from [here](https://www.dropbox.com/sh/z308tqyqrvakl66/AADsiYKx3vfNZ3LQGInz0Q-qa?dl=0) and put it in the root directory. 7 | 8 | ## run training 9 | I am currently running experiments in the Jupyter notebook ```coNLL_three_layer.ipynb``` 10 | 11 | ## files modified from lesson10.ipynb 12 | 1. concatenate the forward and backward outputs of the language model ```W_LM = [W_forward, W_backward]``` 13 | 14 | 2. feed word vectors from GloVe through a BiLSTM to get the output ```W_glove``` 15 | 16 | 3. concatenate these outputs ```W = [W_glove, W_LM]``` 17 | 18 | 4. feed ```W``` to another BiLSTM to get the final predictions. 19 | 20 | ## results 21 | F1 score of 76. 22 | 23 | (This needs to be improved by fine-tuning the hyperparameters, revisiting how the tokens are preprocessed, [adding char embeddings](http://alanakbik.github.io/papers/coling2018.pdf), and [adding a CRF layer](https://arxiv.org/abs/1603.01360).) 24 | 25 | ## questions 26 | 1. which layer of the language model should be used for the sequence tagging problem? 27 | 28 | 2. how to build a better language model for sequence tagging? 29 | 30 | ## relevant papers 31 | [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/pdf/1708.02182.pdf) 32 | 33 | [Deep contextualized word representations](https://arxiv.org/abs/1802.05365) 34 | 35 | [End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF](http://www.aclweb.org/anthology/P16-1101) 36 | 37 | [Semi-supervised sequence tagging with bidirectional language models](https://arxiv.org/abs/1705.00108) 38 | 39 | [Contextual String Embeddings for Sequence Labeling](http://alanakbik.github.io/papers/coling2018.pdf) 40 | -------------------------------------------------------------------------------- /sebastian/evaluate_seq.py: -------------------------------------------------------------------------------- 1 | import fire 2 | from fastai.text import * 3 | from fastai.lm_rnn import * 4 | 5 | from train_seq import get_rnn_seq_labeler, TextSeqDataset, SeqDataLoader 6 | from eval import eval_ner 7 | 8 | 9 | def evaluate(dir_path, cuda_id, clas_id='', bs=64, bpe=False): 10 | 11 | print(f'prefix {dir_path}; cuda_id {cuda_id}; bs {bs}; bpe {bpe}') 12 | if not hasattr(torch._C, '_cuda_setDevice'): 13 | print('CUDA not available. Setting device=-1.') 14 | cuda_id = -1 15 | torch.cuda.set_device(cuda_id) 16 | PRE_FWD = 'fwd_' 17 | PRE_FWD = 'bpe_' + PRE_FWD if bpe else PRE_FWD 18 | IDS = 'bpe' if bpe else 'ids' 19 | if clas_id != '': clas_id += '_' 20 | dir_path = Path(dir_path) 21 | fwd_clas_file = f'{PRE_FWD}{clas_id}clas_1' 22 | fwd_clas_path = dir_path / 'models' / f'{fwd_clas_file}.h5' 23 | assert fwd_clas_path.exists(), f'Error: {fwd_clas_path} does not exist.'
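# Note: besides the classifier checkpoint checked above, evaluate() expects the tmp/ id arrays,
# label arrays, and the itos.pkl/itol.pkl vocabularies written by create_toks_conll.py under
# dir_path, and the architecture hyperparameters set below must match the ones the checkpoint
# was trained with in train_seq.py.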
24 | 25 | bptt,em_sz,nh,nl = 70,400,1150,3 26 | opt_fn = partial(optim.Adam, betas=(0.7, 0.99)) 27 | 28 | trn_sent = np.load(dir_path / 'tmp' / f'trn_{IDS}.npy') 29 | val_sent = np.load(dir_path / 'tmp' / f'val_{IDS}.npy') 30 | test_sent = np.load(dir_path / 'tmp' / f'test_{IDS}.npy') 31 | 32 | trn_lbls = np.load(dir_path / 'tmp' / 'lbl_trn.npy') 33 | val_lbls = np.load(dir_path / 'tmp' / 'lbl_val.npy') 34 | test_lbls = np.load(dir_path / 'tmp' / f'lbl_test.npy') 35 | id2label = pickle.load(open(dir_path / 'tmp' / 'itol.pkl', 'rb')) 36 | print('id2label:', id2label) 37 | c = len(id2label) 38 | 39 | trn_ds = TextSeqDataset(trn_sent, trn_lbls) 40 | val_ds = TextSeqDataset(val_sent, val_lbls) 41 | test_ds = TextSeqDataset(test_sent, test_lbls) 42 | trn_samp = SortishSampler(trn_sent, key=lambda x: len(trn_sent[x]), bs=bs//2) 43 | val_samp = SortSampler(val_sent, key=lambda x: len(val_sent[x])) 44 | test_samp = SortSampler(test_sent, key=lambda x: len(test_sent[x])) 45 | trn_dl = SeqDataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=1, sampler=trn_samp) # TODO why transpose? Should we also transpose the labels? 46 | val_dl = SeqDataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=val_samp) 47 | test_dl = SeqDataLoader(test_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=test_samp) 48 | md = ModelData(dir_path, trn_dl, val_dl, test_dl) 49 | 50 | if bpe: vs=30002 51 | else: 52 | itos = pickle.load(open(dir_path / 'tmp' / f'itos.pkl', 'rb')) 53 | vs = len(itos) 54 | 55 | dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.9 56 | 57 | m = get_rnn_seq_labeler(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1, 58 | layers=[em_sz, 50, c], drops=[dps[4], 0.1], 59 | dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3]) 60 | 61 | learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn) 62 | learn.load(fwd_clas_file) 63 | 64 | eval_ner(learn, id2label, is_test=False) 65 | eval_ner(learn, id2label, is_test=True) 66 | 67 | if __name__ == '__main__': fire.Fire(evaluate) 68 | -------------------------------------------------------------------------------- /sebastian/create_toks_conll.py: -------------------------------------------------------------------------------- 1 | from fastai.text import * 2 | import fire 3 | 4 | BOS = 'xbos' # beginning-of-sentence tag 5 | FLD = 'xfld' # data field tag 6 | 7 | BOS_LABEL = '_bos_' 8 | PAD = '_pad_' 9 | 10 | re1 = re.compile(r' +') 11 | 12 | 13 | def read_file(filepath): 14 | assert os.path.exists(filepath) 15 | sentences = [] 16 | labels = [] 17 | with open(filepath, encoding='utf-8') as f: 18 | sentence = [BOS] 19 | sentence_labels = [BOS_LABEL] 20 | for line in f: 21 | if line == '\n': 22 | sentences.append(sentence) 23 | labels.append(sentence_labels) 24 | sentence = [BOS] # use xbos as the start of sentence token 25 | sentence_labels = [BOS_LABEL] 26 | else: 27 | sentence.append(line.split()[0].lower()) 28 | # label is generally in the last column 29 | sentence_labels.append(line.split()[-1]) 30 | if sentence: # some files, e.g. 
NER end on an empty line 31 | sentences.append(sentence) 32 | labels.append(sentence_labels) 33 | return sentences, labels 34 | 35 | 36 | def create_toks(prefix, max_vocab=30000, min_freq=1): 37 | PATH = f'data/nlp_seq/{prefix}/' 38 | 39 | names = {} 40 | if prefix == 'ner': 41 | names['train'] = 'train.txt' 42 | names['val'] = 'valid.txt' 43 | names['test'] = 'test.txt' 44 | else: 45 | raise ValueError(f'Filenames for {prefix} have to be added first.') 46 | paths = {} 47 | for split in ['train', 'val', 'test']: 48 | paths[split] = f'{PATH}{names[split]}' 49 | 50 | print(f'prefix {prefix} max_vocab {max_vocab} min_freq {min_freq}') 51 | 52 | os.makedirs(f'{PATH}tmp', exist_ok=True) 53 | trn_tok, trn_labels = read_file(paths['train']) 54 | val_tok, val_labels = read_file(paths['val']) 55 | test_tok, test_labels = read_file(paths['test']) 56 | 57 | for trn_t, trn_l in zip(trn_tok[:5], trn_labels[:5]): 58 | print('Sentence:', trn_t, 'labels:', trn_l) 59 | 60 | print(f'# of train: {len(trn_tok)}, # of val: {len(val_tok)},' 61 | f'# of test: {len(test_tok)}') 62 | 63 | freq = Counter(p for o in trn_tok for p in o) 64 | print(freq.most_common(25)) 65 | itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq] 66 | itos.insert(0, PAD) 67 | itos.insert(0, '_unk_') 68 | stoi = collections.defaultdict(lambda: 0, 69 | {v: k for k, v in enumerate(itos)}) 70 | print(len(itos)) 71 | 72 | trn_ids = np.array([[stoi[o] for o in p] for p in trn_tok]) 73 | val_ids = np.array([[stoi[o] for o in p] for p in val_tok]) 74 | test_ids = np.array([[stoi[o] for o in p] for p in test_tok]) 75 | 76 | # map the labels to ids 77 | freq = Counter(p for o in trn_labels for p in o) 78 | print(freq) 79 | itol = [l for l, c in freq.most_common()] 80 | itol.insert(1, PAD) # insert padding label at index 1 81 | print(itol) 82 | ltoi = {l: i for i, l in enumerate(itol)} 83 | trn_lbl_ids = np.array([[ltoi[o] for o in p] for p in trn_labels]) 84 | val_lbl_ids = np.array([[ltoi[o] for o in p] for p in val_labels]) 85 | test_lbl_ids = np.array([[ltoi[o] for o in p] for p in test_labels]) 86 | 87 | ids_joined = np.array([[stoi[o] for o in p] for p in trn_tok + val_tok + test_tok]) 88 | val_ids_joined = ids_joined[int(len(ids_joined)*0.9):] 89 | ids_joined = ids_joined[:int(len(ids_joined)*0.9)] 90 | 91 | np.save(f'{PATH}tmp/trn_ids.npy', trn_ids) 92 | np.save(f'{PATH}tmp/val_ids.npy', val_ids) 93 | np.save(f'{PATH}tmp/test_ids.npy', test_ids) 94 | np.save(f'{PATH}tmp/lbl_trn.npy', trn_lbl_ids) 95 | np.save(f'{PATH}tmp/lbl_val.npy', val_lbl_ids) 96 | np.save(f'{PATH}tmp/lbl_test.npy', test_lbl_ids) 97 | pickle.dump(itos, open(f'{PATH}tmp/itos.pkl', 'wb')) 98 | pickle.dump(itol, open(f'{PATH}tmp/itol.pkl', 'wb')) 99 | np.save(f'{PATH}tmp/trn_lm_ids.npy', ids_joined) 100 | np.save(f'{PATH}tmp/val_lm_ids.npy', val_ids_joined) 101 | 102 | 103 | if __name__ == '__main__': fire.Fire(create_toks) 104 | -------------------------------------------------------------------------------- /sebastian/train_seq.py: -------------------------------------------------------------------------------- 1 | import fire 2 | from fastai.text import * 3 | from fastai.lm_rnn import * 4 | 5 | from eval import eval_ner 6 | 7 | 8 | def freeze_all_but(learner, n): 9 | c=learner.get_layer_groups() 10 | for l in c: set_trainable(l, False) 11 | set_trainable(c[n], True) 12 | 13 | 14 | def get_rnn_seq_labeler(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False, 15 | dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5): 
16 | rnn_enc = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir, 17 | dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop) 18 | # return SequentialRNN(rnn_enc, LinearBlocks(layers, drops)) 19 | return SequentialRNN(rnn_enc, LinearDecoder(n_class, emb_sz, 0.1)) 20 | 21 | 22 | class MultiBatchSeqRNN(RNN_Encoder): 23 | def __init__(self, bptt, max_seq, *args, **kwargs): 24 | self.max_seq,self.bptt = max_seq,bptt 25 | super().__init__(*args, **kwargs) 26 | 27 | def concat(self, arrs): 28 | return [torch.cat([l[si] for l in arrs]) for si in range(len(arrs[0]))] 29 | 30 | def forward(self, input): 31 | sl,bs = input.size() 32 | for l in self.hidden: 33 | for h in l: h.data.zero_() 34 | # raw_outputs, outputs = [],[] 35 | raw_outputs, outputs = super().forward(input) 36 | # for i in range(0, sl, self.bptt): 37 | # r, o = super().forward(input[i: min(i+self.bptt, sl)]) 38 | # if i>(sl-self.max_seq): 39 | # raw_outputs.append(r) 40 | # outputs.append(o) 41 | # return self.concat(raw_outputs), self.concat(outputs) 42 | return raw_outputs, outputs 43 | 44 | 45 | class SeqDataLoader(DataLoader): 46 | def get_batch(self, indices): 47 | res = self.np_collate([self.dataset[i] for i in indices]) 48 | # res = self.np_collate([self.dataset[i] for i in indices], self.pad_idx) 49 | # if not self.transpose: return res 50 | # res[0] = res[0].T 51 | # print('First seq:', res[0][0]) 52 | # print('First labels:', res[1][0]) 53 | res[1] = np.reshape(res[1], -1) # reshape the labels to one sequence 54 | return res 55 | 56 | 57 | class TextSeqDataset(Dataset): 58 | def __init__(self, x, y, backwards=False, sos=None, eos=None): 59 | self.x,self.y,self.backwards,self.sos,self.eos = x,y,backwards,sos,eos 60 | 61 | def __getitem__(self, idx): 62 | x = self.x[idx] 63 | y = self.y[idx] # we need to get y as array 64 | if self.backwards: x = list(reversed(x)) 65 | if self.eos is not None: x = x + [self.eos] 66 | if self.sos is not None: x = [self.sos]+x 67 | return np.array(x),np.array(y) 68 | 69 | def __len__(self): return len(self.x) 70 | 71 | 72 | def train_seq(dir_path, cuda_id, lm_id='', clas_id=None, bs=64, cl=1, backwards=False, startat=0, unfreeze=True, 73 | lr=0.01, dropmult=1.0, pretrain=True, bpe=False, use_clr=True, 74 | use_regular_schedule=False, use_discriminative=True, last=False, chain_thaw=False, 75 | from_scratch=False, train_file_id=''): 76 | print(f'prefix {dir_path}; cuda_id {cuda_id}; lm_id {lm_id}; clas_id {clas_id}; bs {bs}; cl {cl}; backwards {backwards}; ' 77 | f'dropmult {dropmult} unfreeze {unfreeze} startat {startat}; pretrain {pretrain}; bpe {bpe}; use_clr {use_clr};' 78 | f'use_regular_schedule {use_regular_schedule}; use_discriminative {use_discriminative}; last {last};' 79 | f'chain_thaw {chain_thaw}; from_scratch {from_scratch}; train_file_id {train_file_id}') 80 | 81 | if not hasattr(torch._C, '_cuda_setDevice'): 82 | print('CUDA not available. 
Setting device=-1.') 83 | cuda_id = -1 84 | torch.cuda.set_device(cuda_id) 85 | PRE = 'bwd_' if backwards else 'fwd_' 86 | PRE = 'bpe_' + PRE if bpe else PRE 87 | IDS = 'bpe' if bpe else 'ids' 88 | dir_path = Path(dir_path) 89 | train_file_id = train_file_id if train_file_id == '' else f'_{train_file_id}' 90 | lm_id = lm_id if lm_id == '' else f'{lm_id}_' 91 | clas_id = lm_id if clas_id is None else clas_id 92 | clas_id = clas_id if clas_id == '' else f'{clas_id}_' 93 | lm_file = f'{PRE}{lm_id}lm_enc' 94 | lm_path = dir_path / 'models' / f'{lm_file}.h5' 95 | if not from_scratch: 96 | assert lm_path.exists(), f'Error: {lm_path} does not exist.' 97 | # bptt,em_sz,nh,nl = 70,400,1150,3 98 | bptt, em_sz, nh, nl = 70, 100, 100, 2 99 | 100 | opt_fn = partial(optim.Adam, betas=(0.8, 0.99)) 101 | 102 | if backwards: 103 | trn_sent = np.load(dir_path / 'tmp' / f'trn_{IDS}{train_file_id}_bwd.npy') 104 | val_sent = np.load(dir_path / 'tmp' / f'val_{IDS}_bwd.npy') 105 | test_sent = np.load(dir_path / 'tmp' / f'test_{IDS}_bwd.npy') 106 | else: 107 | trn_sent = np.load(dir_path / 'tmp' / f'trn_{IDS}{train_file_id}.npy') 108 | val_sent = np.load(dir_path / 'tmp' / f'val_{IDS}.npy') 109 | test_sent = np.load(dir_path / 'tmp' / f'test_{IDS}.npy') 110 | 111 | trn_lbls = np.load(dir_path / 'tmp' / f'lbl_trn{train_file_id}.npy') 112 | val_lbls = np.load(dir_path / 'tmp' / f'lbl_val.npy') 113 | test_lbls = np.load(dir_path / 'tmp' / f'lbl_test.npy') 114 | id2label = pickle.load(open(dir_path / 'tmp' / 'itol.pkl', 'rb')) 115 | c = len(id2label) 116 | 117 | if bpe: 118 | vs=30002 119 | else: 120 | id2token = pickle.load(open(dir_path / 'tmp' / 'itos.pkl', 'rb')) 121 | vs = len(id2token) 122 | 123 | print('Train sentences shape:', trn_sent.shape) 124 | print('Train labels shape:', trn_lbls.shape) 125 | print('Token ids:', [id2token[id_] for id_ in trn_sent[0]]) 126 | print('Label ids:', [id2label[id_] for id_ in trn_lbls[0]]) 127 | 128 | trn_ds = TextSeqDataset(trn_sent, trn_lbls) 129 | val_ds = TextSeqDataset(val_sent, val_lbls) 130 | test_ds = TextSeqDataset(test_sent, test_lbls) 131 | trn_samp = SortishSampler(trn_sent, key=lambda x: len(trn_sent[x]), bs=bs//2) 132 | val_samp = SortSampler(val_sent, key=lambda x: len(val_sent[x])) 133 | test_samp = SortSampler(test_sent, key=lambda x: len(test_sent[x])) 134 | trn_dl = SeqDataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=1, sampler=trn_samp) # TODO why transpose? Should we also transpose the labels? 135 | val_dl = SeqDataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=val_samp) 136 | test_dl = SeqDataLoader(test_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=test_samp) 137 | md = ModelData(dir_path, trn_dl, val_dl, test_dl) 138 | 139 | dps = np.array([0.4,0.5,0.05,0.3,0.4])*dropmult 140 | #dps = np.array([0.5, 0.4, 0.04, 0.3, 0.6])*dropmult 141 | #dps = np.array([0.65,0.48,0.039,0.335,0.34])*dropmult 142 | #dps = np.array([0.6,0.5,0.04,0.3,0.4])*dropmult 143 | 144 | m = get_rnn_seq_labeler(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1, 145 | layers=[em_sz, 50, c], drops=[dps[4], 0.1], 146 | dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3]) 147 | 148 | learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn) 149 | learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1) 150 | learn.clip=25. 
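# alpha/beta above are the AR/TAR activation-regularization terms from the AWD-LSTM paper
# (Merity et al., listed in the README), and clip applies gradient-norm clipping during training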
151 | learn.metrics = [accuracy] 152 | 153 | lrm = 2.6 154 | if use_discriminative: 155 | lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr]) 156 | else: 157 | lrs = lr 158 | wd = 1e-6 159 | if not from_scratch: 160 | print(f'Loading encoder from {lm_file}...') 161 | learn.load_encoder(lm_file) 162 | else: 163 | print('Training classifier from scratch. LM encoder is not loaded.') 164 | use_regular_schedule = True 165 | 166 | if (startat<1) and pretrain and not last and not chain_thaw and not from_scratch: 167 | learn.freeze_to(-1) 168 | learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1, 169 | use_clr=None if use_regular_schedule or not use_clr else (8,3)) 170 | learn.freeze_to(-2) 171 | learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1, 172 | use_clr=None if use_regular_schedule or not use_clr else (8, 3)) 173 | learn.save(f'{PRE}{clas_id}clas_0') 174 | elif startat==1: 175 | learn.load(f'{PRE}{clas_id}clas_0') 176 | 177 | if chain_thaw: 178 | lrs = np.array([0.0001, 0.0001, 0.0001, 0.0001, 0.001]) 179 | print('Using chain-thaw. Unfreezing all layers one at a time...') 180 | n_layers = len(learn.get_layer_groups()) 181 | print('# of layers:', n_layers) 182 | # fine-tune last layer 183 | learn.freeze_to(-1) 184 | print('Fine-tuning last layer...') 185 | learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1, 186 | use_clr=None if use_regular_schedule or not use_clr else (8,3)) 187 | n = 0 188 | # fine-tune all layers up to the second-last one 189 | while n < n_layers-1: 190 | print('Fine-tuning layer #%d.' % n) 191 | freeze_all_but(learn, n) 192 | learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1, 193 | use_clr=None if use_regular_schedule or not use_clr else (8,3)) 194 | n += 1 195 | 196 | if unfreeze: 197 | learn.unfreeze() 198 | else: 199 | learn.freeze_to(-3) 200 | 201 | if last: 202 | print('Fine-tuning only the last layer...') 203 | learn.freeze_to(-1) 204 | 205 | if use_regular_schedule: 206 | print('Using regular schedule. 
Setting use_clr=None, n_cycles=cl, cycle_len=None.') 207 | use_clr = None 208 | n_cycles = cl 209 | cl = None 210 | else: 211 | n_cycles = 1 212 | learn.fit(lrs, n_cycles, wds=wd, cycle_len=cl, use_clr=(8,8) if use_clr else None) 213 | print('Plotting lrs...') 214 | learn.sched.plot_lr() 215 | learn.save(f'{PRE}{clas_id}clas_1') 216 | 217 | eval_ner(learn, id2label, is_test=False) 218 | eval_ner(learn, id2label, is_test=True) 219 | 220 | if __name__ == '__main__': fire.Fire(train_seq) 221 | -------------------------------------------------------------------------------- /coNLL_three_layer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "% reload_ext autoreload\n", 10 | "% autoreload 2\n", 11 | "% matplotlib inline\n", 12 | "import os\n", 13 | "os.environ['CUDA_VISIBLE_DEVICES'] = '3'" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from fastai.text import *\n", 23 | "from fastai.lm_rnn import *\n", 24 | "from sebastian.eval import eval_ner" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 5, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "\"\"\"run this cell for only forward direction\"\"\"\n", 34 | "class LinearDecoder(nn.Module):\n", 35 | " initrange=0.1\n", 36 | " def __init__(self, n_out, n_hid, dropout, tie_encoder=None, bias=False):\n", 37 | " super().__init__()\n", 38 | " self.decoder = nn.Linear(n_hid, n_out, bias=bias)\n", 39 | " self.decoder.weight.data.uniform_(-self.initrange, self.initrange)\n", 40 | " self.dropout = LockedDropout(dropout)\n", 41 | " if bias: self.decoder.bias.data.zero_()\n", 42 | " if tie_encoder: self.decoder.weight = tie_encoder.weight\n", 43 | "\n", 44 | " def forward(self, input):\n", 45 | " raw_outputs, outputs = input\n", 46 | " output = self.dropout(outputs[-1])\n", 47 | " decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))\n", 48 | " result = decoded.view(-1, decoded.size(1))\n", 49 | " return result, raw_outputs, outputs\n", 50 | "\n", 51 | " \n", 52 | "class SequentialRNN(nn.Sequential):\n", 53 | " def reset(self):\n", 54 | " for c in self.children():\n", 55 | " if hasattr(c, 'reset'): c.reset()\n", 56 | " \n", 57 | " \n", 58 | "class RNN_Learner(Learner):\n", 59 | " def __init__(self, data, models, **kwargs):\n", 60 | " super().__init__(data, models, **kwargs)\n", 61 | "\n", 62 | " def _get_crit(self, data): return F.cross_entropy\n", 63 | " def fit(self, *args, **kwargs): return super().fit(*args, **kwargs, seq_first=True)\n", 64 | "\n", 65 | " def save_encoder(self, name): save_model(self.model[0], self.get_model_path(name))\n", 66 | " def load_encoder(self, name): load_model(self.model[0], self.get_model_path(name))\n", 67 | " \n", 68 | " \n", 69 | "class TextModel(BasicModel):\n", 70 | " def get_layer_groups(self):\n", 71 | " m = self.model[0]\n", 72 | " return [(m.encoder, m.dropouti), *zip(m.rnns, m.dropouths), (self.model[1])]\n", 73 | " \n", 74 | " \n", 75 | "def get_rnn_seq_labeler(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,\n", 76 | " dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, linear_decoder_dp=0.1):\n", 77 | " rnn_enc = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,\n", 78 | " dropouth=dropouth, 
dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)\n", 79 | " return SequentialRNN(rnn_enc, LinearDecoder(n_class, emb_sz, linear_decoder_dp))" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 99, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "\"\"\"run this cell for bidir\"\"\"\n", 89 | "class LinearDecoder_bidir(nn.Module):\n", 90 | " initrange=0.1\n", 91 | " def __init__(self, n_out, n_hid, dropout, tie_encoder=None, bias=False):\n", 92 | " super().__init__()\n", 93 | " self.decoder = nn.Linear(n_hid, n_out, bias=bias)\n", 94 | " self.decoder.weight.data.uniform_(-self.initrange, self.initrange)\n", 95 | " self.dropout = LockedDropout(dropout)\n", 96 | " if bias: self.decoder.bias.data.zero_()\n", 97 | " if tie_encoder: self.decoder.weight = tie_encoder.weight\n", 98 | "\n", 99 | " def forward(self, input):\n", 100 | " raw_outputs, outputs = input\n", 101 | " output = self.dropout(outputs)\n", 102 | " decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))\n", 103 | " result = decoded.view(-1, decoded.size(1))\n", 104 | " return result, raw_outputs, outputs\n", 105 | " \n", 106 | " \n", 107 | "##### rewrite sequentialRNN #####\n", 108 | "'''changed the class it inherits from nn.Sequential to Sequential'''\n", 109 | "class SequentialRNN_bidir(nn.Module):\n", 110 | " def __init__(self, rnn_enc_fw, rnn_enc_bw, linear_decoder, embedding_path, emb_sz, freeze_word2vec=False, wordvec_sz=300):\n", 111 | " super().__init__()\n", 112 | " self.rnn_enc_fw = rnn_enc_fw\n", 113 | " self.rnn_enc_bw = rnn_enc_bw\n", 114 | " self.linear_decoder = linear_decoder\n", 115 | " self.rnn_lm= nn.LSTM(input_size=emb_sz*2+wordvec_sz*2, hidden_size=emb_sz, num_layers=1, batch_first=True, bidirectional=True)\n", 116 | " weights = np.load(embedding_path)\n", 117 | " self.embedding = nn.Embedding.from_pretrained(T(weights), freeze=freeze_word2vec)\n", 118 | " self.rnn = nn.LSTM(input_size=wordvec_sz, hidden_size=wordvec_sz, num_layers=1, batch_first=True, bidirectional=True)\n", 119 | " def reset(self):\n", 120 | " for c in self.children():\n", 121 | " if hasattr(c, 'reset'): c.reset()\n", 122 | " def forward(self, input):\n", 123 | " input_fw = input\n", 124 | " lstm_out, (n_h, n_cell) = self.rnn(self.embedding(input))\n", 125 | " input_bw = V(np.array([o.cpu().numpy()[::-1] for o in input]))\n", 126 | " raw_outputs_fw, outputs_fw = self.rnn_enc_fw(input_fw)\n", 127 | " raw_outputs_bw, outputs_bw = self.rnn_enc_bw(input_bw)\n", 128 | " bs, sl, _ = outputs_bw[-1].size()\n", 129 | " idx = V(torch.LongTensor([i for i in range(sl-1, -1, -1)]))\n", 130 | " output_bw = outputs_bw[-1].index_select(1, idx)\n", 131 | " outputs_fw_bw = torch.cat([outputs_fw[-1], output_bw], dim=-1)\n", 132 | " \n", 133 | " ## concat forward raw_outputs & backward raw_outputs together\n", 134 | " raw_outputs_bw_ = []\n", 135 | " # concat them together\n", 136 | " for i in range(3):\n", 137 | " bs, sl, _ = raw_outputs_bw[i].size()\n", 138 | " idx = V(torch.LongTensor([i for i in range(sl-1, -1, -1)]))\n", 139 | " raw_output_bw = raw_outputs_bw[i].index_select(1, idx)\n", 140 | " raw_outputs_bw_.append(raw_output_bw)\n", 141 | " raw_outputs_fw_bw = [torch.cat([raw_outputs_fw[i], raw_outputs_bw_[i]]) for i in range(3)]\n", 142 | " # concat output from lstm_out and rnn_lm\n", 143 | " outputs_fw_bw = torch.cat([lstm_out, outputs_fw_bw], dim=-1)\n", 144 | " outputs_fw_bw, (n_h, n_cell) = self.rnn_lm(outputs_fw_bw)\n", 145 | " out = self.linear_decoder((raw_outputs_fw_bw, 
outputs_fw_bw.contiguous()))\n", 146 | " return out\n", 147 | "\n", 148 | " \n", 149 | "##### rewrite RNN Learner #####\n", 150 | "'''rewrite load_encoder to load the encoding modules'''\n", 151 | "class RNN_Learner_bidir(Learner):\n", 152 | " def __init__(self, data, models, **kwargs):\n", 153 | " super().__init__(data, models, **kwargs)\n", 154 | "\n", 155 | " def _get_crit(self, data): return F.cross_entropy\n", 156 | " def fit(self, *args, **kwargs): return super().fit(*args, **kwargs, seq_first=True)\n", 157 | "\n", 158 | " def save_encoder(self, name): save_model(self.model[0], self.get_model_path(name))\n", 159 | " def load_encoder(self, name_fw, name_bw): \n", 160 | " load_model(self.model.rnn_enc_fw, self.get_model_path(name_fw))\n", 161 | " load_model(self.model.rnn_enc_bw, self.get_model_path(name_bw))\n", 162 | "##### end #####\n", 163 | "\n", 164 | "\n", 165 | "##### rewrite textmodel #####\n", 166 | "'''get layer groups'''\n", 167 | "class TextModel_bidir(BasicModel):\n", 168 | " def get_layer_groups(self):\n", 169 | " m_fw = self.model.rnn_enc_fw\n", 170 | " m_bw = self.model.rnn_enc_bw\n", 171 | " return [(m_fw.encoder, m_fw.dropouti, m_bw.encoder, m_bw.dropouti), \n", 172 | " *zip(m_fw.rnns, m_fw.dropouths, m_bw.rnns, m_bw.dropouths), \n", 173 | " (self.model.embedding), (self.model.linear_decoder), (self.model.rnn), (self.model.rnn_lm)]\n", 174 | "\n", 175 | "\n", 176 | "def get_rnn_seq_labeler_bidir(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,\n", 177 | " dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, linear_decoder_dp=0.1, dir_path='', freeze_word2vec=False):\n", 178 | " rnn_enc = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,\n", 179 | " dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)\n", 180 | " rnn_enc_backward = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,\n", 181 | " dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)\n", 182 | " return SequentialRNN_bidir(rnn_enc, rnn_enc_backward, LinearDecoder_bidir(n_class, emb_sz*2, linear_decoder_dp), \n", 183 | " dir_path/'tmp'/'coNLL_embedding.npy', emb_sz, freeze_word2vec=freeze_word2vec, wordvec_sz=300)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 100, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "'''common functions'''\n", 193 | "def freeze_all_but(learner, n):\n", 194 | " c=learner.get_layer_groups()\n", 195 | " for l in c: set_trainable(l, False)\n", 196 | " set_trainable(c[n], True)\n", 197 | "\n", 198 | "class MultiBatchSeqRNN(RNN_Encoder):\n", 199 | " def __init__(self, bptt, max_seq, *args, **kwargs):\n", 200 | " self.max_seq,self.bptt = max_seq,bptt\n", 201 | " super().__init__(*args, **kwargs)\n", 202 | "\n", 203 | " def concat(self, arrs):\n", 204 | " return [torch.cat([l[si] for l in arrs]) for si in range(len(arrs[0]))]\n", 205 | "\n", 206 | " def forward(self, input):\n", 207 | " sl,bs = input.size()\n", 208 | " for l in self.hidden:\n", 209 | " for h in l: h.data.zero_()\n", 210 | " raw_outputs, outputs = super().forward(input)\n", 211 | " return raw_outputs, outputs\n", 212 | "\n", 213 | " \n", 214 | "class SeqDataLoader(DataLoader):\n", 215 | " def get_batch(self, indices):\n", 216 | " res = self.np_collate([self.dataset[i] for i in indices])\n", 217 | " res[1] = np.reshape(res[1], -1) # reshape the labels to one sequence\n", 218 | " return res\n", 219 | 
"\n", 220 | "\n", 221 | "class TextSeqDataset(Dataset):\n", 222 | " def __init__(self, x, y, backwards=False, sos=None, eos=None):\n", 223 | " self.x,self.y,self.backwards,self.sos,self.eos = x,y,backwards,sos,eos\n", 224 | "\n", 225 | " def __getitem__(self, idx):\n", 226 | " x = self.x[idx]\n", 227 | " y = self.y[idx] # we need to get y as array\n", 228 | " if self.backwards: x = list(reversed(x))\n", 229 | " if self.eos is not None: x = x + [self.eos]\n", 230 | " if self.sos is not None: x = [self.sos]+x\n", 231 | " return np.array(x),np.array(y)\n", 232 | "\n", 233 | " def __len__(self): return len(self.x)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 111, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "def train_seq(dir_path, lm_id='', train_file_id='', clas_id=None, bs=64, cl=1, bidir=False, startat=0, unfreeze=True,\n", 243 | " lr=0.01, dropmult=1.0, pretrain=True, bpe=False, use_clr=True,\n", 244 | " use_regular_schedule=False, use_discriminative=True, last=False, chain_thaw=False,\n", 245 | " from_scratch=False, freeze_word2vec=False, n_cycle=3, cycle_len=1, cycle_mult=2, linear_decoder_dp=0.1):\n", 246 | " \"\"\"hyperaparameter settings\"\"\"\n", 247 | " bptt,em_sz,nh,nl = 70,400,1150,3\n", 248 | "# bptt, em_sz, nh, nl = 70, 100, 100, 2\n", 249 | " dps = np.array([0.4,0.5,0.05,0.3,0.4])*dropmult\n", 250 | "# dps = np.array([0.4,0.5,0.05,0.3,0.7])*dropmult\n", 251 | "# dps = np.array([0.5, 0.4, 0.04, 0.3, 0.6])*dropmult\n", 252 | " #dps = np.array([0.65,0.48,0.039,0.335,0.34])*dropmult\n", 253 | "# dps = np.array([0.6,0.5,0.04,0.3,0.4])*dropmult\n", 254 | "\n", 255 | " print(f'prefix {dir_path}; lm_id {lm_id}; train_file_id {train_file_id}; clas_id {clas_id};'\n", 256 | " f' bs {bs}; cl {cl}; bidir {bidir}; '\n", 257 | " f'dropmult {dropmult} unfreeze {unfreeze} startat {startat}; pretrain {pretrain}; bpe {bpe}; use_clr {use_clr};'\n", 258 | " f' use_regular_schedule {use_regular_schedule}; use_discriminative {use_discriminative}; last {last};'\n", 259 | " f' chain_thaw {chain_thaw}; from_scratch {from_scratch}; freeze_word2vec {freeze_word2vec}; bptt {bptt};'\n", 260 | " f' em_sz {em_sz}; nh {nh}; nl {nl}; dropouts {dps}; dropmult {dropmult};'\n", 261 | " f' linear_decoder_dp {linear_decoder_dp}')\n", 262 | " dir_path = Path(dir_path)\n", 263 | " \n", 264 | " lm_file = dir_path/'models'/'lm1_enc'\n", 265 | " lm_file_bw = dir_path/'models'/'lm1_enc_backward'\n", 266 | "\n", 267 | " opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 268 | "\n", 269 | " \"\"\"load datasets\"\"\"\n", 270 | " trn_sent = np.load(dir_path / 'tmp' / f'trn_ids{train_file_id}.npy')\n", 271 | " val_sent = np.load(dir_path / 'tmp' / f'val_ids.npy')\n", 272 | " test_sent = np.load(dir_path / 'tmp' / f'test_ids.npy')\n", 273 | " trn_lbls = np.load(dir_path / 'tmp' / f'lbl_trn{train_file_id}.npy')\n", 274 | " val_lbls = np.load(dir_path / 'tmp' / f'lbl_val.npy')\n", 275 | " test_lbls = np.load(dir_path / 'tmp' / f'lbl_test.npy')\n", 276 | " id2label = pickle.load(open(dir_path / 'tmp' / 'itol.pkl', 'rb'))\n", 277 | " c = len(id2label)\n", 278 | "\n", 279 | " if bpe:\n", 280 | " vs=30002\n", 281 | " else:\n", 282 | " id2token = pickle.load(open(dir_path / 'tmp' / 'itos.pkl', 'rb'))\n", 283 | " vs = len(id2token)\n", 284 | "\n", 285 | " print('Train sentences shape:', trn_sent.shape)\n", 286 | " print('Train labels shape:', trn_lbls.shape)\n", 287 | " print('Token ids:', [id2token[id_] for id_ in trn_sent[0]])\n", 288 | " print('Label ids:', [id2label[id_] for 
id_ in trn_lbls[0]])\n", 289 | "\n", 290 | " trn_ds = TextSeqDataset(trn_sent, trn_lbls)\n", 291 | " val_ds = TextSeqDataset(val_sent, val_lbls)\n", 292 | " test_ds = TextSeqDataset(test_sent, test_lbls)\n", 293 | " trn_samp = SortishSampler(trn_sent, key=lambda x: len(trn_sent[x]), bs=bs//2)\n", 294 | " val_samp = SortSampler(val_sent, key=lambda x: len(val_sent[x]))\n", 295 | " test_samp = SortSampler(test_sent, key=lambda x: len(test_sent[x]))\n", 296 | " trn_dl = SeqDataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=1, sampler=trn_samp) # TODO why transpose? Should we also transpose the labels?\n", 297 | " val_dl = SeqDataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=val_samp)\n", 298 | " test_dl = SeqDataLoader(test_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=test_samp)\n", 299 | " md = ModelData(dir_path, trn_dl, val_dl, test_dl)\n", 300 | "\n", 301 | " if bidir:\n", 302 | " m = get_rnn_seq_labeler_bidir(bptt, 70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 303 | " layers=[em_sz, 50, c], drops=[dps[4], 0.1],\n", 304 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3], linear_decoder_dp=linear_decoder_dp, \n", 305 | " freeze_word2vec=freeze_word2vec, dir_path=dir_path, )\n", 306 | " learn = RNN_Learner_bidir(md, TextModel_bidir(to_gpu(m)), opt_fn=opt_fn)\n", 307 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 308 | " learn.clip=25.\n", 309 | " learn.metrics = [accuracy]\n", 310 | " else:\n", 311 | " m = get_rnn_seq_labeler(bptt, 70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 312 | " layers=[em_sz, 50, c], drops=[dps[4], 0.1],\n", 313 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3], linear_decoder_dp=linear_decoder_dp)\n", 314 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 315 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 316 | " learn.clip=25.\n", 317 | " learn.metrics = [accuracy]\n", 318 | "\n", 319 | " \n", 320 | "\n", 321 | " lrm = 2.6\n", 322 | " if use_discriminative:\n", 323 | "# lrs = np.array([lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 324 | " lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 325 | " else:\n", 326 | " lrs = lr\n", 327 | " wd = 1e-5\n", 328 | " if not from_scratch:\n", 329 | " print(f'Loading encoder from {lm_file}...')\n", 330 | " if bidir:\n", 331 | " learn.load_encoder(lm_file, lm_file_bw)\n", 332 | " else:\n", 333 | " learn.load_encoder(lm_file)\n", 334 | " else:\n", 335 | " print('Training classifier from scratch. 
LM encoder is not loaded.')\n", 336 | " use_regular_schedule = True\n", 337 | "\n", 338 | " if (startat<1) and pretrain and not last and not chain_thaw and not from_scratch:\n", 339 | " learn.freeze_to(-1)\n", 340 | " learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1,\n", 341 | " use_clr=None if use_regular_schedule or not use_clr else (8,3))\n", 342 | " learn.freeze_to(-2)\n", 343 | " learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1,\n", 344 | " use_clr=None if use_regular_schedule or not use_clr else (8, 3))\n", 345 | " learn.save(f'{PRE}{clas_id}clas_0')\n", 346 | " elif startat==1:\n", 347 | " learn.load(f'{PRE}{clas_id}clas_0')\n", 348 | "\n", 349 | " if chain_thaw:\n", 350 | " lrs = np.array([0.0001, 0.0001, 0.0001, 0.001])\n", 351 | " ## Emrys\n", 352 | " lrm = 4\n", 353 | " # the 4th is too big, and the word embedding and rnn can increase\n", 354 | " lrs = np.array([lr/(lrm**5), 2*lr/(lrm**5), lr/(lrm**4), lr/(lrm**4), 5e-4, lr/2, 7e-4, 1e-2])\n", 355 | "# lrf = learn.lr_find(lrs) # find the proper learning rate\n", 356 | "# learn.sched.plot()\n", 357 | " # end\n", 358 | " print(f'AWDLSTM learning_rate {lrs[:4]}; embedding_lr {lrs[4]}; linear_decoder_lr {lrs[5]}; rnn_lr {lrs[6]}; lm_lr {lrs[7]}; weight_decay {wd}')\n", 359 | " print('Using chain-thaw. Unfreezing all layers one at a time...')\n", 360 | " n_layers = len(learn.get_layer_groups())\n", 361 | " print('# of layers:', n_layers)\n", 362 | " # fine-tune last layer\n", 363 | " learn.freeze_to(-1)\n", 364 | " print('Fine-tuning layer #7')\n", 365 | " learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1,\n", 366 | " use_clr=None if use_regular_schedule or not use_clr else (8,3))\n", 367 | " n = n_layers-2\n", 368 | " # fine-tune all layers up to the second-last one\n", 369 | " while n>-1:\n", 370 | " print('Fine-tuning layer #%d.' % n)\n", 371 | " freeze_all_but(learn, n)\n", 372 | " learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1,\n", 373 | " use_clr=None if use_regular_schedule or not use_clr else (8,3))\n", 374 | " n -= 1\n", 375 | "\n", 376 | " if unfreeze:\n", 377 | " learn.unfreeze()\n", 378 | " else:\n", 379 | " learn.freeze_to(-3)\n", 380 | "\n", 381 | " if last:\n", 382 | " print('Fine-tuning only the last layer...')\n", 383 | " learn.freeze_to(-1)\n", 384 | "\n", 385 | " if use_regular_schedule:\n", 386 | " print('Using regular schedule. 
Setting use_clr=None, n_cycles=cl, cycle_len=None.')\n", 387 | " use_clr = None\n", 388 | " n_cycle = n_cycle\n", 389 | " cycle_len = None\n", 390 | " else:\n", 391 | " n_cycle = n_cycle\n", 392 | " print(f'n_cycle {n_cycle}; cycle_len {cycle_len}; cycle_mult {cycle_mult}; use_clr {use_clr}')\n", 393 | " learn.fit(lrs, n_cycle, wds=wd, cycle_len=cycle_len, cycle_mult=cycle_mult, use_clr=(8,8) if use_clr else None) # previously cycle_len=cl\n", 394 | " print('Plotting lrs...')\n", 395 | " learn.sched.plot_lr()\n", 396 | " clas_id = clas_id if clas_id is not None else lm_id\n", 397 | " bidir = 'bidir' if bidir else 'forward'\n", 398 | " learn.save(f'{clas_id}clas_1{bidir}')\n", 399 | "\n", 400 | " eval_ner(learn, id2label, is_test=False)\n", 401 | " eval_ner(learn, id2label, is_test=True)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "scrolled": false 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "prefix /fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/; lm_id ; train_file_id ; clas_id None; bs 64; cl 1; bidir True; dropmult 1 unfreeze True startat 0; pretrain True; bpe False; use_clr False; use_regular_schedule False; use_discriminative True; last False; chain_thaw True; from_scratch False; freeze_word2vec False; bptt 70; em_sz 400; nh 1150; nl 3; dropouts [0.4 0.5 0.05 0.3 0.4 ]; dropmult 1; linear_decoder_dp 0.2\n", 416 | "Train sentences shape: (14988,)\n", 417 | "Train labels shape: (14988,)\n", 418 | "Token ids: ['xbos', '-docstart-']\n", 419 | "Label ids: ['_bos_', 'O']\n", 420 | "Loading encoder from /fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/models/lm1_enc...\n", 421 | "AWDLSTM learning_rate [0.00001 0.00002 0.00004 0.00004]; embedding_lr 0.0005; linear_decoder_lr 0.005; rnn_lr 0.0007; lm_lr 0.01; weight_decay 1e-05\n", 422 | "Using chain-thaw. 
Unfreezing all layers one at a time...\n", 423 | "# of layers: 8\n", 424 | "Fine-tuning layer #7\n" 425 | ] 426 | }, 427 | { 428 | "data": { 429 | "application/vnd.jupyter.widget-view+json": { 430 | "model_id": "b93741108e674636a47a7f6ca46c18bd", 431 | "version_major": 2, 432 | "version_minor": 0 433 | }, 434 | "text/plain": [ 435 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 436 | ] 437 | }, 438 | "metadata": {}, 439 | "output_type": "display_data" 440 | }, 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "epoch trn_loss val_loss accuracy \n", 446 | " 0 0.159474 0.186139 0.948293 \n", 447 | "\n", 448 | "Fine-tuning layer #6.\n" 449 | ] 450 | }, 451 | { 452 | "data": { 453 | "application/vnd.jupyter.widget-view+json": { 454 | "model_id": "9b7e60369a684ecb924a25497c250fb7", 455 | "version_major": 2, 456 | "version_minor": 0 457 | }, 458 | "text/plain": [ 459 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 460 | ] 461 | }, 462 | "metadata": {}, 463 | "output_type": "display_data" 464 | }, 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "epoch trn_loss val_loss accuracy \n", 470 | " 0 0.122185 0.153179 0.959319 \n", 471 | "\n", 472 | "Fine-tuning layer #5.\n" 473 | ] 474 | }, 475 | { 476 | "data": { 477 | "application/vnd.jupyter.widget-view+json": { 478 | "model_id": "ca7cc14d072f411a81ef87f64b5a4fb9", 479 | "version_major": 2, 480 | "version_minor": 0 481 | }, 482 | "text/plain": [ 483 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 484 | ] 485 | }, 486 | "metadata": {}, 487 | "output_type": "display_data" 488 | }, 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "epoch trn_loss val_loss accuracy \n", 494 | " 0 0.098153 0.131593 0.960315 \n", 495 | "\n", 496 | "Fine-tuning layer #4.\n" 497 | ] 498 | }, 499 | { 500 | "data": { 501 | "application/vnd.jupyter.widget-view+json": { 502 | "model_id": "c2374ceed7014d008961059438b286c4", 503 | "version_major": 2, 504 | "version_minor": 0 505 | }, 506 | "text/plain": [ 507 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 508 | ] 509 | }, 510 | "metadata": {}, 511 | "output_type": "display_data" 512 | }, 513 | { 514 | "name": "stdout", 515 | "output_type": "stream", 516 | "text": [ 517 | "epoch trn_loss val_loss accuracy \n", 518 | " 0 0.093267 0.129416 0.961494 \n", 519 | "\n", 520 | "Fine-tuning layer #3.\n" 521 | ] 522 | }, 523 | { 524 | "data": { 525 | "application/vnd.jupyter.widget-view+json": { 526 | "model_id": "bb534368138846fda4fd5b13a7ddecb0", 527 | "version_major": 2, 528 | "version_minor": 0 529 | }, 530 | "text/plain": [ 531 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 532 | ] 533 | }, 534 | "metadata": {}, 535 | "output_type": "display_data" 536 | }, 537 | { 538 | "name": "stdout", 539 | "output_type": "stream", 540 | "text": [ 541 | "epoch trn_loss val_loss accuracy \n", 542 | " 0 0.102729 0.13574 0.961593 \n", 543 | "\n", 544 | "Fine-tuning layer #2.\n" 545 | ] 546 | }, 547 | { 548 | "data": { 549 | "application/vnd.jupyter.widget-view+json": { 550 | "model_id": "2820ea159b7447adb5d4afd295b53274", 551 | "version_major": 2, 552 | "version_minor": 0 553 | }, 554 | "text/plain": [ 555 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 556 | ] 557 | }, 558 | "metadata": {}, 559 | "output_type": 
"display_data" 560 | }, 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "epoch trn_loss val_loss accuracy \n", 566 | " 0 0.093768 0.136768 0.961793 \n", 567 | "\n", 568 | "Fine-tuning layer #1.\n" 569 | ] 570 | }, 571 | { 572 | "data": { 573 | "application/vnd.jupyter.widget-view+json": { 574 | "model_id": "44e6836351de4ee596359c36a2063c45", 575 | "version_major": 2, 576 | "version_minor": 0 577 | }, 578 | "text/plain": [ 579 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 580 | ] 581 | }, 582 | "metadata": {}, 583 | "output_type": "display_data" 584 | }, 585 | { 586 | "name": "stdout", 587 | "output_type": "stream", 588 | "text": [ 589 | "epoch trn_loss val_loss accuracy \n", 590 | " 0 0.100634 0.13684 0.96171 \n", 591 | "\n", 592 | "Fine-tuning layer #0.\n" 593 | ] 594 | }, 595 | { 596 | "data": { 597 | "application/vnd.jupyter.widget-view+json": { 598 | "model_id": "03c24cda9b29481cb0465fd7bc1e9383", 599 | "version_major": 2, 600 | "version_minor": 0 601 | }, 602 | "text/plain": [ 603 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 604 | ] 605 | }, 606 | "metadata": {}, 607 | "output_type": "display_data" 608 | }, 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "epoch trn_loss val_loss accuracy \n", 614 | " 0 0.091858 0.136862 0.96171 \n", 615 | "\n", 616 | "n_cycle 4; cycle_len 1; cycle_mult 2; use_clr False\n" 617 | ] 618 | }, 619 | { 620 | "data": { 621 | "application/vnd.jupyter.widget-view+json": { 622 | "model_id": "771c25ab715540ff90896e31b64ececa", 623 | "version_major": 2, 624 | "version_minor": 0 625 | }, 626 | "text/plain": [ 627 | "HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))" 628 | ] 629 | }, 630 | "metadata": {}, 631 | "output_type": "display_data" 632 | }, 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "epoch trn_loss val_loss accuracy \n", 638 | " 0 0.087901 0.120198 0.965296 \n", 639 | " \r" 640 | ] 641 | } 642 | ], 643 | "source": [ 644 | "train_seq('/fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/', lm_id='', train_file_id='', clas_id=None,\n", 645 | " bs=64, cl=1, bidir=True, startat=0, unfreeze=True,\n", 646 | " lr=0.01, dropmult=1, pretrain=True, bpe=False, use_clr=False,\n", 647 | " use_regular_schedule=False, use_discriminative=True, last=False, chain_thaw=True,\n", 648 | " from_scratch=False, freeze_word2vec=False, n_cycle=4, cycle_len=1, cycle_mult=2, linear_decoder_dp=0.2)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [] 657 | } 658 | ], 659 | "metadata": { 660 | "kernelspec": { 661 | "display_name": "Python 3", 662 | "language": "python", 663 | "name": "python3" 664 | }, 665 | "language_info": { 666 | "codemirror_mode": { 667 | "name": "ipython", 668 | "version": 3 669 | }, 670 | "file_extension": ".py", 671 | "mimetype": "text/x-python", 672 | "name": "python", 673 | "nbconvert_exporter": "python", 674 | "pygments_lexer": "ipython3", 675 | "version": "3.6.6" 676 | } 677 | }, 678 | "nbformat": 4, 679 | "nbformat_minor": 2 680 | } 681 | --------------------------------------------------------------------------------