├── data └── coNLL │ ├── .DS_Store │ └── tmp │ ├── itol.pkl │ ├── itos.pkl │ ├── lbl_trn.npy │ ├── lbl_val.npy │ ├── trn_ids.npy │ ├── val_ids.npy │ ├── lbl_test.npy │ ├── test_ids.npy │ ├── trn_lm_ids.npy │ └── val_lm_ids.npy ├── .gitignore ├── LICENSE ├── sebastian ├── eval.py ├── evaluate_seq.py ├── create_toks_conll.py └── train_seq.py ├── README.md └── coNLL_three_layer.ipynb /data/coNLL/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/.DS_Store -------------------------------------------------------------------------------- /data/coNLL/tmp/itol.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/itol.pkl -------------------------------------------------------------------------------- /data/coNLL/tmp/itos.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/itos.pkl -------------------------------------------------------------------------------- /data/coNLL/tmp/lbl_trn.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/lbl_trn.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/lbl_val.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/lbl_val.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/trn_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/trn_ids.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/val_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/val_ids.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/lbl_test.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/lbl_test.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/test_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/test_ids.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/trn_lm_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/trn_lm_ids.npy -------------------------------------------------------------------------------- /data/coNLL/tmp/val_lm_ids.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Emrys-Hong/fastai_sequence_tagging/HEAD/data/coNLL/tmp/val_lm_ids.npy 
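The tmp/ arrays and pickles listed above are the serialized CoNLL splits written by sebastian/create_toks_conll.py (included further down): itos.pkl and itol.pkl map integer ids back to tokens and NER labels, and each .npy file stores one list of ids per sentence. A minimal inspection sketch, assuming the relative paths from the tree above and that allow_pickle=True is needed because the per-sentence arrays have object dtype on recent NumPy versions:

```python
import pickle
import numpy as np

# id-to-token and id-to-label mappings written by create_toks_conll.py
itos = pickle.load(open('data/coNLL/tmp/itos.pkl', 'rb'))
itol = pickle.load(open('data/coNLL/tmp/itol.pkl', 'rb'))

# each entry is one sentence stored as a list of ids, hence the object dtype
trn_ids = np.load('data/coNLL/tmp/trn_ids.npy', allow_pickle=True)
trn_lbls = np.load('data/coNLL/tmp/lbl_trn.npy', allow_pickle=True)

# decode the first training sentence and its labels back to strings
print([itos[i] for i in trn_ids[0]])
print([itol[i] for i in trn_lbls[0]])
```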
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | coNLL_two_layer.ipynb 3 | data/ 4 | imdb-coNLL-Copy1.ipynb 5 | imdb-coNLL-copy2.ipynb 6 | imdb-coNLL-copy3.ipynb 7 | model/ 8 | testing_cases.md 9 | .DS_Store 10 | .ipynb_checkpoints/ 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Hong Pengfei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /sebastian/eval.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from sklearn.metrics import accuracy_score 5 | from .create_toks_conll import PAD, BOS_LABEL 6 | 7 | from allennlp.training.metrics import SpanBasedF1Measure 8 | from allennlp.data.vocabulary import Vocabulary 9 | 10 | 11 | def get_acc(preds1, preds2, gold, weight=0.5): 12 | preds = np.exp(preds1) * weight + np.exp(preds2) * (1 - weight) 13 | preds = np.array([np.argmax(p) for p in preds]) 14 | return accuracy_score(gold, preds) 15 | 16 | 17 | def eval_ner(learner, id2label, is_test=False): 18 | # set up AllenNLP evaluation metric 19 | mode = 'Test' if is_test else 'Validation' 20 | id2label = [f'B-{l}' if l in [PAD, BOS_LABEL] else l for l in id2label] 21 | namespace = 'ner_labels' 22 | label_vocab = Vocabulary( 23 | non_padded_namespaces=(namespace,), 24 | tokens_to_add={namespace: id2label}) # create the tag vocabulary 25 | f1_metric = SpanBasedF1Measure(label_vocab, 26 | tag_namespace=namespace, 27 | ignore_classes=[PAD, BOS_LABEL]) 28 | preds, y = learner.predict_with_targs(is_test=is_test) 29 | # convert to tensors, add a batch dimension 30 | preds_tensor = torch.from_numpy(preds).unsqueeze(0) 31 | y_tensor = torch.from_numpy(y).unsqueeze(0) 32 | f1_metric(preds_tensor, y_tensor) 33 | all_metrics = f1_metric.get_metric(reset=True) 34 | print(f'{mode} f1 measure overall:', all_metrics['f1-measure-overall']) 35 | print(all_metrics) 36 | preds_fwd_ids = [np.argmax(p) for p in preds] 37 | acc_fwd = accuracy_score(y, preds_fwd_ids) 38 | print(f'{mode} token-level accuracy of NER model: %.4f.' 
% acc_fwd) 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastai_sequence_tagging 2 | sequence tagging for NER with ULMFiT 3 | 4 | ## data 5 | to replicate the results: 6 | download the ```data/``` folder from [here](https://www.dropbox.com/sh/z308tqyqrvakl66/AADsiYKx3vfNZ3LQGInz0Q-qa?dl=0) and put it in the root directory. 7 | 8 | ## run training 9 | I am currently running experiments in the Jupyter notebook ```coNLL_three_layer.ipynb``` 10 | 11 | ## files modified from lesson10.ipynb 12 | 1. concatenate the forward and backward outputs of the language model ```W_LM = [W_forward, W_backward]``` 13 | 14 | 2. feed word vectors from GloVe through a BiLSTM to get the output ```W_glove``` 15 | 16 | 3. concatenate these outputs ```W = [W_glove, W_LM]``` 17 | 18 | 4. feed ```W``` to another BiLSTM to get the final predictions. 19 | 20 | ## results 21 | F1 score of 76. 22 | 23 | (This needs to be improved by fine-tuning the hyperparameters, revisiting how the tokens are preprocessed, [adding char embeddings](http://alanakbik.github.io/papers/coling2018.pdf), and [adding a CRF layer](https://arxiv.org/abs/1603.01360).) 24 | 25 | ## questions 26 | 1. which layer of the language model should be used for the sequence tagging problem? 27 | 28 | 2. how to build a better language model for sequence tagging? 29 | 30 | ## relevant papers 31 | [Regularizing and Optimizing LSTM Language Models](https://arxiv.org/pdf/1708.02182.pdf) 32 | 33 | [Deep contextualized word representations](https://arxiv.org/abs/1802.05365) 34 | 35 | [End-to-end Sequence Labeling via Bi-directional LSTM-CNNs-CRF](http://www.aclweb.org/anthology/P16-1101) 36 | 37 | [Semi-supervised sequence tagging with bidirectional language models](https://arxiv.org/abs/1705.00108) 38 | 39 | [Contextual String Embeddings for Sequence Labeling](http://alanakbik.github.io/papers/coling2018.pdf) 40 | -------------------------------------------------------------------------------- /sebastian/evaluate_seq.py: -------------------------------------------------------------------------------- 1 | import fire 2 | from fastai.text import * 3 | from fastai.lm_rnn import * 4 | 5 | from train_seq import get_rnn_seq_labeler, TextSeqDataset, SeqDataLoader 6 | from eval import eval_ner 7 | 8 | 9 | def evaluate(dir_path, cuda_id, clas_id='', bs=64, bpe=False): 10 | 11 | print(f'prefix {dir_path}; cuda_id {cuda_id}; bs {bs}; bpe {bpe}') 12 | if not hasattr(torch._C, '_cuda_setDevice'): 13 | print('CUDA not available. Setting device=-1.') 14 | cuda_id = -1 15 | torch.cuda.set_device(cuda_id) 16 | PRE_FWD = 'fwd_' 17 | PRE_FWD = 'bpe_' + PRE_FWD if bpe else PRE_FWD 18 | IDS = 'bpe' if bpe else 'ids' 19 | if clas_id != '': clas_id += '_' 20 | dir_path = Path(dir_path) 21 | fwd_clas_file = f'{PRE_FWD}{clas_id}clas_1' 22 | fwd_clas_path = dir_path / 'models' / f'{fwd_clas_file}.h5' 23 | assert fwd_clas_path.exists(), f'Error: {fwd_clas_path} does not exist.'
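# Note: besides the classifier checkpoint checked above, evaluate() expects the tmp/ id arrays,
# label arrays, and the itos.pkl/itol.pkl vocabularies written by create_toks_conll.py under
# dir_path, and the architecture hyperparameters set below must match the ones the checkpoint
# was trained with in train_seq.py.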
24 | 25 | bptt,em_sz,nh,nl = 70,400,1150,3 26 | opt_fn = partial(optim.Adam, betas=(0.7, 0.99)) 27 | 28 | trn_sent = np.load(dir_path / 'tmp' / f'trn_{IDS}.npy') 29 | val_sent = np.load(dir_path / 'tmp' / f'val_{IDS}.npy') 30 | test_sent = np.load(dir_path / 'tmp' / f'test_{IDS}.npy') 31 | 32 | trn_lbls = np.load(dir_path / 'tmp' / 'lbl_trn.npy') 33 | val_lbls = np.load(dir_path / 'tmp' / 'lbl_val.npy') 34 | test_lbls = np.load(dir_path / 'tmp' / f'lbl_test.npy') 35 | id2label = pickle.load(open(dir_path / 'tmp' / 'itol.pkl', 'rb')) 36 | print('id2label:', id2label) 37 | c = len(id2label) 38 | 39 | trn_ds = TextSeqDataset(trn_sent, trn_lbls) 40 | val_ds = TextSeqDataset(val_sent, val_lbls) 41 | test_ds = TextSeqDataset(test_sent, test_lbls) 42 | trn_samp = SortishSampler(trn_sent, key=lambda x: len(trn_sent[x]), bs=bs//2) 43 | val_samp = SortSampler(val_sent, key=lambda x: len(val_sent[x])) 44 | test_samp = SortSampler(test_sent, key=lambda x: len(test_sent[x])) 45 | trn_dl = SeqDataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=1, sampler=trn_samp) # TODO why transpose? Should we also transpose the labels? 46 | val_dl = SeqDataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=val_samp) 47 | test_dl = SeqDataLoader(test_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=test_samp) 48 | md = ModelData(dir_path, trn_dl, val_dl, test_dl) 49 | 50 | if bpe: vs=30002 51 | else: 52 | itos = pickle.load(open(dir_path / 'tmp' / f'itos.pkl', 'rb')) 53 | vs = len(itos) 54 | 55 | dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.9 56 | 57 | m = get_rnn_seq_labeler(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1, 58 | layers=[em_sz, 50, c], drops=[dps[4], 0.1], 59 | dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3]) 60 | 61 | learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn) 62 | learn.load(fwd_clas_file) 63 | 64 | eval_ner(learn, id2label, is_test=False) 65 | eval_ner(learn, id2label, is_test=True) 66 | 67 | if __name__ == '__main__': fire.Fire(evaluate) 68 | -------------------------------------------------------------------------------- /sebastian/create_toks_conll.py: -------------------------------------------------------------------------------- 1 | from fastai.text import * 2 | import fire 3 | 4 | BOS = 'xbos' # beginning-of-sentence tag 5 | FLD = 'xfld' # data field tag 6 | 7 | BOS_LABEL = '_bos_' 8 | PAD = '_pad_' 9 | 10 | re1 = re.compile(r' +') 11 | 12 | 13 | def read_file(filepath): 14 | assert os.path.exists(filepath) 15 | sentences = [] 16 | labels = [] 17 | with open(filepath, encoding='utf-8') as f: 18 | sentence = [BOS] 19 | sentence_labels = [BOS_LABEL] 20 | for line in f: 21 | if line == '\n': 22 | sentences.append(sentence) 23 | labels.append(sentence_labels) 24 | sentence = [BOS] # use xbos as the start of sentence token 25 | sentence_labels = [BOS_LABEL] 26 | else: 27 | sentence.append(line.split()[0].lower()) 28 | # label is generally in the last column 29 | sentence_labels.append(line.split()[-1]) 30 | if sentence: # some files, e.g. 
NER end on an empty line 31 | sentences.append(sentence) 32 | labels.append(sentence_labels) 33 | return sentences, labels 34 | 35 | 36 | def create_toks(prefix, max_vocab=30000, min_freq=1): 37 | PATH = f'data/nlp_seq/{prefix}/' 38 | 39 | names = {} 40 | if prefix == 'ner': 41 | names['train'] = 'train.txt' 42 | names['val'] = 'valid.txt' 43 | names['test'] = 'test.txt' 44 | else: 45 | raise ValueError(f'Filenames for {prefix} have to be added first.') 46 | paths = {} 47 | for split in ['train', 'val', 'test']: 48 | paths[split] = f'{PATH}{names[split]}' 49 | 50 | print(f'prefix {prefix} max_vocab {max_vocab} min_freq {min_freq}') 51 | 52 | os.makedirs(f'{PATH}tmp', exist_ok=True) 53 | trn_tok, trn_labels = read_file(paths['train']) 54 | val_tok, val_labels = read_file(paths['val']) 55 | test_tok, test_labels = read_file(paths['test']) 56 | 57 | for trn_t, trn_l in zip(trn_tok[:5], trn_labels[:5]): 58 | print('Sentence:', trn_t, 'labels:', trn_l) 59 | 60 | print(f'# of train: {len(trn_tok)}, # of val: {len(val_tok)},' 61 | f'# of test: {len(test_tok)}') 62 | 63 | freq = Counter(p for o in trn_tok for p in o) 64 | print(freq.most_common(25)) 65 | itos = [o for o, c in freq.most_common(max_vocab) if c > min_freq] 66 | itos.insert(0, PAD) 67 | itos.insert(0, '_unk_') 68 | stoi = collections.defaultdict(lambda: 0, 69 | {v: k for k, v in enumerate(itos)}) 70 | print(len(itos)) 71 | 72 | trn_ids = np.array([[stoi[o] for o in p] for p in trn_tok]) 73 | val_ids = np.array([[stoi[o] for o in p] for p in val_tok]) 74 | test_ids = np.array([[stoi[o] for o in p] for p in test_tok]) 75 | 76 | # map the labels to ids 77 | freq = Counter(p for o in trn_labels for p in o) 78 | print(freq) 79 | itol = [l for l, c in freq.most_common()] 80 | itol.insert(1, PAD) # insert padding label at index 1 81 | print(itol) 82 | ltoi = {l: i for i, l in enumerate(itol)} 83 | trn_lbl_ids = np.array([[ltoi[o] for o in p] for p in trn_labels]) 84 | val_lbl_ids = np.array([[ltoi[o] for o in p] for p in val_labels]) 85 | test_lbl_ids = np.array([[ltoi[o] for o in p] for p in test_labels]) 86 | 87 | ids_joined = np.array([[stoi[o] for o in p] for p in trn_tok + val_tok + test_tok]) 88 | val_ids_joined = ids_joined[int(len(ids_joined)*0.9):] 89 | ids_joined = ids_joined[:int(len(ids_joined)*0.9)] 90 | 91 | np.save(f'{PATH}tmp/trn_ids.npy', trn_ids) 92 | np.save(f'{PATH}tmp/val_ids.npy', val_ids) 93 | np.save(f'{PATH}tmp/test_ids.npy', test_ids) 94 | np.save(f'{PATH}tmp/lbl_trn.npy', trn_lbl_ids) 95 | np.save(f'{PATH}tmp/lbl_val.npy', val_lbl_ids) 96 | np.save(f'{PATH}tmp/lbl_test.npy', test_lbl_ids) 97 | pickle.dump(itos, open(f'{PATH}tmp/itos.pkl', 'wb')) 98 | pickle.dump(itol, open(f'{PATH}tmp/itol.pkl', 'wb')) 99 | np.save(f'{PATH}tmp/trn_lm_ids.npy', ids_joined) 100 | np.save(f'{PATH}tmp/val_lm_ids.npy', val_ids_joined) 101 | 102 | 103 | if __name__ == '__main__': fire.Fire(create_toks) 104 | -------------------------------------------------------------------------------- /sebastian/train_seq.py: -------------------------------------------------------------------------------- 1 | import fire 2 | from fastai.text import * 3 | from fastai.lm_rnn import * 4 | 5 | from eval import eval_ner 6 | 7 | 8 | def freeze_all_but(learner, n): 9 | c=learner.get_layer_groups() 10 | for l in c: set_trainable(l, False) 11 | set_trainable(c[n], True) 12 | 13 | 14 | def get_rnn_seq_labeler(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False, 15 | dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5): 
16 | rnn_enc = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir, 17 | dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop) 18 | # return SequentialRNN(rnn_enc, LinearBlocks(layers, drops)) 19 | return SequentialRNN(rnn_enc, LinearDecoder(n_class, emb_sz, 0.1)) 20 | 21 | 22 | class MultiBatchSeqRNN(RNN_Encoder): 23 | def __init__(self, bptt, max_seq, *args, **kwargs): 24 | self.max_seq,self.bptt = max_seq,bptt 25 | super().__init__(*args, **kwargs) 26 | 27 | def concat(self, arrs): 28 | return [torch.cat([l[si] for l in arrs]) for si in range(len(arrs[0]))] 29 | 30 | def forward(self, input): 31 | sl,bs = input.size() 32 | for l in self.hidden: 33 | for h in l: h.data.zero_() 34 | # raw_outputs, outputs = [],[] 35 | raw_outputs, outputs = super().forward(input) 36 | # for i in range(0, sl, self.bptt): 37 | # r, o = super().forward(input[i: min(i+self.bptt, sl)]) 38 | # if i>(sl-self.max_seq): 39 | # raw_outputs.append(r) 40 | # outputs.append(o) 41 | # return self.concat(raw_outputs), self.concat(outputs) 42 | return raw_outputs, outputs 43 | 44 | 45 | class SeqDataLoader(DataLoader): 46 | def get_batch(self, indices): 47 | res = self.np_collate([self.dataset[i] for i in indices]) 48 | # res = self.np_collate([self.dataset[i] for i in indices], self.pad_idx) 49 | # if not self.transpose: return res 50 | # res[0] = res[0].T 51 | # print('First seq:', res[0][0]) 52 | # print('First labels:', res[1][0]) 53 | res[1] = np.reshape(res[1], -1) # reshape the labels to one sequence 54 | return res 55 | 56 | 57 | class TextSeqDataset(Dataset): 58 | def __init__(self, x, y, backwards=False, sos=None, eos=None): 59 | self.x,self.y,self.backwards,self.sos,self.eos = x,y,backwards,sos,eos 60 | 61 | def __getitem__(self, idx): 62 | x = self.x[idx] 63 | y = self.y[idx] # we need to get y as array 64 | if self.backwards: x = list(reversed(x)) 65 | if self.eos is not None: x = x + [self.eos] 66 | if self.sos is not None: x = [self.sos]+x 67 | return np.array(x),np.array(y) 68 | 69 | def __len__(self): return len(self.x) 70 | 71 | 72 | def train_seq(dir_path, cuda_id, lm_id='', clas_id=None, bs=64, cl=1, backwards=False, startat=0, unfreeze=True, 73 | lr=0.01, dropmult=1.0, pretrain=True, bpe=False, use_clr=True, 74 | use_regular_schedule=False, use_discriminative=True, last=False, chain_thaw=False, 75 | from_scratch=False, train_file_id=''): 76 | print(f'prefix {dir_path}; cuda_id {cuda_id}; lm_id {lm_id}; clas_id {clas_id}; bs {bs}; cl {cl}; backwards {backwards}; ' 77 | f'dropmult {dropmult} unfreeze {unfreeze} startat {startat}; pretrain {pretrain}; bpe {bpe}; use_clr {use_clr};' 78 | f'use_regular_schedule {use_regular_schedule}; use_discriminative {use_discriminative}; last {last};' 79 | f'chain_thaw {chain_thaw}; from_scratch {from_scratch}; train_file_id {train_file_id}') 80 | 81 | if not hasattr(torch._C, '_cuda_setDevice'): 82 | print('CUDA not available. 
Setting device=-1.') 83 | cuda_id = -1 84 | torch.cuda.set_device(cuda_id) 85 | PRE = 'bwd_' if backwards else 'fwd_' 86 | PRE = 'bpe_' + PRE if bpe else PRE 87 | IDS = 'bpe' if bpe else 'ids' 88 | dir_path = Path(dir_path) 89 | train_file_id = train_file_id if train_file_id == '' else f'_{train_file_id}' 90 | lm_id = lm_id if lm_id == '' else f'{lm_id}_' 91 | clas_id = lm_id if clas_id is None else clas_id 92 | clas_id = clas_id if clas_id == '' else f'{clas_id}_' 93 | lm_file = f'{PRE}{lm_id}lm_enc' 94 | lm_path = dir_path / 'models' / f'{lm_file}.h5' 95 | if not from_scratch: 96 | assert lm_path.exists(), f'Error: {lm_path} does not exist.' 97 | # bptt,em_sz,nh,nl = 70,400,1150,3 98 | bptt, em_sz, nh, nl = 70, 100, 100, 2 99 | 100 | opt_fn = partial(optim.Adam, betas=(0.8, 0.99)) 101 | 102 | if backwards: 103 | trn_sent = np.load(dir_path / 'tmp' / f'trn_{IDS}{train_file_id}_bwd.npy') 104 | val_sent = np.load(dir_path / 'tmp' / f'val_{IDS}_bwd.npy') 105 | test_sent = np.load(dir_path / 'tmp' / f'test_{IDS}_bwd.npy') 106 | else: 107 | trn_sent = np.load(dir_path / 'tmp' / f'trn_{IDS}{train_file_id}.npy') 108 | val_sent = np.load(dir_path / 'tmp' / f'val_{IDS}.npy') 109 | test_sent = np.load(dir_path / 'tmp' / f'test_{IDS}.npy') 110 | 111 | trn_lbls = np.load(dir_path / 'tmp' / f'lbl_trn{train_file_id}.npy') 112 | val_lbls = np.load(dir_path / 'tmp' / f'lbl_val.npy') 113 | test_lbls = np.load(dir_path / 'tmp' / f'lbl_test.npy') 114 | id2label = pickle.load(open(dir_path / 'tmp' / 'itol.pkl', 'rb')) 115 | c = len(id2label) 116 | 117 | if bpe: 118 | vs=30002 119 | else: 120 | id2token = pickle.load(open(dir_path / 'tmp' / 'itos.pkl', 'rb')) 121 | vs = len(id2token) 122 | 123 | print('Train sentences shape:', trn_sent.shape) 124 | print('Train labels shape:', trn_lbls.shape) 125 | print('Token ids:', [id2token[id_] for id_ in trn_sent[0]]) 126 | print('Label ids:', [id2label[id_] for id_ in trn_lbls[0]]) 127 | 128 | trn_ds = TextSeqDataset(trn_sent, trn_lbls) 129 | val_ds = TextSeqDataset(val_sent, val_lbls) 130 | test_ds = TextSeqDataset(test_sent, test_lbls) 131 | trn_samp = SortishSampler(trn_sent, key=lambda x: len(trn_sent[x]), bs=bs//2) 132 | val_samp = SortSampler(val_sent, key=lambda x: len(val_sent[x])) 133 | test_samp = SortSampler(test_sent, key=lambda x: len(test_sent[x])) 134 | trn_dl = SeqDataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=1, sampler=trn_samp) # TODO why transpose? Should we also transpose the labels? 135 | val_dl = SeqDataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=val_samp) 136 | test_dl = SeqDataLoader(test_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=test_samp) 137 | md = ModelData(dir_path, trn_dl, val_dl, test_dl) 138 | 139 | dps = np.array([0.4,0.5,0.05,0.3,0.4])*dropmult 140 | #dps = np.array([0.5, 0.4, 0.04, 0.3, 0.6])*dropmult 141 | #dps = np.array([0.65,0.48,0.039,0.335,0.34])*dropmult 142 | #dps = np.array([0.6,0.5,0.04,0.3,0.4])*dropmult 143 | 144 | m = get_rnn_seq_labeler(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1, 145 | layers=[em_sz, 50, c], drops=[dps[4], 0.1], 146 | dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3]) 147 | 148 | learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn) 149 | learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1) 150 | learn.clip=25. 
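# alpha/beta above are the AR/TAR activation-regularization terms from the AWD-LSTM paper
# (Merity et al., listed in the README), and clip applies gradient-norm clipping during training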
151 | learn.metrics = [accuracy] 152 | 153 | lrm = 2.6 154 | if use_discriminative: 155 | lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr]) 156 | else: 157 | lrs = lr 158 | wd = 1e-6 159 | if not from_scratch: 160 | print(f'Loading encoder from {lm_file}...') 161 | learn.load_encoder(lm_file) 162 | else: 163 | print('Training classifier from scratch. LM encoder is not loaded.') 164 | use_regular_schedule = True 165 | 166 | if (startat<1) and pretrain and not last and not chain_thaw and not from_scratch: 167 | learn.freeze_to(-1) 168 | learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1, 169 | use_clr=None if use_regular_schedule or not use_clr else (8,3)) 170 | learn.freeze_to(-2) 171 | learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1, 172 | use_clr=None if use_regular_schedule or not use_clr else (8, 3)) 173 | learn.save(f'{PRE}{clas_id}clas_0') 174 | elif startat==1: 175 | learn.load(f'{PRE}{clas_id}clas_0') 176 | 177 | if chain_thaw: 178 | lrs = np.array([0.0001, 0.0001, 0.0001, 0.0001, 0.001]) 179 | print('Using chain-thaw. Unfreezing all layers one at a time...') 180 | n_layers = len(learn.get_layer_groups()) 181 | print('# of layers:', n_layers) 182 | # fine-tune last layer 183 | learn.freeze_to(-1) 184 | print('Fine-tuning last layer...') 185 | learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1, 186 | use_clr=None if use_regular_schedule or not use_clr else (8,3)) 187 | n = 0 188 | # fine-tune all layers up to the second-last one 189 | while n < n_layers-1: 190 | print('Fine-tuning layer #%d.' % n) 191 | freeze_all_but(learn, n) 192 | learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1, 193 | use_clr=None if use_regular_schedule or not use_clr else (8,3)) 194 | n += 1 195 | 196 | if unfreeze: 197 | learn.unfreeze() 198 | else: 199 | learn.freeze_to(-3) 200 | 201 | if last: 202 | print('Fine-tuning only the last layer...') 203 | learn.freeze_to(-1) 204 | 205 | if use_regular_schedule: 206 | print('Using regular schedule. 
Setting use_clr=None, n_cycles=cl, cycle_len=None.') 207 | use_clr = None 208 | n_cycles = cl 209 | cl = None 210 | else: 211 | n_cycles = 1 212 | learn.fit(lrs, n_cycles, wds=wd, cycle_len=cl, use_clr=(8,8) if use_clr else None) 213 | print('Plotting lrs...') 214 | learn.sched.plot_lr() 215 | learn.save(f'{PRE}{clas_id}clas_1') 216 | 217 | eval_ner(learn, id2label, is_test=False) 218 | eval_ner(learn, id2label, is_test=True) 219 | 220 | if __name__ == '__main__': fire.Fire(train_seq) 221 | -------------------------------------------------------------------------------- /coNLL_three_layer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "% reload_ext autoreload\n", 10 | "% autoreload 2\n", 11 | "% matplotlib inline\n", 12 | "import os\n", 13 | "os.environ['CUDA_VISIBLE_DEVICES'] = '3'" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 4, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "from fastai.text import *\n", 23 | "from fastai.lm_rnn import *\n", 24 | "from sebastian.eval import eval_ner" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 5, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "\"\"\"run this cell for only forward direction\"\"\"\n", 34 | "class LinearDecoder(nn.Module):\n", 35 | " initrange=0.1\n", 36 | " def __init__(self, n_out, n_hid, dropout, tie_encoder=None, bias=False):\n", 37 | " super().__init__()\n", 38 | " self.decoder = nn.Linear(n_hid, n_out, bias=bias)\n", 39 | " self.decoder.weight.data.uniform_(-self.initrange, self.initrange)\n", 40 | " self.dropout = LockedDropout(dropout)\n", 41 | " if bias: self.decoder.bias.data.zero_()\n", 42 | " if tie_encoder: self.decoder.weight = tie_encoder.weight\n", 43 | "\n", 44 | " def forward(self, input):\n", 45 | " raw_outputs, outputs = input\n", 46 | " output = self.dropout(outputs[-1])\n", 47 | " decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))\n", 48 | " result = decoded.view(-1, decoded.size(1))\n", 49 | " return result, raw_outputs, outputs\n", 50 | "\n", 51 | " \n", 52 | "class SequentialRNN(nn.Sequential):\n", 53 | " def reset(self):\n", 54 | " for c in self.children():\n", 55 | " if hasattr(c, 'reset'): c.reset()\n", 56 | " \n", 57 | " \n", 58 | "class RNN_Learner(Learner):\n", 59 | " def __init__(self, data, models, **kwargs):\n", 60 | " super().__init__(data, models, **kwargs)\n", 61 | "\n", 62 | " def _get_crit(self, data): return F.cross_entropy\n", 63 | " def fit(self, *args, **kwargs): return super().fit(*args, **kwargs, seq_first=True)\n", 64 | "\n", 65 | " def save_encoder(self, name): save_model(self.model[0], self.get_model_path(name))\n", 66 | " def load_encoder(self, name): load_model(self.model[0], self.get_model_path(name))\n", 67 | " \n", 68 | " \n", 69 | "class TextModel(BasicModel):\n", 70 | " def get_layer_groups(self):\n", 71 | " m = self.model[0]\n", 72 | " return [(m.encoder, m.dropouti), *zip(m.rnns, m.dropouths), (self.model[1])]\n", 73 | " \n", 74 | " \n", 75 | "def get_rnn_seq_labeler(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,\n", 76 | " dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, linear_decoder_dp=0.1):\n", 77 | " rnn_enc = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,\n", 78 | " dropouth=dropouth, 
dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)\n", 79 | " return SequentialRNN(rnn_enc, LinearDecoder(n_class, emb_sz, linear_decoder_dp))" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 99, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "\"\"\"run this cell for bidir\"\"\"\n", 89 | "class LinearDecoder_bidir(nn.Module):\n", 90 | " initrange=0.1\n", 91 | " def __init__(self, n_out, n_hid, dropout, tie_encoder=None, bias=False):\n", 92 | " super().__init__()\n", 93 | " self.decoder = nn.Linear(n_hid, n_out, bias=bias)\n", 94 | " self.decoder.weight.data.uniform_(-self.initrange, self.initrange)\n", 95 | " self.dropout = LockedDropout(dropout)\n", 96 | " if bias: self.decoder.bias.data.zero_()\n", 97 | " if tie_encoder: self.decoder.weight = tie_encoder.weight\n", 98 | "\n", 99 | " def forward(self, input):\n", 100 | " raw_outputs, outputs = input\n", 101 | " output = self.dropout(outputs)\n", 102 | " decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))\n", 103 | " result = decoded.view(-1, decoded.size(1))\n", 104 | " return result, raw_outputs, outputs\n", 105 | " \n", 106 | " \n", 107 | "##### rewrite sequentialRNN #####\n", 108 | "'''changed the class it inherits from nn.Sequential to Sequential'''\n", 109 | "class SequentialRNN_bidir(nn.Module):\n", 110 | " def __init__(self, rnn_enc_fw, rnn_enc_bw, linear_decoder, embedding_path, emb_sz, freeze_word2vec=False, wordvec_sz=300):\n", 111 | " super().__init__()\n", 112 | " self.rnn_enc_fw = rnn_enc_fw\n", 113 | " self.rnn_enc_bw = rnn_enc_bw\n", 114 | " self.linear_decoder = linear_decoder\n", 115 | " self.rnn_lm= nn.LSTM(input_size=emb_sz*2+wordvec_sz*2, hidden_size=emb_sz, num_layers=1, batch_first=True, bidirectional=True)\n", 116 | " weights = np.load(embedding_path)\n", 117 | " self.embedding = nn.Embedding.from_pretrained(T(weights), freeze=freeze_word2vec)\n", 118 | " self.rnn = nn.LSTM(input_size=wordvec_sz, hidden_size=wordvec_sz, num_layers=1, batch_first=True, bidirectional=True)\n", 119 | " def reset(self):\n", 120 | " for c in self.children():\n", 121 | " if hasattr(c, 'reset'): c.reset()\n", 122 | " def forward(self, input):\n", 123 | " input_fw = input\n", 124 | " lstm_out, (n_h, n_cell) = self.rnn(self.embedding(input))\n", 125 | " input_bw = V(np.array([o.cpu().numpy()[::-1] for o in input]))\n", 126 | " raw_outputs_fw, outputs_fw = self.rnn_enc_fw(input_fw)\n", 127 | " raw_outputs_bw, outputs_bw = self.rnn_enc_bw(input_bw)\n", 128 | " bs, sl, _ = outputs_bw[-1].size()\n", 129 | " idx = V(torch.LongTensor([i for i in range(sl-1, -1, -1)]))\n", 130 | " output_bw = outputs_bw[-1].index_select(1, idx)\n", 131 | " outputs_fw_bw = torch.cat([outputs_fw[-1], output_bw], dim=-1)\n", 132 | " \n", 133 | " ## concat forward raw_outputs & backward raw_outputs together\n", 134 | " raw_outputs_bw_ = []\n", 135 | " # concat them together\n", 136 | " for i in range(3):\n", 137 | " bs, sl, _ = raw_outputs_bw[i].size()\n", 138 | " idx = V(torch.LongTensor([i for i in range(sl-1, -1, -1)]))\n", 139 | " raw_output_bw = raw_outputs_bw[i].index_select(1, idx)\n", 140 | " raw_outputs_bw_.append(raw_output_bw)\n", 141 | " raw_outputs_fw_bw = [torch.cat([raw_outputs_fw[i], raw_outputs_bw_[i]]) for i in range(3)]\n", 142 | " # concat output from lstm_out and rnn_lm\n", 143 | " outputs_fw_bw = torch.cat([lstm_out, outputs_fw_bw], dim=-1)\n", 144 | " outputs_fw_bw, (n_h, n_cell) = self.rnn_lm(outputs_fw_bw)\n", 145 | " out = self.linear_decoder((raw_outputs_fw_bw, 
outputs_fw_bw.contiguous()))\n", 146 | " return out\n", 147 | "\n", 148 | " \n", 149 | "##### rewrite RNN Learner #####\n", 150 | "'''rewrite load_encoder to load the encoding modules'''\n", 151 | "class RNN_Learner_bidir(Learner):\n", 152 | " def __init__(self, data, models, **kwargs):\n", 153 | " super().__init__(data, models, **kwargs)\n", 154 | "\n", 155 | " def _get_crit(self, data): return F.cross_entropy\n", 156 | " def fit(self, *args, **kwargs): return super().fit(*args, **kwargs, seq_first=True)\n", 157 | "\n", 158 | " def save_encoder(self, name): save_model(self.model[0], self.get_model_path(name))\n", 159 | " def load_encoder(self, name_fw, name_bw): \n", 160 | " load_model(self.model.rnn_enc_fw, self.get_model_path(name_fw))\n", 161 | " load_model(self.model.rnn_enc_bw, self.get_model_path(name_bw))\n", 162 | "##### end #####\n", 163 | "\n", 164 | "\n", 165 | "##### rewrite textmodel #####\n", 166 | "'''get layer groups'''\n", 167 | "class TextModel_bidir(BasicModel):\n", 168 | " def get_layer_groups(self):\n", 169 | " m_fw = self.model.rnn_enc_fw\n", 170 | " m_bw = self.model.rnn_enc_bw\n", 171 | " return [(m_fw.encoder, m_fw.dropouti, m_bw.encoder, m_bw.dropouti), \n", 172 | " *zip(m_fw.rnns, m_fw.dropouths, m_bw.rnns, m_bw.dropouths), \n", 173 | " (self.model.embedding), (self.model.linear_decoder), (self.model.rnn), (self.model.rnn_lm)]\n", 174 | "\n", 175 | "\n", 176 | "def get_rnn_seq_labeler_bidir(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,\n", 177 | " dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, linear_decoder_dp=0.1, dir_path='', freeze_word2vec=False):\n", 178 | " rnn_enc = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,\n", 179 | " dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)\n", 180 | " rnn_enc_backward = MultiBatchSeqRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,\n", 181 | " dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)\n", 182 | " return SequentialRNN_bidir(rnn_enc, rnn_enc_backward, LinearDecoder_bidir(n_class, emb_sz*2, linear_decoder_dp), \n", 183 | " dir_path/'tmp'/'coNLL_embedding.npy', emb_sz, freeze_word2vec=freeze_word2vec, wordvec_sz=300)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 100, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "'''common functions'''\n", 193 | "def freeze_all_but(learner, n):\n", 194 | " c=learner.get_layer_groups()\n", 195 | " for l in c: set_trainable(l, False)\n", 196 | " set_trainable(c[n], True)\n", 197 | "\n", 198 | "class MultiBatchSeqRNN(RNN_Encoder):\n", 199 | " def __init__(self, bptt, max_seq, *args, **kwargs):\n", 200 | " self.max_seq,self.bptt = max_seq,bptt\n", 201 | " super().__init__(*args, **kwargs)\n", 202 | "\n", 203 | " def concat(self, arrs):\n", 204 | " return [torch.cat([l[si] for l in arrs]) for si in range(len(arrs[0]))]\n", 205 | "\n", 206 | " def forward(self, input):\n", 207 | " sl,bs = input.size()\n", 208 | " for l in self.hidden:\n", 209 | " for h in l: h.data.zero_()\n", 210 | " raw_outputs, outputs = super().forward(input)\n", 211 | " return raw_outputs, outputs\n", 212 | "\n", 213 | " \n", 214 | "class SeqDataLoader(DataLoader):\n", 215 | " def get_batch(self, indices):\n", 216 | " res = self.np_collate([self.dataset[i] for i in indices])\n", 217 | " res[1] = np.reshape(res[1], -1) # reshape the labels to one sequence\n", 218 | " return res\n", 219 | 
"\n", 220 | "\n", 221 | "class TextSeqDataset(Dataset):\n", 222 | " def __init__(self, x, y, backwards=False, sos=None, eos=None):\n", 223 | " self.x,self.y,self.backwards,self.sos,self.eos = x,y,backwards,sos,eos\n", 224 | "\n", 225 | " def __getitem__(self, idx):\n", 226 | " x = self.x[idx]\n", 227 | " y = self.y[idx] # we need to get y as array\n", 228 | " if self.backwards: x = list(reversed(x))\n", 229 | " if self.eos is not None: x = x + [self.eos]\n", 230 | " if self.sos is not None: x = [self.sos]+x\n", 231 | " return np.array(x),np.array(y)\n", 232 | "\n", 233 | " def __len__(self): return len(self.x)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 111, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "def train_seq(dir_path, lm_id='', train_file_id='', clas_id=None, bs=64, cl=1, bidir=False, startat=0, unfreeze=True,\n", 243 | " lr=0.01, dropmult=1.0, pretrain=True, bpe=False, use_clr=True,\n", 244 | " use_regular_schedule=False, use_discriminative=True, last=False, chain_thaw=False,\n", 245 | " from_scratch=False, freeze_word2vec=False, n_cycle=3, cycle_len=1, cycle_mult=2, linear_decoder_dp=0.1):\n", 246 | " \"\"\"hyperaparameter settings\"\"\"\n", 247 | " bptt,em_sz,nh,nl = 70,400,1150,3\n", 248 | "# bptt, em_sz, nh, nl = 70, 100, 100, 2\n", 249 | " dps = np.array([0.4,0.5,0.05,0.3,0.4])*dropmult\n", 250 | "# dps = np.array([0.4,0.5,0.05,0.3,0.7])*dropmult\n", 251 | "# dps = np.array([0.5, 0.4, 0.04, 0.3, 0.6])*dropmult\n", 252 | " #dps = np.array([0.65,0.48,0.039,0.335,0.34])*dropmult\n", 253 | "# dps = np.array([0.6,0.5,0.04,0.3,0.4])*dropmult\n", 254 | "\n", 255 | " print(f'prefix {dir_path}; lm_id {lm_id}; train_file_id {train_file_id}; clas_id {clas_id};'\n", 256 | " f' bs {bs}; cl {cl}; bidir {bidir}; '\n", 257 | " f'dropmult {dropmult} unfreeze {unfreeze} startat {startat}; pretrain {pretrain}; bpe {bpe}; use_clr {use_clr};'\n", 258 | " f' use_regular_schedule {use_regular_schedule}; use_discriminative {use_discriminative}; last {last};'\n", 259 | " f' chain_thaw {chain_thaw}; from_scratch {from_scratch}; freeze_word2vec {freeze_word2vec}; bptt {bptt};'\n", 260 | " f' em_sz {em_sz}; nh {nh}; nl {nl}; dropouts {dps}; dropmult {dropmult};'\n", 261 | " f' linear_decoder_dp {linear_decoder_dp}')\n", 262 | " dir_path = Path(dir_path)\n", 263 | " \n", 264 | " lm_file = dir_path/'models'/'lm1_enc'\n", 265 | " lm_file_bw = dir_path/'models'/'lm1_enc_backward'\n", 266 | "\n", 267 | " opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 268 | "\n", 269 | " \"\"\"load datasets\"\"\"\n", 270 | " trn_sent = np.load(dir_path / 'tmp' / f'trn_ids{train_file_id}.npy')\n", 271 | " val_sent = np.load(dir_path / 'tmp' / f'val_ids.npy')\n", 272 | " test_sent = np.load(dir_path / 'tmp' / f'test_ids.npy')\n", 273 | " trn_lbls = np.load(dir_path / 'tmp' / f'lbl_trn{train_file_id}.npy')\n", 274 | " val_lbls = np.load(dir_path / 'tmp' / f'lbl_val.npy')\n", 275 | " test_lbls = np.load(dir_path / 'tmp' / f'lbl_test.npy')\n", 276 | " id2label = pickle.load(open(dir_path / 'tmp' / 'itol.pkl', 'rb'))\n", 277 | " c = len(id2label)\n", 278 | "\n", 279 | " if bpe:\n", 280 | " vs=30002\n", 281 | " else:\n", 282 | " id2token = pickle.load(open(dir_path / 'tmp' / 'itos.pkl', 'rb'))\n", 283 | " vs = len(id2token)\n", 284 | "\n", 285 | " print('Train sentences shape:', trn_sent.shape)\n", 286 | " print('Train labels shape:', trn_lbls.shape)\n", 287 | " print('Token ids:', [id2token[id_] for id_ in trn_sent[0]])\n", 288 | " print('Label ids:', [id2label[id_] for 
id_ in trn_lbls[0]])\n", 289 | "\n", 290 | " trn_ds = TextSeqDataset(trn_sent, trn_lbls)\n", 291 | " val_ds = TextSeqDataset(val_sent, val_lbls)\n", 292 | " test_ds = TextSeqDataset(test_sent, test_lbls)\n", 293 | " trn_samp = SortishSampler(trn_sent, key=lambda x: len(trn_sent[x]), bs=bs//2)\n", 294 | " val_samp = SortSampler(val_sent, key=lambda x: len(val_sent[x]))\n", 295 | " test_samp = SortSampler(test_sent, key=lambda x: len(test_sent[x]))\n", 296 | " trn_dl = SeqDataLoader(trn_ds, bs//2, transpose=False, num_workers=1, pad_idx=1, sampler=trn_samp) # TODO why transpose? Should we also transpose the labels?\n", 297 | " val_dl = SeqDataLoader(val_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=val_samp)\n", 298 | " test_dl = SeqDataLoader(test_ds, bs, transpose=False, num_workers=1, pad_idx=1, sampler=test_samp)\n", 299 | " md = ModelData(dir_path, trn_dl, val_dl, test_dl)\n", 300 | "\n", 301 | " if bidir:\n", 302 | " m = get_rnn_seq_labeler_bidir(bptt, 70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 303 | " layers=[em_sz, 50, c], drops=[dps[4], 0.1],\n", 304 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3], linear_decoder_dp=linear_decoder_dp, \n", 305 | " freeze_word2vec=freeze_word2vec, dir_path=dir_path, )\n", 306 | " learn = RNN_Learner_bidir(md, TextModel_bidir(to_gpu(m)), opt_fn=opt_fn)\n", 307 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 308 | " learn.clip=25.\n", 309 | " learn.metrics = [accuracy]\n", 310 | " else:\n", 311 | " m = get_rnn_seq_labeler(bptt, 70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 312 | " layers=[em_sz, 50, c], drops=[dps[4], 0.1],\n", 313 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3], linear_decoder_dp=linear_decoder_dp)\n", 314 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 315 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 316 | " learn.clip=25.\n", 317 | " learn.metrics = [accuracy]\n", 318 | "\n", 319 | " \n", 320 | "\n", 321 | " lrm = 2.6\n", 322 | " if use_discriminative:\n", 323 | "# lrs = np.array([lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 324 | " lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 325 | " else:\n", 326 | " lrs = lr\n", 327 | " wd = 1e-5\n", 328 | " if not from_scratch:\n", 329 | " print(f'Loading encoder from {lm_file}...')\n", 330 | " if bidir:\n", 331 | " learn.load_encoder(lm_file, lm_file_bw)\n", 332 | " else:\n", 333 | " learn.load_encoder(lm_file)\n", 334 | " else:\n", 335 | " print('Training classifier from scratch. 
LM encoder is not loaded.')\n", 336 | " use_regular_schedule = True\n", 337 | "\n", 338 | " if (startat<1) and pretrain and not last and not chain_thaw and not from_scratch:\n", 339 | " learn.freeze_to(-1)\n", 340 | " learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1,\n", 341 | " use_clr=None if use_regular_schedule or not use_clr else (8,3))\n", 342 | " learn.freeze_to(-2)\n", 343 | " learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1,\n", 344 | " use_clr=None if use_regular_schedule or not use_clr else (8, 3))\n", 345 | " learn.save(f'{PRE}{clas_id}clas_0')\n", 346 | " elif startat==1:\n", 347 | " learn.load(f'{PRE}{clas_id}clas_0')\n", 348 | "\n", 349 | " if chain_thaw:\n", 350 | " lrs = np.array([0.0001, 0.0001, 0.0001, 0.001])\n", 351 | " ## Emrys\n", 352 | " lrm = 4\n", 353 | " # the 4th is too big, and the word embedding and rnn can increase\n", 354 | " lrs = np.array([lr/(lrm**5), 2*lr/(lrm**5), lr/(lrm**4), lr/(lrm**4), 5e-4, lr/2, 7e-4, 1e-2])\n", 355 | "# lrf = learn.lr_find(lrs) # find the proper learning rate\n", 356 | "# learn.sched.plot()\n", 357 | " # end\n", 358 | " print(f'AWDLSTM learning_rate {lrs[:4]}; embedding_lr {lrs[4]}; linear_decoder_lr {lrs[5]}; rnn_lr {lrs[6]}; lm_lr {lrs[7]}; weight_decay {wd}')\n", 359 | " print('Using chain-thaw. Unfreezing all layers one at a time...')\n", 360 | " n_layers = len(learn.get_layer_groups())\n", 361 | " print('# of layers:', n_layers)\n", 362 | " # fine-tune last layer\n", 363 | " learn.freeze_to(-1)\n", 364 | " print('Fine-tuning layer #7')\n", 365 | " learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1,\n", 366 | " use_clr=None if use_regular_schedule or not use_clr else (8,3))\n", 367 | " n = n_layers-2\n", 368 | " # fine-tune all layers up to the second-last one\n", 369 | " while n>-1:\n", 370 | " print('Fine-tuning layer #%d.' % n)\n", 371 | " freeze_all_but(learn, n)\n", 372 | " learn.fit(lrs, 1, wds=wd, cycle_len=None if use_regular_schedule else 1,\n", 373 | " use_clr=None if use_regular_schedule or not use_clr else (8,3))\n", 374 | " n -= 1\n", 375 | "\n", 376 | " if unfreeze:\n", 377 | " learn.unfreeze()\n", 378 | " else:\n", 379 | " learn.freeze_to(-3)\n", 380 | "\n", 381 | " if last:\n", 382 | " print('Fine-tuning only the last layer...')\n", 383 | " learn.freeze_to(-1)\n", 384 | "\n", 385 | " if use_regular_schedule:\n", 386 | " print('Using regular schedule. 
Setting use_clr=None, n_cycles=cl, cycle_len=None.')\n", 387 | " use_clr = None\n", 388 | " n_cycle = n_cycle\n", 389 | " cycle_len = None\n", 390 | " else:\n", 391 | " n_cycle = n_cycle\n", 392 | " print(f'n_cycle {n_cycle}; cycle_len {cycle_len}; cycle_mult {cycle_mult}; use_clr {use_clr}')\n", 393 | " learn.fit(lrs, n_cycle, wds=wd, cycle_len=cycle_len, cycle_mult=cycle_mult, use_clr=(8,8) if use_clr else None) # previously cycle_len=cl\n", 394 | " print('Plotting lrs...')\n", 395 | " learn.sched.plot_lr()\n", 396 | " clas_id = clas_id if clas_id is not None else lm_id\n", 397 | " bidir = 'bidir' if bidir else 'forward'\n", 398 | " learn.save(f'{clas_id}clas_1{bidir}')\n", 399 | "\n", 400 | " eval_ner(learn, id2label, is_test=False)\n", 401 | " eval_ner(learn, id2label, is_test=True)" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": { 408 | "scrolled": false 409 | }, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "prefix /fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/; lm_id ; train_file_id ; clas_id None; bs 64; cl 1; bidir True; dropmult 1 unfreeze True startat 0; pretrain True; bpe False; use_clr False; use_regular_schedule False; use_discriminative True; last False; chain_thaw True; from_scratch False; freeze_word2vec False; bptt 70; em_sz 400; nh 1150; nl 3; dropouts [0.4 0.5 0.05 0.3 0.4 ]; dropmult 1; linear_decoder_dp 0.2\n", 416 | "Train sentences shape: (14988,)\n", 417 | "Train labels shape: (14988,)\n", 418 | "Token ids: ['xbos', '-docstart-']\n", 419 | "Label ids: ['_bos_', 'O']\n", 420 | "Loading encoder from /fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/models/lm1_enc...\n", 421 | "AWDLSTM learning_rate [0.00001 0.00002 0.00004 0.00004]; embedding_lr 0.0005; linear_decoder_lr 0.005; rnn_lr 0.0007; lm_lr 0.01; weight_decay 1e-05\n", 422 | "Using chain-thaw. 
Unfreezing all layers one at a time...\n", 423 | "# of layers: 8\n", 424 | "Fine-tuning layer #7\n" 425 | ] 426 | }, 427 | { 428 | "data": { 429 | "application/vnd.jupyter.widget-view+json": { 430 | "model_id": "b93741108e674636a47a7f6ca46c18bd", 431 | "version_major": 2, 432 | "version_minor": 0 433 | }, 434 | "text/plain": [ 435 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 436 | ] 437 | }, 438 | "metadata": {}, 439 | "output_type": "display_data" 440 | }, 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "epoch trn_loss val_loss accuracy \n", 446 | " 0 0.159474 0.186139 0.948293 \n", 447 | "\n", 448 | "Fine-tuning layer #6.\n" 449 | ] 450 | }, 451 | { 452 | "data": { 453 | "application/vnd.jupyter.widget-view+json": { 454 | "model_id": "9b7e60369a684ecb924a25497c250fb7", 455 | "version_major": 2, 456 | "version_minor": 0 457 | }, 458 | "text/plain": [ 459 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 460 | ] 461 | }, 462 | "metadata": {}, 463 | "output_type": "display_data" 464 | }, 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "epoch trn_loss val_loss accuracy \n", 470 | " 0 0.122185 0.153179 0.959319 \n", 471 | "\n", 472 | "Fine-tuning layer #5.\n" 473 | ] 474 | }, 475 | { 476 | "data": { 477 | "application/vnd.jupyter.widget-view+json": { 478 | "model_id": "ca7cc14d072f411a81ef87f64b5a4fb9", 479 | "version_major": 2, 480 | "version_minor": 0 481 | }, 482 | "text/plain": [ 483 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 484 | ] 485 | }, 486 | "metadata": {}, 487 | "output_type": "display_data" 488 | }, 489 | { 490 | "name": "stdout", 491 | "output_type": "stream", 492 | "text": [ 493 | "epoch trn_loss val_loss accuracy \n", 494 | " 0 0.098153 0.131593 0.960315 \n", 495 | "\n", 496 | "Fine-tuning layer #4.\n" 497 | ] 498 | }, 499 | { 500 | "data": { 501 | "application/vnd.jupyter.widget-view+json": { 502 | "model_id": "c2374ceed7014d008961059438b286c4", 503 | "version_major": 2, 504 | "version_minor": 0 505 | }, 506 | "text/plain": [ 507 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 508 | ] 509 | }, 510 | "metadata": {}, 511 | "output_type": "display_data" 512 | }, 513 | { 514 | "name": "stdout", 515 | "output_type": "stream", 516 | "text": [ 517 | "epoch trn_loss val_loss accuracy \n", 518 | " 0 0.093267 0.129416 0.961494 \n", 519 | "\n", 520 | "Fine-tuning layer #3.\n" 521 | ] 522 | }, 523 | { 524 | "data": { 525 | "application/vnd.jupyter.widget-view+json": { 526 | "model_id": "bb534368138846fda4fd5b13a7ddecb0", 527 | "version_major": 2, 528 | "version_minor": 0 529 | }, 530 | "text/plain": [ 531 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 532 | ] 533 | }, 534 | "metadata": {}, 535 | "output_type": "display_data" 536 | }, 537 | { 538 | "name": "stdout", 539 | "output_type": "stream", 540 | "text": [ 541 | "epoch trn_loss val_loss accuracy \n", 542 | " 0 0.102729 0.13574 0.961593 \n", 543 | "\n", 544 | "Fine-tuning layer #2.\n" 545 | ] 546 | }, 547 | { 548 | "data": { 549 | "application/vnd.jupyter.widget-view+json": { 550 | "model_id": "2820ea159b7447adb5d4afd295b53274", 551 | "version_major": 2, 552 | "version_minor": 0 553 | }, 554 | "text/plain": [ 555 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 556 | ] 557 | }, 558 | "metadata": {}, 559 | "output_type": 
"display_data" 560 | }, 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "epoch trn_loss val_loss accuracy \n", 566 | " 0 0.093768 0.136768 0.961793 \n", 567 | "\n", 568 | "Fine-tuning layer #1.\n" 569 | ] 570 | }, 571 | { 572 | "data": { 573 | "application/vnd.jupyter.widget-view+json": { 574 | "model_id": "44e6836351de4ee596359c36a2063c45", 575 | "version_major": 2, 576 | "version_minor": 0 577 | }, 578 | "text/plain": [ 579 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 580 | ] 581 | }, 582 | "metadata": {}, 583 | "output_type": "display_data" 584 | }, 585 | { 586 | "name": "stdout", 587 | "output_type": "stream", 588 | "text": [ 589 | "epoch trn_loss val_loss accuracy \n", 590 | " 0 0.100634 0.13684 0.96171 \n", 591 | "\n", 592 | "Fine-tuning layer #0.\n" 593 | ] 594 | }, 595 | { 596 | "data": { 597 | "application/vnd.jupyter.widget-view+json": { 598 | "model_id": "03c24cda9b29481cb0465fd7bc1e9383", 599 | "version_major": 2, 600 | "version_minor": 0 601 | }, 602 | "text/plain": [ 603 | "HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))" 604 | ] 605 | }, 606 | "metadata": {}, 607 | "output_type": "display_data" 608 | }, 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "epoch trn_loss val_loss accuracy \n", 614 | " 0 0.091858 0.136862 0.96171 \n", 615 | "\n", 616 | "n_cycle 4; cycle_len 1; cycle_mult 2; use_clr False\n" 617 | ] 618 | }, 619 | { 620 | "data": { 621 | "application/vnd.jupyter.widget-view+json": { 622 | "model_id": "771c25ab715540ff90896e31b64ececa", 623 | "version_major": 2, 624 | "version_minor": 0 625 | }, 626 | "text/plain": [ 627 | "HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))" 628 | ] 629 | }, 630 | "metadata": {}, 631 | "output_type": "display_data" 632 | }, 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "epoch trn_loss val_loss accuracy \n", 638 | " 0 0.087901 0.120198 0.965296 \n", 639 | " \r" 640 | ] 641 | } 642 | ], 643 | "source": [ 644 | "train_seq('/fs-object-detection/paperspace/fastai/courses/coNLL/data/nlp_seq/ner/', lm_id='', train_file_id='', clas_id=None,\n", 645 | " bs=64, cl=1, bidir=True, startat=0, unfreeze=True,\n", 646 | " lr=0.01, dropmult=1, pretrain=True, bpe=False, use_clr=False,\n", 647 | " use_regular_schedule=False, use_discriminative=True, last=False, chain_thaw=True,\n", 648 | " from_scratch=False, freeze_word2vec=False, n_cycle=4, cycle_len=1, cycle_mult=2, linear_decoder_dp=0.2)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [] 657 | } 658 | ], 659 | "metadata": { 660 | "kernelspec": { 661 | "display_name": "Python 3", 662 | "language": "python", 663 | "name": "python3" 664 | }, 665 | "language_info": { 666 | "codemirror_mode": { 667 | "name": "ipython", 668 | "version": 3 669 | }, 670 | "file_extension": ".py", 671 | "mimetype": "text/x-python", 672 | "name": "python", 673 | "nbconvert_exporter": "python", 674 | "pygments_lexer": "ipython3", 675 | "version": "3.6.6" 676 | } 677 | }, 678 | "nbformat": 4, 679 | "nbformat_minor": 2 680 | } 681 | --------------------------------------------------------------------------------