├── README.md
├── blend.py
├── fastai
│   ├── text.py
│   └── lm_rnn.py
├── alica_bkw.ipynb
└── alica_fwd.ipynb

/README.md:
--------------------------------------------------------------------------------
# Yandex Algorithm 2018 ML track

[Final results: 2nd place](https://contest.yandex.ru/algorithm2018/contest/7914/standings/?lang=en)

To reproduce the second-place solution, run the `blend.py` script. It blends the predictions of 100 models.

Language-model and classifier training, as well as prediction, is in `alica_fwd.ipynb`. Backwards-model training and prediction (a short version based on the forward notebook) is in `alica_bkw.ipynb`.

To run the solution, install the [fast.ai library](https://github.com/fastai/fastai), then replace two files (`lm_rnn.py` and `text.py`) in its `fastai` folder with the versions from this repository.
--------------------------------------------------------------------------------
/blend.py:
--------------------------------------------------------------------------------
# The blend_all folder is available at https://drive.google.com/open?id=1dzOJX4eBqFekAh5ZQPv31jq3qfyKwO0h

import pandas as pd
import numpy as np
from tqdm import tqdm
from os import listdir
from collections import defaultdict
import operator

path = "./blend_all/"
files = listdir(path)

# d[dialogue_id][candidate_id] accumulates each candidate answer's blended score
d = defaultdict(lambda: defaultdict(lambda: 0))

for f in tqdm(files):
    df = pd.read_csv(path+f, sep="\t", header=None)
    if len(df.columns) == 3:
        for i, j, k in zip(df[0], df[1], df[2]):
            if f.endswith("_r.tsv"):
                # regression-model predictions get a smaller weight
                d[i][j] += k/len(files)/2.22
            else:
                d[i][j] += k/len(files)
    else:
        # 4-column files come from the 3-class models: combine both probability columns
        for i, j, k, l in zip(df[0], df[1], df[2], df[3]):
            d[i][j] += (k/1.5+l)/len(files)*2.5

# for every dialogue id, rank the candidate answers by blended score
res = []
for i in sorted(list(set(df[0]))):
    for j in [c[0] for c in sorted(d[i].items(), key=operator.itemgetter(1), reverse=True)]:
        res.append(j)

df[1] = res
df.to_csv("blend_all.csv.gz", sep="\t", header=False, index=False, columns=[0,1], compression='gzip')

# 87302 LB (leaderboard score of this blend)
--------------------------------------------------------------------------------
/fastai/text.py:
--------------------------------------------------------------------------------
from .core import *
from .learner import *
from .lm_rnn import *
from torch.utils.data.sampler import Sampler
import spacy
from spacy.symbols import ORTH

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

def texts_labels_from_folders(path, folders):
    texts,labels = [],[]
    for idx,label in enumerate(folders):
        for fname in glob(os.path.join(path, label, '*.*')):
            texts.append(open(fname, 'r').read())
            labels.append(idx)
    return texts, np.array(labels).astype(np.int64)

def numericalize_tok(tokens, max_vocab=50000, min_freq=0, unk_tok="_unk_", pad_tok="_pad_", bos_tok="_bos_", eos_tok="_eos_"):
    """Takes in text tokens and returns int2tok and tok2int converters

    Arguments:
        tokens(list): List of tokens. Can be a list of strings, or a list of lists of strings.
        max_vocab(int): Number of tokens to return in the vocab (sorted by frequency)
        min_freq(int): Minimum number of occurrences a token must have in order to be preserved.
        unk_tok(str): Token to use when unknown tokens are encountered in the source text.
        pad_tok(str): Token to use when padding sequences.
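        bos_tok(str): Token prepended to mark the beginning of a sequence.
        eos_tok(str): Token appended to mark the end of a sequence.

    Example:
        An illustrative sketch only (the token lists are made up):

            int2tok, tok2int = numericalize_tok([["hello", "world"], ["hello", "again"]])
            int2tok[:4]            # ['_bos_', '_pad_', '_eos_', '_unk_'], then tokens by frequency
            tok2int["hello"]       # -> 4 (first slot after the four special tokens)
            tok2int["never_seen"]  # -> 3 (unknown tokens map to the unk id)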
    """
    if isinstance(tokens, str):
        raise ValueError("Expected to receive a list of tokens. Received a string instead")
    if isinstance(tokens[0], list):
        tokens = [p for o in tokens for p in o]
    freq = Counter(tokens)
    int2tok = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
    unk_id = 3
    int2tok.insert(0, bos_tok)
    int2tok.insert(1, pad_tok)
    int2tok.insert(2, eos_tok)
    int2tok.insert(unk_id, unk_tok)
    tok2int = collections.defaultdict(lambda:unk_id, {v:k for k,v in enumerate(int2tok)})
    return int2tok, tok2int

class Tokenizer():
    def __init__(self, lang='en'):
        self.re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
        self.tok = spacy.load(lang)
        for w in ('<eos>','<bos>','<unk>'):
            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])

    def sub_br(self,x): return self.re_br.sub("\n", x)

    def spacy_tok(self,x):
        return [t.text for t in self.tok.tokenizer(self.sub_br(x))]

    re_rep = re.compile(r'(\S)(\1{3,})')
    re_word_rep = re.compile(r'(\b\w+\W+)(\1{3,})')

    @staticmethod
    def replace_rep(m):
        TK_REP = 'tk_rep'
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '

    @staticmethod
    def replace_wrep(m):
        TK_WREP = 'tk_wrep'
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '

    @staticmethod
    def do_caps(ss):
        TOK_UP,TOK_SENT,TOK_MIX = ' t_up ',' t_st ',' t_mx '
        res = []
        prev='.'
        re_word = re.compile('\w')
        re_nonsp = re.compile('\S')
        for s in re.findall(r'\w+|\W+', ss):
            res += ([TOK_UP,s.lower()] if (s.isupper() and (len(s)>2))
                    # else [TOK_SENT,s.lower()] if (s.istitle() and re_word.search(prev))
                    else [s.lower()])
            # if re_nonsp.search(s): prev = s
        return ''.join(res)

    def proc_text(self, s):
        s = self.re_rep.sub(Tokenizer.replace_rep, s)
        s = self.re_word_rep.sub(Tokenizer.replace_wrep, s)
        s = Tokenizer.do_caps(s)
        s = re.sub(r'([/#])', r' \1 ', s)
        s = re.sub(' {2,}', ' ', s)
        return self.spacy_tok(s)

    @staticmethod
    def proc_all(ss, lang):
        tok = Tokenizer(lang)
        return [tok.proc_text(s) for s in ss]

    @staticmethod
    def proc_all_mp(ss, lang='en'):
        ncpus = num_cpus()//2
        with ProcessPoolExecutor(ncpus) as e:
            return sum(e.map(Tokenizer.proc_all, ss, [lang]*len(ss)), [])


class TextDataset(Dataset):
    def __init__(self, x, y, backwards=False, sos=None, eos=None):
        self.x,self.y,self.backwards,self.sos,self.eos = x,y,backwards,sos,eos

    def __getitem__(self, idx):
        x = self.x[idx]
        if self.backwards: x = list(reversed(x))
        if self.eos is not None: x = x + [self.eos]
        if self.sos is not None: x = [self.sos]+x
        return np.array(x),self.y[idx]

    def __len__(self): return len(self.x)


class SortSampler(Sampler):
    def __init__(self, data_source, key): self.data_source,self.key = data_source,key
    def __len__(self): return len(self.data_source)
    def __iter__(self):
        return iter(sorted(range(len(self.data_source)), key=self.key))

class SimpleSampler(Sampler):
    def __init__(self, data_source): self.data_source = data_source
    def __len__(self): return len(self.data_source)
    def __iter__(self):
        return iter(range(len(self.data_source)))

class SortishSampler(Sampler):
    """Returns an iterator that traverses the data in randomly
    ordered batches that are approximately the same size.
    The batch with the largest key is always returned first, because of the way pytorch sequences cuda memory
    allocation: if the largest batch were not returned first, multiple buffers might be allocated when the first
    one created isn't large enough to hold the next batch in the sequence.
    """
    def __init__(self, data_source, key, bs):
        self.data_source,self.key,self.bs = data_source,key,bs

    def __len__(self): return len(self.data_source)

    def __iter__(self):
        idxs = np.random.permutation(len(self.data_source))
        sz = self.bs*50
        ck_idx = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]
        sort_idx = sum([sorted(s, key=self.key, reverse=True) for s in ck_idx], [])
        sz = self.bs
        ck_idx = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]
        sort_idx = np.concatenate(np.random.permutation(ck_idx[1:]))
        sort_idx = np.concatenate((ck_idx[0], sort_idx))
        return iter(sort_idx)


class LanguageModelLoader():
    """ Returns a language model iterator that iterates through batches whose sequence length is drawn from N(bptt, 5).
    The first batch returned always has the maximum possible width, bptt+25. This is done because of the way that
    pytorch allocates cuda memory: it prevents multiple buffers from being created as the batch width grows.
    """
    def __init__(self, nums, bs, bptt, backwards=False):
        self.bs,self.bptt,self.backwards = bs,bptt,backwards
        self.data = self.batchify(nums)
        self.i,self.iter = 0,0
        self.n = len(self.data)

    def __iter__(self):
        self.i,self.iter = 0,0
        while self.i < self.n-1 and self.iter<len(self):
            ...

--------------------------------------------------------------------------------
/fastai/lm_rnn.py:
--------------------------------------------------------------------------------
def seq2seq_reg(output, xtra, loss, alpha=0, beta=0):
    hs,dropped_hs = xtra
    if alpha:  # Activation Regularization
        loss = loss + sum(alpha * dropped_hs[-1].pow(2).mean())
    if beta:   # Temporal Activation Regularization (slowness)
        h = hs[-1]
        if len(h)>1: loss = loss + sum(beta * (h[1:] - h[:-1]).pow(2).mean())
    return loss


def repackage_var(h):
    """Wraps h in new Variables, to detach them from their history."""
    return Variable(h.data) if type(h) == Variable else tuple(repackage_var(v) for v in h)


class RNN_Encoder(nn.Module):

    """A custom RNN encoder network that uses
        - an embedding matrix to encode input,
        - a stack of LSTM layers to drive the network, and
        - variational dropouts in the embedding and LSTM layers

        The architecture for this network was inspired by the work done in
        "Regularizing and Optimizing LSTM Language Models".
        (https://arxiv.org/pdf/1708.02182.pdf)
    """

    initrange=0.1

    def __init__(self, ntoken, emb_sz, nhid, nlayers, pad_token, bidir=False,
                 dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5):
        """ Default constructor for the RNN_Encoder class

        Args:
            ntoken (int): number of tokens in the vocabulary of the source dataset
            emb_sz (int): the embedding size to use to encode each token
            nhid (int): number of hidden activations per LSTM layer
            nlayers (int): number of LSTM layers to use in the architecture
            pad_token (int): the int value used for padding text.
            bidir (bool): whether to make the LSTM layers bidirectional.
            dropouth (float): dropout to apply to the activations going from one LSTM layer to another
            dropouti (float): dropout to apply to the input layer.
            dropoute (float): dropout to apply to the embedding layer.
            wdrop (float): dropout used for an LSTM's internal (or hidden) recurrent weights.
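        Example:
            An illustrative sketch only (the vocabulary size is made up; the other sizes mirror
            the AWD-LSTM hyperparameters used in the notebooks, em_sz=400, nh=1150, nl=3):

                encoder = RNN_Encoder(ntoken=60000, emb_sz=400, nhid=1150, nlayers=3, pad_token=1)
                encoder.reset()                        # allocate zeroed hidden state for the current batch size
                raw_outputs, outputs = encoder(batch)  # batch: LongTensor of shape (seq_len, batch_size)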
52 | 53 | Returns: 54 | None 55 | """ 56 | 57 | super().__init__() 58 | self.ndir = 2 if bidir else 1 59 | self.bs = 1 60 | self.encoder = nn.Embedding(ntoken, emb_sz, padding_idx=pad_token) 61 | self.encoder_with_dropout = EmbeddingDropout(self.encoder) 62 | self.rnns = [nn.LSTM(emb_sz if l == 0 else nhid, (nhid if l != nlayers - 1 else emb_sz)//self.ndir, 63 | 1, bidirectional=bidir, dropout=dropouth) for l in range(nlayers)] 64 | if wdrop: self.rnns = [WeightDrop(rnn, wdrop) for rnn in self.rnns] 65 | self.rnns = torch.nn.ModuleList(self.rnns) 66 | self.encoder.weight.data.uniform_(-self.initrange, self.initrange) 67 | 68 | self.emb_sz,self.nhid,self.nlayers,self.dropoute = emb_sz,nhid,nlayers,dropoute 69 | self.dropouti = LockedDropout(dropouti) 70 | self.dropouths = nn.ModuleList([LockedDropout(dropouth) for l in range(nlayers)]) 71 | 72 | def forward(self, input): 73 | """ Invoked during the forward propagation of the RNN_Encoder module. 74 | Args: 75 | input (Tensor): input of shape (sentence length x batch_size) 76 | 77 | Returns: 78 | raw_outputs (tuple(list (Tensor), list(Tensor)): list of tensors evaluated from each RNN layer without using 79 | dropouth, list of tensors evaluated from each RNN layer using dropouth, 80 | """ 81 | sl,bs = input.size() 82 | if bs!=self.bs: 83 | self.bs=bs 84 | self.reset() 85 | 86 | emb = self.encoder_with_dropout(input, dropout=self.dropoute if self.training else 0) 87 | emb = self.dropouti(emb) 88 | 89 | raw_output = emb 90 | new_hidden,raw_outputs,outputs = [],[],[] 91 | for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)): 92 | current_input = raw_output 93 | with warnings.catch_warnings(): 94 | warnings.simplefilter("ignore") 95 | raw_output, new_h = rnn(raw_output, self.hidden[l]) 96 | new_hidden.append(new_h) 97 | raw_outputs.append(raw_output) 98 | if l != self.nlayers - 1: raw_output = drop(raw_output) 99 | outputs.append(raw_output) 100 | 101 | self.hidden = repackage_var(new_hidden) 102 | return raw_outputs, outputs 103 | 104 | def one_hidden(self, l): 105 | nh = (self.nhid if l != self.nlayers - 1 else self.emb_sz)//self.ndir 106 | return Variable(self.weights.new(self.ndir, self.bs, nh).zero_(), volatile=not self.training) 107 | 108 | def reset(self): 109 | self.weights = next(self.parameters()).data 110 | self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.nlayers)] 111 | 112 | 113 | class MultiBatchRNN(RNN_Encoder): 114 | def __init__(self, bptt, max_seq, *args, **kwargs): 115 | self.max_seq,self.bptt = max_seq,bptt 116 | super().__init__(*args, **kwargs) 117 | 118 | def concat(self, arrs): 119 | return [torch.cat([l[si] for l in arrs]) for si in range(len(arrs[0]))] 120 | 121 | def forward(self, input): 122 | sl,bs = input.size() 123 | for l in self.hidden: 124 | for h in l: h.data.zero_() 125 | raw_outputs, outputs = [],[] 126 | for i in range(0, sl, self.bptt): 127 | r, o = super().forward(input[i: min(i+self.bptt, sl)]) 128 | if i>(sl-self.max_seq): 129 | raw_outputs.append(r) 130 | outputs.append(o) 131 | return self.concat(raw_outputs), self.concat(outputs) 132 | 133 | class LinearDecoder(nn.Module): 134 | initrange=0.1 135 | def __init__(self, n_out, nhid, dropout, tie_encoder=None): 136 | super().__init__() 137 | self.decoder = nn.Linear(nhid, n_out, bias=False) 138 | self.decoder.weight.data.uniform_(-self.initrange, self.initrange) 139 | self.dropout = LockedDropout(dropout) 140 | if tie_encoder: self.decoder.weight = tie_encoder.weight 141 | 142 | def forward(self, input): 143 | 
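        # Note (comments added for this write-up, not part of the upstream fastai source):
        # `input` is the (raw_outputs, outputs) pair returned by the encoder; outputs[-1] is
        # the top LSTM layer's output of shape (seq_len, batch_size, emb_sz), which is flattened
        # to (seq_len*batch_size, emb_sz) before the tied linear decoder projects it to vocabulary scores.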
raw_outputs, outputs = input 144 | output = self.dropout(outputs[-1]) 145 | decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2))) 146 | result = decoded.view(-1, decoded.size(1)) 147 | return result, raw_outputs, outputs 148 | 149 | 150 | class LinearBlock(nn.Module): 151 | def __init__(self, ni, nf, drop): 152 | super().__init__() 153 | self.lin = nn.Linear(ni, nf) 154 | self.drop = nn.Dropout(drop) 155 | self.bn = nn.BatchNorm1d(ni) 156 | 157 | def forward(self, x): return self.lin(self.drop(self.bn(x))) 158 | 159 | 160 | class PoolingLinearClassifier(nn.Module): 161 | def __init__(self, layers, drops): 162 | super().__init__() 163 | self.layers = nn.ModuleList([ 164 | LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)]) 165 | 166 | def pool(self, x, bs, is_max): 167 | f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d 168 | return f(x.permute(1,2,0), (1,)).view(bs,-1) 169 | 170 | def forward(self, input): 171 | raw_outputs, outputs = input 172 | output = outputs[-1] 173 | sl,bs,_ = output.size() 174 | avgpool = self.pool(output, bs, False) 175 | mxpool = self.pool(output, bs, True) 176 | x = torch.cat([output[-1], mxpool, avgpool], 1) 177 | for l in self.layers: 178 | l_x = l(x) 179 | x = F.relu(l_x) 180 | l_x = F.log_softmax(l_x) 181 | return l_x, raw_outputs, outputs 182 | 183 | class LinearRegression(nn.Module): 184 | def __init__(self, layers, drops): 185 | super().__init__() 186 | self.layers = nn.ModuleList([ 187 | LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)]) 188 | 189 | def forward(self, input): 190 | raw_outputs, outputs = input 191 | x = outputs[-1][-1] 192 | for l in self.layers: 193 | l_x = l(x) 194 | x = F.relu(l_x) 195 | #l_x = F.l1_loss(x) 196 | return l_x, raw_outputs, outputs 197 | 198 | class SequentialRNN(nn.Sequential): 199 | def reset(self): 200 | for c in self.children(): 201 | if hasattr(c, 'reset'): c.reset() 202 | 203 | 204 | def get_language_model(n_tok, emb_sz, nhid, nlayers, pad_token, 205 | dropout=0.4, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, tie_weights=True): 206 | """Returns a SequentialRNN model. 207 | 208 | A RNN_Encoder layer is instantiated using the parameters provided. 209 | 210 | This is followed by the creation of a LinearDecoder layer. 211 | 212 | Also by default (i.e. tie_weights = True), the embedding matrix used in the RNN_Encoder 213 | is used to instantiate the weights for the LinearDecoder layer. 214 | 215 | The SequentialRNN layer is the native torch's Sequential wrapper that puts the RNN_Encoder and 216 | LinearDecoder layers sequentially in the model. 217 | 218 | Args: 219 | n_tok (int): number of unique vocabulary words (or tokens) in the source dataset 220 | emb_sz (int): the embedding size to use to encode each token 221 | nhid (int): number of hidden activation per LSTM layer 222 | nlayers (int): number of LSTM layers to use in the architecture 223 | pad_token (int): the int value used for padding text. 224 | dropouth (float): dropout to apply to the activations going from one LSTM layer to another 225 | dropouti (float): dropout to apply to the input layer. 226 | dropoute (float): dropout to apply to the embedding layer. 227 | wdrop (float): dropout used for a LSTM's internal (or hidden) recurrent weights. 228 | tie_weights (bool): decide if the weights of the embedding matrix in the RNN encoder should be tied to the 229 | weights of the LinearDecoder layer. 
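    Example:
        An illustrative sketch only (the vocabulary size is made up; the other values match
        the language-model hyperparameters used in the notebooks):

            model = get_language_model(n_tok=60000, emb_sz=400, nhid=1150, nlayers=3, pad_token=1)
            model.reset()                                 # zero the encoder's hidden state
            decoded, raw_outputs, outputs = model(batch)  # decoded: (seq_len*batch_size, n_tok) scores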
230 | Returns: 231 | A SequentialRNN model 232 | """ 233 | 234 | rnn_enc = RNN_Encoder(n_tok, emb_sz, nhid=nhid, nlayers=nlayers, pad_token=pad_token, 235 | dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop) 236 | enc = rnn_enc.encoder if tie_weights else None 237 | return SequentialRNN(rnn_enc, LinearDecoder(n_tok, emb_sz, dropout, tie_encoder=enc)) 238 | 239 | 240 | def get_rnn_classifer(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False, 241 | dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5): 242 | rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir, 243 | dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop) 244 | return SequentialRNN(rnn_enc, PoolingLinearClassifier(layers, drops)) 245 | 246 | def get_rnn_regression(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False, 247 | dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5): 248 | rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir, 249 | dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop) 250 | return SequentialRNN(rnn_enc, LinearRegression(layers, drops)) 251 | 252 | -------------------------------------------------------------------------------- /alica_bkw.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Alica" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Put these at the top of every notebook, to get automatic reloading and inline plotting\n", 17 | "%reload_ext autoreload\n", 18 | "%autoreload 2\n", 19 | "%matplotlib inline" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from fastai.text import *\n", 29 | "import html" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "BOS = 'xbos' # beginning-of-sentence tag\n", 39 | "FLD = 'xfld' # data field tag\n", 40 | "\n", 41 | "PATH=Path('data/alica/')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Standardize format" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "CLAS_PATH=Path('data/alica_clas/')\n", 58 | "CLAS_PATH.mkdir(exist_ok=True)\n", 59 | "\n", 60 | "LM_PATH=Path('data/alica_lm/')\n", 61 | "LM_PATH.mkdir(exist_ok=True)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "CLASSES = ['good', 'neutral', 'bad']\n", 71 | "#d = {\"good\":2,\"neutral\":1,\"bad\":0}\n", 72 | "d = {\"good\":1,\"bad\":0}\n", 73 | "col_names = ['labels','text']" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Language model tokens" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "chunksize=96000" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "re1 = re.compile(r' +')\n", 99 | "\n", 100 | "def 
fixup(x):\n", 101 | " x = x.replace(u'\\xa0', u' ')\n", 102 | " x = x.replace('|', '\\n')\n", 103 | " x = x.replace('\\n\\n\\n', '\\n')\n", 104 | " x = x.replace('\\n\\n', '\\n')\n", 105 | " x = x.replace(\". . .\",\"...\")\n", 106 | " x = \" , \".join(x.split(\",\"))\n", 107 | " x = \" . \".join(x.split(\".\"))\n", 108 | " x = x.replace(\". . .\",\"...\")\n", 109 | " \n", 110 | " return re1.sub(' ', html.unescape(x))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 9, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "def get_texts(df):\n", 120 | " labels = df[0].values.astype(np.int64)\n", 121 | " texts = '\\n' + df[1].astype(str)\n", 122 | " texts = texts.apply(fixup).values.astype(str)\n", 123 | "\n", 124 | " tok = Tokenizer().proc_all_mp(partition_by_cores(texts), 'xx')\n", 125 | " return tok, list(labels)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def get_all(df):\n", 135 | " tok, labels = [], []\n", 136 | " for i, r in tqdm(enumerate(df)):\n", 137 | " tok_, labels_ = get_texts(r)\n", 138 | " tok += tok_;\n", 139 | " labels += labels_\n", 140 | " return tok, labels" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "tmp_lm = []\n", 150 | "trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')\n", 151 | "\n", 152 | "for x in tqdm(trn_lm):\n", 153 | " tmp_lm.append(x[::-1])\n", 154 | " \n", 155 | "np.save(LM_PATH/'tmp'/'trn_ids_bkw.npy', np.array(tmp_lm))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "tmp_lm = []\n", 165 | "val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')\n", 166 | "\n", 167 | "for x in tqdm(val_lm):\n", 168 | " tmp_lm.append(x[::-1])\n", 169 | " \n", 170 | "np.save(LM_PATH/'tmp'/'val_ids_bkw.npy', np.array(tmp_lm))" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "tmp_lm = []\n", 180 | "trn_lm = np.load(CLAS_PATH/'tmp'/'trn_ids.npy')\n", 181 | "\n", 182 | "for x in tqdm(trn_lm):\n", 183 | " tmp_lm.append(x[::-1])\n", 184 | " \n", 185 | "np.save(CLAS_PATH/'tmp'/'trn_ids_bkw.npy', np.array(tmp_lm))" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "tmp_lm = []\n", 195 | "val_lm = np.load(CLAS_PATH/'tmp'/'val_ids.npy')\n", 196 | "\n", 197 | "for x in tqdm(val_lm):\n", 198 | " tmp_lm.append(x[::-1])\n", 199 | " \n", 200 | "np.save(CLAS_PATH/'tmp'/'val_ids_bkw.npy', np.array(tmp_lm))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "tmp_lm = []\n", 210 | "tst_lm = np.load(CLAS_PATH/'tmp'/'tst_ids.npy')\n", 211 | "\n", 212 | "for x in tqdm(tst_lm):\n", 213 | " tmp_lm.append(x[::-1])\n", 214 | " \n", 215 | "np.save(CLAS_PATH/'tmp'/'tst_ids_bkw.npy', np.array(tmp_lm))" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "tmp_lm = []\n", 225 | "tst_lm = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 226 | "\n", 227 | "for x in tqdm(tst_lm):\n", 228 | " tmp_lm.append(x[::-1])\n", 229 | " \n", 230 | "np.save(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy', 
np.array(tmp_lm))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "for i in range(10):\n", 240 | " tmp_lm = []\n", 241 | " trn_lm = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}.npy')\n", 242 | "\n", 243 | " for x in tqdm(trn_lm):\n", 244 | " tmp_lm.append(x[::-1])\n", 245 | "\n", 246 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw.npy', np.array(tmp_lm))" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "for i in range(10):\n", 256 | " tmp_lm = []\n", 257 | " val_lm = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}.npy')\n", 258 | "\n", 259 | " for x in tqdm(val_lm):\n", 260 | " tmp_lm.append(x[::-1])\n", 261 | "\n", 262 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw.npy', np.array(tmp_lm))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "for i in range(10):\n", 272 | " tmp_lm = []\n", 273 | " trn_lm = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_.npy')\n", 274 | "\n", 275 | " for x in tqdm(trn_lm):\n", 276 | " tmp_lm.append(x[::-1])\n", 277 | "\n", 278 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_.npy', np.array(tmp_lm))" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "for i in range(10):\n", 288 | " tmp_lm = []\n", 289 | " val_lm = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_.npy')\n", 290 | "\n", 291 | " for x in tqdm(val_lm):\n", 292 | " tmp_lm.append(x[::-1])\n", 293 | "\n", 294 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_.npy', np.array(tmp_lm))" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "for i in range(10):\n", 304 | " tmp_lm = []\n", 305 | " trn_lm = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_n.npy')\n", 306 | "\n", 307 | " for x in tqdm(trn_lm):\n", 308 | " tmp_lm.append(x[::-1])\n", 309 | "\n", 310 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_n.npy', np.array(tmp_lm))\n", 311 | "\n", 312 | "for i in range(10):\n", 313 | " tmp_lm = []\n", 314 | " val_lm = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_n.npy')\n", 315 | "\n", 316 | " for x in tqdm(val_lm):\n", 317 | " tmp_lm.append(x[::-1])\n", 318 | "\n", 319 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_n.npy', np.array(tmp_lm))" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "for i in range(10):\n", 329 | " tmp_lm = []\n", 330 | " trn_lm = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_r.npy')\n", 331 | "\n", 332 | " for x in tqdm(trn_lm):\n", 333 | " tmp_lm.append(x[::-1])\n", 334 | "\n", 335 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_r.npy', np.array(tmp_lm))\n", 336 | "\n", 337 | "for i in range(10):\n", 338 | " tmp_lm = []\n", 339 | " val_lm = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_r.npy')\n", 340 | "\n", 341 | " for x in tqdm(val_lm):\n", 342 | " tmp_lm.append(x[::-1])\n", 343 | "\n", 344 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_r.npy', np.array(tmp_lm))" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## Language model" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "trn_lm = 
np.load(LM_PATH/'tmp'/'trn_ids_bkw.npy')\n", 361 | "val_lm = np.load(LM_PATH/'tmp'/'val_ids_bkw.npy')\n", 362 | "itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))\n", 363 | "\n", 364 | "vs=len(itos)\n", 365 | "vs,len(trn_lm)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "em_sz,nh,nl = 400,1150,3\n", 375 | "\n", 376 | "wd=1e-7\n", 377 | "bptt=70\n", 378 | "bs=52\n", 379 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)\n", 389 | "val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)\n", 390 | "md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "learner= md.get_model(opt_fn, em_sz, nh, nl, \n", 409 | " dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])\n", 410 | "\n", 411 | "learner.metrics = [accuracy]\n", 412 | "learner.unfreeze()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "lr=1e-3\n", 422 | "lrs = lr" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "learner.save('lm_last_ft_bkw')" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "learner.load('lm_last_ft_bkw')" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "learner.unfreeze()" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "learner.save('lm0_bkw')" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "learner.save_encoder('lm_enc0_bkw')" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "learner.load('lm0_bkw')" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "scrolled": false 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=10)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": 
[ 514 | "# 3.955312 3.864959 0.374353 \n", 515 | "learner.save('lm1_bkw')" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "learner.save_encoder('lm_enc1_bkw')" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "learner.sched.plot_loss()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "## Predict" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "path = \"/mnt/6676114C76111E7D/Kaggle/Alica/\"" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "#folds neutral=bad\n", 559 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 560 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 561 | "\n", 562 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 563 | "vs = len(itos)\n", 564 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 565 | "bs = 48\n", 566 | "c = 2\n", 567 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 568 | "\n", 569 | "lr=1e-3\n", 570 | "lrm = 2.6\n", 571 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 572 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 573 | "\n", 574 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 575 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 576 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 577 | "tst_samp = SimpleSampler(tst_clas)\n", 578 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 579 | "\n", 580 | "for i in tqdm(range(10)):\n", 581 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw.npy')\n", 582 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw.npy')\n", 583 | " \n", 584 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}.npy'))\n", 585 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}.npy'))\n", 586 | "\n", 587 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 588 | " val_ds = TextDataset(val_clas, val_labels)\n", 589 | " \n", 590 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 591 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 592 | " \n", 593 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 594 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 595 | " \n", 596 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 597 | "\n", 598 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 599 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 600 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 601 | "\n", 602 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 603 | "\n", 604 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 605 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 606 | " learn.clip=25.\n", 607 | " learn.metrics = [accuracy]\n", 608 | "\n", 609 | "# wd = 0\n", 610 | "# learn.load_encoder('lm_enc1_bkw')\n", 611 | "# learn.freeze_to(-1)\n", 612 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 613 | "# 
learn.save(f'fold_{i}_bkw')\n", 614 | "\n", 615 | "# learn.freeze_to(-2)\n", 616 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 617 | "# learn.save(f'fold_{i}_bkw')\n", 618 | " \n", 619 | "# wd = 1e-7\n", 620 | "# learn.unfreeze()\n", 621 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 622 | "# learn.save(f'fold_{i}_bkw')\n", 623 | " \n", 624 | "# learn.fit(lrs, 2, wds=wd, cycle_len=1)\n", 625 | "# learn.save(f'fold_{i}_bkw')\n", 626 | " learn.load(f'./10_folds_bkw_bad_85921/fold_{i}_bkw')\n", 627 | " \n", 628 | " wd = 1e-7\n", 629 | " learn.unfreeze()\n", 630 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 631 | " learn.save(path+f'fold_{i}_bkw')\n", 632 | " \n", 633 | " preds = learn.predict(is_test=True)\n", 634 | " \n", 635 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 636 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 637 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 638 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [ 647 | "#folds neutral=good\n", 648 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 649 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 650 | "\n", 651 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 652 | "vs = len(itos)\n", 653 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 654 | "bs = 48\n", 655 | "c = 2\n", 656 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 657 | "\n", 658 | "lr=1e-3\n", 659 | "lrm = 2.6\n", 660 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 661 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 662 | "\n", 663 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 664 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 665 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 666 | "tst_samp = SimpleSampler(tst_clas)\n", 667 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 668 | "\n", 669 | "for i in tqdm(range(10)):\n", 670 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_.npy')\n", 671 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_.npy')\n", 672 | " \n", 673 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_.npy'))\n", 674 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_.npy'))\n", 675 | "\n", 676 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 677 | " val_ds = TextDataset(val_clas, val_labels)\n", 678 | " \n", 679 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 680 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 681 | " \n", 682 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 683 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 684 | " \n", 685 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 686 | "\n", 687 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 688 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 689 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 690 | "\n", 691 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 692 | 
"\n", 693 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 694 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 695 | " learn.clip=25.\n", 696 | " learn.metrics = [accuracy]\n", 697 | "\n", 698 | "# wd = 0\n", 699 | "# learn.load_encoder('lm_enc1_bkw')\n", 700 | "# learn.freeze_to(-1)\n", 701 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 702 | "# learn.save(f'fold_{i}_bkw_')\n", 703 | "\n", 704 | "# learn.freeze_to(-2)\n", 705 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 706 | "# learn.save(f'fold_{i}_bkw_')\n", 707 | " \n", 708 | "# wd = 1e-7\n", 709 | "# learn.unfreeze()\n", 710 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 711 | "# learn.save(f'fold_{i}_bkw_')\n", 712 | " \n", 713 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 714 | "# learn.save(f'fold_{i}_bkw_')\n", 715 | " learn.load(f'./10_folds_bkw_good_85650/fold_{i}_bkw_')\n", 716 | " \n", 717 | " wd = 1e-7\n", 718 | " learn.unfreeze()\n", 719 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 720 | " learn.save(path+f'fold_{i}_bkw_')\n", 721 | " \n", 722 | " preds = learn.predict(is_test=True)\n", 723 | " \n", 724 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 725 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 726 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 727 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw_.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "#folds 3\n", 737 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 738 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 739 | "\n", 740 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 741 | "vs = len(itos)\n", 742 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 743 | "bs = 48\n", 744 | "c = 3\n", 745 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 746 | "\n", 747 | "lr=1e-3\n", 748 | "lrm = 2.6\n", 749 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 750 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 751 | "\n", 752 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 753 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 754 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 755 | "tst_samp = SimpleSampler(tst_clas)\n", 756 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 757 | "\n", 758 | "for i in tqdm(range(10)):\n", 759 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw.npy')\n", 760 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw.npy')\n", 761 | " \n", 762 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_3.npy'))\n", 763 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_3.npy'))\n", 764 | "\n", 765 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 766 | " val_ds = TextDataset(val_clas, val_labels)\n", 767 | " \n", 768 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 769 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 770 | " \n", 771 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 772 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 773 
| " \n", 774 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 775 | "\n", 776 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 777 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 778 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 779 | "\n", 780 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 781 | "\n", 782 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 783 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 784 | " learn.clip=25.\n", 785 | " learn.metrics = [accuracy]\n", 786 | "\n", 787 | "# wd = 0\n", 788 | "# learn.load_encoder('lm_enc1_bkw')\n", 789 | "# learn.freeze_to(-1)\n", 790 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 791 | "# learn.save(f'fold_{i}_bkw_3')\n", 792 | "\n", 793 | "# learn.freeze_to(-2)\n", 794 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 795 | "# learn.save(f'fold_{i}_bkw_3')\n", 796 | " \n", 797 | "# wd = 1e-7\n", 798 | "# learn.unfreeze()\n", 799 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 800 | "# learn.save(f'fold_{i}_bkw_3')\n", 801 | " \n", 802 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 803 | "# learn.save(f'fold_{i}_bkw_3')\n", 804 | " \n", 805 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 806 | "# learn.save(f'fold_{i}_bkw_3')\n", 807 | " learn.load(f'./10_folds_bkw_3_1.4_86795/fold_{i}_bkw_3_')\n", 808 | " \n", 809 | " wd = 1e-7\n", 810 | " learn.unfreeze()\n", 811 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 812 | " learn.save(path+f'fold_{i}_bkw_3')\n", 813 | " \n", 814 | " preds = learn.predict(is_test=True)\n", 815 | " \n", 816 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 817 | " df[\"prob1\"] = [c[1] for c in np.exp(preds)]\n", 818 | " df[\"prob2\"] = [c[2] for c in np.exp(preds)]\n", 819 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw_3.tsv\",columns=[2,3,\"prob1\",\"prob2\"],index=False,sep=\"\\t\",header=False)" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": null, 825 | "metadata": {}, 826 | "outputs": [], 827 | "source": [ 828 | "#folds neutral=good\n", 829 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 830 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 831 | "\n", 832 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 833 | "vs = len(itos)\n", 834 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 835 | "bs = 48\n", 836 | "c = 2\n", 837 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 838 | "\n", 839 | "lr=1e-3\n", 840 | "lrm = 2.6\n", 841 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 842 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 843 | "\n", 844 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 845 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 846 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 847 | "tst_samp = SimpleSampler(tst_clas)\n", 848 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 849 | "\n", 850 | "for i in tqdm(range(10)):\n", 851 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_n.npy')\n", 852 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_n.npy')\n", 853 | " \n", 854 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_n.npy'))\n", 855 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_n.npy'))\n", 856 | "\n", 857 
| " trn_ds = TextDataset(trn_clas, trn_labels)\n", 858 | " val_ds = TextDataset(val_clas, val_labels)\n", 859 | " \n", 860 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 861 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 862 | " \n", 863 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 864 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 865 | " \n", 866 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 867 | "\n", 868 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 869 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 870 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 871 | "\n", 872 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 873 | "\n", 874 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 875 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 876 | " learn.clip=25.\n", 877 | " learn.metrics = [accuracy]\n", 878 | "\n", 879 | "# wd = 0\n", 880 | "# learn.load_encoder('lm_enc1_bkw')\n", 881 | "# learn.freeze_to(-1)\n", 882 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 883 | "# learn.save(f'fold_{i}_bkw_n')\n", 884 | "\n", 885 | "# learn.freeze_to(-2)\n", 886 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 887 | "# learn.save(f'fold_{i}_bkw_n')\n", 888 | " \n", 889 | "# wd = 1e-7\n", 890 | "# learn.unfreeze()\n", 891 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 892 | "# learn.save(f'fold_{i}_bkw_n')\n", 893 | " \n", 894 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 895 | "# learn.save(f'fold_{i}_bkw_n')\n", 896 | " learn.load(f'./10_folds_bkw_neutral/fold_{i}_bkw_n')\n", 897 | " \n", 898 | " wd = 1e-7\n", 899 | " learn.unfreeze()\n", 900 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 901 | " learn.save(path+f'fold_{i}_bkw_n')\n", 902 | " \n", 903 | " preds = learn.predict(is_test=True)\n", 904 | " \n", 905 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 906 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 907 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 908 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw_n.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": null, 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "#folds reg\n", 918 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 919 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 920 | "\n", 921 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 922 | "vs = len(itos)\n", 923 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 924 | "bs = 48\n", 925 | "c = 1\n", 926 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 927 | "\n", 928 | "lr=1e-3\n", 929 | "lrm = 2.6\n", 930 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 931 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 932 | "\n", 933 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 934 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 935 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 936 | "tst_samp = SimpleSampler(tst_clas)\n", 937 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, 
sampler=tst_samp)\n", 938 | "\n", 939 | "for i in tqdm(range(10)):\n", 940 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_r.npy')\n", 941 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_r.npy')\n", 942 | " \n", 943 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_r.npy'))\n", 944 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_r.npy'))\n", 945 | "\n", 946 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 947 | " val_ds = TextDataset(val_clas, val_labels)\n", 948 | " \n", 949 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 950 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 951 | " \n", 952 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 953 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 954 | " \n", 955 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 956 | "\n", 957 | " m = get_rnn_regression(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 958 | " layers=[em_sz, 50, c], drops=[dps[4], 0.1],\n", 959 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 960 | "\n", 961 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 962 | "\n", 963 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 964 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 965 | " learn.clip=25.\n", 966 | " \n", 967 | " learn.crit = F.mse_loss\n", 968 | "\n", 969 | "# wd = 0\n", 970 | "# learn.load_encoder('lm_enc1')\n", 971 | "# learn.freeze_to(-1)\n", 972 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 973 | "# learn.save(f'fold_{i}_bkw_r')\n", 974 | "\n", 975 | "# learn.freeze_to(-2)\n", 976 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 977 | "# learn.save(f'fold_{i}_bkw_r')\n", 978 | " \n", 979 | "# wd = 1e-7\n", 980 | "# learn.unfreeze()\n", 981 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 982 | "# learn.save(f'fold_{i}_bkw_r')\n", 983 | " \n", 984 | "# learn.fit(lrs, 5, wds=wd, cycle_len=1)\n", 985 | "# learn.save(f'fold_{i}_bkw_r')\n", 986 | " learn.load(f'./10_folds_bkw_mse_86040/fold_{i}_bkw_r')\n", 987 | " preds = learn.predict(is_test=True)\n", 988 | " \n", 989 | " wd = 1e-7\n", 990 | " learn.unfreeze()\n", 991 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 992 | " learn.save(path+f'fold_{i}_bkw_r')\n", 993 | " \n", 994 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 995 | " df[\"prob\"] = [c[0] for c in preds]\n", 996 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw_r.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 997 | ] 998 | } 999 | ], 1000 | "metadata": { 1001 | "_draft": { 1002 | "nbviewer_url": "https://gist.github.com/0dd0df21cf404cf2bb51d0148c8b7d8b" 1003 | }, 1004 | "gist": { 1005 | "data": { 1006 | "description": "fastai.text imdb example", 1007 | "public": true 1008 | }, 1009 | "id": "0dd0df21cf404cf2bb51d0148c8b7d8b" 1010 | }, 1011 | "kernelspec": { 1012 | "display_name": "Python [default]", 1013 | "language": "python", 1014 | "name": "python3" 1015 | }, 1016 | "language_info": { 1017 | "codemirror_mode": { 1018 | "name": "ipython", 1019 | "version": 3 1020 | }, 1021 | "file_extension": ".py", 1022 | "mimetype": "text/x-python", 1023 | "name": "python", 1024 | "nbconvert_exporter": "python", 1025 | "pygments_lexer": "ipython3", 1026 | "version": "3.6.4" 
1027 | }, 1028 | "toc": { 1029 | "colors": { 1030 | "hover_highlight": "#DAA520", 1031 | "navigate_num": "#000000", 1032 | "navigate_text": "#333333", 1033 | "running_highlight": "#FF0000", 1034 | "selected_highlight": "#FFD700", 1035 | "sidebar_border": "#EEEEEE", 1036 | "wrapper_background": "#FFFFFF" 1037 | }, 1038 | "moveMenuLeft": true, 1039 | "nav_menu": { 1040 | "height": "86px", 1041 | "width": "252px" 1042 | }, 1043 | "navigate_menu": true, 1044 | "number_sections": true, 1045 | "sideBar": true, 1046 | "threshold": 4, 1047 | "toc_cell": false, 1048 | "toc_section_display": "block", 1049 | "toc_window_display": false, 1050 | "widenNotebook": false 1051 | } 1052 | }, 1053 | "nbformat": 4, 1054 | "nbformat_minor": 2 1055 | } 1056 | -------------------------------------------------------------------------------- /alica_fwd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Alica" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Put these at the top of every notebook, to get automatic reloading and inline plotting\n", 17 | "%reload_ext autoreload\n", 18 | "%autoreload 2\n", 19 | "%matplotlib inline" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from fastai.text import *\n", 29 | "import html" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "BOS = 'xbos' # beginning-of-sentence tag\n", 39 | "FLD = 'xfld' # data field tag\n", 40 | "\n", 41 | "PATH=Path('data/alica/')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Standardize format" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "CLAS_PATH=Path('data/alica_clas/')\n", 58 | "CLAS_PATH.mkdir(exist_ok=True)\n", 59 | "\n", 60 | "LM_PATH=Path('data/alica_lm/')\n", 61 | "LM_PATH.mkdir(exist_ok=True)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "CLASSES = ['good', 'neutral', 'bad']\n", 71 | "d = {\"good\":2,\"neutral\":1,\"bad\":0}\n", 72 | "#d = {\"good\":1,\"neutral\":0}\n", 73 | "#d = {\"good\":1,\"bad\":0}\n", 74 | "col_names = ['labels','text']" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "(CLAS_PATH/'classes.txt').open('w').writelines(f'{o}\\n' for o in CLASSES)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "df = pd.read_csv(PATH/\"public.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 93 | "df.fillna(\"\", inplace=True)\n", 94 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 95 | "df[\"id\"] = df[0]\n", 96 | "df[\"num\"] = df[4]\n", 97 | "df[\"labels\"] = [0]*len(df)\n", 98 | "\n", 99 | "df.to_csv(CLAS_PATH/\"test.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "df = 
pd.read_csv(PATH/\"final.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 109 | "df.fillna(\"\", inplace=True)\n", 110 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 111 | "df[\"id\"] = df[0]\n", 112 | "df[\"num\"] = df[4]\n", 113 | "df[\"labels\"] = [0]*len(df)\n", 114 | "\n", 115 | "df.to_csv(CLAS_PATH/\"test_.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "#folds neutral=bad\n", 125 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 126 | "df.head()\n", 127 | "\n", 128 | "df.fillna(\"\", inplace=True)\n", 129 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 130 | "df[\"id\"] = df[0]\n", 131 | "df[\"num\"] = df[4]\n", 132 | "df[\"labels\"] = df[6].apply(lambda x: d[x.replace(\"neutral\",\"bad\")])\n", 133 | "\n", 134 | "from sklearn.model_selection import KFold\n", 135 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 136 | "i = 0\n", 137 | "for train_index, valid_index in kf.split(df):\n", 138 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 139 | " \n", 140 | " dff.to_csv(CLAS_PATH/f\"train_{i}.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 141 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 142 | " \n", 143 | " i+=1\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "#folds neutral=good\n", 153 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 154 | "df.head()\n", 155 | "\n", 156 | "df.fillna(\"\", inplace=True)\n", 157 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 158 | "df[\"id\"] = df[0]\n", 159 | "df[\"num\"] = df[4]\n", 160 | "df[\"labels\"] = df[6].apply(lambda x: d[x.replace(\"neutral\",\"good\")])\n", 161 | "\n", 162 | "from sklearn.model_selection import KFold\n", 163 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 164 | "i = 0\n", 165 | "for train_index, valid_index in kf.split(df):\n", 166 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 167 | " \n", 168 | " dff.to_csv(CLAS_PATH/f\"train_{i}_.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 169 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}_.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 170 | " \n", 171 | " i+=1" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "#folds 3\n", 181 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 182 | "df.head()\n", 183 | "\n", 184 | "df.fillna(\"\", inplace=True)\n", 185 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 186 | "df[\"id\"] = df[0]\n", 187 | "df[\"num\"] = df[4]\n", 188 | "df[\"labels\"] = df[6].apply(lambda x: d[x])\n", 189 | "\n", 190 | "from sklearn.model_selection import KFold\n", 191 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 192 | "i = 0\n", 193 | "for train_index, valid_index in kf.split(df):\n", 194 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 195 | " \n", 196 | " 
dff.to_csv(CLAS_PATH/f\"train_{i}_3.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 197 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}_3.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 198 | " \n", 199 | " i+=1" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "#folds no neutral\n", 209 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 210 | "\n", 211 | "df.fillna(\"\", inplace=True)\n", 212 | "df = df[df[6]!=\"neutral\"]\n", 213 | "\n", 214 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 215 | "df[\"id\"] = df[0]\n", 216 | "df[\"num\"] = df[4]\n", 217 | "df[\"labels\"] = df[6].apply(lambda x: d[x])\n", 218 | "\n", 219 | "from sklearn.model_selection import KFold\n", 220 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 221 | "i = 0\n", 222 | "for train_index, valid_index in kf.split(df):\n", 223 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 224 | " \n", 225 | " dff.to_csv(CLAS_PATH/f\"train_{i}_n.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 226 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}_n.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 227 | " \n", 228 | " i+=1" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "#folds reg\n", 238 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 239 | "\n", 240 | "df.fillna(\"\", inplace=True)\n", 241 | "df = df[df[6]!=\"neutral\"]\n", 242 | "\n", 243 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 244 | "df[\"id\"] = df[0]\n", 245 | "df[\"num\"] = df[4]\n", 246 | "\n", 247 | "res = []\n", 248 | "for x,y in zip(df[6], df[7]):\n", 249 | " if x == \"good\":\n", 250 | " t = 1+y\n", 251 | " if x == \"neutral\":\n", 252 | " t = 1 \n", 253 | " if x == \"bad\":\n", 254 | " t = 1-y\n", 255 | " res.append(t)\n", 256 | "\n", 257 | "df[\"labels\"] = res\n", 258 | "\n", 259 | "from sklearn.model_selection import KFold\n", 260 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 261 | "i = 0\n", 262 | "for train_index, valid_index in kf.split(df):\n", 263 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 264 | " \n", 265 | " dff.to_csv(CLAS_PATH/f\"train_{i}_r.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 266 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}_r.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 267 | " \n", 268 | " i+=1" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "## OpenSubtitles" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "df = pd.read_csv(PATH/\"OpenSubtitles2016.en-ru.ru\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "texts = df[0]\n", 294 | "len(texts)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "trn_texts,val_texts = 
sklearn.model_selection.train_test_split(texts, test_size=0.1)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "len(trn_texts), len(val_texts)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "df_trn = pd.DataFrame({'text':trn_texts, 'labels':[0]*len(trn_texts)}, columns=col_names)\n", 322 | "df_val = pd.DataFrame({'text':val_texts, 'labels':[0]*len(val_texts)}, columns=col_names)\n", 323 | "\n", 324 | "df_trn.to_csv(LM_PATH/'train.csv', header=False, index=False)\n", 325 | "df_val.to_csv(LM_PATH/'test.csv', header=False, index=False)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "## Language model tokens" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "chunksize=96000" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "re1 = re.compile(r' +')\n", 351 | "\n", 352 | "def fixup(x):\n", 353 | " x = x.replace(u'\\xa0', u' ')\n", 354 | " x = x.replace('|', '\\n')\n", 355 | " x = x.replace('\\n\\n\\n', '\\n')\n", 356 | " x = x.replace('\\n\\n', '\\n')\n", 357 | " x = x.replace(\". . .\",\"...\")\n", 358 | " x = \" , \".join(x.split(\",\"))\n", 359 | " x = \" . \".join(x.split(\".\"))\n", 360 | " x = x.replace(\". . .\",\"...\")\n", 361 | " \n", 362 | " return re1.sub(' ', html.unescape(x))" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "def get_texts(df):\n", 372 | " #labels = df[0].values.astype(np.int64)\n", 373 | " labels = df[0].values.astype(float) #for reg\n", 374 | " texts = '\\n' + df[1].astype(str)\n", 375 | " texts = texts.apply(fixup).values.astype(str)\n", 376 | "\n", 377 | " tok = Tokenizer().proc_all_mp(partition_by_cores(texts), 'xx')\n", 378 | " return tok, list(labels)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "def get_all(df):\n", 388 | " tok, labels = [], []\n", 389 | " for i, r in tqdm(enumerate(df)):\n", 390 | " tok_, labels_ = get_texts(r)\n", 391 | " tok += tok_;\n", 392 | " labels += labels_\n", 393 | " return tok, labels" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "df_val = pd.read_csv(LM_PATH/'test.csv', header=None, chunksize=chunksize)\n", 403 | "tok_val, val_labels = get_all(df_val)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "scrolled": false 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "df_trn = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=chunksize)\n", 415 | "tok_trn, trn_labels = get_all(df_trn)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "from random import choice\n", 432 | "choice(tok_trn)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 
null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "(LM_PATH/'tmp').mkdir(exist_ok=True)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "np.save(LM_PATH/'tmp'/'tok_trn_0.npy', tok_trn[:4000000])\n", 451 | "np.save(LM_PATH/'tmp'/'tok_trn_1.npy', tok_trn[4000000:8000000])\n", 452 | "np.save(LM_PATH/'tmp'/'tok_trn_2.npy', tok_trn[8000000:10000000])\n", 453 | "np.save(LM_PATH/'tmp'/'tok_trn_3.npy', tok_trn[10000000:12000000])\n", 454 | "np.save(LM_PATH/'tmp'/'tok_trn_4.npy', tok_trn[12000000:14000000])\n", 455 | "np.save(LM_PATH/'tmp'/'tok_trn_5.npy', tok_trn[14000000:15000000])\n", 456 | "np.save(LM_PATH/'tmp'/'tok_trn_6.npy', tok_trn[15000000:])\n", 457 | "np.save(LM_PATH/'tmp'/'tok_val.npy', tok_val)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy')" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "tok_trn = np.array([])\n", 476 | "for i in tqdm(range(7)):\n", 477 | " tok_trn = np.append(tok_trn, np.load(LM_PATH/'tmp'/f'tok_trn_{i}.npy'))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "scrolled": true 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "freq = Counter(p for o in tok_trn for p in o)\n", 489 | "freq.most_common(25)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "max_vocab = 60000\n", 506 | "min_freq = 2" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]\n", 516 | "itos.insert(0, '_pad_')\n", 517 | "itos.insert(0, '_unk_')" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 527 | "len(itos)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 537 | "val_lm = np.array([[stoi[o] for o in p] for p in tok_val])" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)\n", 547 | "np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)\n", 548 | "pickle.dump(itos, open(LM_PATH/'tmp'/'itos.pkl', 'wb'))" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')\n", 558 | "val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')\n", 559 | "itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "vs=len(itos)\n", 569 | 
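"# sanity check (not in the original run): itos holds at most max_vocab tokens plus '_unk_' and '_pad_'\n", "assert vs <= max_vocab + 2\n",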
"vs,len(trn_lm)" 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": {}, 575 | "source": [ 576 | "## Language model" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "em_sz,nh,nl = 400,1150,3\n", 586 | "\n", 587 | "wd=1e-7\n", 588 | "bptt=70\n", 589 | "bs=52\n", 590 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)\n", 600 | "val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)\n", 601 | "md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "learner= md.get_model(opt_fn, em_sz, nh, nl, \n", 620 | " dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])\n", 621 | "\n", 622 | "learner.metrics = [accuracy]\n", 623 | "learner.unfreeze()" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "lr=1e-3\n", 633 | "lrs = lr" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "learner.save('lm_last_ft')" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [ 660 | "learner.load('lm_last_ft')" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "learner.unfreeze()" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [ 687 | "learner.load('lm0')" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": null, 693 | "metadata": { 694 | "scrolled": false 695 | }, 696 | "outputs": [], 697 | "source": [ 698 | "learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=5)" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": null, 704 | "metadata": {}, 705 | "outputs": [], 706 | "source": [ 707 | "#3.980186 3.859496 0.364186\n", 708 | "learner.save('lm1')" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "learner.save_encoder('lm_enc1')" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "learner.load('lm1')" 727 | ] 728 | }, 729 | { 730 | "cell_type": 
"code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=5)" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "learner.sched.plot_loss()" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "learner.save('lm2')\n", 754 | "learner.save_encoder('lm_enc2')\n", 755 | "#3.961754 3.840744 0.365777" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "## Classifier tokens" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "#df_trn = pd.read_csv(CLAS_PATH/'train.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 772 | "#df_val = pd.read_csv(CLAS_PATH/'valid.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 773 | "#df_tst = pd.read_csv(CLAS_PATH/'test.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 774 | "df_tst = pd.read_csv(CLAS_PATH/'test_.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": null, 780 | "metadata": {}, 781 | "outputs": [], 782 | "source": [ 783 | "#from random import choice\n", 784 | "#choice(tok_tst)" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "#tok_trn, trn_labels = get_all(df_trn)\n", 794 | "#tok_val, val_labels = get_all(df_val)\n", 795 | "tok_tst, tst_labels = get_all(df_tst)\n" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": null, 801 | "metadata": {}, 802 | "outputs": [], 803 | "source": [ 804 | "(CLAS_PATH/'tmp').mkdir(exist_ok=True)\n", 805 | "\n", 806 | "#np.save(CLAS_PATH/'tmp'/'tok_trn.npy', tok_trn)\n", 807 | "#np.save(CLAS_PATH/'tmp'/'tok_val.npy', tok_val)\n", 808 | "#np.save(CLAS_PATH/'tmp'/'tok_tst.npy', tok_tst)\n", 809 | "np.save(CLAS_PATH/'tmp'/'tok_tst_.npy', tok_tst)\n", 810 | "\n", 811 | "#np.save(CLAS_PATH/'tmp'/'trn_labels.npy', trn_labels)\n", 812 | "#np.save(CLAS_PATH/'tmp'/'val_labels.npy', val_labels)\n", 813 | "#np.save(CLAS_PATH/'tmp'/'tst_labels.npy', tst_labels)\n", 814 | "np.save(CLAS_PATH/'tmp'/'tst_labels_.npy', tst_labels)" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [ 823 | "#tok_trn = np.load(CLAS_PATH/'tmp'/'tok_trn.npy')\n", 824 | "#tok_val = np.load(CLAS_PATH/'tmp'/'tok_val.npy')\n", 825 | "#tok_tst = np.load(CLAS_PATH/'tmp'/'tok_tst.npy')\n", 826 | "tok_tst = np.load(CLAS_PATH/'tmp'/'tok_tst_.npy')" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "metadata": {}, 833 | "outputs": [], 834 | "source": [ 835 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 836 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 837 | "len(itos)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "#trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 847 | "#val_clas = np.array([[stoi[o] for o in 
p] for p in tok_val])\n", 848 | "tst_clas = np.array([[stoi[o] for o in p] for p in tok_tst])" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "#np.save(CLAS_PATH/'tmp'/'trn_ids.npy', trn_clas)\n", 858 | "#np.save(CLAS_PATH/'tmp'/'val_ids.npy', val_clas)\n", 859 | "#np.save(CLAS_PATH/'tmp'/'tst_ids.npy', tst_clas)\n", 860 | "np.save(CLAS_PATH/'tmp'/'tst_ids_.npy', tst_clas)" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": null, 866 | "metadata": {}, 867 | "outputs": [], 868 | "source": [ 869 | "#folds neutral=bad\n", 870 | "chunksize=96000\n", 871 | "\n", 872 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 873 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 874 | "\n", 875 | "for i in tqdm(range(10)):\n", 876 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 877 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 878 | " \n", 879 | " tok_trn, trn_labels = get_all(df_trn)\n", 880 | " tok_val, val_labels = get_all(df_val)\n", 881 | " \n", 882 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}.npy', tok_trn)\n", 883 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}.npy', tok_val)\n", 884 | "\n", 885 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}.npy', trn_labels)\n", 886 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}.npy', val_labels)\n", 887 | " \n", 888 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 889 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 890 | "\n", 891 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}.npy', trn_clas)\n", 892 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}.npy', val_clas)" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": null, 898 | "metadata": {}, 899 | "outputs": [], 900 | "source": [ 901 | "#folds neutral=good\n", 902 | "chunksize=96000\n", 903 | "\n", 904 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 905 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 906 | "\n", 907 | "for i in tqdm(range(10)):\n", 908 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}_.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 909 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}_.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 910 | " \n", 911 | " tok_trn, trn_labels = get_all(df_trn)\n", 912 | " tok_val, val_labels = get_all(df_val)\n", 913 | " \n", 914 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}_.npy', tok_trn)\n", 915 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}_.npy', tok_val)\n", 916 | "\n", 917 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}_.npy', trn_labels)\n", 918 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}_.npy', val_labels)\n", 919 | " \n", 920 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 921 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 922 | "\n", 923 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_.npy', trn_clas)\n", 924 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_.npy', val_clas)" 925 | ] 926 | }, 927 | { 928 | "cell_type": "code", 929 | "execution_count": null, 930 | "metadata": {}, 931 | "outputs": [], 932 | "source": [ 933 | "#folds 3\n", 934 | "chunksize=96000\n", 935 | "\n", 936 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 937 | "stoi 
= collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 938 | "\n", 939 | "for i in tqdm(range(10)):\n", 940 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}_3.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 941 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}_3.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 942 | " \n", 943 | " tok_trn, trn_labels = get_all(df_trn)\n", 944 | " tok_val, val_labels = get_all(df_val)\n", 945 | " \n", 946 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}_3.npy', tok_trn)\n", 947 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}_3.npy', tok_val)\n", 948 | "\n", 949 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}_3.npy', trn_labels)\n", 950 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}_3.npy', val_labels)\n", 951 | " \n", 952 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 953 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 954 | "\n", 955 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_3.npy', trn_clas)\n", 956 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_3.npy', val_clas)" 957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": null, 962 | "metadata": {}, 963 | "outputs": [], 964 | "source": [ 965 | "#folds no neutral\n", 966 | "chunksize=96000\n", 967 | "\n", 968 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 969 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 970 | "\n", 971 | "for i in tqdm(range(10)):\n", 972 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}_n.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 973 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}_n.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 974 | " \n", 975 | " tok_trn, trn_labels = get_all(df_trn)\n", 976 | " tok_val, val_labels = get_all(df_val)\n", 977 | " \n", 978 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}_n.npy', tok_trn)\n", 979 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}_n.npy', tok_val)\n", 980 | "\n", 981 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}_n.npy', trn_labels)\n", 982 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}_n.npy', val_labels)\n", 983 | " \n", 984 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 985 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 986 | "\n", 987 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_n.npy', trn_clas)\n", 988 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_n.npy', val_clas)" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [ 997 | "#folds reg\n", 998 | "chunksize=96000\n", 999 | "\n", 1000 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1001 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1002 | "\n", 1003 | "for i in tqdm(range(10)):\n", 1004 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}_r.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 1005 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}_r.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 1006 | " \n", 1007 | " tok_trn, trn_labels = get_all(df_trn)\n", 1008 | " tok_val, val_labels = get_all(df_val)\n", 1009 | " \n", 1010 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}_r.npy', tok_trn)\n", 1011 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}_r.npy', tok_val)\n", 1012 | "\n", 1013 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}_r.npy', trn_labels)\n", 
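" # reg targets are floats (1+df[7] for good, 1-df[7] for bad), not class ids\n",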
1014 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}_r.npy', val_labels)\n", 1015 | " \n", 1016 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 1017 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 1018 | "\n", 1019 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_r.npy', trn_clas)\n", 1020 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_r.npy', val_clas)" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "## Predict" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": null, 1033 | "metadata": {}, 1034 | "outputs": [], 1035 | "source": [ 1036 | "path = \"/mnt/6676114C76111E7D/Kaggle/Alica/\"" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": null, 1042 | "metadata": {}, 1043 | "outputs": [], 1044 | "source": [ 1045 | "#folds neutral=bad\n", 1046 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1047 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1048 | "\n", 1049 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1050 | "vs = len(itos)\n", 1051 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1052 | "bs = 48\n", 1053 | "c = 2\n", 1054 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1055 | "\n", 1056 | "lr=1e-3\n", 1057 | "lrm = 2.6\n", 1058 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1059 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1060 | "\n", 1061 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1062 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1063 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1064 | "tst_samp = SimpleSampler(tst_clas)\n", 1065 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 1066 | "\n", 1067 | "for i in tqdm(range(10)):\n", 1068 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}.npy')\n", 1069 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}.npy')\n", 1070 | " \n", 1071 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}.npy'))\n", 1072 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}.npy'))\n", 1073 | "\n", 1074 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1075 | " val_ds = TextDataset(val_clas, val_labels)\n", 1076 | " \n", 1077 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1078 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1079 | " \n", 1080 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1081 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1082 | " \n", 1083 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 1084 | "\n", 1085 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1086 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 1087 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1088 | "\n", 1089 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1090 | "\n", 1091 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1092 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1093 | " learn.clip=25.\n", 1094 | " learn.metrics = [accuracy]\n", 1095 | "\n", 1096 | "# wd = 0\n", 1097 | "# learn.load_encoder('lm_enc1')\n", 1098 | "# learn.freeze_to(-1)\n", 1099 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 
1100 | "# learn.save(f'fold_{i}')\n", 1101 | "\n", 1102 | "# learn.freeze_to(-2)\n", 1103 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1104 | "# learn.save(f'fold_{i}')\n", 1105 | " \n", 1106 | "# wd = 1e-7\n", 1107 | "# learn.unfreeze()\n", 1108 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1109 | "# learn.save(f'fold_{i}')\n", 1110 | " \n", 1111 | "# learn.fit(lrs, 2, wds=wd, cycle_len=1)\n", 1112 | "# learn.save(f'fold_{i}')\n", 1113 | " learn.load(f'./10_folds_fwd_bad_86283/fold_{i}')\n", 1114 | " \n", 1115 | " wd = 1e-7\n", 1116 | " learn.unfreeze()\n", 1117 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1118 | " learn.save(path+f'fold_{i}')\n", 1119 | "\n", 1120 | " preds = learn.predict(is_test=True)\n", 1121 | " \n", 1122 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1123 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 1124 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 1125 | " df.to_csv(f\"./blend1/_fold_{i}.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "code", 1130 | "execution_count": null, 1131 | "metadata": {}, 1132 | "outputs": [], 1133 | "source": [] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": null, 1138 | "metadata": {}, 1139 | "outputs": [], 1140 | "source": [] 1141 | }, 1142 | { 1143 | "cell_type": "code", 1144 | "execution_count": null, 1145 | "metadata": {}, 1146 | "outputs": [], 1147 | "source": [ 1148 | "#folds neutral=good\n", 1149 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1150 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1151 | "\n", 1152 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1153 | "vs = len(itos)\n", 1154 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1155 | "bs = 48\n", 1156 | "c = 2\n", 1157 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1158 | "\n", 1159 | "lr=1e-3\n", 1160 | "lrm = 2.6\n", 1161 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1162 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1163 | "\n", 1164 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1165 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1166 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1167 | "tst_samp = SimpleSampler(tst_clas)\n", 1168 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 1169 | "\n", 1170 | "for i in tqdm(range(10)):\n", 1171 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_.npy')\n", 1172 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_.npy')\n", 1173 | " \n", 1174 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_.npy'))\n", 1175 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_.npy'))\n", 1176 | "\n", 1177 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1178 | " val_ds = TextDataset(val_clas, val_labels)\n", 1179 | " \n", 1180 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1181 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1182 | " \n", 1183 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1184 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1185 | " \n", 1186 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 
1187 | "\n", 1188 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1189 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 1190 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1191 | "\n", 1192 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1193 | "\n", 1194 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1195 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1196 | " learn.clip=25.\n", 1197 | " learn.metrics = [accuracy]\n", 1198 | "\n", 1199 | "# wd = 0\n", 1200 | "# learn.load_encoder('lm_enc1')\n", 1201 | "# learn.freeze_to(-1)\n", 1202 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1203 | "# learn.save(f'fold_{i}_')\n", 1204 | "\n", 1205 | "# learn.freeze_to(-2)\n", 1206 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1207 | "# learn.save(f'fold_{i}_')\n", 1208 | " \n", 1209 | "# wd = 1e-7\n", 1210 | "# learn.unfreeze()\n", 1211 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1212 | "# learn.save(f'fold_{i}_')\n", 1213 | " \n", 1214 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1215 | "# learn.save(f'fold_{i}_')\n", 1216 | " learn.load(f'./10_folds_fwd_good_85739/fold_{i}_')\n", 1217 | " \n", 1218 | " wd = 1e-7\n", 1219 | " learn.unfreeze()\n", 1220 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1221 | " learn.save(path+f'fold_{i}_')\n", 1222 | " \n", 1223 | " preds = learn.predict(is_test=True)\n", 1224 | " \n", 1225 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1226 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 1227 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 1228 | " df.to_csv(f\"./blend1/_fold_{i}_.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 1229 | ] 1230 | }, 1231 | { 1232 | "cell_type": "code", 1233 | "execution_count": null, 1234 | "metadata": {}, 1235 | "outputs": [], 1236 | "source": [ 1237 | "#folds 3\n", 1238 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1239 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1240 | "\n", 1241 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1242 | "vs = len(itos)\n", 1243 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1244 | "bs = 48\n", 1245 | "c = 3\n", 1246 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1247 | "\n", 1248 | "lr=1e-3\n", 1249 | "lrm = 2.6\n", 1250 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1251 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1252 | "\n", 1253 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1254 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1255 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1256 | "tst_samp = SimpleSampler(tst_clas)\n", 1257 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 1258 | "\n", 1259 | "for i in tqdm(range(10)):\n", 1260 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_3.npy')\n", 1261 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_3.npy')\n", 1262 | " \n", 1263 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_3.npy'))\n", 1264 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_3.npy'))\n", 1265 | "\n", 1266 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1267 | " val_ds = TextDataset(val_clas, val_labels)\n", 1268 | " \n", 1269 | " trn_samp = 
SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1270 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1271 | " \n", 1272 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1273 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1274 | " \n", 1275 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 1276 | "\n", 1277 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1278 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 1279 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1280 | "\n", 1281 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1282 | "\n", 1283 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1284 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1285 | " learn.clip=25.\n", 1286 | " learn.metrics = [accuracy]\n", 1287 | "\n", 1288 | "# wd = 0\n", 1289 | "# learn.load_encoder('lm_enc1')\n", 1290 | "# learn.freeze_to(-1)\n", 1291 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1292 | "# learn.save(f'fold_{i}_3')\n", 1293 | "\n", 1294 | "# learn.freeze_to(-2)\n", 1295 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1296 | "# learn.save(f'fold_{i}_3')\n", 1297 | " \n", 1298 | "# wd = 1e-7\n", 1299 | "# learn.unfreeze()\n", 1300 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1301 | "# learn.save(f'fold_{i}_3')\n", 1302 | " \n", 1303 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1304 | "# learn.save(f'fold_{i}_3')\n", 1305 | " \n", 1306 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1307 | "# learn.save(f'fold_{i}_3')\n", 1308 | " learn.load(f'./10_folds_fwd_3_1.4_86962/fold_{i}_3')\n", 1309 | " \n", 1310 | " wd = 1e-7\n", 1311 | " learn.unfreeze()\n", 1312 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1313 | " learn.save(path+f'fold_{i}_3')\n", 1314 | " \n", 1315 | " preds = learn.predict(is_test=True)\n", 1316 | " \n", 1317 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1318 | " df[\"prob1\"] = [c[1] for c in np.exp(preds)]\n", 1319 | " df[\"prob2\"] = [c[2] for c in np.exp(preds)]\n", 1320 | " df.to_csv(f\"./blend1/_fold_{i}_3.tsv\",columns=[2,3,\"prob1\",\"prob2\"],index=False,sep=\"\\t\",header=False)" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": null, 1326 | "metadata": {}, 1327 | "outputs": [], 1328 | "source": [ 1329 | "#folds no neutral\n", 1330 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1331 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1332 | "\n", 1333 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1334 | "vs = len(itos)\n", 1335 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1336 | "bs = 48\n", 1337 | "c = 2\n", 1338 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1339 | "\n", 1340 | "lr=1e-3\n", 1341 | "lrm = 2.6\n", 1342 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1343 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1344 | "\n", 1345 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1346 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1347 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1348 | "tst_samp = SimpleSampler(tst_clas)\n", 1349 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, 
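# SimpleSampler keeps test rows in file order, so preds align with test_.csv\n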
sampler=tst_samp)\n", 1350 | "\n", 1351 | "for i in tqdm(range(10)):\n", 1352 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_n.npy')\n", 1353 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_n.npy')\n", 1354 | " \n", 1355 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_n.npy'))\n", 1356 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_n.npy'))\n", 1357 | "\n", 1358 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1359 | " val_ds = TextDataset(val_clas, val_labels)\n", 1360 | " \n", 1361 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1362 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1363 | " \n", 1364 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1365 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1366 | " \n", 1367 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 1368 | "\n", 1369 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1370 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 1371 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1372 | "\n", 1373 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1374 | "\n", 1375 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1376 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1377 | " learn.clip=25.\n", 1378 | " learn.metrics = [accuracy]\n", 1379 | "\n", 1380 | "# wd = 0\n", 1381 | "# learn.load_encoder('lm_enc1')\n", 1382 | "# learn.freeze_to(-1)\n", 1383 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1384 | "# learn.save(f'fold_{i}_n')\n", 1385 | "\n", 1386 | "# learn.freeze_to(-2)\n", 1387 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1388 | "# learn.save(f'fold_{i}_n')\n", 1389 | " \n", 1390 | "# wd = 1e-7\n", 1391 | "# learn.unfreeze()\n", 1392 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1393 | "# learn.save(f'fold_{i}_n')\n", 1394 | " \n", 1395 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1396 | "# learn.save(f'fold_{i}_n')\n", 1397 | " learn.load(f'./10_folds_fwd_neutral/fold_{i}_n')\n", 1398 | " \n", 1399 | " wd = 1e-7\n", 1400 | " learn.unfreeze()\n", 1401 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1402 | " learn.save(path+f'fold_{i}_n')\n", 1403 | " \n", 1404 | " preds = learn.predict(is_test=True)\n", 1405 | " \n", 1406 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1407 | " df[\"prob\"] = [c[1] for c in np.exp(preds)]\n", 1408 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 1409 | " df.to_csv(f\"./blend1/_fold_{i}_n.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 1410 | ] 1411 | }, 1412 | { 1413 | "cell_type": "code", 1414 | "execution_count": null, 1415 | "metadata": {}, 1416 | "outputs": [], 1417 | "source": [ 1418 | "#folds reg\n", 1419 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1420 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1421 | "\n", 1422 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1423 | "vs = len(itos)\n", 1424 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1425 | "bs = 48\n", 1426 | "c = 1\n", 1427 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1428 | "\n", 1429 | "lr=1e-3\n", 1430 | "lrm = 2.6\n", 1431 | "lrs = 
np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1432 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1433 | "\n", 1434 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1435 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1436 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1437 | "tst_samp = SimpleSampler(tst_clas)\n", 1438 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 1439 | "\n", 1440 | "for i in tqdm(range(10)):\n", 1441 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_r.npy')\n", 1442 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_r.npy')\n", 1443 | " \n", 1444 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_r.npy'))\n", 1445 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_r.npy'))\n", 1446 | "\n", 1447 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1448 | " val_ds = TextDataset(val_clas, val_labels)\n", 1449 | " \n", 1450 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1451 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1452 | " \n", 1453 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1454 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1455 | " \n", 1456 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 1457 | "\n", 1458 | " m = get_rnn_regression(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1459 | " layers=[em_sz, 50, c], drops=[dps[4], 0.1],\n", 1460 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1461 | "\n", 1462 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1463 | "\n", 1464 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1465 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1466 | " learn.clip=25.\n", 1467 | " \n", 1468 | " learn.crit = F.mse_loss\n", 1469 | "\n", 1470 | "# wd = 0\n", 1471 | "# learn.load_encoder('lm_enc1')\n", 1472 | "# learn.freeze_to(-1)\n", 1473 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1474 | "# learn.save(f'fold_{i}_r')\n", 1475 | "\n", 1476 | "# learn.freeze_to(-2)\n", 1477 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1478 | "# learn.save(f'fold_{i}_r')\n", 1479 | " \n", 1480 | "# wd = 1e-7\n", 1481 | "# learn.unfreeze()\n", 1482 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1483 | "# learn.save(f'fold_{i}_r')\n", 1484 | " \n", 1485 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1486 | "# learn.save(f'fold_{i}_r')\n", 1487 | " \n", 1488 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1489 | "# learn.save(f'fold_{i}_r')\n", 1490 | " \n", 1491 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1492 | "# learn.save(f'fold_{i}_r')\n", 1493 | " learn.load(f'./10_folds_fwd_mse_86488/fold_{i}_r')\n", 1494 | " \n", 1495 | " wd = 1e-7\n", 1496 | " learn.unfreeze()\n", 1497 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1498 | " learn.save(path+f'fold_{i}_r')\n", 1499 | " \n", 1500 | " preds = learn.predict(is_test=True)\n", 1501 | " \n", 1502 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1503 | " df[\"prob\"] = [c[0] for c in preds]\n", 1504 | " df.to_csv(f\"./blend1/fold_{i}_r.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 1505 | ] 1506 | } 1507 | ], 1508 | "metadata": { 1509 | 
"_draft": { 1510 | "nbviewer_url": "https://gist.github.com/0dd0df21cf404cf2bb51d0148c8b7d8b" 1511 | }, 1512 | "gist": { 1513 | "data": { 1514 | "description": "fastai.text imdb example", 1515 | "public": true 1516 | }, 1517 | "id": "0dd0df21cf404cf2bb51d0148c8b7d8b" 1518 | }, 1519 | "kernelspec": { 1520 | "display_name": "Python [default]", 1521 | "language": "python", 1522 | "name": "python3" 1523 | }, 1524 | "language_info": { 1525 | "codemirror_mode": { 1526 | "name": "ipython", 1527 | "version": 3 1528 | }, 1529 | "file_extension": ".py", 1530 | "mimetype": "text/x-python", 1531 | "name": "python", 1532 | "nbconvert_exporter": "python", 1533 | "pygments_lexer": "ipython3", 1534 | "version": "3.6.4" 1535 | }, 1536 | "toc": { 1537 | "colors": { 1538 | "hover_highlight": "#DAA520", 1539 | "navigate_num": "#000000", 1540 | "navigate_text": "#333333", 1541 | "running_highlight": "#FF0000", 1542 | "selected_highlight": "#FFD700", 1543 | "sidebar_border": "#EEEEEE", 1544 | "wrapper_background": "#FFFFFF" 1545 | }, 1546 | "moveMenuLeft": true, 1547 | "nav_menu": { 1548 | "height": "86px", 1549 | "width": "252px" 1550 | }, 1551 | "navigate_menu": true, 1552 | "number_sections": true, 1553 | "sideBar": true, 1554 | "threshold": 4, 1555 | "toc_cell": false, 1556 | "toc_section_display": "block", 1557 | "toc_window_display": false, 1558 | "widenNotebook": false 1559 | } 1560 | }, 1561 | "nbformat": 4, 1562 | "nbformat_minor": 2 1563 | } 1564 | --------------------------------------------------------------------------------