├── README.md
├── blend.py
├── fastai
│   ├── text.py
│   └── lm_rnn.py
├── alica_bkw.ipynb
└── alica_fwd.ipynb

/README.md:
--------------------------------------------------------------------------------
# Yandex Algorithm 2018 ML track

[Final results: 2nd place](https://contest.yandex.ru/algorithm2018/contest/7914/standings/?lang=en)

To reproduce the second-place solution, run the `blend.py` script. It blends the predictions of 100 models.

Language-model and classifier training, as well as prediction, is in `alica_fwd.ipynb`. Backwards-model training and prediction (a short version based on the forward notebook) is in `alica_bkw.ipynb`.

To run the solution, install the [fast.ai library](https://github.com/fastai/fastai), then replace two files (`lm_rnn.py` and `text.py`) in its `fastai` folder with the versions from this repository.
--------------------------------------------------------------------------------
/blend.py:
--------------------------------------------------------------------------------
# The blend_all folder is available at https://drive.google.com/open?id=1dzOJX4eBqFekAh5ZQPv31jq3qfyKwO0h

import pandas as pd
import numpy as np
from tqdm import tqdm
from os import listdir
from collections import defaultdict
import operator

path = "./blend_all/"
files = listdir(path)

# d[dialogue_id][candidate_id] accumulates each candidate answer's blended score
d = defaultdict(lambda: defaultdict(lambda: 0))

for f in tqdm(files):
    df = pd.read_csv(path+f, sep="\t", header=None)
    if len(df.columns) == 3:
        for i, j, k in zip(df[0], df[1], df[2]):
            if f.endswith("_r.tsv"):
                # regression-model predictions get a smaller weight
                d[i][j] += k/len(files)/2.22
            else:
                d[i][j] += k/len(files)
    else:
        # 4-column files come from the 3-class models: combine both probability columns
        for i, j, k, l in zip(df[0], df[1], df[2], df[3]):
            d[i][j] += (k/1.5+l)/len(files)*2.5

# for every dialogue id, rank the candidate answers by blended score
res = []
for i in sorted(list(set(df[0]))):
    for j in [c[0] for c in sorted(d[i].items(), key=operator.itemgetter(1), reverse=True)]:
        res.append(j)

df[1] = res
df.to_csv("blend_all.csv.gz", sep="\t", header=False, index=False, columns=[0,1], compression='gzip')

# 87302 LB (leaderboard score of this blend)
--------------------------------------------------------------------------------
/fastai/text.py:
--------------------------------------------------------------------------------
from .core import *
from .learner import *
from .lm_rnn import *
from torch.utils.data.sampler import Sampler
import spacy
from spacy.symbols import ORTH

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

def texts_labels_from_folders(path, folders):
    texts,labels = [],[]
    for idx,label in enumerate(folders):
        for fname in glob(os.path.join(path, label, '*.*')):
            texts.append(open(fname, 'r').read())
            labels.append(idx)
    return texts, np.array(labels).astype(np.int64)

def numericalize_tok(tokens, max_vocab=50000, min_freq=0, unk_tok="_unk_", pad_tok="_pad_", bos_tok="_bos_", eos_tok="_eos_"):
    """Takes in text tokens and returns int2tok and tok2int converters

    Arguments:
        tokens(list): List of tokens. Can be a list of strings, or a list of lists of strings.
        max_vocab(int): Number of tokens to return in the vocab (sorted by frequency)
        min_freq(int): Minimum number of occurrences a token must have in order to be preserved.
        unk_tok(str): Token to use when unknown tokens are encountered in the source text.
        pad_tok(str): Token to use when padding sequences.
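        bos_tok(str): Token prepended to mark the beginning of a sequence.
        eos_tok(str): Token appended to mark the end of a sequence.

    Example:
        An illustrative sketch only (the token lists are made up):

            int2tok, tok2int = numericalize_tok([["hello", "world"], ["hello", "again"]])
            int2tok[:4]            # ['_bos_', '_pad_', '_eos_', '_unk_'], then tokens by frequency
            tok2int["hello"]       # -> 4 (first slot after the four special tokens)
            tok2int["never_seen"]  # -> 3 (unknown tokens map to the unk id)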
    """
    if isinstance(tokens, str):
        raise ValueError("Expected to receive a list of tokens. Received a string instead")
    if isinstance(tokens[0], list):
        tokens = [p for o in tokens for p in o]
    freq = Counter(tokens)
    int2tok = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
    unk_id = 3
    int2tok.insert(0, bos_tok)
    int2tok.insert(1, pad_tok)
    int2tok.insert(2, eos_tok)
    int2tok.insert(unk_id, unk_tok)
    tok2int = collections.defaultdict(lambda:unk_id, {v:k for k,v in enumerate(int2tok)})
    return int2tok, tok2int

class Tokenizer():
    def __init__(self, lang='en'):
        self.re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
        self.tok = spacy.load(lang)
        for w in ('<eos>','<bos>','<unk>'):
            self.tok.tokenizer.add_special_case(w, [{ORTH: w}])

    def sub_br(self,x): return self.re_br.sub("\n", x)

    def spacy_tok(self,x):
        return [t.text for t in self.tok.tokenizer(self.sub_br(x))]

    re_rep = re.compile(r'(\S)(\1{3,})')
    re_word_rep = re.compile(r'(\b\w+\W+)(\1{3,})')

    @staticmethod
    def replace_rep(m):
        TK_REP = 'tk_rep'
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '

    @staticmethod
    def replace_wrep(m):
        TK_WREP = 'tk_wrep'
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '

    @staticmethod
    def do_caps(ss):
        TOK_UP,TOK_SENT,TOK_MIX = ' t_up ',' t_st ',' t_mx '
        res = []
        prev='.'
        re_word = re.compile('\w')
        re_nonsp = re.compile('\S')
        for s in re.findall(r'\w+|\W+', ss):
            res += ([TOK_UP,s.lower()] if (s.isupper() and (len(s)>2))
                    # else [TOK_SENT,s.lower()] if (s.istitle() and re_word.search(prev))
                    else [s.lower()])
            # if re_nonsp.search(s): prev = s
        return ''.join(res)

    def proc_text(self, s):
        s = self.re_rep.sub(Tokenizer.replace_rep, s)
        s = self.re_word_rep.sub(Tokenizer.replace_wrep, s)
        s = Tokenizer.do_caps(s)
        s = re.sub(r'([/#])', r' \1 ', s)
        s = re.sub(' {2,}', ' ', s)
        return self.spacy_tok(s)

    @staticmethod
    def proc_all(ss, lang):
        tok = Tokenizer(lang)
        return [tok.proc_text(s) for s in ss]

    @staticmethod
    def proc_all_mp(ss, lang='en'):
        ncpus = num_cpus()//2
        with ProcessPoolExecutor(ncpus) as e:
            return sum(e.map(Tokenizer.proc_all, ss, [lang]*len(ss)), [])


class TextDataset(Dataset):
    def __init__(self, x, y, backwards=False, sos=None, eos=None):
        self.x,self.y,self.backwards,self.sos,self.eos = x,y,backwards,sos,eos

    def __getitem__(self, idx):
        x = self.x[idx]
        if self.backwards: x = list(reversed(x))
        if self.eos is not None: x = x + [self.eos]
        if self.sos is not None: x = [self.sos]+x
        return np.array(x),self.y[idx]

    def __len__(self): return len(self.x)


class SortSampler(Sampler):
    def __init__(self, data_source, key): self.data_source,self.key = data_source,key
    def __len__(self): return len(self.data_source)
    def __iter__(self):
        return iter(sorted(range(len(self.data_source)), key=self.key))

class SimpleSampler(Sampler):
    def __init__(self, data_source): self.data_source = data_source
    def __len__(self): return len(self.data_source)
    def __iter__(self):
        return iter(range(len(self.data_source)))

class SortishSampler(Sampler):
    """Returns an iterator that traverses the data in randomly
    ordered batches that are approximately the same size.
    The batch with the largest key is always returned first, because of the way pytorch sequences cuda memory
    allocation: if the largest batch were not returned first, multiple buffers might be allocated when the first
    one created isn't large enough to hold the next batch in the sequence.
    """
    def __init__(self, data_source, key, bs):
        self.data_source,self.key,self.bs = data_source,key,bs

    def __len__(self): return len(self.data_source)

    def __iter__(self):
        idxs = np.random.permutation(len(self.data_source))
        sz = self.bs*50
        ck_idx = [idxs[i:i+sz] for i in range(0, len(idxs), sz)]
        sort_idx = sum([sorted(s, key=self.key, reverse=True) for s in ck_idx], [])
        sz = self.bs
        ck_idx = [sort_idx[i:i+sz] for i in range(0, len(sort_idx), sz)]
        sort_idx = np.concatenate(np.random.permutation(ck_idx[1:]))
        sort_idx = np.concatenate((ck_idx[0], sort_idx))
        return iter(sort_idx)


class LanguageModelLoader():
    """ Returns a language model iterator that iterates through batches whose sequence length is drawn from N(bptt, 5).
    The first batch returned always has the maximum possible width, bptt+25. This is done because of the way that
    pytorch allocates cuda memory: it prevents multiple buffers from being created as the batch width grows.
    """
    def __init__(self, nums, bs, bptt, backwards=False):
        self.bs,self.bptt,self.backwards = bs,bptt,backwards
        self.data = self.batchify(nums)
        self.i,self.iter = 0,0
        self.n = len(self.data)

    def __iter__(self):
        self.i,self.iter = 0,0
        while self.i < self.n-1 and self.iter<len(self):
            ...

--------------------------------------------------------------------------------
/fastai/lm_rnn.py:
--------------------------------------------------------------------------------
def seq2seq_reg(output, xtra, loss, alpha=0, beta=0):
    hs,dropped_hs = xtra
    if alpha:  # Activation Regularization
        loss = loss + sum(alpha * dropped_hs[-1].pow(2).mean())
    if beta:   # Temporal Activation Regularization (slowness)
        h = hs[-1]
        if len(h)>1: loss = loss + sum(beta * (h[1:] - h[:-1]).pow(2).mean())
    return loss


def repackage_var(h):
    """Wraps h in new Variables, to detach them from their history."""
    return Variable(h.data) if type(h) == Variable else tuple(repackage_var(v) for v in h)


class RNN_Encoder(nn.Module):

    """A custom RNN encoder network that uses
        - an embedding matrix to encode input,
        - a stack of LSTM layers to drive the network, and
        - variational dropouts in the embedding and LSTM layers

        The architecture for this network was inspired by the work done in
        "Regularizing and Optimizing LSTM Language Models".
        (https://arxiv.org/pdf/1708.02182.pdf)
    """

    initrange=0.1

    def __init__(self, ntoken, emb_sz, nhid, nlayers, pad_token, bidir=False,
                 dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5):
        """ Default constructor for the RNN_Encoder class

        Args:
            ntoken (int): number of tokens in the vocabulary of the source dataset
            emb_sz (int): the embedding size to use to encode each token
            nhid (int): number of hidden activations per LSTM layer
            nlayers (int): number of LSTM layers to use in the architecture
            pad_token (int): the int value used for padding text.
            bidir (bool): whether to make the LSTM layers bidirectional.
            dropouth (float): dropout to apply to the activations going from one LSTM layer to another
            dropouti (float): dropout to apply to the input layer.
            dropoute (float): dropout to apply to the embedding layer.
            wdrop (float): dropout used for an LSTM's internal (or hidden) recurrent weights.
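        Example:
            An illustrative sketch only (the vocabulary size is made up; the other sizes mirror
            the AWD-LSTM hyperparameters used in the notebooks, em_sz=400, nh=1150, nl=3):

                encoder = RNN_Encoder(ntoken=60000, emb_sz=400, nhid=1150, nlayers=3, pad_token=1)
                encoder.reset()                        # allocate zeroed hidden state for the current batch size
                raw_outputs, outputs = encoder(batch)  # batch: LongTensor of shape (seq_len, batch_size)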
52 | 53 | Returns: 54 | None 55 | """ 56 | 57 | super().__init__() 58 | self.ndir = 2 if bidir else 1 59 | self.bs = 1 60 | self.encoder = nn.Embedding(ntoken, emb_sz, padding_idx=pad_token) 61 | self.encoder_with_dropout = EmbeddingDropout(self.encoder) 62 | self.rnns = [nn.LSTM(emb_sz if l == 0 else nhid, (nhid if l != nlayers - 1 else emb_sz)//self.ndir, 63 | 1, bidirectional=bidir, dropout=dropouth) for l in range(nlayers)] 64 | if wdrop: self.rnns = [WeightDrop(rnn, wdrop) for rnn in self.rnns] 65 | self.rnns = torch.nn.ModuleList(self.rnns) 66 | self.encoder.weight.data.uniform_(-self.initrange, self.initrange) 67 | 68 | self.emb_sz,self.nhid,self.nlayers,self.dropoute = emb_sz,nhid,nlayers,dropoute 69 | self.dropouti = LockedDropout(dropouti) 70 | self.dropouths = nn.ModuleList([LockedDropout(dropouth) for l in range(nlayers)]) 71 | 72 | def forward(self, input): 73 | """ Invoked during the forward propagation of the RNN_Encoder module. 74 | Args: 75 | input (Tensor): input of shape (sentence length x batch_size) 76 | 77 | Returns: 78 | raw_outputs (tuple(list (Tensor), list(Tensor)): list of tensors evaluated from each RNN layer without using 79 | dropouth, list of tensors evaluated from each RNN layer using dropouth, 80 | """ 81 | sl,bs = input.size() 82 | if bs!=self.bs: 83 | self.bs=bs 84 | self.reset() 85 | 86 | emb = self.encoder_with_dropout(input, dropout=self.dropoute if self.training else 0) 87 | emb = self.dropouti(emb) 88 | 89 | raw_output = emb 90 | new_hidden,raw_outputs,outputs = [],[],[] 91 | for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)): 92 | current_input = raw_output 93 | with warnings.catch_warnings(): 94 | warnings.simplefilter("ignore") 95 | raw_output, new_h = rnn(raw_output, self.hidden[l]) 96 | new_hidden.append(new_h) 97 | raw_outputs.append(raw_output) 98 | if l != self.nlayers - 1: raw_output = drop(raw_output) 99 | outputs.append(raw_output) 100 | 101 | self.hidden = repackage_var(new_hidden) 102 | return raw_outputs, outputs 103 | 104 | def one_hidden(self, l): 105 | nh = (self.nhid if l != self.nlayers - 1 else self.emb_sz)//self.ndir 106 | return Variable(self.weights.new(self.ndir, self.bs, nh).zero_(), volatile=not self.training) 107 | 108 | def reset(self): 109 | self.weights = next(self.parameters()).data 110 | self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.nlayers)] 111 | 112 | 113 | class MultiBatchRNN(RNN_Encoder): 114 | def __init__(self, bptt, max_seq, *args, **kwargs): 115 | self.max_seq,self.bptt = max_seq,bptt 116 | super().__init__(*args, **kwargs) 117 | 118 | def concat(self, arrs): 119 | return [torch.cat([l[si] for l in arrs]) for si in range(len(arrs[0]))] 120 | 121 | def forward(self, input): 122 | sl,bs = input.size() 123 | for l in self.hidden: 124 | for h in l: h.data.zero_() 125 | raw_outputs, outputs = [],[] 126 | for i in range(0, sl, self.bptt): 127 | r, o = super().forward(input[i: min(i+self.bptt, sl)]) 128 | if i>(sl-self.max_seq): 129 | raw_outputs.append(r) 130 | outputs.append(o) 131 | return self.concat(raw_outputs), self.concat(outputs) 132 | 133 | class LinearDecoder(nn.Module): 134 | initrange=0.1 135 | def __init__(self, n_out, nhid, dropout, tie_encoder=None): 136 | super().__init__() 137 | self.decoder = nn.Linear(nhid, n_out, bias=False) 138 | self.decoder.weight.data.uniform_(-self.initrange, self.initrange) 139 | self.dropout = LockedDropout(dropout) 140 | if tie_encoder: self.decoder.weight = tie_encoder.weight 141 | 142 | def forward(self, input): 143 | 
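        # Note (comments added for this write-up, not part of the upstream fastai source):
        # `input` is the (raw_outputs, outputs) pair returned by the encoder; outputs[-1] is
        # the top LSTM layer's output of shape (seq_len, batch_size, emb_sz), which is flattened
        # to (seq_len*batch_size, emb_sz) before the tied linear decoder projects it to vocabulary scores.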
raw_outputs, outputs = input 144 | output = self.dropout(outputs[-1]) 145 | decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2))) 146 | result = decoded.view(-1, decoded.size(1)) 147 | return result, raw_outputs, outputs 148 | 149 | 150 | class LinearBlock(nn.Module): 151 | def __init__(self, ni, nf, drop): 152 | super().__init__() 153 | self.lin = nn.Linear(ni, nf) 154 | self.drop = nn.Dropout(drop) 155 | self.bn = nn.BatchNorm1d(ni) 156 | 157 | def forward(self, x): return self.lin(self.drop(self.bn(x))) 158 | 159 | 160 | class PoolingLinearClassifier(nn.Module): 161 | def __init__(self, layers, drops): 162 | super().__init__() 163 | self.layers = nn.ModuleList([ 164 | LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)]) 165 | 166 | def pool(self, x, bs, is_max): 167 | f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d 168 | return f(x.permute(1,2,0), (1,)).view(bs,-1) 169 | 170 | def forward(self, input): 171 | raw_outputs, outputs = input 172 | output = outputs[-1] 173 | sl,bs,_ = output.size() 174 | avgpool = self.pool(output, bs, False) 175 | mxpool = self.pool(output, bs, True) 176 | x = torch.cat([output[-1], mxpool, avgpool], 1) 177 | for l in self.layers: 178 | l_x = l(x) 179 | x = F.relu(l_x) 180 | l_x = F.log_softmax(l_x) 181 | return l_x, raw_outputs, outputs 182 | 183 | class LinearRegression(nn.Module): 184 | def __init__(self, layers, drops): 185 | super().__init__() 186 | self.layers = nn.ModuleList([ 187 | LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)]) 188 | 189 | def forward(self, input): 190 | raw_outputs, outputs = input 191 | x = outputs[-1][-1] 192 | for l in self.layers: 193 | l_x = l(x) 194 | x = F.relu(l_x) 195 | #l_x = F.l1_loss(x) 196 | return l_x, raw_outputs, outputs 197 | 198 | class SequentialRNN(nn.Sequential): 199 | def reset(self): 200 | for c in self.children(): 201 | if hasattr(c, 'reset'): c.reset() 202 | 203 | 204 | def get_language_model(n_tok, emb_sz, nhid, nlayers, pad_token, 205 | dropout=0.4, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, tie_weights=True): 206 | """Returns a SequentialRNN model. 207 | 208 | A RNN_Encoder layer is instantiated using the parameters provided. 209 | 210 | This is followed by the creation of a LinearDecoder layer. 211 | 212 | Also by default (i.e. tie_weights = True), the embedding matrix used in the RNN_Encoder 213 | is used to instantiate the weights for the LinearDecoder layer. 214 | 215 | The SequentialRNN layer is the native torch's Sequential wrapper that puts the RNN_Encoder and 216 | LinearDecoder layers sequentially in the model. 217 | 218 | Args: 219 | n_tok (int): number of unique vocabulary words (or tokens) in the source dataset 220 | emb_sz (int): the embedding size to use to encode each token 221 | nhid (int): number of hidden activation per LSTM layer 222 | nlayers (int): number of LSTM layers to use in the architecture 223 | pad_token (int): the int value used for padding text. 224 | dropouth (float): dropout to apply to the activations going from one LSTM layer to another 225 | dropouti (float): dropout to apply to the input layer. 226 | dropoute (float): dropout to apply to the embedding layer. 227 | wdrop (float): dropout used for a LSTM's internal (or hidden) recurrent weights. 228 | tie_weights (bool): decide if the weights of the embedding matrix in the RNN encoder should be tied to the 229 | weights of the LinearDecoder layer. 
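    Example:
        An illustrative sketch only (the vocabulary size is made up; the other values match
        the language-model hyperparameters used in the notebooks):

            model = get_language_model(n_tok=60000, emb_sz=400, nhid=1150, nlayers=3, pad_token=1)
            model.reset()                                 # zero the encoder's hidden state
            decoded, raw_outputs, outputs = model(batch)  # decoded: (seq_len*batch_size, n_tok) scores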
230 | Returns: 231 | A SequentialRNN model 232 | """ 233 | 234 | rnn_enc = RNN_Encoder(n_tok, emb_sz, nhid=nhid, nlayers=nlayers, pad_token=pad_token, 235 | dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop) 236 | enc = rnn_enc.encoder if tie_weights else None 237 | return SequentialRNN(rnn_enc, LinearDecoder(n_tok, emb_sz, dropout, tie_encoder=enc)) 238 | 239 | 240 | def get_rnn_classifer(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False, 241 | dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5): 242 | rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir, 243 | dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop) 244 | return SequentialRNN(rnn_enc, PoolingLinearClassifier(layers, drops)) 245 | 246 | def get_rnn_regression(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False, 247 | dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5): 248 | rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir, 249 | dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop) 250 | return SequentialRNN(rnn_enc, LinearRegression(layers, drops)) 251 | 252 | -------------------------------------------------------------------------------- /alica_bkw.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Alica" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Put these at the top of every notebook, to get automatic reloading and inline plotting\n", 17 | "%reload_ext autoreload\n", 18 | "%autoreload 2\n", 19 | "%matplotlib inline" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from fastai.text import *\n", 29 | "import html" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "BOS = 'xbos' # beginning-of-sentence tag\n", 39 | "FLD = 'xfld' # data field tag\n", 40 | "\n", 41 | "PATH=Path('data/alica/')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Standardize format" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "CLAS_PATH=Path('data/alica_clas/')\n", 58 | "CLAS_PATH.mkdir(exist_ok=True)\n", 59 | "\n", 60 | "LM_PATH=Path('data/alica_lm/')\n", 61 | "LM_PATH.mkdir(exist_ok=True)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "CLASSES = ['good', 'neutral', 'bad']\n", 71 | "#d = {\"good\":2,\"neutral\":1,\"bad\":0}\n", 72 | "d = {\"good\":1,\"bad\":0}\n", 73 | "col_names = ['labels','text']" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Language model tokens" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "chunksize=96000" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "re1 = re.compile(r' +')\n", 99 | "\n", 100 | "def 
fixup(x):\n", 101 | " x = x.replace(u'\\xa0', u' ')\n", 102 | " x = x.replace('|', '\\n')\n", 103 | " x = x.replace('\\n\\n\\n', '\\n')\n", 104 | " x = x.replace('\\n\\n', '\\n')\n", 105 | " x = x.replace(\". . .\",\"...\")\n", 106 | " x = \" , \".join(x.split(\",\"))\n", 107 | " x = \" . \".join(x.split(\".\"))\n", 108 | " x = x.replace(\". . .\",\"...\")\n", 109 | " \n", 110 | " return re1.sub(' ', html.unescape(x))" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 9, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "def get_texts(df):\n", 120 | " labels = df[0].values.astype(np.int64)\n", 121 | " texts = '\\n' + df[1].astype(str)\n", 122 | " texts = texts.apply(fixup).values.astype(str)\n", 123 | "\n", 124 | " tok = Tokenizer().proc_all_mp(partition_by_cores(texts), 'xx')\n", 125 | " return tok, list(labels)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def get_all(df):\n", 135 | " tok, labels = [], []\n", 136 | " for i, r in tqdm(enumerate(df)):\n", 137 | " tok_, labels_ = get_texts(r)\n", 138 | " tok += tok_;\n", 139 | " labels += labels_\n", 140 | " return tok, labels" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "tmp_lm = []\n", 150 | "trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')\n", 151 | "\n", 152 | "for x in tqdm(trn_lm):\n", 153 | " tmp_lm.append(x[::-1])\n", 154 | " \n", 155 | "np.save(LM_PATH/'tmp'/'trn_ids_bkw.npy', np.array(tmp_lm))" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "tmp_lm = []\n", 165 | "val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')\n", 166 | "\n", 167 | "for x in tqdm(val_lm):\n", 168 | " tmp_lm.append(x[::-1])\n", 169 | " \n", 170 | "np.save(LM_PATH/'tmp'/'val_ids_bkw.npy', np.array(tmp_lm))" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "tmp_lm = []\n", 180 | "trn_lm = np.load(CLAS_PATH/'tmp'/'trn_ids.npy')\n", 181 | "\n", 182 | "for x in tqdm(trn_lm):\n", 183 | " tmp_lm.append(x[::-1])\n", 184 | " \n", 185 | "np.save(CLAS_PATH/'tmp'/'trn_ids_bkw.npy', np.array(tmp_lm))" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "tmp_lm = []\n", 195 | "val_lm = np.load(CLAS_PATH/'tmp'/'val_ids.npy')\n", 196 | "\n", 197 | "for x in tqdm(val_lm):\n", 198 | " tmp_lm.append(x[::-1])\n", 199 | " \n", 200 | "np.save(CLAS_PATH/'tmp'/'val_ids_bkw.npy', np.array(tmp_lm))" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "tmp_lm = []\n", 210 | "tst_lm = np.load(CLAS_PATH/'tmp'/'tst_ids.npy')\n", 211 | "\n", 212 | "for x in tqdm(tst_lm):\n", 213 | " tmp_lm.append(x[::-1])\n", 214 | " \n", 215 | "np.save(CLAS_PATH/'tmp'/'tst_ids_bkw.npy', np.array(tmp_lm))" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "tmp_lm = []\n", 225 | "tst_lm = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 226 | "\n", 227 | "for x in tqdm(tst_lm):\n", 228 | " tmp_lm.append(x[::-1])\n", 229 | " \n", 230 | "np.save(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy', 
np.array(tmp_lm))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "for i in range(10):\n", 240 | " tmp_lm = []\n", 241 | " trn_lm = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}.npy')\n", 242 | "\n", 243 | " for x in tqdm(trn_lm):\n", 244 | " tmp_lm.append(x[::-1])\n", 245 | "\n", 246 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw.npy', np.array(tmp_lm))" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "for i in range(10):\n", 256 | " tmp_lm = []\n", 257 | " val_lm = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}.npy')\n", 258 | "\n", 259 | " for x in tqdm(val_lm):\n", 260 | " tmp_lm.append(x[::-1])\n", 261 | "\n", 262 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw.npy', np.array(tmp_lm))" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "for i in range(10):\n", 272 | " tmp_lm = []\n", 273 | " trn_lm = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_.npy')\n", 274 | "\n", 275 | " for x in tqdm(trn_lm):\n", 276 | " tmp_lm.append(x[::-1])\n", 277 | "\n", 278 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_.npy', np.array(tmp_lm))" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "for i in range(10):\n", 288 | " tmp_lm = []\n", 289 | " val_lm = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_.npy')\n", 290 | "\n", 291 | " for x in tqdm(val_lm):\n", 292 | " tmp_lm.append(x[::-1])\n", 293 | "\n", 294 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_.npy', np.array(tmp_lm))" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "for i in range(10):\n", 304 | " tmp_lm = []\n", 305 | " trn_lm = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_n.npy')\n", 306 | "\n", 307 | " for x in tqdm(trn_lm):\n", 308 | " tmp_lm.append(x[::-1])\n", 309 | "\n", 310 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_n.npy', np.array(tmp_lm))\n", 311 | "\n", 312 | "for i in range(10):\n", 313 | " tmp_lm = []\n", 314 | " val_lm = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_n.npy')\n", 315 | "\n", 316 | " for x in tqdm(val_lm):\n", 317 | " tmp_lm.append(x[::-1])\n", 318 | "\n", 319 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_n.npy', np.array(tmp_lm))" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "for i in range(10):\n", 329 | " tmp_lm = []\n", 330 | " trn_lm = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_r.npy')\n", 331 | "\n", 332 | " for x in tqdm(trn_lm):\n", 333 | " tmp_lm.append(x[::-1])\n", 334 | "\n", 335 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_r.npy', np.array(tmp_lm))\n", 336 | "\n", 337 | "for i in range(10):\n", 338 | " tmp_lm = []\n", 339 | " val_lm = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_r.npy')\n", 340 | "\n", 341 | " for x in tqdm(val_lm):\n", 342 | " tmp_lm.append(x[::-1])\n", 343 | "\n", 344 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_r.npy', np.array(tmp_lm))" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": {}, 350 | "source": [ 351 | "## Language model" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "trn_lm = 
np.load(LM_PATH/'tmp'/'trn_ids_bkw.npy')\n", 361 | "val_lm = np.load(LM_PATH/'tmp'/'val_ids_bkw.npy')\n", 362 | "itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))\n", 363 | "\n", 364 | "vs=len(itos)\n", 365 | "vs,len(trn_lm)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "em_sz,nh,nl = 400,1150,3\n", 375 | "\n", 376 | "wd=1e-7\n", 377 | "bptt=70\n", 378 | "bs=52\n", 379 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)\n", 389 | "val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)\n", 390 | "md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "learner= md.get_model(opt_fn, em_sz, nh, nl, \n", 409 | " dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])\n", 410 | "\n", 411 | "learner.metrics = [accuracy]\n", 412 | "learner.unfreeze()" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": null, 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "lr=1e-3\n", 422 | "lrs = lr" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "learner.save('lm_last_ft_bkw')" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "learner.load('lm_last_ft_bkw')" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "learner.unfreeze()" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "learner.save('lm0_bkw')" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "learner.save_encoder('lm_enc0_bkw')" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": null, 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "learner.load('lm0_bkw')" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": null, 500 | "metadata": { 501 | "scrolled": false 502 | }, 503 | "outputs": [], 504 | "source": [ 505 | "learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=10)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": 
[ 514 | "# 3.955312 3.864959 0.374353 \n", 515 | "learner.save('lm1_bkw')" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": {}, 522 | "outputs": [], 523 | "source": [ 524 | "learner.save_encoder('lm_enc1_bkw')" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "learner.sched.plot_loss()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "## Predict" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "path = \"/mnt/6676114C76111E7D/Kaggle/Alica/\"" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "#folds neutral=bad\n", 559 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 560 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 561 | "\n", 562 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 563 | "vs = len(itos)\n", 564 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 565 | "bs = 48\n", 566 | "c = 2\n", 567 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 568 | "\n", 569 | "lr=1e-3\n", 570 | "lrm = 2.6\n", 571 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 572 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 573 | "\n", 574 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 575 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 576 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 577 | "tst_samp = SimpleSampler(tst_clas)\n", 578 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 579 | "\n", 580 | "for i in tqdm(range(10)):\n", 581 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw.npy')\n", 582 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw.npy')\n", 583 | " \n", 584 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}.npy'))\n", 585 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}.npy'))\n", 586 | "\n", 587 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 588 | " val_ds = TextDataset(val_clas, val_labels)\n", 589 | " \n", 590 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 591 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 592 | " \n", 593 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 594 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 595 | " \n", 596 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 597 | "\n", 598 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 599 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 600 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 601 | "\n", 602 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 603 | "\n", 604 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 605 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 606 | " learn.clip=25.\n", 607 | " learn.metrics = [accuracy]\n", 608 | "\n", 609 | "# wd = 0\n", 610 | "# learn.load_encoder('lm_enc1_bkw')\n", 611 | "# learn.freeze_to(-1)\n", 612 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 613 | "# 
learn.save(f'fold_{i}_bkw')\n", 614 | "\n", 615 | "# learn.freeze_to(-2)\n", 616 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 617 | "# learn.save(f'fold_{i}_bkw')\n", 618 | " \n", 619 | "# wd = 1e-7\n", 620 | "# learn.unfreeze()\n", 621 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 622 | "# learn.save(f'fold_{i}_bkw')\n", 623 | " \n", 624 | "# learn.fit(lrs, 2, wds=wd, cycle_len=1)\n", 625 | "# learn.save(f'fold_{i}_bkw')\n", 626 | " learn.load(f'./10_folds_bkw_bad_85921/fold_{i}_bkw')\n", 627 | " \n", 628 | " wd = 1e-7\n", 629 | " learn.unfreeze()\n", 630 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 631 | " learn.save(path+f'fold_{i}_bkw')\n", 632 | " \n", 633 | " preds = learn.predict(is_test=True)\n", 634 | " \n", 635 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 636 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 637 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 638 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [ 647 | "#folds neutral=good\n", 648 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 649 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 650 | "\n", 651 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 652 | "vs = len(itos)\n", 653 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 654 | "bs = 48\n", 655 | "c = 2\n", 656 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 657 | "\n", 658 | "lr=1e-3\n", 659 | "lrm = 2.6\n", 660 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 661 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 662 | "\n", 663 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 664 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 665 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 666 | "tst_samp = SimpleSampler(tst_clas)\n", 667 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 668 | "\n", 669 | "for i in tqdm(range(10)):\n", 670 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_.npy')\n", 671 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_.npy')\n", 672 | " \n", 673 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_.npy'))\n", 674 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_.npy'))\n", 675 | "\n", 676 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 677 | " val_ds = TextDataset(val_clas, val_labels)\n", 678 | " \n", 679 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 680 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 681 | " \n", 682 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 683 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 684 | " \n", 685 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 686 | "\n", 687 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 688 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 689 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 690 | "\n", 691 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 692 | 
"\n", 693 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 694 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 695 | " learn.clip=25.\n", 696 | " learn.metrics = [accuracy]\n", 697 | "\n", 698 | "# wd = 0\n", 699 | "# learn.load_encoder('lm_enc1_bkw')\n", 700 | "# learn.freeze_to(-1)\n", 701 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 702 | "# learn.save(f'fold_{i}_bkw_')\n", 703 | "\n", 704 | "# learn.freeze_to(-2)\n", 705 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 706 | "# learn.save(f'fold_{i}_bkw_')\n", 707 | " \n", 708 | "# wd = 1e-7\n", 709 | "# learn.unfreeze()\n", 710 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 711 | "# learn.save(f'fold_{i}_bkw_')\n", 712 | " \n", 713 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 714 | "# learn.save(f'fold_{i}_bkw_')\n", 715 | " learn.load(f'./10_folds_bkw_good_85650/fold_{i}_bkw_')\n", 716 | " \n", 717 | " wd = 1e-7\n", 718 | " learn.unfreeze()\n", 719 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 720 | " learn.save(path+f'fold_{i}_bkw_')\n", 721 | " \n", 722 | " preds = learn.predict(is_test=True)\n", 723 | " \n", 724 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 725 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 726 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 727 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw_.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": {}, 734 | "outputs": [], 735 | "source": [ 736 | "#folds 3\n", 737 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 738 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 739 | "\n", 740 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 741 | "vs = len(itos)\n", 742 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 743 | "bs = 48\n", 744 | "c = 3\n", 745 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 746 | "\n", 747 | "lr=1e-3\n", 748 | "lrm = 2.6\n", 749 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 750 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 751 | "\n", 752 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 753 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 754 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 755 | "tst_samp = SimpleSampler(tst_clas)\n", 756 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 757 | "\n", 758 | "for i in tqdm(range(10)):\n", 759 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw.npy')\n", 760 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw.npy')\n", 761 | " \n", 762 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_3.npy'))\n", 763 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_3.npy'))\n", 764 | "\n", 765 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 766 | " val_ds = TextDataset(val_clas, val_labels)\n", 767 | " \n", 768 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 769 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 770 | " \n", 771 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 772 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 773 
| " \n", 774 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 775 | "\n", 776 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 777 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 778 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 779 | "\n", 780 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 781 | "\n", 782 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 783 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 784 | " learn.clip=25.\n", 785 | " learn.metrics = [accuracy]\n", 786 | "\n", 787 | "# wd = 0\n", 788 | "# learn.load_encoder('lm_enc1_bkw')\n", 789 | "# learn.freeze_to(-1)\n", 790 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 791 | "# learn.save(f'fold_{i}_bkw_3')\n", 792 | "\n", 793 | "# learn.freeze_to(-2)\n", 794 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 795 | "# learn.save(f'fold_{i}_bkw_3')\n", 796 | " \n", 797 | "# wd = 1e-7\n", 798 | "# learn.unfreeze()\n", 799 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 800 | "# learn.save(f'fold_{i}_bkw_3')\n", 801 | " \n", 802 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 803 | "# learn.save(f'fold_{i}_bkw_3')\n", 804 | " \n", 805 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 806 | "# learn.save(f'fold_{i}_bkw_3')\n", 807 | " learn.load(f'./10_folds_bkw_3_1.4_86795/fold_{i}_bkw_3_')\n", 808 | " \n", 809 | " wd = 1e-7\n", 810 | " learn.unfreeze()\n", 811 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 812 | " learn.save(path+f'fold_{i}_bkw_3')\n", 813 | " \n", 814 | " preds = learn.predict(is_test=True)\n", 815 | " \n", 816 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 817 | " df[\"prob1\"] = [c[1] for c in np.exp(preds)]\n", 818 | " df[\"prob2\"] = [c[2] for c in np.exp(preds)]\n", 819 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw_3.tsv\",columns=[2,3,\"prob1\",\"prob2\"],index=False,sep=\"\\t\",header=False)" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": null, 825 | "metadata": {}, 826 | "outputs": [], 827 | "source": [ 828 | "#folds neutral=good\n", 829 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 830 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 831 | "\n", 832 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 833 | "vs = len(itos)\n", 834 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 835 | "bs = 48\n", 836 | "c = 2\n", 837 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 838 | "\n", 839 | "lr=1e-3\n", 840 | "lrm = 2.6\n", 841 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 842 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 843 | "\n", 844 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 845 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 846 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 847 | "tst_samp = SimpleSampler(tst_clas)\n", 848 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 849 | "\n", 850 | "for i in tqdm(range(10)):\n", 851 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_n.npy')\n", 852 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_n.npy')\n", 853 | " \n", 854 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_n.npy'))\n", 855 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_n.npy'))\n", 856 | "\n", 857 
| " trn_ds = TextDataset(trn_clas, trn_labels)\n", 858 | " val_ds = TextDataset(val_clas, val_labels)\n", 859 | " \n", 860 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 861 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 862 | " \n", 863 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 864 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 865 | " \n", 866 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 867 | "\n", 868 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 869 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 870 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 871 | "\n", 872 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 873 | "\n", 874 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 875 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 876 | " learn.clip=25.\n", 877 | " learn.metrics = [accuracy]\n", 878 | "\n", 879 | "# wd = 0\n", 880 | "# learn.load_encoder('lm_enc1_bkw')\n", 881 | "# learn.freeze_to(-1)\n", 882 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 883 | "# learn.save(f'fold_{i}_bkw_n')\n", 884 | "\n", 885 | "# learn.freeze_to(-2)\n", 886 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 887 | "# learn.save(f'fold_{i}_bkw_n')\n", 888 | " \n", 889 | "# wd = 1e-7\n", 890 | "# learn.unfreeze()\n", 891 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 892 | "# learn.save(f'fold_{i}_bkw_n')\n", 893 | " \n", 894 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 895 | "# learn.save(f'fold_{i}_bkw_n')\n", 896 | " learn.load(f'./10_folds_bkw_neutral/fold_{i}_bkw_n')\n", 897 | " \n", 898 | " wd = 1e-7\n", 899 | " learn.unfreeze()\n", 900 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 901 | " learn.save(path+f'fold_{i}_bkw_n')\n", 902 | " \n", 903 | " preds = learn.predict(is_test=True)\n", 904 | " \n", 905 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 906 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 907 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 908 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw_n.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": null, 914 | "metadata": {}, 915 | "outputs": [], 916 | "source": [ 917 | "#folds reg\n", 918 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 919 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 920 | "\n", 921 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 922 | "vs = len(itos)\n", 923 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 924 | "bs = 48\n", 925 | "c = 1\n", 926 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 927 | "\n", 928 | "lr=1e-3\n", 929 | "lrm = 2.6\n", 930 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 931 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 932 | "\n", 933 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_bkw_.npy')\n", 934 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 935 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 936 | "tst_samp = SimpleSampler(tst_clas)\n", 937 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, 
sampler=tst_samp)\n", 938 | "\n", 939 | "for i in tqdm(range(10)):\n", 940 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_bkw_r.npy')\n", 941 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_bkw_r.npy')\n", 942 | " \n", 943 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_r.npy'))\n", 944 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_r.npy'))\n", 945 | "\n", 946 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 947 | " val_ds = TextDataset(val_clas, val_labels)\n", 948 | " \n", 949 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 950 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 951 | " \n", 952 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 953 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 954 | " \n", 955 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 956 | "\n", 957 | " m = get_rnn_regression(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 958 | " layers=[em_sz, 50, c], drops=[dps[4], 0.1],\n", 959 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 960 | "\n", 961 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 962 | "\n", 963 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 964 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 965 | " learn.clip=25.\n", 966 | " \n", 967 | " learn.crit = F.mse_loss\n", 968 | "\n", 969 | "# wd = 0\n", 970 | "# learn.load_encoder('lm_enc1')\n", 971 | "# learn.freeze_to(-1)\n", 972 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 973 | "# learn.save(f'fold_{i}_bkw_r')\n", 974 | "\n", 975 | "# learn.freeze_to(-2)\n", 976 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 977 | "# learn.save(f'fold_{i}_bkw_r')\n", 978 | " \n", 979 | "# wd = 1e-7\n", 980 | "# learn.unfreeze()\n", 981 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 982 | "# learn.save(f'fold_{i}_bkw_r')\n", 983 | " \n", 984 | "# learn.fit(lrs, 5, wds=wd, cycle_len=1)\n", 985 | "# learn.save(f'fold_{i}_bkw_r')\n", 986 | " learn.load(f'./10_folds_bkw_mse_86040/fold_{i}_bkw_r')\n", 987 | " preds = learn.predict(is_test=True)\n", 988 | " \n", 989 | " wd = 1e-7\n", 990 | " learn.unfreeze()\n", 991 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 992 | " learn.save(path+f'fold_{i}_bkw_r')\n", 993 | " \n", 994 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 995 | " df[\"prob\"] = [c[0] for c in preds]\n", 996 | " df.to_csv(f\"./_blend1/_fold_{i}_bkw_r.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 997 | ] 998 | } 999 | ], 1000 | "metadata": { 1001 | "_draft": { 1002 | "nbviewer_url": "https://gist.github.com/0dd0df21cf404cf2bb51d0148c8b7d8b" 1003 | }, 1004 | "gist": { 1005 | "data": { 1006 | "description": "fastai.text imdb example", 1007 | "public": true 1008 | }, 1009 | "id": "0dd0df21cf404cf2bb51d0148c8b7d8b" 1010 | }, 1011 | "kernelspec": { 1012 | "display_name": "Python [default]", 1013 | "language": "python", 1014 | "name": "python3" 1015 | }, 1016 | "language_info": { 1017 | "codemirror_mode": { 1018 | "name": "ipython", 1019 | "version": 3 1020 | }, 1021 | "file_extension": ".py", 1022 | "mimetype": "text/x-python", 1023 | "name": "python", 1024 | "nbconvert_exporter": "python", 1025 | "pygments_lexer": "ipython3", 1026 | "version": "3.6.4" 
1027 | }, 1028 | "toc": { 1029 | "colors": { 1030 | "hover_highlight": "#DAA520", 1031 | "navigate_num": "#000000", 1032 | "navigate_text": "#333333", 1033 | "running_highlight": "#FF0000", 1034 | "selected_highlight": "#FFD700", 1035 | "sidebar_border": "#EEEEEE", 1036 | "wrapper_background": "#FFFFFF" 1037 | }, 1038 | "moveMenuLeft": true, 1039 | "nav_menu": { 1040 | "height": "86px", 1041 | "width": "252px" 1042 | }, 1043 | "navigate_menu": true, 1044 | "number_sections": true, 1045 | "sideBar": true, 1046 | "threshold": 4, 1047 | "toc_cell": false, 1048 | "toc_section_display": "block", 1049 | "toc_window_display": false, 1050 | "widenNotebook": false 1051 | } 1052 | }, 1053 | "nbformat": 4, 1054 | "nbformat_minor": 2 1055 | } 1056 | -------------------------------------------------------------------------------- /alica_fwd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Alica" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Put these at the top of every notebook, to get automatic reloading and inline plotting\n", 17 | "%reload_ext autoreload\n", 18 | "%autoreload 2\n", 19 | "%matplotlib inline" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "from fastai.text import *\n", 29 | "import html" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "BOS = 'xbos' # beginning-of-sentence tag\n", 39 | "FLD = 'xfld' # data field tag\n", 40 | "\n", 41 | "PATH=Path('data/alica/')" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "## Standardize format" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "CLAS_PATH=Path('data/alica_clas/')\n", 58 | "CLAS_PATH.mkdir(exist_ok=True)\n", 59 | "\n", 60 | "LM_PATH=Path('data/alica_lm/')\n", 61 | "LM_PATH.mkdir(exist_ok=True)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "CLASSES = ['good', 'neutral', 'bad']\n", 71 | "d = {\"good\":2,\"neutral\":1,\"bad\":0}\n", 72 | "#d = {\"good\":1,\"neutral\":0}\n", 73 | "#d = {\"good\":1,\"bad\":0}\n", 74 | "col_names = ['labels','text']" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "(CLAS_PATH/'classes.txt').open('w').writelines(f'{o}\\n' for o in CLASSES)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "df = pd.read_csv(PATH/\"public.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 93 | "df.fillna(\"\", inplace=True)\n", 94 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 95 | "df[\"id\"] = df[0]\n", 96 | "df[\"num\"] = df[4]\n", 97 | "df[\"labels\"] = [0]*len(df)\n", 98 | "\n", 99 | "df.to_csv(CLAS_PATH/\"test.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "df = 
pd.read_csv(PATH/\"final.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 109 | "df.fillna(\"\", inplace=True)\n", 110 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 111 | "df[\"id\"] = df[0]\n", 112 | "df[\"num\"] = df[4]\n", 113 | "df[\"labels\"] = [0]*len(df)\n", 114 | "\n", 115 | "df.to_csv(CLAS_PATH/\"test_.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "#folds neutral=bad\n", 125 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 126 | "df.head()\n", 127 | "\n", 128 | "df.fillna(\"\", inplace=True)\n", 129 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 130 | "df[\"id\"] = df[0]\n", 131 | "df[\"num\"] = df[4]\n", 132 | "df[\"labels\"] = df[6].apply(lambda x: d[x.replace(\"neutral\",\"bad\")])\n", 133 | "\n", 134 | "from sklearn.model_selection import KFold\n", 135 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 136 | "i = 0\n", 137 | "for train_index, valid_index in kf.split(df):\n", 138 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 139 | " \n", 140 | " dff.to_csv(CLAS_PATH/f\"train_{i}.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 141 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 142 | " \n", 143 | " i+=1\n" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "#folds neutral=good\n", 153 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 154 | "df.head()\n", 155 | "\n", 156 | "df.fillna(\"\", inplace=True)\n", 157 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 158 | "df[\"id\"] = df[0]\n", 159 | "df[\"num\"] = df[4]\n", 160 | "df[\"labels\"] = df[6].apply(lambda x: d[x.replace(\"neutral\",\"good\")])\n", 161 | "\n", 162 | "from sklearn.model_selection import KFold\n", 163 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 164 | "i = 0\n", 165 | "for train_index, valid_index in kf.split(df):\n", 166 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 167 | " \n", 168 | " dff.to_csv(CLAS_PATH/f\"train_{i}_.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 169 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}_.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 170 | " \n", 171 | " i+=1" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "#folds 3\n", 181 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 182 | "df.head()\n", 183 | "\n", 184 | "df.fillna(\"\", inplace=True)\n", 185 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 186 | "df[\"id\"] = df[0]\n", 187 | "df[\"num\"] = df[4]\n", 188 | "df[\"labels\"] = df[6].apply(lambda x: d[x])\n", 189 | "\n", 190 | "from sklearn.model_selection import KFold\n", 191 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 192 | "i = 0\n", 193 | "for train_index, valid_index in kf.split(df):\n", 194 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 195 | " \n", 196 | " 
dff.to_csv(CLAS_PATH/f\"train_{i}_3.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 197 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}_3.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 198 | " \n", 199 | " i+=1" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "#folds no neutral\n", 209 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 210 | "\n", 211 | "df.fillna(\"\", inplace=True)\n", 212 | "df = df[df[6]!=\"neutral\"]\n", 213 | "\n", 214 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 215 | "df[\"id\"] = df[0]\n", 216 | "df[\"num\"] = df[4]\n", 217 | "df[\"labels\"] = df[6].apply(lambda x: d[x])\n", 218 | "\n", 219 | "from sklearn.model_selection import KFold\n", 220 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 221 | "i = 0\n", 222 | "for train_index, valid_index in kf.split(df):\n", 223 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 224 | " \n", 225 | " dff.to_csv(CLAS_PATH/f\"train_{i}_n.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 226 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}_n.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 227 | " \n", 228 | " i+=1" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "#folds reg\n", 238 | "df = pd.read_csv(PATH/\"train.tsv\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)\n", 239 | "\n", 240 | "df.fillna(\"\", inplace=True)\n", 241 | "df = df[df[6]!=\"neutral\"]\n", 242 | "\n", 243 | "df[\"text\"] = df[1]+\"|\"+df[2]+\"|\"+df[3]+\"|\"+df[5]\n", 244 | "df[\"id\"] = df[0]\n", 245 | "df[\"num\"] = df[4]\n", 246 | "\n", 247 | "res = []\n", 248 | "for x,y in zip(df[6], df[7]):\n", 249 | " if x == \"good\":\n", 250 | " t = 1+y\n", 251 | " if x == \"neutral\":\n", 252 | " t = 1 \n", 253 | " if x == \"bad\":\n", 254 | " t = 1-y\n", 255 | " res.append(t)\n", 256 | "\n", 257 | "df[\"labels\"] = res\n", 258 | "\n", 259 | "from sklearn.model_selection import KFold\n", 260 | "kf = KFold(n_splits=10, random_state=42, shuffle=True)\n", 261 | "i = 0\n", 262 | "for train_index, valid_index in kf.split(df):\n", 263 | " dff, dfv = df.iloc[train_index,:], df.iloc[valid_index,:]\n", 264 | " \n", 265 | " dff.to_csv(CLAS_PATH/f\"train_{i}_r.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 266 | " dfv.to_csv(CLAS_PATH/f\"valid_{i}_r.csv\", columns=col_names+[\"id\",\"num\"], sep=\"\\t\", header=False, index=False)\n", 267 | " \n", 268 | " i+=1" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "## OpenSubtitles" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "df = pd.read_csv(PATH/\"OpenSubtitles2016.en-ru.ru\", sep = \"\\t\", header = None, quoting=csv.QUOTE_NONE)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "texts = df[0]\n", 294 | "len(texts)" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "trn_texts,val_texts = 
sklearn.model_selection.train_test_split(texts, test_size=0.1)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "len(trn_texts), len(val_texts)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "df_trn = pd.DataFrame({'text':trn_texts, 'labels':[0]*len(trn_texts)}, columns=col_names)\n", 322 | "df_val = pd.DataFrame({'text':val_texts, 'labels':[0]*len(val_texts)}, columns=col_names)\n", 323 | "\n", 324 | "df_trn.to_csv(LM_PATH/'train.csv', header=False, index=False)\n", 325 | "df_val.to_csv(LM_PATH/'test.csv', header=False, index=False)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "## Language model tokens" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "chunksize=96000" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "re1 = re.compile(r' +')\n", 351 | "\n", 352 | "def fixup(x):\n", 353 | " x = x.replace(u'\\xa0', u' ')\n", 354 | " x = x.replace('|', '\\n')\n", 355 | " x = x.replace('\\n\\n\\n', '\\n')\n", 356 | " x = x.replace('\\n\\n', '\\n')\n", 357 | " x = x.replace(\". . .\",\"...\")\n", 358 | " x = \" , \".join(x.split(\",\"))\n", 359 | " x = \" . \".join(x.split(\".\"))\n", 360 | " x = x.replace(\". . .\",\"...\")\n", 361 | " \n", 362 | " return re1.sub(' ', html.unescape(x))" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "def get_texts(df):\n", 372 | " #labels = df[0].values.astype(np.int64)\n", 373 | " labels = df[0].values.astype(float) #for reg\n", 374 | " texts = '\\n' + df[1].astype(str)\n", 375 | " texts = texts.apply(fixup).values.astype(str)\n", 376 | "\n", 377 | " tok = Tokenizer().proc_all_mp(partition_by_cores(texts), 'xx')\n", 378 | " return tok, list(labels)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "def get_all(df):\n", 388 | " tok, labels = [], []\n", 389 | " for i, r in tqdm(enumerate(df)):\n", 390 | " tok_, labels_ = get_texts(r)\n", 391 | " tok += tok_;\n", 392 | " labels += labels_\n", 393 | " return tok, labels" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "df_val = pd.read_csv(LM_PATH/'test.csv', header=None, chunksize=chunksize)\n", 403 | "tok_val, val_labels = get_all(df_val)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": { 410 | "scrolled": false 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "df_trn = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=chunksize)\n", 415 | "tok_trn, trn_labels = get_all(df_trn)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "from random import choice\n", 432 | "choice(tok_trn)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 
null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "(LM_PATH/'tmp').mkdir(exist_ok=True)" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "np.save(LM_PATH/'tmp'/'tok_trn_0.npy', tok_trn[:4000000])\n", 451 | "np.save(LM_PATH/'tmp'/'tok_trn_1.npy', tok_trn[4000000:8000000])\n", 452 | "np.save(LM_PATH/'tmp'/'tok_trn_2.npy', tok_trn[8000000:10000000])\n", 453 | "np.save(LM_PATH/'tmp'/'tok_trn_3.npy', tok_trn[10000000:12000000])\n", 454 | "np.save(LM_PATH/'tmp'/'tok_trn_4.npy', tok_trn[12000000:14000000])\n", 455 | "np.save(LM_PATH/'tmp'/'tok_trn_5.npy', tok_trn[14000000:15000000])\n", 456 | "np.save(LM_PATH/'tmp'/'tok_trn_6.npy', tok_trn[15000000:])\n", 457 | "np.save(LM_PATH/'tmp'/'tok_val.npy', tok_val)" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy')" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "tok_trn = np.array([])\n", 476 | "for i in tqdm(range(7)):\n", 477 | " tok_trn = np.append(tok_trn, np.load(LM_PATH/'tmp'/f'tok_trn_{i}.npy'))" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "scrolled": true 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "freq = Counter(p for o in tok_trn for p in o)\n", 489 | "freq.most_common(25)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "max_vocab = 60000\n", 506 | "min_freq = 2" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": null, 512 | "metadata": {}, 513 | "outputs": [], 514 | "source": [ 515 | "itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]\n", 516 | "itos.insert(0, '_pad_')\n", 517 | "itos.insert(0, '_unk_')" 518 | ] 519 | }, 520 | { 521 | "cell_type": "code", 522 | "execution_count": null, 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 527 | "len(itos)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": null, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 537 | "val_lm = np.array([[stoi[o] for o in p] for p in tok_val])" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": {}, 544 | "outputs": [], 545 | "source": [ 546 | "np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)\n", 547 | "np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)\n", 548 | "pickle.dump(itos, open(LM_PATH/'tmp'/'itos.pkl', 'wb'))" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy')\n", 558 | "val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy')\n", 559 | "itos = pickle.load(open(LM_PATH/'tmp'/'itos.pkl', 'rb'))" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "vs=len(itos)\n", 569 | 
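"# sanity check (not in the original run): itos holds at most max_vocab tokens plus '_unk_' and '_pad_'\n", "assert vs <= max_vocab + 2\n",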
"vs,len(trn_lm)" 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": {}, 575 | "source": [ 576 | "## Language model" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "em_sz,nh,nl = 400,1150,3\n", 586 | "\n", 587 | "wd=1e-7\n", 588 | "bptt=70\n", 589 | "bs=52\n", 590 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)\n", 600 | "val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)\n", 601 | "md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [ 619 | "learner= md.get_model(opt_fn, em_sz, nh, nl, \n", 620 | " dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])\n", 621 | "\n", 622 | "learner.metrics = [accuracy]\n", 623 | "learner.unfreeze()" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "metadata": {}, 630 | "outputs": [], 631 | "source": [ 632 | "lr=1e-3\n", 633 | "lrs = lr" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "learner.save('lm_last_ft')" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": null, 657 | "metadata": {}, 658 | "outputs": [], 659 | "source": [ 660 | "learner.load('lm_last_ft')" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "learner.unfreeze()" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": null, 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [ 687 | "learner.load('lm0')" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": null, 693 | "metadata": { 694 | "scrolled": false 695 | }, 696 | "outputs": [], 697 | "source": [ 698 | "learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=5)" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": null, 704 | "metadata": {}, 705 | "outputs": [], 706 | "source": [ 707 | "#3.980186 3.859496 0.364186\n", 708 | "learner.save('lm1')" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "learner.save_encoder('lm_enc1')" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "learner.load('lm1')" 727 | ] 728 | }, 729 | { 730 | "cell_type": 
"code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=5)" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "learner.sched.plot_loss()" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "learner.save('lm2')\n", 754 | "learner.save_encoder('lm_enc2')\n", 755 | "#3.961754 3.840744 0.365777" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "metadata": {}, 761 | "source": [ 762 | "## Classifier tokens" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "#df_trn = pd.read_csv(CLAS_PATH/'train.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 772 | "#df_val = pd.read_csv(CLAS_PATH/'valid.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 773 | "#df_tst = pd.read_csv(CLAS_PATH/'test.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 774 | "df_tst = pd.read_csv(CLAS_PATH/'test_.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": null, 780 | "metadata": {}, 781 | "outputs": [], 782 | "source": [ 783 | "#from random import choice\n", 784 | "#choice(tok_tst)" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "#tok_trn, trn_labels = get_all(df_trn)\n", 794 | "#tok_val, val_labels = get_all(df_val)\n", 795 | "tok_tst, tst_labels = get_all(df_tst)\n" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": null, 801 | "metadata": {}, 802 | "outputs": [], 803 | "source": [ 804 | "(CLAS_PATH/'tmp').mkdir(exist_ok=True)\n", 805 | "\n", 806 | "#np.save(CLAS_PATH/'tmp'/'tok_trn.npy', tok_trn)\n", 807 | "#np.save(CLAS_PATH/'tmp'/'tok_val.npy', tok_val)\n", 808 | "#np.save(CLAS_PATH/'tmp'/'tok_tst.npy', tok_tst)\n", 809 | "np.save(CLAS_PATH/'tmp'/'tok_tst_.npy', tok_tst)\n", 810 | "\n", 811 | "#np.save(CLAS_PATH/'tmp'/'trn_labels.npy', trn_labels)\n", 812 | "#np.save(CLAS_PATH/'tmp'/'val_labels.npy', val_labels)\n", 813 | "#np.save(CLAS_PATH/'tmp'/'tst_labels.npy', tst_labels)\n", 814 | "np.save(CLAS_PATH/'tmp'/'tst_labels_.npy', tst_labels)" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": null, 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [ 823 | "#tok_trn = np.load(CLAS_PATH/'tmp'/'tok_trn.npy')\n", 824 | "#tok_val = np.load(CLAS_PATH/'tmp'/'tok_val.npy')\n", 825 | "#tok_tst = np.load(CLAS_PATH/'tmp'/'tok_tst.npy')\n", 826 | "tok_tst = np.load(CLAS_PATH/'tmp'/'tok_tst_.npy')" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "metadata": {}, 833 | "outputs": [], 834 | "source": [ 835 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 836 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 837 | "len(itos)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "#trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 847 | "#val_clas = np.array([[stoi[o] for o in 
p] for p in tok_val])\n", 848 | "tst_clas = np.array([[stoi[o] for o in p] for p in tok_tst])" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "#np.save(CLAS_PATH/'tmp'/'trn_ids.npy', trn_clas)\n", 858 | "#np.save(CLAS_PATH/'tmp'/'val_ids.npy', val_clas)\n", 859 | "#np.save(CLAS_PATH/'tmp'/'tst_ids.npy', tst_clas)\n", 860 | "np.save(CLAS_PATH/'tmp'/'tst_ids_.npy', tst_clas)" 861 | ] 862 | }, 863 | { 864 | "cell_type": "code", 865 | "execution_count": null, 866 | "metadata": {}, 867 | "outputs": [], 868 | "source": [ 869 | "#folds neutral=bad\n", 870 | "chunksize=96000\n", 871 | "\n", 872 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 873 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 874 | "\n", 875 | "for i in tqdm(range(10)):\n", 876 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 877 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 878 | " \n", 879 | " tok_trn, trn_labels = get_all(df_trn)\n", 880 | " tok_val, val_labels = get_all(df_val)\n", 881 | " \n", 882 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}.npy', tok_trn)\n", 883 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}.npy', tok_val)\n", 884 | "\n", 885 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}.npy', trn_labels)\n", 886 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}.npy', val_labels)\n", 887 | " \n", 888 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 889 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 890 | "\n", 891 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}.npy', trn_clas)\n", 892 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}.npy', val_clas)" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": null, 898 | "metadata": {}, 899 | "outputs": [], 900 | "source": [ 901 | "#folds neutral=good\n", 902 | "chunksize=96000\n", 903 | "\n", 904 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 905 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 906 | "\n", 907 | "for i in tqdm(range(10)):\n", 908 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}_.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 909 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}_.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 910 | " \n", 911 | " tok_trn, trn_labels = get_all(df_trn)\n", 912 | " tok_val, val_labels = get_all(df_val)\n", 913 | " \n", 914 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}_.npy', tok_trn)\n", 915 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}_.npy', tok_val)\n", 916 | "\n", 917 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}_.npy', trn_labels)\n", 918 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}_.npy', val_labels)\n", 919 | " \n", 920 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 921 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 922 | "\n", 923 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_.npy', trn_clas)\n", 924 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_.npy', val_clas)" 925 | ] 926 | }, 927 | { 928 | "cell_type": "code", 929 | "execution_count": null, 930 | "metadata": {}, 931 | "outputs": [], 932 | "source": [ 933 | "#folds 3\n", 934 | "chunksize=96000\n", 935 | "\n", 936 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 937 | "stoi 
= collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 938 | "\n", 939 | "for i in tqdm(range(10)):\n", 940 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}_3.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 941 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}_3.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 942 | " \n", 943 | " tok_trn, trn_labels = get_all(df_trn)\n", 944 | " tok_val, val_labels = get_all(df_val)\n", 945 | " \n", 946 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}_3.npy', tok_trn)\n", 947 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}_3.npy', tok_val)\n", 948 | "\n", 949 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}_3.npy', trn_labels)\n", 950 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}_3.npy', val_labels)\n", 951 | " \n", 952 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 953 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 954 | "\n", 955 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_3.npy', trn_clas)\n", 956 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_3.npy', val_clas)" 957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": null, 962 | "metadata": {}, 963 | "outputs": [], 964 | "source": [ 965 | "#folds no neutral\n", 966 | "chunksize=96000\n", 967 | "\n", 968 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 969 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 970 | "\n", 971 | "for i in tqdm(range(10)):\n", 972 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}_n.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 973 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}_n.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 974 | " \n", 975 | " tok_trn, trn_labels = get_all(df_trn)\n", 976 | " tok_val, val_labels = get_all(df_val)\n", 977 | " \n", 978 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}_n.npy', tok_trn)\n", 979 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}_n.npy', tok_val)\n", 980 | "\n", 981 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}_n.npy', trn_labels)\n", 982 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}_n.npy', val_labels)\n", 983 | " \n", 984 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 985 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 986 | "\n", 987 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_n.npy', trn_clas)\n", 988 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_n.npy', val_clas)" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [ 997 | "#folds reg\n", 998 | "chunksize=96000\n", 999 | "\n", 1000 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1001 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1002 | "\n", 1003 | "for i in tqdm(range(10)):\n", 1004 | " df_trn = pd.read_csv(CLAS_PATH/f'train_{i}_r.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 1005 | " df_val = pd.read_csv(CLAS_PATH/f'valid_{i}_r.csv', header=None, chunksize=chunksize, sep=\"\\t\", quoting=csv.QUOTE_NONE)\n", 1006 | " \n", 1007 | " tok_trn, trn_labels = get_all(df_trn)\n", 1008 | " tok_val, val_labels = get_all(df_val)\n", 1009 | " \n", 1010 | " np.save(CLAS_PATH/'tmp'/f'tok_trn_{i}_r.npy', tok_trn)\n", 1011 | " np.save(CLAS_PATH/'tmp'/f'tok_val_{i}_r.npy', tok_val)\n", 1012 | "\n", 1013 | " np.save(CLAS_PATH/'tmp'/f'trn_labels_{i}_r.npy', trn_labels)\n", 
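" # reg targets are floats (1+df[7] for good, 1-df[7] for bad), not class ids\n",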
1014 | " np.save(CLAS_PATH/'tmp'/f'val_labels_{i}_r.npy', val_labels)\n", 1015 | " \n", 1016 | " trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])\n", 1017 | " val_clas = np.array([[stoi[o] for o in p] for p in tok_val])\n", 1018 | "\n", 1019 | " np.save(CLAS_PATH/'tmp'/f'trn_ids_{i}_r.npy', trn_clas)\n", 1020 | " np.save(CLAS_PATH/'tmp'/f'val_ids_{i}_r.npy', val_clas)" 1021 | ] 1022 | }, 1023 | { 1024 | "cell_type": "markdown", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "## Predict" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": null, 1033 | "metadata": {}, 1034 | "outputs": [], 1035 | "source": [ 1036 | "path = \"/mnt/6676114C76111E7D/Kaggle/Alica/\"" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": null, 1042 | "metadata": {}, 1043 | "outputs": [], 1044 | "source": [ 1045 | "#folds neutral=bad\n", 1046 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1047 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1048 | "\n", 1049 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1050 | "vs = len(itos)\n", 1051 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1052 | "bs = 48\n", 1053 | "c = 2\n", 1054 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1055 | "\n", 1056 | "lr=1e-3\n", 1057 | "lrm = 2.6\n", 1058 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1059 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1060 | "\n", 1061 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1062 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1063 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1064 | "tst_samp = SimpleSampler(tst_clas)\n", 1065 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 1066 | "\n", 1067 | "for i in tqdm(range(10)):\n", 1068 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}.npy')\n", 1069 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}.npy')\n", 1070 | " \n", 1071 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}.npy'))\n", 1072 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}.npy'))\n", 1073 | "\n", 1074 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1075 | " val_ds = TextDataset(val_clas, val_labels)\n", 1076 | " \n", 1077 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1078 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1079 | " \n", 1080 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1081 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1082 | " \n", 1083 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 1084 | "\n", 1085 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1086 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 1087 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1088 | "\n", 1089 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1090 | "\n", 1091 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1092 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1093 | " learn.clip=25.\n", 1094 | " learn.metrics = [accuracy]\n", 1095 | "\n", 1096 | "# wd = 0\n", 1097 | "# learn.load_encoder('lm_enc1')\n", 1098 | "# learn.freeze_to(-1)\n", 1099 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 
1100 | "# learn.save(f'fold_{i}')\n", 1101 | "\n", 1102 | "# learn.freeze_to(-2)\n", 1103 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1104 | "# learn.save(f'fold_{i}')\n", 1105 | " \n", 1106 | "# wd = 1e-7\n", 1107 | "# learn.unfreeze()\n", 1108 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1109 | "# learn.save(f'fold_{i}')\n", 1110 | " \n", 1111 | "# learn.fit(lrs, 2, wds=wd, cycle_len=1)\n", 1112 | "# learn.save(f'fold_{i}')\n", 1113 | " learn.load(f'./10_folds_fwd_bad_86283/fold_{i}')\n", 1114 | " \n", 1115 | " wd = 1e-7\n", 1116 | " learn.unfreeze()\n", 1117 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1118 | " learn.save(path+f'fold_{i}')\n", 1119 | "\n", 1120 | " preds = learn.predict(is_test=True)\n", 1121 | " \n", 1122 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1123 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 1124 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 1125 | " df.to_csv(f\"./blend1/_fold_{i}.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "code", 1130 | "execution_count": null, 1131 | "metadata": {}, 1132 | "outputs": [], 1133 | "source": [] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": null, 1138 | "metadata": {}, 1139 | "outputs": [], 1140 | "source": [] 1141 | }, 1142 | { 1143 | "cell_type": "code", 1144 | "execution_count": null, 1145 | "metadata": {}, 1146 | "outputs": [], 1147 | "source": [ 1148 | "#folds neutral=good\n", 1149 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1150 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1151 | "\n", 1152 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1153 | "vs = len(itos)\n", 1154 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1155 | "bs = 48\n", 1156 | "c = 2\n", 1157 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1158 | "\n", 1159 | "lr=1e-3\n", 1160 | "lrm = 2.6\n", 1161 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1162 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1163 | "\n", 1164 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1165 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1166 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1167 | "tst_samp = SimpleSampler(tst_clas)\n", 1168 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 1169 | "\n", 1170 | "for i in tqdm(range(10)):\n", 1171 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_.npy')\n", 1172 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_.npy')\n", 1173 | " \n", 1174 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_.npy'))\n", 1175 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_.npy'))\n", 1176 | "\n", 1177 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1178 | " val_ds = TextDataset(val_clas, val_labels)\n", 1179 | " \n", 1180 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1181 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1182 | " \n", 1183 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1184 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1185 | " \n", 1186 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 
1187 | "\n", 1188 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1189 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 1190 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1191 | "\n", 1192 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1193 | "\n", 1194 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1195 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1196 | " learn.clip=25.\n", 1197 | " learn.metrics = [accuracy]\n", 1198 | "\n", 1199 | "# wd = 0\n", 1200 | "# learn.load_encoder('lm_enc1')\n", 1201 | "# learn.freeze_to(-1)\n", 1202 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1203 | "# learn.save(f'fold_{i}_')\n", 1204 | "\n", 1205 | "# learn.freeze_to(-2)\n", 1206 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1207 | "# learn.save(f'fold_{i}_')\n", 1208 | " \n", 1209 | "# wd = 1e-7\n", 1210 | "# learn.unfreeze()\n", 1211 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1212 | "# learn.save(f'fold_{i}_')\n", 1213 | " \n", 1214 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1215 | "# learn.save(f'fold_{i}_')\n", 1216 | " learn.load(f'./10_folds_fwd_good_85739/fold_{i}_')\n", 1217 | " \n", 1218 | " wd = 1e-7\n", 1219 | " learn.unfreeze()\n", 1220 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1221 | " learn.save(path+f'fold_{i}_')\n", 1222 | " \n", 1223 | " preds = learn.predict(is_test=True)\n", 1224 | " \n", 1225 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1226 | " df[\"prob\"] = [c[1] for c in np.exp(preds)] \n", 1227 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 1228 | " df.to_csv(f\"./blend1/_fold_{i}_.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 1229 | ] 1230 | }, 1231 | { 1232 | "cell_type": "code", 1233 | "execution_count": null, 1234 | "metadata": {}, 1235 | "outputs": [], 1236 | "source": [ 1237 | "#folds 3\n", 1238 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1239 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1240 | "\n", 1241 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1242 | "vs = len(itos)\n", 1243 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1244 | "bs = 48\n", 1245 | "c = 3\n", 1246 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1247 | "\n", 1248 | "lr=1e-3\n", 1249 | "lrm = 2.6\n", 1250 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1251 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1252 | "\n", 1253 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1254 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1255 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1256 | "tst_samp = SimpleSampler(tst_clas)\n", 1257 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 1258 | "\n", 1259 | "for i in tqdm(range(10)):\n", 1260 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_3.npy')\n", 1261 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_3.npy')\n", 1262 | " \n", 1263 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_3.npy'))\n", 1264 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_3.npy'))\n", 1265 | "\n", 1266 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1267 | " val_ds = TextDataset(val_clas, val_labels)\n", 1268 | " \n", 1269 | " trn_samp = 
SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1270 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1271 | " \n", 1272 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1273 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1274 | " \n", 1275 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 1276 | "\n", 1277 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1278 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 1279 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1280 | "\n", 1281 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1282 | "\n", 1283 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1284 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1285 | " learn.clip=25.\n", 1286 | " learn.metrics = [accuracy]\n", 1287 | "\n", 1288 | "# wd = 0\n", 1289 | "# learn.load_encoder('lm_enc1')\n", 1290 | "# learn.freeze_to(-1)\n", 1291 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1292 | "# learn.save(f'fold_{i}_3')\n", 1293 | "\n", 1294 | "# learn.freeze_to(-2)\n", 1295 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1296 | "# learn.save(f'fold_{i}_3')\n", 1297 | " \n", 1298 | "# wd = 1e-7\n", 1299 | "# learn.unfreeze()\n", 1300 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1301 | "# learn.save(f'fold_{i}_3')\n", 1302 | " \n", 1303 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1304 | "# learn.save(f'fold_{i}_3')\n", 1305 | " \n", 1306 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1307 | "# learn.save(f'fold_{i}_3')\n", 1308 | " learn.load(f'./10_folds_fwd_3_1.4_86962/fold_{i}_3')\n", 1309 | " \n", 1310 | " wd = 1e-7\n", 1311 | " learn.unfreeze()\n", 1312 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1313 | " learn.save(path+f'fold_{i}_3')\n", 1314 | " \n", 1315 | " preds = learn.predict(is_test=True)\n", 1316 | " \n", 1317 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1318 | " df[\"prob1\"] = [c[1] for c in np.exp(preds)]\n", 1319 | " df[\"prob2\"] = [c[2] for c in np.exp(preds)]\n", 1320 | " df.to_csv(f\"./blend1/_fold_{i}_3.tsv\",columns=[2,3,\"prob1\",\"prob2\"],index=False,sep=\"\\t\",header=False)" 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": null, 1326 | "metadata": {}, 1327 | "outputs": [], 1328 | "source": [ 1329 | "#folds no neutral\n", 1330 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1331 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1332 | "\n", 1333 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1334 | "vs = len(itos)\n", 1335 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1336 | "bs = 48\n", 1337 | "c = 2\n", 1338 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1339 | "\n", 1340 | "lr=1e-3\n", 1341 | "lrm = 2.6\n", 1342 | "lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1343 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1344 | "\n", 1345 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1346 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1347 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1348 | "tst_samp = SimpleSampler(tst_clas)\n", 1349 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, 
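# SimpleSampler keeps test rows in file order, so preds align with test_.csv\n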
sampler=tst_samp)\n", 1350 | "\n", 1351 | "for i in tqdm(range(10)):\n", 1352 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_n.npy')\n", 1353 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_n.npy')\n", 1354 | " \n", 1355 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_n.npy'))\n", 1356 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_n.npy'))\n", 1357 | "\n", 1358 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1359 | " val_ds = TextDataset(val_clas, val_labels)\n", 1360 | " \n", 1361 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1362 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1363 | " \n", 1364 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1365 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1366 | " \n", 1367 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 1368 | "\n", 1369 | " m = get_rnn_classifer(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1370 | " layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],\n", 1371 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1372 | "\n", 1373 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1374 | "\n", 1375 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1376 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1377 | " learn.clip=25.\n", 1378 | " learn.metrics = [accuracy]\n", 1379 | "\n", 1380 | "# wd = 0\n", 1381 | "# learn.load_encoder('lm_enc1')\n", 1382 | "# learn.freeze_to(-1)\n", 1383 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1384 | "# learn.save(f'fold_{i}_n')\n", 1385 | "\n", 1386 | "# learn.freeze_to(-2)\n", 1387 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1388 | "# learn.save(f'fold_{i}_n')\n", 1389 | " \n", 1390 | "# wd = 1e-7\n", 1391 | "# learn.unfreeze()\n", 1392 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1393 | "# learn.save(f'fold_{i}_n')\n", 1394 | " \n", 1395 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1396 | "# learn.save(f'fold_{i}_n')\n", 1397 | " learn.load(f'./10_folds_fwd_neutral/fold_{i}_n')\n", 1398 | " \n", 1399 | " wd = 1e-7\n", 1400 | " learn.unfreeze()\n", 1401 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1402 | " learn.save(path+f'fold_{i}_n')\n", 1403 | " \n", 1404 | " preds = learn.predict(is_test=True)\n", 1405 | " \n", 1406 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1407 | " df[\"prob\"] = [c[1] for c in np.exp(preds)]\n", 1408 | " df.sort_values(by=[2,\"prob\"], inplace=True, ascending=[True,False])\n", 1409 | " df.to_csv(f\"./blend1/_fold_{i}_n.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 1410 | ] 1411 | }, 1412 | { 1413 | "cell_type": "code", 1414 | "execution_count": null, 1415 | "metadata": {}, 1416 | "outputs": [], 1417 | "source": [ 1418 | "#folds reg\n", 1419 | "itos = pickle.load((LM_PATH/'tmp'/'itos.pkl').open('rb'))\n", 1420 | "stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})\n", 1421 | "\n", 1422 | "bptt,em_sz,nh,nl = 70,400,1150,3\n", 1423 | "vs = len(itos)\n", 1424 | "opt_fn = partial(optim.Adam, betas=(0.8, 0.99))\n", 1425 | "bs = 48\n", 1426 | "c = 1\n", 1427 | "dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.75\n", 1428 | "\n", 1429 | "lr=1e-3\n", 1430 | "lrm = 2.6\n", 1431 | "lrs = 
np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])\n", 1432 | "#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])\n", 1433 | "\n", 1434 | "tst_clas = np.load(CLAS_PATH/'tmp'/'tst_ids_.npy')\n", 1435 | "tst_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'tst_labels_.npy'))\n", 1436 | "tst_ds = TextDataset(tst_clas, tst_labels)\n", 1437 | "tst_samp = SimpleSampler(tst_clas)\n", 1438 | "tst_dl = DataLoader(tst_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=tst_samp)\n", 1439 | "\n", 1440 | "for i in tqdm(range(10)):\n", 1441 | " trn_clas = np.load(CLAS_PATH/'tmp'/f'trn_ids_{i}_r.npy')\n", 1442 | " val_clas = np.load(CLAS_PATH/'tmp'/f'val_ids_{i}_r.npy')\n", 1443 | " \n", 1444 | " trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'trn_labels_{i}_r.npy'))\n", 1445 | " val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/f'val_labels_{i}_r.npy'))\n", 1446 | "\n", 1447 | " trn_ds = TextDataset(trn_clas, trn_labels)\n", 1448 | " val_ds = TextDataset(val_clas, val_labels)\n", 1449 | " \n", 1450 | " trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)\n", 1451 | " val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))\n", 1452 | " \n", 1453 | " trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)\n", 1454 | " val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)\n", 1455 | " \n", 1456 | " md = ModelData(PATH, trn_dl, val_dl, tst_dl)\n", 1457 | "\n", 1458 | " m = get_rnn_regression(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,\n", 1459 | " layers=[em_sz, 50, c], drops=[dps[4], 0.1],\n", 1460 | " dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])\n", 1461 | "\n", 1462 | " opt_fn = partial(optim.Adam, betas=(0.7, 0.99))\n", 1463 | "\n", 1464 | " learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)\n", 1465 | " learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)\n", 1466 | " learn.clip=25.\n", 1467 | " \n", 1468 | " learn.crit = F.mse_loss\n", 1469 | "\n", 1470 | "# wd = 0\n", 1471 | "# learn.load_encoder('lm_enc1')\n", 1472 | "# learn.freeze_to(-1)\n", 1473 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1474 | "# learn.save(f'fold_{i}_r')\n", 1475 | "\n", 1476 | "# learn.freeze_to(-2)\n", 1477 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1478 | "# learn.save(f'fold_{i}_r')\n", 1479 | " \n", 1480 | "# wd = 1e-7\n", 1481 | "# learn.unfreeze()\n", 1482 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))\n", 1483 | "# learn.save(f'fold_{i}_r')\n", 1484 | " \n", 1485 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1486 | "# learn.save(f'fold_{i}_r')\n", 1487 | " \n", 1488 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1489 | "# learn.save(f'fold_{i}_r')\n", 1490 | " \n", 1491 | "# learn.fit(lrs, 1, wds=wd, cycle_len=1)\n", 1492 | "# learn.save(f'fold_{i}_r')\n", 1493 | " learn.load(f'./10_folds_fwd_mse_86488/fold_{i}_r')\n", 1494 | " \n", 1495 | " wd = 1e-7\n", 1496 | " learn.unfreeze()\n", 1497 | " learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(10,10,0.95,0.85))\n", 1498 | " learn.save(path+f'fold_{i}_r')\n", 1499 | " \n", 1500 | " preds = learn.predict(is_test=True)\n", 1501 | " \n", 1502 | " df = pd.read_csv(CLAS_PATH/'test_.csv', header=None, sep = \"\\t\", quoting=csv.QUOTE_NONE)\n", 1503 | " df[\"prob\"] = [c[0] for c in preds]\n", 1504 | " df.to_csv(f\"./blend1/fold_{i}_r.tsv\",columns=[2,3,\"prob\"],index=False,sep=\"\\t\",header=False)" 1505 | ] 1506 | } 1507 | ], 1508 | "metadata": { 1509 | 
"_draft": { 1510 | "nbviewer_url": "https://gist.github.com/0dd0df21cf404cf2bb51d0148c8b7d8b" 1511 | }, 1512 | "gist": { 1513 | "data": { 1514 | "description": "fastai.text imdb example", 1515 | "public": true 1516 | }, 1517 | "id": "0dd0df21cf404cf2bb51d0148c8b7d8b" 1518 | }, 1519 | "kernelspec": { 1520 | "display_name": "Python [default]", 1521 | "language": "python", 1522 | "name": "python3" 1523 | }, 1524 | "language_info": { 1525 | "codemirror_mode": { 1526 | "name": "ipython", 1527 | "version": 3 1528 | }, 1529 | "file_extension": ".py", 1530 | "mimetype": "text/x-python", 1531 | "name": "python", 1532 | "nbconvert_exporter": "python", 1533 | "pygments_lexer": "ipython3", 1534 | "version": "3.6.4" 1535 | }, 1536 | "toc": { 1537 | "colors": { 1538 | "hover_highlight": "#DAA520", 1539 | "navigate_num": "#000000", 1540 | "navigate_text": "#333333", 1541 | "running_highlight": "#FF0000", 1542 | "selected_highlight": "#FFD700", 1543 | "sidebar_border": "#EEEEEE", 1544 | "wrapper_background": "#FFFFFF" 1545 | }, 1546 | "moveMenuLeft": true, 1547 | "nav_menu": { 1548 | "height": "86px", 1549 | "width": "252px" 1550 | }, 1551 | "navigate_menu": true, 1552 | "number_sections": true, 1553 | "sideBar": true, 1554 | "threshold": 4, 1555 | "toc_cell": false, 1556 | "toc_section_display": "block", 1557 | "toc_window_display": false, 1558 | "widenNotebook": false 1559 | } 1560 | }, 1561 | "nbformat": 4, 1562 | "nbformat_minor": 2 1563 | } 1564 | --------------------------------------------------------------------------------