├── Diagram.jpg ├── NoImage.jpg ├── README.md ├── test_data.py ├── petfinder ├── test.py ├── model.py └── data.py ├── .gitignore ├── test.py ├── Structured Only.ipynb ├── Fastai PetFinder.ipynb └── PetFinder Language Model.ipynb /Diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EtienneT/fastai-petfinder/HEAD/Diagram.jpg -------------------------------------------------------------------------------- /NoImage.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/EtienneT/fastai-petfinder/HEAD/NoImage.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fastai-petfinder 2 | Merging image, tabular and text data in a neural network with fastai for the [PetFinder Kaggle competition](https://www.kaggle.com/c/petfinder-adoption-prediction/). 3 | 4 | The main notebook is [Fastai PetFinder](https://github.com/EtienneT/fastai-petfinder/blob/master/Fastai%20PetFinder.ipynb), but you first need to run [PetFinder Language Model](https://github.com/EtienneT/fastai-petfinder/blob/master/PetFinder%20Language%20Model.ipynb) to fine-tune a language model on the data. 
5 | 6 | ![Diagram](Diagram.jpg) 7 | -------------------------------------------------------------------------------- /test_data.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | import json 5 | import os 6 | import feather 7 | from pathlib import Path 8 | from pandas.io.json import json_normalize 9 | from tqdm import tqdm 10 | import fastai 11 | from PetData import * 12 | 13 | from fastai.tabular import * 14 | 15 | pets = get_data() 16 | 17 | dep_var = 'AdoptionSpeed' 18 | cont_names, cat_names = cont_cat_split(pets, 50, dep_var=dep_var) 19 | cat_names.remove('Filename') 20 | cat_names.remove('PicturePath') 21 | 22 | miss = FillMissing(cat_names, cont_names) 23 | 24 | df = miss.apply_train(pets) 25 | 26 | df.columns 27 | -------------------------------------------------------------------------------- /petfinder/test.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import cohen_kappa_score 2 | import numpy as np 3 | import pandas as pd 4 | from functools import partial 5 | import scipy as sp 6 | 7 | __all__ = ['OptimizedRounder'] 8 | 9 | # Credits to https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved 10 | class OptimizedRounder(object): 11 | def __init__(self): 12 | self.coef_ = 0 13 | 14 | def _kappa_loss(self, coef, X, y): 15 | preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4]) 16 | return -cohen_kappa_score(y, preds, weights = 'quadratic') 17 | 18 | def fit(self, X, y): 19 | loss_partial = partial(self._kappa_loss, X = X, y = y) 20 | initial_coef = [0.5, 1.5, 2.5, 3.5] 21 | self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead') 22 | 23 | def predict(self, X, coef): 24 | preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4]) 25 | return preds 26 | 27 | def 
coefficients(self): 28 | return self.coef_['x'] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | /.vscode/ 106 | /logs/ 107 | /models/ 108 | /test/ 109 | /test_images/ 
110 | /test_metadata/ 111 | /test_sentiment/ 112 | /train/ 113 | /train_images/ 114 | /train_metadata/ 115 | /train_sentiment/ 116 | 117 | /*.pkl 118 | /*.csv 119 | /*.feather -------------------------------------------------------------------------------- /petfinder/model.py: -------------------------------------------------------------------------------- 1 | from fastai.torch_core import * 2 | from fastai.vision import * 3 | from fastai.tabular.models import * 4 | from fastai.tabular import * 5 | from fastai.layers import * 6 | from fastai.text import * 7 | from fastai.callbacks import * 8 | from fastai.metrics import * 9 | import torch 10 | 11 | __all__ = ['ImageTabularTextLearner', 'collate_mixed', 'image_tabular_text_learner', 'normalize_custom_funcs'] 12 | 13 | class ImageTabularTextModel(nn.Module): 14 | def __init__(self, emb_szs:ListSizes, n_cont:int, vocab_sz:int, encoder, use_trainer): 15 | super().__init__() 16 | self.use_trainer = use_trainer 17 | self.cnn = create_body(models.resnet34) 18 | nf = num_features_model(self.cnn) * 2 19 | drop = .5 20 | 21 | self.lm_encoder = SequentialRNN(encoder[0], PoolingLinearClassifier([400 * 3] + [32], [.4])) 22 | 23 | self.tab = TabularModel(emb_szs, n_cont, 128, [512, 256]) 24 | 25 | self.reduce = nn.Sequential(*([AdaptiveConcatPool2d(), Flatten()] + bn_drop_lin(nf, 512, bn=True, p=drop, actn=nn.ReLU(inplace=True)))) 26 | self.merge = nn.Sequential(*bn_drop_lin(512 + 128 + 32, 128, bn=True, p=drop, actn=nn.ReLU(inplace=True))) 27 | self.final = nn.Sequential(*bn_drop_lin(128, 1, bn=False, p=0., actn=None)) 28 | 29 | def forward(self, img:Tensor, x:Tensor, text:Tensor) -> Tensor: 30 | imgCnn = self.cnn(img) 31 | imgLatent = self.reduce(imgCnn) 32 | tabLatent = self.tab(x[0], x[1]) 33 | textLatent = self.lm_encoder(text) 34 | 35 | cat = torch.cat([imgLatent, F.relu(tabLatent), F.relu(textLatent[0])], dim=1) 36 | 37 | pred = self.final(self.merge(cat)) 38 | pred = torch.sigmoid(pred) * 4 # making sure this is in the 
range 0-4 39 | 40 | if(not self.use_trainer): 41 | return pred 42 | else: 43 | return pred, textLatent 44 | 45 | def reset(self): 46 | for c in self.children(): 47 | if hasattr(c, 'reset'): c.reset() 48 | 49 | def collate_mixed(samples, pad_idx:int=0): 50 | # Find max length of the text from the MixedItemList 51 | max_len = max([len(s[0].data[2]) for s in samples]) 52 | 53 | for s in samples: 54 | res = np.zeros(max_len + pad_idx, dtype=np.int64) 55 | res[:len(s[0].data[2])] = s[0].data[2] 56 | s[0].data[2] = res 57 | 58 | return data_collate(samples) 59 | 60 | def split_layers(model:nn.Module) -> List[nn.Module]: 61 | groups = [[model.cnn, model.lm_encoder]] 62 | groups += [[model.tab, model.reduce, model.merge, model.final]] 63 | return groups 64 | 65 | class RNNTrainerCustom(RNNTrainer): 66 | def on_loss_begin(self, last_output:Tuple[Tensor,Tensor,Tensor], **kwargs): 67 | "Save the extra outputs for later and only returns the true output." 68 | self.raw_out,self.out = last_output[1][1],last_output[1][2] 69 | return {'last_output': last_output[0]} 70 | 71 | 72 | def _normalize_images_batch(b:Tuple[Tensor,Tensor], mean:FloatTensor, std:FloatTensor)->Tuple[Tensor,Tensor]: 73 | "`b` = `x`,`y` - normalize `x` array of imgs and `do_y` optionally `y`." 74 | x,y = b 75 | mean,std = mean.to(x[0].device),std.to(x[0].device) 76 | x[0] = normalize(x[0],mean,std) 77 | return x,y 78 | 79 | def normalize_custom_funcs(mean:FloatTensor, std:FloatTensor, do_x:bool=True, do_y:bool=False)->Tuple[Callable,Callable]: 80 | "Create normalize/denormalize func using `mean` and `std`, can specify `do_y` and `device`." 
81 | mean,std = tensor(mean),tensor(std) 82 | return (partial(_normalize_images_batch, mean=mean, std=std), 83 | partial(denormalize, mean=mean, std=std)) 84 | 85 | class ImageTabularTextLearner(Learner): 86 | def __init__(self, data:DataBunch, model:nn.Module, use_trainer:bool=False, alpha:float=2., beta:float=1., **learn_kwargs): 87 | super().__init__(data, model, **learn_kwargs) 88 | if(use_trainer): 89 | self.callbacks.append(RNNTrainerCustom(self, alpha=alpha, beta=beta)) 90 | self.split(split_layers) 91 | 92 | def image_tabular_text_learner(data, len_cont_names, vocab_sz, data_lm, use_trainer:bool=False): 93 | l = text_classifier_learner(data_lm, AWD_LSTM, drop_mult=0.5) 94 | l.load_encoder('fine_tuned_enc') 95 | 96 | emb = data.train_ds.x.item_lists[1].get_emb_szs() 97 | model = ImageTabularTextModel(emb, len_cont_names, vocab_sz, l.model, use_trainer) 98 | 99 | learn = ImageTabularTextLearner(data, model, use_trainer, metrics=[mae, rmse]) 100 | return learn -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #%% [markdown] 2 | # # Image, tabular and text data in the same deep learning model 3 | # 4 | # Deep learning has advanced tremendously in the last 2-3 years. Researchers are always pushing the boundaries of the state of the art in various sub-domains. To do that, researchers also have to specialize and immerse themselves in one domain of deep learning. We often see deep learning models handling image data or text data or structured data. But we rarely see them used together when you have a dataset that contains them all. Datasets in the real world are much messier than academic datasets. Being able to leverage everything you have in your data can yield very interesting results. 
5 | # 6 | # # Transfer Learning 7 | # 8 | # Transfer learning in deep learning has also become very popular in recent years, especially for image data, with pre-trained ImageNet models that were trained on millions of images; they are expensive to train but allow you to re-use that knowledge for other tasks. More recently text data started having its own transfer learning moment with pre-trained models like ULMFiT, BERT and GPT-2. 9 | 10 | #%% 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | import matplotlib.pyplot as plt 15 | import numpy as np 16 | import pandas as pd 17 | import json 18 | import os 19 | import feather 20 | from fastai.text import * 21 | 22 | from petfinder.data import * 23 | 24 | 25 | #%% 26 | path = 'C:\\work\\ML\\PetFinder\\' 27 | bs=64 28 | 29 | pets = get_data() 30 | petsTest = get_data(True) 31 | 32 | # pets['IsTest'] = False 33 | # petsTest['IsTest'] = True 34 | 35 | # pets = pd.concat([pets, petsTest]) 36 | 37 | # pets = feather.read_dataframe(path + 'pets.feather') 38 | data_lm = load_data(path, 'data_lm_descriptions.pkl', bs=bs) 39 | 40 | #%% 41 | from fastai.tabular import * 42 | from fastai.vision import * 43 | from fastai.metrics import * 44 | from fastai.text import * 45 | 46 | dep_var = 'AdoptionSpeed' 47 | cont_names, cat_names = cont_cat_split(pets, 50, dep_var=dep_var) 48 | procs = [FillMissing, Categorify, Normalize] 49 | cat_names.remove('Filename') 50 | cat_names.remove('PicturePath') 51 | cat_names.remove('PetID') 52 | cat_names.remove('Description') 53 | 54 | 55 | #%% 56 | # cont_names, cat_names 57 | 58 | #%% 59 | from petfinder.model import * 60 | 61 | #%% 62 | from fastai.callbacks import * 63 | 64 | bs = 32 65 | size = 224 66 | np.random.seed(42) 67 | 68 | data_lm = load_data(path, 'data_lm_descriptions.pkl', bs=bs) 69 | vocab = data_lm.vocab 70 | 71 | imgList = ImageList.from_df(pets, path=path, cols='PicturePath') 72 | tabList = TabularList.from_df(pets, cat_names=cat_names, 
cont_names=cont_names, procs=procs, path=path) 73 | textList = TextList.from_df(pets, cols='Description', path=path, vocab=vocab) 74 | 75 | norm, denorm = normalize_custom_funcs(*imagenet_stats) 76 | 77 | if os.path.isfile(path + 'mixed_img_tab_text.pkl') != True : 78 | mixed = (MixedItemList([imgList, tabList, textList], path, inner_df=tabList.inner_df) 79 | .random_split_by_pct(.1) 80 | .label_from_df(cols='AdoptionSpeed', label_cls=CategoryList) 81 | .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=size)) 82 | 83 | outfile = open(path + 'mixed_img_tab_text.pkl', 'wb') 84 | pickle.dump(mixed, outfile) 85 | outfile.close() 86 | else: 87 | infile = open(path + 'mixed_img_tab_text.pkl','rb') 88 | mixed = pickle.load(infile) 89 | infile.close() 90 | 91 | 92 | #%% 93 | # data_text = textList.random_split_by_pct(.1).label_from_df(cols='AdoptionSpeed').databunch(bs=bs) 94 | # data_text.save('text-classification-databunch.pkl') 95 | data_text = load_data(path, 'text-classification-databunch.pkl') 96 | 97 | 98 | #%% 99 | data = mixed.databunch(bs=bs, collate_fn=collate_mixed, num_workers=0) 100 | data.add_tfm(norm) # normalize images 101 | 102 | 103 | #%% 104 | cat_names = mixed.train.x.item_lists[1].cat_names 105 | cont_names = mixed.train.x.item_lists[1].cont_names 106 | 107 | 108 | #%% 109 | # from fastai.callbacks.tensorboard import LearnerTensorboardWriter 110 | 111 | learn = image_tabular_text_learner(data, len(cont_names), len(vocab.itos), data_text) 112 | 113 | learn.callback_fns +=[partial(EarlyStoppingCallback, monitor='accuracy', min_delta=0.005, patience=3)] 114 | # learn.callback_fns += [(partial(LearnerTensorboardWriter, base_dir=Path(path + 'logs\\'), name='mixed-metadata'))] 115 | 116 | 117 | #%% 118 | data.c 119 | 120 | learn.lr_find() 121 | 122 | learn.load('mixed-300') 123 | 124 | # imgList = ImageList.from_df(petsTest, path=path, cols='PicturePath') 125 | # tabList = TabularList.from_df(petsTest, cat_names=cat_names, 
cont_names=cont_names, procs=procs, path=path) 126 | # textList = TextList.from_df(petsTest, cols='Description', path=path, vocab=vocab) 127 | 128 | # norm, denorm = normalize_custom_funcs(*imagenet_stats) 129 | 130 | # mixedTest = (MixedItemList([imgList, tabList, textList], path, inner_df=tabList.inner_df)) 131 | 132 | # learn = load_learner(path, 'mixed.pkl', test=mixedTest) 133 | 134 | pets['IsTest'] = False 135 | petsTest['IsTest'] = True 136 | 137 | petsAll = pd.concat([pets, petsTest]) 138 | petsAll[pets.IsTest == True].AdoptionSpeed = -1 139 | 140 | imgListTest = ImageList.from_df(petsAll, path=path, cols='PicturePath') 141 | tabListTest = TabularList.from_df(petsAll, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path) 142 | textListTest = TextList.from_df(petsAll, cols='Description', path=path, vocab=vocab) 143 | 144 | mixedTest = (MixedItemList([imgListTest, tabListTest, textListTest], path, inner_df=tabListTest.inner_df) 145 | .split_from_df(col='IsTest') 146 | .label_from_df(cols='AdoptionSpeed', label_cls=CategoryList) 147 | .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=size)) 148 | 149 | #%% 150 | # learn.lr_find() -------------------------------------------------------------------------------- /petfinder/data.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | import json 5 | import os 6 | import feather 7 | from pathlib import Path 8 | from pandas.io.json import json_normalize 9 | from tqdm import tqdm 10 | 11 | __all__ = ['get_data', 'quadratic_weighted_kappa'] 12 | 13 | def get_data(isTest:bool=False, useMetadata:bool=False): 14 | name = 'train' 15 | if(isTest): 16 | name = 'test' 17 | 18 | p = Path('.') 19 | 20 | petsFeather = 'pets_' + name + '.feather' 21 | if os.path.isfile(petsFeather) != True: 22 | pets = pd.read_csv(name + '\\' + name + '.csv') 23 | 24 | pImages = p / (name 
+ '_images') 25 | pSentiments = p / (name + '_sentiment') 26 | 27 | images = [x for x in pImages.iterdir()] 28 | images = pd.DataFrame([x for x in map(lambda x: (x.name.split('.')[0].split('-')[0], x.name), images)], columns=['PetID', 'Filename']) 29 | 30 | petsImages = pd.merge(pets, images, how='left', on='PetID') 31 | 32 | petsImages['NoImage'] = petsImages['Filename'].isna() 33 | petsImages['Filename'] = petsImages['Filename'].fillna('..\\NoImage.jpg') 34 | 35 | byRescuerCount = pets.groupby(['RescuerID']).PetID.nunique().reset_index().rename({'PetID': 'RescuerDogCount'}, axis=1) 36 | petsImages = pd.merge(petsImages, byRescuerCount, how='left', on='RescuerID') 37 | 38 | cat = ['Type', 'Name', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'State', 'RescuerID'] 39 | cont = ['Age', 'Fee', 'Quantity', 'RescuerDogCount', 'VideoAmt', 'PhotoAmt'] 40 | for x in cat: 41 | petsImages[x] = petsImages[x].astype('category') 42 | for x in cont: 43 | petsImages[x] = petsImages[x].astype('float') 44 | 45 | petsImages['PicturePath'] = petsImages.apply(lambda x: str(name + '_images\\' + x['Filename']), axis=1) 46 | sentimentJsons = [x for x in pSentiments.iterdir()] 47 | 48 | petsSentiments = pd.DataFrame() 49 | sentiment_feather = name + '_sentiments.feather' 50 | if os.path.isfile(sentiment_feather) != True: 51 | for s in tqdm(sentimentJsons, desc='Sentiments'): 52 | with open(s, encoding='utf8') as json_data: 53 | d = json.load(json_data) 54 | df = json_normalize(d['sentences']) 55 | line = {} 56 | m = df.mean().to_dict() 57 | line['PetID'] = s.name.split('.')[0] 58 | line['AvgSentenceSentimentMagnitude'] = m['sentiment.magnitude'] 59 | line['AvgSentenceSentimentScore'] = m['sentiment.score'] 60 | line['SentimentMagnitude'] = d['documentSentiment']['magnitude'] 61 | line['SentimentScore'] = d['documentSentiment']['score'] 62 | petsSentiments = petsSentiments.append(line, 
ignore_index=True) 63 | 64 | petsSentiments = petsSentiments.reset_index(drop=True) 65 | petsSentiments.to_feather(sentiment_feather) 66 | else: 67 | petsSentiments = feather.read_dataframe(sentiment_feather) 68 | 69 | pets = pd.merge(petsImages, petsSentiments, how='left', on='PetID') 70 | 71 | if(useMetadata): 72 | petsMetadata = pd.DataFrame() 73 | meta_feather = name + '_metadata.feather' 74 | if os.path.isfile(meta_feather) != True: 75 | pMetadata = p / (name + '_metadata') 76 | metadataJsons = [x for x in pMetadata.iterdir()] 77 | 78 | lst = [] 79 | errors = [] 80 | for s in tqdm(metadataJsons, desc='Metadata'): 81 | with open(s, encoding='utf8') as json_data: 82 | try: 83 | d = json.load(json_data) 84 | df = json_normalize(d['labelAnnotations']) 85 | df = df.set_index('description').T 86 | df['PetID'] = s.name.split('-')[0] 87 | lst.append(df.loc['score'].to_dict()) 88 | except: 89 | errors.append(s.name) 90 | petsMetadata = pd.DataFrame(lst) 91 | petsMetadata = petsMetadata.groupby('PetID').mean() 92 | petsMetadata = petsMetadata.fillna(0) 93 | 94 | petsMetadata = petsMetadata.reset_index() 95 | petsMetadata.to_feather(meta_feather) 96 | else: 97 | petsMetadata = feather.read_dataframe(meta_feather) 98 | 99 | pets = pd.merge(pets, petsMetadata, how='left', on='PetID') 100 | 101 | pets['NoDescription'] = pets['Description'].isna() 102 | pets['Description'] = pets['Description'].fillna('No description') 103 | 104 | # state GDP: https://en.wikipedia.org/wiki/List_of_Malaysian_states_by_GDP 105 | state_gdp = { 106 | 41336: 116.679, 107 | 41325: 40.596, 108 | 41367: 23.02, 109 | 41401: 190.075, 110 | 41415: 5.984, 111 | 41324: 37.274, 112 | 41332: 42.389, 113 | 41335: 52.452, 114 | 41330: 67.629, 115 | 41380: 5.642, 116 | 41327: 81.284, 117 | 41345: 80.167, 118 | 41342: 121.414, 119 | 41326: 280.698, 120 | 41361: 32.270 121 | } 122 | 123 | # state population: https://en.wikipedia.org/wiki/Malaysia 124 | state_population = { 125 | 41336: 33.48283, 126 | 41325: 
19.47651, 127 | 41367: 15.39601, 128 | 41401: 16.74621, 129 | 41415: 0.86908, 130 | 41324: 8.21110, 131 | 41332: 10.21064, 132 | 41335: 15.00817, 133 | 41330: 23.52743, 134 | 41380: 2.31541, 135 | 41327: 15.61383, 136 | 41345: 32.06742, 137 | 41342: 24.71140, 138 | 41326: 54.62141, 139 | 41361: 10.35977 140 | } 141 | 142 | pets["state_gdp"] = pets['State'].map(state_gdp) 143 | pets["state_population"] = pets['State'].map(state_population) 144 | pets["gdp_vs_population"] = pets["state_gdp"] / pets["state_population"] 145 | 146 | pets = pets.reset_index(drop=True) 147 | 148 | pets.to_feather(petsFeather) 149 | 150 | return pets 151 | else: 152 | pets = feather.read_dataframe(petsFeather) 153 | 154 | return pets 155 | 156 | # The following 3 functions have been taken from Ben Hamner's github repository 157 | # https://github.com/benhamner/Metrics 158 | def Cmatrix(rater_a, rater_b, min_rating=None, max_rating=None): 159 | """ 160 | Returns the confusion matrix between rater's ratings 161 | """ 162 | assert(len(rater_a) == len(rater_b)) 163 | if min_rating is None: 164 | min_rating = min(rater_a + rater_b) 165 | if max_rating is None: 166 | max_rating = max(rater_a + rater_b) 167 | num_ratings = int(max_rating - min_rating + 1) 168 | conf_mat = [[0 for i in range(num_ratings)] 169 | for j in range(num_ratings)] 170 | for a, b in zip(rater_a, rater_b): 171 | conf_mat[a - min_rating][b - min_rating] += 1 172 | return conf_mat 173 | 174 | 175 | def histogram(ratings, min_rating=None, max_rating=None): 176 | """ 177 | Returns the counts of each type of rating that a rater made 178 | """ 179 | if min_rating is None: 180 | min_rating = min(ratings) 181 | if max_rating is None: 182 | max_rating = max(ratings) 183 | num_ratings = int(max_rating - min_rating + 1) 184 | hist_ratings = [0 for x in range(num_ratings)] 185 | for r in ratings: 186 | hist_ratings[r - min_rating] += 1 187 | return hist_ratings 188 | 189 | 190 | def quadratic_weighted_kappa(y, y_pred): 191 | """ 192 | 
Calculates the quadratic weighted kappa 193 | axquadratic_weighted_kappa calculates the quadratic weighted kappa 194 | value, which is a measure of inter-rater agreement between two raters 195 | that provide discrete numeric ratings. Potential values range from -1 196 | (representing complete disagreement) to 1 (representing complete 197 | agreement). A kappa value of 0 is expected if all agreement is due to 198 | chance. 199 | quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b 200 | each correspond to a list of integer ratings. These lists must have the 201 | same length. 202 | The ratings should be integers, and it is assumed that they contain 203 | the complete range of possible ratings. 204 | quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating 205 | is the minimum possible rating, and max_rating is the maximum possible 206 | rating 207 | """ 208 | rater_a = y 209 | rater_b = y_pred 210 | min_rating=None 211 | max_rating=None 212 | rater_a = np.array(rater_a, dtype=int) 213 | rater_b = np.array(rater_b, dtype=int) 214 | assert(len(rater_a) == len(rater_b)) 215 | if min_rating is None: 216 | min_rating = min(min(rater_a), min(rater_b)) 217 | if max_rating is None: 218 | max_rating = max(max(rater_a), max(rater_b)) 219 | conf_mat = Cmatrix(rater_a, rater_b, 220 | min_rating, max_rating) 221 | num_ratings = len(conf_mat) 222 | num_scored_items = float(len(rater_a)) 223 | 224 | hist_rater_a = histogram(rater_a, min_rating, max_rating) 225 | hist_rater_b = histogram(rater_b, min_rating, max_rating) 226 | 227 | numerator = 0.0 228 | denominator = 0.0 229 | 230 | for i in range(num_ratings): 231 | for j in range(num_ratings): 232 | expected_count = (hist_rater_a[i] * hist_rater_b[j] 233 | / num_scored_items) 234 | d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0) 235 | numerator += d * conf_mat[i][j] / num_scored_items 236 | denominator += d * expected_count / num_scored_items 237 | 238 | return (1.0 - numerator / denominator) 
-------------------------------------------------------------------------------- /Structured Only.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "The autoreload extension is already loaded. To reload it, use:\n", 13 | " %reload_ext autoreload\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import warnings\n", 19 | "warnings.filterwarnings('ignore')\n", 20 | "\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "import json\n", 25 | "import os\n", 26 | "import feather\n", 27 | "from fastai.text import *\n", 28 | "\n", 29 | "from petfinder.data import *\n", 30 | "\n", 31 | "%matplotlib inline\n", 32 | "%load_ext autoreload\n", 33 | "%autoreload 2" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 18, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "path = 'C:\\\\work\\\\ML\\\\PetFinder\\\\'\n", 43 | "bs=64\n", 44 | "\n", 45 | "pets = get_data(isTest=False)\n", 46 | "petsTest = get_data(isTest=True)\n", 47 | "\n", 48 | "petsTest['AdoptionSpeed'] = 0\n", 49 | "\n", 50 | "pets.AdoptionSpeed = pets.AdoptionSpeed.astype(float)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 19, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "from fastai.tabular import *\n", 60 | "from fastai.vision import *\n", 61 | "from fastai.metrics import *\n", 62 | "from fastai.text import *\n", 63 | "\n", 64 | "dep_var = 'AdoptionSpeed'\n", 65 | "cont_names, cat_names = cont_cat_split(pets, dep_var=dep_var)\n", 66 | "procs = [FillMissing, Categorify, Normalize]\n", 67 | "cat_names.remove('Filename')\n", 68 | "cat_names.remove('PicturePath')\n", 69 | "cat_names.remove('PetID')\n", 70 | "cat_names.remove('Description')" 71 | ] 72 | }, 73 
| { 74 | "cell_type": "code", 75 | "execution_count": 20, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "(['Type',\n", 82 | " 'Name',\n", 83 | " 'Breed1',\n", 84 | " 'Breed2',\n", 85 | " 'Gender',\n", 86 | " 'Color1',\n", 87 | " 'Color2',\n", 88 | " 'Color3',\n", 89 | " 'MaturitySize',\n", 90 | " 'FurLength',\n", 91 | " 'Vaccinated',\n", 92 | " 'Dewormed',\n", 93 | " 'Sterilized',\n", 94 | " 'Health',\n", 95 | " 'State',\n", 96 | " 'RescuerID',\n", 97 | " 'NoImage',\n", 98 | " 'NoDescription'],\n", 99 | " ['Age',\n", 100 | " 'Quantity',\n", 101 | " 'Fee',\n", 102 | " 'VideoAmt',\n", 103 | " 'PhotoAmt',\n", 104 | " 'RescuerDogCount',\n", 105 | " 'AvgSentenceSentimentMagnitude',\n", 106 | " 'AvgSentenceSentimentScore',\n", 107 | " 'SentimentMagnitude',\n", 108 | " 'SentimentScore',\n", 109 | " 'state_gdp',\n", 110 | " 'state_population',\n", 111 | " 'gdp_vs_population'])" 112 | ] 113 | }, 114 | "execution_count": 20, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "cat_names, cont_names" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 21, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "byPetID = pets.groupby('PetID').size().reset_index()\n", 130 | "byPetID = byPetID.sample(frac=.1).drop([0], axis=1)\n", 131 | "byPetID['IsValidation'] = True\n", 132 | "pets = pd.merge(pets, byPetID, how='left', on='PetID')\n", 133 | "pets.IsValidation = pets.IsValidation.fillna(False)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 29, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "data = (TabularList.from_df(pets, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)\n", 143 | " .split_from_df(col='IsValidation')\n", 144 | " .label_from_df(cols=dep_var, label_cls=FloatList)\n", 145 | " .databunch())" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 30, 
151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "kappa = KappaScore()\n", 155 | "kappa.weights = \"quadratic\"\n", 156 | "learn = tabular_learner(data, layers=[200,100], metrics=[rmse], y_range=[0, 4])\n", 157 | "# learn.loss = MSELossFlat\n", 158 | "\n", 159 | "learn = learn.to_fp16()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 31, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "# learn.lr_find()\n", 169 | "# learn.recorder.plot()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 32, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/html": [ 180 | "Total time: 05:13

\n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | "
epochtrain_lossvalid_lossroot_mean_squared_errortime
00.4270181.1965361.07228201:22
10.2172841.1243431.03763501:16
20.0975551.1370391.04307601:16
30.0489811.1930351.06629501:17
" 221 | ], 222 | "text/plain": [ 223 | "" 224 | ] 225 | }, 226 | "metadata": {}, 227 | "output_type": "display_data" 228 | } 229 | ], 230 | "source": [ 231 | "learn.fit_one_cycle(4, 1e-2)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 33, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "preds,y = learn.get_preds(ds_type=DatasetType.Valid)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 35, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "preds = preds.numpy().round()" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 36, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "# preds = torch.softmax(preds, dim=1).argmax(1).numpy()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 37, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "predictions = pets[pets.IsValidation == True]\n", 268 | "predictions['Prediction'] = preds\n", 269 | "predictions = predictions.groupby('PetID').mean().round()[['Prediction', 'AdoptionSpeed']]\n", 270 | "preds, y = predictions['Prediction'], predictions['AdoptionSpeed']" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 38, 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "0.372985251366838" 282 | ] 283 | }, 284 | "execution_count": 38, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "quadratic_weighted_kappa(preds, y)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 16, 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "(58652, 33)" 302 | ] 303 | }, 304 | "execution_count": 16, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "pets.shape" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | 
"execution_count": 30, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "name='train'\n", 320 | "p = pd.read_csv(name + '\\\\' + name + '.csv')" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 31, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "(14993, 24)" 332 | ] 333 | }, 334 | "execution_count": 31, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "p.shape" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [] 349 | } 350 | ], 351 | "metadata": { 352 | "kernelspec": { 353 | "display_name": "Python 3", 354 | "language": "python", 355 | "name": "python3" 356 | }, 357 | "language_info": { 358 | "codemirror_mode": { 359 | "name": "ipython", 360 | "version": 3 361 | }, 362 | "file_extension": ".py", 363 | "mimetype": "text/x-python", 364 | "name": "python", 365 | "nbconvert_exporter": "python", 366 | "pygments_lexer": "ipython3", 367 | "version": "3.7.1" 368 | } 369 | }, 370 | "nbformat": 4, 371 | "nbformat_minor": 2 372 | } 373 | -------------------------------------------------------------------------------- /Fastai PetFinder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Merge image, structured and text data in the same neural net with fast.ai\n", 8 | "\n", 9 | "In this notebook we will predict the adoption speed of pets in the [PetFinder Kaggle competition](https://www.kaggle.com/c/petfinder-adoption-prediction/). 
This competition gives access to three kinds of data: **images** of the pets, **structured** data like their age, breed, color etc and finally **text** data in the form of a description of the pet.\n", 10 | "\n", 11 | "It would be very interesting to be able to merge all this data inside the same neural network so that the network can use whatever information from all data to actually predict how fast a pet is going to get adopted.\n", 12 | "\n", 13 | "Keep in mind that **this is my first Kaggle competition**, so I might not be using the best strategies or validation schemes, but I just wanted to explore this idea of merging different types of data inside the same neural network.\n", 14 | "\n", 15 | "## Fast.ai\n", 16 | "We are going to use fast.ai to do that because it offers a lot of stuff we need to do this. Mainly a very intuitive [data block](https://docs.fast.ai/data_block.html) that we will use to get our various data from disk, line them up and pass them as input to our neural network. It also provides easily accessible pre-trained models we will be able to use for our tasks.\n", 17 | "\n", 18 | "## Leveraging pre-trained models\n", 19 | "![caption](Diagram.jpg)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import warnings\n", 29 | "warnings.filterwarnings('ignore')\n", 30 | "\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import numpy as np\n", 33 | "import pandas as pd\n", 34 | "import json\n", 35 | "import os\n", 36 | "import feather\n", 37 | "from fastai.text import *\n", 38 | "\n", 39 | "from petfinder.data import *\n", 40 | "\n", 41 | "%matplotlib inline\n", 42 | "%load_ext autoreload\n", 43 | "%autoreload 2" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Get the structured data\n", 51 | "The method get_data contains all the rather boring data wrangling stuff. 
We open the structured data train.csv where we have information for each pet (identified by a PetID). We have information like the age of the pet, the breed, the color, whether it was vaccinated, a textual description of the pet etc. The PetFinder competition also ran the description through the Google sentiment analysis service and provided us with the result. I use some of this information and create some new columns for that too.\n", 52 | "\n", 53 | "We also find images in the train_images folder. We create a dataframe where we have a row containing the PetID of the image and the path on disk of the image. We then merge this dataframe with the main structured data by PetID. This yields a dataframe with one row per image where all the structured information about the pet is there for each row.\n", 54 | "\n", 55 | "Kaggle also provided some metadata for each pet, but I didn't spend the time parsing those files...\n", 56 | "\n", 57 | "We have to predict one of 5 AdoptionSpeed values. This is a classification problem, but a lot of people in the competition used a regression and then found the best rounding using the class OptimizedRounder at the end of this notebook. I tried using multi-class classification with this model but didn't have good results." 
58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "path = 'C:\\\\work\\\\ML\\\\PetFinder\\\\'\n", 67 | "bs=64\n", 68 | "\n", 69 | "pets = get_data(isTest=False)\n", 70 | "petsTest = get_data(isTest=True)\n", 71 | "\n", 72 | "pets.AdoptionSpeed = pets.AdoptionSpeed.astype(float)\n", 73 | "\n", 74 | "petsTest['AdoptionSpeed'] = 0" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# Language Model\n", 82 | "\n", 83 | "See the notebook *PetFinder Language Model* for how we train and fine-tune a text language model on the pet descriptions." 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "# Structured data\n", 91 | "\n", 92 | "Here we have some decisions to make for our structured variables. We need to decide which ones are going to be categorical variables and which ones are going to be continuous.\n", 93 | "\n", 94 | "Just because a variable is a number doesn't mean it should be a continuous variable. If the variable only contains a small number of unique values, it might be better to model it as a categorical variable. We can use [embeddings](https://www.fast.ai/2018/04/29/categorical-embeddings/) for categorical data which will allow us to learn a far richer representation for them and is sometimes more powerful than using a continuous variable.\n", 95 | "\n", 96 | "Fastai takes care of defining the embedding sizes; it also fills missing values and normalizes the structured data for the neural network." 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 3, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "from fastai.tabular import *\n", 106 | "from fastai.vision import *\n", 107 | "from fastai.metrics import *\n", 108 | "from fastai.text import *\n", 109 | "\n", 110 | "dep_var = 'AdoptionSpeed'\n", 111 | "cont_names, cat_names = cont_cat_split(pets, dep_var=dep_var, max_card=10)\n", 112 | "procs = [FillMissing, Categorify, Normalize]\n", 113 | "cat_names.remove('Filename')\n", 114 | "cat_names.remove('PicturePath')\n", 115 | "cat_names.remove('PetID')\n", 116 | "cat_names.remove('Description')\n", 117 | "\n", 118 | "# for name in cont_names:\n", 119 | "# pets[name] = np.log(pets[name] - pets[name].min() + 1)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 4, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "(['Age',\n", 131 | " 'Quantity',\n", 132 | " 'Fee',\n", 133 | " 'VideoAmt',\n", 134 | " 'PhotoAmt',\n", 135 | " 'RescuerDogCount',\n", 136 | " 'AvgSentenceSentimentMagnitude',\n", 137 | " 'AvgSentenceSentimentScore',\n", 138 | " 'SentimentMagnitude',\n", 139 | " 'SentimentScore',\n", 140 | " 'state_gdp',\n", 141 | " 'state_population',\n", 142 | " 'gdp_vs_population'],\n", 143 | " ['Type',\n", 144 | " 'Name',\n", 145 | " 'Breed1',\n", 146 | " 'Breed2',\n", 147 | " 'Gender',\n", 148 | " 'Color1',\n", 149 | " 'Color2',\n", 150 | " 'Color3',\n", 151 | " 'MaturitySize',\n", 152 | " 'FurLength',\n", 153 | " 'Vaccinated',\n", 154 | " 'Dewormed',\n", 155 | " 'Sterilized',\n", 156 | " 'Health',\n", 157 | " 'State',\n", 158 | " 'RescuerID',\n", 159 | " 'NoImage',\n", 160 | " 'NoDescription'])" 161 | ] 162 | }, 163 | "execution_count": 4, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "cont_names, cat_names" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 5, 175 | "metadata": 
{}, 176 | "outputs": [], 177 | "source": [ 178 | "from petfinder.model import *" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "# Loading and lining up the data\n", 186 | "\n", 187 | "We want to load our data. Ideally we would like to re-use existing functionality and not have to write a custom data loader. fast.ai has us covered, thanks to the amazing [data block api](https://docs.fast.ai/data_block.html)!\n", 188 | "\n", 189 | "First we need an ItemList per type of data: one each for image, structured and text data. Each of them pre-processes its input and keeps track of the processing it does on the data, like normalization etc.\n", 190 | "\n", 191 | "But then we merge them using a MixedItemList. MixedItemList simply gets an item from each ItemList it contains and merges them together into one Item. Then when fast.ai passes data to our model in the forward method, we can expect as many inputs as we have ItemLists in our MixedItemList.\n", 192 | "\n", 193 | "I pickle the MixedItemList to avoid having to recompute it when I reload the notebook because some of the ItemList pre-processing can be long (like TextList)." 
194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 6, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "byPetID = pets.groupby('PetID').size().reset_index()\n", 203 | "byPetID = byPetID.sample(frac=.1, random_state=42).drop([0], axis=1)\n", 204 | "byPetID['IsValidation'] = True\n", 205 | "pets = pd.merge(pets, byPetID, how='left', on='PetID')\n", 206 | "pets.IsValidation = pets.IsValidation.fillna(False)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 7, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "from fastai.callbacks import *\n", 216 | "\n", 217 | "bs = 32\n", 218 | "size = 224\n", 219 | "np.random.seed(42)\n", 220 | "\n", 221 | "data_lm = load_data(path, 'data_lm_descriptions.pkl', bs=bs)\n", 222 | "vocab = data_lm.vocab\n", 223 | "\n", 224 | "imgList = ImageList.from_df(pets, path=path, cols='PicturePath')\n", 225 | "tabList = TabularList.from_df(pets, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path)\n", 226 | "textList = TextList.from_df(pets, cols='Description', path=path, vocab=vocab)\n", 227 | "\n", 228 | "if os.path.isfile(path + 'mixed_img_tab_text.pkl') != True :\n", 229 | " mixed = (MixedItemList([imgList, tabList, textList], path, inner_df=tabList.inner_df)\n", 230 | " .split_from_df(col='IsValidation')\n", 231 | " .label_from_df(cols='AdoptionSpeed', label_cls=FloatList)\n", 232 | " .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=size))\n", 233 | "\n", 234 | " outfile = open(path + 'mixed_img_tab_text.pkl', 'wb')\n", 235 | " pickle.dump(mixed, outfile)\n", 236 | " outfile.close()\n", 237 | "else:\n", 238 | " infile = open(path + 'mixed_img_tab_text.pkl','rb')\n", 239 | " mixed = pickle.load(infile)\n", 240 | " infile.close()" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "This makes a text databunch used later on to create our learner (for the text 
portion of our learner). We need this to construct a pre-trained RNN for classification." 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 8, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "if os.path.isfile(path + 'text-classification-databunch.pkl'):\n", 257 | " data_text = load_data(path, 'text-classification-databunch.pkl')\n", 258 | "else:\n", 259 | " petsAll = pd.concat([pets, petsTest])\n", 260 | " petsAll = petsAll.dropna(subset=['Description'])\n", 261 | " \n", 262 | " data_text = (TextList.from_df(petsAll, cols='Description', path=path, vocab=vocab)).split_none().label_from_df(cols='AdoptionSpeed').databunch(bs=bs)\n", 263 | " data_text.save('text-classification-databunch.pkl')" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "# Special functions\n", 271 | "Neural network frameworks like to process data in batches. Batches have to have a pre-defined size. In our case we are using image and structured data which should always have the same size, but our text data can vary in size. The description for each pet will be different.\n", 272 | "\n", 273 | "We have to modify some function in fastai to make it work with our inputs. First since we are using a pre-trained resnet34 network for our images, we need to normalize our images using statistics from ImageNet. But the normalize method for images from fastai expects a certain tensor shape. 
We need to create a custom normalize function to take into account our custom tensor shape.\n", 274 | "\n", 275 | "Each row in our batch will contain an array of stuff, first the image data, then the structured data and last the text data.\n", 276 | "\n", 277 | "``` python\n", 278 | "\n", 279 | "def _normalize_images_batch(b:Tuple[Tensor,Tensor], mean:FloatTensor, std:FloatTensor)->Tuple[Tensor,Tensor]:\n", 280 | " \"`b` = `x`,`y` - normalize `x` array of imgs and `do_y` optionally `y`.\"\n", 281 | " x,y = b\n", 282 | " mean,std = mean.to(x[0].device),std.to(x[0].device)\n", 283 | " x[0] = normalize(x[0],mean,std)\n", 284 | " return x,y\n", 285 | "\n", 286 | "def normalize_custom_funcs(mean:FloatTensor, std:FloatTensor, do_x:bool=True, do_y:bool=False)->Tuple[Callable,Callable]:\n", 287 | " \"Create normalize/denormalize func using `mean` and `std`, can specify `do_y` and `device`.\"\n", 288 | " mean,std = tensor(mean),tensor(std)\n", 289 | " return (partial(_normalize_images_batch, mean=mean, std=std),\n", 290 | " partial(denormalize, mean=mean, std=std))\n", 291 | "```\n", 292 | "\n", 293 | "**collate_mixed** is the method responsible to take a batch with variable size rows (because of the variable Description text size) and make them all of equal length so that we can have uniform batch sizes. 
We basically find the row in the batch which has the longest text, take its length and make all other rows the same length by padding them with zeroes at the end.\n", 294 | "\n", 295 | "``` python\n", 296 | "def collate_mixed(samples, pad_idx:int=0):\n", 297 | " # Find max length of the text from the MixedItemList\n", 298 | " max_len = max([len(s[0].data[2]) for s in samples])\n", 299 | "\n", 300 | " for s in samples:\n", 301 | " res = np.zeros(max_len + pad_idx, dtype=np.int64)\n", 302 | " res[:len(s[0].data[2])] = s[0].data[2]\n", 303 | " s[0].data[2] = res\n", 304 | "\n", 305 | " return data_collate(samples)\n", 306 | "```\n", 307 | "\n", 308 | "Then we transform our MixedItemList into a databunch with our collate function for equal size batches and we also normalize the images using our custom normalize function from earlier." 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 9, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "data = mixed.databunch(bs=bs, collate_fn=collate_mixed)\n", 318 | "\n", 319 | "norm, denorm = normalize_custom_funcs(*imagenet_stats)\n", 320 | "data.add_tfm(norm) # normalize images" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": {}, 326 | "source": [ 327 | "When fastai processes your structured data, it creates a new column for every column that had NaN values. This new column is True when the other column was NaN, otherwise False. If you want to use those columns, simply uncomment the next cell." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 10, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# cat_names = mixed.train.x.item_lists[1].cat_names\n", 337 | "# cont_names = mixed.train.x.item_lists[1].cont_names" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "# Custom model\n", 345 | "Here is the custom PyTorch model I created. 
It expects a list of embedding sizes, one for each categorical variable (emb_szs), the number of continuous variables (n_cont), the size of the text vocabulary for the language model (vocab_sz) and finally our pre-trained language model encoder (encoder).\n", 346 | "\n", 347 | "**self.cnn** is responsible for the image data. Notice that we use AdaptiveConcatPool2d to be able to have any image size as input.\n", 348 | "\n", 349 | "**self.lm_encoder** is responsible for the text data. It uses the fine-tuned language model encoder we trained in the notebook PetFinder Language Model.\n", 350 | "\n", 351 | "**self.tab** is responsible for the structured data. It will create embeddings for categorical variables.\n", 352 | "\n", 353 | "**self.reduce** simply reduces the output of the cnn to a more manageable size.\n", 354 | "\n", 355 | "Once the data is passed through each specialist network (cnn, encoder and tabular), we concatenate their outputs into a single vector.\n", 356 | "\n", 357 | "**self.merge and self.final** are then responsible for reducing this concatenated vector to a single output value, which we use as a regression estimate of the AdoptionSpeed we want to predict. 
AdoptionSpeed is a categorical variable with 5 unique values.\n", 358 | "\n", 359 | "**use_trainer** is set to true if we are using RNNTrainer\n", 360 | "\n", 361 | "The **reset** method is used to reset the internal state of the RNN in self.lm_encoder.\n", 362 | "\n", 363 | "We are outputing one output for regression and forcing it in the range 0-4.\n", 364 | "\n", 365 | "``` python\n", 366 | "class ImageTabularTextModel(nn.Module):\n", 367 | " def __init__(self, emb_szs:ListSizes, n_cont:int, vocab_sz:int, encoder, use_trainer):\n", 368 | " super().__init__()\n", 369 | " self.use_trainer = use_trainer\n", 370 | " self.cnn = create_body(models.resnet34)\n", 371 | " nf = num_features_model(self.cnn) * 2\n", 372 | " drop = .5\n", 373 | "\n", 374 | " self.lm_encoder = SequentialRNN(encoder[0], PoolingLinearClassifier([400 * 3] + [32], [.4]))\n", 375 | "\n", 376 | " self.tab = TabularModel(emb_szs, n_cont, 128, [512, 256])\n", 377 | "\n", 378 | " self.reduce = nn.Sequential(*([AdaptiveConcatPool2d(), Flatten()] + bn_drop_lin(nf, 512, bn=True, p=drop, actn=nn.ReLU(inplace=True))))\n", 379 | " self.merge = nn.Sequential(*bn_drop_lin(512 + 128 + 32, 128, bn=True, p=drop, actn=nn.ReLU(inplace=True)))\n", 380 | " self.final = nn.Sequential(*bn_drop_lin(128, 1, bn=False, p=0., actn=None))\n", 381 | "\n", 382 | " def forward(self, img:Tensor, x:Tensor, text:Tensor) -> Tensor:\n", 383 | " imgCnn = self.cnn(img)\n", 384 | " imgLatent = self.reduce(imgCnn)\n", 385 | " tabLatent = self.tab(x[0], x[1])\n", 386 | " textLatent = self.lm_encoder(text)\n", 387 | "\n", 388 | " cat = torch.cat([imgLatent, F.relu(tabLatent), F.relu(textLatent[0])], dim=1)\n", 389 | "\n", 390 | " pred = self.final(self.merge(cat))\n", 391 | " pred = torch.sigmoid(pred) * 4 # making sure this is in the range 0-4\n", 392 | "\n", 393 | " if(not self.use_trainer):\n", 394 | " return pred\n", 395 | " else:\n", 396 | " return pred, textLatent\n", 397 | " \n", 398 | " def reset(self):\n", 399 | " for c in 
self.children():\n", 400 | " if hasattr(c, 'reset'): c.reset()\n", 401 | "```\n", 402 | "\n", 403 | "# Custom learner functions\n", 404 | "\n", 405 | "We need a split_layers function to tell fastai how to split the layers when doing [discriminative learning rates](https://towardsdatascience.com/understanding-learning-rates-and-how-it-improves-performance-in-deep-learning-d0d4059c1c10). This is also what determines which layers to freeze when we call the Learner.freeze method. This one could certainly be better... Looking at other split layers for the pre-trained RNN and resnet, we should probably structure this differently.\n", 406 | "\n", 407 | "``` python\n", 408 | "def split_layers(model:nn.Module) -> List[nn.Module]:\n", 409 | " groups = [[model.cnn, model.lm_encoder]]\n", 410 | " groups += [[model.tab, model.reduce, model.merge, model.final]]\n", 411 | " return groups\n", 412 | "```\n", 413 | "\n", 414 | "We create our custom Learner class to be able to set some custom parameters. I added an option to use RNNTrainer which is supposed to help if the language model is overfitting. It is based on the [AWD_LSTM paper](https://arxiv.org/abs/1708.02182). 
I had to modify the default version because of how I was passing data to it.\n", 415 | "\n", 416 | "``` python\n", 417 | "class RNNTrainerCustom(RNNTrainer):\n", 418 | " def on_loss_begin(self, last_output:Tuple[Tensor,Tensor,Tensor], **kwargs):\n", 419 | " \"Save the extra outputs for later and only returns the true output.\"\n", 420 | " self.raw_out,self.out = last_output[1][1],last_output[1][2]\n", 421 | " return {'last_output': last_output[0]}\n", 422 | "\n", 423 | "class ImageTabularTextLearner(Learner):\n", 424 | " def __init__(self, data:DataBunch, model:nn.Module, use_trainer:bool=False, alpha:float=2., beta:float=1., **learn_kwargs):\n", 425 | " super().__init__(data, model, **learn_kwargs)\n", 426 | " if(use_trainer):\n", 427 | " self.callbacks.append(RNNTrainerCustom(self, alpha=alpha, beta=beta))\n", 428 | " self.split(split_layers)\n", 429 | "```\n", 430 | "\n", 431 | "Finally an helper method constructing our model and learner. We use the text_classifier_learner method from fastai to construct a pre-trained language model where we load our fine-tuned encoder. This method returns a learner though, but we only care about the model it returns which we use in our own model.\n", 432 | "\n", 433 | "The metric this Kaggle competition [evaluate on the quadratic weighted kappa](https://www.kaggle.com/c/petfinder-adoption-prediction/overview/evaluation). 
So we will track it to see how we are doing.\n", 434 | "\n", 435 | "``` python\n", 436 | "def image_tabular_text_learner(data, len_cont_names, vocab_sz, data_lm, use_trainer:bool=False):\n", 437 | " l = text_classifier_learner(data_lm, AWD_LSTM, drop_mult=0.5)\n", 438 | " l.load_encoder('fine_tuned_enc')\n", 439 | "\n", 440 | " emb = data.train_ds.x.item_lists[1].get_emb_szs()\n", 441 | " model = ImageTabularTextModel(emb, len_cont_names, vocab_sz, l.model, use_trainer)\n", 442 | "\n", 443 | " learn = ImageTabularTextLearner(data, model, use_trainer, metrics=[mae])\n", 444 | " return learn\n", 445 | "```" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 11, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "learn = image_tabular_text_learner(data, len(cont_names), len(vocab.itos), data_text, use_trainer=True)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 12, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "# learn.callback_fns +=[partial(EarlyStoppingCallback, monitor='accuracy', min_delta=0.005, patience=3)]\n", 464 | "# learn.callback_fns += [(partial(LearnerTensorboardWriter, base_dir=Path(path + 'logs\\\\'), name='mixed-metadata'))]" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 13, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "data": { 474 | "text/plain": [ 475 | "1" 476 | ] 477 | }, 478 | "execution_count": 13, 479 | "metadata": {}, 480 | "output_type": "execute_result" 481 | } 482 | ], 483 | "source": [ 484 | "data.c" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 14, 490 | "metadata": {}, 491 | "outputs": [], 492 | "source": [ 493 | "# learn.lr_find()\n", 494 | "# learn.recorder.plot()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 15, 500 | "metadata": {}, 501 | "outputs": [], 502 | "source": [ 503 | "lr = 1e-3" 504 | ] 505 | }, 506 | { 507 | "cell_type": 
"code", 508 | "execution_count": 16, 509 | "metadata": {}, 510 | "outputs": [ 511 | { 512 | "data": { 513 | "text/html": [ 514 | "Total time: 20:36

\n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | "
epochtrain_lossvalid_lossmean_absolute_errortime
00.4266301.1405550.85783610:22
10.1817551.1488880.83764210:13
" 541 | ], 542 | "text/plain": [ 543 | "" 544 | ] 545 | }, 546 | "metadata": {}, 547 | "output_type": "display_data" 548 | }, 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "Better model found at epoch 0 with mean_absolute_error value: 0.8578364849090576.\n", 554 | "Better model found at epoch 1 with mean_absolute_error value: 0.8376424312591553.\n" 555 | ] 556 | } 557 | ], 558 | "source": [ 559 | "# learn.to_fp16 doesn't work with this model for some reason\n", 560 | "# learn = learn.to_fp16()\n", 561 | "learn.freeze()\n", 562 | "learn.fit_one_cycle(2, lr, callbacks=SaveModelCallback(learn, every='improvement', mode='min', monitor='mean_absolute_error', name='mixed'))" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "learn.purge()" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 22, 577 | "metadata": {}, 578 | "outputs": [], 579 | "source": [ 580 | "# learn.lr_find()\n", 581 | "# learn.recorder.plot()" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "bs=8\n", 591 | "data = mixed.databunch(bs=bs, collate_fn=collate_mixed)\n", 592 | "\n", 593 | "norm, denorm = normalize_custom_funcs(*imagenet_stats)\n", 594 | "data.add_tfm(norm) # normalize images\n", 595 | "\n", 596 | "learn = image_tabular_text_learner(data, len(cont_names), len(vocab.itos), data_text, use_trainer=True)\n", 597 | "# learn.callback_fns +=[partial(EarlyStoppingCallback, monitor='kappa_score', min_delta=0.005, patience=3)]\n", 598 | "learn.load('mixed-unfrozen')" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [ 606 | { 607 | "data": { 608 | "text/html": [ 609 | "\n", 610 | "

\n", 611 | " \n", 623 | " \n", 624 | " 25.00% [1/4 45:29<2:16:27]\n", 625 | "
\n", 626 | " \n", 627 | "\n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | "
epochtrain_lossvalid_lossmean_absolute_errorroot_mean_squared_errortime
00.2529651.1490330.8574930.96154845:28

\n", 649 | "\n", 650 | "

\n", 651 | " \n", 663 | " \n", 664 | " 57.66% [3794/6580 25:16<18:33 0.2089]\n", 665 | "
\n", 666 | " " 667 | ], 668 | "text/plain": [ 669 | "" 670 | ] 671 | }, 672 | "metadata": {}, 673 | "output_type": "display_data" 674 | }, 675 | { 676 | "name": "stdout", 677 | "output_type": "stream", 678 | "text": [ 679 | "Better model found at epoch 0 with mean_absolute_error value: 0.8574932813644409.\n" 680 | ] 681 | } 682 | ], 683 | "source": [ 684 | "learn.unfreeze()\n", 685 | "learn.fit_one_cycle(4, max_lr=slice(1e-6,1e-4), callbacks=SaveModelCallback(learn, every='improvement', mode='min', monitor='mean_absolute_error', name='mixed-unfrozen'))" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": null, 691 | "metadata": {}, 692 | "outputs": [], 693 | "source": [ 694 | "learn.load('mixed')" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 24, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "p,y = learn.get_preds(ds_type=DatasetType.Valid)" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 25, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "from petfinder.test import *" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 37, 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [ 721 | "optR = OptimizedRounder()\n", 722 | "optR.fit(p.numpy()[:, 0], y.numpy())\n", 723 | "coeff = optR.coefficients()" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 69, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "preds = optR.predict(p.numpy()[:, 0], coeff).astype(int)" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 71, 738 | "metadata": {}, 739 | "outputs": [], 740 | "source": [ 741 | "predictions = pets[pets.IsValidation == True][['PetID', 'AdoptionSpeed']]\n", 742 | "predictions['Prediction'] = preds\n", 743 | "predictions = predictions.groupby('PetID').mean()[['Prediction', 'AdoptionSpeed']]\n", 744 | "# preds, y = predictions['Prediction'], 
predictions['AdoptionSpeed']" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 72, 750 | "metadata": {}, 751 | "outputs": [ 752 | { 753 | "data": { 754 | "text/plain": [ 755 | "0.4232357217350937" 756 | ] 757 | }, 758 | "execution_count": 72, 759 | "metadata": {}, 760 | "output_type": "execute_result" 761 | } 762 | ], 763 | "source": [ 764 | "quadratic_weighted_kappa(predictions['Prediction'], predictions['AdoptionSpeed'])" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "# Generating a submission for the competition\n", 772 | "\n", 773 | "Unfortunately fastai export does not support MixedItemList yet. So to test my code on the test set I had to trick fastai into thinking that the test set is actually the validation set. I just set all labels of the test set to be 0." 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 74, 779 | "metadata": {}, 780 | "outputs": [], 781 | "source": [ 782 | "pets['IsTest'] = False\n", 783 | "petsTest['IsTest'] = True\n", 784 | "petsTest['AdoptionSpeed'] = 0\n", 785 | "\n", 786 | "petsAll = pd.concat([pets, petsTest])" 787 | ] 788 | }, 789 | { 790 | "cell_type": "markdown", 791 | "metadata": {}, 792 | "source": [ 793 | "This is pretty much the same code as training, but here we use .split_from_df(col='IsTest') to tell fastai that the validation set consists only of the rows in the dataframe where the column IsTest is True." 
794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 75, 799 | "metadata": {}, 800 | "outputs": [], 801 | "source": [ 802 | "imgListTest = ImageList.from_df(petsAll, path=path, cols='PicturePath')\n", 803 | "tabListTest = TabularList.from_df(petsAll, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path)\n", 804 | "textListTest = TextList.from_df(petsAll, cols='Description', path=path, vocab=vocab)\n", 805 | "\n", 806 | "mixedTest = (MixedItemList([imgListTest, tabListTest, textListTest], path, inner_df=tabListTest.inner_df)\n", 807 | " .split_from_df(col='IsTest')\n", 808 | " .label_from_df(cols='AdoptionSpeed', label_cls=FloatList)\n", 809 | " .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=size))" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [ 818 | "dataTest = mixedTest.databunch(bs=bs, collate_fn=collate_mixed)\n", 819 | "dataTest.add_tfm(norm) # normalize images\n", 820 | "\n", 821 | "learn = image_tabular_text_learner(dataTest, len(cont_names), len(vocab.itos), data_text, use_trainer=True)\n", 822 | "learn.load('mixed')" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": 107, 828 | "metadata": {}, 829 | "outputs": [], 830 | "source": [ 831 | "preds,y = learn.get_preds(ds_type=DatasetType.Valid)" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": 112, 837 | "metadata": {}, 838 | "outputs": [], 839 | "source": [ 840 | "p,y = preds.numpy()[:, 0], y.numpy()" 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 118, 846 | "metadata": {}, 847 | "outputs": [], 848 | "source": [ 849 | "optR = OptimizedRounder()\n", 850 | "preds = optR.predict(p, coeff).astype(int)" 851 | ] 852 | }, 853 | { 854 | "cell_type": "code", 855 | "execution_count": 139, 856 | "metadata": {}, 857 | "outputs": [], 858 | "source": [ 859 | "predictions = 
petsTest\n", 860 | "predictions['AdoptionSpeed'] = preds\n", 861 | "predictions = predictions.groupby('PetID').mean()['AdoptionSpeed'].reset_index()\n", 862 | "predictions['AdoptionSpeed'] = predictions['AdoptionSpeed'].astype(int)\n", 863 | "predictions.to_csv('submission.csv', index=False)" 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [] 872 | } 873 | ], 874 | "metadata": { 875 | "kernelspec": { 876 | "display_name": "Python 3", 877 | "language": "python", 878 | "name": "python3" 879 | }, 880 | "language_info": { 881 | "codemirror_mode": { 882 | "name": "ipython", 883 | "version": 3 884 | }, 885 | "file_extension": ".py", 886 | "mimetype": "text/x-python", 887 | "name": "python", 888 | "nbconvert_exporter": "python", 889 | "pygments_lexer": "ipython3", 890 | "version": "3.7.1" 891 | } 892 | }, 893 | "nbformat": 4, 894 | "nbformat_minor": 2 895 | } 896 | -------------------------------------------------------------------------------- /PetFinder Language Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import warnings\n", 10 | "warnings.filterwarnings('ignore')\n", 11 | "\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import numpy as np\n", 14 | "import pandas as pd\n", 15 | "import json\n", 16 | "import os\n", 17 | "import feather\n", 18 | "from fastai.text import *\n", 19 | "\n", 20 | "from petfinder.data import *\n", 21 | "\n", 22 | "%matplotlib inline\n", 23 | "%load_ext autoreload\n", 24 | "%autoreload 2" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "path = 'C:\\\\work\\\\ML\\\\PetFinder\\\\'\n", 34 | "bs=64\n", 35 | "\n", 36 | "pets = get_data(isTest=False)\n", 37 | "petsTest = 
get_data(isTest=True)\n", 38 | "\n", 39 | "petsTest['AdoptionSpeed'] = 0" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Language Model\n", 47 | "\n", 48 | "First let's handle our language model. In fast.ai, you can use a pre-trained language model called ULMFiT, trained on Wikipedia text to predict the next word in a sentence. In our case we don't want to predict the next word in a sentence, but we want to use what this language model learned to help us with our task of predicting how fast a pet is going to get adopted.\n", 49 | "\n", 50 | "In fast.ai, you can fine-tune a pre-trained model on your own corpus of text to make it better at handling your own domain. In our case, that domain is descriptions of pets.\n", 51 | "\n", 52 | "We are going to use the descriptions from both the training and test sets, which gives us as much text as possible to fine-tune this pre-trained model." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "petsAll = pd.concat([pets, petsTest])\n", 62 | "petsAll = petsAll.dropna(subset=['Description'])\n", 63 | "\n", 64 | "descriptions = petsAll.groupby(['PetID', 'Description']).size().to_frame().reset_index().set_index('PetID')" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Then let's prepare a DataBunch for those descriptions. Basically, in the background fastai tokenizes this text and numericalizes it to make it usable by the neural network. This DataBunch is also going to be used to know our vocabulary (which words this language model knows) later on when we are going to use it in our network." 
72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "data_lm = (TextList.from_df(descriptions, cols='Description').split_by_rand_pct(0.1).label_for_lm().databunch(bs=bs, path=path))\n", 81 | "data_lm.save('data_lm_descriptions.pkl')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "data_lm = load_data(path, 'data_lm_descriptions.pkl', bs=bs)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/html": [ 101 | "\n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | "
idxtext
0beach .. xxmaj adopters must vaccinate , spay and keep puppy indoors / fenced xxmaj call / whatsapp : xxmaj address : teluk xxunk xxbos xxmaj cherry was a stray dog we rescued from the streets . xxmaj she behaves like a total house pet , who would believe she used to stray for a year plus ! xxmaj cherry loves to be indoor , loves to be near human
1fine ) . xxmaj the owner is xxmaj korean lady , a mother of 2 babies - 18 months and 6 months and she 's pregnant again so planning to go back to xxmaj korea . xxmaj please contact if you are interested . xxbos xxmaj she is very friendly and cute . xxmaj because i want to move , the new apartment can not raise a cat , no
2was rescued . i have no heart to put him back on the street after neuturing . xxmaj he will not be able to survive . i knw for a fact , he has an owner before he was abandoned . xxmaj will you be able to give xxmaj luke a forever home .. let him feel the love he used to have ? xxmaj give me a call at
3including xxmaj xxunk xxup xxunk have to direct part order from xxmaj xxunk and waited for 3 weeks long to reached xxmaj malaysia . xxmaj ha ! xxmaj ha ! xxmaj ha ! xxup what a xxup good xxup laugh xxbos xxmaj happy yappy 4 month old puppies were dumped at the pet clinic mercilessly . 4 months later , these 4 puppies have grown to be the lovable xxmaj
4facilitate the adopter to be entitled for the starter pack , cos posting in a group only entitles one adopter . xxbos xxmaj nak bagi pet ni sebab dah banyak sgt . lagipun mase x cukup nak jaga .. my pet betul2 tak terurus sekarang ... call . i tinggal area segambut , kl . xxup telah xxup selamat xxup di xxup rumah xxup baru xxrep 4 . xxbos xxmaj
" 131 | ], 132 | "text/plain": [ 133 | "" 134 | ] 135 | }, 136 | "metadata": {}, 137 | "output_type": "display_data" 138 | } 139 | ], 140 | "source": [ 141 | "data_lm.show_batch()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "## Language model fine-tuning" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 7, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 8, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/html": [], 168 | "text/plain": [ 169 | "" 170 | ] 171 | }, 172 | "metadata": {}, 173 | "output_type": "display_data" 174 | }, 175 | { 176 | "name": "stdout", 177 | "output_type": "stream", 178 | "text": [ 179 | "LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n" 180 | ] 181 | }, 182 | { 183 | "data": { 184 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZIAAAEKCAYAAAA4t9PUAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xl4VOXd//H3N/tKAiRAWEJYRVmFgAKKYF3qvlQraq1Lq6UufWp/dnts+7Raa6ttrdZatS5tXVvXugJqBRRFSJB9UXZCWMKWhJCELPfvjxk0xoQEZs6cSfJ5XddcOTnnzMz3ZkI+uc99zn3MOYeIiMiRivG7ABERadsUJCIiEhIFiYiIhERBIiIiIVGQiIhISBQkIiISEgWJiIiEREEiIiIhUZCIiEhI4vwu4HBlZWW5vLw8v8sQEWlTCgsLdzrnsr147TYXJHl5eRQUFPhdhohIm2JmG716bR3aEhGRkChIREQkJAoSEREJiYJERERC4ulgu5ltAMqBOqDWOZffaPsPgcsb1HI0kO2c2+1lXSIiEj6ROGtrinNuZ1MbnHN3A3cDmNk5wM0KERGRtiWaDm1dCjzjdxEiInJ4vA4SB8w0s0Izu665ncwsBfgq8ILH9UgLyqpqeHjOWmYs30bx3kp0K2YRaYnXh7YmOueKzawb8JaZrXLOzWliv3OAuc0d1gqG0HUAubm53lXbwdXVO256+mNmf1Ly2bqstASG9crgqB7pDO6WzqDuaQzslkZKQpu7llVEPOLpbwPnXHHw6w4zewkYBzQVJFM5xGEt59zDwMMA+fn5+hPZI3fPWM3sT0r41blDGd47g6VFpSzdUsqyLaV8sGYXB+rqP9s3xsDMPvvaPyuVcf26MDavC+P6daF7pyQfWyIikeRZkJhZKhDjnCsPLp8G3NbEfhnAScA3vKpFWvafRVt4cPZaLj8ulysn5AEwOrfzZ9tr6+rZuHs/n24vZ21JBVU1dTgH9c5RW+9YubWMFwqL+OeHgVkYThyUxR3nDye3a4ofzRGRCPKyR9IdeMnMDr7P08656WY2DcA592BwvwuAmc65Cg9r8cXakn28tWI7/bJSGdQtjdwuKcTFRtP5DQHLtpTy4xeWMDavM/93ztAm94mLjWFAdhoDstOafZ3aunpWbC3jvU938tdZazn9T3O45fSjuGpCHrEx5lX5IuIza2uDqfn5+a4tTNronOOiBz+kcOOez9YlxMVwdE4nLhvXh/NG9SIpPtbTGqpq6pi/fjfby6rolZlMr87J5GQkEx9r7K44QNGeSor2VPKbN1ZS7xyv3HgC2emJYXnvraWV3PrSMv67agej+mRy86mDye2SQk5GkuftFpEvM7PCxtfyhe21FSTemP1JCVc+Np+fnXU0Y/O68Mn2ctbs2MfsT0pYta2cLqkJXH5cLlcc35duIY4n1NTVU1ZZQ2nwsXRLKbNWl/Dh2l1U1tR9YV8zSIiNobr28/GOtMQ4nrn2eIb3zgipjsacc7yyuJhfvrKcPftrPlvfJTWBgd3SOO2Y7pw+tAd9uujwl4jXFCQNtIUgcc5xwQMfUFJezbu3TCYhLuYL2+at281jc9fz9srtxMfE8MPTj+JbJ/QjptHhn8oDdbywsIjivZXsq66lvKqW8qoayiprKasKhEZZZQ0VB+oal0DfrilMHpzN5KO6kZeVyta9lRTtrWTLnkoqqmvp1TmZXpnJ9O6cQt+uKaQmeneUs7SyhuXFpWzdW8XW0kqKS6tYuHEPq7aVAzCsVyfOGJbDhaN7kZOR7FkdIh2ZgqSBthAk767awdV/X8CdFw7n0nHNn668cVcFd7y+kpkrtjNxYFf+cPEoemQkUV8f+Ev+d9NXsbW0itgYIz0pjvSkONIS4+mUFEdGcjydkuPJCD46JcWRkRJY7peVRr+s1Ai2+Mhs2FnBjOXbeHPZNhZt3kuMwQmDsvl6fm9OPaY7iXE6BCYSLgqSBqI
9SJxznPeXueyuOMC7t0wmvoXBdecc/1qwmV+9uoKEuBj+5yuDeGVxMYs272VYr078/KxjGNevC8GTFtqtjbsqeL6wiBcKiyguraJragI/OWMIF43pfcRtr6t3VNXUUVlTR+WBOjbu2s/y4lKWF5exYmsZFdW1ZKcnkp2WSHZ6IikJcVTW1FFVU8f+A7UA5GQkk5ORRM/MZPp0SWFIj3SN8UibpCBpINqD5O0V2/n2Pwu462sj+PrYPq1+3vqdFXz/2Y9ZXFRKt/REfvTVIVx4bK8vHe5q7+rqHXPX7OS+dz6lYOMejuvXhTsuGM7Abs2fLdZQ8d5Knp2/iecKi9haWtXkPj0zkjimZwYZyfHs3FdNSXk1O8qrqTxQS3JCHCkJsSTHx+JwbC2toryq9rPnxsUYQ3LSGdE7k1G9M8nP60y/rNR2H/TS9ilIGojmIHHOcfaf32dfdS1v/+CkFnsjjdXU1TN7dQnjB3T1dMyiLaivd/yrYDN3vrGSypo6rpqQR1ZaImXBMaKK6lrSkuLomppIl7QEkuNjmb5sK/9dtQMHTB6czag+nUlOiCE5PpbE+Fh6ZiQztGcnOqcmHFYtZVU1bN1bxfqdFSwp2svior0s2VxKeXUgYLLSEhnXrzPH9evK+cf2IiM53oN/EZHQKEgaiOYgmbF8G995opDfXzySi8b09rucdqGkvJo7Xl/By4uKAYiNMTolxZGSEBc48aBBbyErLYFLxvZh6thcz88Eq693rC3Zx4INe1iwYTfz1+9my95KuqQmcPMpg7h0XG5UXjMkHZeCpIFoDZL6eseZ971HdW09b908Sb9Ewmzv/gPEx8aQkhD7hcNIB2rr2bP/AKWVNeR1Tf3CGXKRtmxLKb9+fQXz1u1mULc0bj3raCYf1c23ekQa8jJIOvbxkzB6dUkxq7aVc9+lxypEPJCZ0vThqIS4GLp3SoqKub2G9crgmWuPZ+aK7dz5xkquenwB3TslMrJ3JqNyMxnVJ5P8vl18DTsRL3SYINlRVsU/P9zIzacODvt0HTV19dzz1icM6ZHO2cNzwvra0raYGacP7cGUo7rx4sIi5q3bxaLNe5m5YjsAA7JTufvikV+Yx0ykreswQbJgwx7uf3cN3TslcsX4vLC+9vOFRWzYtZ9Hr8zvcGdZSdMS4mKYOi6XqcHriPZUHOCDtbu44/UVXPTXD7j2xP7cfOpgnUos7UKHCZIzh/dgwoCu3D1jNWcOz6FrWnjmlKqqqePetz9ldG4mJw/R8XBpWufUBM4akcOkwVn85o1VPDRnHW+t3M7Zw3PYXlbN1rIqtpVWkpIQx6TB2Uw+KpuRvTM12aW0CR1qsP3T7eWcce97XJzfmzsvHBGWeh55bx2/fn0lz1x7POMHdA3La0r79/6nO/nxC0soLq0kOy2RnIzAOM/OfdUs2ryXegeZKfEc2yeTjOR40oKzGmQkx9MjI5EenZLpmZlEj4wkzQAgraLB9jAZ1D2dqybk8ejc9Uwdm8vIPpkhvd6+6loemLWWEwdlKUTksJwwKIv3fjSFOue+dL3RnooDvLdmJ7NW72D1tnLW7axgX1VgrrWGNxeDwISbD10xhokDsyJZvsgXdKgeCUB5VQ1Tfj+bXp2Teem7Ez4b06ivdywvLmNITnqrLyS89+1PueftT/jPDRNDDiWR1qiormVbWRXbSqvYWlrF3+asY9Pu/fzzW+MYm9fF7/IkinnZI+lw5yGmJ8Xzv2cOYfHmvTxfWERpZQ2Pvr+ek/8wi3Puf58bn15IXX3L4bpmxz4emLWGM4b1UIhIxKQmxjEgO42JA7O4aExvnvz2ceRkJnH14wtYtHmv3+VJB9XhggTggmN7kd+3M7e/toLxd77D7a+toGtaIlcc35cZy7fzs5eXcqieWm1dPf/vucUkJ8Tyq3ObvqOgSCRkpyfy9LePp0tqAt989COWF5f6XZJ0QB0ySMyM284bRqfkeM4ansNrN53AC9+dwO3nD+OGKQN
4Zv5m/jDzk2af/9CcdSzevJfbzhsW8k2pRELVIyOJp689jrTEOK54dD5LitQzkcjqcGMkLXHO8dMXl/Lsgs384uxjuOaEfl/YvnJrGefe/z6nHdOD+y87VrO+StTYsLOCyx/5iF0V1fzx66M4UxfHSgMaI4kgM+PX5w/j9KHdue21Ffz0xaW8/+lOaurqOVBbzw/+vZiM5HhuP3+YQkSiSl5WKv+5cSJDe2Zw/VML+fM7nx7yEK1IuKhH0oyqmjp+/vIyXluylcqaOjKS4+mfncrHm/by8BVjOG1oD89rEDkSVTV1/PTFpbz08RbOHpHDcf26sL2smh3lVeyuqOHC0b3UW+mANPtvA5Ge/beqpo45n5Qwffk23lm5gzOH9wjbxYwiXnHO8cCstdw9YzUAMRa4b0psjLG1tIqrJuTxv2cerQkkOxAFSQPROo28SDTaVlpFjEHXYIgcqK3nzjdX8vjcDRybm8kDl48mJyPZ7zIlAjRGIiJHpEdGEt06JX02Z1dCXAz/d85Q/nLZaD7ZVs5Z973Pu6t2+FyltHUKEpEO6KwRObxy0wlkpyVy9d8X8KPnF1NWVeN3WdJGKUhEOqgB2Wm8ctNErp88gOcLizj9njnMWq3eiRw+BYlIB5YYF8uPvjqEF6+fSGpiHFc9voA731zpd1nSxihIRIRRfTJ57aYTuHRcLg/NXsc/Ptjgd0nShnSoaeRFpHlJ8bH8+vxhlJRX86tXl9O3awqTj9LN2qRl6pGIyGdiY4x7p45iSI9O3Pj0x6zeVu53SdIGKEhE5AtSE+N49Kp8UhJiuebvC9i5r9rvkiTKKUhE5EtyMpJ55Mp8dlVUc/XjC9hdccDvkiSKKUhEpEkjegeufP9kezkXPfgBRXv2+12SRCkFiYg06+Qh3XniW8exs7yar/31A42ZSJM8DRIz22BmS81skZk1OUGWmU0Obl9uZrO9rEdEDt+4fl3497TxAFz84Acs2LDb54ok2kSiRzLFOTeqqcnCzCwTeAA41zk3FLg4AvWIyGEa0qMTL3x3AlnpiVz52HxWbi3zuySJIn4f2roMeNE5twnAOaf5GUSiVO/OKTx77fF0Sorn2/8o0Nlc8hmvg8QBM82s0Myua2L7YKCzmc0K7vPNpl7EzK4zswIzKygpKfG0YBFpXrdOSfztm4GzuaY9UUh1bZ3fJUkU8DpIJjrnRgNnADeY2aRG2+OAMcBZwOnAz81scOMXcc497JzLd87lZ2dne1yyiBzK8N4Z/OHiURRs3MOtLy3T7XzF2yBxzhUHv+4AXgLGNdqlCJjunKtwzu0E5gAjvaxJREJ31ogcvn/KIJ4vLOJv763zuxzxmWdBYmapZpZ+cBk4DVjWaLf/ACeaWZyZpQDHAZp6VKQN+N7JgzhreA53vrmK+et1JldH5mWPpDvwvpktBuYDrzvnppvZNDObBuCcWwlMB5YE93nEOdc4bEQkCsXEGHddNILcLinc/K9FujFWB6Z7totISBZu2sPFD37IeSN78sdLRvldjjRD92wXkag1OrczN508kBc/3sKri4v9Lkd8oCARkZDdOGUgx+ZmcutLSyneW+l3ORJhChIRCVlcbAx/umQUtfWO//fvxdTW1ftdkkSQgkREwqJv11R+de5QPly3i+8+tZCqGl2s2FEoSEQkbC7O78Ovzh3K2yu3c+Vj83UmVwehIBGRsLpyQh5/umQUhRv3MPWheZSUa06u9k5BIiJhd96oXjx61VjW76zgogc/YFtpld8liYcUJCLiiZMGZ/PUtcdRUl7Nd5/SBI/tmYJERDwzOrczd180ko837eX211b4XY54REEiIp46a0QO35nUnyfnbeL5wiK/yxEPKEhExHM/PP0oxvfvyq0vLWXZllK/y5EwU5CIiOfiYmO4/7Jj6ZqawHeeKGRPxQG/S5IwUpCISER0TUvkr98Yw7ayKv783zV+lyNhpCARkYgZ2SeTC47txVMfbWRHuU4Jbi8UJCISUTd
MGUhNXT2PvLfe71IkTBQkIhJR/bJSOW9UL574cCO79umq9/ZAQSIiEXfDlIFU1dbx6PvqlbQHChIRibiB3dI4a3gO//hgA3v36wyutk5BIiK+uOnkQVQcqOMx9UraPAWJiPjiqB7pnDGsB4/P3UBppaabb8sUJCLim5tOHkR5dS1PztvodykSAgWJiPjmmJ6dmDCgK88u2ER9vfO7HDlCChIR8dUlY/uweXclH6zd5XcpcoQUJCLiq9OH9iAzJZ5nFmzyuxQ5QgoSEfFVUnwsFx7bm5nLt7Fbkzm2SQoSEfHd1HF9qKlzvLhQ9ytpixQkIuK7wd3TGZ2byTPzN+GcBt3bGgWJiESFqWNzWVtSQeHGPX6XIodJQSIiUeHskTmkJcbxzPzNfpcih0lBIiJRISUhjnNH9eT1pcW60r2NUZCISNSYOrYPVTX1vLJoi9+lyGFQkIhI1BjeK4NjcjrxzPzNGnRvQxQkIhI1zIxLj8tlxdYylm4p9bscaSVPg8TMNpjZUjNbZGYFTWyfbGalwe2LzOwXXtYjItHvvFE9SY6P5Zn5utK9rYhEj2SKc26Ucy6/me3vBbePcs7dFoF6RCSKdUqK55yROfxnUTH7qmv9LkdaQYe2RCTqXDoul/0H6nhlUbHfpUgreB0kDphpZoVmdl0z+4w3s8Vm9qaZDfW4HhFpA0b1yWRIj3Qd3mojvA6Sic650cAZwA1mNqnR9oVAX+fcSODPwMtNvYiZXWdmBWZWUFJS4m3FIuI7M+PScbks3VLKMg26Rz1Pg8Q5Vxz8ugN4CRjXaHuZc25fcPkNIN7Mspp4nYedc/nOufzs7GwvSxaRKHH+sb1IjItRr6QN8CxIzCzVzNIPLgOnAcsa7dPDzCy4PC5Yj+5uIyJkJMdz9oie/GdRMRUadI9qXvZIugPvm9liYD7wunNuuplNM7NpwX0uApYF97kPmOp0FZKIBF12XB/2Vdfy2hINukcza2u/t/Pz811BwZcuSRGRdsg5x1f/9B5m8Mb3TiQmxvwuqc0ys8JDXIYRklb1SMxsgJklBpcnm9n3zCzTi4JERA4yM66fMoBV28p5c9k2v8uRZrT20NYLQJ2ZDQQeBfoBT3tWlYhI0NkjejKoWxr3vP0JdfVt6whKR9HaIKl3ztUCFwB/cs7dDOR4V5aISEBsjPH9UwazZsc+Xl2ssZJo1NogqTGzS4ErgdeC6+K9KUlE5IvOGNaDIT3SufedT6mtq/e7HGmktUFyNTAeuMM5t97M+gFPeleWiMjnYmKMH5w6mPU7K3jxY92rJNq0Kkiccyucc99zzj1jZp2BdOfcbz2uTUTkM6ce050RvTO4751POVCrXkk0ae1ZW7PMrJOZdQEWA4+b2R+9LU1E5HNmxs2nDqZoTyXPFeq+7tGktYe2MpxzZcCFwOPOuTHAKd6VJSLyZZMHZzM6N5O//HcNNRoriRqtDZI4M8sBvs7ng+0iIhFlZtx48kCKS6t0BlcUaW2Q3AbMANY65xaYWX/gU+/KEhFp2pSjunFU93Qemr1O93WPEq0dbH/OOTfCOffd4PfrnHNf87Y0EZEvMzO+c1J/Vm8vZ9Zq3VYiGrR2sL23mb1kZjvMbLuZvWBmvb0uTkSkKeeM7EnPjCT+Onut36UIrT+09TjwCtAT6AW8GlwnIhJx8bExfOvE/sxfv5uFm/b4XU6H19ogyXbOPe6cqw0+/g7oDlMi4pupY/uQkRzPQ+qV+K61QbLTzL5hZrHBxzfQDahExEepiXFcOb4vM1dsZ23JPr/L6dBaGyTXEDj1dxuwlcANqa72qigRkdb45oQ8EmJjeHj2Or9L6dBae9bWJufcuc65bOdcN+fc+QQuThQR8U1WWiJfz+/DSx9vYXtZld/ldFih3Gr3B2GrQkTkCF17Yn9q6+t5bO56v0vpsEIJEt3zUkR8l9s1hTOH5/D0vE2UVdX4XU6HFEqQ6JJSEYkK004aQHl1LU9/tMnvUjqkQwaJmZW
bWVkTj3IC15SIiPhuWK8MThiYxWPvr6e6ts7vcjqcQwaJcy7dOdepiUe6cy4uUkWKiLRk2kkD2FFezcu68VXEhXJoS0Qkakwc2JWhPTvx0Jx11NfryHskKUhEpF0ITOY4gHUlFby1crvf5XQoChIRaTfOHNaDPl2SeXD2Wk0xH0EKEhFpN+JiY7juxP58vGkv89bt9rucDkNBIiLtysX5fchOT+T+d3XvvUhRkIhIu5IUH8t1J/Zn7ppdFG7UFPORoCARkXbnsuNy6ZwSz/3/Va8kEhQkItLupCbG8e0T+/Pu6hKWbSn1u5x2T0EiIu3SFeP7kp4Ux/3/XeN3Ke2egkRE2qVOSfFcPSGP6cu3sXpbud/ltGsKEhFpt66e2I/UhFj+8q56JV7yNEjMbIOZLTWzRWZWcIj9xppZnZld5GU9ItKxdE5N4Bvj+/LakmLW7FCvxCuR6JFMcc6Ncs7lN7XRzGKB3wEzIlCLiHQw153Yn5SEOO6avtrvUtqtaDi0dRPwArDD70JEpP3pmpbIdyb1Z+aK7RRs0NXuXvA6SBww08wKzey6xhvNrBdwAfCgx3WISAf2rRP70S09kd+8sVJzcHnA6yCZ6JwbDZwB3GBmkxpt/xPwY+fcIe9EY2bXmVmBmRWUlJR4VauItFMpCXHcfOpgFm7ay4zlmhk43DwNEudccfDrDuAlYFyjXfKBZ81sA3AR8ICZnd/E6zzsnMt3zuVnZ2d7WbKItFMXj+nNwG5p3DV9FTV19X6X0654FiRmlmpm6QeXgdOAZQ33cc71c87lOefygOeB651zL3tVk4h0XHGxMfz4q0NYt7OCfy3Y7Hc57YqXPZLuwPtmthiYD7zunJtuZtPMbJqH7ysi0qRTju7GuLwu/OntT6morvW7nHbDs/uuO+fWASObWN/kwLpz7iqvahERgcBdFH98xlF87a8f8nxhEVdOyPO7pHYhGk7/FRGJmDF9uzC0ZyeeXbBZZ3CFiYJERDqcqeNyWbm1jKWaGTgsFCQi0uGcO7InSfExPKtB97BQkIhIh5ORHM+Zw3N4ZVEx+w9o0D1UChIR6ZCmjs1lX3Utry/Z6ncpbZ6CREQ6pLF5nemflaprSsJAQSIiHZKZccnYPhRs3KMp5kOkIBGRDuvC0b2JizH1SkKkIBGRDis7PZFTju7OCwu3cKBW828dKQWJiHRol4zrw+6KA7y1QrMCHykFiYh0aJMGZdMzI4lnF2zyu5Q2S0EiIh1abIxxcX4f3l+zk8279/tdTpukIBGRDu/rY/sA8FyBBt2PhIJERDq8XpnJTBqUzb8LiqjVTa8Om4JERAS4dFwftpVVMfsT3c77cClIRESArxzdnay0RE3keAQUJCIiQHxsDBeN6c1/V+1gR1mV3+W0KQoSEZGgS8b2oa7e8Vxhkd+ltCkKEhGRoH5ZqRzfvwv/WrCZ+nrdPbG1FCQiIg1cOi6XTbv38+G6XX6X0mYoSEREGjh9aA8yU+J5ct5Gv0tpMxQkIiINJMXHMnVsLjOWb2PL3kq/y2kTFCQiIo1cMb4vAE98qF5JayhIREQa6ZWZzOlDe/Dsgk1U1dT5XU7UU5CIiDThygl57N1fw38WbfG7lKinIBERacJx/bowpEc6j8/dgHM6FfhQFCQiIk0wM66emMeqbeV8tH633+VENQWJiEgzzhvVi8yUeP4+d4PfpUQ1BYmISDOS4mO5dFwuM1dso2iPbnrVHAWJiMghfOP4wKnAj7y33udKopeCRETkEHplJnPJ2Fz++eEGCjfu8bucqKQgERFpwf+eOYScjGRueW4xlQd0XUljChIRkRakJ8Vz90UjWL+zgrtmrPK7nKjjaZCY2QYzW2pmi8ysoInt55nZkoPbzewEL+sRETlSEwZm8c3xfXl87gbmaWbgL4hEj2SKc26Ucy6/iW3vACOdc6OAa4BHIlCPiMgR+ckZQ+jbNYUfPr+Yiupav8uJGr4e2nLO7XOfXzKaCujyURGJWik
Jcfz+4pEU7ankt2/qENdBXgeJA2aaWaGZXdfUDmZ2gZmtAl4n0CsREYlaY/O68M3j+/LURxtZta3M73KigtdBMtE5Nxo4A7jBzCY13sE595JzbghwPnB7Uy9iZtcFx1AKSkpKvK1YRKQFN586mPSkeG5/bYXm4cLjIHHOFQe/7gBeAsYdYt85wAAzy2pi28POuXznXH52drZn9YqItEZmSgLfP2UQc9fs4p2VO/wux3eeBYmZpZpZ+sFl4DRgWaN9BpqZBZdHAwmATocQkaj3jeP70j87lTveWMmB2nq/y/GVlz2S7sD7ZrYYmA+87pybbmbTzGxacJ+vAcvMbBHwF+ASp36iiLQB8bEx/PysY1i/s4InOvj93a2t/d7Oz893BQVfuiRFRCTinHN887H5LN68l9k/nELn1AS/S2qWmRU2cxlGyHRlu4jIETIzfn72MeyrruWetz/xuxzfKEhEREIwuHs6l47L5emPNrFxV4Xf5fhCQSIiEqL/+cog4mKNP77VMXslChIRkRB165TE1RP78criYlZu7XgXKSpIRETCYNqkAaQnxvH7Gav9LiXiFCQiImGQkRLPd04awDurdlCwYbff5USUgkREJEyunphHVloid81Y3aGmTlGQiIiESUpCHN/7ykDmr9/NnE93+l1OxChIRETCaOrYXPp0Seau6auoq+8YvRIFiYhIGCXExfDD04ewvLiMp+dv8ruciFCQiIiE2Tkjcpg4sCt3TV/FjvIqv8vxnIJERCTMzIzbzxtGdU09v3l9pd/leE5BIiLigf7ZaUybPICXFxUzd037HnhXkIiIeOT6yQPo2zWFn728jKqaOr/L8YyCRETEI0nxsdx+3jDW76zgodnr/C7HMwoSEREPTRqczdkjcvjLrDWsKG6f83ApSEREPPbLc4eSmRzPDU8vpLyqxu9ywk5BIiLisay0RP586bFs2r2fn7ywtN1Nn6IgERGJgOP6d+WW047i9aVb+eeH7ese7woSEZEI+c6k/pw8pBu/fn0Fizfv9bucsFGQiIhESEyM8YeLR9ItPYnrn1rIhp2tuzVvdW0dD89Zy8eb9nhc4ZFRkIiIRFDn1AT+cvloyqtqOOPe93hy3sY1SilLAAAJLElEQVRmx0ycc8xcvo3T7pnDb95YxYzl2yNcbesoSEREImxUn0xm3nwS+Xmd+dnLy7jy8QVsKw3MyeWco7Syho837eHyRz7iuicKiY+N4R/XjOMnZwzxufKmWVs7eyA/P98VFBT4XYaISMicczw5byN3vLGSGDNSEmLZu7+G2uD085kp8dx8ymAuOy6X+NjQ/u43s0LnXH446m4szosXFRGRlpkZV4zP44RB2Tw0ey1mRpfUeDqnJNAlNYGTh3QjMyXB7zJbpCAREfFZv6xUfvu1EX6XccQ0RiIiIiFRkIiISEgUJCIiEhIFiYiIhERBIiIiIVGQiIhISBQkIiISEgWJiIiEpM1NkWJmJUDjyfwzgNIW1h3q+4PLDddlATuPsMym6jmcfQ63PS0th9KWlmptaZ/29Nm0pi2N13n52ejn7NDr2+rPWXPbQv1sUp1z2S1WfiScc23+ATzc0rpDfX9wudG6gnDWczj7HG57WloOpS2htqc9fTataUskPxv9nLXPn7No/GxaerSXQ1uvtmLdob5/tZl9wlnP4exzuO1pzXIoQmlPe/psWtOWxuu8/Gz0c3bo9W3156y5bX5+NofU5g5tRYqZFTiPZsqMtPbUFmhf7VFbold7ao/XbWkvPRIvPOx3AWHUntoC7as9akv0ak/t8bQt6pGIiEhI1CMREZGQtPsgMbPHzGyHmS07gueOMbOlZrbGzO4zM2uw7SYzW21my83srvBWfciawt4eM/ulmW0xs0XBx5nhr7zJejz5bILbbzEzZ2ZZ4au4xZq8+GxuN7Mlwc9lppn1DH/lTdbjRVvuNrNVwfa8ZGaZ4a+82Zq8aM/Fwf//9Wbm+VhKKG1o5vWuNLNPg48
rG6w/5P+tJnl5Slg0PIBJwGhg2RE8dz4wHjDgTeCM4PopwNtAYvD7bm28Pb8EbmkPn01wWx9gBoHrjbLacnuATg32+R7wYBtuy2lAXHD5d8Dv2vhnczRwFDALyI/WNgTry2u0rguwLvi1c3C586Hae6hHu++ROOfmALsbrjOzAWY23cwKzew9MxvS+HlmlkPgP/GHLvCv+0/g/ODm7wK/dc5VB99jh7et+JxH7fGFh225B/gRENEBQC/a45wra7BrKhFqk0dtmemcqw3uOg/o7W0rPudRe1Y651ZHov7g+x1RG5pxOvCWc263c24P8Bbw1SP9PdHug6QZDwM3OefGALcADzSxTy+gqMH3RcF1AIOBE83sIzObbWZjPa22ZaG2B+DG4CGHx8yss3eltiiktpjZucAW59xirwttpZA/GzO7w8w2A5cDv/Cw1paE4+fsoGsI/LXrp3C2xy+taUNTegGbG3x/sF1H1N4Od892M0sDJgDPNTj0l9jUrk2sO/jXYByB7uDxwFjg32bWP5jgERWm9vwVuD34/e3AHwj8R4+oUNtiZinArQQOofguTJ8NzrlbgVvN7KfAjcD/hbnUFoWrLcHXuhWoBZ4KZ42HI5zt8cuh2mBmVwP/E1w3EHjDzA4A651zF9B8u46ovR0uSAj0wvY650Y1XGlmsUBh8NtXCPxybdj17g0UB5eLgBeDwTHfzOoJzGVT4mXhzQi5Pc657Q2e9zfgNS8LPoRQ2zIA6AcsDv7H6g0sNLNxzrltHtfelHD8rDX0NPA6PgQJYWpLcFD3bOArfvzh1UC4Pxs/NNkGAOfc48DjAGY2C7jKObehwS5FwOQG3/cmMJZSxJG01+sBomh4AHk0GKACPgAuDi4bMLKZ5y0g0Os4OOh0ZnD9NOC24PJgAl1Ea8PtyWmwz83As221LY322UAEB9s9+mwGNdjnJuD5NtyWrwIrgOxIfiZe/6wRocH2I20DzQ+2rydwZKVzcLlLa9rbZF1+fKAR/uF5BtgK1BBI228R+Kt1OrA4+IP9i2aemw8sA9YC9/P5BZwJwJPBbQuBk9t4e54AlgJLCPwVltNW29Jonw1E9qwtLz6bF4LrlxCYN6lXG27LGgJ/dC0KPiJyBpqH7bkg+FrVwHZgRjS2gSaCJLj+muBnsga4uqX2HuqhK9tFRCQkHfWsLRERCRMFiYiIhERBIiIiIVGQiIhISBQkIiISEgWJtAtmti/C7/eImR0Tpteqs8DsvsvM7NWWZsU1s0wzuz4c7y0SDjr9V9oFM9vnnEsL4+vFuc8nGPRUw9rN7B/AJ865Ow6xfx7wmnNuWCTqE2mJeiTSbplZtpm9YGYLgo+JwfXjzOwDM/s4+PWo4PqrzOw5M3sVmGlmk81slpk9b4H7aDx18N4MwfX5weV9wYkVF5vZPDPrHlw/IPj9AjO7rZW9pg/5fALKNDN7x8wWWuD+EOcF9/ktMCDYi7k7uO8Pg++zxMx+FcZ/RpEWKUikPbsXuMc5Nxb4GvBIcP0qYJJz7lgCs+n+psFzxgNXOudODn5/LPB94BigPzCxifdJBeY550YCc4BrG7z/vcH3b3G+ouA8T18hMLsAQBVwgXNuNIF74PwhGGQ/AdY650Y5535oZqcBg4BxwChgjJlNaun9RMKlI07aKB3HKcAxDWZG7WRm6UAG8A8zG0RgZtP4Bs95yznX8J4P851zRQBmtojAXEfvN3qfA3w+0WUhcGpweTyf38vhaeD3zdSZ3OC1CwncGwICcx39JhgK9QR6Kt2beP5pwcfHwe/TCATLnGbeTySsFCTSnsUA451zlQ1XmtmfgXedcxcExxtmNdhc0eg1qhss19H0/5ka9/lgY3P7HEqlc26UmWUQCKQbgPsI3H8kGxjjnKsxsw1AUhPPN+BO59xDh/m+ImGhQ1vSns0kcP8OAMzs4HTbGcCW4PJVHr7/PAKH1ACmtrSzc66UwO10bzGzeAJ17giGyBSgb3DXciC9wVN
nANcE70+BmfUys25haoNIixQk0l6kmFlRg8cPCPxSzg8OQK8gMP0/wF3AnWY2F4j1sKbvAz8ws/lADlDa0hOccx8TmMl1KoEbP+WbWQGB3smq4D67gLnB04Xvds7NJHDo7EMzWwo8zxeDRsRTOv1XxCPBOzZWOuecmU0FLnXOndfS80TaGo2RiHhnDHB/8Eyrvfhw+2KRSFCPREREQqIxEhERCYmCREREQqIgERGRkChIREQkJAoSEREJiYJERERC8v8BikY42wFHVVMAAAAASUVORK5CYII=\n", 185 | "text/plain": [ 186 | "
" 187 | ] 188 | }, 189 | "metadata": { 190 | "needs_background": "light" 191 | }, 192 | "output_type": "display_data" 193 | } 194 | ], 195 | "source": [ 196 | "learn.lr_find()\n", 197 | "learn.recorder.plot(skip_end=15)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 9, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "lr = 5e-2" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 10, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/html": [ 217 | "Total time: 00:44

\n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | "
epochtrain_lossvalid_lossaccuracytime
03.7267013.4433510.34779400:44
" 237 | ], 238 | "text/plain": [ 239 | "" 240 | ] 241 | }, 242 | "metadata": {}, 243 | "output_type": "display_data" 244 | } 245 | ], 246 | "source": [ 247 | "learn.fit_one_cycle(1, lr, moms=(.8, .7))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 11, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "data": { 257 | "text/plain": [ 258 | "LanguageLearner(data=TextLMDataBunch;\n", 259 | "\n", 260 | "Train: LabelList (17069 items)\n", 261 | "x: LMTextList\n", 262 | "xxbos xxmaj xxunk was rescued from the construction site behind my house . xxmaj he is quite the manja type and loves to play . xxmaj he makes a good companion and playmate for young children . xxmaj he is quite the handsome chap with a distinct mark on his face like a beauty mark .,xxbos went to teluk xxunk xxunk restaurant saw this female puppies alone by the beach .. xxmaj adopters must vaccinate , spay and keep puppy indoors / fenced xxmaj call / whatsapp : xxmaj address : teluk xxunk,xxbos xxmaj cherry was a stray dog we rescued from the streets . xxmaj she behaves like a total house pet , who would believe she used to stray for a year plus ! xxmaj cherry loves to be indoor , loves to be near human , loves human touches , loves other dogs . xxmaj we are looking for a forever home that could continue letting xxmaj cherry do all that she loves . xxmaj we want to find xxmaj cherry a home which will treat her as a house pet and stays indoor . xxmaj do n't be mistaken , xxmaj cherry is very alert at strangers and noises at the gate , but being a watch dog should not be her full time ' job ' . xxmaj if you love a dog who loves to manja , pls call us . xxmaj home visits will be arranged with potential adopter , and adopter to reimburse spaying n vaccination costs . xxup tq . :),xxbos xxmaj this puppy rescued from xxup dbkl pound last wednesday , and been sent to vet for check out . 
xxmaj pls give this lovely puppy a loving home if you can .,xxbos xxmaj he is cute and fun to be with .\n", 263 | "y: LMLabelList\n", 264 | ",,,,\n", 265 | "Path: .;\n", 266 | "\n", 267 | "Valid: LabelList (1896 items)\n", 268 | "x: LMTextList\n", 269 | "xxbos xxmaj healthy puppy for adoption . xxmaj commitment to spay is compulsory . xxmaj interested to adopt pls contact xxmaj amy,xxbos a healthy , clean , sweet little girl in xxmaj xxunk . send me message if you can give her a nice home .,xxbos xxmaj looking for a xxup serious adopter for xxmaj omey . xxmaj the adoption fee only for xxunk and will be return back . xxmaj feel free to whatapps me . * xxmaj she is mixed siamese and xxmaj domestic xxmaj short xxmaj hair * xxmaj diet - xxmaj canned xxmaj food xxmaj mackerel xxmaj fussie xxmaj cat + kibbles xxmaj blackwood ( xxmaj chicken and xxmaj corn ) * xxmaj she is litter trained , but you may need to train her in the new home . * xxmaj for adopter who never have cat , i will guide you thoroughly and xxunk kibbles will be given for free .,xxbos xxmaj kitten to let go for serious adopter . xxmaj playful & xxmaj healthy . xxmaj diet : xxmaj royal xxmaj canin 32 . 1st vaccinated completed , new owner have to follow up with 2nd & 3rd vaccination for xxmaj fila . xxmaj she is litter train . xxmaj looking for experienced and serious adopter only . 
xxmaj price are completed with 1st vaccine , deworm , and anti - flea vaccine .,xxbos xxmaj labrador cross , huge in size at the age of 3 months , chest with a star symbol , very unique .\n", 270 | "y: LMLabelList\n", 271 | ",,,,\n", 272 | "Path: .;\n", 273 | "\n", 274 | "Test: None, model=SequentialRNN(\n", 275 | " (0): AWD_LSTM(\n", 276 | " (encoder): Embedding(9853, 400, padding_idx=1)\n", 277 | " (encoder_dp): EmbeddingDropout(\n", 278 | " (emb): Embedding(9853, 400, padding_idx=1)\n", 279 | " )\n", 280 | " (rnns): ModuleList(\n", 281 | " (0): WeightDropout(\n", 282 | " (module): LSTM(400, 1150, batch_first=True)\n", 283 | " )\n", 284 | " (1): WeightDropout(\n", 285 | " (module): LSTM(1150, 1150, batch_first=True)\n", 286 | " )\n", 287 | " (2): WeightDropout(\n", 288 | " (module): LSTM(1150, 400, batch_first=True)\n", 289 | " )\n", 290 | " )\n", 291 | " (input_dp): RNNDropout()\n", 292 | " (hidden_dps): ModuleList(\n", 293 | " (0): RNNDropout()\n", 294 | " (1): RNNDropout()\n", 295 | " (2): RNNDropout()\n", 296 | " )\n", 297 | " )\n", 298 | " (1): LinearDecoder(\n", 299 | " (decoder): Linear(in_features=400, out_features=9853, bias=True)\n", 300 | " (output_dp): RNNDropout()\n", 301 | " )\n", 302 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=WindowsPath('C:/work/ML/PetFinder'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[RNNTrainer\n", 303 | "learn: LanguageLearner(data=TextLMDataBunch;\n", 304 | "\n", 305 | "Train: LabelList (17069 items)\n", 306 | "x: LMTextList\n", 307 | "xxbos xxmaj xxunk was rescued from the construction site behind my house . xxmaj he is quite the manja type and loves to play . xxmaj he makes a good companion and playmate for young children . 
xxmaj he is quite the handsome chap with a distinct mark on his face like a beauty mark .,xxbos went to teluk xxunk xxunk restaurant saw this female puppies alone by the beach .. xxmaj adopters must vaccinate , spay and keep puppy indoors / fenced xxmaj call / whatsapp : xxmaj address : teluk xxunk,xxbos xxmaj cherry was a stray dog we rescued from the streets . xxmaj she behaves like a total house pet , who would believe she used to stray for a year plus ! xxmaj cherry loves to be indoor , loves to be near human , loves human touches , loves other dogs . xxmaj we are looking for a forever home that could continue letting xxmaj cherry do all that she loves . xxmaj we want to find xxmaj cherry a home which will treat her as a house pet and stays indoor . xxmaj do n't be mistaken , xxmaj cherry is very alert at strangers and noises at the gate , but being a watch dog should not be her full time ' job ' . xxmaj if you love a dog who loves to manja , pls call us . xxmaj home visits will be arranged with potential adopter , and adopter to reimburse spaying n vaccination costs . xxup tq . :),xxbos xxmaj this puppy rescued from xxup dbkl pound last wednesday , and been sent to vet for check out . xxmaj pls give this lovely puppy a loving home if you can .,xxbos xxmaj he is cute and fun to be with .\n", 308 | "y: LMLabelList\n", 309 | ",,,,\n", 310 | "Path: .;\n", 311 | "\n", 312 | "Valid: LabelList (1896 items)\n", 313 | "x: LMTextList\n", 314 | "xxbos xxmaj healthy puppy for adoption . xxmaj commitment to spay is compulsory . xxmaj interested to adopt pls contact xxmaj amy,xxbos a healthy , clean , sweet little girl in xxmaj xxunk . send me message if you can give her a nice home .,xxbos xxmaj looking for a xxup serious adopter for xxmaj omey . xxmaj the adoption fee only for xxunk and will be return back . xxmaj feel free to whatapps me . 
* xxmaj she is mixed siamese and xxmaj domestic xxmaj short xxmaj hair * xxmaj diet - xxmaj canned xxmaj food xxmaj mackerel xxmaj fussie xxmaj cat + kibbles xxmaj blackwood ( xxmaj chicken and xxmaj corn ) * xxmaj she is litter trained , but you may need to train her in the new home . * xxmaj for adopter who never have cat , i will guide you thoroughly and xxunk kibbles will be given for free .,xxbos xxmaj kitten to let go for serious adopter . xxmaj playful & xxmaj healthy . xxmaj diet : xxmaj royal xxmaj canin 32 . 1st vaccinated completed , new owner have to follow up with 2nd & 3rd vaccination for xxmaj fila . xxmaj she is litter train . xxmaj looking for experienced and serious adopter only . xxmaj price are completed with 1st vaccine , deworm , and anti - flea vaccine .,xxbos xxmaj labrador cross , huge in size at the age of 3 months , chest with a star symbol , very unique .\n", 315 | "y: LMLabelList\n", 316 | ",,,,\n", 317 | "Path: .;\n", 318 | "\n", 319 | "Test: None, model=SequentialRNN(\n", 320 | " (0): AWD_LSTM(\n", 321 | " (encoder): Embedding(9853, 400, padding_idx=1)\n", 322 | " (encoder_dp): EmbeddingDropout(\n", 323 | " (emb): Embedding(9853, 400, padding_idx=1)\n", 324 | " )\n", 325 | " (rnns): ModuleList(\n", 326 | " (0): WeightDropout(\n", 327 | " (module): LSTM(400, 1150, batch_first=True)\n", 328 | " )\n", 329 | " (1): WeightDropout(\n", 330 | " (module): LSTM(1150, 1150, batch_first=True)\n", 331 | " )\n", 332 | " (2): WeightDropout(\n", 333 | " (module): LSTM(1150, 400, batch_first=True)\n", 334 | " )\n", 335 | " )\n", 336 | " (input_dp): RNNDropout()\n", 337 | " (hidden_dps): ModuleList(\n", 338 | " (0): RNNDropout()\n", 339 | " (1): RNNDropout()\n", 340 | " (2): RNNDropout()\n", 341 | " )\n", 342 | " )\n", 343 | " (1): LinearDecoder(\n", 344 | " (decoder): Linear(in_features=400, out_features=9853, bias=True)\n", 345 | " (output_dp): RNNDropout()\n", 346 | " )\n", 347 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), 
loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=WindowsPath('C:/work/ML/PetFinder'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(\n", 348 | " (0): WeightDropout(\n", 349 | " (module): LSTM(400, 1150, batch_first=True)\n", 350 | " )\n", 351 | " (1): RNNDropout()\n", 352 | "), Sequential(\n", 353 | " (0): WeightDropout(\n", 354 | " (module): LSTM(1150, 1150, batch_first=True)\n", 355 | " )\n", 356 | " (1): RNNDropout()\n", 357 | "), Sequential(\n", 358 | " (0): WeightDropout(\n", 359 | " (module): LSTM(1150, 400, batch_first=True)\n", 360 | " )\n", 361 | " (1): RNNDropout()\n", 362 | "), Sequential(\n", 363 | " (0): Embedding(9853, 400, padding_idx=1)\n", 364 | " (1): EmbeddingDropout(\n", 365 | " (emb): Embedding(9853, 400, padding_idx=1)\n", 366 | " )\n", 367 | " (2): LinearDecoder(\n", 368 | " (decoder): Linear(in_features=400, out_features=9853, bias=True)\n", 369 | " (output_dp): RNNDropout()\n", 370 | " )\n", 371 | ")], add_time=True, silent=None)\n", 372 | "alpha: 2.0\n", 373 | "beta: 1.0], layer_groups=[Sequential(\n", 374 | " (0): WeightDropout(\n", 375 | " (module): LSTM(400, 1150, batch_first=True)\n", 376 | " )\n", 377 | " (1): RNNDropout()\n", 378 | "), Sequential(\n", 379 | " (0): WeightDropout(\n", 380 | " (module): LSTM(1150, 1150, batch_first=True)\n", 381 | " )\n", 382 | " (1): RNNDropout()\n", 383 | "), Sequential(\n", 384 | " (0): WeightDropout(\n", 385 | " (module): LSTM(1150, 400, batch_first=True)\n", 386 | " )\n", 387 | " (1): RNNDropout()\n", 388 | "), Sequential(\n", 389 | " (0): Embedding(9853, 400, padding_idx=1)\n", 390 | " (1): EmbeddingDropout(\n", 391 | " (emb): Embedding(9853, 400, padding_idx=1)\n", 392 | " )\n", 393 | " (2): LinearDecoder(\n", 394 | " (decoder): Linear(in_features=400, out_features=9853, bias=True)\n", 395 | " (output_dp): RNNDropout()\n", 396 | " )\n", 397 | ")], 
add_time=True, silent=None)" 398 | ] 399 | }, 400 | "execution_count": 11, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "learn.save('fit_head')\n", 407 | "learn.load('fit_head')" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 12, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/html": [ 418 | "Total time: 09:54

\n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | "
epochtrain_lossvalid_lossaccuracytime
03.3760023.3379660.36089200:59
13.2189353.1980840.38081000:58
23.0883213.0879030.39780700:58
32.9003793.0200420.40895501:00
42.7759252.9847350.41636700:59
52.6517212.9659520.41998400:59
62.5462532.9636630.42205201:00
72.4719172.9635490.42282700:59
82.3994692.9675430.42377900:59
92.3763232.9713590.42361501:00
" 501 | ], 502 | "text/plain": [ 503 | "" 504 | ] 505 | }, 506 | "metadata": {}, 507 | "output_type": "display_data" 508 | } 509 | ], 510 | "source": [ 511 | "learn.unfreeze()\n", 512 | "learn.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "Here the most important part is save_encoder. We are saving the part of the language model responsible to encode this sentence into a tensor of information. We are going to using this fine-tuned encoder in some other part of the neural network." 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 13, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "learn.save('fine_tuned')\n", 529 | "learn.save_encoder('fine_tuned_enc')" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": {}, 536 | "outputs": [], 537 | "source": [] 538 | } 539 | ], 540 | "metadata": { 541 | "kernelspec": { 542 | "display_name": "Python 3", 543 | "language": "python", 544 | "name": "python3" 545 | }, 546 | "language_info": { 547 | "codemirror_mode": { 548 | "name": "ipython", 549 | "version": 3 550 | }, 551 | "file_extension": ".py", 552 | "mimetype": "text/x-python", 553 | "name": "python", 554 | "nbconvert_exporter": "python", 555 | "pygments_lexer": "ipython3", 556 | "version": "3.7.1" 557 | } 558 | }, 559 | "nbformat": 4, 560 | "nbformat_minor": 2 561 | } 562 | --------------------------------------------------------------------------------