├── Diagram.jpg
├── NoImage.jpg
├── README.md
├── test_data.py
├── petfinder
│   ├── test.py
│   ├── model.py
│   └── data.py
├── .gitignore
├── test.py
├── Structured Only.ipynb
├── Fastai PetFinder.ipynb
└── PetFinder Language Model.ipynb
/Diagram.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EtienneT/fastai-petfinder/HEAD/Diagram.jpg
--------------------------------------------------------------------------------
/NoImage.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EtienneT/fastai-petfinder/HEAD/NoImage.jpg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # fastai-petfinder
2 | Merging image, tabular, and text data in a single neural network with fastai, using the [PetFinder Kaggle competition](https://www.kaggle.com/c/petfinder-adoption-prediction/).
3 |
4 | The main notebook is [Fastai PetFinder](https://github.com/EtienneT/fastai-petfinder/blob/master/Fastai%20PetFinder.ipynb), but you first need to run [PetFinder Language Model](https://github.com/EtienneT/fastai-petfinder/blob/master/PetFinder%20Language%20Model.ipynb) to fine-tune a language model on the data.
5 |
6 | 
7 |
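8 | At a high level, one fastai `ItemList` is built per data type and the three are lined up with a `MixedItemList` before being fed to a single model. The sketch below is condensed from the main notebook; it assumes `pets`, `path`, `cat_names`, `cont_names`, `procs`, `vocab` and `data_text` have already been prepared as in the notebooks, so it is illustrative rather than runnable on its own:
9 |
10 | ```python
11 | from fastai.vision import *
12 | from fastai.tabular import *
13 | from fastai.text import *
14 | from petfinder.model import collate_mixed, image_tabular_text_learner, normalize_custom_funcs
15 |
16 | # One ItemList per data type, all built from the same dataframe (one row per pet picture)
17 | imgList = ImageList.from_df(pets, path=path, cols='PicturePath')
18 | tabList = TabularList.from_df(pets, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path)
19 | textList = TextList.from_df(pets, cols='Description', path=path, vocab=vocab)
20 |
21 | # Line the three lists up and treat AdoptionSpeed as a regression target
22 | mixed = (MixedItemList([imgList, tabList, textList], path, inner_df=tabList.inner_df)
23 |          .split_from_df(col='IsValidation')
24 |          .label_from_df(cols='AdoptionSpeed', label_cls=FloatList)
25 |          .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=224))
26 |
27 | data = mixed.databunch(bs=32, collate_fn=collate_mixed)  # pad the variable-length text per batch
28 | norm, denorm = normalize_custom_funcs(*imagenet_stats)
29 | data.add_tfm(norm)  # ImageNet normalization applied only to the image part of each batch
30 |
31 | learn = image_tabular_text_learner(data, len(cont_names), len(vocab.itos), data_text)
32 | ```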
--------------------------------------------------------------------------------
/test_data.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 | import json
5 | import os
6 | import feather
7 | from pathlib import Path
8 | from pandas.io.json import json_normalize
9 | from tqdm import tqdm
10 | import fastai
11 | from petfinder.data import *
12 |
13 | from fastai.tabular import *
14 |
15 | pets = get_data()
16 |
17 | dep_var = 'AdoptionSpeed'
18 | cont_names, cat_names = cont_cat_split(pets, 50, dep_var=dep_var)
19 | cat_names.remove('Filename')
20 | cat_names.remove('PicturePath')
21 |
22 | miss = FillMissing(cat_names, cont_names)
23 |
24 | miss.apply_train(pets)  # FillMissing modifies the dataframe in place (returns None)
25 |
26 | pets.columns
27 |
--------------------------------------------------------------------------------
/petfinder/test.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics import cohen_kappa_score
2 | import numpy as np
3 | import pandas as pd
4 | from functools import partial
5 | from scipy import optimize  # import the submodule explicitly; a bare 'import scipy' does not load scipy.optimize
6 |
7 | __all__ = ['OptimizedRounder']
8 |
9 | # Credits to https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
10 | class OptimizedRounder(object):
11 | def __init__(self):
12 | self.coef_ = 0
13 |
14 | def _kappa_loss(self, coef, X, y):
15 | preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
16 | return -cohen_kappa_score(y, preds, weights = 'quadratic')
17 |
18 | def fit(self, X, y):
19 | loss_partial = partial(self._kappa_loss, X = X, y = y)
20 | initial_coef = [0.5, 1.5, 2.5, 3.5]
21 | self.coef_ = optimize.minimize(loss_partial, initial_coef, method = 'nelder-mead')
22 |
23 | def predict(self, X, coef):
24 | preds = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3, 4])
25 | return preds
26 |
27 | def coefficients(self):
28 | return self.coef_['x']
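29 |
30 | # Usage sketch (mirroring the main notebook; `val_preds` would be the raw regression
31 | # outputs on the validation set and `val_y` the true adoption speeds):
32 | #
33 | #   optR = OptimizedRounder()
34 | #   optR.fit(val_preds, val_y)            # Nelder-Mead search for the best rounding thresholds
35 | #   coeff = optR.coefficients()
36 | #   rounded = optR.predict(val_preds, coeff).astype(int)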
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 | /.vscode/
106 | /logs/
107 | /models/
108 | /test/
109 | /test_images/
110 | /test_metadata/
111 | /test_sentiment/
112 | /train/
113 | /train_images/
114 | /train_metadata/
115 | /train_sentiment/
116 |
117 | /*.pkl
118 | /*.csv
119 | /*.feather
--------------------------------------------------------------------------------
/petfinder/model.py:
--------------------------------------------------------------------------------
1 | from fastai.torch_core import *
2 | from fastai.vision import *
3 | from fastai.tabular.models import *
4 | from fastai.tabular import *
5 | from fastai.layers import *
6 | from fastai.text import *
7 | from fastai.callbacks import *
8 | from fastai.metrics import *
9 | import torch
10 |
11 | __all__ = ['ImageTabularTextLearner', 'collate_mixed', 'image_tabular_text_learner', 'normalize_custom_funcs']
12 |
13 | class ImageTabularTextModel(nn.Module):
14 | def __init__(self, emb_szs:ListSizes, n_cont:int, vocab_sz:int, encoder, use_trainer):
15 | super().__init__()
16 | self.use_trainer = use_trainer
17 | self.cnn = create_body(models.resnet34)
18 | nf = num_features_model(self.cnn) * 2
19 | drop = .5
20 |
21 | self.lm_encoder = SequentialRNN(encoder[0], PoolingLinearClassifier([400 * 3] + [32], [.4]))
22 |
23 | self.tab = TabularModel(emb_szs, n_cont, 128, [512, 256])
24 |
25 | self.reduce = nn.Sequential(*([AdaptiveConcatPool2d(), Flatten()] + bn_drop_lin(nf, 512, bn=True, p=drop, actn=nn.ReLU(inplace=True))))
26 | self.merge = nn.Sequential(*bn_drop_lin(512 + 128 + 32, 128, bn=True, p=drop, actn=nn.ReLU(inplace=True)))
27 | self.final = nn.Sequential(*bn_drop_lin(128, 1, bn=False, p=0., actn=None))
28 |
29 | def forward(self, img:Tensor, x:Tensor, text:Tensor) -> Tensor:
30 | imgCnn = self.cnn(img)
31 | imgLatent = self.reduce(imgCnn)
32 | tabLatent = self.tab(x[0], x[1])
33 | textLatent = self.lm_encoder(text)
34 |
35 | cat = torch.cat([imgLatent, F.relu(tabLatent), F.relu(textLatent[0])], dim=1)
36 |
37 | pred = self.final(self.merge(cat))
38 | pred = torch.sigmoid(pred) * 4 # making sure this is in the range 0-4
39 |
40 | if(not self.use_trainer):
41 | return pred
42 | else:
43 | return pred, textLatent
44 |
45 | def reset(self):
46 | for c in self.children():
47 | if hasattr(c, 'reset'): c.reset()
48 |
49 | def collate_mixed(samples, pad_idx:int=0):
50 | # Find max length of the text from the MixedItemList
51 | max_len = max([len(s[0].data[2]) for s in samples])
52 |
53 | for s in samples:
54 | res = np.zeros(max_len + pad_idx, dtype=np.int64)
55 | res[:len(s[0].data[2])] = s[0].data[2]
56 | s[0].data[2] = res
57 |
58 | return data_collate(samples)
59 |
60 | def split_layers(model:nn.Module) -> List[nn.Module]:
61 | groups = [[model.cnn, model.lm_encoder]]
62 | groups += [[model.tab, model.reduce, model.merge, model.final]]
63 | return groups
64 |
65 | class RNNTrainerCustom(RNNTrainer):
66 | def on_loss_begin(self, last_output:Tuple[Tensor,Tensor,Tensor], **kwargs):
67 | "Save the extra outputs for later and only returns the true output."
68 | self.raw_out,self.out = last_output[1][1],last_output[1][2]
69 | return {'last_output': last_output[0]}
70 |
71 |
72 | def _normalize_images_batch(b:Tuple[Tensor,Tensor], mean:FloatTensor, std:FloatTensor)->Tuple[Tensor,Tensor]:
73 | "`b` = `x`,`y` - normalize `x` array of imgs and `do_y` optionally `y`."
74 | x,y = b
75 | mean,std = mean.to(x[0].device),std.to(x[0].device)
76 | x[0] = normalize(x[0],mean,std)
77 | return x,y
78 |
79 | def normalize_custom_funcs(mean:FloatTensor, std:FloatTensor, do_x:bool=True, do_y:bool=False)->Tuple[Callable,Callable]:
80 | "Create normalize/denormalize func using `mean` and `std`, can specify `do_y` and `device`."
81 | mean,std = tensor(mean),tensor(std)
82 | return (partial(_normalize_images_batch, mean=mean, std=std),
83 | partial(denormalize, mean=mean, std=std))
84 |
85 | class ImageTabularTextLearner(Learner):
86 | def __init__(self, data:DataBunch, model:nn.Module, use_trainer:bool=False, alpha:float=2., beta:float=1., **learn_kwargs):
87 | super().__init__(data, model, **learn_kwargs)
88 | if(use_trainer):
89 | self.callbacks.append(RNNTrainerCustom(self, alpha=alpha, beta=beta))
90 | self.split(split_layers)
91 |
92 | def image_tabular_text_learner(data, len_cont_names, vocab_sz, data_lm, use_trainer:bool=False):
93 | l = text_classifier_learner(data_lm, AWD_LSTM, drop_mult=0.5)
94 | l.load_encoder('fine_tuned_enc')
95 |
96 | emb = data.train_ds.x.item_lists[1].get_emb_szs()
97 | model = ImageTabularTextModel(emb, len_cont_names, vocab_sz, l.model, use_trainer)
98 |
99 | learn = ImageTabularTextLearner(data, model, use_trainer, metrics=[mae, rmse])
100 | return learn
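101 |
102 | # Typical wiring of the pieces above (a sketch taken from the main notebook; it assumes
103 | # `mixed` is a labelled MixedItemList of [images, tabular, text] and that `cont_names`,
104 | # `vocab` and `data_text` come from the data and language-model notebooks):
105 | #
106 | #   data = mixed.databunch(bs=32, collate_fn=collate_mixed)   # pad text to equal length per batch
107 | #   norm, denorm = normalize_custom_funcs(*imagenet_stats)
108 | #   data.add_tfm(norm)                                        # normalize only the image part
109 | #   learn = image_tabular_text_learner(data, len(cont_names), len(vocab.itos), data_text)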
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | #%% [markdown]
2 | # # Image, tabular and text data in the same deep learning model
3 | #
4 | # Deep learning has advanced tremendously in the last 2-3 years. Researchers are always pushing the boundaries of the state of the art in various sub-domains. To do that, researchers also have to specialize and immerse themselves in one domain of deep learning. We often see deep learning models handling image data, text data or structured data, but we rarely see them used together when a dataset contains them all. Datasets in the real world are much messier than academic datasets. Being able to leverage everything you have in your data can yield very interesting results.
5 | #
6 | # # Transfer Learning
7 | #
8 | # Transfer learning in deep learning has also become very popular in recent years, especially for image data: pre-trained ImageNet models let you leverage models trained on millions of images, which are expensive to train, and re-use that knowledge for other tasks. More recently, text data started having its own transfer learning moment with pre-trained models like ULMFiT, BERT and GPT-2.
9 |
10 | #%%
11 | import warnings
12 | warnings.filterwarnings('ignore')
13 |
14 | import matplotlib.pyplot as plt
15 | import numpy as np
16 | import pandas as pd
17 | import json
18 | import os
19 | import feather
20 | from fastai.text import *
21 |
22 | from petfinder.data import *
23 |
24 |
25 | #%%
26 | path = 'C:\\work\\ML\\PetFinder\\'
27 | bs=64
28 |
29 | pets = get_data()
30 | petsTest = get_data(True)
31 |
32 | # pets['IsTest'] = False
33 | # petsTest['IsTest'] = True
34 |
35 | # pets = pd.concat([pets, petsTest])
36 |
37 | # pets = feather.read_dataframe(path + 'pets.feather')
38 | data_lm = load_data(path, 'data_lm_descriptions.pkl', bs=bs)
39 |
40 | #%%
41 | from fastai.tabular import *
42 | from fastai.vision import *
43 | from fastai.metrics import *
44 | from fastai.text import *
45 |
46 | dep_var = 'AdoptionSpeed'
47 | cont_names, cat_names = cont_cat_split(pets, 50, dep_var=dep_var)
48 | procs = [FillMissing, Categorify, Normalize]
49 | cat_names.remove('Filename')
50 | cat_names.remove('PicturePath')
51 | cat_names.remove('PetID')
52 | cat_names.remove('Description')
53 |
54 |
55 | #%%
56 | # cont_names, cat_names
57 |
58 | #%%
59 | from petfinder.model import *
60 |
61 | #%%
62 | from fastai.callbacks import *
63 |
64 | bs = 32
65 | size = 224
66 | np.random.seed(42)
67 |
68 | data_lm = load_data(path, 'data_lm_descriptions.pkl', bs=bs)
69 | vocab = data_lm.vocab
70 |
71 | imgList = ImageList.from_df(pets, path=path, cols='PicturePath')
72 | tabList = TabularList.from_df(pets, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path)
73 | textList = TextList.from_df(pets, cols='Description', path=path, vocab=vocab)
74 |
75 | norm, denorm = normalize_custom_funcs(*imagenet_stats)
76 |
77 | if os.path.isfile(path + 'mixed_img_tab_text.pkl') != True :
78 | mixed = (MixedItemList([imgList, tabList, textList], path, inner_df=tabList.inner_df)
79 | .random_split_by_pct(.1)
80 | .label_from_df(cols='AdoptionSpeed', label_cls=CategoryList)
81 | .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=size))
82 |
83 | outfile = open(path + 'mixed_img_tab_text.pkl', 'wb')
84 | pickle.dump(mixed, outfile)
85 | outfile.close()
86 | else:
87 | infile = open(path + 'mixed_img_tab_text.pkl','rb')
88 | mixed = pickle.load(infile)
89 | infile.close()
90 |
91 |
92 | #%%
93 | # data_text = textList.random_split_by_pct(.1).label_from_df(cols='AdoptionSpeed').databunch(bs=bs)
94 | # data_text.save('text-classification-databunch.pkl')
95 | data_text = load_data(path, 'text-classification-databunch.pkl')
96 |
97 |
98 | #%%
99 | data = mixed.databunch(bs=bs, collate_fn=collate_mixed, num_workers=0)
100 | data.add_tfm(norm) # normalize images
101 |
102 |
103 | #%%
104 | cat_names = mixed.train.x.item_lists[1].cat_names
105 | cont_names = mixed.train.x.item_lists[1].cont_names
106 |
107 |
108 | #%%
109 | # from fastai.callbacks.tensorboard import LearnerTensorboardWriter
110 |
111 | learn = image_tabular_text_learner(data, len(cont_names), len(vocab.itos), data_text)
112 |
113 | learn.callback_fns +=[partial(EarlyStoppingCallback, monitor='accuracy', min_delta=0.005, patience=3)]
114 | # learn.callback_fns += [(partial(LearnerTensorboardWriter, base_dir=Path(path + 'logs\\'), name='mixed-metadata'))]
115 |
116 |
117 | #%%
118 | data.c
119 |
120 | learn.lr_find()
121 |
122 | learn.load('mixed-300')
123 |
124 | # imgList = ImageList.from_df(petsTest, path=path, cols='PicturePath')
125 | # tabList = TabularList.from_df(petsTest, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path)
126 | # textList = TextList.from_df(petsTest, cols='Description', path=path, vocab=vocab)
127 |
128 | # norm, denorm = normalize_custom_funcs(*imagenet_stats)
129 |
130 | # mixedTest = (MixedItemList([imgList, tabList, textList], path, inner_df=tabList.inner_df))
131 |
132 | # learn = load_learner(path, 'mixed.pkl', test=mixedTest)
133 |
134 | pets['IsTest'] = False
135 | petsTest['IsTest'] = True
136 |
137 | petsAll = pd.concat([pets, petsTest])
138 | petsAll.loc[petsAll.IsTest == True, 'AdoptionSpeed'] = -1  # mask the test labels (chained assignment would not modify petsAll)
139 |
140 | imgListTest = ImageList.from_df(petsAll, path=path, cols='PicturePath')
141 | tabListTest = TabularList.from_df(petsAll, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path)
142 | textListTest = TextList.from_df(petsAll, cols='Description', path=path, vocab=vocab)
143 |
144 | mixedTest = (MixedItemList([imgListTest, tabListTest, textListTest], path, inner_df=tabListTest.inner_df)
145 | .split_from_df(col='IsTest')
146 | .label_from_df(cols='AdoptionSpeed', label_cls=CategoryList)
147 | .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=size))
148 |
149 | #%%
150 | # learn.lr_find()
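151 |
152 | #%%
153 | # Training sketch (mirrors the main notebook; the schedule and callback below are illustrative):
154 | # learn.freeze()
155 | # learn.fit_one_cycle(4, 1e-3, callbacks=SaveModelCallback(learn, every='improvement',
156 | #                     mode='min', monitor='mean_absolute_error', name='mixed'))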
--------------------------------------------------------------------------------
/petfinder/data.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import pandas as pd
4 | import json
5 | import os
6 | import feather
7 | from pathlib import Path
8 | from pandas.io.json import json_normalize
9 | from tqdm import tqdm
10 |
11 | __all__ = ['get_data', 'quadratic_weighted_kappa']
12 |
13 | def get_data(isTest:bool=False, useMetadata:bool=False):
14 | name = 'train'
15 | if(isTest):
16 | name = 'test'
17 |
18 | p = Path('.')
19 |
20 | petsFeather = 'pets_' + name + '.feather'
21 | if os.path.isfile(petsFeather) != True:
22 | pets = pd.read_csv(name + '\\' + name + '.csv')
23 |
24 | pImages = p / (name + '_images')
25 | pSentiments = p / (name + '_sentiment')
26 |
27 | images = [x for x in pImages.iterdir()]
28 | images = pd.DataFrame([x for x in map(lambda x: (x.name.split('.')[0].split('-')[0], x.name), images)], columns=['PetID', 'Filename'])
29 |
30 | petsImages = pd.merge(pets, images, how='left', on='PetID')
31 |
32 | petsImages['NoImage'] = petsImages['Filename'].isna()
33 | petsImages['Filename'] = petsImages['Filename'].fillna('..\\NoImage.jpg')
34 |
35 | byRescuerCount = pets.groupby(['RescuerID']).PetID.nunique().reset_index().rename({'PetID': 'RescuerDogCount'}, axis=1)
36 | petsImages = pd.merge(petsImages, byRescuerCount, how='left', on='RescuerID')
37 |
38 | cat = ['Type', 'Name', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2', 'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'State', 'RescuerID']
39 | cont = ['Age', 'Fee', 'Quantity', 'RescuerDogCount', 'VideoAmt', 'PhotoAmt']
40 | for x in cat:
41 | petsImages[x] = petsImages[x].astype('category')
42 | for x in cont:
43 | petsImages[x] = petsImages[x].astype('float')
44 |
45 | petsImages['PicturePath'] = petsImages.apply(lambda x: str(name + '_images\\' + x['Filename']), axis=1)
46 | sentimentJsons = [x for x in pSentiments.iterdir()]
47 |
48 | petsSentiments = pd.DataFrame()
49 | sentiment_feather = name + '_sentiments.feather'
50 | if os.path.isfile(sentiment_feather) != True:
51 | for s in tqdm(sentimentJsons, desc='Sentiments'):
52 | with open(s, encoding='utf8') as json_data:
53 | d = json.load(json_data)
54 | df = json_normalize(d['sentences'])
55 | line = {}
56 | m = df.mean().to_dict()
57 | line['PetID'] = s.name.split('.')[0]
58 | line['AvgSentenceSentimentMagnitude'] = m['sentiment.magnitude']
59 | line['AvgSentenceSentimentScore'] = m['sentiment.score']
60 | line['SentimentMagnitude'] = d['documentSentiment']['magnitude']
61 | line['SentimentScore'] = d['documentSentiment']['score']
62 | petsSentiments = petsSentiments.append(line, ignore_index=True)
63 |
64 | petsSentiments = petsSentiments.reset_index(drop=True)
65 | petsSentiments.to_feather(sentiment_feather)
66 | else:
67 | petsSentiments = feather.read_dataframe(sentiment_feather)
68 |
69 | pets = pd.merge(petsImages, petsSentiments, how='left', on='PetID')
70 |
71 | if(useMetadata):
72 | petsMetadata = pd.DataFrame()
73 | meta_feather = name + '_metadata.feather'
74 | if os.path.isfile(meta_feather) != True:
75 | pMetadata = p / (name + '_metadata')
76 | metadataJsons = [x for x in pMetadata.iterdir()]
77 |
78 | lst = []
79 | errors = []
80 | for s in tqdm(metadataJsons, desc='Metadata'):
81 | with open(s, encoding='utf8') as json_data:
82 | try:
83 | d = json.load(json_data)
84 | df = json_normalize(d['labelAnnotations'])
85 | df = df.set_index('description').T
86 | df['PetID'] = s.name.split('-')[0]
87 | lst.append(df.loc['score'].to_dict())
88 | except:
89 | errors.append(s.name)
90 | petsMetadata = pd.DataFrame(lst)
91 | petsMetadata = petsMetadata.groupby('PetID').mean()
92 | petsMetadata = petsMetadata.fillna(0)
93 |
94 | petsMetadata = petsMetadata.reset_index()
95 | petsMetadata.to_feather(meta_feather)
96 | else:
97 | petsMetadata = feather.read_dataframe(meta_feather)
98 |
99 | pets = pd.merge(pets, petsMetadata, how='left', on='PetID')
100 |
101 | pets['NoDescription'] = pets['Description'].isna()
102 | pets['Description'] = pets['Description'].fillna('No description')
103 |
104 | # state GDP: https://en.wikipedia.org/wiki/List_of_Malaysian_states_by_GDP
105 | state_gdp = {
106 | 41336: 116.679,
107 | 41325: 40.596,
108 | 41367: 23.02,
109 | 41401: 190.075,
110 | 41415: 5.984,
111 | 41324: 37.274,
112 | 41332: 42.389,
113 | 41335: 52.452,
114 | 41330: 67.629,
115 | 41380: 5.642,
116 | 41327: 81.284,
117 | 41345: 80.167,
118 | 41342: 121.414,
119 | 41326: 280.698,
120 | 41361: 32.270
121 | }
122 |
123 | # state population: https://en.wikipedia.org/wiki/Malaysia
124 | state_population = {
125 | 41336: 33.48283,
126 | 41325: 19.47651,
127 | 41367: 15.39601,
128 | 41401: 16.74621,
129 | 41415: 0.86908,
130 | 41324: 8.21110,
131 | 41332: 10.21064,
132 | 41335: 15.00817,
133 | 41330: 23.52743,
134 | 41380: 2.31541,
135 | 41327: 15.61383,
136 | 41345: 32.06742,
137 | 41342: 24.71140,
138 | 41326: 54.62141,
139 | 41361: 10.35977
140 | }
141 |
142 | pets["state_gdp"] = pets['State'].map(state_gdp)
143 | pets["state_population"] = pets['State'].map(state_population)
144 | pets["gdp_vs_population"] = pets["state_gdp"] / pets["state_population"]
145 |
146 | pets = pets.reset_index(drop=True)
147 |
148 | pets.to_feather(petsFeather)
149 |
150 | return pets
151 | else:
152 | pets = feather.read_dataframe(petsFeather)
153 |
154 | return pets
155 |
156 | # The following 3 functions have been taken from Ben Hamner's github repository
157 | # https://github.com/benhamner/Metrics
158 | def Cmatrix(rater_a, rater_b, min_rating=None, max_rating=None):
159 | """
160 | Returns the confusion matrix between rater's ratings
161 | """
162 | assert(len(rater_a) == len(rater_b))
163 | if min_rating is None:
164 | min_rating = min(rater_a + rater_b)
165 | if max_rating is None:
166 | max_rating = max(rater_a + rater_b)
167 | num_ratings = int(max_rating - min_rating + 1)
168 | conf_mat = [[0 for i in range(num_ratings)]
169 | for j in range(num_ratings)]
170 | for a, b in zip(rater_a, rater_b):
171 | conf_mat[a - min_rating][b - min_rating] += 1
172 | return conf_mat
173 |
174 |
175 | def histogram(ratings, min_rating=None, max_rating=None):
176 | """
177 | Returns the counts of each type of rating that a rater made
178 | """
179 | if min_rating is None:
180 | min_rating = min(ratings)
181 | if max_rating is None:
182 | max_rating = max(ratings)
183 | num_ratings = int(max_rating - min_rating + 1)
184 | hist_ratings = [0 for x in range(num_ratings)]
185 | for r in ratings:
186 | hist_ratings[r - min_rating] += 1
187 | return hist_ratings
188 |
189 |
190 | def quadratic_weighted_kappa(y, y_pred):
191 | """
192 | Calculates the quadratic weighted kappa
193 | quadratic_weighted_kappa calculates the quadratic weighted kappa
194 | value, which is a measure of inter-rater agreement between two raters
195 | that provide discrete numeric ratings. Potential values range from -1
196 | (representing complete disagreement) to 1 (representing complete
197 | agreement). A kappa value of 0 is expected if all agreement is due to
198 | chance.
199 | quadratic_weighted_kappa(rater_a, rater_b), where rater_a and rater_b
200 | each correspond to a list of integer ratings. These lists must have the
201 | same length.
202 | The ratings should be integers, and it is assumed that they contain
203 | the complete range of possible ratings.
204 | quadratic_weighted_kappa(X, min_rating, max_rating), where min_rating
205 | is the minimum possible rating, and max_rating is the maximum possible
206 | rating
207 | """
208 | rater_a = y
209 | rater_b = y_pred
210 | min_rating=None
211 | max_rating=None
212 | rater_a = np.array(rater_a, dtype=int)
213 | rater_b = np.array(rater_b, dtype=int)
214 | assert(len(rater_a) == len(rater_b))
215 | if min_rating is None:
216 | min_rating = min(min(rater_a), min(rater_b))
217 | if max_rating is None:
218 | max_rating = max(max(rater_a), max(rater_b))
219 | conf_mat = Cmatrix(rater_a, rater_b,
220 | min_rating, max_rating)
221 | num_ratings = len(conf_mat)
222 | num_scored_items = float(len(rater_a))
223 |
224 | hist_rater_a = histogram(rater_a, min_rating, max_rating)
225 | hist_rater_b = histogram(rater_b, min_rating, max_rating)
226 |
227 | numerator = 0.0
228 | denominator = 0.0
229 |
230 | for i in range(num_ratings):
231 | for j in range(num_ratings):
232 | expected_count = (hist_rater_a[i] * hist_rater_b[j]
233 | / num_scored_items)
234 | d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)
235 | numerator += d * conf_mat[i][j] / num_scored_items
236 | denominator += d * expected_count / num_scored_items
237 |
238 | return (1.0 - numerator / denominator)
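239 |
240 | # Quick sanity checks (illustrative): perfect agreement yields a kappa of 1.0, while
241 | # disagreements are penalized quadratically by how far apart the two ratings are.
242 | #   quadratic_weighted_kappa([0, 1, 2, 3, 4], [0, 1, 2, 3, 4])   # -> 1.0
243 | #   quadratic_weighted_kappa([0, 1, 2, 3, 4], [1, 2, 3, 4, 4])   # -> < 1.0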
--------------------------------------------------------------------------------
/Structured Only.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "The autoreload extension is already loaded. To reload it, use:\n",
13 | " %reload_ext autoreload\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import warnings\n",
19 | "warnings.filterwarnings('ignore')\n",
20 | "\n",
21 | "import matplotlib.pyplot as plt\n",
22 | "import numpy as np\n",
23 | "import pandas as pd\n",
24 | "import json\n",
25 | "import os\n",
26 | "import feather\n",
27 | "from fastai.text import *\n",
28 | "\n",
29 | "from petfinder.data import *\n",
30 | "\n",
31 | "%matplotlib inline\n",
32 | "%load_ext autoreload\n",
33 | "%autoreload 2"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 18,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "path = 'C:\\\\work\\\\ML\\\\PetFinder\\\\'\n",
43 | "bs=64\n",
44 | "\n",
45 | "pets = get_data(isTest=False)\n",
46 | "petsTest = get_data(isTest=True)\n",
47 | "\n",
48 | "petsTest['AdoptionSpeed'] = 0\n",
49 | "\n",
50 | "pets.AdoptionSpeed = pets.AdoptionSpeed.astype(float)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 19,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "from fastai.tabular import *\n",
60 | "from fastai.vision import *\n",
61 | "from fastai.metrics import *\n",
62 | "from fastai.text import *\n",
63 | "\n",
64 | "dep_var = 'AdoptionSpeed'\n",
65 | "cont_names, cat_names = cont_cat_split(pets, dep_var=dep_var)\n",
66 | "procs = [FillMissing, Categorify, Normalize]\n",
67 | "cat_names.remove('Filename')\n",
68 | "cat_names.remove('PicturePath')\n",
69 | "cat_names.remove('PetID')\n",
70 | "cat_names.remove('Description')"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 20,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/plain": [
81 | "(['Type',\n",
82 | " 'Name',\n",
83 | " 'Breed1',\n",
84 | " 'Breed2',\n",
85 | " 'Gender',\n",
86 | " 'Color1',\n",
87 | " 'Color2',\n",
88 | " 'Color3',\n",
89 | " 'MaturitySize',\n",
90 | " 'FurLength',\n",
91 | " 'Vaccinated',\n",
92 | " 'Dewormed',\n",
93 | " 'Sterilized',\n",
94 | " 'Health',\n",
95 | " 'State',\n",
96 | " 'RescuerID',\n",
97 | " 'NoImage',\n",
98 | " 'NoDescription'],\n",
99 | " ['Age',\n",
100 | " 'Quantity',\n",
101 | " 'Fee',\n",
102 | " 'VideoAmt',\n",
103 | " 'PhotoAmt',\n",
104 | " 'RescuerDogCount',\n",
105 | " 'AvgSentenceSentimentMagnitude',\n",
106 | " 'AvgSentenceSentimentScore',\n",
107 | " 'SentimentMagnitude',\n",
108 | " 'SentimentScore',\n",
109 | " 'state_gdp',\n",
110 | " 'state_population',\n",
111 | " 'gdp_vs_population'])"
112 | ]
113 | },
114 | "execution_count": 20,
115 | "metadata": {},
116 | "output_type": "execute_result"
117 | }
118 | ],
119 | "source": [
120 | "cat_names, cont_names"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 21,
126 | "metadata": {},
127 | "outputs": [],
128 | "source": [
129 | "byPetID = pets.groupby('PetID').size().reset_index()\n",
130 | "byPetID = byPetID.sample(frac=.1).drop([0], axis=1)\n",
131 | "byPetID['IsValidation'] = True\n",
132 | "pets = pd.merge(pets, byPetID, how='left', on='PetID')\n",
133 | "pets.IsValidation = pets.IsValidation.fillna(False)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 29,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "data = (TabularList.from_df(pets, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)\n",
143 | " .split_from_df(col='IsValidation')\n",
144 | " .label_from_df(cols=dep_var, label_cls=FloatList)\n",
145 | " .databunch())"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 30,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "kappa = KappaScore()\n",
155 | "kappa.weights = \"quadratic\"\n",
156 | "learn = tabular_learner(data, layers=[200,100], metrics=[rmse], y_range=[0, 4])\n",
157 | "# learn.loss = MSELossFlat\n",
158 | "\n",
159 | "learn = learn.to_fp16()"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 31,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "# learn.lr_find()\n",
169 | "# learn.recorder.plot()"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 32,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "data": {
179 | "text/html": [
180 | "Total time: 05:13
\n",
181 | " \n",
182 | "
\n",
183 | "
epoch
\n",
184 | "
train_loss
\n",
185 | "
valid_loss
\n",
186 | "
root_mean_squared_error
\n",
187 | "
time
\n",
188 | "
\n",
189 | " \n",
190 | " \n",
191 | "
\n",
192 | "
0
\n",
193 | "
0.427018
\n",
194 | "
1.196536
\n",
195 | "
1.072282
\n",
196 | "
01:22
\n",
197 | "
\n",
198 | "
\n",
199 | "
1
\n",
200 | "
0.217284
\n",
201 | "
1.124343
\n",
202 | "
1.037635
\n",
203 | "
01:16
\n",
204 | "
\n",
205 | "
\n",
206 | "
2
\n",
207 | "
0.097555
\n",
208 | "
1.137039
\n",
209 | "
1.043076
\n",
210 | "
01:16
\n",
211 | "
\n",
212 | "
\n",
213 | "
3
\n",
214 | "
0.048981
\n",
215 | "
1.193035
\n",
216 | "
1.066295
\n",
217 | "
01:17
\n",
218 | "
\n",
219 | " \n",
220 | "
"
221 | ],
222 | "text/plain": [
223 | ""
224 | ]
225 | },
226 | "metadata": {},
227 | "output_type": "display_data"
228 | }
229 | ],
230 | "source": [
231 | "learn.fit_one_cycle(4, 1e-2)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 33,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "preds,y = learn.get_preds(ds_type=DatasetType.Valid)"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 35,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "preds = preds.numpy().round()"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 36,
255 | "metadata": {},
256 | "outputs": [],
257 | "source": [
258 | "# preds = torch.softmax(preds, dim=1).argmax(1).numpy()"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": 37,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "predictions = pets[pets.IsValidation == True]\n",
268 | "predictions['Prediction'] = preds\n",
269 | "predictions = predictions.groupby('PetID').mean().round()[['Prediction', 'AdoptionSpeed']]\n",
270 | "preds, y = predictions['Prediction'], predictions['AdoptionSpeed']"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 38,
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "data": {
280 | "text/plain": [
281 | "0.372985251366838"
282 | ]
283 | },
284 | "execution_count": 38,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "quadratic_weighted_kappa(preds, y)"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 16,
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "data": {
300 | "text/plain": [
301 | "(58652, 33)"
302 | ]
303 | },
304 | "execution_count": 16,
305 | "metadata": {},
306 | "output_type": "execute_result"
307 | }
308 | ],
309 | "source": [
310 | "pets.shape"
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 30,
316 | "metadata": {},
317 | "outputs": [],
318 | "source": [
319 | "name='train'\n",
320 | "p = pd.read_csv(name + '\\\\' + name + '.csv')"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 31,
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "(14993, 24)"
332 | ]
333 | },
334 | "execution_count": 31,
335 | "metadata": {},
336 | "output_type": "execute_result"
337 | }
338 | ],
339 | "source": [
340 | "p.shape"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {},
347 | "outputs": [],
348 | "source": []
349 | }
350 | ],
351 | "metadata": {
352 | "kernelspec": {
353 | "display_name": "Python 3",
354 | "language": "python",
355 | "name": "python3"
356 | },
357 | "language_info": {
358 | "codemirror_mode": {
359 | "name": "ipython",
360 | "version": 3
361 | },
362 | "file_extension": ".py",
363 | "mimetype": "text/x-python",
364 | "name": "python",
365 | "nbconvert_exporter": "python",
366 | "pygments_lexer": "ipython3",
367 | "version": "3.7.1"
368 | }
369 | },
370 | "nbformat": 4,
371 | "nbformat_minor": 2
372 | }
373 |
--------------------------------------------------------------------------------
/Fastai PetFinder.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Merge image, structured and text data in the same neural net with fast.ai\n",
8 | "\n",
9 | "In this notebook we will predict the adoption speed of pets in the [PetFinder Kaggle competition](https://www.kaggle.com/c/petfinder-adoption-prediction/). This competition give access to tree kind of data, **image** of the pets, **structured** data like their age, breed, color etc and finally **text** data in the form of a description of the pet.\n",
10 | "\n",
11 | "It would be very interesting to be able to merge all this data inside the same neural network so that the network can use whatever information from all data to actually predictic how fast a pet is going to get adopted.\n",
12 | "\n",
13 | "Keep in mind that **this is my first Kaggle competition**, so I might not be using the best strategies or validation schemes, but I just wanted to explore this idea of merging different type of data inside the same neural network.\n",
14 | "\n",
15 | "## Fast.ai\n",
16 | "We are going to use fast.ai to do that because it offers a lot of stuff we need to do this. Mainly a very intuitive [data block](https://docs.fast.ai/data_block.html) that we will use to get our various data from disk, line them up and pass them as input to our neural network. It also provide with easily accessible pre-trained models we will be able to use for our tasks.\n",
17 | "\n",
18 | "## Leveraging pre-trained models\n",
19 | ""
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import warnings\n",
29 | "warnings.filterwarnings('ignore')\n",
30 | "\n",
31 | "import matplotlib.pyplot as plt\n",
32 | "import numpy as np\n",
33 | "import pandas as pd\n",
34 | "import json\n",
35 | "import os\n",
36 | "import feather\n",
37 | "from fastai.text import *\n",
38 | "\n",
39 | "from petfinder.data import *\n",
40 | "\n",
41 | "%matplotlib inline\n",
42 | "%load_ext autoreload\n",
43 | "%autoreload 2"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "metadata": {},
49 | "source": [
50 | "# Get the structured data\n",
51 | "The method get_data contains all the data wrangling rather boring stuff. We open the structured data train.csv where we have information for each pet (identified by a PetID). We have information like the age of the pet, the breed, the color, was it vaccinated, a textual description of the pet etc. The PetFinder competition also ran the description inside the google sentiment analysis service and provided us with that. I use some of this information and create some new columns for that too.\n",
52 | "\n",
53 | "We also find images in the train_images folder. We create a dataframe where we have a row containing the PetID of the image and the path on disk of the image. We then merge this dataframe to the main structured data by PetID. This yield a dataframe with one row per image where all the structured information about the pet is there for each row.\n",
54 | "\n",
55 | "Kaggle also provided some metadata for each pet, but I didn't spend the time parsing those files...\n",
56 | "\n",
57 | "We have to predict between 5 AdoptionSpeed. This is a classification problem, but a lot of people in the competition used a regression and then found the best rounding using the class OptimizedRounder at the of this notebook. I tried using multi-class classification with this model but didn't have good results."
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 2,
63 | "metadata": {},
64 | "outputs": [],
65 | "source": [
66 | "path = 'C:\\\\work\\\\ML\\\\PetFinder\\\\'\n",
67 | "bs=64\n",
68 | "\n",
69 | "pets = get_data(isTest=False)\n",
70 | "petsTest = get_data(isTest=True)\n",
71 | "\n",
72 | "pets.AdoptionSpeed = pets.AdoptionSpeed.astype(float)\n",
73 | "\n",
74 | "petsTest['AdoptionSpeed'] = 0"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "# Language Model\n",
82 | "\n",
83 | "See the notebook *PetFinder Language Model* on how we train and fine tune a text language model on the pet description"
84 | ]
85 | },
86 | {
87 | "cell_type": "markdown",
88 | "metadata": {},
89 | "source": [
90 | "# Structured data\n",
91 | "\n",
92 | "Here we have some decisions to make for our structured variables. We need to decide which one is going to be a categorical variable and which one is going to be contiuous.\n",
93 | "\n",
94 | "Even if a variable is a number doesnt mean it should be continuous variable. If the variable only contains a small amount of unique values, it might be better to model it as a categorical variable. We can use [embeddings](https://www.fast.ai/2018/04/29/categorical-embeddings/) for categorical data which will allow us to learn a far richer representation for them and is sometimes more powerful than using a continuous variable.\n",
95 | "\n",
96 | "Fastai takes care of defining those embeddings size, it also fill missing values and normalize the structured data for the neural network."
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 3,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "from fastai.tabular import *\n",
106 | "from fastai.vision import *\n",
107 | "from fastai.metrics import *\n",
108 | "from fastai.text import *\n",
109 | "\n",
110 | "dep_var = 'AdoptionSpeed'\n",
111 | "cont_names, cat_names = cont_cat_split(pets, dep_var=dep_var, max_card=10)\n",
112 | "procs = [FillMissing, Categorify, Normalize]\n",
113 | "cat_names.remove('Filename')\n",
114 | "cat_names.remove('PicturePath')\n",
115 | "cat_names.remove('PetID')\n",
116 | "cat_names.remove('Description')\n",
117 | "\n",
118 | "# for name in cont_names:\n",
119 | "# pets[name] = np.log(pets[name] - pets[name].min() + 1)"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 4,
125 | "metadata": {},
126 | "outputs": [
127 | {
128 | "data": {
129 | "text/plain": [
130 | "(['Age',\n",
131 | " 'Quantity',\n",
132 | " 'Fee',\n",
133 | " 'VideoAmt',\n",
134 | " 'PhotoAmt',\n",
135 | " 'RescuerDogCount',\n",
136 | " 'AvgSentenceSentimentMagnitude',\n",
137 | " 'AvgSentenceSentimentScore',\n",
138 | " 'SentimentMagnitude',\n",
139 | " 'SentimentScore',\n",
140 | " 'state_gdp',\n",
141 | " 'state_population',\n",
142 | " 'gdp_vs_population'],\n",
143 | " ['Type',\n",
144 | " 'Name',\n",
145 | " 'Breed1',\n",
146 | " 'Breed2',\n",
147 | " 'Gender',\n",
148 | " 'Color1',\n",
149 | " 'Color2',\n",
150 | " 'Color3',\n",
151 | " 'MaturitySize',\n",
152 | " 'FurLength',\n",
153 | " 'Vaccinated',\n",
154 | " 'Dewormed',\n",
155 | " 'Sterilized',\n",
156 | " 'Health',\n",
157 | " 'State',\n",
158 | " 'RescuerID',\n",
159 | " 'NoImage',\n",
160 | " 'NoDescription'])"
161 | ]
162 | },
163 | "execution_count": 4,
164 | "metadata": {},
165 | "output_type": "execute_result"
166 | }
167 | ],
168 | "source": [
169 | "cont_names, cat_names"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 5,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "from petfinder.model import *"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "# Loading and lining up the data\n",
186 | "\n",
187 | "We want to load our data. Ideally we would like to re-use existing functionnality and not have to write custom data loader. fast.ai got us covered, thanks to the amazing [data block api](https://docs.fast.ai/data_block.html)!\n",
188 | "\n",
189 | "First we need to am ItemList per type of data. One for image, structured and text. Each of them do pre-processing to the input, keep track of processing they do on data like normalization etc.\n",
190 | "\n",
191 | "But then we merge them using a MixedItemList. MixedItemList simply get an item from each ItemList it contains and merge them together into one Item. Then when fast.ai pass data to our model in the forward method, we can expect as many input as we have ItemList in our MixedItemList.\n",
192 | "\n",
193 | "I pickle the MixedItemList to avoid having to recompute it when I reload the notebook because some of the ItemList pre-processing can be long (like TextItemList)."
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 6,
199 | "metadata": {},
200 | "outputs": [],
201 | "source": [
202 | "byPetID = pets.groupby('PetID').size().reset_index()\n",
203 | "byPetID = byPetID.sample(frac=.1, random_state=42).drop([0], axis=1)\n",
204 | "byPetID['IsValidation'] = True\n",
205 | "pets = pd.merge(pets, byPetID, how='left', on='PetID')\n",
206 | "pets.IsValidation = pets.IsValidation.fillna(False)"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 7,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "from fastai.callbacks import *\n",
216 | "\n",
217 | "bs = 32\n",
218 | "size = 224\n",
219 | "np.random.seed(42)\n",
220 | "\n",
221 | "data_lm = load_data(path, 'data_lm_descriptions.pkl', bs=bs)\n",
222 | "vocab = data_lm.vocab\n",
223 | "\n",
224 | "imgList = ImageList.from_df(pets, path=path, cols='PicturePath')\n",
225 | "tabList = TabularList.from_df(pets, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path)\n",
226 | "textList = TextList.from_df(pets, cols='Description', path=path, vocab=vocab)\n",
227 | "\n",
228 | "if os.path.isfile(path + 'mixed_img_tab_text.pkl') != True :\n",
229 | " mixed = (MixedItemList([imgList, tabList, textList], path, inner_df=tabList.inner_df)\n",
230 | " .split_from_df(col='IsValidation')\n",
231 | " .label_from_df(cols='AdoptionSpeed', label_cls=FloatList)\n",
232 | " .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=size))\n",
233 | "\n",
234 | " outfile = open(path + 'mixed_img_tab_text.pkl', 'wb')\n",
235 | " pickle.dump(mixed, outfile)\n",
236 | " outfile.close()\n",
237 | "else:\n",
238 | " infile = open(path + 'mixed_img_tab_text.pkl','rb')\n",
239 | " mixed = pickle.load(infile)\n",
240 | " infile.close()"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "This makes a text databunch used later on to create our learner (for the text portion of our learner). We need this to construct a pre-trained RNN for classification."
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 8,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "if os.path.isfile(path + 'text-classification-databunch.pkl'):\n",
257 | " data_text = load_data(path, 'text-classification-databunch.pkl')\n",
258 | "else:\n",
259 | " petsAll = pd.concat([pets, petsTest])\n",
260 | " petsAll = petsAll.dropna(subset=['Description'])\n",
261 | " \n",
262 | " data_text = (TextList.from_df(petsAll, cols='Description', path=path, vocab=vocab)).split_none().label_from_df(cols='AdoptionSpeed').databunch(bs=bs)\n",
263 | " data_text.save('text-classification-databunch.pkl')"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "# Special functions\n",
271 | "Neural network frameworks like to process data in batches. Batches have to have a pre-defined size. In our case we are using image and structured data which should always have the same size, but our text data can vary in size. The description for each pet will be different.\n",
272 | "\n",
273 | "We have to modify some function in fastai to make it work with our inputs. First since we are using a pre-trained resnet34 network for our images, we need to normalize our images using statistics from ImageNet. But the normalize method for images from fastai expects a certain tensor shape. We need to create a custom normalize function to take into account our custom tensor shape.\n",
274 | "\n",
275 | "Each row in our batch will contain an array of stuff, first the image data, then the structured data and last the text data.\n",
276 | "\n",
277 | "``` python\n",
278 | "\n",
279 | "def _normalize_images_batch(b:Tuple[Tensor,Tensor], mean:FloatTensor, std:FloatTensor)->Tuple[Tensor,Tensor]:\n",
280 | " \"`b` = `x`,`y` - normalize `x` array of imgs and `do_y` optionally `y`.\"\n",
281 | " x,y = b\n",
282 | " mean,std = mean.to(x[0].device),std.to(x[0].device)\n",
283 | " x[0] = normalize(x[0],mean,std)\n",
284 | " return x,y\n",
285 | "\n",
286 | "def normalize_custom_funcs(mean:FloatTensor, std:FloatTensor, do_x:bool=True, do_y:bool=False)->Tuple[Callable,Callable]:\n",
287 | " \"Create normalize/denormalize func using `mean` and `std`, can specify `do_y` and `device`.\"\n",
288 | " mean,std = tensor(mean),tensor(std)\n",
289 | " return (partial(_normalize_images_batch, mean=mean, std=std),\n",
290 | " partial(denormalize, mean=mean, std=std))\n",
291 | "```\n",
292 | "\n",
293 | "**collate_mixed** is the method responsible to take a batch with variable size rows (because of the variable Description text size) and make them all of equal length so that we can have uniform batch sizes. We basically find the row in the batch which have to longest text, take its length and make all other rows the same length by padding them with zeroes at the end.\n",
294 | "\n",
295 | "``` python\n",
296 | "def collate_mixed(samples, pad_idx:int=0):\n",
297 | " # Find max length of the text from the MixedItemList\n",
298 | " max_len = max([len(s[0].data[2]) for s in samples])\n",
299 | "\n",
300 | " for s in samples:\n",
301 | " res = np.zeros(max_len + pad_idx, dtype=np.int64)\n",
302 | " res[:len(s[0].data[2])] = s[0].data[2]\n",
303 | " s[0].data[2] = res\n",
304 | "\n",
305 | " return data_collate(samples)\n",
306 | "```\n",
307 | "\n",
308 | "Then we transform our MixedItemList into a databunch with our collate function for equal size batches and we also normalize the images using our custom normalize function from earlier."
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": 9,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "data = mixed.databunch(bs=bs, collate_fn=collate_mixed)\n",
318 | "\n",
319 | "norm, denorm = normalize_custom_funcs(*imagenet_stats)\n",
320 | "data.add_tfm(norm) # normalize images"
321 | ]
322 | },
323 | {
324 | "cell_type": "markdown",
325 | "metadata": {},
326 | "source": [
327 | "When fastai process your structured data, it creates new columns for any columns that had NaN values. This new column is True when the other column was NaN, otherwise false. If you want to use those columns, simply uncomment the next cell."
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 10,
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "# cat_names = mixed.train.x.item_lists[1].cat_names\n",
337 | "# cont_names = mixed.train.x.item_lists[1].cont_names"
338 | ]
339 | },
340 | {
341 | "cell_type": "markdown",
342 | "metadata": {},
343 | "source": [
344 | "# Custom model\n",
345 | "Here is the custom PyTorch model I created. It expects a list of embeddings size for each categorical variable (emb_szs), the number of continuous variable (n_cont), the size of the text vocabulary for the language model and finally we have our pre-trained language model encoder that gets passed (encoder).\n",
346 | "\n",
347 | "**self.cnn** is responsible for the image data. Notice the we use AdaptiveConcatPool2d to be able to have any image size as input.\n",
348 | "\n",
349 | "**self.lm_encoder** is responsible for the text data. It uses our fine-tuned language model encoder we trained in the notebook PetFinder Language Model.\n",
350 | "\n",
351 | "**self.tab** is responsible for the structured data. It will create embeddings for categorical variables.\n",
352 | "\n",
353 | "**self.reduce** is simply to reduce the size of the output of the cnn to a more manageable size.\n",
354 | "\n",
355 | "Once the data is passed through each specialist network (cnn, encoder and tabular), we concatenate their output into a single vector.\n",
356 | "\n",
357 | "**self.merge and self.final** are then responsible to reduce this concatenated vector to the final size of 5 which is the number of possible AdoptionSpeed we want to predict. AdoptionSpeed is a categorical variable with 5 unique values.\n",
358 | "\n",
359 | "**use_trainer** is set to true if we are using RNNTrainer\n",
360 | "\n",
361 | "The **reset** method is used to reset the internal state of the RNN in self.lm_encoder.\n",
362 | "\n",
363 | "We are outputing one output for regression and forcing it in the range 0-4.\n",
364 | "\n",
365 | "``` python\n",
366 | "class ImageTabularTextModel(nn.Module):\n",
367 | " def __init__(self, emb_szs:ListSizes, n_cont:int, vocab_sz:int, encoder, use_trainer):\n",
368 | " super().__init__()\n",
369 | " self.use_trainer = use_trainer\n",
370 | " self.cnn = create_body(models.resnet34)\n",
371 | " nf = num_features_model(self.cnn) * 2\n",
372 | " drop = .5\n",
373 | "\n",
374 | " self.lm_encoder = SequentialRNN(encoder[0], PoolingLinearClassifier([400 * 3] + [32], [.4]))\n",
375 | "\n",
376 | " self.tab = TabularModel(emb_szs, n_cont, 128, [512, 256])\n",
377 | "\n",
378 | " self.reduce = nn.Sequential(*([AdaptiveConcatPool2d(), Flatten()] + bn_drop_lin(nf, 512, bn=True, p=drop, actn=nn.ReLU(inplace=True))))\n",
379 | " self.merge = nn.Sequential(*bn_drop_lin(512 + 128 + 32, 128, bn=True, p=drop, actn=nn.ReLU(inplace=True)))\n",
380 | " self.final = nn.Sequential(*bn_drop_lin(128, 1, bn=False, p=0., actn=None))\n",
381 | "\n",
382 | " def forward(self, img:Tensor, x:Tensor, text:Tensor) -> Tensor:\n",
383 | " imgCnn = self.cnn(img)\n",
384 | " imgLatent = self.reduce(imgCnn)\n",
385 | " tabLatent = self.tab(x[0], x[1])\n",
386 | " textLatent = self.lm_encoder(text)\n",
387 | "\n",
388 | " cat = torch.cat([imgLatent, F.relu(tabLatent), F.relu(textLatent[0])], dim=1)\n",
389 | "\n",
390 | " pred = self.final(self.merge(cat))\n",
391 | " pred = torch.sigmoid(pred) * 4 # making sure this is in the range 0-4\n",
392 | "\n",
393 | " if(not self.use_trainer):\n",
394 | " return pred\n",
395 | " else:\n",
396 | " return pred, textLatent\n",
397 | " \n",
398 | " def reset(self):\n",
399 | " for c in self.children():\n",
400 | " if hasattr(c, 'reset'): c.reset()\n",
401 | "```\n",
402 | "\n",
403 | "# Custom learner functions\n",
404 | "\n",
405 | "We need a split_layer function to tell fastai how to split the layers when doing [discriminative learning rates](https://towardsdatascience.com/understanding-learning-rates-and-how-it-improves-performance-in-deep-learning-d0d4059c1c10). This is also what determines which layer to freeze when when we call the Learner.freeze method. This one could certainly be better... Looking at other split layers for the pre-trained RNN and reset, we should probably structure this differently.\n",
406 | "\n",
407 | "``` python\n",
408 | "def split_layers(model:nn.Module) -> List[nn.Module]:\n",
409 | " groups = [[model.cnn, model.lm_encoder]]\n",
410 | " groups += [[model.tab, model.reduce, model.merge, model.final]]\n",
411 | " return groups\n",
412 | "```\n",
413 | "\n",
414 | "We create our custom Learner class to be able to set some custom parameters. I added an option to use RNNTrainer which is supposed to help if the language model is overfitting. It is based on the [AWD_LSTM paper](https://arxiv.org/abs/1708.02182). I had to modify the default version because of how I was passing data to it.\n",
415 | "\n",
416 | "``` python\n",
417 | "class RNNTrainerCustom(RNNTrainer):\n",
418 | " def on_loss_begin(self, last_output:Tuple[Tensor,Tensor,Tensor], **kwargs):\n",
419 | " \"Save the extra outputs for later and only returns the true output.\"\n",
420 | " self.raw_out,self.out = last_output[1][1],last_output[1][2]\n",
421 | " return {'last_output': last_output[0]}\n",
422 | "\n",
423 | "class ImageTabularTextLearner(Learner):\n",
424 | " def __init__(self, data:DataBunch, model:nn.Module, use_trainer:bool=False, alpha:float=2., beta:float=1., **learn_kwargs):\n",
425 | " super().__init__(data, model, **learn_kwargs)\n",
426 | " if(use_trainer):\n",
427 | " self.callbacks.append(RNNTrainerCustom(self, alpha=alpha, beta=beta))\n",
428 | " self.split(split_layers)\n",
429 | "```\n",
430 | "\n",
431 | "Finally an helper method constructing our model and learner. We use the text_classifier_learner method from fastai to construct a pre-trained language model where we load our fine-tuned encoder. This method returns a learner though, but we only care about the model it returns which we use in our own model.\n",
432 | "\n",
433 | "The metric this Kaggle competition [evaluate on the quadratic weighted kappa](https://www.kaggle.com/c/petfinder-adoption-prediction/overview/evaluation). So we will track it to see how we are doing.\n",
434 | "\n",
435 | "``` python\n",
436 | "def image_tabular_text_learner(data, len_cont_names, vocab_sz, data_lm, use_trainer:bool=False):\n",
437 | " l = text_classifier_learner(data_lm, AWD_LSTM, drop_mult=0.5)\n",
438 | " l.load_encoder('fine_tuned_enc')\n",
439 | "\n",
440 | " emb = data.train_ds.x.item_lists[1].get_emb_szs()\n",
441 | " model = ImageTabularTextModel(emb, len_cont_names, vocab_sz, l.model, use_trainer)\n",
442 | "\n",
443 | " learn = ImageTabularTextLearner(data, model, use_trainer, metrics=[mae])\n",
444 | " return learn\n",
445 | "```"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": 11,
451 | "metadata": {},
452 | "outputs": [],
453 | "source": [
454 | "learn = image_tabular_text_learner(data, len(cont_names), len(vocab.itos), data_text, use_trainer=True)"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 12,
460 | "metadata": {},
461 | "outputs": [],
462 | "source": [
463 | "# learn.callback_fns +=[partial(EarlyStoppingCallback, monitor='accuracy', min_delta=0.005, patience=3)]\n",
464 | "# learn.callback_fns += [(partial(LearnerTensorboardWriter, base_dir=Path(path + 'logs\\\\'), name='mixed-metadata'))]"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": 13,
470 | "metadata": {},
471 | "outputs": [
472 | {
473 | "data": {
474 | "text/plain": [
475 | "1"
476 | ]
477 | },
478 | "execution_count": 13,
479 | "metadata": {},
480 | "output_type": "execute_result"
481 | }
482 | ],
483 | "source": [
484 | "data.c"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 14,
490 | "metadata": {},
491 | "outputs": [],
492 | "source": [
493 | "# learn.lr_find()\n",
494 | "# learn.recorder.plot()"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 15,
500 | "metadata": {},
501 | "outputs": [],
502 | "source": [
503 | "lr = 1e-3"
504 | ]
505 | },
506 | {
507 | "cell_type": "code",
508 | "execution_count": 16,
509 | "metadata": {},
510 | "outputs": [
511 | {
512 | "data": {
513 | "text/html": [
514 | "Total time: 20:36
\n",
666 | " "
667 | ],
668 | "text/plain": [
669 | ""
670 | ]
671 | },
672 | "metadata": {},
673 | "output_type": "display_data"
674 | },
675 | {
676 | "name": "stdout",
677 | "output_type": "stream",
678 | "text": [
679 | "Better model found at epoch 0 with mean_absolute_error value: 0.8574932813644409.\n"
680 | ]
681 | }
682 | ],
683 | "source": [
684 | "learn.unfreeze()\n",
685 | "learn.fit_one_cycle(4, max_lr=slice(1e-6,1e-4), callbacks=SaveModelCallback(learn, every='improvement', mode='min', monitor='mean_absolute_error', name='mixed-unfrozen'))"
686 | ]
687 | },
688 | {
689 | "cell_type": "code",
690 | "execution_count": null,
691 | "metadata": {},
692 | "outputs": [],
693 | "source": [
694 | "learn.load('mixed')"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 24,
700 | "metadata": {},
701 | "outputs": [],
702 | "source": [
703 | "p,y = learn.get_preds(ds_type=DatasetType.Valid)"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 25,
709 | "metadata": {},
710 | "outputs": [],
711 | "source": [
712 | "from petfinder.test import *"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 37,
718 | "metadata": {},
719 | "outputs": [],
720 | "source": [
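721 | "# Search for rounding thresholds on the validation predictions; they are used below to turn the continuous outputs into discrete AdoptionSpeed classes\n",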
721 | "optR = OptimizedRounder()\n",
722 | "optR.fit(p.numpy()[:, 0], y.numpy())\n",
723 | "coeff = optR.coefficients()"
724 | ]
725 | },
726 | {
727 | "cell_type": "code",
728 | "execution_count": 69,
729 | "metadata": {},
730 | "outputs": [],
731 | "source": [
732 | "preds = optR.predict(p.numpy()[:, 0], coeff).astype(int)"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 71,
738 | "metadata": {},
739 | "outputs": [],
740 | "source": [
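741 | "# Each pet can have several rows (one per picture), so average the predictions back to one value per PetID\n",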
741 | "predictions = pets[pets.IsValidation == True][['PetID', 'AdoptionSpeed']]\n",
742 | "predictions['Prediction'] = preds\n",
743 | "predictions = predictions.groupby('PetID').mean()[['Prediction', 'AdoptionSpeed']]\n",
744 | "# preds, y = predictions['Prediction'], predictions['AdoptionSpeed']"
745 | ]
746 | },
747 | {
748 | "cell_type": "code",
749 | "execution_count": 72,
750 | "metadata": {},
751 | "outputs": [
752 | {
753 | "data": {
754 | "text/plain": [
755 | "0.4232357217350937"
756 | ]
757 | },
758 | "execution_count": 72,
759 | "metadata": {},
760 | "output_type": "execute_result"
761 | }
762 | ],
763 | "source": [
764 | "quadratic_weighted_kappa(predictions['Prediction'], predictions['AdoptionSpeed'])"
765 | ]
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 | "# Generating a submission for the competition\n",
772 | "\n",
773 | "Unfortunately, fastai's export does not support MixedItemList yet. So to run my code on the test set, I had to trick fastai into thinking that the test set is actually the validation set: I simply set all the labels of the test set to 0."
774 | ]
775 | },
776 | {
777 | "cell_type": "code",
778 | "execution_count": 74,
779 | "metadata": {},
780 | "outputs": [],
781 | "source": [
782 | "pets['IsTest'] = False\n",
783 | "petsTest['IsTest'] = True\n",
784 | "petsTest['AdoptionSpeed'] = 0\n",
785 | "\n",
786 | "petsAll = pd.concat([pets, petsTest])"
787 | ]
788 | },
789 | {
790 | "cell_type": "markdown",
791 | "metadata": {},
792 | "source": [
793 | "This is pretty much the same code as for training, but here we use .split_from_df(col='IsTest') to tell fastai that the validation set consists only of the rows in the dataframe where the IsTest column is True."
794 | ]
795 | },
796 | {
797 | "cell_type": "code",
798 | "execution_count": 75,
799 | "metadata": {},
800 | "outputs": [],
801 | "source": [
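802 | "# Same item lists as for training, but built over train+test; the text list reuses the training vocabulary\n",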
802 | "imgListTest = ImageList.from_df(petsAll, path=path, cols='PicturePath')\n",
803 | "tabListTest = TabularList.from_df(petsAll, cat_names=cat_names, cont_names=cont_names, procs=procs, path=path)\n",
804 | "textListTest = TextList.from_df(petsAll, cols='Description', path=path, vocab=vocab)\n",
805 | "\n",
806 | "mixedTest = (MixedItemList([imgListTest, tabListTest, textListTest], path, inner_df=tabListTest.inner_df)\n",
807 | " .split_from_df(col='IsTest')\n",
808 | " .label_from_df(cols='AdoptionSpeed', label_cls=FloatList)\n",
809 | " .transform([[get_transforms()[0], [], []], [get_transforms()[1], [], []]], size=size))"
810 | ]
811 | },
812 | {
813 | "cell_type": "code",
814 | "execution_count": null,
815 | "metadata": {},
816 | "outputs": [],
817 | "source": [
818 | "dataTest = mixedTest.databunch(bs=bs, collate_fn=collate_mixed)\n",
819 | "dataTest.add_tfm(norm) # normalize images\n",
820 | "\n",
821 | "learn = image_tabular_text_learner(dataTest, len(cont_names), len(vocab.itos), data_text, use_trainer=True)\n",
822 | "learn.load('mixed')"
823 | ]
824 | },
825 | {
826 | "cell_type": "code",
827 | "execution_count": 107,
828 | "metadata": {},
829 | "outputs": [],
830 | "source": [
831 | "preds,y = learn.get_preds(ds_type=DatasetType.Valid)"
832 | ]
833 | },
834 | {
835 | "cell_type": "code",
836 | "execution_count": 112,
837 | "metadata": {},
838 | "outputs": [],
839 | "source": [
840 | "p,y = preds.numpy()[:, 0], y.numpy()"
841 | ]
842 | },
843 | {
844 | "cell_type": "code",
845 | "execution_count": 118,
846 | "metadata": {},
847 | "outputs": [],
848 | "source": [
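849 | "# Reuse the thresholds fitted on the validation set to round the test-set predictions\n",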
849 | "optR = OptimizedRounder()\n",
850 | "preds = optR.predict(p, coeff).astype(int)"
851 | ]
852 | },
853 | {
854 | "cell_type": "code",
855 | "execution_count": 139,
856 | "metadata": {},
857 | "outputs": [],
858 | "source": [
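859 | "# Build the submission: average the per-picture predictions for each PetID and write PetID + AdoptionSpeed to CSV\n",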
859 | "predictions = petsTest\n",
860 | "predictions['AdoptionSpeed'] = preds\n",
861 | "predictions = predictions.groupby('PetID').mean()['AdoptionSpeed'].reset_index()\n",
862 | "predictions['AdoptionSpeed'] = predictions['AdoptionSpeed'].astype(int)\n",
863 | "predictions.to_csv('submission.csv', index=False)"
864 | ]
865 | },
866 | {
867 | "cell_type": "code",
868 | "execution_count": null,
869 | "metadata": {},
870 | "outputs": [],
871 | "source": []
872 | }
873 | ],
874 | "metadata": {
875 | "kernelspec": {
876 | "display_name": "Python 3",
877 | "language": "python",
878 | "name": "python3"
879 | },
880 | "language_info": {
881 | "codemirror_mode": {
882 | "name": "ipython",
883 | "version": 3
884 | },
885 | "file_extension": ".py",
886 | "mimetype": "text/x-python",
887 | "name": "python",
888 | "nbconvert_exporter": "python",
889 | "pygments_lexer": "ipython3",
890 | "version": "3.7.1"
891 | }
892 | },
893 | "nbformat": 4,
894 | "nbformat_minor": 2
895 | }
896 |
--------------------------------------------------------------------------------
/PetFinder Language Model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import warnings\n",
10 | "warnings.filterwarnings('ignore')\n",
11 | "\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import numpy as np\n",
14 | "import pandas as pd\n",
15 | "import json\n",
16 | "import os\n",
17 | "import feather\n",
18 | "from fastai.text import *\n",
19 | "\n",
20 | "from petfinder.data import *\n",
21 | "\n",
22 | "%matplotlib inline\n",
23 | "%load_ext autoreload\n",
24 | "%autoreload 2"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "path = 'C:\\\\work\\\\ML\\\\PetFinder\\\\'\n",
34 | "bs=64\n",
35 | "\n",
36 | "pets = get_data(isTest=False)\n",
37 | "petsTest = get_data(isTest=True)\n",
38 | "\n",
39 | "petsTest['AdoptionSpeed'] = 0"
40 | ]
41 | },
42 | {
43 | "cell_type": "markdown",
44 | "metadata": {},
45 | "source": [
46 | "# Language Model\n",
47 | "\n",
48 | "First let's handle our language model. In fastai, you can use a language model pre-trained on Wikipedia text (the ULMFiT approach) to predict the next word in a sentence. In our case we don't actually want to predict the next word; we want to reuse what this language model has learned to help with our task of predicting how fast a pet will get adopted.\n",
49 | "\n",
50 | "In fastai, you can fine-tune this pre-trained model on your own corpus of text to make it better at handling your own domain: in our case, descriptions of pets.\n",
51 | "\n",
52 | "We are going to use the descriptions from both the training and test sets so that the model has as much text as possible to fine-tune on.\n",
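53 | "\n",
54 | "Fine-tuning itself will use fastai's language_model_learner on the DataBunch we build below. As a rough sketch (the hyperparameter values here are illustrative, not necessarily the ones used in this notebook):\n",
55 | "\n",
56 | "``` python\n",
57 | "learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)  # pre-trained AWD-LSTM\n",
58 | "learn.fit_one_cycle(1, 1e-2, moms=(0.8, 0.7))   # train the new head first\n",
59 | "learn.unfreeze()\n",
60 | "learn.fit_one_cycle(10, 1e-3, moms=(0.8, 0.7))  # then fine-tune the whole language model\n",
61 | "```"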
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "petsAll = pd.concat([pets, petsTest])\n",
62 | "petsAll = petsAll.dropna(subset=['Description'])\n",
63 | "\n",
64 | "descriptions = petsAll.groupby(['PetID', 'Description']).size().to_frame().reset_index().set_index('PetID')"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {},
70 | "source": [
71 | "Then let's prepare a DataBunch for those descriptions. In the background, fastai tokenizes the text and numericalizes it to make it usable by the neural network. This DataBunch will also give us our vocabulary (which words the language model knows) later on, when we use it in our network."
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
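80 | "# Tokenize and numericalize the descriptions, holding out a random 10% as the language model's validation set\n",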
80 | "data_lm = (TextList.from_df(descriptions, cols='Description').split_by_rand_pct(0.1).label_for_lm().databunch(bs=bs, path=path))\n",
81 | "data_lm.save('data_lm_descriptions.pkl')"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 5,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "data_lm = load_data(path, 'data_lm_descriptions.pkl', bs=bs)"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 6,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "data": {
100 | "text/html": [
101 | "idx  text\n",
102 | "0    beach .. xxmaj adopters must vaccinate , spay and keep puppy indoors / fenced xxmaj call / whatsapp : xxmaj address : teluk xxunk xxbos xxmaj cherry was a stray dog we rescued from the streets . xxmaj she behaves like a total house pet , who would believe she used to stray for a year plus ! xxmaj cherry loves to be indoor , loves to be near human\n",
103 | "1    fine ) . xxmaj the owner is xxmaj korean lady , a mother of 2 babies - 18 months and 6 months and she 's pregnant again so planning to go back to xxmaj korea . xxmaj please contact if you are interested . xxbos xxmaj she is very friendly and cute . xxmaj because i want to move , the new apartment can not raise a cat , no\n",
104 | "2    was rescued . i have no heart to put him back on the street after neuturing . xxmaj he will not be able to survive . i knw for a fact , he has an owner before he was abandoned . xxmaj will you be able to give xxmaj luke a forever home .. let him feel the love he used to have ? xxmaj give me a call at\n",
105 | "3    including xxmaj xxunk xxup xxunk have to direct part order from xxmaj xxunk and waited for 3 weeks long to reached xxmaj malaysia . xxmaj ha ! xxmaj ha ! xxmaj ha ! xxup what a xxup good xxup laugh xxbos xxmaj happy yappy 4 month old puppies were dumped at the pet clinic mercilessly . 4 months later , these 4 puppies have grown to be the lovable xxmaj\n",
106 | "4    facilitate the adopter to be entitled for the starter pack , cos posting in a group only entitles one adopter . xxbos xxmaj nak bagi pet ni sebab dah banyak sgt . lagipun mase x cukup nak jaga .. my pet betul2 tak terurus sekarang ... call . i tinggal area segambut , kl . xxup telah xxup selamat xxup di xxup rumah xxup baru xxrep 4 . xxbos xxmaj"
237 | ],
238 | "text/plain": [
239 | "<IPython.core.display.HTML object>"
240 | ]
241 | },
242 | "metadata": {},
243 | "output_type": "display_data"
244 | }
245 | ],
246 | "source": [
247 | "learn.fit_one_cycle(1, lr, moms=(.8, .7))"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 11,
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "data": {
257 | "text/plain": [
258 | "LanguageLearner(data=TextLMDataBunch;\n",
259 | "\n",
260 | "Train: LabelList (17069 items)\n",
261 | "x: LMTextList\n",
262 | "xxbos xxmaj xxunk was rescued from the construction site behind my house . xxmaj he is quite the manja type and loves to play . xxmaj he makes a good companion and playmate for young children . xxmaj he is quite the handsome chap with a distinct mark on his face like a beauty mark .,xxbos went to teluk xxunk xxunk restaurant saw this female puppies alone by the beach .. xxmaj adopters must vaccinate , spay and keep puppy indoors / fenced xxmaj call / whatsapp : xxmaj address : teluk xxunk,xxbos xxmaj cherry was a stray dog we rescued from the streets . xxmaj she behaves like a total house pet , who would believe she used to stray for a year plus ! xxmaj cherry loves to be indoor , loves to be near human , loves human touches , loves other dogs . xxmaj we are looking for a forever home that could continue letting xxmaj cherry do all that she loves . xxmaj we want to find xxmaj cherry a home which will treat her as a house pet and stays indoor . xxmaj do n't be mistaken , xxmaj cherry is very alert at strangers and noises at the gate , but being a watch dog should not be her full time ' job ' . xxmaj if you love a dog who loves to manja , pls call us . xxmaj home visits will be arranged with potential adopter , and adopter to reimburse spaying n vaccination costs . xxup tq . :),xxbos xxmaj this puppy rescued from xxup dbkl pound last wednesday , and been sent to vet for check out . xxmaj pls give this lovely puppy a loving home if you can .,xxbos xxmaj he is cute and fun to be with .\n",
263 | "y: LMLabelList\n",
264 | ",,,,\n",
265 | "Path: .;\n",
266 | "\n",
267 | "Valid: LabelList (1896 items)\n",
268 | "x: LMTextList\n",
269 | "xxbos xxmaj healthy puppy for adoption . xxmaj commitment to spay is compulsory . xxmaj interested to adopt pls contact xxmaj amy,xxbos a healthy , clean , sweet little girl in xxmaj xxunk . send me message if you can give her a nice home .,xxbos xxmaj looking for a xxup serious adopter for xxmaj omey . xxmaj the adoption fee only for xxunk and will be return back . xxmaj feel free to whatapps me . * xxmaj she is mixed siamese and xxmaj domestic xxmaj short xxmaj hair * xxmaj diet - xxmaj canned xxmaj food xxmaj mackerel xxmaj fussie xxmaj cat + kibbles xxmaj blackwood ( xxmaj chicken and xxmaj corn ) * xxmaj she is litter trained , but you may need to train her in the new home . * xxmaj for adopter who never have cat , i will guide you thoroughly and xxunk kibbles will be given for free .,xxbos xxmaj kitten to let go for serious adopter . xxmaj playful & xxmaj healthy . xxmaj diet : xxmaj royal xxmaj canin 32 . 1st vaccinated completed , new owner have to follow up with 2nd & 3rd vaccination for xxmaj fila . xxmaj she is litter train . xxmaj looking for experienced and serious adopter only . xxmaj price are completed with 1st vaccine , deworm , and anti - flea vaccine .,xxbos xxmaj labrador cross , huge in size at the age of 3 months , chest with a star symbol , very unique .\n",
270 | "y: LMLabelList\n",
271 | ",,,,\n",
272 | "Path: .;\n",
273 | "\n",
274 | "Test: None, model=SequentialRNN(\n",
275 | " (0): AWD_LSTM(\n",
276 | " (encoder): Embedding(9853, 400, padding_idx=1)\n",
277 | " (encoder_dp): EmbeddingDropout(\n",
278 | " (emb): Embedding(9853, 400, padding_idx=1)\n",
279 | " )\n",
280 | " (rnns): ModuleList(\n",
281 | " (0): WeightDropout(\n",
282 | " (module): LSTM(400, 1150, batch_first=True)\n",
283 | " )\n",
284 | " (1): WeightDropout(\n",
285 | " (module): LSTM(1150, 1150, batch_first=True)\n",
286 | " )\n",
287 | " (2): WeightDropout(\n",
288 | " (module): LSTM(1150, 400, batch_first=True)\n",
289 | " )\n",
290 | " )\n",
291 | " (input_dp): RNNDropout()\n",
292 | " (hidden_dps): ModuleList(\n",
293 | " (0): RNNDropout()\n",
294 | " (1): RNNDropout()\n",
295 | " (2): RNNDropout()\n",
296 | " )\n",
297 | " )\n",
298 | " (1): LinearDecoder(\n",
299 | " (decoder): Linear(in_features=400, out_features=9853, bias=True)\n",
300 | " (output_dp): RNNDropout()\n",
301 | " )\n",
302 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=WindowsPath('C:/work/ML/PetFinder'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[RNNTrainer\n",
303 | "learn: LanguageLearner(data=TextLMDataBunch;\n",
304 | "\n",
305 | "Train: LabelList (17069 items)\n",
306 | "x: LMTextList\n",
307 | "xxbos xxmaj xxunk was rescued from the construction site behind my house . xxmaj he is quite the manja type and loves to play . xxmaj he makes a good companion and playmate for young children . xxmaj he is quite the handsome chap with a distinct mark on his face like a beauty mark .,xxbos went to teluk xxunk xxunk restaurant saw this female puppies alone by the beach .. xxmaj adopters must vaccinate , spay and keep puppy indoors / fenced xxmaj call / whatsapp : xxmaj address : teluk xxunk,xxbos xxmaj cherry was a stray dog we rescued from the streets . xxmaj she behaves like a total house pet , who would believe she used to stray for a year plus ! xxmaj cherry loves to be indoor , loves to be near human , loves human touches , loves other dogs . xxmaj we are looking for a forever home that could continue letting xxmaj cherry do all that she loves . xxmaj we want to find xxmaj cherry a home which will treat her as a house pet and stays indoor . xxmaj do n't be mistaken , xxmaj cherry is very alert at strangers and noises at the gate , but being a watch dog should not be her full time ' job ' . xxmaj if you love a dog who loves to manja , pls call us . xxmaj home visits will be arranged with potential adopter , and adopter to reimburse spaying n vaccination costs . xxup tq . :),xxbos xxmaj this puppy rescued from xxup dbkl pound last wednesday , and been sent to vet for check out . xxmaj pls give this lovely puppy a loving home if you can .,xxbos xxmaj he is cute and fun to be with .\n",
308 | "y: LMLabelList\n",
309 | ",,,,\n",
310 | "Path: .;\n",
311 | "\n",
312 | "Valid: LabelList (1896 items)\n",
313 | "x: LMTextList\n",
314 | "xxbos xxmaj healthy puppy for adoption . xxmaj commitment to spay is compulsory . xxmaj interested to adopt pls contact xxmaj amy,xxbos a healthy , clean , sweet little girl in xxmaj xxunk . send me message if you can give her a nice home .,xxbos xxmaj looking for a xxup serious adopter for xxmaj omey . xxmaj the adoption fee only for xxunk and will be return back . xxmaj feel free to whatapps me . * xxmaj she is mixed siamese and xxmaj domestic xxmaj short xxmaj hair * xxmaj diet - xxmaj canned xxmaj food xxmaj mackerel xxmaj fussie xxmaj cat + kibbles xxmaj blackwood ( xxmaj chicken and xxmaj corn ) * xxmaj she is litter trained , but you may need to train her in the new home . * xxmaj for adopter who never have cat , i will guide you thoroughly and xxunk kibbles will be given for free .,xxbos xxmaj kitten to let go for serious adopter . xxmaj playful & xxmaj healthy . xxmaj diet : xxmaj royal xxmaj canin 32 . 1st vaccinated completed , new owner have to follow up with 2nd & 3rd vaccination for xxmaj fila . xxmaj she is litter train . xxmaj looking for experienced and serious adopter only . xxmaj price are completed with 1st vaccine , deworm , and anti - flea vaccine .,xxbos xxmaj labrador cross , huge in size at the age of 3 months , chest with a star symbol , very unique .\n",
315 | "y: LMLabelList\n",
316 | ",,,,\n",
317 | "Path: .;\n",
318 | "\n",
319 | "Test: None, model=SequentialRNN(\n",
320 | " (0): AWD_LSTM(\n",
321 | " (encoder): Embedding(9853, 400, padding_idx=1)\n",
322 | " (encoder_dp): EmbeddingDropout(\n",
323 | " (emb): Embedding(9853, 400, padding_idx=1)\n",
324 | " )\n",
325 | " (rnns): ModuleList(\n",
326 | " (0): WeightDropout(\n",
327 | " (module): LSTM(400, 1150, batch_first=True)\n",
328 | " )\n",
329 | " (1): WeightDropout(\n",
330 | " (module): LSTM(1150, 1150, batch_first=True)\n",
331 | " )\n",
332 | " (2): WeightDropout(\n",
333 | " (module): LSTM(1150, 400, batch_first=True)\n",
334 | " )\n",
335 | " )\n",
336 | " (input_dp): RNNDropout()\n",
337 | " (hidden_dps): ModuleList(\n",
338 | " (0): RNNDropout()\n",
339 | " (1): RNNDropout()\n",
340 | " (2): RNNDropout()\n",
341 | " )\n",
342 | " )\n",
343 | " (1): LinearDecoder(\n",
344 | " (decoder): Linear(in_features=400, out_features=9853, bias=True)\n",
345 | " (output_dp): RNNDropout()\n",
346 | " )\n",
347 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=WindowsPath('C:/work/ML/PetFinder'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(\n",
348 | " (0): WeightDropout(\n",
349 | " (module): LSTM(400, 1150, batch_first=True)\n",
350 | " )\n",
351 | " (1): RNNDropout()\n",
352 | "), Sequential(\n",
353 | " (0): WeightDropout(\n",
354 | " (module): LSTM(1150, 1150, batch_first=True)\n",
355 | " )\n",
356 | " (1): RNNDropout()\n",
357 | "), Sequential(\n",
358 | " (0): WeightDropout(\n",
359 | " (module): LSTM(1150, 400, batch_first=True)\n",
360 | " )\n",
361 | " (1): RNNDropout()\n",
362 | "), Sequential(\n",
363 | " (0): Embedding(9853, 400, padding_idx=1)\n",
364 | " (1): EmbeddingDropout(\n",
365 | " (emb): Embedding(9853, 400, padding_idx=1)\n",
366 | " )\n",
367 | " (2): LinearDecoder(\n",
368 | " (decoder): Linear(in_features=400, out_features=9853, bias=True)\n",
369 | " (output_dp): RNNDropout()\n",
370 | " )\n",
371 | ")], add_time=True, silent=None)\n",
372 | "alpha: 2.0\n",
373 | "beta: 1.0], layer_groups=[Sequential(\n",
374 | " (0): WeightDropout(\n",
375 | " (module): LSTM(400, 1150, batch_first=True)\n",
376 | " )\n",
377 | " (1): RNNDropout()\n",
378 | "), Sequential(\n",
379 | " (0): WeightDropout(\n",
380 | " (module): LSTM(1150, 1150, batch_first=True)\n",
381 | " )\n",
382 | " (1): RNNDropout()\n",
383 | "), Sequential(\n",
384 | " (0): WeightDropout(\n",
385 | " (module): LSTM(1150, 400, batch_first=True)\n",
386 | " )\n",
387 | " (1): RNNDropout()\n",
388 | "), Sequential(\n",
389 | " (0): Embedding(9853, 400, padding_idx=1)\n",
390 | " (1): EmbeddingDropout(\n",
391 | " (emb): Embedding(9853, 400, padding_idx=1)\n",
392 | " )\n",
393 | " (2): LinearDecoder(\n",
394 | " (decoder): Linear(in_features=400, out_features=9853, bias=True)\n",
395 | " (output_dp): RNNDropout()\n",
396 | " )\n",
397 | ")], add_time=True, silent=None)"
398 | ]
399 | },
400 | "execution_count": 11,
401 | "metadata": {},
402 | "output_type": "execute_result"
403 | }
404 | ],
405 | "source": [
406 | "learn.save('fit_head')\n",
407 | "learn.load('fit_head')"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 12,
413 | "metadata": {},
414 | "outputs": [
415 | {
416 | "data": {
417 | "text/html": [
418 | "Total time: 09:54\n",
419 | "epoch  train_loss  valid_loss  accuracy  time\n",
420 | "0      3.376002    3.337966    0.360892  00:59\n",
421 | "1      3.218935    3.198084    0.380810  00:58\n",
422 | "2      3.088321    3.087903    0.397807  00:58\n",
423 | "3      2.900379    3.020042    0.408955  01:00\n",
424 | "4      2.775925    2.984735    0.416367  00:59\n",
425 | "5      2.651721    2.965952    0.419984  00:59\n",
426 | "6      2.546253    2.963663    0.422052  01:00\n",
427 | "7      2.471917    2.963549    0.422827  00:59\n",
428 | "8      2.399469    2.967543    0.423779  00:59\n",
429 | "9      2.376323    2.971359    0.423615  01:00"
501 | ],
502 | "text/plain": [
503 | "<IPython.core.display.HTML object>"
504 | ]
505 | },
506 | "metadata": {},
507 | "output_type": "display_data"
508 | }
509 | ],
510 | "source": [
511 | "learn.unfreeze()\n",
512 | "learn.fit_one_cycle(10, 1e-3, moms=(0.8,0.7))"
513 | ]
514 | },
515 | {
516 | "cell_type": "markdown",
517 | "metadata": {},
518 | "source": [
519 | "Here the most important part is save_encoder. We are saving the part of the language model responsible for encoding a sentence into a tensor of information. We are going to use this fine-tuned encoder later in another part of the neural network."
520 | ]
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": 13,
525 | "metadata": {},
526 | "outputs": [],
527 | "source": [
528 | "learn.save('fine_tuned')\n",
529 | "learn.save_encoder('fine_tuned_enc')"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": null,
535 | "metadata": {},
536 | "outputs": [],
537 | "source": []
538 | }
539 | ],
540 | "metadata": {
541 | "kernelspec": {
542 | "display_name": "Python 3",
543 | "language": "python",
544 | "name": "python3"
545 | },
546 | "language_info": {
547 | "codemirror_mode": {
548 | "name": "ipython",
549 | "version": 3
550 | },
551 | "file_extension": ".py",
552 | "mimetype": "text/x-python",
553 | "name": "python",
554 | "nbconvert_exporter": "python",
555 | "pygments_lexer": "ipython3",
556 | "version": "3.7.1"
557 | }
558 | },
559 | "nbformat": 4,
560 | "nbformat_minor": 2
561 | }
562 |
--------------------------------------------------------------------------------