├── language-model ├── inltk │ ├── __init__.py │ └── tokenizer.py ├── embeddings.tsv ├── embeddings_metadata.tsv ├── embeddings_transformer.tsv ├── embeddings_transformer_metadata.tsv ├── embedding_projector_config.json ├── embedding_projector_transformer_config.json ├── Malyalam_Language_Model_Transformer.ipynb └── Malyalam_Language_Model_ULMFiT.ipynb ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── datasets-preparation └── get-all-article-links-for-malyalam-wikipedia.ipynb ├── tokenizer └── Malyalam Tokenization.ipynb └── classification └── Malyalam_Classification_Model.ipynb /language-model/inltk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.tsv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /language-model/embeddings.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0509f1b101ebcace773f9eccdd9be921002e751a4288804e64d49b410a8a1755 3 | size 44800254 4 | -------------------------------------------------------------------------------- /language-model/embeddings_metadata.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fd8271d6b64b47143fc1c9f993ba016097473def375edfe0896552ae8adb292c 3 | size 206263 4 | -------------------------------------------------------------------------------- /language-model/embeddings_transformer.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:19c1637aa657771404126967d7350e4275602a87c1eb6043de21525737b7ef16 3 | size 48413207 4 | 
"""SentencePiece-backed Malayalam tokenizer for use with fastai's ``Tokenizer``.

Usage (as in the language-model notebooks)::

    tokenizer = Tokenizer(tok_func=MalyalamTokenizer, lang='ml')
"""
from pathlib import Path
from typing import List, Optional, Union

from fastai.text import *
import sentencepiece as spm

# The trained model lives at <repo>/tokenizer/malyalam_lm.model, while this
# module lives at <repo>/language-model/inltk/tokenizer.py, so the repository
# root is two directories above this file's parent. Resolving the path from
# __file__ (instead of a hard-coded home directory) keeps the tokenizer usable
# from any checkout location.
_DEFAULT_MODEL_PATH = (
    Path(__file__).resolve().parents[2] / "tokenizer" / "malyalam_lm.model"
)


class MalyalamTokenizer(BaseTokenizer):
    """Tokenize Malayalam text into SentencePiece sub-word pieces."""

    def __init__(self, lang: str,
                 model_path: Optional[Union[str, Path]] = None):
        """Load the trained SentencePiece model.

        Args:
            lang: Language code handed in by fastai's ``Tokenizer``
                (the notebooks pass ``'ml'``); stored on ``self.lang``.
            model_path: Optional location of the ``.model`` file. Defaults to
                ``<repo>/tokenizer/malyalam_lm.model``. The notebooks
                construct this class with ``lang`` only, so a custom path can
                be bound with ``functools.partial(MalyalamTokenizer,
                model_path=...)``.

        Raises:
            FileNotFoundError: If the model file does not exist.
        """
        self.lang = lang
        resolved = Path(model_path) if model_path is not None else _DEFAULT_MODEL_PATH
        # Check up front so a missing download produces an actionable message
        # rather than an opaque error from inside sentencepiece.
        if not resolved.is_file():
            raise FileNotFoundError(
                f"SentencePiece model not found at '{resolved}'. Download the "
                "trained tokenizer (see README, 'Pretrained Models') or pass "
                "model_path explicitly."
            )
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(resolved))

    def tokenizer(self, t: str) -> List[str]:
        """Return the SentencePiece pieces for the text ``t``."""
        return self.sp.EncodeAsPieces(t)
"tensorPath": "https://media.githubusercontent.com/media/goru001/nlp-for-malyalam/master/language-model/embeddings.tsv", 10 | "metadataPath": "https://media.githubusercontent.com/media/goru001/nlp-for-malyalam/master/language-model/embeddings_metadata.tsv" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /language-model/embedding_projector_transformer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings": [ 3 | { 4 | "tensorName": "Malayalam Embedding Vectors - TransformerXL", 5 | "tensorShape": [ 6 | 10000, 7 | 410 8 | ], 9 | "tensorPath": "https://media.githubusercontent.com/media/goru001/nlp-for-malyalam/master/language-model/embeddings_transformer.tsv", 10 | "metadataPath": "https://media.githubusercontent.com/media/goru001/nlp-for-malyalam/master/language-model/embeddings_transformer_metadata.tsv" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Gaurav 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP for Malayalam 2 | 3 | This repository contains State of the Art Language models 4 | and Classifier for Malayalam, which is spoken by the Malayali people 5 | in the Indian state of Kerala and the union territories of 6 | Lakshadweep and Puducherry. 7 | 8 | The models trained here have been used in [Natural Language Toolkit for Indic Languages 9 | (iNLTK)](https://github.com/goru001/inltk) 10 | 11 | ## Dataset 12 | 13 | #### Created as part of this project 14 | 15 | 1. [Malayalam Wikipedia Articles](https://www.kaggle.com/disisbig/malayalam-wikipedia-articles) 16 | 17 | 2. [Malayalam News Dataset](https://www.kaggle.com/disisbig/malyalam-news-dataset) 18 | 19 | #### Open Source Datasets 20 | 1. 
[iNLTK Headlines Corpus - Malayalam](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) : Uses the Malayalam News Dataset prepared above 21 | 22 | ## Results 23 | 24 | ### Language Model Perplexity (on validation set) 25 | 26 | | Architecture/Dataset | Malayalam Wikipedia Articles | 27 | |:--------:|:----:| 28 | | ULMFiT | 26.39 | 29 | | TransformerXL | 25.79 | 30 | 31 | 32 | ### Classification Metrics 33 | 34 | ##### ULMFiT 35 | 36 | | Dataset | Accuracy | MCC | Notebook to Reproduce results | 37 | |:--------:|:----:|:----:|:----:| 38 | | iNLTK Headlines Corpus - Malayalam | 95.56 | 93.29 | [Link](https://github.com/goru001/nlp-for-malyalam/blob/master/classification/Malyalam_Classification_Model.ipynb) | 39 | 40 | ### Visualizations 41 | 42 | ##### Word Embeddings 43 | 44 | | Architecture | Visualization | 45 | |:--------:|:----:| 46 | | ULMFiT | [Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-malyalam/master/language-model/embedding_projector_config.json) | 47 | | TransformerXL | [Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-malyalam/master/language-model/embedding_projector_transformer_config.json) | 48 | 49 | 50 | ### Results of using Transfer Learning + Data Augmentation from iNLTK 51 | 52 | ##### On using complete training set (with Transfer learning) 53 | 54 | | Dataset | Dataset size (train, valid, test) | Accuracy | MCC | Notebook to Reproduce results | 55 | |:--------:|:----:|:----:|:----:|:----:| 56 | | iNLTK Headlines Corpus - Malayalam | (5036, 630, 630) | 95.56 | 93.29 | [Link](https://github.com/goru001/nlp-for-malyalam/blob/master/classification/Malyalam_Classification_Model.ipynb) | 57 | 58 | 59 | ##### On using 10% of training set (with Transfer learning) 60 | 61 | | Dataset | Dataset size (train, valid, test) | Accuracy | MCC | Notebook to Reproduce results | 62 | 
|:--------:|:----:|:----:|:----:|:----:| 63 | | iNLTK Headlines Corpus - Malayalam | (503, 630, 630) | 82.38 | 73.47 | [Link](https://github.com/goru001/nlp-for-malyalam/blob/master/classification/Malyalam_Classification_Model_without_aug.ipynb) | 64 | 65 | ##### On using 10% of training set (with Transfer learning + Data Augmentation) 66 | 67 | | Dataset | Dataset size (train, valid, test) | Accuracy | MCC | Notebook to Reproduce results | 68 | |:--------:|:----:|:----:|:----:|:----:| 69 | | iNLTK Headlines Corpus - Malayalam | (503, 630, 630) | 84.29 | 76.36 | [Link](https://github.com/goru001/nlp-for-malyalam/blob/master/classification/Malyalam_Classification_Model_with_aug.ipynb) | 70 | 71 | 72 | ## Pretrained Models 73 | 74 | #### Language Models 75 | Download pretrained Language Model from [here](https://drive.google.com/open?id=1QHNR6xGN8JbvPEuDRXtb18J9WbGm9AwV) 76 | 77 | 78 | #### Tokenizer 79 | 80 | Trained tokenizer using Google's [sentencepiece](https://github.com/google/sentencepiece) 81 | 82 | Download the trained model and vocabulary from [here](https://drive.google.com/open?id=1jZ1QXVEhZnlQi2zyJG_O7l2r0pW38cbe) -------------------------------------------------------------------------------- /datasets-preparation/get-all-article-links-for-malyalam-wikipedia.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from urllib.request import urlopen\n", 10 | "import pickle" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "html_doc = ''\n", 20 | "with urlopen('https://ml.wikipedia.org/wiki/%E0%B4%AA%E0%B5%8D%E0%B4%B0%E0%B4%A7%E0%B4%BE%E0%B4%A8_%E0%B4%A4%E0%B4%BE%E0%B5%BE') as response:\n", 21 | " for line in response:\n", 22 | " line = line.decode('utf-8')\n", 23 | " html_doc = html_doc + 
line.replace('\\n','')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from bs4 import BeautifulSoup\n", 33 | "soup = BeautifulSoup(html_doc, 'html.parser')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "'പ്രധാന താൾ'" 45 | ] 46 | }, 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "soup.h1.string" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 5, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "tab = soup.find(\"table\",{\"style\":\"margin-top:0em; border:2px solid #e1eaee; border-collapse:separate;font-size:90%; -moz-border-radius:10px\"})" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 6, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "anchors = tab.find_all('a')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 7, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "home_url = 'https://ml.wikipedia.org' \n", 81 | "links = [home_url + anchor['href'] for anchor in anchors]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "51" 93 | ] 94 | }, 95 | "execution_count": 8, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "len(links)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 10, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "all_links = []" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 13, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "345\n", 123 | "345\n", 
124 | "690\n", 125 | "690\n", 126 | "1035\n", 127 | "1035\n", 128 | "1308\n", 129 | "1308\n", 130 | "1653\n", 131 | "1653\n", 132 | "1775\n", 133 | "1775\n", 134 | "1815\n", 135 | "1815\n", 136 | "2160\n", 137 | "2160\n", 138 | "2505\n", 139 | "2505\n", 140 | "2850\n", 141 | "2850\n", 142 | "3195\n", 143 | "3195\n", 144 | "3540\n", 145 | "3540\n", 146 | "3595\n", 147 | "3595\n", 148 | "3595\n", 149 | "3940\n", 150 | "3940\n", 151 | "4253\n", 152 | "4253\n", 153 | "4598\n", 154 | "4598\n", 155 | "4659\n", 156 | "4659\n", 157 | "4665\n", 158 | "4665\n", 159 | "5010\n", 160 | "5010\n", 161 | "5068\n", 162 | "5068\n", 163 | "5413\n", 164 | "5413\n", 165 | "5446\n", 166 | "5446\n", 167 | "5531\n", 168 | "5531\n", 169 | "5876\n", 170 | "5876\n", 171 | "5882\n", 172 | "5882\n", 173 | "6227\n", 174 | "6227\n", 175 | "6238\n", 176 | "6238\n", 177 | "6243\n", 178 | "6243\n", 179 | "6588\n", 180 | "6588\n", 181 | "6606\n", 182 | "6606\n", 183 | "6951\n", 184 | "6951\n", 185 | "7208\n", 186 | "7208\n", 187 | "7553\n", 188 | "7553\n", 189 | "7898\n", 190 | "7898\n", 191 | "8243\n", 192 | "8243\n", 193 | "8588\n", 194 | "8588\n", 195 | "8933\n", 196 | "8933\n", 197 | "9278\n", 198 | "9278\n", 199 | "9623\n", 200 | "9623\n", 201 | "9968\n", 202 | "9968\n", 203 | "10313\n", 204 | "10313\n", 205 | "10658\n", 206 | "10658\n", 207 | "10669\n", 208 | "10669\n", 209 | "10688\n", 210 | "10688\n", 211 | "11033\n", 212 | "11033\n", 213 | "11378\n", 214 | "11378\n", 215 | "11723\n", 216 | "11723\n", 217 | "12068\n", 218 | "12068\n", 219 | "12413\n", 220 | "12413\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "# Main code\n", 226 | "prev_len = 0\n", 227 | "for link in links: \n", 228 | " while link:\n", 229 | " html_doc = ''\n", 230 | " with urlopen(link) as response:\n", 231 | " for line in response:\n", 232 | " line = line.decode('utf-8')\n", 233 | " html_doc = html_doc + line.replace('\\n','')\n", 234 | " soup = BeautifulSoup(html_doc, 'html.parser')\n", 235 | " div = 
soup.find('div',{'class':'mw-prefixindex-body'})\n", 236 | " if div:\n", 237 | " anchors = div.find_all('a');\n", 238 | " all_links = all_links + [home_url + anchor['href'] for anchor in anchors]\n", 239 | " print(len(set(all_links)))\n", 240 | " if prev_len == len(set(all_links)):\n", 241 | " break\n", 242 | " nav_div = soup.find('div',{'class':'mw-prefixindex-nav'})\n", 243 | " if nav_div and len(nav_div.find_all('a')) == 2:\n", 244 | " link = home_url + nav_div.find_all('a')[1]['href']\n", 245 | " prev_len = len(set(all_links))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 14, 251 | "metadata": {}, 252 | "outputs": [ 253 | { 254 | "data": { 255 | "text/plain": [ 256 | "12413" 257 | ] 258 | }, 259 | "execution_count": 14, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "len(set(all_links))" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 15, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/plain": [ 276 | "12413" 277 | ] 278 | }, 279 | "execution_count": 15, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "all_links = list(set(all_links)); len(all_links)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 16, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "with open('all_malyalam_wikipedia_links.pkl', 'wb') as f:\n", 295 | " pickle.dump(all_links, f)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 17, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/plain": [ 306 | "'https://ml.wikipedia.org/wiki/%E0%B4%93%E0%B4%95%E0%B5%8D%E0%B4%B2%E0%B5%BB%E0%B4%A1%E0%B5%8D_%E0%B4%AA%E0%B5%8D%E0%B4%B0%E0%B4%AD%E0%B5%81'" 307 | ] 308 | }, 309 | "execution_count": 17, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "all_links[160]" 
316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | } 325 | ], 326 | "metadata": { 327 | "kernelspec": { 328 | "display_name": "Python 3", 329 | "language": "python", 330 | "name": "python3" 331 | }, 332 | "language_info": { 333 | "codemirror_mode": { 334 | "name": "ipython", 335 | "version": 3 336 | }, 337 | "file_extension": ".py", 338 | "mimetype": "text/x-python", 339 | "name": "python", 340 | "nbconvert_exporter": "python", 341 | "pygments_lexer": "ipython3", 342 | "version": "3.6.7" 343 | } 344 | }, 345 | "nbformat": 4, 346 | "nbformat_minor": 2 347 | } 348 | -------------------------------------------------------------------------------- /tokenizer/Malyalam Tokenization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sentencepiece as spm\n", 10 | "import pickle\n", 11 | "import pathlib" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "path = pathlib.Path('/home/gaurav/PycharmProjects/nlp-for-malyalam/language-model')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "p = path.glob('MalyalamWikipediaArticles/*')\n", 30 | "files = [x for x in p if x.is_file()]" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 6, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "12388" 42 | ] 43 | }, 44 | "execution_count": 6, 45 | "metadata": {}, 46 | "output_type": "execute_result" 47 | } 48 | ], 49 | "source": [ 50 | "len(files)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 7, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 
59 | "files = [str(file) for file in files]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 8, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "flist = ','.join(files)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 9, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "True" 80 | ] 81 | }, 82 | "execution_count": 9, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "spm.SentencePieceTrainer.Train(f'--input={flist} --model_prefix=malyalam_lm --vocab_size=10000')" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 10, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "with open(path/'MalyalamWikipediaArticles/1781.pkl', 'rb') as f:\n", 98 | " text = pickle.load(f)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 11, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "'മലപ്പുറം ജില്ലയിലെ തിരൂരങ്ങാടി താലൂക്കിൽ വേങ്ങര ബ്ളോക്കിലാണ് അബ്ദുറഹിമാൻ നഗർ ഗ്രാമപഞ്ചായത്ത് സ്ഥിതി ചെയ്യുന്നത്. അബ്ദുറഹിമാൻ നഗർ വില്ലേജുപരിധിയിൽ ഉൾപ്പെടുന്ന അബ്ദുറഹിമാൻ നഗർ ഗ്രാമപഞ്ചായത്തിനു 14.83 ചതുരശ്രകിലോമീറ്റർ വിസ്തീർണ്ണമുണ്ട്.\\nപഞ്ചായത്തിന്റെ അതിരുകൾ വടക്കു ഭാഗത്ത് തേഞ്ഞിപ്പലം, കണ്ണമംഗലം, മൂന്നിയൂർ പഞ്ചായത്തുകളും, കിഴക്കുഭാഗത്ത് വേങ്ങര, കണ്ണമംഗലം പഞ്ചായത്തുകളും, തെക്കുഭാഗത്ത് തിരൂരങ്ങാടി, വേങ്ങര പഞ്ചായത്തുകളും, പടിഞ്ഞാറുഭാഗത്ത് മൂന്നിയൂർ, തിരൂരങ്ങാടി, തേഞ്ഞിപ്പലം പഞ്ചായത്തുകളുമാണ്.കോഴിക്കോട് വിമാനത്താവളത്തിൽ നിന്നും കോഴിക്കോട് സർവ്വകലാശാലയിൽ നിന്നും 9 കിലോമീറ്റർ സമദൂരത്തായി സ്ഥിതി ചെയ്യുന്ന ഒരു കൊച്ചുഗ്രാമമാണ് അബ്ദുറഹിമാൻ നഗർ ഗ്രാമപഞ്ചായത്ത്. പശ്ചിമഘട്ടത്തിൽ നിന്നുത്ഭവിച്ച് മലപ്പുറം ജില്ലയിലെ വിവിധ പ്രദേശങ്ങളിലൂടെ ഒഴുകി അറബിക്കടലിൽ ചേരുന്ന കടലുണ്ടിപ്പുഴയുടെ തീരത്ത് സ്ഥിതി ചെയ്യുന്ന ഈ പഞ്ചായത്തിനു അയൽപഞ്ചായത്തുകളുടെ പകുതി വിസ്തൃതിയേ ഉള്ളൂ. കൊടുവായൂർ എന്ന പേരിലാണ് ആദ്യകാലങ്ങളിൽ ഈ ഗ്രാമം അറിയപ്പെട്ടിരുന്നത്. 
കടലുണ്ടിപുഴ, പട്ടിശ്ശേരിപാടം, പെരുവള്ളൂർപാടം, കുറ്റൂർപാടം എന്നിവയാൽ ചുറ്റപ്പെട്ടുകിടക്കുന്ന ഈ ഗ്രാമം വർഷകാലങ്ങളിൽ ഒരു ദ്വീപിന്റെ പ്രതീതി സൃഷ്ടിക്കുമായിരുന്നു.\\n1963 ഡിസംബർ 4-നാണ് പഞ്ചായത്തിലേക്ക് ആദ്യമായി തെരഞ്ഞെടുപ്പ് നടന്നത്. 1956-ൽ കേരള സംസ്ഥാനം നിലവിൽ വരുന്ന കാലഘട്ടം വരെ ഈ ഗ്രാമം മദിരാശി സംസ്ഥാനത്തിന്റെ ഭാഗമായിരുന്നു. മദിരാശി അസംബ്ളിയിലേക്ക് നടന്ന തെരഞ്ഞടുപ്പിൽ, ഈ ഗ്രാമവാസികൾ കോട്ടക്കൽ ഫർക്കയിലായിരുന്നു ഉൾപ്പെട്ടിരുന്നത്. മണ്ഡലങ്ങൾ വീണ്ടും വിഭജിക്കപ്പെട്ടതോടെ ഈ ഗ്രാമം തിരൂരങ്ങാടി നിയോജക മണ്ഡലത്തിൽ ഉൾപ്പെട്ടു.\\nകോൺഗ്രസ് പ്രസ്ഥാനത്തിന് വളരെയേറെ വേരുകളുള്ള ഒരു ഗ്രാമമായിരുന്നു കൊടുവായൂർ. സ്വാതന്ത്ര്യസമരനായകൻ അബ്ദുറഹിമാൻ സാഹിബിന്റെയും സഹപ്രവർത്തകരുടെയും പ്രവർത്തനമേഖല കൂടിയായിരുന്നു ഈ പ്രദേശം. എന്ത് പേര് സ്വീകരിക്കണമെന്ന കാര്യത്തിൽ അഭിപ്രായവ്യത്യാസമുണ്ടായിരുന്നങ്കിലും അന്നത്തെ പ്രബലകക്ഷികളായ കോൺഗ്രസും മുസ്ളീംലീഗും പഞ്ചായത്തിന്റെ പേരു മാറ്റണം എന്ന കാര്യത്തിൽ ഒരേ അഭിപ്രായക്കാരായിരുന്നു. കൊടുവായൂരിലെ കോൺഗ്രസ് നേതാവും എ.ആർ.നഗറിലെ പ്രഥമ പ്രസിഡന്റുമായിരുന്ന വി.അഹമ്മദ് ആസാദ് ഈ ആവശ്യത്തിനു വേണ്ടി ഉറച്ചുപ്രവർത്തിച്ചു. മാറിവരുന്ന പേരു അബ്ദുറഹിമാൻ സാഹിബിന്റേത് ആയിരിക്കണമെന്ന് അക്കാലത്ത് ആസാദ് കോൺഗ്രസ് കമ്മിറ്റിയിൽ ഉന്നയിക്കുകയും പ്രദേശ് കോൺഗ്രസ് കമ്മിറ്റിയെ കൊണ്ട് ഈ പേര് താത്വികമായി അംഗീകരിപ്പിക്കുകയും ചെയ്തു. 1962 ലാണ് കൊടുവായൂരിന്റെ പേര് അബ്ദുറഹിമാൻ നഗർ എന്നാക്കി വിജ്ഞാപനം പുറപ്പെടുവിച്ചത്. തുടർന്ന് നടന്ന പ്രവർത്തനഫലമായി വി.കെ.പടി പോസ്റ്റോഫീസ് അബ്ദുറഹിമാൻ നഗർ പോസ്റ്റാഫീസാക്കി മാറ്റി. 1969 കാലഘട്ടം വരെ വില്ലേജിന്റെ പേര് കൊടുവായൂർ എന്നുതന്നെ നിലനിന്നുപോന്നു. 
1969-ലെ സർക്കാരാണ് കൊടുവായൂർ വില്ലേജിന്റെ പേരു അബ്ദുറഹിമാൻ നഗർ എന്നാക്കിമാറ്റിയത്.\\n\\n'" 110 | ] 111 | }, 112 | "execution_count": 11, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "text" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 12, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "sp = spm.SentencePieceProcessor()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 13, 133 | "metadata": {}, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "True" 139 | ] 140 | }, 141 | "execution_count": 13, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "sp.Load(\"malyalam_lm.model\")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 14, 153 | "metadata": {}, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "'▁മലപ്പുറം ▁ജില്ലയിലെ ▁തിര ൂര ങ്ങാടി ▁താലൂക്കിൽ ▁വേങ്ങ ര ▁ബ്ളോക്കി ലാണ് ▁അബ്ദു റ ഹി മാൻ ▁ന ഗർ ▁ഗ്രാമപഞ്ചായത്ത് ▁സ്ഥിതി ▁ചെയ്യുന്നത് . ▁അബ്ദു റ ഹി മാൻ ▁ന ഗർ ▁വില്ലേജ ു പരിധി യിൽ ▁ഉൾപ്പെടുന്ന ▁അബ്ദു റ ഹി മാൻ ▁ന ഗർ ▁ഗ്രാമപഞ്ചായത്ത ിനു ▁14 . 83 ▁ച തുരശ്രകിലോമീറ്റർ ▁വിസ്തീർണ്ണ മുണ്ട് . ▁പഞ്ചായത്തിന്റെ ▁അതിരുകൾ ▁വടക്കു ▁ഭാഗത്ത് ▁തേ ഞ്ഞി പ്പ ലം , ▁കണ്ണ മംഗലം , ▁മൂന്ന ിയ ൂർ ▁പഞ്ചായത്ത ുകളും , ▁കിഴക്കുഭാഗത്ത ് ▁വേങ്ങ ര , ▁കണ്ണ മംഗലം ▁പഞ്ചായത്ത ുകളും , ▁തെക്കുഭാഗത്ത ് ▁തിര ൂര ങ്ങാടി , ▁വേങ്ങ ര ▁പഞ്ചായത്ത ുകളും , ▁പടിഞ്ഞാറുഭാഗത്ത ് ▁മൂന്ന ിയ ൂർ , ▁തിര ൂര ങ്ങാടി , ▁തേ ഞ്ഞി പ്പ ലം ▁പഞ്ചായത്ത ു കളുമാണ് . കോഴിക്കോട് ▁വിമാനത്താവള ത്തിൽ ▁നിന്നും ▁കോഴിക്കോട് ▁സർവ്വകലാശാലയിൽ ▁നിന്നും ▁9 ▁കിലോമീറ്റർ ▁സമ ദൂര ത്ത ായി ▁സ്ഥിതി ▁ചെയ്യുന്ന ▁ഒരു ▁കൊച്ചു ഗ്രാമ മാണ് ▁അബ്ദു റ ഹി മാൻ ▁ന ഗർ ▁ഗ്രാമപഞ്ചായത്ത് . ▁പശ്ചിമഘട്ട ത്തിൽ ▁നിന്നു ത് ഭവ ിച്ച് ▁മലപ്പുറം ▁ജില്ലയിലെ ▁വിവിധ ▁പ്രദേശ ങ്ങളിലൂടെ ▁ഒഴുകി ▁അറബിക്കടലി ൽ ▁ചേരുന്ന ▁കടലുണ്ടി പ്പുഴ യുടെ ▁തീരത്ത് ▁സ്ഥിതി ▁ചെയ്യുന്ന ▁ഈ ▁പഞ്ചായത്ത ിനു ▁അയൽ പഞ്ച ായ ത്ത ുകളുടെ ▁പകുതി ▁വിസ്തൃതി യേ ▁ഉള്ള ൂ . 
▁കൊടു വായ ൂർ ▁എന്ന ▁പേരിലാണ് ▁ആദ്യകാലങ്ങളിൽ ▁ഈ ▁ഗ്രാമ ം ▁അറിയപ്പെട്ടിര ുന്നത് . ▁കടലുണ്ടി പുഴ , ▁പട്ട ി ശ്ശേരി പാട ം , ▁പെരു വ ള്ള ൂർ പാട ം , ▁കുറ്റ ൂർ പാട ം ▁എന്നിവ യാൽ ▁ചുറ്റപ്പെട്ട ുകിടക്കുന്ന ▁ഈ ▁ഗ്രാമ ം ▁വർഷ കാല ങ്ങളിൽ ▁ഒരു ▁ദ്വീപ ിന്റെ ▁പ്ര തീ തി ▁സൃഷ്ടിക്ക ുമായിരുന്നു . ▁1963 ▁ഡിസംബർ ▁4 - നാണ് ▁പഞ്ചായത്ത ിലേക്ക് ▁ആദ്യമായി ▁തെരഞ്ഞെടുപ്പ ് ▁നടന്നത് . ▁1956 - ൽ ▁കേരള ▁സംസ്ഥാന ം ▁നിലവിൽ ▁വരുന്ന ▁കാലഘട്ട ം ▁വരെ ▁ഈ ▁ഗ്രാമ ം ▁മദിരാശി ▁സംസ്ഥാനത്തിന്റെ ▁ഭാഗമായിരുന്നു . ▁മദിരാശി ▁അസ ം ബ് ളി യിലേക്ക് ▁നടന്ന ▁തെ ര ഞ്ഞ ടുപ്പ ിൽ , ▁ഈ ▁ഗ്രാമ വാസികൾ ▁കോട്ട ക്കൽ ▁ഫ ർ ക്ക യിലായിരുന്നു ▁ഉൾപ്പെട്ട ിരുന്നത് . ▁മണ്ഡല ങ്ങൾ ▁വീണ്ടും ▁വിഭജിക്ക പ്പെട്ട തോടെ ▁ഈ ▁ഗ്രാമ ം ▁തിര ൂര ങ്ങാടി ▁നിയോജക ▁മണ്ഡലത്തിൽ ▁ഉൾപ്പെട്ട ു . ▁കോൺഗ്രസ് ▁പ്രസ്ഥാന ത്തിന് ▁വളരെയേറെ ▁വേര ു കളുള്ള ▁ഒരു ▁ഗ്രാമ മായിരുന്നു ▁കൊടു വായ ൂർ . ▁സ്വാതന്ത്ര്യസമര നായക ൻ ▁അബ്ദു റ ഹി മാൻ ▁സാഹിബ ിന്റെയും ▁സഹ പ്രവർത്തക രുടെയും ▁പ്രവർത്തന മേഖല ▁കൂടിയ ായിരുന്നു ▁ഈ ▁പ്രദേശം . ▁എന്ത ് ▁പേര് ▁സ്വീകരിക്ക ണമെന്ന ▁കാര്യത്തിൽ ▁അഭിപ്രായ വ്യത്യാസ മുണ്ടായിരുന്ന ങ്ക ിലും ▁അന്നത്തെ ▁പ്രബല കക്ഷി കളായ ▁കോൺഗ്രസ ും ▁മുസ് ള ീ ം ലീ ഗ ും ▁പഞ്ചായത്തിന്റെ ▁പേരു ▁മാറ്റ ണം ▁എന്ന ▁കാര്യത്തിൽ ▁ഒരേ ▁അഭിപ്രായ ക്കാര ായിരുന്നു . ▁കൊടു വായ ൂരിലെ ▁കോൺഗ്രസ് ▁നേതാവ ും ▁എ . ആർ . ന ഗ റിലെ ▁പ്രഥമ ▁പ്രസിഡന്റ ുമായിരുന്ന ▁വി . അ ഹ മ്മ ദ് ▁ആസാദ് ▁ഈ ▁ആവശ്യ ത്തിനു ▁വേണ്ടി ▁ഉറച്ച ു പ്രവർത്തി ച്ചു . ▁മാറി വരുന്ന ▁പേരു ▁അബ്ദു റ ഹി മാൻ ▁സാഹിബ ി ന്റേത ് ▁ആയി രിക്ക ണമെന്ന് ▁അക്കാലത്ത് ▁ആസാദ് ▁കോൺഗ്രസ് ▁കമ്മിറ്റി യിൽ ▁ഉന്നയിക്ക ുകയും ▁പ്രദേശ ് ▁കോൺഗ്രസ് ▁കമ്മിറ്റി യെ ▁കൊണ്ട് ▁ഈ ▁പേര് ▁താ ത്വ ിക മായി ▁അംഗ ീ ക രി പ്പിക്കുകയും ▁ചെയ്തു . ▁1962 ▁ലാണ് ▁കൊടു വായ ൂര ിന്റെ ▁പേര് ▁അബ്ദു റ ഹി മാൻ ▁ന ഗർ ▁എന്ന ാക്കി ▁വി ജ്ഞ ാപ നം ▁പുറപ്പെടുവിച്ച ത് . ▁തുടർന്ന് ▁നടന്ന ▁പ്രവർത്തന ഫലമായി ▁വി . കെ . പടി ▁പോ സ്റ്റോ ഫീസ ് ▁അബ്ദു റ ഹി മാൻ ▁ന ഗർ ▁പോ സ്റ്റ ാ ഫീസ ാക്കി ▁മാറ്റി . ▁1969 ▁കാലഘട്ട ം ▁വരെ ▁വില്ലേജ ിന്റെ ▁പേര് ▁കൊടു വായ ൂർ ▁എന്ന ുതന്നെ ▁നിലനിന്ന ുപോന്നു . 
▁1969 - ലെ ▁സർക്കാര ാണ് ▁കൊടു വായ ൂർ ▁വില്ലേജ ിന്റെ ▁പേരു ▁അബ്ദു റ ഹി മാൻ ▁ന ഗർ ▁എന്ന ാക്കി മാറ്റ ിയത് .'" 159 | ] 160 | }, 161 | "execution_count": 14, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "' '.join(sp.EncodeAsPieces(text))" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [] 176 | } 177 | ], 178 | "metadata": { 179 | "kernelspec": { 180 | "display_name": "Python 3", 181 | "language": "python", 182 | "name": "python3" 183 | }, 184 | "language_info": { 185 | "codemirror_mode": { 186 | "name": "ipython", 187 | "version": 3 188 | }, 189 | "file_extension": ".py", 190 | "mimetype": "text/x-python", 191 | "name": "python", 192 | "nbconvert_exporter": "python", 193 | "pygments_lexer": "ipython3", 194 | "version": "3.6.7" 195 | } 196 | }, 197 | "nbformat": 4, 198 | "nbformat_minor": 2 199 | } 200 | -------------------------------------------------------------------------------- /language-model/Malyalam_Language_Model_Transformer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from fastai.text import *\n", 21 | "import numpy as np\n", 22 | "from sklearn.model_selection import train_test_split\n", 23 | "import pickle\n", 24 | "import sentencepiece as spm" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "('1.0.57', '1.1.0')" 36 | ] 37 | }, 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "output_type": 
"execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "import fastai, torch\n", 45 | "fastai.__version__ , torch.__version__" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "torch.cuda.set_device(0)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "/home/gaurav/PycharmProjects/nlp-for-malyalam/language-model\r\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "!pwd" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "path = Path('/home/gaurav/PycharmProjects/nlp-for-malyalam/language-model')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from inltk.tokenizer import MalyalamTokenizer" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "inltk.tokenizer.MalyalamTokenizer" 101 | ] 102 | }, 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "MalyalamTokenizer" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 9, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# class MalyalamTokenizer(BaseTokenizer):\n", 119 | "# def __init__(self, lang:str):\n", 120 | "# self.lang = lang\n", 121 | "# self.sp = spm.SentencePieceProcessor()\n", 122 | "# self.sp.Load(str(path/\"../tokenizer/malyalam_lm.model\"))\n", 123 | " \n", 124 | "# def tokenizer(self, t:str) -> List[str]:\n", 125 | "# return self.sp.EncodeAsPieces(t)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "outputs": [], 133 
| "source": [ 134 | "sp = spm.SentencePieceProcessor()\n", 135 | "sp.Load(str(path/\"../tokenizer/malyalam_lm.model\"))\n", 136 | "itos = [sp.IdToPiece(int(i)) for i in range(10000)]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 11, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# 10,000 is the vocab size that we chose in sentencepiece\n", 146 | "malyalam_vocab = Vocab(itos)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 12, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "tokenizer = Tokenizer(tok_func=MalyalamTokenizer, lang='ml')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 13, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "['xxunk',\n", 167 | " 'xxpad',\n", 168 | " 'xxbos',\n", 169 | " 'xxeos',\n", 170 | " 'xxfld',\n", 171 | " 'xxmaj',\n", 172 | " 'xxup',\n", 173 | " 'xxrep',\n", 174 | " 'xxwrep']" 175 | ] 176 | }, 177 | "execution_count": 13, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "tokenizer.special_cases" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 14, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "data_lm = TextLMDataBunch.from_folder(path=path/'transformer', tokenizer=tokenizer, vocab=malyalam_vocab)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 15, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "64" 204 | ] 205 | }, 206 | "execution_count": 15, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "data_lm.batch_size" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 16, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "data_lm.save()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | 
"execution_count": 17, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/html": [ 232 | "\n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | "
idxtext
0ിക യിൽ ▁ഉൾപ്പെടുത്തിയ ിരിക്കുന്നത് . ▁സീ ബോ ൾഡ് സ് ▁ബീച്ച് ▁എന്ന യിനം ▁ബീ ച് ▁വൃക്ഷ ങ്ങൾ ▁നിറഞ്ഞ ▁വന മേഖല യാണ് ▁ഇവിടത്തെ ▁ഒരു ▁പ്രത്യേകത . ▁ശൈത്യകാലത്ത ് ▁പൂർണ്ണമായ ും ▁ഇലപ്പൊ ഴ ിക്കുന്ന ▁ഈ ▁മര ങ്ങൾ ▁ശൈത്യ ത്തിന്റെ ▁അവസാന ത്തോടെ ▁ഒരു ▁ശി ശി ര നി ദ്ര യിൽ നിന്ന െന്നപോലെ ▁ഉ ണ ര ുകയും ▁വീണ്ടും ▁ഇലകൾ ▁ത ളി ർ ക്ക ുവാൻ ▁ആരംഭിക്ക ുകയും ▁ചെയ്യുന്നു . ▁ജപ്പാ ന ിൽനിന്നും ▁ആദ്യമായി ▁ലോകപൈതൃക പട്ട ിക യിൽ
1സി യാർ ▁അലി ം ▁അക്ബർ ▁സാ നി ▁വാ ലാ ▁ഷാ ൻ ▁പാദ ് ഷാ - ഇ - ബാഹ ് ർ - ഉ - ബാർ ▁എന്നാണ് ▁മുഴുവൻ ▁പേര് . ▁മുഗൾ ▁സാമ്രാജ്യ ത്തിലെ ▁ദുർബല നായ ▁ചക്രവർത്തി യാ യാണ് ▁ഫറൂഖ് ▁സി യാർ ▁വിലയിരുത്ത പ്പെടുന്നത് . ▁ഉപ ജാ പ ക സംഘ ത്തിന്റെ ▁പ്രേരണ യാൽ ▁പല തവണ ▁ഇദ്ദേഹം ▁വഴി തെ റ്റ ുകയും ▁സ്വതന്ത്ര മായി ▁ഭരണം ▁നടത്താൻ ▁സാധിക്ക ാതെ ▁വരികയും ▁ചെയ്തു . ▁ഹ സ്സ ൻ ▁അലി
2▁പരീക്ഷ യും ▁വിജയിച്ചു . ▁ശ്രീ മൂലം ▁തിരുനാൾ ▁ മഹാരാജാവ ് ▁18 90 ൽ ▁എ . ആ റിനെ ▁സംസ്കൃത ▁പാഠ ശാല യിൽ ▁ഇൻ സ് പെ ക്ട റായി ▁നിയമ ിച്ചു . ▁എ . ആർ . ▁ഈ ▁കാലയളവിൽ ▁നിഷ് ക ൃഷ്ട മായ ▁പാഠ ്യ പദ്ധതി യും ▁പാശ്ചാത്യ രീതി യിലുള്ള ▁ശിക്ഷ ാക്രമ വും ▁നടപ്പാക്ക ി . ▁ജോലി ക്കിടയിൽ ▁സംസ്കൃത ത്തിൽ ▁എം . എ . ▁എഴുതിയ െടുത്തു . ▁18 94 ൽ ▁സംസ്കൃത ▁മഹാ പാഠ ശാല
3• ▁വംശ പത്ര പതി തം • ▁വംശ യ ഷ്ട ിക • ▁വംശ സ്ഥ ം • ▁വ ് യാള ം • ▁ശങ്കര ചര ിത ം • ▁ശ ശ ധര ബി ംബ ം • ▁ശശി കല • ▁ശശി കല • ▁ശ ാ ർദ്ദ ൂ ല വി ക്രീ ഡി തം • ▁ ശാല ിനി • ▁ശിഖര ിണി • ▁ശിവ ം • ▁ശി താ ഗ്ര • ▁ശുദ്ധ വി രാ ൾ • ▁ശിശു ഭ ൃത
4ശ ിക്കാൻ ▁തുടങ്ങി . ▁ഈ ▁സമയത്ത് ▁തന്റെ ▁തോ ക്കിൽ ▁നിന്ന് ▁മംഗൽ ▁സ്വയം ▁വെടി യു തി ർ ക്കാൻ ▁ശ്രമിച്ച െങ്കിലും ▁പരാജയപ്പെട്ടു . ▁നി സ് സാര മായ ▁പര ു ക്ക േറ്റ ▁മംഗൽ ▁പാണ്ഡേ യെ ▁അറസ്റ്റ് ▁ചെയ്തു . ▁ബംഗാൾ ▁സൈന്യ ത്തിൽ ▁പുതിയ തായി ▁എത്തിയ ▁എൻ ഫീൽഡ ് - പി - 53 ▁തോ ക്ക ുകളിൽ ▁ഉപയോഗിക്കുന്ന ▁തിര കള െക്കുറിച്ചുള്ള ▁ദു രീ കരിക്ക ാത്ത ▁സംശയ ങ്ങളായിരുന്നു ▁മംഗൽ ▁പാണ്ഡേ യുടെ ▁പെരു മാറ്റ ത്തിനു ▁കാരണമായി ▁ചൂണ്ടിക്കാണിക്ക ുന്നത്
" 262 | ], 263 | "text/plain": [ 264 | "" 265 | ] 266 | }, 267 | "metadata": {}, 268 | "output_type": "display_data" 269 | } 270 | ], 271 | "source": [ 272 | "data_lm.show_batch()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 18, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "10000" 284 | ] 285 | }, 286 | "execution_count": 18, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "len(data_lm.vocab.itos)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 19, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "learn = language_model_learner(data_lm, TransformerXL, pretrained=False)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 20, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "20" 313 | ] 314 | }, 315 | "execution_count": 20, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "gc.collect()" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 21, 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/html": [], 332 | "text/plain": [ 333 | "" 334 | ] 335 | }, 336 | "metadata": {}, 337 | "output_type": "display_data" 338 | }, 339 | { 340 | "name": "stdout", 341 | "output_type": "stream", 342 | "text": [ 343 | "LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "learn.lr_find()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 22, 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "data": { 358 | "image/png": "\n", 359 | "text/plain": [ 360 | "
" 361 | ] 362 | }, 363 | "metadata": { 364 | "needs_background": "light" 365 | }, 366 | "output_type": "display_data" 367 | } 368 | ], 369 | "source": [ 370 | "learn.recorder.plot()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 23, 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "data": { 380 | "text/html": [ 381 | "\n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | "
epochtrain_lossvalid_lossaccuracytime
06.0461366.0032500.16235207:06
15.2789915.1990690.21860107:06
24.6567304.6434280.26900007:06
34.3312464.3085170.30227907:05
44.1746034.0379170.32949607:06
53.8753003.8079560.35818807:06
63.6911413.6247890.38225307:05
73.4180573.4632050.40648307:06
83.1960443.3759670.41993307:07
93.0775713.3604510.42267107:06
" 464 | ], 465 | "text/plain": [ 466 | "" 467 | ] 468 | }, 469 | "metadata": {}, 470 | "output_type": "display_data" 471 | }, 472 | { 473 | "name": "stdout", 474 | "output_type": "stream", 475 | "text": [ 476 | "Better model found at epoch 0 with accuracy value: 0.16235168278217316.\n", 477 | "Better model found at epoch 1 with accuracy value: 0.21860076487064362.\n", 478 | "Better model found at epoch 2 with accuracy value: 0.2690003216266632.\n", 479 | "Better model found at epoch 3 with accuracy value: 0.3022788166999817.\n", 480 | "Better model found at epoch 4 with accuracy value: 0.3294961452484131.\n", 481 | "Better model found at epoch 5 with accuracy value: 0.3581877648830414.\n", 482 | "Better model found at epoch 6 with accuracy value: 0.3822525143623352.\n", 483 | "Better model found at epoch 7 with accuracy value: 0.4064827263355255.\n", 484 | "Better model found at epoch 8 with accuracy value: 0.4199325740337372.\n", 485 | "Better model found at epoch 9 with accuracy value: 0.422670841217041.\n" 486 | ] 487 | } 488 | ], 489 | "source": [ 490 | "learn.fit_one_cycle(10, 1e-3, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='model')])" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 24, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "data": { 500 | "text/html": [ 501 | "\n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | "
epochtrain_lossvalid_lossaccuracytime
03.1488853.3499590.42462907:04
13.1623823.3240640.42868307:05
23.1614863.2900350.43457007:05
32.9977873.2643770.43863907:05
42.9151103.2596190.43958707:06
" 549 | ], 550 | "text/plain": [ 551 | "" 552 | ] 553 | }, 554 | "metadata": {}, 555 | "output_type": "display_data" 556 | }, 557 | { 558 | "name": "stdout", 559 | "output_type": "stream", 560 | "text": [ 561 | "Better model found at epoch 0 with accuracy value: 0.42462852597236633.\n", 562 | "Better model found at epoch 1 with accuracy value: 0.4286832809448242.\n", 563 | "Better model found at epoch 2 with accuracy value: 0.43456965684890747.\n", 564 | "Better model found at epoch 3 with accuracy value: 0.43863922357559204.\n", 565 | "Better model found at epoch 4 with accuracy value: 0.4395868182182312.\n" 566 | ] 567 | } 568 | ], 569 | "source": [ 570 | "learn.fit_one_cycle(5, 1e-4, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='model2')])" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 25, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "TEXT = \"ബംഗാളിലെ ▁ഭരണം ▁കമ്പനി\"\n", 580 | "N_WORDS = 40\n", 581 | "N_SENTENCES = 2" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 26, 587 | "metadata": {}, 588 | "outputs": [ 589 | { 590 | "name": "stdout", 591 | "output_type": "stream", 592 | "text": [ 593 | "ബംഗാളിലെ ▁ഭരണം ▁കമ്പനി ▁എന്ന ▁പേരിൽ ▁അറിയപ്പെടുന്ന ▁ജാ ഫർ ▁സ ക ാനി ഫി ക് ▁കമ്പനി ക്ക് ▁നേതൃത്വം ▁നൽകിയ ▁സേവന മാണ് ▁ഫ ഗ് ▁ റിയ . ▁. ആ ത് മ നാ ഭ വർമ്മ യുടെ ▁നേതൃത്വത്തിൽ ▁ഇന്ത്യയിലെ ▁മൂന്നു ▁ജില്ല കളായി ▁വിഭജിച്ച ് , ▁പഞ്ചാബ് , ▁ഹരിയാന\n", 594 | "ബംഗാളിലെ ▁ഭരണം ▁കമ്പനി യും ▁പ വ ▁കടയ്ക്കൽ ▁ഭരണ ത്തിനെതിരെ യുള്ള ▁ഒരു ▁ഇന്ത്യൻ ▁ഭരണ സം ഭവ മാണ് ▁ഫി സ ൽ ▁അസ ം . ▁. 
ഇ തി ർ ▁ ഥ േ ര വാദ ▁ഈ ▁വിഭാഗ ത്തിന്റെ ▁മാ പ്പു വഴി യാണ് ▁പ്രധാനമായും ▁ശിപായി മാർ ▁എന്ന് ▁അറിയപ്പെടുന്നത്\n" 595 | ] 596 | } 597 | ], 598 | "source": [ 599 | "print(\"\\n\".join(learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 27, 605 | "metadata": {}, 606 | "outputs": [ 607 | { 608 | "data": { 609 | "text/plain": [ 610 | "25.790339917193062" 611 | ] 612 | }, 613 | "execution_count": 27, 614 | "metadata": {}, 615 | "output_type": "execute_result" 616 | } 617 | ], 618 | "source": [ 619 | "np.exp(3.25)" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 28, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "defaults.device = torch.device('cpu')\n", 629 | "learn.model.eval()\n", 630 | "learn.export()" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 29, 636 | "metadata": {}, 637 | "outputs": [], 638 | "source": [ 639 | "# Generating embedding vectors for visualization" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 30, 645 | "metadata": {}, 646 | "outputs": [ 647 | { 648 | "data": { 649 | "text/plain": [ 650 | "PosixPath('/home/gaurav/PycharmProjects/nlp-for-malyalam/language-model')" 651 | ] 652 | }, 653 | "execution_count": 30, 654 | "metadata": {}, 655 | "output_type": "execute_result" 656 | } 657 | ], 658 | "source": [ 659 | "path" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": 31, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "defaults.device = torch.device('cpu')" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 32, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "# learn = load_learner(path / 'MalyalamDataset/')" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 35, 683 | "metadata": {}, 684 | "outputs": [], 685 | "source": [ 686 | 
"encoder = get_model(learn.model)[0]" 687 | ] 688 | }, 689 | { 690 | "cell_type": "code", 691 | "execution_count": 36, 692 | "metadata": {}, 693 | "outputs": [ 694 | { 695 | "data": { 696 | "text/plain": [ 697 | "torch.Size([10000, 410])" 698 | ] 699 | }, 700 | "execution_count": 36, 701 | "metadata": {}, 702 | "output_type": "execute_result" 703 | } 704 | ], 705 | "source": [ 706 | "encoder.state_dict()['encoder.weight'].shape" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 37, 712 | "metadata": {}, 713 | "outputs": [], 714 | "source": [ 715 | "embeddings = encoder.state_dict()['encoder.weight']" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": 38, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "embeddings = np.array(embeddings)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 39, 730 | "metadata": {}, 731 | "outputs": [ 732 | { 733 | "data": { 734 | "text/plain": [ 735 | "(410,)" 736 | ] 737 | }, 738 | "execution_count": 39, 739 | "metadata": {}, 740 | "output_type": "execute_result" 741 | } 742 | ], 743 | "source": [ 744 | "embeddings[0].shape" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 40, 750 | "metadata": {}, 751 | "outputs": [], 752 | "source": [ 753 | "df = pd.DataFrame(embeddings)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 41, 759 | "metadata": {}, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/html": [ 764 | "
\n", 765 | "\n", 778 | "\n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | "
0123456789...400401402403404405406407408409
00.036030-0.2338230.1654360.232610-0.027157-0.255889-0.1574480.3048900.0102450.149751...0.2210980.110326-0.1669550.060656-0.039286-0.102857-0.1547500.071319-0.287154-0.146374
1-0.2883750.1953350.130055-0.2894550.251860-0.0644110.2248590.058616-0.069197-0.087954...-0.3235850.028612-0.217118-0.301141-0.1529530.137157-0.027333-0.0506280.154908-0.041128
2-0.2969030.1728180.115393-0.2873700.248925-0.0626230.2310340.056768-0.073595-0.080606...-0.3384180.031345-0.228342-0.311688-0.1648380.148972-0.027800-0.0489570.162479-0.006186
30.269789-0.2809550.4064440.0658250.104121-0.268758-0.088995-0.1178790.173256-0.018210...0.3438190.245318-0.0241080.3061430.100854-0.223558-0.362481-0.200894-0.102091-0.271015
4-0.0557370.325437-0.1686850.155597-0.132933-0.0844050.159368-0.1896360.1127560.215854...0.2765630.0534580.088126-0.1708330.159408-0.358251-0.140460-0.117527-0.064717-0.118483
\n", 928 | "

5 rows × 410 columns

\n", 929 | "
" 930 | ], 931 | "text/plain": [ 932 | " 0 1 2 3 4 5 6 \\\n", 933 | "0 0.036030 -0.233823 0.165436 0.232610 -0.027157 -0.255889 -0.157448 \n", 934 | "1 -0.288375 0.195335 0.130055 -0.289455 0.251860 -0.064411 0.224859 \n", 935 | "2 -0.296903 0.172818 0.115393 -0.287370 0.248925 -0.062623 0.231034 \n", 936 | "3 0.269789 -0.280955 0.406444 0.065825 0.104121 -0.268758 -0.088995 \n", 937 | "4 -0.055737 0.325437 -0.168685 0.155597 -0.132933 -0.084405 0.159368 \n", 938 | "\n", 939 | " 7 8 9 ... 400 401 402 403 \\\n", 940 | "0 0.304890 0.010245 0.149751 ... 0.221098 0.110326 -0.166955 0.060656 \n", 941 | "1 0.058616 -0.069197 -0.087954 ... -0.323585 0.028612 -0.217118 -0.301141 \n", 942 | "2 0.056768 -0.073595 -0.080606 ... -0.338418 0.031345 -0.228342 -0.311688 \n", 943 | "3 -0.117879 0.173256 -0.018210 ... 0.343819 0.245318 -0.024108 0.306143 \n", 944 | "4 -0.189636 0.112756 0.215854 ... 0.276563 0.053458 0.088126 -0.170833 \n", 945 | "\n", 946 | " 404 405 406 407 408 409 \n", 947 | "0 -0.039286 -0.102857 -0.154750 0.071319 -0.287154 -0.146374 \n", 948 | "1 -0.152953 0.137157 -0.027333 -0.050628 0.154908 -0.041128 \n", 949 | "2 -0.164838 0.148972 -0.027800 -0.048957 0.162479 -0.006186 \n", 950 | "3 0.100854 -0.223558 -0.362481 -0.200894 -0.102091 -0.271015 \n", 951 | "4 0.159408 -0.358251 -0.140460 -0.117527 -0.064717 -0.118483 \n", 952 | "\n", 953 | "[5 rows x 410 columns]" 954 | ] 955 | }, 956 | "execution_count": 41, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "df.head()" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 42, 968 | "metadata": {}, 969 | "outputs": [ 970 | { 971 | "data": { 972 | "text/plain": [ 973 | "(10000, 410)" 974 | ] 975 | }, 976 | "execution_count": 42, 977 | "metadata": {}, 978 | "output_type": "execute_result" 979 | } 980 | ], 981 | "source": [ 982 | "df.shape" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": 43, 988 | "metadata": 
{}, 989 | "outputs": [], 990 | "source": [ 991 | "df.to_csv('embeddings_transformer.tsv', sep='\\t', index=False, header=False)" 992 | ] 993 | }, 994 | { 995 | "cell_type": "code", 996 | "execution_count": 44, 997 | "metadata": {}, 998 | "outputs": [], 999 | "source": [ 1000 | "df2 = pd.DataFrame(itos)" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": 45, 1006 | "metadata": {}, 1007 | "outputs": [ 1008 | { 1009 | "data": { 1010 | "text/html": [ 1011 | "
\n", 1012 | "\n", 1025 | "\n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | "
0
0<unk>
1<s>
2</s>
3.
4
\n", 1055 | "
" 1056 | ], 1057 | "text/plain": [ 1058 | " 0\n", 1059 | "0 \n", 1060 | "1 \n", 1061 | "2 \n", 1062 | "3 .\n", 1063 | "4 ്" 1064 | ] 1065 | }, 1066 | "execution_count": 45, 1067 | "metadata": {}, 1068 | "output_type": "execute_result" 1069 | } 1070 | ], 1071 | "source": [ 1072 | "df2.head()" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 46, 1078 | "metadata": {}, 1079 | "outputs": [ 1080 | { 1081 | "data": { 1082 | "text/plain": [ 1083 | "(10000, 1)" 1084 | ] 1085 | }, 1086 | "execution_count": 46, 1087 | "metadata": {}, 1088 | "output_type": "execute_result" 1089 | } 1090 | ], 1091 | "source": [ 1092 | "df2.shape" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "code", 1097 | "execution_count": 47, 1098 | "metadata": {}, 1099 | "outputs": [], 1100 | "source": [ 1101 | "df2.to_csv('embeddings_transformer_metadata.tsv', sep='\\t', index=False, header=False)" 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "code", 1106 | "execution_count": 48, 1107 | "metadata": { 1108 | "scrolled": true 1109 | }, 1110 | "outputs": [ 1111 | { 1112 | "data": { 1113 | "text/plain": [ 1114 | "tensor([-2.8838e-01, 1.9534e-01, 1.3005e-01, -2.8946e-01, 2.5186e-01,\n", 1115 | " -6.4411e-02, 2.2486e-01, 5.8616e-02, -6.9197e-02, -8.7954e-02,\n", 1116 | " -2.1713e-01, -3.0053e-01, 1.8226e-02, 2.5914e-01, -9.8454e-02,\n", 1117 | " 1.0505e-01, -4.4822e-02, 1.6573e-01, 1.7450e-01, -1.0521e-01,\n", 1118 | " 2.3799e-01, -1.1365e-01, -5.5788e-04, -1.8098e-01, 2.3345e-01,\n", 1119 | " -3.9576e-01, -2.7768e-01, 1.2207e-01, 2.0038e-02, 3.5984e-01,\n", 1120 | " 3.5331e-02, -1.8174e-02, 2.9681e-01, 2.0294e-01, -8.7180e-02,\n", 1121 | " -2.5348e-01, -2.2914e-01, 1.2741e-01, -9.5824e-02, -1.2549e-01,\n", 1122 | " -1.3070e-01, 2.3285e-01, -6.4115e-04, -1.6414e-01, 3.2314e-02,\n", 1123 | " 4.8819e-02, -3.2767e-02, 1.8885e-01, 2.4161e-02, 6.0405e-02,\n", 1124 | " -5.3400e-01, -5.7614e-02, 1.7913e-03, 1.7609e-02, 1.3095e-01,\n", 1125 | " 2.2092e-01, -2.9997e-01, 2.3730e-01, 
1.6538e-01, -5.6922e-02,\n", 1126 | " -1.7874e-01, -4.7354e-01, 2.5619e-01, 3.8571e-02, 4.3480e-02,\n", 1127 | " -1.2724e-02, 2.6596e-01, -1.9775e-01, -2.3582e-01, -2.1683e-01,\n", 1128 | " 1.5224e-01, 6.6215e-02, -3.2893e-01, 1.1441e-01, -8.3338e-02,\n", 1129 | " 3.1561e-01, 1.9787e-01, 9.6679e-02, -9.2316e-02, -1.0881e-04,\n", 1130 | " 2.0920e-01, -3.4589e-01, -2.1215e-01, 3.1736e-01, 1.4133e-01,\n", 1131 | " -1.2340e-01, 1.0533e-01, -4.3781e-01, -2.0628e-02, 1.2088e-01,\n", 1132 | " -1.7912e-01, -2.6534e-01, 4.4276e-02, 2.0965e-02, 2.2486e-01,\n", 1133 | " 4.4109e-01, -3.7244e-01, 4.0038e-01, 1.8425e-01, -1.4884e-01,\n", 1134 | " -1.5071e-02, -2.4635e-01, -3.6049e-02, -2.2783e-01, 1.7215e-01,\n", 1135 | " 1.4973e-01, 6.7226e-02, 1.9532e-01, 1.3776e-01, 1.0603e-01,\n", 1136 | " 7.0171e-02, -3.8417e-01, -2.1944e-01, 1.4848e-01, -4.9258e-01,\n", 1137 | " 7.1341e-03, -2.7200e-01, -2.1039e-01, 1.5305e-01, -1.9583e-01,\n", 1138 | " 3.7116e-02, -3.0209e-01, 3.1676e-01, -8.9855e-02, -1.4409e-01,\n", 1139 | " 7.0481e-02, 1.5803e-01, -3.1598e-01, 2.4918e-01, 3.2757e-01,\n", 1140 | " 9.6821e-02, 1.3569e-01, 9.8998e-02, -8.7309e-02, 9.5984e-03,\n", 1141 | " 1.6745e-01, -2.7856e-01, 3.2482e-01, 1.1077e-02, 3.4520e-01,\n", 1142 | " 1.5244e-01, -3.2342e-01, -1.2987e-01, 3.4953e-01, -1.1936e-01,\n", 1143 | " 4.4935e-01, 4.5081e-01, -1.7663e-01, -5.2981e-02, 9.2756e-02,\n", 1144 | " -6.9319e-02, -2.0573e-01, 1.0120e-01, -2.4884e-01, -3.1443e-02,\n", 1145 | " -4.7552e-02, 2.4262e-01, -7.0443e-03, 3.9893e-02, 2.2480e-01,\n", 1146 | " -1.5965e-02, 5.7924e-02, -1.8049e-01, 3.4861e-02, -1.6075e-01,\n", 1147 | " 1.9049e-01, -2.6809e-02, 2.1276e-01, -1.9859e-01, 1.8087e-02,\n", 1148 | " -3.1181e-02, -1.0761e-01, -2.6631e-01, -4.1918e-01, 2.7606e-01,\n", 1149 | " -2.4925e-01, -2.8636e-01, -3.9361e-01, 3.9108e-02, 3.9979e-02,\n", 1150 | " 5.3247e-02, -2.9006e-01, 3.3666e-02, -5.0415e-02, 1.2083e-01,\n", 1151 | " 3.0564e-01, -3.5833e-01, -2.5813e-01, 4.4581e-02, -1.8699e-01,\n", 1152 | " 
-2.0797e-01, -2.1827e-01, 1.4717e-01, -5.9601e-02, 2.3340e-01,\n", 1153 | " -7.7547e-02, 8.4026e-02, 2.8860e-01, 8.5435e-02, -3.9307e-01,\n", 1154 | " 3.3717e-01, 1.7597e-01, -1.4221e-01, -5.2757e-01, 1.1033e-01,\n", 1155 | " 4.0478e-01, -2.2899e-01, 5.7683e-02, -9.1090e-02, 7.2483e-02,\n", 1156 | " 1.8983e-01, 4.3432e-02, -2.7083e-01, -2.7190e-01, 2.4520e-02,\n", 1157 | " 1.6569e-01, 4.4634e-02, 1.4841e-01, -2.3093e-01, 2.0638e-02,\n", 1158 | " 6.2671e-02, 1.5323e-01, 8.6391e-02, -7.5145e-02, 8.6195e-03,\n", 1159 | " -5.7552e-02, 4.7943e-02, -1.6412e-01, 8.6597e-02, -2.5979e-01,\n", 1160 | " -2.1874e-01, 7.6014e-02, -3.3145e-01, -9.0766e-03, -1.2265e-02,\n", 1161 | " 4.5087e-02, 1.3022e-01, 1.0089e-01, -6.1816e-02, -9.7662e-02,\n", 1162 | " 1.9633e-01, 5.4836e-02, 3.7174e-01, -1.2320e-01, -3.7002e-02,\n", 1163 | " 1.2523e-01, -8.2034e-02, -2.8874e-01, -1.5272e-01, 2.6082e-01,\n", 1164 | " -2.0304e-01, -1.8871e-02, 3.7768e-01, -1.3122e-01, -1.7187e-01,\n", 1165 | " -1.0562e-01, 7.4058e-02, -4.2772e-02, -1.4564e-01, 3.0764e-01,\n", 1166 | " -1.7767e-01, 7.4432e-02, 6.9531e-02, 1.1068e-01, 1.8409e-01,\n", 1167 | " 1.9461e-01, 2.0585e-02, -1.3006e-01, 1.0369e-01, 1.1617e-01,\n", 1168 | " 9.8805e-02, 1.0067e-01, 2.8984e-01, -2.3558e-01, 1.1534e-01,\n", 1169 | " 1.8415e-02, 6.4342e-02, -7.3890e-03, -3.8759e-02, -1.5071e-01,\n", 1170 | " -3.2378e-02, -1.2249e-01, 1.7066e-01, -1.7944e-01, -1.4582e-02,\n", 1171 | " 4.6765e-02, 2.0999e-01, 1.7588e-01, 4.5993e-01, 2.0563e-01,\n", 1172 | " -4.2226e-01, -7.6879e-02, -5.4039e-02, 2.4141e-01, -1.0069e-02,\n", 1173 | " -1.7887e-01, -7.7009e-02, 7.6933e-02, 2.1367e-01, -8.0767e-02,\n", 1174 | " -2.2710e-02, 4.6927e-01, -1.7875e-01, -1.6417e-01, 4.7441e-01,\n", 1175 | " -5.1980e-02, -4.3865e-01, 9.6392e-02, 1.8355e-02, -1.5423e-02,\n", 1176 | " -3.3658e-01, -1.8137e-01, 7.5613e-01, -2.7590e-01, 2.4600e-01,\n", 1177 | " 1.9881e-01, -1.6262e-01, -1.3816e-01, -1.3194e-01, 3.1065e-01,\n", 1178 | " 1.4178e-01, -9.3501e-02, -9.5745e-02, 
-2.5625e-01, -4.9562e-02,\n", 1179 | " -1.6788e-01, -1.4434e-01, -1.1277e-01, 3.3917e-01, -9.6268e-02,\n", 1180 | " -1.8590e-01, -6.1858e-02, -1.7861e-01, -3.0107e-02, -4.2993e-03,\n", 1181 | " -1.9788e-02, -1.6314e-01, -6.1347e-01, -3.0971e-01, -1.1148e-01,\n", 1182 | " 5.9865e-03, -2.5755e-01, -1.1950e-01, -2.2245e-01, 7.0275e-03,\n", 1183 | " 7.5973e-02, -1.2953e-01, 2.3324e-02, 1.2015e-01, 2.2907e-01,\n", 1184 | " 9.6272e-02, 3.5353e-01, -2.2327e-01, -5.7879e-02, -3.1217e-01,\n", 1185 | " -2.2524e-01, 1.9043e-01, 7.3059e-02, -6.0037e-02, -9.8536e-03,\n", 1186 | " 2.7009e-01, -6.1627e-01, 1.4076e-01, 3.5828e-02, -6.2491e-03,\n", 1187 | " -5.2967e-02, -2.3984e-01, 8.3550e-02, 2.2164e-01, -3.5937e-01,\n", 1188 | " -2.1227e-01, -4.0357e-02, -2.0878e-01, 2.1424e-01, 1.1324e-01,\n", 1189 | " 1.8347e-01, -2.4963e-01, 1.4396e-01, 9.9599e-02, -3.8192e-02,\n", 1190 | " 1.7846e-01, -2.4785e-01, -2.7105e-01, -9.1905e-02, 1.7556e-01,\n", 1191 | " -2.9240e-01, -1.4066e-01, 2.1633e-01, -4.5221e-01, 2.2523e-01,\n", 1192 | " 5.8754e-02, 2.9438e-01, -6.4212e-02, 3.0164e-01, -1.2735e-01,\n", 1193 | " 6.3906e-02, 1.2580e-01, -4.0675e-01, 3.2477e-02, -2.7988e-01,\n", 1194 | " -3.2359e-01, 2.8612e-02, -2.1712e-01, -3.0114e-01, -1.5295e-01,\n", 1195 | " 1.3716e-01, -2.7333e-02, -5.0628e-02, 1.5491e-01, -4.1128e-02],\n", 1196 | " device='cuda:0')" 1197 | ] 1198 | }, 1199 | "execution_count": 48, 1200 | "metadata": {}, 1201 | "output_type": "execute_result" 1202 | } 1203 | ], 1204 | "source": [ 1205 | "encoder.state_dict()['encoder.weight'][1]" 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "execution_count": null, 1211 | "metadata": {}, 1212 | "outputs": [], 1213 | "source": [] 1214 | } 1215 | ], 1216 | "metadata": { 1217 | "kernelspec": { 1218 | "display_name": "Python 3", 1219 | "language": "python", 1220 | "name": "python3" 1221 | }, 1222 | "language_info": { 1223 | "codemirror_mode": { 1224 | "name": "ipython", 1225 | "version": 3 1226 | }, 1227 | "file_extension": 
".py", 1228 | "mimetype": "text/x-python", 1229 | "name": "python", 1230 | "nbconvert_exporter": "python", 1231 | "pygments_lexer": "ipython3", 1232 | "version": "3.7.4" 1233 | } 1234 | }, 1235 | "nbformat": 4, 1236 | "nbformat_minor": 2 1237 | } 1238 | -------------------------------------------------------------------------------- /language-model/Malyalam_Language_Model_ULMFiT.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from fastai.text import *\n", 21 | "import numpy as np\n", 22 | "from sklearn.model_selection import train_test_split\n", 23 | "import pickle\n", 24 | "import sentencepiece as spm" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "('1.0.50.post1', '1.0.1.post2')" 36 | ] 37 | }, 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "import fastai, torch\n", 45 | "fastai.__version__ , torch.__version__" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "torch.cuda.set_device(0)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stdout", 64 | "output_type": "stream", 65 | "text": [ 66 | "/home/gaurav/PycharmProjects/nlp-for-malyalam/language-model\r\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "!pwd" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": 
[], 79 | "source": [ 80 | "path = Path('/home/gaurav/PycharmProjects/nlp-for-malyalam/language-model')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 7, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from inltk.tokenizer import MalyalamTokenizer" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 8, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | "inltk.tokenizer.MalyalamTokenizer" 101 | ] 102 | }, 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "MalyalamTokenizer" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 9, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# class MalyalamTokenizer(BaseTokenizer):\n", 119 | "# def __init__(self, lang:str):\n", 120 | "# self.lang = lang\n", 121 | "# self.sp = spm.SentencePieceProcessor()\n", 122 | "# self.sp.Load(str(path/\"../tokenizer/malyalam_lm.model\"))\n", 123 | " \n", 124 | "# def tokenizer(self, t:str) -> List[str]:\n", 125 | "# return self.sp.EncodeAsPieces(t)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "sp = spm.SentencePieceProcessor()\n", 135 | "sp.Load(str(path/\"../tokenizer/malyalam_lm.model\"))\n", 136 | "itos = [sp.IdToPiece(int(i)) for i in range(10000)]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 11, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# 10,000 is the vocab size that we chose in sentencepiece\n", 146 | "malyalam_vocab = Vocab(itos)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 13, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "tokenizer = Tokenizer(tok_func=MalyalamTokenizer, lang='ml')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | 
"execution_count": 14, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "['xxunk',\n", 167 | " 'xxpad',\n", 168 | " 'xxbos',\n", 169 | " 'xxeos',\n", 170 | " 'xxfld',\n", 171 | " 'xxmaj',\n", 172 | " 'xxup',\n", 173 | " 'xxrep',\n", 174 | " 'xxwrep']" 175 | ] 176 | }, 177 | "execution_count": 14, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "tokenizer.special_cases" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 15, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "data_lm = TextLMDataBunch.from_folder(path=path/'MalyalamDataset', tokenizer=tokenizer, vocab=malyalam_vocab)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 16, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/plain": [ 203 | "64" 204 | ] 205 | }, 206 | "execution_count": 16, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "data_lm.batch_size" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 17, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "data_lm.save()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 18, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/html": [ 232 | "\n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | "
idxtext
0ിക യിൽ ▁ഉൾപ്പെടുത്തിയ ിരിക്കുന്നത് . ▁സീ ബോ ൾഡ് സ് ▁ബീച്ച് ▁എന്ന യിനം ▁ബീ ച് ▁വൃക്ഷ ങ്ങൾ ▁നിറഞ്ഞ ▁വന മേഖല യാണ് ▁ഇവിടത്തെ ▁ഒരു ▁പ്രത്യേകത . ▁ശൈത്യകാലത്ത ് ▁പൂർണ്ണമായ ും ▁ഇലപ്പൊ ഴ ിക്കുന്ന ▁ഈ ▁മര ങ്ങൾ ▁ശൈത്യ ത്തിന്റെ ▁അവസാന ത്തോടെ ▁ഒരു ▁ശി ശി ര നി ദ്ര യിൽ നിന്ന െന്നപോലെ ▁ഉ ണ ര ുകയും ▁വീണ്ടും ▁ഇലകൾ ▁ത ളി ർ ക്ക ുവാൻ ▁ആരംഭിക്ക ുകയും ▁ചെയ്യുന്നു . ▁ജപ്പാ ന ിൽനിന്നും ▁ആദ്യമായി ▁ലോകപൈതൃക പട്ട ിക
1സി യാർ ▁അലി ം ▁അക്ബർ ▁സാ നി ▁വാ ലാ ▁ഷാ ൻ ▁പാദ ് ഷാ - ഇ - ബാഹ ് ർ - ഉ - ബാർ ▁എന്നാണ് ▁മുഴുവൻ ▁പേര് . ▁മുഗൾ ▁സാമ്രാജ്യ ത്തിലെ ▁ദുർബല നായ ▁ചക്രവർത്തി യാ യാണ് ▁ഫറൂഖ് ▁സി യാർ ▁വിലയിരുത്ത പ്പെടുന്നത് . ▁ഉപ ജാ പ ക സംഘ ത്തിന്റെ ▁പ്രേരണ യാൽ ▁പല തവണ ▁ഇദ്ദേഹം ▁വഴി തെ റ്റ ുകയും ▁സ്വതന്ത്ര മായി ▁ഭരണം ▁നടത്താൻ ▁സാധിക്ക ാതെ ▁വരികയും ▁ചെയ്തു . ▁ഹ സ്സ ൻ
2▁പരീക്ഷ യും ▁വിജയിച്ചു . ▁ശ്രീ മൂലം ▁തിരുനാൾ ▁ മഹാരാജാവ ് ▁18 90 ൽ ▁എ . ആ റിനെ ▁സംസ്കൃത ▁പാഠ ശാല യിൽ ▁ഇൻ സ് പെ ക്ട റായി ▁നിയമ ിച്ചു . ▁എ . ആർ . ▁ഈ ▁കാലയളവിൽ ▁നിഷ് ക ൃഷ്ട മായ ▁പാഠ ്യ പദ്ധതി യും ▁പാശ്ചാത്യ രീതി യിലുള്ള ▁ശിക്ഷ ാക്രമ വും ▁നടപ്പാക്ക ി . ▁ജോലി ക്കിടയിൽ ▁സംസ്കൃത ത്തിൽ ▁എം . എ . ▁എഴുതിയ െടുത്തു . ▁18 94 ൽ ▁സംസ്കൃത ▁മഹാ പാഠ
3• ▁വംശ പത്ര പതി തം • ▁വംശ യ ഷ്ട ിക • ▁വംശ സ്ഥ ം • ▁വ ് യാള ം • ▁ശങ്കര ചര ിത ം • ▁ശ ശ ധര ബി ംബ ം • ▁ശശി കല • ▁ശശി കല • ▁ശ ാ ർദ്ദ ൂ ല വി ക്രീ ഡി തം • ▁ ശാല ിനി • ▁ശിഖര ിണി • ▁ശിവ ം • ▁ശി താ ഗ്ര • ▁ശുദ്ധ വി രാ ൾ • ▁ശിശു ഭ
4ശ ിക്കാൻ ▁തുടങ്ങി . ▁ഈ ▁സമയത്ത് ▁തന്റെ ▁തോ ക്കിൽ ▁നിന്ന് ▁മംഗൽ ▁സ്വയം ▁വെടി യു തി ർ ക്കാൻ ▁ശ്രമിച്ച െങ്കിലും ▁പരാജയപ്പെട്ടു . ▁നി സ് സാര മായ ▁പര ു ക്ക േറ്റ ▁മംഗൽ ▁പാണ്ഡേ യെ ▁അറസ്റ്റ് ▁ചെയ്തു . ▁ബംഗാൾ ▁സൈന്യ ത്തിൽ ▁പുതിയ തായി ▁എത്തിയ ▁എൻ ഫീൽഡ ് - പി - 53 ▁തോ ക്ക ുകളിൽ ▁ഉപയോഗിക്കുന്ന ▁തിര കള െക്കുറിച്ചുള്ള ▁ദു രീ കരിക്ക ാത്ത ▁സംശയ ങ്ങളായിരുന്നു ▁മംഗൽ ▁പാണ്ഡേ യുടെ ▁പെരു മാറ്റ ത്തിനു ▁കാരണമായി ▁ചൂണ്ടിക്കാണിക്ക
" 262 | ], 263 | "text/plain": [ 264 | "" 265 | ] 266 | }, 267 | "metadata": {}, 268 | "output_type": "display_data" 269 | } 270 | ], 271 | "source": [ 272 | "data_lm.show_batch()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 19, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "text/plain": [ 283 | "10000" 284 | ] 285 | }, 286 | "execution_count": 19, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "len(data_lm.vocab.itos)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 20, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stderr", 302 | "output_type": "stream", 303 | "text": [ 304 | "/home/gaurav/anaconda3/envs/fastai-bleed/lib/python3.6/site-packages/fastai/datasets.py:164: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.\n", 305 | " with open(fpath, 'r') as yaml_file: return yaml.load(yaml_file)\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.3)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 21, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "1406" 322 | ] 323 | }, 324 | "execution_count": 21, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "gc.collect()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 28, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n" 343 | ] 344 | } 345 | ], 346 | "source": [ 347 | "learn.lr_find()" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 29, 353 | "metadata": {}, 354 
| "outputs": [ 355 | { 356 | "data": { 357 | "image/png": "\n", 358 | "text/plain": [ 359 | "
" 360 | ] 361 | }, 362 | "metadata": { 363 | "needs_background": "light" 364 | }, 365 | "output_type": "display_data" 366 | } 367 | ], 368 | "source": [ 369 | "learn.recorder.plot()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 30, 375 | "metadata": {}, 376 | "outputs": [ 377 | { 378 | "data": { 379 | "text/html": [ 380 | "Total time: 02:16

\n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
epochtrain_lossvalid_lossaccuracy
15.3133895.3179960.192926
\n" 394 | ], 395 | "text/plain": [ 396 | "" 397 | ] 398 | }, 399 | "metadata": {}, 400 | "output_type": "display_data" 401 | } 402 | ], 403 | "source": [ 404 | "learn.fit_one_cycle(1, 1e-2, moms=(0.8,0.7))" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 31, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "learn.save('first', with_opt=True)" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 32, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "learn.load('first', with_opt=True);" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 33, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "learn.unfreeze()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 34, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "data": { 441 | "text/html": [ 442 | "Total time: 11:23

\n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | "
epochtrain_lossvalid_lossaccuracy
14.8212204.9471710.226521
24.5401154.5865940.260965
34.2070164.2629990.297599
43.7625843.9469110.340401
53.6152023.8681990.352610
\n" 480 | ], 481 | "text/plain": [ 482 | "" 483 | ] 484 | }, 485 | "metadata": {}, 486 | "output_type": "display_data" 487 | } 488 | ], 489 | "source": [ 490 | "learn.fit_one_cycle(5, 1e-2, moms=(0.8,0.7))" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 35, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "learn.save('second_ml_lm', with_opt=True)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 36, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "learn.load('second_ml_lm', with_opt=True);" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 37, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "data": { 518 | "text/html": [ 519 | "Total time: 1:31:27

\n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | 
" \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | "
epochtrain_lossvalid_lossaccuracy
13.6394833.8626050.353660
23.5767213.8546420.354983
33.4987123.8416520.356866
43.5023303.8243110.359579
53.5296913.8015330.363115
63.5058683.7785970.366981
73.4883653.7525620.371378
83.4337613.7221810.376249
93.4946403.6926920.380994
103.2425383.6657240.385454
113.2512803.6340290.390975
123.3553043.6067410.395468
133.2753743.5772460.400490
143.2860863.5499620.405409
153.1778153.5262140.409612
163.1253043.5050810.413546
173.0747533.4826470.417718
183.1584923.4580980.422340
193.0503823.4424660.425541
202.9311133.4240890.429223
212.9336853.4051930.432484
223.0564413.3905840.435969
232.9843613.3742690.438778
242.9196793.3585810.441577
252.7842263.3466760.443988
262.7288863.3345910.446676
272.7611923.3236450.449142
282.7935623.3153690.450861
292.7823213.3067410.452649
302.7266623.2985170.454130
312.7075903.2913790.455593
322.7119213.2878840.456573
332.7502363.2826500.457487
342.7450883.2800910.458257
352.7076993.2770610.458906
362.7431323.2753010.459181
372.4870563.2747650.459493
382.6370833.2735660.459546
392.6238423.2734170.459646
402.6539913.2736270.459651
\n" 767 | ], 768 | "text/plain": [ 769 | "" 770 | ] 771 | }, 772 | "metadata": {}, 773 | "output_type": "display_data" 774 | } 775 | ], 776 | "source": [ 777 | "learn.fit_one_cycle(40, 1e-3, moms=(0.8,0.7))" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": 38, 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "learn.save('third_ml_lm', with_opt=True)" 787 | ] 788 | }, 789 | { 790 | "cell_type": "code", 791 | "execution_count": 22, 792 | "metadata": {}, 793 | "outputs": [], 794 | "source": [ 795 | "learn.load('third_ml_lm', with_opt=True);" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 23, 801 | "metadata": {}, 802 | "outputs": [], 803 | "source": [ 804 | "TEXT = \"ബംഗാളിലെ ▁ഭരണം ▁കമ്പനി\"\n", 805 | "N_WORDS = 40\n", 806 | "N_SENTENCES = 2" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": 24, 812 | "metadata": {}, 813 | "outputs": [ 814 | { 815 | "name": "stdout", 816 | "output_type": "stream", 817 | "text": [ 818 | "ബംഗാളിലെ ▁ഭരണം ▁കമ്പനി ▁അധികാര ത്തില ിരുന്ന തിനെ ▁തുടർന്ന് ▁ഭരണ ാധികാര ത്തിനെതിരെ യുള്ള ▁പ്രക്ഷോഭ ങ്ങൾ ▁തുടര ുകയും ▁നടപ്പാക്ക ുകയും ▁ചെയ്തു . ▁എന്നാൽ ▁ഈ ▁നിയമം ▁ ബ്രിട്ടീഷുകാരുടെ ▁കൈ യില ക പ്പെട ാതെ ▁വന്ന തിനാൽ ▁ഇന്ത്യയുടെ ▁സ്വാതന്ത്ര്യ ത്തിന് ▁ശേഷം ▁സി . എം . എസ് . ▁വൈസ്\n", 819 | "ബംഗാളിലെ ▁ഭരണം ▁കമ്പനി ക്ക് ▁കർശന മായ ▁ഒരു ▁സ്കൂൾ ▁സ്ഥാപിക്ക ുന്നതിന് ▁വേണ്ടി ▁വിദ്യാഭ്യാസ ▁സംവിധാന ത്തിനായി ▁നടത്തുന്ന ▁പദ്ധതി യാണ് ▁ഇന്റർനാഷണൽ ▁ഇൻസ്റ്റിറ്റ്യൂ ട്ട് ▁ഓഫ് ▁ടെക്നോളജി . ▁കേരള ▁സംസ്ഥാന ▁ഐ . ടി . ഒ ▁ യുടെ ▁സ്ഥാപക ൻ ▁എന്ന ▁നിലയിൽ ▁പ്രവർത്തിച്ച ിട്ടുണ്ട് . 
▁1999 ▁ൽ ▁ജ യിൽ\n" 820 | ] 821 | } 822 | ], 823 | "source": [ 824 | "print(\"\\n\".join(learn.predict(TEXT, N_WORDS, temperature=0.75) for _ in range(N_SENTENCES)))" 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": 42, 830 | "metadata": {}, 831 | "outputs": [ 832 | { 833 | "data": { 834 | "text/plain": [ 835 | "26.39039188081262" 836 | ] 837 | }, 838 | "execution_count": 42, 839 | "metadata": {}, 840 | "output_type": "execute_result" 841 | } 842 | ], 843 | "source": [ 844 | "np.exp(3.273)" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 25, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [ 853 | "defaults.device = torch.device('cpu')\n", 854 | "learn.model.eval()\n", 855 | "learn.export()" 856 | ] 857 | }, 858 | { 859 | "cell_type": "code", 860 | "execution_count": 12, 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "# Generating embedding vectors for visualization" 865 | ] 866 | }, 867 | { 868 | "cell_type": "code", 869 | "execution_count": 13, 870 | "metadata": {}, 871 | "outputs": [ 872 | { 873 | "data": { 874 | "text/plain": [ 875 | "PosixPath('/home/gaurav/PycharmProjects/nlp-for-malyalam/language-model')" 876 | ] 877 | }, 878 | "execution_count": 13, 879 | "metadata": {}, 880 | "output_type": "execute_result" 881 | } 882 | ], 883 | "source": [ 884 | "path" 885 | ] 886 | }, 887 | { 888 | "cell_type": "code", 889 | "execution_count": 14, 890 | "metadata": {}, 891 | "outputs": [], 892 | "source": [ 893 | "defaults.device = torch.device('cpu')" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": 15, 899 | "metadata": {}, 900 | "outputs": [], 901 | "source": [ 902 | "learn = load_learner(path / 'MalyalamDataset/')" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": 16, 908 | "metadata": {}, 909 | "outputs": [], 910 | "source": [ 911 | "encoder = get_model(learn.model)[0]" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | 
"execution_count": 17, 917 | "metadata": {}, 918 | "outputs": [ 919 | { 920 | "data": { 921 | "text/plain": [ 922 | "torch.Size([10000, 400])" 923 | ] 924 | }, 925 | "execution_count": 17, 926 | "metadata": {}, 927 | "output_type": "execute_result" 928 | } 929 | ], 930 | "source": [ 931 | "encoder.state_dict()['encoder.weight'].shape" 932 | ] 933 | }, 934 | { 935 | "cell_type": "code", 936 | "execution_count": 18, 937 | "metadata": {}, 938 | "outputs": [], 939 | "source": [ 940 | "embeddings = encoder.state_dict()['encoder.weight']" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 19, 946 | "metadata": {}, 947 | "outputs": [], 948 | "source": [ 949 | "embeddings = np.array(embeddings)" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": 20, 955 | "metadata": {}, 956 | "outputs": [ 957 | { 958 | "data": { 959 | "text/plain": [ 960 | "(400,)" 961 | ] 962 | }, 963 | "execution_count": 20, 964 | "metadata": {}, 965 | "output_type": "execute_result" 966 | } 967 | ], 968 | "source": [ 969 | "embeddings[0].shape" 970 | ] 971 | }, 972 | { 973 | "cell_type": "code", 974 | "execution_count": 21, 975 | "metadata": {}, 976 | "outputs": [], 977 | "source": [ 978 | "df = pd.DataFrame(embeddings)" 979 | ] 980 | }, 981 | { 982 | "cell_type": "code", 983 | "execution_count": 22, 984 | "metadata": {}, 985 | "outputs": [ 986 | { 987 | "data": { 988 | "text/html": [ 989 | "

\n", 990 | "\n", 1003 | "\n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " 
\n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | "
0123456789...390391392393394395396397398399
00.291967-0.002322-0.209247-0.458362-0.2118710.5251190.2169680.683054-0.000421-0.451211...-0.018488-0.2096550.5763930.7421100.626956-0.273615-0.226769-0.459669-0.1385130.613912
1-0.2735490.049740-1.247350-1.3723030.257762-0.763619-0.4829610.1712560.6848940.376489...0.5475700.7437070.798664-0.216511-0.258916-0.239699-0.0225760.5938540.076901-0.260444
2-0.2309500.072833-1.153181-1.2880290.275702-0.715835-0.4789830.0997320.6929430.347465...0.5258080.6402400.775471-0.225178-0.279536-0.167131-0.0100200.6089660.082256-0.256163
31.527329-0.1991700.261630-0.4113900.225107-0.0327930.3071980.282542-0.511452-0.663876...-0.9426140.0649600.0446300.4067660.304243-0.106900-0.3111881.198795-0.061395-0.982869
4-0.253548-0.032533-0.4132300.2018650.2094670.4166061.055918-0.1906800.999718-0.150951...0.0358560.7365210.9324990.8111800.393260-0.1273880.5852110.2527970.1050650.084277
\n", 1153 | "

5 rows × 400 columns

\n", 1154 | "
" 1155 | ], 1156 | "text/plain": [ 1157 | " 0 1 2 3 4 5 6 \\\n", 1158 | "0 0.291967 -0.002322 -0.209247 -0.458362 -0.211871 0.525119 0.216968 \n", 1159 | "1 -0.273549 0.049740 -1.247350 -1.372303 0.257762 -0.763619 -0.482961 \n", 1160 | "2 -0.230950 0.072833 -1.153181 -1.288029 0.275702 -0.715835 -0.478983 \n", 1161 | "3 1.527329 -0.199170 0.261630 -0.411390 0.225107 -0.032793 0.307198 \n", 1162 | "4 -0.253548 -0.032533 -0.413230 0.201865 0.209467 0.416606 1.055918 \n", 1163 | "\n", 1164 | " 7 8 9 ... 390 391 392 393 \\\n", 1165 | "0 0.683054 -0.000421 -0.451211 ... -0.018488 -0.209655 0.576393 0.742110 \n", 1166 | "1 0.171256 0.684894 0.376489 ... 0.547570 0.743707 0.798664 -0.216511 \n", 1167 | "2 0.099732 0.692943 0.347465 ... 0.525808 0.640240 0.775471 -0.225178 \n", 1168 | "3 0.282542 -0.511452 -0.663876 ... -0.942614 0.064960 0.044630 0.406766 \n", 1169 | "4 -0.190680 0.999718 -0.150951 ... 0.035856 0.736521 0.932499 0.811180 \n", 1170 | "\n", 1171 | " 394 395 396 397 398 399 \n", 1172 | "0 0.626956 -0.273615 -0.226769 -0.459669 -0.138513 0.613912 \n", 1173 | "1 -0.258916 -0.239699 -0.022576 0.593854 0.076901 -0.260444 \n", 1174 | "2 -0.279536 -0.167131 -0.010020 0.608966 0.082256 -0.256163 \n", 1175 | "3 0.304243 -0.106900 -0.311188 1.198795 -0.061395 -0.982869 \n", 1176 | "4 0.393260 -0.127388 0.585211 0.252797 0.105065 0.084277 \n", 1177 | "\n", 1178 | "[5 rows x 400 columns]" 1179 | ] 1180 | }, 1181 | "execution_count": 22, 1182 | "metadata": {}, 1183 | "output_type": "execute_result" 1184 | } 1185 | ], 1186 | "source": [ 1187 | "df.head()" 1188 | ] 1189 | }, 1190 | { 1191 | "cell_type": "code", 1192 | "execution_count": 23, 1193 | "metadata": {}, 1194 | "outputs": [ 1195 | { 1196 | "data": { 1197 | "text/plain": [ 1198 | "(10000, 400)" 1199 | ] 1200 | }, 1201 | "execution_count": 23, 1202 | "metadata": {}, 1203 | "output_type": "execute_result" 1204 | } 1205 | ], 1206 | "source": [ 1207 | "df.shape" 1208 | ] 1209 | }, 1210 | { 1211 | "cell_type": "code", 
1212 | "execution_count": 24, 1213 | "metadata": {}, 1214 | "outputs": [], 1215 | "source": [ 1216 | "df.to_csv('embeddings.tsv', sep='\\t', index=False, header=False)" 1217 | ] 1218 | }, 1219 | { 1220 | "cell_type": "code", 1221 | "execution_count": 25, 1222 | "metadata": {}, 1223 | "outputs": [], 1224 | "source": [ 1225 | "df2 = pd.DataFrame(itos)" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "code", 1230 | "execution_count": 26, 1231 | "metadata": {}, 1232 | "outputs": [ 1233 | { 1234 | "data": { 1235 | "text/html": [ 1236 | "
\n", 1237 | "\n", 1250 | "\n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | "
0
0<unk>
1<s>
2</s>
3.
4
\n", 1280 | "
" 1281 | ], 1282 | "text/plain": [ 1283 | " 0\n", 1284 | "0 \n", 1285 | "1 \n", 1286 | "2 \n", 1287 | "3 .\n", 1288 | "4 ്" 1289 | ] 1290 | }, 1291 | "execution_count": 26, 1292 | "metadata": {}, 1293 | "output_type": "execute_result" 1294 | } 1295 | ], 1296 | "source": [ 1297 | "df2.head()" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "code", 1302 | "execution_count": 27, 1303 | "metadata": {}, 1304 | "outputs": [ 1305 | { 1306 | "data": { 1307 | "text/plain": [ 1308 | "(10000, 1)" 1309 | ] 1310 | }, 1311 | "execution_count": 27, 1312 | "metadata": {}, 1313 | "output_type": "execute_result" 1314 | } 1315 | ], 1316 | "source": [ 1317 | "df2.shape" 1318 | ] 1319 | }, 1320 | { 1321 | "cell_type": "code", 1322 | "execution_count": 28, 1323 | "metadata": {}, 1324 | "outputs": [], 1325 | "source": [ 1326 | "df2.to_csv('embeddings_metadata.tsv', sep='\\t', index=False, header=False)" 1327 | ] 1328 | }, 1329 | { 1330 | "cell_type": "code", 1331 | "execution_count": 29, 1332 | "metadata": {}, 1333 | "outputs": [ 1334 | { 1335 | "data": { 1336 | "text/plain": [ 1337 | "tensor([-2.7355e-01, 4.9740e-02, -1.2474e+00, -1.3723e+00, 2.5776e-01,\n", 1338 | " -7.6362e-01, -4.8296e-01, 1.7126e-01, 6.8489e-01, 3.7649e-01,\n", 1339 | " 1.2907e-01, 1.1018e-01, 5.1568e-01, 7.9365e-02, -1.9693e-01,\n", 1340 | " 1.5606e+00, 1.3909e-01, 4.4212e-01, 2.8622e-01, 1.9001e-01,\n", 1341 | " 4.1099e-01, 1.1254e+00, -2.0873e-01, -7.7392e-02, 5.6829e-01,\n", 1342 | " 3.9020e-01, 6.2949e-01, 1.7590e-01, 3.3018e-01, 2.8685e-01,\n", 1343 | " -7.7555e-01, 3.0622e-01, 1.7872e-01, 9.3845e-01, 4.2830e-02,\n", 1344 | " -8.5402e-02, -5.2553e-01, 4.3522e-01, 2.7174e-01, -1.2583e-01,\n", 1345 | " 4.8475e-01, -3.7728e-01, -3.6608e-01, 5.4688e-01, 2.1373e-01,\n", 1346 | " -5.6125e-01, -2.7174e-01, -9.3416e-01, 2.4786e-01, -5.0529e-01,\n", 1347 | " -3.4035e-01, -2.1120e-01, 3.7982e-02, -1.2824e-01, -3.4154e-01,\n", 1348 | " 8.7507e-01, 4.3182e-01, 3.1867e-01, -1.1197e+00, -1.2728e-01,\n", 1349 | " 9.6618e-01, 
-7.5389e-02, -3.4188e-01, 2.2546e-01, 1.7317e-01,\n", 1350 | " 1.4208e+00, 1.1709e-01, -1.9090e-02, -4.0858e-02, -1.1572e-01,\n", 1351 | " 1.7867e-01, -2.2730e-01, 9.5290e-01, 8.5460e-01, -5.4994e-02,\n", 1352 | " -2.2447e+00, 7.6660e-01, -4.2285e-01, -2.3511e-01, 1.4848e-01,\n", 1353 | " -4.1073e-01, 3.3902e-01, 4.5169e-01, -1.0511e-01, 5.9478e-01,\n", 1354 | " 1.8716e-01, 5.2635e-01, 2.2042e-01, -3.4843e-02, 1.6912e-01,\n", 1355 | " -2.4716e-01, 3.5367e-01, 1.9962e-01, 2.4432e-01, -3.0583e-01,\n", 1356 | " -2.6313e-01, -5.4300e-02, 1.6807e-01, 3.3860e-01, 4.0517e-01,\n", 1357 | " -3.4211e-01, 2.5578e-01, -2.6645e-01, -8.5716e-02, -1.5947e+00,\n", 1358 | " -5.5090e-02, 3.2921e-01, -1.8224e-01, 9.1738e-01, -1.0322e+00,\n", 1359 | " 1.9760e+00, -5.7727e-01, 5.0660e-01, 4.9145e-01, 3.2897e-01,\n", 1360 | " 5.2335e-02, 1.0763e-01, 7.4897e-02, -4.4596e-02, -1.9440e-02,\n", 1361 | " -4.4593e-01, 4.0274e-01, -4.8848e-01, -2.7417e-01, -9.5853e-02,\n", 1362 | " 5.4816e-01, -1.9212e-01, 1.4258e-01, 2.8511e-01, 1.9044e-01,\n", 1363 | " -1.3431e-01, 2.5034e-01, 5.3367e-02, -3.2784e-01, -1.7451e-01,\n", 1364 | " 1.6196e-03, 9.6878e-01, -4.0620e-01, 4.7420e-01, -3.1002e-01,\n", 1365 | " 2.4126e-01, 9.3827e-01, -2.2575e-01, 1.1790e+00, 2.1420e-01,\n", 1366 | " 1.2176e-01, 3.6928e-01, -3.7387e-01, 1.6095e-02, -1.1043e+00,\n", 1367 | " 3.8309e-01, -3.0535e-01, 2.2009e-01, 4.8657e-01, 3.3336e-01,\n", 1368 | " -2.0681e-02, -3.7418e-01, -9.1325e-01, 2.2386e-01, 1.1208e-01,\n", 1369 | " 7.6523e-01, 5.1890e-01, -1.8135e-01, 8.4246e-01, 1.7637e-01,\n", 1370 | " 3.2233e-01, 2.9010e-01, 6.5388e-01, -1.6610e+00, -4.2370e-02,\n", 1371 | " 2.0479e-01, -6.9538e-02, -1.2717e-01, 3.3148e-01, -1.8729e-01,\n", 1372 | " 6.3649e-01, -2.2464e-01, -1.0190e-01, 4.6866e-01, -5.5892e-02,\n", 1373 | " 3.6058e-01, 3.2559e-01, 3.4009e-01, -2.7135e-01, -1.0699e+00,\n", 1374 | " 4.3009e-02, -5.3111e-01, -2.7182e-01, 1.5959e-01, -5.1326e-01,\n", 1375 | " -5.8041e-01, 1.0743e-01, -2.5454e-01, -1.9223e-01, 
5.0041e-01,\n", 1376 | " -1.0436e-01, -8.8933e-02, 6.4387e-01, -1.5711e-01, 2.6400e-01,\n", 1377 | " 5.4522e-01, 2.8376e-02, -4.5845e-01, 5.4044e-01, -4.0886e-01,\n", 1378 | " 5.7746e-01, 8.4322e-02, -1.6316e-01, 1.0962e+00, -2.6665e-01,\n", 1379 | " -3.2856e-01, 2.6375e-01, -7.4362e-01, -2.1490e-01, -5.0500e-01,\n", 1380 | " -1.1855e-01, 4.5519e-01, -4.1367e-01, -7.2039e-02, -4.5755e-02,\n", 1381 | " 9.1112e-02, -3.6581e-01, 5.9046e-01, 2.2897e-02, 3.3079e-01,\n", 1382 | " 7.2433e-02, 3.7962e-01, -3.9742e-01, -9.3628e-02, -3.8505e-01,\n", 1383 | " 1.9628e-01, 1.9122e-02, 9.5473e-01, 2.0479e-01, 7.0306e-02,\n", 1384 | " -6.7453e-01, 4.6173e-01, 2.3797e-01, -3.8475e-01, -1.3172e-01,\n", 1385 | " -3.8401e-02, 3.6653e-01, -1.4525e-01, 6.5865e-01, 8.4010e-01,\n", 1386 | " -1.5269e-01, 2.0602e-01, 8.4053e-01, 4.1965e-02, -3.9192e-01,\n", 1387 | " -1.9203e+00, 1.1139e+00, 4.7090e-01, -9.1036e-01, 2.8499e-01,\n", 1388 | " -1.3596e+00, -4.3914e-01, -2.1283e-01, 5.1143e-01, 4.0260e-01,\n", 1389 | " -3.0227e-01, -1.9529e-01, 7.4790e-02, 2.5873e-01, -9.0964e-02,\n", 1390 | " 6.0427e-01, 3.1723e-02, 7.2567e-02, -4.8054e-02, -2.5665e-01,\n", 1391 | " 3.1949e-01, -2.5791e-01, -2.4215e-01, -1.2559e-01, -3.1404e-02,\n", 1392 | " -1.0723e+00, -1.4186e-01, -8.5835e-01, 8.4293e-01, 7.3532e-02,\n", 1393 | " -1.0460e+00, -4.9860e-01, 5.6247e-01, -5.0750e-01, 3.4033e-01,\n", 1394 | " 2.1924e-01, -8.0956e-02, -8.7188e-01, 5.4076e-01, -2.3494e-01,\n", 1395 | " -1.9719e-01, -3.3236e-01, -8.6199e-02, 4.6256e-01, 2.2004e-01,\n", 1396 | " 1.8080e-01, 2.4210e-01, -5.9047e-02, -2.5285e-01, -1.9066e-01,\n", 1397 | " -9.8962e-01, 1.2047e-01, -1.3332e-01, 2.4901e-01, -2.3874e-01,\n", 1398 | " 2.9169e-01, 3.4328e-01, -1.0401e+00, -1.0636e-01, -8.3790e-01,\n", 1399 | " -2.2283e-01, 3.7870e-02, 3.0247e-01, -3.2111e-01, -3.9612e-01,\n", 1400 | " 3.8965e-01, 6.4064e-02, 1.2912e+00, 3.6725e-01, 8.3852e-02,\n", 1401 | " -9.8076e-01, -2.5177e-01, 3.2505e-01, 2.8850e-01, 6.8628e-04,\n", 1402 | " 
1.0167e+00, -3.3983e-01, -7.0606e-02, -4.1021e-01, 6.2122e-02,\n", 1403 | " 5.7021e-01, 1.8068e-01, 2.2632e-01, -1.7197e-01, -1.1161e-01,\n", 1404 | " -7.9958e-01, -1.0696e-01, 5.4813e-01, -2.5078e-01, -2.2282e-01,\n", 1405 | " 1.1968e-01, 5.5584e-01, -4.2861e-01, -3.8036e-01, -5.1863e-01,\n", 1406 | " -4.4458e-01, -4.3260e-01, 1.0323e-01, -9.5130e-01, -4.5454e-01,\n", 1407 | " -2.5369e-01, 1.6794e-02, 2.4722e-01, -5.3022e-01, 1.2644e-01,\n", 1408 | " -4.2388e-01, -5.0187e-01, 1.0373e-01, 7.9540e-03, -3.2078e-01,\n", 1409 | " 1.2055e+00, 3.5049e-01, -4.6069e-01, 1.9396e-01, 9.6956e-01,\n", 1410 | " 4.6293e-01, -5.9837e-02, 3.0735e-01, 6.1025e-02, 3.4897e-01,\n", 1411 | " 4.2811e-02, 7.1975e-01, -3.9895e-02, 1.6942e-01, 3.3076e-02,\n", 1412 | " -2.4922e-01, 3.2290e-01, 4.2509e-01, -3.4705e-02, 4.4104e-01,\n", 1413 | " -2.7307e-01, 1.1856e-01, 4.9750e-02, -2.6810e-01, 1.4808e-01,\n", 1414 | " -1.9500e-01, -4.4822e-01, 5.8663e-01, 4.4076e-02, 2.4390e-01,\n", 1415 | " 5.4757e-01, 7.4371e-01, 7.9866e-01, -2.1651e-01, -2.5892e-01,\n", 1416 | " -2.3970e-01, -2.2576e-02, 5.9385e-01, 7.6901e-02, -2.6044e-01])" 1417 | ] 1418 | }, 1419 | "execution_count": 29, 1420 | "metadata": {}, 1421 | "output_type": "execute_result" 1422 | } 1423 | ], 1424 | "source": [ 1425 | "encoder.state_dict()['encoder.weight'][1]" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "code", 1430 | "execution_count": null, 1431 | "metadata": {}, 1432 | "outputs": [], 1433 | "source": [] 1434 | } 1435 | ], 1436 | "metadata": { 1437 | "kernelspec": { 1438 | "display_name": "Python 3", 1439 | "language": "python", 1440 | "name": "python3" 1441 | }, 1442 | "language_info": { 1443 | "codemirror_mode": { 1444 | "name": "ipython", 1445 | "version": 3 1446 | }, 1447 | "file_extension": ".py", 1448 | "mimetype": "text/x-python", 1449 | "name": "python", 1450 | "nbconvert_exporter": "python", 1451 | "pygments_lexer": "ipython3", 1452 | "version": "3.7.4" 1453 | } 1454 | }, 1455 | "nbformat": 4, 1456 | 
"nbformat_minor": 2 1457 | } 1458 | -------------------------------------------------------------------------------- /classification/Malyalam_Classification_Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from fastai.text import *\n", 10 | "import numpy as np\n", 11 | "from sklearn.model_selection import train_test_split\n", 12 | "import pickle\n", 13 | "import sentencepiece as spm\n", 14 | "import re\n", 15 | "import pdb" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "('1.0.57', '1.0.0')" 27 | ] 28 | }, 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "import fastai, torch\n", 36 | "fastai.__version__ , torch.__version__" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "torch.cuda.set_device(0)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "def random_seed(seed_value, use_cuda):\n", 55 | " np.random.seed(seed_value) \n", 56 | " torch.manual_seed(seed_value) \n", 57 | " random.seed(seed_value)\n", 58 | " if use_cuda:\n", 59 | " torch.cuda.manual_seed(seed_value)\n", 60 | " torch.cuda.manual_seed_all(seed_value) \n", 61 | " torch.backends.cudnn.deterministic = True\n", 62 | " torch.backends.cudnn.benchmark = False" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "random_seed(42, True)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | 
"output_type": "stream", 82 | "text": [ 83 | "/data/home/ubuntu/gaurav/in/nlp-for-malyalam/classification\r\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "!pwd" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 7, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "path = Path('./')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 8, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/html": [ 108 | "
\n", 109 | "\n", 122 | "\n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | "
01
0businessജോലിയില്‍ നിന്ന് ഒരു ബ്രേക്ക് എടുക്കുന്നതിനു മ...
1businessകമ്ബോളങ്ങള്‍ കരടിയുടെ പിടിയില്‍
2businessകൊച്ചി മെട്രോയുടെ ബ്രാന്‍ഡ് അംബാസിഡറായി നടന്‍ ...
3businessഇന്ധനവിലയില്‍ വീണ്ടും വര്‍ദ്ധനവ്, പെട്രോളിന് 1...
4sportsഫെഡറേഷന്‍ കപ്പ‌് അത‌്‌ലറ്റിക‌്സിന‌് ഇന്ന‌് തുട...
\n", 158 | "
" 159 | ], 160 | "text/plain": [ 161 | " 0 1\n", 162 | "0 business ജോലിയില്‍ നിന്ന് ഒരു ബ്രേക്ക് എടുക്കുന്നതിനു മ...\n", 163 | "1 business കമ്ബോളങ്ങള്‍ കരടിയുടെ പിടിയില്‍\n", 164 | "2 business കൊച്ചി മെട്രോയുടെ ബ്രാന്‍ഡ് അംബാസിഡറായി നടന്‍ ...\n", 165 | "3 business ഇന്ധനവിലയില്‍ വീണ്ടും വര്‍ദ്ധനവ്, പെട്രോളിന് 1...\n", 166 | "4 sports ഫെഡറേഷന്‍ കപ്പ‌് അത‌്‌ലറ്റിക‌്സിന‌് ഇന്ന‌് തുട..." 167 | ] 168 | }, 169 | "execution_count": 8, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "df_train = pd.read_csv(path/'../../classification_public_datasets/inltk-headlines/ml/ml-train.csv', header=None)\n", 176 | "df_train.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 9, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/html": [ 187 | "
\n", 188 | "\n", 201 | "\n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | "
01
0businessട്രെയിന്‍ യാത്രയില്‍ ഇനി കുലുക്കം കുറയും, ജെര്...
1sportsപാ​ലാ സെ​ന്‍റ് തോ​മ​സ് ചാ​മ്ബ്യന്മാ​ര്‍
2sportsഓസ്‌ട്രേലിയയ്‌ക്കെതിരെ ഇനി ധോണിയില്ല; ലോകകപ്പി...
3sportsടെസ്റ്റിന് വേഗം കൂട്ടാന്‍ എം.സി.സി
4sportsഓള്‍ ഇംഗ്ലണ്ട് ബാഡ്മിന്റണില്‍ ശ്രീകാന്തും പുറത...
\n", 237 | "
" 238 | ], 239 | "text/plain": [ 240 | " 0 1\n", 241 | "0 business ട്രെയിന്‍ യാത്രയില്‍ ഇനി കുലുക്കം കുറയും, ജെര്...\n", 242 | "1 sports പാ​ലാ സെ​ന്‍റ് തോ​മ​സ് ചാ​മ്ബ്യന്മാ​ര്‍\n", 243 | "2 sports ഓസ്‌ട്രേലിയയ്‌ക്കെതിരെ ഇനി ധോണിയില്ല; ലോകകപ്പി...\n", 244 | "3 sports ടെസ്റ്റിന് വേഗം കൂട്ടാന്‍ എം.സി.സി\n", 245 | "4 sports ഓള്‍ ഇംഗ്ലണ്ട് ബാഡ്മിന്റണില്‍ ശ്രീകാന്തും പുറത..." 246 | ] 247 | }, 248 | "execution_count": 9, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "df_valid = pd.read_csv(path/'../../classification_public_datasets/inltk-headlines/ml/ml-valid.csv', header=None)\n", 255 | "df_valid.head()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 10, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "data": { 265 | "text/html": [ 266 | "
\n", 267 | "\n", 280 | "\n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | "
01
0sportsഇഞ്ചുറി ടൈം പെനാല്‍റ്റിയില്‍ എഫ് സി പോര്‍ട്ടോ
1entertainmentആമിര്‍ ഖാന്റെ ഏറ്റവും പുതിയ ചിത്രം ലാല്‍ സിങ് ...
2sportsഐ പി എല്ലിന് മുന്‍പായി ഓസ്‌ട്രേലിയന്‍ ടീമിനൊപ്...
3businessസാമ്ബത്തിക ജീവിതം സുരക്ഷിതമാക്കണോ? ഈ അഞ്ച് ശീല...
4businessഎല്‍ഇഡി ബള്‍ബുകള്‍ ലഭ്യമാക്കും; പദ്ധതിയുടെ രജി...
\n", 316 | "
" 317 | ], 318 | "text/plain": [ 319 | " 0 1\n", 320 | "0 sports ഇഞ്ചുറി ടൈം പെനാല്‍റ്റിയില്‍ എഫ് സി പോര്‍ട്ടോ\n", 321 | "1 entertainment ആമിര്‍ ഖാന്റെ ഏറ്റവും പുതിയ ചിത്രം ലാല്‍ സിങ് ...\n", 322 | "2 sports ഐ പി എല്ലിന് മുന്‍പായി ഓസ്‌ട്രേലിയന്‍ ടീമിനൊപ്...\n", 323 | "3 business സാമ്ബത്തിക ജീവിതം സുരക്ഷിതമാക്കണോ? ഈ അഞ്ച് ശീല...\n", 324 | "4 business എല്‍ഇഡി ബള്‍ബുകള്‍ ലഭ്യമാക്കും; പദ്ധതിയുടെ രജി..." 325 | ] 326 | }, 327 | "execution_count": 10, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "df_test = pd.read_csv(path/'../../classification_public_datasets/inltk-headlines/ml/ml-test.csv', header=None)\n", 334 | "df_test.head()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 11, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "((5036, 2), (630, 2), (630, 2))" 346 | ] 347 | }, 348 | "execution_count": 11, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "df_train.shape, df_valid.shape, df_test.shape" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 12, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/plain": [ 365 | "((0, 2), (0, 2), (0, 2))" 366 | ] 367 | }, 368 | "execution_count": 12, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "df_train[df_train[0].isnull()].shape, df_valid[df_valid[0].isnull()].shape, df_test[df_test[0].isnull()].shape" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 13, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [ 383 | "label_cols = [0]" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 14, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "class MalyalamTokenizer(BaseTokenizer):\n", 393 | " def __init__(self, lang:str):\n", 394 | " self.lang = lang\n", 395 | " self.sp 
= spm.SentencePieceProcessor()\n", 396 | " self.sp.Load(str('./../../models/malayalam/tokenizer/malyalam_lm.model'))\n", 397 | " \n", 398 | " def tokenizer(self, t:str) -> List[str]:\n", 399 | " return self.sp.EncodeAsPieces(t)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 15, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "sp = spm.SentencePieceProcessor()\n", 409 | "sp.Load(str('./../../models/malayalam/tokenizer/malyalam_lm.model'))\n", 410 | "itos = [sp.IdToPiece(int(i)) for i in range(10000)]" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 16, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "# 10,000 is the vocab size that we chose in sentencepiece\n", 420 | "malyalam_vocab = Vocab(itos)" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 17, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "tokenizer = Tokenizer(tok_func=MalyalamTokenizer, lang='ml')" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 18, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "['xxunk',\n", 441 | " 'xxpad',\n", 442 | " 'xxbos',\n", 443 | " 'xxeos',\n", 444 | " 'xxfld',\n", 445 | " 'xxmaj',\n", 446 | " 'xxup',\n", 447 | " 'xxrep',\n", 448 | " 'xxwrep']" 449 | ] 450 | }, 451 | "execution_count": 18, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | } 455 | ], 456 | "source": [ 457 | "tokenizer.special_cases" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 19, 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "data": { 467 | "text/html": [], 468 | "text/plain": [ 469 | "" 470 | ] 471 | }, 472 | "metadata": {}, 473 | "output_type": "display_data" 474 | }, 475 | { 476 | "data": { 477 | "text/html": [], 478 | "text/plain": [ 479 | "" 480 | ] 481 | }, 482 | "metadata": {}, 483 | "output_type": "display_data" 484 | }, 485 | { 486 
| "data": { 487 | "text/html": [], 488 | "text/plain": [ 489 | "" 490 | ] 491 | }, 492 | "metadata": {}, 493 | "output_type": "display_data" 494 | } 495 | ], 496 | "source": [ 497 | "data_lm = TextLMDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=malyalam_vocab)" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 20, 503 | "metadata": {}, 504 | "outputs": [ 505 | { 506 | "data": { 507 | "text/html": [ 508 | "\n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | "
idxtext
0് ▁വീണ്ടും ▁വര ് ▁ ദ്ധ ന വ് , ▁പെട്രോ ള ിന് ▁14 ▁പൈ സ യും ▁ഡീ സ ലി ന് ▁15 ▁പൈ സ യും ▁വര ് ▁ ദ്ധ ിച്ചു ▁ x x bo s ▁ഫെഡറ േഷന ് ▁കപ്പ ▁ ് ▁അത ▁ ് ▁ല റ്റി ക ▁ ് സി ന ▁ ് ▁ഇന്ന ▁ ് ▁തുടക്കം ▁ x x bo s ▁ഓഹരി ▁സൂചി ക കള ് ▁നേട്ട ത്തില ് ▁
1▁500 ▁പേര് ▁ ക്ക് ▁ജോലി ▁ x x bo s ▁പോ ണ് ▁നടി യായി ▁ര മ ്യാ ▁കൃഷ്ണ ന് ▁ ; ▁സുപ്രധാന ▁സീ ന ിന് ▁എടുത്ത ത് ▁ 37 ▁ട േ ക്ക െന്ന് ▁താര ം ▁ x x bo s ▁സി യാ ല് ▁ഇനി ▁കൊച്ചി യുടെ ▁മോ ട്ടോ ര ് ▁സ്പ ോ ര ് ▁ട ് സ് ▁ഹ ബ്ബ ് ▁ x x bo s ▁' അ ട ൂര ്
2യാ ട്ട ▁ x x bo s ▁സര ് ▁വ്വ ം ▁താള മയ ത്തിന് ▁റെ ▁തെ ലു ഗ് ▁ട്ര െയ് ▁ല ര ് ▁പുറത്തു ▁വീട്ടു ▁ x x bo s ▁വൈ റ ലാ കാ ന് ▁വ ഴു ത ന ▁എത്തുന്ന ു ▁ x x bo s ▁ഐ എ എ ▁ലോ ▁ക ▁ഉ ▁ ച്ച ▁കോ ▁ടി ▁കൊ ▁ ച്ചി ▁ യി ▁ല ് ▁സ ▁മാ ▁പി ▁ ച്ചു ▁ x
3ടിയ ▁ഇന്ത്യ ക്ക് ▁തോ ല് ▁വി യും ▁പരമ ് ബ ര ▁നഷ്ട വും ▁ x x bo s ▁അക്കൗണ്ട ില ് ▁നിന്ന് ▁പണം ▁ചോര ാം ; ▁ഹി ഡ ന് ▁ആ പ്പ ുകള ് ▁ ; ▁പുതിയ ▁തട്ടി പ്പ ുകള ് ▁ഇങ്ങനെ യാണ് ▁ x x bo s ▁തു വ്വ ൂര ് ▁അഖിലേന്ത്യാ ▁സെ വ ന് ▁ സിന്റെ ▁ഫൈനല ് ▁ഇന്ന് ▁ x x bo s ▁വേറിട്ട ▁പ്രവചന ം <unk> ▁ഇത്
4▁പിന്മാറ ് റ മെന്ന് ▁റി പ്പോ ര ് ▁ ട്ട് ▁ x x bo s ▁ഐ . എസ് . എ ല് ▁രണ്ടാം ▁സെമി ▁ഫെ െ ന ലി ല് ▁മു ം ബെ െ ▁സിറ്റി ▁ജയ ത്തോടെ ▁പുറത്തേക്ക ് ▁ x x bo s ▁കു മ ് ബ ള ങ്ങി യിലെ ▁ഫ്ര ാങ്ക ി , ▁മാ ത്യ ു ▁തോ മ സിനെ ▁കണ്ടെത്തിയ തി ങ്ങനെ - ▁വി ഡി യോ ▁ x x bo
" 538 | ], 539 | "text/plain": [ 540 | "" 541 | ] 542 | }, 543 | "metadata": {}, 544 | "output_type": "display_data" 545 | } 546 | ], 547 | "source": [ 548 | "data_lm.show_batch()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 21, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "awd_lstm_config = awd_lstm_lm_config.copy()\n", 558 | "awd_lstm_config['n_hid'] = 1150\n", 559 | "learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.3, config=awd_lstm_config, pretrained=False)" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 22, 565 | "metadata": { 566 | "scrolled": true 567 | }, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "LanguageLearner(data=TextLMDataBunch;\n", 573 | "\n", 574 | "Train: LabelList (5036 items)\n", 575 | "x: LMTextList\n", 576 | "▁ x x bo s ▁ജോലി യില ് ▁നിന്ന് ▁ഒരു ▁ബ്ര േക്ക് ▁എടുക്ക ുന്നതിനു ▁മു ന് ▁പ ് . .,▁ x x bo s ▁ക മ ് ബോ ള ങ്ങള ് ▁കര ടി യുടെ ▁പിടി യില ്,▁ x x bo s ▁കൊച്ചി ▁മെട്രോ യുടെ ▁ബ്രാ ന് ▁ ഡ് ▁അംബ ാ സി ഡ റായി ▁നട ന് ▁സുരേഷ് ▁ഗോപി യെ ▁നിയമ ിച്ചു,▁ x x bo s ▁ഇന്ധന വില യില ് ▁വീണ്ടും ▁വര ് ▁ ദ്ധ ന വ് , ▁പെട്രോ ള ിന് ▁14 ▁പൈ സ യും ▁ഡീ സ ലി ന് ▁15 ▁പൈ സ യും ▁വര ് ▁ ദ്ധ ിച്ചു,▁ x x bo s ▁ഫെഡറ േഷന ് ▁കപ്പ ▁ ് ▁അത ▁ ് ▁ല റ്റി ക ▁ ് സി ന ▁ ് ▁ഇന്ന ▁ ് ▁തുടക്കം\n", 577 | "y: LMLabelList\n", 578 | ",,,,\n", 579 | "Path: .;\n", 580 | "\n", 581 | "Valid: LabelList (630 items)\n", 582 | "x: LMTextList\n", 583 | "▁ x x bo s ▁ട്രെയിന ് ▁യാത്ര യില ് ▁ഇനി ▁കുല ു ക്കം ▁കുറയ ും , ▁ജെ ര ് ▁ ക്ക ിങ് ▁ഒഴിവാക്ക ുന്നതിനുള്ള ▁നൂതന ▁സാങ്കേതിക ▁വിദ്യ ▁പ്രീ മിയ ം ▁ട്രെയിന ുകളില ്,▁ x x bo s ▁പാ ▁ലാ ▁സെ ▁ന് ▁റ ് ▁തോ ▁മ ▁സ് ▁ചാ ▁മ ് ബ് യ ന് മാ ▁ര ്,▁ x x bo s ▁ഓ സ് ▁ട്ര േലിയ യ് ▁ ക്കെതിരെ ▁ഇനി ▁ധ ോ ണിയ ില്ല ; ▁ലോകകപ്പ ിന് ▁മു മ ് ബ് ▁ഋഷഭ ് ▁പന്ത ിന് ▁സു വര ് ▁ ണാ വസ രം ,▁ x x bo s ▁ടെസ്റ്റ ിന് ▁ വേഗ ം ▁കൂട്ട ാന ് ▁എം . സി . 
സി,▁ x x bo s ▁ഓ ള ് ▁ഇംഗ്ലണ്ട് ▁ബാ ഡ് മി ന്റ ണി ല് ▁ശ്രീ ക ാന്ത ും ▁പുറത്ത് ; ▁ഇന്ത്യ ന് ▁പ്രതീക്ഷ കള ് ▁അവസാനിച്ചു\n", 584 | "y: LMLabelList\n", 585 | ",,,,\n", 586 | "Path: .;\n", 587 | "\n", 588 | "Test: LabelList (630 items)\n", 589 | "x: LMTextList\n", 590 | "▁ x x bo s ▁ഇ ഞ്ചു റി ▁ടൈ ം ▁പെ നാ ല് ▁ റ്റി യില ് ▁എഫ് ▁സി ▁പോര ് ▁ ട്ടോ,▁ x x bo s ▁ആ മി ര ് ▁ഖാന്റെ ▁ഏറ്റവും ▁പുതിയ ▁ചിത്രം ▁ലാ ല് ▁സിങ് ▁ഛ ദ്ദ ; ഒ ക്ട ോ ബറി ല് ▁ചിത്രീകരണ മാര ം ഭി ക്കും,▁ x x bo s ▁ഐ ▁പി ▁എല്ല ിന് ▁മു ന് ▁പ ായി ▁ഓ സ് ▁ട്ര േലിയ ന് ▁ടീമ ിനൊപ്പം ▁ചേര ാന ൊരു ങ്ങി ▁സ് മി ത്തും ▁ വാര ് ▁ ണ റും,▁ x x bo s ▁സാമ ് ബ ത്തി ക ▁ജീവിതം ▁സുരക്ഷിത മാ ക്ക ണോ ▁ഈ ▁അഞ്ച് ▁ ശീല ങ്ങള ് ▁നേരത്തേ ▁തുടങ്ങ ൂ . . .,▁ x x bo s ▁എ ല് ▁ഇ ഡി ▁ബ ള ് ▁ബ ുകള ് ▁ലഭ്യമാക്ക ും ; ▁പദ്ധതിയുടെ ▁രജ ിസ് ▁ട്ര േഷന ് ▁ മാര ് ▁ ച്ച് ▁ഒന്ന ിന് ▁ആരംഭിക്ക ും\n", 591 | "y: EmptyLabelList\n", 592 | ",,,,\n", 593 | "Path: ., model=SequentialRNN(\n", 594 | " (0): AWD_LSTM(\n", 595 | " (encoder): Embedding(10000, 400, padding_idx=1)\n", 596 | " (encoder_dp): EmbeddingDropout(\n", 597 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 598 | " )\n", 599 | " (rnns): ModuleList(\n", 600 | " (0): WeightDropout(\n", 601 | " (module): LSTM(400, 1150, batch_first=True)\n", 602 | " )\n", 603 | " (1): WeightDropout(\n", 604 | " (module): LSTM(1150, 1150, batch_first=True)\n", 605 | " )\n", 606 | " (2): WeightDropout(\n", 607 | " (module): LSTM(1150, 400, batch_first=True)\n", 608 | " )\n", 609 | " )\n", 610 | " (input_dp): RNNDropout()\n", 611 | " (hidden_dps): ModuleList(\n", 612 | " (0): RNNDropout()\n", 613 | " (1): RNNDropout()\n", 614 | " (2): RNNDropout()\n", 615 | " )\n", 616 | " )\n", 617 | " (1): LinearDecoder(\n", 618 | " (decoder): Linear(in_features=400, out_features=10000, bias=True)\n", 619 | " (output_dp): RNNDropout()\n", 620 | " )\n", 621 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('.'), 
model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[RNNTrainer\n", 622 | "learn: LanguageLearner(data=TextLMDataBunch;\n", 623 | "\n", 624 | "Train: LabelList (5036 items)\n", 625 | "x: LMTextList\n", 626 | "▁ x x bo s ▁ജോലി യില ് ▁നിന്ന് ▁ഒരു ▁ബ്ര േക്ക് ▁എടുക്ക ുന്നതിനു ▁മു ന് ▁പ ് . .,▁ x x bo s ▁ക മ ് ബോ ള ങ്ങള ് ▁കര ടി യുടെ ▁പിടി യില ്,▁ x x bo s ▁കൊച്ചി ▁മെട്രോ യുടെ ▁ബ്രാ ന് ▁ ഡ് ▁അംബ ാ സി ഡ റായി ▁നട ന് ▁സുരേഷ് ▁ഗോപി യെ ▁നിയമ ിച്ചു,▁ x x bo s ▁ഇന്ധന വില യില ് ▁വീണ്ടും ▁വര ് ▁ ദ്ധ ന വ് , ▁പെട്രോ ള ിന് ▁14 ▁പൈ സ യും ▁ഡീ സ ലി ന് ▁15 ▁പൈ സ യും ▁വര ് ▁ ദ്ധ ിച്ചു,▁ x x bo s ▁ഫെഡറ േഷന ് ▁കപ്പ ▁ ് ▁അത ▁ ് ▁ല റ്റി ക ▁ ് സി ന ▁ ് ▁ഇന്ന ▁ ് ▁തുടക്കം\n", 627 | "y: LMLabelList\n", 628 | ",,,,\n", 629 | "Path: .;\n", 630 | "\n", 631 | "Valid: LabelList (630 items)\n", 632 | "x: LMTextList\n", 633 | "▁ x x bo s ▁ട്രെയിന ് ▁യാത്ര യില ് ▁ഇനി ▁കുല ു ക്കം ▁കുറയ ും , ▁ജെ ര ് ▁ ക്ക ിങ് ▁ഒഴിവാക്ക ുന്നതിനുള്ള ▁നൂതന ▁സാങ്കേതിക ▁വിദ്യ ▁പ്രീ മിയ ം ▁ട്രെയിന ുകളില ്,▁ x x bo s ▁പാ ▁ലാ ▁സെ ▁ന് ▁റ ് ▁തോ ▁മ ▁സ് ▁ചാ ▁മ ് ബ് യ ന് മാ ▁ര ്,▁ x x bo s ▁ഓ സ് ▁ട്ര േലിയ യ് ▁ ക്കെതിരെ ▁ഇനി ▁ധ ോ ണിയ ില്ല ; ▁ലോകകപ്പ ിന് ▁മു മ ് ബ് ▁ഋഷഭ ് ▁പന്ത ിന് ▁സു വര ് ▁ ണാ വസ രം ,▁ x x bo s ▁ടെസ്റ്റ ിന് ▁ വേഗ ം ▁കൂട്ട ാന ് ▁എം . സി . സി,▁ x x bo s ▁ഓ ള ് ▁ഇംഗ്ലണ്ട് ▁ബാ ഡ് മി ന്റ ണി ല് ▁ശ്രീ ക ാന്ത ും ▁പുറത്ത് ; ▁ഇന്ത്യ ന് ▁പ്രതീക്ഷ കള ് ▁അവസാനിച്ചു\n", 634 | "y: LMLabelList\n", 635 | ",,,,\n", 636 | "Path: .;\n", 637 | "\n", 638 | "Test: LabelList (630 items)\n", 639 | "x: LMTextList\n", 640 | "▁ x x bo s ▁ഇ ഞ്ചു റി ▁ടൈ ം ▁പെ നാ ല് ▁ റ്റി യില ് ▁എഫ് ▁സി ▁പോര ് ▁ ട്ടോ,▁ x x bo s ▁ആ മി ര ് ▁ഖാന്റെ ▁ഏറ്റവും ▁പുതിയ ▁ചിത്രം ▁ലാ ല് ▁സിങ് ▁ഛ ദ്ദ ; ഒ ക്ട ോ ബറി ല് ▁ചിത്രീകരണ മാര ം ഭി ക്കും,▁ x x bo s ▁ഐ ▁പി ▁എല്ല ിന് ▁മു ന് ▁പ ായി ▁ഓ സ് ▁ട്ര േലിയ ന് ▁ടീമ ിനൊപ്പം ▁ചേര ാന ൊരു ങ്ങി ▁സ് മി ത്തും ▁ വാര ് ▁ ണ റും,▁ x x bo s ▁സാമ ് ബ ത്തി ക ▁ജീവിതം ▁സുരക്ഷിത മാ ക്ക ണോ ▁ഈ ▁അഞ്ച് ▁ ശീല ങ്ങള ് ▁നേരത്തേ ▁തുടങ്ങ ൂ . . 
.,▁ x x bo s ▁എ ല് ▁ഇ ഡി ▁ബ ള ് ▁ബ ുകള ് ▁ലഭ്യമാക്ക ും ; ▁പദ്ധതിയുടെ ▁രജ ിസ് ▁ട്ര േഷന ് ▁ മാര ് ▁ ച്ച് ▁ഒന്ന ിന് ▁ആരംഭിക്ക ും\n", 641 | "y: EmptyLabelList\n", 642 | ",,,,\n", 643 | "Path: ., model=SequentialRNN(\n", 644 | " (0): AWD_LSTM(\n", 645 | " (encoder): Embedding(10000, 400, padding_idx=1)\n", 646 | " (encoder_dp): EmbeddingDropout(\n", 647 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 648 | " )\n", 649 | " (rnns): ModuleList(\n", 650 | " (0): WeightDropout(\n", 651 | " (module): LSTM(400, 1150, batch_first=True)\n", 652 | " )\n", 653 | " (1): WeightDropout(\n", 654 | " (module): LSTM(1150, 1150, batch_first=True)\n", 655 | " )\n", 656 | " (2): WeightDropout(\n", 657 | " (module): LSTM(1150, 400, batch_first=True)\n", 658 | " )\n", 659 | " )\n", 660 | " (input_dp): RNNDropout()\n", 661 | " (hidden_dps): ModuleList(\n", 662 | " (0): RNNDropout()\n", 663 | " (1): RNNDropout()\n", 664 | " (2): RNNDropout()\n", 665 | " )\n", 666 | " )\n", 667 | " (1): LinearDecoder(\n", 668 | " (decoder): Linear(in_features=400, out_features=10000, bias=True)\n", 669 | " (output_dp): RNNDropout()\n", 670 | " )\n", 671 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('.'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(\n", 672 | " (0): WeightDropout(\n", 673 | " (module): LSTM(400, 1150, batch_first=True)\n", 674 | " )\n", 675 | " (1): RNNDropout()\n", 676 | "), Sequential(\n", 677 | " (0): WeightDropout(\n", 678 | " (module): LSTM(1150, 1150, batch_first=True)\n", 679 | " )\n", 680 | " (1): RNNDropout()\n", 681 | "), Sequential(\n", 682 | " (0): WeightDropout(\n", 683 | " (module): LSTM(1150, 400, batch_first=True)\n", 684 | " )\n", 685 | " (1): RNNDropout()\n", 686 | "), Sequential(\n", 687 | " (0): Embedding(10000, 400, padding_idx=1)\n", 688 | " (1): 
EmbeddingDropout(\n", 689 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 690 | " )\n", 691 | " (2): LinearDecoder(\n", 692 | " (decoder): Linear(in_features=400, out_features=10000, bias=True)\n", 693 | " (output_dp): RNNDropout()\n", 694 | " )\n", 695 | ")], add_time=True, silent=False, cb_fns_registered=False)\n", 696 | "alpha: 2.0\n", 697 | "beta: 1.0], layer_groups=[Sequential(\n", 698 | " (0): WeightDropout(\n", 699 | " (module): LSTM(400, 1150, batch_first=True)\n", 700 | " )\n", 701 | " (1): RNNDropout()\n", 702 | "), Sequential(\n", 703 | " (0): WeightDropout(\n", 704 | " (module): LSTM(1150, 1150, batch_first=True)\n", 705 | " )\n", 706 | " (1): RNNDropout()\n", 707 | "), Sequential(\n", 708 | " (0): WeightDropout(\n", 709 | " (module): LSTM(1150, 400, batch_first=True)\n", 710 | " )\n", 711 | " (1): RNNDropout()\n", 712 | "), Sequential(\n", 713 | " (0): Embedding(10000, 400, padding_idx=1)\n", 714 | " (1): EmbeddingDropout(\n", 715 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 716 | " )\n", 717 | " (2): LinearDecoder(\n", 718 | " (decoder): Linear(in_features=400, out_features=10000, bias=True)\n", 719 | " (output_dp): RNNDropout()\n", 720 | " )\n", 721 | ")], add_time=True, silent=False, cb_fns_registered=False)" 722 | ] 723 | }, 724 | "execution_count": 22, 725 | "metadata": {}, 726 | "output_type": "execute_result" 727 | } 728 | ], 729 | "source": [ 730 | "# Loading the pretrained language model on malyalam wikipedia\n", 731 | "learn.load('../../../models/malayalam/lm/ULMFiT/third_ml_lm', with_opt=True)" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 23, 737 | "metadata": {}, 738 | "outputs": [], 739 | "source": [ 740 | "# Fine tuning the pretrained LM on current dataset" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 24, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "learn.freeze()" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 
25, 755 | "metadata": {}, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "text/html": [ 760 | "\n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | "
epochtrain_lossvalid_lossaccuracytime
05.0460894.4841700.33160700:02
" 780 | ], 781 | "text/plain": [ 782 | "" 783 | ] 784 | }, 785 | "metadata": {}, 786 | "output_type": "display_data" 787 | } 788 | ], 789 | "source": [ 790 | "learn.fit_one_cycle(1, 1e-2)" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 26, 796 | "metadata": {}, 797 | "outputs": [], 798 | "source": [ 799 | "learn.unfreeze()" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": 27, 805 | "metadata": {}, 806 | "outputs": [ 807 | { 808 | "data": { 809 | "text/html": [ 810 | "\n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | "
epochtrain_lossvalid_lossaccuracytime
04.3594474.1087920.36214300:03
14.0481413.7251880.40799100:03
23.7698453.5501220.42696400:03
33.5635103.4808080.43455400:03
43.4359143.4716800.43611600:03
" 858 | ], 859 | "text/plain": [ 860 | "" 861 | ] 862 | }, 863 | "metadata": {}, 864 | "output_type": "display_data" 865 | } 866 | ], 867 | "source": [ 868 | "learn.fit_one_cycle(5, 1e-3)" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": 28, 874 | "metadata": {}, 875 | "outputs": [ 876 | { 877 | "data": { 878 | "text/plain": [ 879 | "'മലയാള ികളായ ▁വിമാന യാത്ര ക്കാര ് ▁ x x bo s ▁ഇരു ▁ദേശീയ രും ▁ഒഴിവാക്ക'" 880 | ] 881 | }, 882 | "execution_count": 28, 883 | "metadata": {}, 884 | "output_type": "execute_result" 885 | } 886 | ], 887 | "source": [ 888 | "learn.predict('മലയാള ികളായ ▁വിമാന യാത്ര ക്കാര',n_words=10)" 889 | ] 890 | }, 891 | { 892 | "cell_type": "code", 893 | "execution_count": 29, 894 | "metadata": {}, 895 | "outputs": [], 896 | "source": [ 897 | "learn.save_encoder('fine_tuned_enc')" 898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": 30, 903 | "metadata": {}, 904 | "outputs": [ 905 | { 906 | "data": { 907 | "text/html": [], 908 | "text/plain": [ 909 | "" 910 | ] 911 | }, 912 | "metadata": {}, 913 | "output_type": "display_data" 914 | }, 915 | { 916 | "data": { 917 | "text/html": [], 918 | "text/plain": [ 919 | "" 920 | ] 921 | }, 922 | "metadata": {}, 923 | "output_type": "display_data" 924 | }, 925 | { 926 | "data": { 927 | "text/html": [], 928 | "text/plain": [ 929 | "" 930 | ] 931 | }, 932 | "metadata": {}, 933 | "output_type": "display_data" 934 | } 935 | ], 936 | "source": [ 937 | "data_clas = TextClasDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=malyalam_vocab, bs=16)" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": 31, 943 | "metadata": {}, 944 | "outputs": [ 945 | { 946 | "data": { 947 | "text/html": [ 948 | "\n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " 
\n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | "
texttarget
▁ x x bo s ▁ശ ▁ബ ▁ രി ▁മ ▁ല ▁വി ▁ക ▁സ ▁ന ത്തിനായി ▁സ ▁ര ് ▁ ക്കാ ▁ര ് ▁നി ▁യ ▁ന് ത്ര ▁ ണ ▁ ത്തി ▁ല ് ▁പ്ര ▁ ത് യേ ▁ക ▁ക ▁മ ് ബ ▁നി ▁ര ൂ ▁പീ ▁ക ▁ രി ▁ ക്കാ ▁ന് ▁തീ ▁ രു ▁മാ ▁ന ം ; ▁നടപടി ▁തീ ▁ x x re p ▁5 ▁ര ് ▁business
▁ x x bo s ▁സ്വ ന്ത ക്കാര നെ ▁ഗവ ര ് ▁ ണ റ ാക്കിയ ത് ▁വെ റു തേ യായ ില്ല ; ▁വര ് ▁ഷ ം ▁അവസാന ി ക്കാന ് ▁കാത്തിരിക്ക ാതെ ▁28 ,000 ▁കോടി ▁കേന്ദ്ര ▁സര ് ▁ ക്കാര ിന് ▁ഇട ക്കാല ▁ലാഭ വി ഹിത മായി ▁ന ല് ▁കി ▁റിസ ര ് ▁വ ് ▁ബാങ്ക് ; ▁ആഗ സ്റ്റ ില ് ▁4 0,000 ▁കോടി ▁ന ല് ▁ക ിയ തിന്business
▁ x x bo s ▁വെള്ള യും ▁സി ല് ▁വ റും ▁കല ര ് ▁ ന്ന ▁നിറ മുള്ള ▁ല ഹ ങ്ക ▁ചോള ിയ ണി ഞ്ഞ് ▁സ യേ ഷ യ െത്തിയ പ്പോ ള ് ▁അതേ ▁നിറത്തിലുള്ള ▁പൈ ജാ മ യും ▁ജാക്കറ ് റും ▁ധരിച്ച ് ▁ആര്യ യും ; ▁ഹൈദരാബാദ ിലെ ▁താ ജ് ▁ഫലക ് ▁നൂ മ ▁പാല സി ല് ▁സംഗീത ് ▁ചടങ്ങ ു കളോടെ ▁നട ന് ▁ആര്യ - ▁സ യേ ഷentertainment
▁ x x bo s ▁സഹകരണ ▁സംഘ ങ്ങളുടെ ▁പേര ിനൊപ്പം ▁' ബാങ്ക ് ' ▁എന്ന് ▁ചേര ് ▁ ത്ത ിട്ടു ണ്ട െ ങ്ക ില ് ▁നിക്ഷേപ ങ്ങള ് ▁ ക്ക് ▁നികുതി ▁ന ല് ▁ക ണമെന്ന് ▁ആ ദ ായ ▁നി ക തി ▁വകുപ്പ ് ; ▁നിക്ഷേപ ത്തിന്റെ ▁പലിശ യില ് ▁നിന്നും ▁നികുതി ▁ഈ ടാ ക്കാന ് ▁ജില്ലാ ▁സഹകരണ ▁ബാങ്ക ുകള ് ▁ ക്ക് ▁നിര ് ▁ദ് ദേശം ; ▁ലൈ സ ന്business
▁ x x bo s ▁സോ ഷ്യ ല് ▁മീഡിയ യിലെ ▁തെരഞ്ഞെടുപ്പ ് ▁പ്രചരണ ങ്ങള ് ▁പെരു മാറ്റ ച്ച ട്ട ം ▁ല ം ഘ ിക്കുന്ന ില്ലെന്ന് ▁ഉറപ്പുവരുത്ത ാന ് ▁ മാര ് ▁ഗ നിര ് ▁ദ് ദേശ ങ്ങളുമായി ▁തെരഞ്ഞെടുപ്പ ് ▁കമ്മീഷന ് ▁ ; ▁രാഷ്ട്രീയ ▁പരസ്യ ങ്ങളും ▁പ്രചരണ ങ്ങളും ▁സോ ഷ്യ ല് ▁മീഡിയ യില ് ▁പോ സ്റ്റ് ▁ചെയ്യുന്നതിന ് ▁മു ന് ▁കൂ ര ് ▁അനുമതി ▁വാങ്ങ ണംbusiness
" 978 | ], 979 | "text/plain": [ 980 | "" 981 | ] 982 | }, 983 | "metadata": {}, 984 | "output_type": "display_data" 985 | } 986 | ], 987 | "source": [ 988 | "data_clas.show_batch()" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": 32, 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [ 997 | "del awd_lstm_config['tie_weights']\n", 998 | "del awd_lstm_config['out_bias']" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "execution_count": 33, 1004 | "metadata": {}, 1005 | "outputs": [], 1006 | "source": [ 1007 | "learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.5, config=awd_lstm_config)" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": 34, 1013 | "metadata": { 1014 | "scrolled": true 1015 | }, 1016 | "outputs": [ 1017 | { 1018 | "data": { 1019 | "text/plain": [ 1020 | "RNNLearner(data=TextClasDataBunch;\n", 1021 | "\n", 1022 | "Train: LabelList (5036 items)\n", 1023 | "x: TextList\n", 1024 | "▁ x x bo s ▁ജോലി യില ് ▁നിന്ന് ▁ഒരു ▁ബ്ര േക്ക് ▁എടുക്ക ുന്നതിനു ▁മു ന് ▁പ ് . 
.,▁ x x bo s ▁ക മ ് ബോ ള ങ്ങള ് ▁കര ടി യുടെ ▁പിടി യില ്,▁ x x bo s ▁കൊച്ചി ▁മെട്രോ യുടെ ▁ബ്രാ ന് ▁ ഡ് ▁അംബ ാ സി ഡ റായി ▁നട ന് ▁സുരേഷ് ▁ഗോപി യെ ▁നിയമ ിച്ചു,▁ x x bo s ▁ഇന്ധന വില യില ് ▁വീണ്ടും ▁വര ് ▁ ദ്ധ ന വ് , ▁പെട്രോ ള ിന് ▁14 ▁പൈ സ യും ▁ഡീ സ ലി ന് ▁15 ▁പൈ സ യും ▁വര ് ▁ ദ്ധ ിച്ചു,▁ x x bo s ▁ഫെഡറ േഷന ് ▁കപ്പ ▁ ് ▁അത ▁ ് ▁ല റ്റി ക ▁ ് സി ന ▁ ് ▁ഇന്ന ▁ ് ▁തുടക്കം\n", 1025 | "y: CategoryList\n", 1026 | "business,business,business,business,sports\n", 1027 | "Path: .;\n", 1028 | "\n", 1029 | "Valid: LabelList (630 items)\n", 1030 | "x: TextList\n", 1031 | "▁ x x bo s ▁ട്രെയിന ് ▁യാത്ര യില ് ▁ഇനി ▁കുല ു ക്കം ▁കുറയ ും , ▁ജെ ര ് ▁ ക്ക ിങ് ▁ഒഴിവാക്ക ുന്നതിനുള്ള ▁നൂതന ▁സാങ്കേതിക ▁വിദ്യ ▁പ്രീ മിയ ം ▁ട്രെയിന ുകളില ്,▁ x x bo s ▁പാ ▁ലാ ▁സെ ▁ന് ▁റ ് ▁തോ ▁മ ▁സ് ▁ചാ ▁മ ് ബ് യ ന് മാ ▁ര ്,▁ x x bo s ▁ഓ സ് ▁ട്ര േലിയ യ് ▁ ക്കെതിരെ ▁ഇനി ▁ധ ോ ണിയ ില്ല ; ▁ലോകകപ്പ ിന് ▁മു മ ് ബ് ▁ഋഷഭ ് ▁പന്ത ിന് ▁സു വര ് ▁ ണാ വസ രം ,▁ x x bo s ▁ടെസ്റ്റ ിന് ▁ വേഗ ം ▁കൂട്ട ാന ് ▁എം . സി . സി,▁ x x bo s ▁ഓ ള ് ▁ഇംഗ്ലണ്ട് ▁ബാ ഡ് മി ന്റ ണി ല് ▁ശ്രീ ക ാന്ത ും ▁പുറത്ത് ; ▁ഇന്ത്യ ന് ▁പ്രതീക്ഷ കള ് ▁അവസാനിച്ചു\n", 1032 | "y: CategoryList\n", 1033 | "business,sports,sports,sports,sports\n", 1034 | "Path: .;\n", 1035 | "\n", 1036 | "Test: LabelList (630 items)\n", 1037 | "x: TextList\n", 1038 | "▁ x x bo s ▁ഇ ഞ്ചു റി ▁ടൈ ം ▁പെ നാ ല് ▁ റ്റി യില ് ▁എഫ് ▁സി ▁പോര ് ▁ ട്ടോ,▁ x x bo s ▁ആ മി ര ് ▁ഖാന്റെ ▁ഏറ്റവും ▁പുതിയ ▁ചിത്രം ▁ലാ ല് ▁സിങ് ▁ഛ ദ്ദ ; ഒ ക്ട ോ ബറി ല് ▁ചിത്രീകരണ മാര ം ഭി ക്കും,▁ x x bo s ▁ഐ ▁പി ▁എല്ല ിന് ▁മു ന് ▁പ ായി ▁ഓ സ് ▁ട്ര േലിയ ന് ▁ടീമ ിനൊപ്പം ▁ചേര ാന ൊരു ങ്ങി ▁സ് മി ത്തും ▁ വാര ് ▁ ണ റും,▁ x x bo s ▁സാമ ് ബ ത്തി ക ▁ജീവിതം ▁സുരക്ഷിത മാ ക്ക ണോ ▁ഈ ▁അഞ്ച് ▁ ശീല ങ്ങള ് ▁നേരത്തേ ▁തുടങ്ങ ൂ . . 
.,▁ x x bo s ▁എ ല് ▁ഇ ഡി ▁ബ ള ് ▁ബ ുകള ് ▁ലഭ്യമാക്ക ും ; ▁പദ്ധതിയുടെ ▁രജ ിസ് ▁ട്ര േഷന ് ▁ മാര ് ▁ ച്ച് ▁ഒന്ന ിന് ▁ആരംഭിക്ക ും\n", 1039 | "y: EmptyLabelList\n", 1040 | ",,,,\n", 1041 | "Path: ., model=SequentialRNN(\n", 1042 | " (0): MultiBatchEncoder(\n", 1043 | " (module): AWD_LSTM(\n", 1044 | " (encoder): Embedding(10000, 400, padding_idx=1)\n", 1045 | " (encoder_dp): EmbeddingDropout(\n", 1046 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 1047 | " )\n", 1048 | " (rnns): ModuleList(\n", 1049 | " (0): WeightDropout(\n", 1050 | " (module): LSTM(400, 1150, batch_first=True)\n", 1051 | " )\n", 1052 | " (1): WeightDropout(\n", 1053 | " (module): LSTM(1150, 1150, batch_first=True)\n", 1054 | " )\n", 1055 | " (2): WeightDropout(\n", 1056 | " (module): LSTM(1150, 400, batch_first=True)\n", 1057 | " )\n", 1058 | " )\n", 1059 | " (input_dp): RNNDropout()\n", 1060 | " (hidden_dps): ModuleList(\n", 1061 | " (0): RNNDropout()\n", 1062 | " (1): RNNDropout()\n", 1063 | " (2): RNNDropout()\n", 1064 | " )\n", 1065 | " )\n", 1066 | " )\n", 1067 | " (1): PoolingLinearClassifier(\n", 1068 | " (layers): Sequential(\n", 1069 | " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1070 | " (1): Dropout(p=0.05)\n", 1071 | " (2): Linear(in_features=1200, out_features=50, bias=True)\n", 1072 | " (3): ReLU(inplace)\n", 1073 | " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1074 | " (5): Dropout(p=0.1)\n", 1075 | " (6): Linear(in_features=50, out_features=3, bias=True)\n", 1076 | " )\n", 1077 | " )\n", 1078 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('.'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[RNNTrainer\n", 1079 | "learn: RNNLearner(data=TextClasDataBunch;\n", 1080 | "\n", 1081 | "Train: LabelList (5036 items)\n", 
1082 | "x: TextList\n", 1083 | "▁ x x bo s ▁ജോലി യില ് ▁നിന്ന് ▁ഒരു ▁ബ്ര േക്ക് ▁എടുക്ക ുന്നതിനു ▁മു ന് ▁പ ് . .,▁ x x bo s ▁ക മ ് ബോ ള ങ്ങള ് ▁കര ടി യുടെ ▁പിടി യില ്,▁ x x bo s ▁കൊച്ചി ▁മെട്രോ യുടെ ▁ബ്രാ ന് ▁ ഡ് ▁അംബ ാ സി ഡ റായി ▁നട ന് ▁സുരേഷ് ▁ഗോപി യെ ▁നിയമ ിച്ചു,▁ x x bo s ▁ഇന്ധന വില യില ് ▁വീണ്ടും ▁വര ് ▁ ദ്ധ ന വ് , ▁പെട്രോ ള ിന് ▁14 ▁പൈ സ യും ▁ഡീ സ ലി ന് ▁15 ▁പൈ സ യും ▁വര ് ▁ ദ്ധ ിച്ചു,▁ x x bo s ▁ഫെഡറ േഷന ് ▁കപ്പ ▁ ് ▁അത ▁ ് ▁ല റ്റി ക ▁ ് സി ന ▁ ് ▁ഇന്ന ▁ ് ▁തുടക്കം\n", 1084 | "y: CategoryList\n", 1085 | "business,business,business,business,sports\n", 1086 | "Path: .;\n", 1087 | "\n", 1088 | "Valid: LabelList (630 items)\n", 1089 | "x: TextList\n", 1090 | "▁ x x bo s ▁ട്രെയിന ് ▁യാത്ര യില ് ▁ഇനി ▁കുല ു ക്കം ▁കുറയ ും , ▁ജെ ര ് ▁ ക്ക ിങ് ▁ഒഴിവാക്ക ുന്നതിനുള്ള ▁നൂതന ▁സാങ്കേതിക ▁വിദ്യ ▁പ്രീ മിയ ം ▁ട്രെയിന ുകളില ്,▁ x x bo s ▁പാ ▁ലാ ▁സെ ▁ന് ▁റ ് ▁തോ ▁മ ▁സ് ▁ചാ ▁മ ് ബ് യ ന് മാ ▁ര ്,▁ x x bo s ▁ഓ സ് ▁ട്ര േലിയ യ് ▁ ക്കെതിരെ ▁ഇനി ▁ധ ോ ണിയ ില്ല ; ▁ലോകകപ്പ ിന് ▁മു മ ് ബ് ▁ഋഷഭ ് ▁പന്ത ിന് ▁സു വര ് ▁ ണാ വസ രം ,▁ x x bo s ▁ടെസ്റ്റ ിന് ▁ വേഗ ം ▁കൂട്ട ാന ് ▁എം . സി . സി,▁ x x bo s ▁ഓ ള ് ▁ഇംഗ്ലണ്ട് ▁ബാ ഡ് മി ന്റ ണി ല് ▁ശ്രീ ക ാന്ത ും ▁പുറത്ത് ; ▁ഇന്ത്യ ന് ▁പ്രതീക്ഷ കള ് ▁അവസാനിച്ചു\n", 1091 | "y: CategoryList\n", 1092 | "business,sports,sports,sports,sports\n", 1093 | "Path: .;\n", 1094 | "\n", 1095 | "Test: LabelList (630 items)\n", 1096 | "x: TextList\n", 1097 | "▁ x x bo s ▁ഇ ഞ്ചു റി ▁ടൈ ം ▁പെ നാ ല് ▁ റ്റി യില ് ▁എഫ് ▁സി ▁പോര ് ▁ ട്ടോ,▁ x x bo s ▁ആ മി ര ് ▁ഖാന്റെ ▁ഏറ്റവും ▁പുതിയ ▁ചിത്രം ▁ലാ ല് ▁സിങ് ▁ഛ ദ്ദ ; ഒ ക്ട ോ ബറി ല് ▁ചിത്രീകരണ മാര ം ഭി ക്കും,▁ x x bo s ▁ഐ ▁പി ▁എല്ല ിന് ▁മു ന് ▁പ ായി ▁ഓ സ് ▁ട്ര േലിയ ന് ▁ടീമ ിനൊപ്പം ▁ചേര ാന ൊരു ങ്ങി ▁സ് മി ത്തും ▁ വാര ് ▁ ണ റും,▁ x x bo s ▁സാമ ് ബ ത്തി ക ▁ജീവിതം ▁സുരക്ഷിത മാ ക്ക ണോ ▁ഈ ▁അഞ്ച് ▁ ശീല ങ്ങള ് ▁നേരത്തേ ▁തുടങ്ങ ൂ . . 
.,▁ x x bo s ▁എ ല് ▁ഇ ഡി ▁ബ ള ് ▁ബ ുകള ് ▁ലഭ്യമാക്ക ും ; ▁പദ്ധതിയുടെ ▁രജ ിസ് ▁ട്ര േഷന ് ▁ മാര ് ▁ ച്ച് ▁ഒന്ന ിന് ▁ആരംഭിക്ക ും\n", 1098 | "y: EmptyLabelList\n", 1099 | ",,,,\n", 1100 | "Path: ., model=SequentialRNN(\n", 1101 | " (0): MultiBatchEncoder(\n", 1102 | " (module): AWD_LSTM(\n", 1103 | " (encoder): Embedding(10000, 400, padding_idx=1)\n", 1104 | " (encoder_dp): EmbeddingDropout(\n", 1105 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 1106 | " )\n", 1107 | " (rnns): ModuleList(\n", 1108 | " (0): WeightDropout(\n", 1109 | " (module): LSTM(400, 1150, batch_first=True)\n", 1110 | " )\n", 1111 | " (1): WeightDropout(\n", 1112 | " (module): LSTM(1150, 1150, batch_first=True)\n", 1113 | " )\n", 1114 | " (2): WeightDropout(\n", 1115 | " (module): LSTM(1150, 400, batch_first=True)\n", 1116 | " )\n", 1117 | " )\n", 1118 | " (input_dp): RNNDropout()\n", 1119 | " (hidden_dps): ModuleList(\n", 1120 | " (0): RNNDropout()\n", 1121 | " (1): RNNDropout()\n", 1122 | " (2): RNNDropout()\n", 1123 | " )\n", 1124 | " )\n", 1125 | " )\n", 1126 | " (1): PoolingLinearClassifier(\n", 1127 | " (layers): Sequential(\n", 1128 | " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1129 | " (1): Dropout(p=0.05)\n", 1130 | " (2): Linear(in_features=1200, out_features=50, bias=True)\n", 1131 | " (3): ReLU(inplace)\n", 1132 | " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1133 | " (5): Dropout(p=0.1)\n", 1134 | " (6): Linear(in_features=50, out_features=3, bias=True)\n", 1135 | " )\n", 1136 | " )\n", 1137 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('.'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(\n", 1138 | " (0): Embedding(10000, 400, padding_idx=1)\n", 1139 | " (1): 
EmbeddingDropout(\n", 1140 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 1141 | " )\n", 1142 | "), Sequential(\n", 1143 | " (0): WeightDropout(\n", 1144 | " (module): LSTM(400, 1150, batch_first=True)\n", 1145 | " )\n", 1146 | " (1): RNNDropout()\n", 1147 | "), Sequential(\n", 1148 | " (0): WeightDropout(\n", 1149 | " (module): LSTM(1150, 1150, batch_first=True)\n", 1150 | " )\n", 1151 | " (1): RNNDropout()\n", 1152 | "), Sequential(\n", 1153 | " (0): WeightDropout(\n", 1154 | " (module): LSTM(1150, 400, batch_first=True)\n", 1155 | " )\n", 1156 | " (1): RNNDropout()\n", 1157 | "), Sequential(\n", 1158 | " (0): PoolingLinearClassifier(\n", 1159 | " (layers): Sequential(\n", 1160 | " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1161 | " (1): Dropout(p=0.05)\n", 1162 | " (2): Linear(in_features=1200, out_features=50, bias=True)\n", 1163 | " (3): ReLU(inplace)\n", 1164 | " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1165 | " (5): Dropout(p=0.1)\n", 1166 | " (6): Linear(in_features=50, out_features=3, bias=True)\n", 1167 | " )\n", 1168 | " )\n", 1169 | ")], add_time=True, silent=False, cb_fns_registered=False)\n", 1170 | "alpha: 2.0\n", 1171 | "beta: 1.0], layer_groups=[Sequential(\n", 1172 | " (0): Embedding(10000, 400, padding_idx=1)\n", 1173 | " (1): EmbeddingDropout(\n", 1174 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 1175 | " )\n", 1176 | "), Sequential(\n", 1177 | " (0): WeightDropout(\n", 1178 | " (module): LSTM(400, 1150, batch_first=True)\n", 1179 | " )\n", 1180 | " (1): RNNDropout()\n", 1181 | "), Sequential(\n", 1182 | " (0): WeightDropout(\n", 1183 | " (module): LSTM(1150, 1150, batch_first=True)\n", 1184 | " )\n", 1185 | " (1): RNNDropout()\n", 1186 | "), Sequential(\n", 1187 | " (0): WeightDropout(\n", 1188 | " (module): LSTM(1150, 400, batch_first=True)\n", 1189 | " )\n", 1190 | " (1): RNNDropout()\n", 1191 | "), Sequential(\n", 1192 | " (0): 
PoolingLinearClassifier(\n", 1193 | " (layers): Sequential(\n", 1194 | " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1195 | " (1): Dropout(p=0.05)\n", 1196 | " (2): Linear(in_features=1200, out_features=50, bias=True)\n", 1197 | " (3): ReLU(inplace)\n", 1198 | " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1199 | " (5): Dropout(p=0.1)\n", 1200 | " (6): Linear(in_features=50, out_features=3, bias=True)\n", 1201 | " )\n", 1202 | " )\n", 1203 | ")], add_time=True, silent=False, cb_fns_registered=False)" 1204 | ] 1205 | }, 1206 | "execution_count": 34, 1207 | "metadata": {}, 1208 | "output_type": "execute_result" 1209 | } 1210 | ], 1211 | "source": [ 1212 | "learn.load_encoder('fine_tuned_enc')" 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "code", 1217 | "execution_count": 35, 1218 | "metadata": {}, 1219 | "outputs": [], 1220 | "source": [ 1221 | "learn.freeze()" 1222 | ] 1223 | }, 1224 | { 1225 | "cell_type": "code", 1226 | "execution_count": 36, 1227 | "metadata": {}, 1228 | "outputs": [ 1229 | { 1230 | "data": { 1231 | "text/plain": [ 1232 | "CrossEntropyLoss()" 1233 | ] 1234 | }, 1235 | "execution_count": 36, 1236 | "metadata": {}, 1237 | "output_type": "execute_result" 1238 | } 1239 | ], 1240 | "source": [ 1241 | "learn.loss_func.func" 1242 | ] 1243 | }, 1244 | { 1245 | "cell_type": "code", 1246 | "execution_count": 37, 1247 | "metadata": {}, 1248 | "outputs": [], 1249 | "source": [ 1250 | "mcc = MatthewsCorreff()" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "code", 1255 | "execution_count": 38, 1256 | "metadata": {}, 1257 | "outputs": [], 1258 | "source": [ 1259 | "learn.metrics = [mcc, accuracy]" 1260 | ] 1261 | }, 1262 | { 1263 | "cell_type": "code", 1264 | "execution_count": 39, 1265 | "metadata": {}, 1266 | "outputs": [ 1267 | { 1268 | "data": { 1269 | "text/html": [ 1270 | "\n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 
1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | "
epochtrain_lossvalid_lossmatthews_correffaccuracytime
00.5257150.4368060.7292110.81746000:05
" 1292 | ], 1293 | "text/plain": [ 1294 | "" 1295 | ] 1296 | }, 1297 | "metadata": {}, 1298 | "output_type": "display_data" 1299 | } 1300 | ], 1301 | "source": [ 1302 | "learn.fit_one_cycle(1, 1e-2)" 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "code", 1307 | "execution_count": 40, 1308 | "metadata": {}, 1309 | "outputs": [ 1310 | { 1311 | "data": { 1312 | "text/html": [ 1313 | "\n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | "
epochtrain_lossvalid_lossmatthews_correffaccuracytime
00.3358730.2146800.8956720.93015900:06
" 1335 | ], 1336 | "text/plain": [ 1337 | "" 1338 | ] 1339 | }, 1340 | "metadata": {}, 1341 | "output_type": "display_data" 1342 | } 1343 | ], 1344 | "source": [ 1345 | "learn.freeze_to(-2)\n", 1346 | "learn.fit_one_cycle(1, 1e-2)" 1347 | ] 1348 | }, 1349 | { 1350 | "cell_type": "code", 1351 | "execution_count": 41, 1352 | "metadata": {}, 1353 | "outputs": [], 1354 | "source": [ 1355 | "learn.save('second-full')" 1356 | ] 1357 | }, 1358 | { 1359 | "cell_type": "code", 1360 | "execution_count": 42, 1361 | "metadata": {}, 1362 | "outputs": [ 1363 | { 1364 | "data": { 1365 | "text/html": [ 1366 | "\n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | "
epochtrain_lossvalid_lossmatthews_correffaccuracytime
00.1851380.1830640.9142770.94285700:12
10.1294240.1918660.9288060.95238100:13
20.1017390.2217870.9239800.94920600:12
30.0713990.2256530.9170860.94444400:13
40.0482540.2170630.9287380.95238100:13
" 1420 | ], 1421 | "text/plain": [ 1422 | "" 1423 | ] 1424 | }, 1425 | "metadata": {}, 1426 | "output_type": "display_data" 1427 | }, 1428 | { 1429 | "name": "stdout", 1430 | "output_type": "stream", 1431 | "text": [ 1432 | "Better model found at epoch 0 with accuracy value: 0.9428571462631226.\n", 1433 | "Better model found at epoch 1 with accuracy value: 0.9523809552192688.\n" 1434 | ] 1435 | } 1436 | ], 1437 | "source": [ 1438 | "learn.unfreeze()\n", 1439 | "learn.fit_one_cycle(5, 1e-3, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='final')])" 1440 | ] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "execution_count": 43, 1445 | "metadata": {}, 1446 | "outputs": [ 1447 | { 1448 | "data": { 1449 | "text/plain": [ 1450 | "RNNLearner(data=TextClasDataBunch;\n", 1451 | "\n", 1452 | "Train: LabelList (5036 items)\n", 1453 | "x: TextList\n", 1454 | "▁ x x bo s ▁ജോലി യില ് ▁നിന്ന് ▁ഒരു ▁ബ്ര േക്ക് ▁എടുക്ക ുന്നതിനു ▁മു ന് ▁പ ് . .,▁ x x bo s ▁ക മ ് ബോ ള ങ്ങള ് ▁കര ടി യുടെ ▁പിടി യില ്,▁ x x bo s ▁കൊച്ചി ▁മെട്രോ യുടെ ▁ബ്രാ ന് ▁ ഡ് ▁അംബ ാ സി ഡ റായി ▁നട ന് ▁സുരേഷ് ▁ഗോപി യെ ▁നിയമ ിച്ചു,▁ x x bo s ▁ഇന്ധന വില യില ് ▁വീണ്ടും ▁വര ് ▁ ദ്ധ ന വ് , ▁പെട്രോ ള ിന് ▁14 ▁പൈ സ യും ▁ഡീ സ ലി ന് ▁15 ▁പൈ സ യും ▁വര ് ▁ ദ്ധ ിച്ചു,▁ x x bo s ▁ഫെഡറ േഷന ് ▁കപ്പ ▁ ് ▁അത ▁ ് ▁ല റ്റി ക ▁ ് സി ന ▁ ് ▁ഇന്ന ▁ ് ▁തുടക്കം\n", 1455 | "y: CategoryList\n", 1456 | "business,business,business,business,sports\n", 1457 | "Path: .;\n", 1458 | "\n", 1459 | "Valid: LabelList (630 items)\n", 1460 | "x: TextList\n", 1461 | "▁ x x bo s ▁ട്രെയിന ് ▁യാത്ര യില ് ▁ഇനി ▁കുല ു ക്കം ▁കുറയ ും , ▁ജെ ര ് ▁ ക്ക ിങ് ▁ഒഴിവാക്ക ുന്നതിനുള്ള ▁നൂതന ▁സാങ്കേതിക ▁വിദ്യ ▁പ്രീ മിയ ം ▁ട്രെയിന ുകളില ്,▁ x x bo s ▁പാ ▁ലാ ▁സെ ▁ന് ▁റ ് ▁തോ ▁മ ▁സ് ▁ചാ ▁മ ് ബ് യ ന് മാ ▁ര ്,▁ x x bo s ▁ഓ സ് ▁ട്ര േലിയ യ് ▁ ക്കെതിരെ ▁ഇനി ▁ധ ോ ണിയ ില്ല ; ▁ലോകകപ്പ ിന് ▁മു മ ് ബ് ▁ഋഷഭ ് ▁പന്ത ിന് ▁സു വര ് ▁ ണാ വസ രം ,▁ x x bo s ▁ടെസ്റ്റ ിന് ▁ വേഗ ം ▁കൂട്ട ാന ് ▁എം . സി . 
സി,▁ x x bo s ▁ഓ ള ് ▁ഇംഗ്ലണ്ട് ▁ബാ ഡ് മി ന്റ ണി ല് ▁ശ്രീ ക ാന്ത ും ▁പുറത്ത് ; ▁ഇന്ത്യ ന് ▁പ്രതീക്ഷ കള ് ▁അവസാനിച്ചു\n", 1462 | "y: CategoryList\n", 1463 | "business,sports,sports,sports,sports\n", 1464 | "Path: .;\n", 1465 | "\n", 1466 | "Test: LabelList (630 items)\n", 1467 | "x: TextList\n", 1468 | "▁ x x bo s ▁ഇ ഞ്ചു റി ▁ടൈ ം ▁പെ നാ ല് ▁ റ്റി യില ് ▁എഫ് ▁സി ▁പോര ് ▁ ട്ടോ,▁ x x bo s ▁ആ മി ര ് ▁ഖാന്റെ ▁ഏറ്റവും ▁പുതിയ ▁ചിത്രം ▁ലാ ല് ▁സിങ് ▁ഛ ദ്ദ ; ഒ ക്ട ോ ബറി ല് ▁ചിത്രീകരണ മാര ം ഭി ക്കും,▁ x x bo s ▁ഐ ▁പി ▁എല്ല ിന് ▁മു ന് ▁പ ായി ▁ഓ സ് ▁ട്ര േലിയ ന് ▁ടീമ ിനൊപ്പം ▁ചേര ാന ൊരു ങ്ങി ▁സ് മി ത്തും ▁ വാര ് ▁ ണ റും,▁ x x bo s ▁സാമ ് ബ ത്തി ക ▁ജീവിതം ▁സുരക്ഷിത മാ ക്ക ണോ ▁ഈ ▁അഞ്ച് ▁ ശീല ങ്ങള ് ▁നേരത്തേ ▁തുടങ്ങ ൂ . . .,▁ x x bo s ▁എ ല് ▁ഇ ഡി ▁ബ ള ് ▁ബ ുകള ് ▁ലഭ്യമാക്ക ും ; ▁പദ്ധതിയുടെ ▁രജ ിസ് ▁ട്ര േഷന ് ▁ മാര ് ▁ ച്ച് ▁ഒന്ന ിന് ▁ആരംഭിക്ക ും\n", 1469 | "y: EmptyLabelList\n", 1470 | ",,,,\n", 1471 | "Path: ., model=SequentialRNN(\n", 1472 | " (0): MultiBatchEncoder(\n", 1473 | " (module): AWD_LSTM(\n", 1474 | " (encoder): Embedding(10000, 400, padding_idx=1)\n", 1475 | " (encoder_dp): EmbeddingDropout(\n", 1476 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 1477 | " )\n", 1478 | " (rnns): ModuleList(\n", 1479 | " (0): WeightDropout(\n", 1480 | " (module): LSTM(400, 1150, batch_first=True)\n", 1481 | " )\n", 1482 | " (1): WeightDropout(\n", 1483 | " (module): LSTM(1150, 1150, batch_first=True)\n", 1484 | " )\n", 1485 | " (2): WeightDropout(\n", 1486 | " (module): LSTM(1150, 400, batch_first=True)\n", 1487 | " )\n", 1488 | " )\n", 1489 | " (input_dp): RNNDropout()\n", 1490 | " (hidden_dps): ModuleList(\n", 1491 | " (0): RNNDropout()\n", 1492 | " (1): RNNDropout()\n", 1493 | " (2): RNNDropout()\n", 1494 | " )\n", 1495 | " )\n", 1496 | " )\n", 1497 | " (1): PoolingLinearClassifier(\n", 1498 | " (layers): Sequential(\n", 1499 | " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1500 | " (1): Dropout(p=0.05)\n", 1501 | " (2): 
Linear(in_features=1200, out_features=50, bias=True)\n", 1502 | " (3): ReLU(inplace)\n", 1503 | " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1504 | " (5): Dropout(p=0.1)\n", 1505 | " (6): Linear(in_features=50, out_features=3, bias=True)\n", 1506 | " )\n", 1507 | " )\n", 1508 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[MatthewsCorreff(), ], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('.'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[RNNTrainer\n", 1509 | "learn: RNNLearner(data=TextClasDataBunch;\n", 1510 | "\n", 1511 | "Train: LabelList (5036 items)\n", 1512 | "x: TextList\n", 1513 | "▁ x x bo s ▁ജോലി യില ് ▁നിന്ന് ▁ഒരു ▁ബ്ര േക്ക് ▁എടുക്ക ുന്നതിനു ▁മു ന് ▁പ ് . .,▁ x x bo s ▁ക മ ് ബോ ള ങ്ങള ് ▁കര ടി യുടെ ▁പിടി യില ്,▁ x x bo s ▁കൊച്ചി ▁മെട്രോ യുടെ ▁ബ്രാ ന് ▁ ഡ് ▁അംബ ാ സി ഡ റായി ▁നട ന് ▁സുരേഷ് ▁ഗോപി യെ ▁നിയമ ിച്ചു,▁ x x bo s ▁ഇന്ധന വില യില ് ▁വീണ്ടും ▁വര ് ▁ ദ്ധ ന വ് , ▁പെട്രോ ള ിന് ▁14 ▁പൈ സ യും ▁ഡീ സ ലി ന് ▁15 ▁പൈ സ യും ▁വര ് ▁ ദ്ധ ിച്ചു,▁ x x bo s ▁ഫെഡറ േഷന ് ▁കപ്പ ▁ ് ▁അത ▁ ് ▁ല റ്റി ക ▁ ് സി ന ▁ ് ▁ഇന്ന ▁ ് ▁തുടക്കം\n", 1514 | "y: CategoryList\n", 1515 | "business,business,business,business,sports\n", 1516 | "Path: .;\n", 1517 | "\n", 1518 | "Valid: LabelList (630 items)\n", 1519 | "x: TextList\n", 1520 | "▁ x x bo s ▁ട്രെയിന ് ▁യാത്ര യില ് ▁ഇനി ▁കുല ു ക്കം ▁കുറയ ും , ▁ജെ ര ് ▁ ക്ക ിങ് ▁ഒഴിവാക്ക ുന്നതിനുള്ള ▁നൂതന ▁സാങ്കേതിക ▁വിദ്യ ▁പ്രീ മിയ ം ▁ട്രെയിന ുകളില ്,▁ x x bo s ▁പാ ▁ലാ ▁സെ ▁ന് ▁റ ് ▁തോ ▁മ ▁സ് ▁ചാ ▁മ ് ബ് യ ന് മാ ▁ര ്,▁ x x bo s ▁ഓ സ് ▁ട്ര േലിയ യ് ▁ ക്കെതിരെ ▁ഇനി ▁ധ ോ ണിയ ില്ല ; ▁ലോകകപ്പ ിന് ▁മു മ ് ബ് ▁ഋഷഭ ് ▁പന്ത ിന് ▁സു വര ് ▁ ണാ വസ രം ,▁ x x bo s ▁ടെസ്റ്റ ിന് ▁ വേഗ ം ▁കൂട്ട ാന ് ▁എം . സി . 
സി,▁ x x bo s ▁ഓ ള ് ▁ഇംഗ്ലണ്ട് ▁ബാ ഡ് മി ന്റ ണി ല് ▁ശ്രീ ക ാന്ത ും ▁പുറത്ത് ; ▁ഇന്ത്യ ന് ▁പ്രതീക്ഷ കള ് ▁അവസാനിച്ചു\n", 1521 | "y: CategoryList\n", 1522 | "business,sports,sports,sports,sports\n", 1523 | "Path: .;\n", 1524 | "\n", 1525 | "Test: LabelList (630 items)\n", 1526 | "x: TextList\n", 1527 | "▁ x x bo s ▁ഇ ഞ്ചു റി ▁ടൈ ം ▁പെ നാ ല് ▁ റ്റി യില ് ▁എഫ് ▁സി ▁പോര ് ▁ ട്ടോ,▁ x x bo s ▁ആ മി ര ് ▁ഖാന്റെ ▁ഏറ്റവും ▁പുതിയ ▁ചിത്രം ▁ലാ ല് ▁സിങ് ▁ഛ ദ്ദ ; ഒ ക്ട ോ ബറി ല് ▁ചിത്രീകരണ മാര ം ഭി ക്കും,▁ x x bo s ▁ഐ ▁പി ▁എല്ല ിന് ▁മു ന് ▁പ ായി ▁ഓ സ് ▁ട്ര േലിയ ന് ▁ടീമ ിനൊപ്പം ▁ചേര ാന ൊരു ങ്ങി ▁സ് മി ത്തും ▁ വാര ് ▁ ണ റും,▁ x x bo s ▁സാമ ് ബ ത്തി ക ▁ജീവിതം ▁സുരക്ഷിത മാ ക്ക ണോ ▁ഈ ▁അഞ്ച് ▁ ശീല ങ്ങള ് ▁നേരത്തേ ▁തുടങ്ങ ൂ . . .,▁ x x bo s ▁എ ല് ▁ഇ ഡി ▁ബ ള ് ▁ബ ുകള ് ▁ലഭ്യമാക്ക ും ; ▁പദ്ധതിയുടെ ▁രജ ിസ് ▁ട്ര േഷന ് ▁ മാര ് ▁ ച്ച് ▁ഒന്ന ിന് ▁ആരംഭിക്ക ും\n", 1528 | "y: EmptyLabelList\n", 1529 | ",,,,\n", 1530 | "Path: ., model=SequentialRNN(\n", 1531 | " (0): MultiBatchEncoder(\n", 1532 | " (module): AWD_LSTM(\n", 1533 | " (encoder): Embedding(10000, 400, padding_idx=1)\n", 1534 | " (encoder_dp): EmbeddingDropout(\n", 1535 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 1536 | " )\n", 1537 | " (rnns): ModuleList(\n", 1538 | " (0): WeightDropout(\n", 1539 | " (module): LSTM(400, 1150, batch_first=True)\n", 1540 | " )\n", 1541 | " (1): WeightDropout(\n", 1542 | " (module): LSTM(1150, 1150, batch_first=True)\n", 1543 | " )\n", 1544 | " (2): WeightDropout(\n", 1545 | " (module): LSTM(1150, 400, batch_first=True)\n", 1546 | " )\n", 1547 | " )\n", 1548 | " (input_dp): RNNDropout()\n", 1549 | " (hidden_dps): ModuleList(\n", 1550 | " (0): RNNDropout()\n", 1551 | " (1): RNNDropout()\n", 1552 | " (2): RNNDropout()\n", 1553 | " )\n", 1554 | " )\n", 1555 | " )\n", 1556 | " (1): PoolingLinearClassifier(\n", 1557 | " (layers): Sequential(\n", 1558 | " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1559 | " (1): Dropout(p=0.05)\n", 1560 | " (2): 
Linear(in_features=1200, out_features=50, bias=True)\n", 1561 | " (3): ReLU(inplace)\n", 1562 | " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1563 | " (5): Dropout(p=0.1)\n", 1564 | " (6): Linear(in_features=50, out_features=3, bias=True)\n", 1565 | " )\n", 1566 | " )\n", 1567 | "), opt_func=functools.partial(, betas=(0.9, 0.99)), loss_func=FlattenedLoss of CrossEntropyLoss(), metrics=[MatthewsCorreff(), ], true_wd=True, bn_wd=True, wd=0.01, train_bn=True, path=PosixPath('.'), model_dir='models', callback_fns=[functools.partial(, add_time=True, silent=False)], callbacks=[...], layer_groups=[Sequential(\n", 1568 | " (0): Embedding(10000, 400, padding_idx=1)\n", 1569 | " (1): EmbeddingDropout(\n", 1570 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 1571 | " )\n", 1572 | "), Sequential(\n", 1573 | " (0): WeightDropout(\n", 1574 | " (module): LSTM(400, 1150, batch_first=True)\n", 1575 | " )\n", 1576 | " (1): RNNDropout()\n", 1577 | "), Sequential(\n", 1578 | " (0): WeightDropout(\n", 1579 | " (module): LSTM(1150, 1150, batch_first=True)\n", 1580 | " )\n", 1581 | " (1): RNNDropout()\n", 1582 | "), Sequential(\n", 1583 | " (0): WeightDropout(\n", 1584 | " (module): LSTM(1150, 400, batch_first=True)\n", 1585 | " )\n", 1586 | " (1): RNNDropout()\n", 1587 | "), Sequential(\n", 1588 | " (0): PoolingLinearClassifier(\n", 1589 | " (layers): Sequential(\n", 1590 | " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1591 | " (1): Dropout(p=0.05)\n", 1592 | " (2): Linear(in_features=1200, out_features=50, bias=True)\n", 1593 | " (3): ReLU(inplace)\n", 1594 | " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1595 | " (5): Dropout(p=0.1)\n", 1596 | " (6): Linear(in_features=50, out_features=3, bias=True)\n", 1597 | " )\n", 1598 | " )\n", 1599 | ")], add_time=True, silent=False, cb_fns_registered=True)\n", 1600 | "alpha: 2.0\n", 1601 | "beta: 1.0], 
layer_groups=[Sequential(\n", 1602 | " (0): Embedding(10000, 400, padding_idx=1)\n", 1603 | " (1): EmbeddingDropout(\n", 1604 | " (emb): Embedding(10000, 400, padding_idx=1)\n", 1605 | " )\n", 1606 | "), Sequential(\n", 1607 | " (0): WeightDropout(\n", 1608 | " (module): LSTM(400, 1150, batch_first=True)\n", 1609 | " )\n", 1610 | " (1): RNNDropout()\n", 1611 | "), Sequential(\n", 1612 | " (0): WeightDropout(\n", 1613 | " (module): LSTM(1150, 1150, batch_first=True)\n", 1614 | " )\n", 1615 | " (1): RNNDropout()\n", 1616 | "), Sequential(\n", 1617 | " (0): WeightDropout(\n", 1618 | " (module): LSTM(1150, 400, batch_first=True)\n", 1619 | " )\n", 1620 | " (1): RNNDropout()\n", 1621 | "), Sequential(\n", 1622 | " (0): PoolingLinearClassifier(\n", 1623 | " (layers): Sequential(\n", 1624 | " (0): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1625 | " (1): Dropout(p=0.05)\n", 1626 | " (2): Linear(in_features=1200, out_features=50, bias=True)\n", 1627 | " (3): ReLU(inplace)\n", 1628 | " (4): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n", 1629 | " (5): Dropout(p=0.1)\n", 1630 | " (6): Linear(in_features=50, out_features=3, bias=True)\n", 1631 | " )\n", 1632 | " )\n", 1633 | ")], add_time=True, silent=False, cb_fns_registered=True)" 1634 | ] 1635 | }, 1636 | "execution_count": 43, 1637 | "metadata": {}, 1638 | "output_type": "execute_result" 1639 | } 1640 | ], 1641 | "source": [ 1642 | "learn.load('final')" 1643 | ] 1644 | }, 1645 | { 1646 | "cell_type": "code", 1647 | "execution_count": 44, 1648 | "metadata": {}, 1649 | "outputs": [ 1650 | { 1651 | "data": { 1652 | "text/html": [], 1653 | "text/plain": [ 1654 | "" 1655 | ] 1656 | }, 1657 | "metadata": {}, 1658 | "output_type": "display_data" 1659 | }, 1660 | { 1661 | "data": { 1662 | "text/html": [ 1663 | "
\n", 1664 | "\n", 1677 | "\n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | " \n", 1687 | " \n", 1688 | " \n", 1689 | " \n", 1690 | " \n", 1691 | " \n", 1692 | " \n", 1693 | " \n", 1694 | " \n", 1695 | " \n", 1696 | " \n", 1697 | " \n", 1698 | " \n", 1699 | " \n", 1700 | " \n", 1701 | " \n", 1702 | " \n", 1703 | " \n", 1704 | " \n", 1705 | " \n", 1706 | " \n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | "
queryactual_labelpredicted_labelentertainmentsportsbusiness
0ഇഞ്ചുറി ടൈം പെനാല്‍റ്റിയില്‍ എഫ് സി പോര്‍ട്ടോsportssports0.004463760.762580.232957
1ആമിര്‍ ഖാന്റെ ഏറ്റവും പുതിയ ചിത്രം ലാല്‍ സിങ് ...entertainmententertainment0.9970330.002190870.000776317
2ഐ പി എല്ലിന് മുന്‍പായി ഓസ്‌ട്രേലിയന്‍ ടീമിനൊപ്...sportssports3.74899e-060.999680.00031659
3സാമ്ബത്തിക ജീവിതം സുരക്ഷിതമാക്കണോ? ഈ അഞ്ച് ശീല...businessbusiness0.001766030.005766940.992467
4എല്‍ഇഡി ബള്‍ബുകള്‍ ലഭ്യമാക്കും; പദ്ധതിയുടെ രജി...businessbusiness0.08443050.001915640.913654
\n", 1737 | "
" 1738 | ], 1739 | "text/plain": [ 1740 | " query actual_label \\\n", 1741 | "0 ഇഞ്ചുറി ടൈം പെനാല്‍റ്റിയില്‍ എഫ് സി പോര്‍ട്ടോ sports \n", 1742 | "1 ആമിര്‍ ഖാന്റെ ഏറ്റവും പുതിയ ചിത്രം ലാല്‍ സിങ് ... entertainment \n", 1743 | "2 ഐ പി എല്ലിന് മുന്‍പായി ഓസ്‌ട്രേലിയന്‍ ടീമിനൊപ്... sports \n", 1744 | "3 സാമ്ബത്തിക ജീവിതം സുരക്ഷിതമാക്കണോ? ഈ അഞ്ച് ശീല... business \n", 1745 | "4 എല്‍ഇഡി ബള്‍ബുകള്‍ ലഭ്യമാക്കും; പദ്ധതിയുടെ രജി... business \n", 1746 | "\n", 1747 | " predicted_label entertainment sports business \n", 1748 | "0 sports 0.00446376 0.76258 0.232957 \n", 1749 | "1 entertainment 0.997033 0.00219087 0.000776317 \n", 1750 | "2 sports 3.74899e-06 0.99968 0.00031659 \n", 1751 | "3 business 0.00176603 0.00576694 0.992467 \n", 1752 | "4 business 0.0844305 0.00191564 0.913654 " 1753 | ] 1754 | }, 1755 | "execution_count": 44, 1756 | "metadata": {}, 1757 | "output_type": "execute_result" 1758 | } 1759 | ], 1760 | "source": [ 1761 | "from sklearn.metrics import accuracy_score, matthews_corrcoef\n", 1762 | "df_dict = {'query': list(df_test[1]), 'actual_label': list(df_test[0]), 'predicted_label': ['']*df_test.shape[0]}\n", 1763 | "all_nodes = list(set(df_train[0]))\n", 1764 | "for node in all_nodes:\n", 1765 | " df_dict[node] = ['']*df_test.shape[0]\n", 1766 | " \n", 1767 | "i2c = {}\n", 1768 | "for key, value in learn.data.c2i.items():\n", 1769 | " i2c[value] = key\n", 1770 | " \n", 1771 | "df_result = pd.DataFrame(df_dict)\n", 1772 | "preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)\n", 1773 | "for index, row in df_result.iterrows():\n", 1774 | " for node in all_nodes:\n", 1775 | " row[node] = preds[0][index][learn.data.c2i[node]].item()\n", 1776 | " row['predicted_label'] = i2c[np.argmax(preds[0][index]).data.item()]\n", 1777 | "df_result.head()" 1778 | ] 1779 | }, 1780 | { 1781 | "cell_type": "code", 1782 | "execution_count": 45, 1783 | "metadata": {}, 1784 | "outputs": [ 1785 | { 1786 | "data": { 1787 | "text/plain": [ 1788 | "0.9555555555555556" 1789 | ] 
1790 | }, 1791 | "execution_count": 45, 1792 | "metadata": {}, 1793 | "output_type": "execute_result" 1794 | } 1795 | ], 1796 | "source": [ 1797 | "accuracy_score(df_result['actual_label'], df_result['predicted_label'])" 1798 | ] 1799 | }, 1800 | { 1801 | "cell_type": "code", 1802 | "execution_count": 46, 1803 | "metadata": {}, 1804 | "outputs": [ 1805 | { 1806 | "data": { 1807 | "text/plain": [ 1808 | "0.9328807382603987" 1809 | ] 1810 | }, 1811 | "execution_count": 46, 1812 | "metadata": {}, 1813 | "output_type": "execute_result" 1814 | } 1815 | ], 1816 | "source": [ 1817 | "matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])" 1818 | ] 1819 | }, 1820 | { 1821 | "cell_type": "code", 1822 | "execution_count": 47, 1823 | "metadata": {}, 1824 | "outputs": [], 1825 | "source": [ 1826 | "df_result.to_csv('inltk_headlines_ml.csv', index=False)" 1827 | ] 1828 | }, 1829 | { 1830 | "cell_type": "code", 1831 | "execution_count": null, 1832 | "metadata": {}, 1833 | "outputs": [], 1834 | "source": [] 1835 | } 1836 | ], 1837 | "metadata": { 1838 | "kernelspec": { 1839 | "display_name": "in", 1840 | "language": "python", 1841 | "name": "in" 1842 | }, 1843 | "language_info": { 1844 | "codemirror_mode": { 1845 | "name": "ipython", 1846 | "version": 3 1847 | }, 1848 | "file_extension": ".py", 1849 | "mimetype": "text/x-python", 1850 | "name": "python", 1851 | "nbconvert_exporter": "python", 1852 | "pygments_lexer": "ipython3", 1853 | "version": "3.6.3" 1854 | } 1855 | }, 1856 | "nbformat": 4, 1857 | "nbformat_minor": 2 1858 | } 1859 | --------------------------------------------------------------------------------