├── language-model ├── inltk │ ├── __init__.py │ └── tokenizer.py ├── embeddings.tsv ├── embeddings_metadata.tsv ├── embeddings_subset.tsv ├── embeddings_metadata_subset.tsv ├── transformer3_embeddings.tsv ├── transformer3_embeddings_metadata.tsv ├── sentence_encodings │ ├── inltk_sentence_encoding_hi.tsv │ ├── inltk_sentence_encoding_metadata_hi.tsv │ └── encoding_projector_config.json ├── embedding_projector_config_30k.json ├── embedding_projector_config.json ├── embedding_projector_config_transformerxl.json ├── Hindi_Language_Model_ULMFiT_172k.ipynb └── Hindi_Language_Model_TransformerXL_172k.ipynb ├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md └── datasets-preparation ├── get-hindi-movie-reviews-2.ipynb ├── get-all-article-links-for-hindi-wikipedia.ipynb └── get-hindi-movie-reviews-1.ipynb /language-model/inltk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.tsv filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /language-model/embeddings.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0aad0e839d1504f870c0cc8c01c296b3782643bd0e22545221245c36fc5b7868 3 | size 135714671 4 | -------------------------------------------------------------------------------- /language-model/embeddings_metadata.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7904e7247f33fe5530976f86736f265a6a9756d934b2027c7f4a7c3844b0cd72 3 | size 545893 4 | -------------------------------------------------------------------------------- /language-model/embeddings_subset.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6b9bf600948fa5ab1a8979c199a85ed021e300ddcfa4a19514452aaa0f8117b8 3 | size 22557761 4 | -------------------------------------------------------------------------------- /language-model/embeddings_metadata_subset.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:014bdea9a43fbde001b372c1301d04de718d0f878b1669f3482a793bcaf4e51d 3 | size 80941 4 | -------------------------------------------------------------------------------- /language-model/transformer3_embeddings.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b7f2c471d67f82af4f96eca4ab7308da369c6bd44cd5cccd142d832301e92306 3 | size 143518326 4 | -------------------------------------------------------------------------------- /language-model/transformer3_embeddings_metadata.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:3e594568456f1388ce3ff25eecf5524ecb253946db85bc74897aa219e0d552c4 3 | size 547994 4 | -------------------------------------------------------------------------------- /language-model/sentence_encodings/inltk_sentence_encoding_hi.tsv: -------------------------------------------------------------------------------- 1 | version 
https://git-lfs.github.com/spec/v1 2 | oid sha256:636682aeb9fe4fa5aa70674f5e0f1730bf4b3fbea8a8c659cdf33c33d9cccb75 3 | size 75536 4 | -------------------------------------------------------------------------------- /language-model/sentence_encodings/inltk_sentence_encoding_metadata_hi.tsv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:30cf51dc1da0e76a8247bf8589a3e9ae8aad209e3f8560bed6281159ea49e6a5 3 | size 498 4 | -------------------------------------------------------------------------------- /language-model/inltk/tokenizer.py: -------------------------------------------------------------------------------- 1 | from fastai.text import * 2 | import sentencepiece as spm 3 | 4 | class HindiTokenizer(BaseTokenizer): 5 | def __init__(self, lang:str): 6 | self.lang = lang 7 | self.sp = spm.SentencePieceProcessor() 8 | self.sp.Load("/home/gaurav/PycharmProjects/nlp-for-hindi/tokenizer/hindi_lm.model") 9 | 10 | def tokenizer(self, t:str) -> List[str]: 11 | return self.sp.EncodeAsPieces(t) 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | language-model/models/* 2 | language-model/tmp/* 3 | language-model/train/* 4 | language-model/valid/* 5 | datasets-preparation/hindi-wikipedia-dataset/* 6 | classification-movie-review/models/* 7 | classification-movie-review/tmp/* 8 | classification-bbc-news/bbc-hindiv01/* 9 | classification-bbc-news/models/* 10 | classification-bbc-news/tmp/* 11 | classification-bbc-news/bbc-hindiv01.tar.gz 12 | language-model/embeddings 13 | language-model/HindiDataset/ 14 | tokenizer/.ipynb_checkpoints/ -------------------------------------------------------------------------------- /language-model/embedding_projector_config_30k.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings": [ 3 | { 4 | "tensorName": "Hindi Embedding Vectors - ULMFiT", 5 | "tensorShape": [ 6 | 30000, 7 | 400 8 | ], 9 | "tensorPath": "https://media.githubusercontent.com/media/goru001/nlp-for-hindi/master/language-model/embeddings.tsv", 10 | "metadataPath": "https://media.githubusercontent.com/media/goru001/nlp-for-hindi/master/language-model/embeddings_metadata.tsv" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /language-model/embedding_projector_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings": [ 3 | { 4 | "tensorName": "Subset of Hindi Embedding Vectors", 5 | "tensorShape": [ 6 | 5000, 7 | 400 8 | ], 9 | "tensorPath": "https://media.githubusercontent.com/media/goru001/nlp-for-hindi/master/language-model/embeddings_subset.tsv", 10 | "metadataPath": "https://media.githubusercontent.com/media/goru001/nlp-for-hindi/master/language-model/embeddings_metadata_subset.tsv" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /language-model/embedding_projector_config_transformerxl.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings": [ 3 | { 4 | "tensorName": "Hindi Embedding Vectors - TransformerXL", 5 | "tensorShape": [ 6 | 30000, 7 | 410 8 | ], 9 | "tensorPath": "https://media.githubusercontent.com/media/goru001/nlp-for-hindi/master/language-model/transformer3_embeddings.tsv", 10 | 
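A note on `language-model/inltk/tokenizer.py` above: `HindiTokenizer` is a thin wrapper around a trained SentencePiece model, so the same sub-word tokenization can be reproduced with `sentencepiece` alone. The following is a minimal sketch, not code from the repository; the model path is an assumption and should point at the `hindi_lm.model` file downloadable via the README's Tokenizer section (the absolute path hard-coded in `tokenizer.py` needs the same substitution on any other machine).

```python
import sentencepiece as spm

# Assumed location of the downloaded SentencePiece model (not part of the repo layout).
SP_MODEL_PATH = "tokenizer/hindi_lm.model"

sp = spm.SentencePieceProcessor()
sp.Load(SP_MODEL_PATH)

# Sub-word tokenization of a Hindi sentence into pieces,
# exactly what HindiTokenizer.tokenizer() returns to fastai.
pieces = sp.EncodeAsPieces("भारत एक विशाल देश है")
print(pieces)

# The pieces round-trip back to the original text.
print(sp.DecodePieces(pieces))
```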
"metadataPath": "https://media.githubusercontent.com/media/goru001/nlp-for-hindi/master/language-model/transformer3_embeddings_metadata.tsv" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /language-model/sentence_encodings/encoding_projector_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "embeddings": [ 3 | { 4 | "tensorName": "Hindi Sentence Encodings", 5 | "tensorShape": [ 6 | 9, 7 | 400 8 | ], 9 | "tensorPath": "https://media.githubusercontent.com/media/goru001/nlp-for-hindi/master/language-model/sentence_encodings/inltk_sentence_encoding_hi.tsv", 10 | "metadataPath": "https://media.githubusercontent.com/media/goru001/nlp-for-hindi/master/language-model/sentence_encodings/inltk_sentence_encoding_metadata_hi.tsv" 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Gaurav 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP for Hindi 2 | This repository contains State of the Art Language models and Classifier for Hindi language 3 | (spoken in Indian sub-continent). 4 | 5 | The models trained here have been used in [Natural Language Toolkit for Indic Languages 6 | (iNLTK)](https://github.com/goru001/inltk) 7 | 8 | 9 | ## Dataset 10 | 11 | #### Created as part of this project 12 | 1. [Hindi Wikipedia Articles - 172k](https://www.kaggle.com/disisbig/hindi-wikipedia-articles-172k) 13 | 14 | 2. [Hindi Wikipedia Articles - 55k](https://www.kaggle.com/disisbig/hindi-wikipedia-articles-55k) 15 | 16 | 3. [Hindi Movie Reviews Dataset](https://www.kaggle.com/disisbig/hindi-movie-reviews-dataset) 17 | 18 | 4. [Hindi Text Short Summarization Corpus](https://www.kaggle.com/disisbig/hindi-text-short-summarization-corpus) 19 | 20 | 5. [Hindi Text Short and Large Summarization Corpus](https://www.kaggle.com/disisbig/hindi-text-short-and-large-summarization-corpus) 21 | 22 | 23 | #### Open Source Datasets 24 | 1. 
[BBC News Articles](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) : Sentiment analysis corpus for Hindi documents extracted from the BBC News website. 25 | 26 | 2. [IIT Patna Product Reviews](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) : Sentiment analysis corpus for product reviews posted in Hindi. 27 | 28 | 3. [IIT Patna Movie Reviews](https://github.com/ai4bharat-indicnlp/indicnlp_corpus#publicly-available-classification-datasets) : Sentiment analysis corpus for movie reviews posted in Hindi. 29 | 30 | ## Results 31 | 32 | ### Language Model Perplexity (on validation set) 33 | 34 | | Architecture/Dataset | Hindi Wikipedia Articles - 172k | Hindi Wikipedia Articles - 55k | 35 | |:--------:|:----:|:----:| 36 | | ULMFiT | 34.06 | 35.87 | 37 | | TransformerXL | 26.09 | 34.78 | 38 | 39 | **Note**: [Nirant](https://github.com/NirantK) has done previous [SOTA work with 40 | Hindi Language Model](https://github.com/NirantK/hindi2vec) and achieved a perplexity of ~46. 41 | The scores above aren't directly comparable with his score because his train and validation sets 42 | were different and [they aren't available for reproducibility](https://github.com/NirantK/hindi2vec/issues/1). 43 | 44 | 45 | ### Classification Metrics 46 | 47 | ##### ULMFiT 48 | 49 | | Dataset | Accuracy | MCC | Notebook to Reproduce results | 50 | |:--------:|:----:|:----:|:----:| 51 | | BBC News Articles | 78.75 | 71.61 | [Link](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_BBC_Articles.ipynb) | 52 | | IIT Patna Movie Reviews | 57.74 | 37.23 | [Link](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP%2BMovie.ipynb) | 53 | | IIT Patna Product Reviews | 75.71 | 59.76 | [Link](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP_Product.ipynb) | 54 | 55 | 56 | 57 | ### Visualizations 58 | 59 | ##### Word Embeddings 60 | 61 | | Architecture | Visualization | 62 | |:--------:|:----:| 63 | | ULMFiT | [Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hindi/master/language-model/embedding_projector_config_30k.json) | 64 | | TransformerXL | [Embeddings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hindi/master/language-model/embedding_projector_config_transformerxl.json) | 65 | 66 | ##### Sentence Embeddings 67 | 68 | | Architecture | Visualization | 69 | |:--------:|:----:| 70 | | ULMFiT | [Encodings projection](https://projector.tensorflow.org/?config=https://raw.githubusercontent.com/goru001/nlp-for-hindi/master/language-model/sentence_encodings/encoding_projector_config.json) | 71 | 72 | 73 | 74 | 75 | ### Results of using Transfer Learning + Data Augmentation from iNLTK 76 | 77 | ##### On using complete training set (with Transfer learning) 78 | 79 | | Dataset | Dataset size (train, valid, test) | Accuracy | MCC | Notebook to Reproduce results | 80 | |:--------:|:----:|:----:|:----:|:----:| 81 | | IIT Patna Movie Reviews | (2480, 310, 310) | 57.74 | 37.23 | [Link](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP%2BMovie.ipynb) | 82 | 83 | 84 | ##### On using 20% of training set (with Transfer learning) 85 | 86 | | Dataset | Dataset size (train, valid, test) | Accuracy | MCC 
| Notebook to Reproduce results | 87 | |:--------:|:----:|:----:|:----:|:----:| 88 | | IIT Patna Movie Reviews | (496, 310, 310) | 47.74 | 20.50 | [Link](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP%2BMovie_without_Data_Aug.ipynb) | 89 | 90 | ##### On using 20% of training set (with Transfer learning + Data Augmentation) 91 | 92 | | Dataset | Dataset size (train, valid, test) | Accuracy | MCC | Notebook to Reproduce results | 93 | |:--------:|:----:|:----:|:----:|:----:| 94 | | IIT Patna Movie Reviews | (496, 310, 310) | 56.13 | 34.39 | [Link](https://github.com/goru001/nlp-for-hindi/blob/master/classification-benchmarks/Hindi_Classification_Model_IITP%2BMovie_with_Data_Aug.ipynb) | 95 | 96 | 97 | ## Pretrained Models 98 | 99 | #### Language Models 100 | Download pretrained Language Models of ULMFiT, TransformerXL trained on 101 | [Hindi Wikipedia Articles - 172k and Hindi Wikipedia Articles - 55k](https://github.com/goru001/nlp-for-hindi#dataset) 102 | from [here](https://drive.google.com/open?id=1_8l5HFHHm4cboA-tkGbn3i6sfOWLmGyC) 103 | 104 | #### Tokenizer 105 | 106 | Unsupervised training using Google's [sentencepiece](https://github.com/google/sentencepiece) 107 | 108 | Download the trained model and vocabulary from [here](https://drive.google.com/open?id=1TVuqY3Lad_KdY5Aj8ynGYVvoX5qgk2fJ) 109 | 110 | -------------------------------------------------------------------------------- /datasets-preparation/get-hindi-movie-reviews-2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 23, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from urllib.request import urlopen, Request\n", 10 | "from bs4 import BeautifulSoup\n", 11 | "import re\n", 12 | "import pickle\n", 13 | "number_of_pages = 15\n", 14 | "all_articles = []" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 24, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "For 1 - 20\n", 27 | "For 1 - 40\n", 28 | "For 1 - 60\n", 29 | "For 1 - 80\n", 30 | "For 1 - 100\n", 31 | "For 1 - 120\n", 32 | "For 1 - 140\n", 33 | "For 1 - 160\n", 34 | "For 1 - 180\n", 35 | "For 1 - 200\n", 36 | "For 1 - 220\n", 37 | "For 1 - 240\n", 38 | "For 1 - 260\n", 39 | "For 1 - 280\n", 40 | "For 1 - 300\n" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "for page in range(1,number_of_pages+1):\n", 46 | " url = 'https://navbharattimes.indiatimes.com/movie-masti/movie-review/articlelist/2325387.cms?curpg=' + str(page)\n", 47 | " req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})\n", 48 | " html_doc = ''\n", 49 | " with urlopen(req) as response:\n", 50 | " for line in response:\n", 51 | " line = line.decode('utf-8')\n", 52 | " html_doc = html_doc + line.replace('\\n','')\n", 53 | " soup = BeautifulSoup(html_doc, 'html.parser')\n", 54 | " h2s = soup.find_all('h2',{'class':'moviename'})\n", 55 | " for h in h2s:\n", 56 | " all_articles.append(h.a['href'])\n", 57 | " print('For 1 - ' + str(len(all_articles)))\n", 58 | "articleRatingMapping = {}" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 81, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "Saved 285with rating2.5 / 5\n", 71 | "Saved 286with rating4 / 5\n", 72 | "Saved 287with rating3.5 / 5\n", 73 | "Saved 288with rating3 / 5\n", 74 | "Saved 
289with rating3 / 5\n", 75 | "Saved 290with rating3.5 / 5\n", 76 | "Saved 291with rating3 / 5\n", 77 | "Saved 292with rating3.5 / 5\n", 78 | "Saved 293with rating2.5 / 5\n", 79 | "Saved 294with rating2 / 5\n", 80 | "Saved 295with rating1 / 5\n", 81 | "Saved 296with rating2 / 5\n", 82 | "Saved 297with rating2 / 5\n", 83 | "Saved 298with rating3 / 5\n", 84 | "Saved 299with rating3 / 5\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "for c, url in enumerate(all_articles[285:]):\n", 90 | " c = c + 285\n", 91 | " req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})\n", 92 | " html_doc = ''\n", 93 | " with urlopen(req) as response:\n", 94 | " for line in response:\n", 95 | " line = line.decode('utf-8')\n", 96 | " html_doc = html_doc + line.replace('\\n','')\n", 97 | " soup = BeautifulSoup(html_doc, 'html.parser')\n", 98 | " for div in soup.find_all(\"div\", {'class':'imagecaption'}): \n", 99 | " div.decompose()\n", 100 | " div = soup.find('div',{'class':'Normal'})\n", 101 | " article = div.text\n", 102 | " article = article.replace('X','')\n", 103 | " article = article.strip(u'\\u200b')\n", 104 | " article = re.sub(r'^https?:\\/\\/.*[\\r\\n]*', '', article)\n", 105 | " article = article.replace(u'\\ufeff','')\n", 106 | " article = article.replace(u'\\xa0', u' ')\n", 107 | " article = article.replace(' ', ' ')\n", 108 | " article = article.replace(' , ', ', ')\n", 109 | " rdiv = soup.find('div', {'id':'selfrating'})\n", 110 | " rating = '---'\n", 111 | " if rdiv:\n", 112 | " rating = rdiv.find('span',{'class':'numrating'}).text\n", 113 | " articleRatingMapping[c] = {\n", 114 | " 'url': url,\n", 115 | " 'article': article,\n", 116 | " 'rating': rating\n", 117 | " }\n", 118 | " print('Saved ' + str(c) + 'with rating' + str(rating))" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 52, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "'3.5 / 5'" 130 | ] 131 | }, 132 | "execution_count": 52, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "rdiv = soup.find('div', {'id':'selfrating'})\n", 139 | "rating = rdiv.find('span',{'class':'numrating'}).text\n", 140 | "rating" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 38, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "s = \"इस फिल्म के शो स्टॉपर साफ तौर पर अनिल कपूर हैं, जिन्होंने अपने बेहतरीन अदाकारी का परिचय दिया है और जिसके लिए आपको 'फन्ने खां' एक बार जरूर देखनी चाहिए। X \"" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 63, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "all_articles[82]=\"https://navbharattimes.indiatimes.com/movie-masti/movie-review/DARKEST-HOUR-MOVIE-REVIEW-%20in-hindi/moviereview/62565228.cms\"" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 78, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "temp = articleRatingMapping" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 82, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "300" 179 | ] 180 | }, 181 | "execution_count": 82, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "len(articleRatingMapping)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 84, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "import pickle\n", 197 | "with 
open('HindiArticleRatingMapping','wb') as f:\n", 198 | " pickle.dump(articleRatingMapping,f)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.6.7" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /datasets-preparation/get-all-article-links-for-hindi-wikipedia.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 289, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from urllib.request import urlopen\n", 10 | "import pickle" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 290, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "html_doc = ''\n", 20 | "with urlopen('https://hi.wikipedia.org/wiki/%E0%A4%AE%E0%A5%81%E0%A4%96%E0%A4%AA%E0%A5%83%E0%A4%B7%E0%A5%8D%E0%A4%A0') as response:\n", 21 | " for line in response:\n", 22 | " line = line.decode('utf-8')\n", 23 | " html_doc = html_doc + line.replace('\\n','')" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 291, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from bs4 import BeautifulSoup\n", 33 | "soup = BeautifulSoup(html_doc, 'html.parser')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 292, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "'मुखपृष्ठ'" 45 | ] 46 | }, 47 | "execution_count": 292, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "soup.h1.string" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 295, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "tab = soup.find(\"table\",{\"style\":\"border:2px solid #e1eaee; border-collapse:separate;font-size:120%\"})" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 296, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "anchors = tab.find_all('a')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 297, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "home_url = 'https://hi.wikipedia.org' \n", 81 | "links = [home_url + anchor['href'] for anchor in anchors]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 298, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "54" 93 | ] 94 | }, 95 | "execution_count": 298, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "len(links)" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 299, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "all_links = []" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 300, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | 
"345\n", 123 | "690\n", 124 | "1035\n", 125 | "1380\n", 126 | "1725\n", 127 | "2070\n", 128 | "2415\n", 129 | "2760\n", 130 | "3105\n", 131 | "3450\n", 132 | "3795\n", 133 | "4140\n", 134 | "4485\n", 135 | "4830\n", 136 | "5175\n", 137 | "5520\n", 138 | "5727\n", 139 | "5727\n", 140 | "6072\n", 141 | "6417\n", 142 | "6762\n", 143 | "7107\n", 144 | "7452\n", 145 | "7797\n", 146 | "8142\n", 147 | "8487\n", 148 | "8832\n", 149 | "9177\n", 150 | "9522\n", 151 | "9867\n", 152 | "10212\n", 153 | "10557\n", 154 | "10902\n", 155 | "11247\n", 156 | "11592\n", 157 | "11937\n", 158 | "12282\n", 159 | "12627\n", 160 | "12972\n", 161 | "13317\n", 162 | "13662\n", 163 | "14007\n", 164 | "14352\n", 165 | "14697\n", 166 | "15042\n", 167 | "15387\n", 168 | "15732\n", 169 | "16077\n", 170 | "16422\n", 171 | "16767\n", 172 | "17112\n", 173 | "17457\n", 174 | "17802\n", 175 | "18147\n", 176 | "18492\n", 177 | "18837\n", 178 | "19182\n", 179 | "19527\n", 180 | "19872\n", 181 | "20217\n", 182 | "20562\n", 183 | "20907\n", 184 | "21252\n", 185 | "21597\n", 186 | "21942\n", 187 | "22287\n", 188 | "22632\n", 189 | "22977\n", 190 | "23322\n", 191 | "23667\n", 192 | "24012\n", 193 | "24357\n", 194 | "24702\n", 195 | "25047\n", 196 | "25392\n", 197 | "25737\n", 198 | "26082\n", 199 | "26427\n", 200 | "26772\n", 201 | "27117\n", 202 | "27462\n", 203 | "27807\n", 204 | "28152\n", 205 | "28497\n", 206 | "28842\n", 207 | "29187\n", 208 | "29532\n", 209 | "29877\n", 210 | "30222\n", 211 | "30567\n", 212 | "30912\n", 213 | "31257\n", 214 | "31602\n", 215 | "31947\n", 216 | "32292\n", 217 | "32637\n", 218 | "32982\n", 219 | "33327\n", 220 | "33672\n", 221 | "34017\n", 222 | "34362\n", 223 | "34707\n", 224 | "35052\n", 225 | "35397\n", 226 | "35742\n", 227 | "36087\n", 228 | "36432\n", 229 | "36777\n", 230 | "37122\n", 231 | "37467\n", 232 | "37812\n", 233 | "38157\n", 234 | "38502\n", 235 | "38847\n", 236 | "39192\n", 237 | "39537\n", 238 | "39882\n", 239 | "40227\n", 240 | "40572\n", 241 | "40917\n", 242 | "41262\n", 243 | "41607\n", 244 | "41952\n", 245 | "42297\n", 246 | "42642\n", 247 | "42987\n", 248 | "43332\n", 249 | "43677\n", 250 | "44022\n", 251 | "44367\n", 252 | "44712\n", 253 | "45057\n", 254 | "45402\n", 255 | "45747\n", 256 | "46092\n", 257 | "46437\n", 258 | "46782\n", 259 | "47127\n", 260 | "47472\n", 261 | "47817\n", 262 | "48162\n", 263 | "48507\n", 264 | "48852\n", 265 | "49197\n", 266 | "49542\n", 267 | "49887\n", 268 | "50232\n", 269 | "50577\n", 270 | "50922\n", 271 | "51267\n", 272 | "51612\n", 273 | "51957\n", 274 | "52302\n", 275 | "52647\n", 276 | "52992\n", 277 | "53337\n", 278 | "53682\n", 279 | "54027\n", 280 | "54372\n", 281 | "54717\n", 282 | "55062\n", 283 | "55407\n", 284 | "55752\n", 285 | "56097\n", 286 | "56442\n", 287 | "56787\n", 288 | "57132\n", 289 | "57477\n", 290 | "57822\n", 291 | "58167\n", 292 | "58512\n", 293 | "58857\n", 294 | "59202\n", 295 | "59547\n", 296 | "59892\n", 297 | "60237\n", 298 | "60582\n", 299 | "60927\n", 300 | "61272\n", 301 | "61617\n", 302 | "61962\n", 303 | "62307\n", 304 | "62652\n", 305 | "62997\n", 306 | "63342\n", 307 | "63687\n", 308 | "64032\n", 309 | "64377\n", 310 | "64722\n", 311 | "65067\n", 312 | "65412\n", 313 | "65757\n", 314 | "66102\n", 315 | "66447\n", 316 | "66792\n", 317 | "67137\n", 318 | "67482\n", 319 | "67827\n", 320 | "68172\n", 321 | "68517\n", 322 | "68862\n", 323 | "69207\n", 324 | "69552\n", 325 | "69897\n", 326 | "70242\n", 327 | "70587\n", 328 | "70932\n", 329 | "71277\n", 330 | "71622\n", 331 | "71967\n", 332 | "72312\n", 333 | 
"72657\n", 334 | "73002\n", 335 | "73347\n", 336 | "73692\n", 337 | "74037\n", 338 | "74382\n", 339 | "74727\n", 340 | "75072\n", 341 | "75417\n", 342 | "75762\n", 343 | "76107\n", 344 | "76452\n", 345 | "76797\n", 346 | "77142\n", 347 | "77487\n", 348 | "77832\n", 349 | "78177\n", 350 | "78522\n", 351 | "78867\n", 352 | "79212\n", 353 | "79557\n", 354 | "79902\n", 355 | "80247\n", 356 | "80592\n", 357 | "80937\n", 358 | "81282\n", 359 | "81627\n", 360 | "81972\n", 361 | "82317\n", 362 | "82662\n", 363 | "83007\n", 364 | "83352\n", 365 | "83697\n", 366 | "84042\n", 367 | "84387\n", 368 | "84732\n", 369 | "85077\n", 370 | "85422\n", 371 | "85767\n", 372 | "86112\n", 373 | "86457\n", 374 | "86802\n", 375 | "87147\n", 376 | "87492\n", 377 | "87837\n", 378 | "88182\n", 379 | "88527\n", 380 | "88872\n", 381 | "89217\n", 382 | "89562\n", 383 | "89907\n", 384 | "90252\n", 385 | "90597\n", 386 | "90942\n", 387 | "91287\n", 388 | "91632\n", 389 | "91977\n", 390 | "92322\n", 391 | "92667\n", 392 | "93012\n", 393 | "93357\n", 394 | "93702\n", 395 | "94047\n", 396 | "94392\n", 397 | "94737\n", 398 | "95082\n", 399 | "95427\n", 400 | "95772\n", 401 | "96117\n", 402 | "96462\n", 403 | "96807\n", 404 | "97152\n", 405 | "97497\n", 406 | "97842\n", 407 | "98187\n", 408 | "98532\n", 409 | "98877\n", 410 | "99222\n", 411 | "99567\n", 412 | "99912\n", 413 | "100257\n", 414 | "100602\n", 415 | "100947\n", 416 | "101292\n", 417 | "101637\n", 418 | "101982\n", 419 | "102327\n", 420 | "102672\n", 421 | "103017\n", 422 | "103362\n", 423 | "103707\n", 424 | "104052\n", 425 | "104397\n", 426 | "104742\n", 427 | "105087\n", 428 | "105432\n", 429 | "105777\n", 430 | "106122\n", 431 | "106467\n", 432 | "106812\n", 433 | "107157\n", 434 | "107502\n", 435 | "107847\n", 436 | "108192\n", 437 | "108537\n", 438 | "108882\n", 439 | "109227\n", 440 | "109572\n", 441 | "109917\n", 442 | "110262\n", 443 | "110607\n", 444 | "110952\n", 445 | "111297\n", 446 | "111642\n", 447 | "111987\n", 448 | "112332\n", 449 | "112677\n", 450 | "113022\n", 451 | "113367\n", 452 | "113712\n", 453 | "114057\n", 454 | "114402\n", 455 | "114747\n", 456 | "115092\n", 457 | "115437\n", 458 | "115782\n", 459 | "116127\n", 460 | "116472\n", 461 | "116817\n", 462 | "117162\n", 463 | "117507\n", 464 | "117852\n", 465 | "118197\n", 466 | "118542\n", 467 | "118887\n", 468 | "119232\n", 469 | "119577\n", 470 | "119922\n", 471 | "120267\n", 472 | "120612\n", 473 | "120957\n", 474 | "121302\n", 475 | "121647\n", 476 | "121992\n", 477 | "122337\n", 478 | "122682\n", 479 | "123027\n", 480 | "123372\n", 481 | "123717\n", 482 | "124062\n", 483 | "124407\n", 484 | "124752\n", 485 | "125097\n", 486 | "125442\n", 487 | "125787\n", 488 | "126132\n", 489 | "126477\n", 490 | "126822\n", 491 | "127167\n", 492 | "127512\n", 493 | "127857\n", 494 | "128202\n", 495 | "128547\n", 496 | "128892\n", 497 | "129237\n", 498 | "129582\n", 499 | "129927\n", 500 | "130272\n", 501 | "130617\n", 502 | "130962\n", 503 | "131307\n", 504 | "131652\n", 505 | "131997\n", 506 | "132342\n", 507 | "132687\n", 508 | "133032\n", 509 | "133377\n", 510 | "133722\n", 511 | "134067\n", 512 | "134412\n", 513 | "134757\n", 514 | "135102\n", 515 | "135447\n", 516 | "135792\n", 517 | "136137\n", 518 | "136482\n", 519 | "136827\n", 520 | "137172\n", 521 | "137517\n", 522 | "137862\n", 523 | "138207\n", 524 | "138552\n", 525 | "138897\n", 526 | "139242\n", 527 | "139587\n", 528 | "139932\n", 529 | "140277\n", 530 | "140622\n", 531 | "140967\n", 532 | "141312\n", 533 | "141657\n", 534 | "142002\n", 535 
| "142347\n", 536 | "142692\n", 537 | "143037\n", 538 | "143382\n", 539 | "143727\n", 540 | "144072\n", 541 | "144417\n", 542 | "144762\n", 543 | "145107\n", 544 | "145452\n", 545 | "145797\n", 546 | "146142\n", 547 | "146487\n", 548 | "146832\n", 549 | "147177\n", 550 | "147522\n", 551 | "147867\n", 552 | "148212\n", 553 | "148557\n", 554 | "148902\n", 555 | "149247\n", 556 | "149592\n", 557 | "149937\n", 558 | "150282\n", 559 | "150627\n", 560 | "150972\n", 561 | "151317\n", 562 | "151662\n", 563 | "152007\n", 564 | "152352\n", 565 | "152697\n", 566 | "153042\n", 567 | "153387\n", 568 | "153732\n", 569 | "154077\n", 570 | "154422\n", 571 | "154767\n", 572 | "155112\n", 573 | "155457\n", 574 | "155802\n", 575 | "156147\n", 576 | "156492\n", 577 | "156837\n", 578 | "157182\n", 579 | "157527\n", 580 | "157872\n", 581 | "158217\n", 582 | "158562\n", 583 | "158907\n", 584 | "159252\n", 585 | "159597\n", 586 | "159942\n", 587 | "160287\n", 588 | "160632\n", 589 | "160977\n", 590 | "161322\n", 591 | "161667\n", 592 | "162012\n", 593 | "162357\n", 594 | "162702\n", 595 | "163047\n", 596 | "163392\n", 597 | "163737\n", 598 | "164082\n", 599 | "164427\n", 600 | "164772\n", 601 | "165117\n", 602 | "165462\n", 603 | "165807\n", 604 | "166152\n", 605 | "166497\n", 606 | "166842\n", 607 | "167187\n", 608 | "167532\n", 609 | "167877\n", 610 | "168222\n", 611 | "168567\n", 612 | "168912\n", 613 | "169257\n", 614 | "169602\n", 615 | "169947\n", 616 | "170292\n", 617 | "170637\n", 618 | "170982\n", 619 | "171327\n", 620 | "171672\n", 621 | "172017\n", 622 | "172295\n", 623 | "172295\n", 624 | "172295\n", 625 | "172295\n", 626 | "172295\n", 627 | "172295\n", 628 | "172295\n", 629 | "172295\n", 630 | "172295\n", 631 | "172295\n", 632 | "172295\n", 633 | "172295\n", 634 | "172295\n", 635 | "172295\n", 636 | "172295\n", 637 | "172295\n", 638 | "172295\n", 639 | "172295\n", 640 | "172295\n", 641 | "172295\n", 642 | "172295\n", 643 | "172295\n", 644 | "172295\n", 645 | "172295\n", 646 | "172295\n", 647 | "172295\n", 648 | "172295\n", 649 | "172295\n", 650 | "172295\n", 651 | "172295\n", 652 | "172295\n", 653 | "172295\n", 654 | "172295\n", 655 | "172295\n", 656 | "172295\n", 657 | "172295\n", 658 | "172295\n", 659 | "172295\n", 660 | "172295\n", 661 | "172295\n", 662 | "172295\n", 663 | "172295\n", 664 | "172295\n", 665 | "172295\n", 666 | "172295\n", 667 | "172295\n", 668 | "172295\n", 669 | "172295\n", 670 | "172295\n", 671 | "172295\n", 672 | "172295\n", 673 | "172295\n", 674 | "172295\n" 675 | ] 676 | } 677 | ], 678 | "source": [ 679 | "# Main code\n", 680 | "for link in links: \n", 681 | " while link:\n", 682 | " html_doc = ''\n", 683 | " with urlopen(link) as response:\n", 684 | " for line in response:\n", 685 | " line = line.decode('utf-8')\n", 686 | " html_doc = html_doc + line.replace('\\n','')\n", 687 | " soup = BeautifulSoup(html_doc, 'html.parser')\n", 688 | " div = soup.find('div',{'class':'mw-allpages-body'})\n", 689 | " if div:\n", 690 | " anchors = div.find_all('a');\n", 691 | " all_links = all_links + [home_url + anchor['href'] for anchor in anchors]\n", 692 | " print(len(set(all_links)))\n", 693 | " if prev_len == len(set(all_links)):\n", 694 | " break\n", 695 | " nav_div = soup.find('div',{'class':'mw-allpages-nav'})\n", 696 | " if nav_div and len(nav_div.find_all('a')) == 2:\n", 697 | " link = home_url + nav_div.find_all('a')[1]['href']\n", 698 | " prev_len = len(set(all_links))" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 301, 704 | "metadata": {}, 705 | 
"outputs": [ 706 | { 707 | "data": { 708 | "text/plain": [ 709 | "172295" 710 | ] 711 | }, 712 | "execution_count": 301, 713 | "metadata": {}, 714 | "output_type": "execute_result" 715 | } 716 | ], 717 | "source": [ 718 | "len(set(all_links))" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 302, 724 | "metadata": {}, 725 | "outputs": [ 726 | { 727 | "data": { 728 | "text/plain": [ 729 | "172295" 730 | ] 731 | }, 732 | "execution_count": 302, 733 | "metadata": {}, 734 | "output_type": "execute_result" 735 | } 736 | ], 737 | "source": [ 738 | "all_links = list(set(all_links)); len(all_links)" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 303, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "with open('all_hindi_wikipedia_links.pkl', 'wb') as f:\n", 748 | " pickle.dump(all_links, f)" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": 304, 754 | "metadata": {}, 755 | "outputs": [ 756 | { 757 | "data": { 758 | "text/plain": [ 759 | "'https://hi.wikipedia.org/wiki/%E0%A4%AA%E0%A5%80%E0%A4%AA%E0%A4%B2%E0%A5%80_%E0%A4%98%E0%A4%A8%E0%A4%B6%E0%A4%BF%E0%A4%AF%E0%A4%BE%E0%A4%AE_(A.H.T.),%E0%A4%A0%E0%A4%BE%E0%A4%95%E0%A5%81%E0%A4%B0%E0%A4%A6%E0%A5%8D%E0%A4%B5%E0%A4%BE%E0%A4%B0%E0%A4%BE_(%E0%A4%AE%E0%A5%81%E0%A4%B0%E0%A4%BE%E0%A4%A6%E0%A4%BE%E0%A4%AC%E0%A4%BE%E0%A4%A6)'" 760 | ] 761 | }, 762 | "execution_count": 304, 763 | "metadata": {}, 764 | "output_type": "execute_result" 765 | } 766 | ], 767 | "source": [ 768 | "all_links[160]" 769 | ] 770 | } 771 | ], 772 | "metadata": { 773 | "kernelspec": { 774 | "display_name": "Python 3", 775 | "language": "python", 776 | "name": "python3" 777 | }, 778 | "language_info": { 779 | "codemirror_mode": { 780 | "name": "ipython", 781 | "version": 3 782 | }, 783 | "file_extension": ".py", 784 | "mimetype": "text/x-python", 785 | "name": "python", 786 | "nbconvert_exporter": "python", 787 | "pygments_lexer": "ipython3", 788 | "version": "3.6.7" 789 | } 790 | }, 791 | "nbformat": 4, 792 | "nbformat_minor": 2 793 | } 794 | -------------------------------------------------------------------------------- /datasets-preparation/get-hindi-movie-reviews-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from urllib.request import urlopen\n", 10 | "from bs4 import BeautifulSoup\n", 11 | "import re\n", 12 | "import pickle\n", 13 | "number_of_pages = 38\n", 14 | "all_articles = []" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 12, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import urllib.parse" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 13, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "For http://hindi.webdunia.com/bollywood-movie-review/1 20\n", 36 | "For http://hindi.webdunia.com/bollywood-movie-review/2 40\n", 37 | "For http://hindi.webdunia.com/bollywood-movie-review/3 60\n", 38 | "For http://hindi.webdunia.com/bollywood-movie-review/4 80\n", 39 | "For http://hindi.webdunia.com/bollywood-movie-review/5 100\n", 40 | "For http://hindi.webdunia.com/bollywood-movie-review/6 120\n", 41 | "For http://hindi.webdunia.com/bollywood-movie-review/7 140\n", 42 | "For http://hindi.webdunia.com/bollywood-movie-review/8 160\n", 43 | "For 
http://hindi.webdunia.com/bollywood-movie-review/9 180\n", 44 | "For http://hindi.webdunia.com/bollywood-movie-review/10 200\n", 45 | "For http://hindi.webdunia.com/bollywood-movie-review/11 220\n", 46 | "For http://hindi.webdunia.com/bollywood-movie-review/12 240\n", 47 | "For http://hindi.webdunia.com/bollywood-movie-review/13 260\n", 48 | "For http://hindi.webdunia.com/bollywood-movie-review/14 280\n", 49 | "For http://hindi.webdunia.com/bollywood-movie-review/15 300\n", 50 | "For http://hindi.webdunia.com/bollywood-movie-review/16 320\n", 51 | "For http://hindi.webdunia.com/bollywood-movie-review/17 340\n", 52 | "For http://hindi.webdunia.com/bollywood-movie-review/18 360\n", 53 | "For http://hindi.webdunia.com/bollywood-movie-review/19 380\n", 54 | "For http://hindi.webdunia.com/bollywood-movie-review/20 400\n", 55 | "For http://hindi.webdunia.com/bollywood-movie-review/21 420\n", 56 | "For http://hindi.webdunia.com/bollywood-movie-review/22 440\n", 57 | "For http://hindi.webdunia.com/bollywood-movie-review/23 460\n", 58 | "For http://hindi.webdunia.com/bollywood-movie-review/24 480\n", 59 | "For http://hindi.webdunia.com/bollywood-movie-review/25 500\n", 60 | "For http://hindi.webdunia.com/bollywood-movie-review/26 520\n", 61 | "For http://hindi.webdunia.com/bollywood-movie-review/27 540\n", 62 | "For http://hindi.webdunia.com/bollywood-movie-review/28 560\n", 63 | "For http://hindi.webdunia.com/bollywood-movie-review/29 580\n", 64 | "For http://hindi.webdunia.com/bollywood-movie-review/30 600\n", 65 | "For http://hindi.webdunia.com/bollywood-movie-review/31 620\n", 66 | "For http://hindi.webdunia.com/bollywood-movie-review/32 640\n", 67 | "For http://hindi.webdunia.com/bollywood-movie-review/33 660\n", 68 | "For http://hindi.webdunia.com/bollywood-movie-review/34 680\n", 69 | "For http://hindi.webdunia.com/bollywood-movie-review/35 700\n", 70 | "For http://hindi.webdunia.com/bollywood-movie-review/36 720\n", 71 | "For http://hindi.webdunia.com/bollywood-movie-review/37 740\n", 72 | "For http://hindi.webdunia.com/bollywood-movie-review/38 760\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "for i in range(1, number_of_pages+1):\n", 78 | " url = 'http://hindi.webdunia.com/bollywood-movie-review/' + str(i)\n", 79 | " html_doc = ''\n", 80 | " with urlopen(url) as response:\n", 81 | " for line in response:\n", 82 | " line = line.decode('utf-8')\n", 83 | " html_doc = html_doc + line.replace('\\n','')\n", 84 | " soup = BeautifulSoup(html_doc, 'html.parser')\n", 85 | " base_url = 'http://hindi.webdunia.com'\n", 86 | " first_article = soup.find('div',{'class':'list list_l wbx rel'})['onclick']\n", 87 | " first_article = first_article[13:-12]\n", 88 | " all_articles.append(base_url + first_article)\n", 89 | " rest_articles_div = soup.find_all('div', {'class':'col-sm-6 listItem'})\n", 90 | " for div in rest_articles_div:\n", 91 | " all_articles.append(base_url + div.find('a')['href'])\n", 92 | " print('For ' + url + ' ' + str(len(all_articles)))" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 14, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "Saved 0 with rating रेटिंग : 2.5/5\n", 105 | "Saved 1 with rating रेटिंग : 1.5/5\n", 106 | "Saved 2 with rating रेटिंग : 3/5\n", 107 | "Saved 3 with rating रेटिंग : 2/5\n", 108 | "googletag.cmd.push(function() { googletag.display('div-gpt-ad-1508249012268-118090100044'); });68-118090100044').addService(googletag.pubads());\n", 109 | "Saved 5 with rating 
रेटिंग : 3/5\n", 110 | "Saved 6 with rating रेटिंग : 1.5/5\n", 111 | "Saved 7 with rating रेटिंग : 1.5/5\n", 112 | "Saved 8 with rating रेटिंग : 3/5\n", 113 | "Saved 9 with rating रेटिंग : 2/5\n", 114 | "Saved 10 with rating रेटिंग : 3.5/5\n", 115 | "Saved 11 with rating रेटिंग : 2.5/5\n", 116 | "Saved 12 with rating रेटिंग : 2/5\n", 117 | "Saved 13 with rating रेटिंग : 1.5/5\n", 118 | "Saved 14 with rating रेटिंग : 4/5\n", 119 | "Saved 15 with rating रेटिंग : 2/5\n", 120 | "Saved 16 with rating रेटिंग : 3/5\n", 121 | "Saved 17 with rating रेटिंग : 2.5/5\n", 122 | "Saved 18 with rating रेटिंग : 3/5\n", 123 | "Saved 19 with rating रेटिंग : 1.5/5\n", 124 | "Saved 20 with rating रेटिंग : 3/5\n", 125 | "Saved 21 with rating रेटिंग : 1.5/5\n", 126 | "Saved 22 with rating रेटिंग : 3/5\n", 127 | "Saved 23 with rating रेटिंग : 3/5\n", 128 | "Saved 24 with rating रेटिंग : 3/5\n", 129 | "Saved 25 with rating रेटिंग : 4/5\n", 130 | "Saved 26 with rating रेटिंग : 3/5\n", 131 | "Saved 27 with rating रेटिंग : 3.5/5\n", 132 | "Saved 28 with rating रेटिंग : 3.5/5\n", 133 | "Saved 29 with rating रेटिंग : 3/5\n", 134 | "Saved 30 with rating रेटिंग : 3/5\n", 135 | "Saved 31 with rating रेटिंग : 2.5/5\n", 136 | "Saved 32 with rating रेटिंग : 2/5\n", 137 | "Saved 33 with rating रेटिंग : 3/5\n", 138 | "Saved 34 with rating रेटिंग : 1/5\n", 139 | "Saved 35 with rating रेटिंग : 2.5/5\n", 140 | "Saved 36 with rating रेटिंग : 3/5\n", 141 | "Saved 37 with rating रेटिंग : 2/5\n", 142 | "Saved 38 with rating रेटिंग : 4/5\n", 143 | "Saved 39 with rating रेटिंग : 1/5\n", 144 | "Saved 40 with rating रेटिंग : 2/5\n", 145 | "Saved 41 with rating रेटिंग : 3/5\n", 146 | "Saved 42 with rating रेटिंग : 3.5/5\n", 147 | "Saved 43 with rating रेटिंग : 3.5 /5\n", 148 | "Saved 44 with rating रेटिंग : 2.5/5\n", 149 | "Saved 45 with rating रेटिंग : 1.5/5\n", 150 | "Saved 46 with rating रेटिंग : 0.5/5\n", 151 | "Saved 47 with rating रेटिंग : 3.5/5\n", 152 | "Saved 48 with rating सेंसर सर्टिफिकेट : यूए * 2 घंटे 5 मिनट 28 सेकंड रेटिंग : 2/5\n", 153 | "Saved 49 with rating रेटिंग : 2.5/5\n", 154 | "Saved 50 with rating रेटिंग : 1.5/5\n", 155 | "Saved 51 with rating रेटिंग : 3.5/5\n", 156 | "Saved 52 with rating रेटिंग : 2.5/5\n", 157 | "Saved 53 with rating रेटिंग : 2.5/5\n", 158 | "Saved 54 with rating रेटिंग : 4/5\n", 159 | "Saved 55 with rating रेटिंग : 1.5/5\n", 160 | "Saved 56 with rating रेटिंग : 1/5\n", 161 | "Saved 57 with rating रेटिंग : 1.5/5\n", 162 | "Saved 58 with rating रेटिंग : 2.5/5\n", 163 | "Saved 59 with rating रेटिंग : 2/5\n", 164 | "Saved 60 with rating रेटिंग : 3/5\n", 165 | "Saved 61 with rating रेटिंग : 2/5\n", 166 | "Saved 62 with rating रेटिंग : 2/5\n", 167 | "Saved 63 with rating रेटिंग : 3/5\n", 168 | "Saved 64 with rating रेटिंग : 3/5\n", 169 | "Saved 65 with rating रेटिंग : 3/5\n", 170 | "Saved 66 with rating रेटिंग : 2/5\n", 171 | "Saved 67 with rating रेटिंग : 4.2/5\n", 172 | "Saved 68 with rating रेटिंग : 2.5/5\n", 173 | "Saved 69 with rating रेटिंग : 2/5\n", 174 | "Saved 70 with rating रेटिंग : 3.5/5\n", 175 | "Saved 71 with rating रेटिंग : 2.5/5\n", 176 | "Saved 72 with rating रेटिंग : 3/5\n", 177 | "Saved 73 with rating रेटिंग : 2/5\n", 178 | "Saved 74 with rating रेटिंग : 1.5/5\n", 179 | "Saved 75 with rating रेटिंग : 1/5\n", 180 | "Saved 76 with rating रेटिंग : 1.5/5\n", 181 | "Saved 77 with rating रेटिंग : 4/5\n", 182 | "Saved 78 with rating रेटिंग : 3/5\n", 183 | "Saved 79 with rating रेटिंग : 2/5\n", 184 | "Saved 80 with rating रेटिंग : 1/5\n", 185 | "Saved 81 with rating रेटिंग : 4/5\n", 186 | 
"Saved 82 with rating रेटिंग : 1.5/5\n", 187 | "Saved 83 with rating रेटिंग : 2/5\n", 188 | "Saved 84 with rating रेटिंग : 2.5/5\n", 189 | "Saved 85 with rating रेटिंग : 2.5/5\n", 190 | "Saved 86 with rating रेटिंग : 2/5 \n", 191 | "\t ved 87 with rating रेटिंग : 3.5/5 \n", 192 | "\tरेटिंग : 2/5 rating कलाकार : विद्युत जामवाल, अदा शर्मा, फ्रेडी दारूवाला, अनूप सिंह, शैफाली शाह, सतीश कौशिश\n", 193 | "Saved 89 with rating रेटिंग : 2.5/5 \n", 194 | "Saved 90 with rating रेटिंग : 3/5 \n", 195 | "Saved 91 with rating रेटिंग : 3.5/5 \n", 196 | "\tरेटिंग : 2/5 rating सेंसर सर्टिफिकेट : यूए * 2 घंटे 19 मिनट 48 सेकंड्स >\n", 197 | "Saved 93 with rating रेटिंग : 3.5/5 \n", 198 | "Saved 94 with rating रेटिंग : 1/5 \n", 199 | "Saved 95 with rating रेटिंग : 3/5 \n", 200 | "Saved 96 with rating रेटिंग : 3/5 \n", 201 | "Saved 97 with rating रेटिंग : 4/5 \n", 202 | "Saved 98 with rating रेटिंग : 1/5 \n", 203 | "Saved 99 with rating रेटिंग : 2.5/5 \n", 204 | "Saved 100 with rating रेटिंग : 3/5 \n", 205 | "Saved 101 with rating रेटिंग : 3/5\n", 206 | "Saved 102 with rating रेटिंग : 3/5 \n", 207 | "Saved 103 with rating रेटिंग : 2/5 \n", 208 | "Saved 104 with rating रेटिंग : 2/5 \n", 209 | "Saved 105 with rating रेटिंग : 2.5/5 \n", 210 | "Saved 106 with rating रेटिंग : 2/5 \n", 211 | "Saved 107 with rating रेटिंग : 0.5/5 \n", 212 | "Saved 108 with rating रेटिंग : 2/5 \n", 213 | "Saved 109 with rating रेटिंग : 3.5/5 \n", 214 | "Saved 110 with rating रेटिंग : 3.5/5\n", 215 | "Saved 111 with rating रेटिंग : 4/5 \n", 216 | "Saved 112 with rating रेटिंग : 1/5 \n", 217 | "Saved 113 with rating रेटिंग : 3/5 \n", 218 | "Saved 114 with rating रेटिंग : 3/5 \n", 219 | "Saved 115 with rating रेटिंग : 1.5/5 \n", 220 | "Saved 116 with rating रेटिंग : 3/5 \n", 221 | "\t\t\tरेटिंग : 1.5/5 ing सेंसर सर्टिफिकेट : यूए * 2 घंटे 35 मिनट\n", 222 | "Saved 118 with rating रेटिंग : 2.5/5 \n", 223 | "Saved 119 with rating रेटिंग : 3.5/5 \n", 224 | "Saved 120 with rating रेटिंग : 2.5/5 \n", 225 | "Saved 121 with rating रेटिंग : 1.5/5 \n", 226 | "Saved 122 with rating रेटिंग : 1/5 \n", 227 | "Saved 123 with rating रेटिंग : 0.5/5 \n", 228 | "Saved 124 with rating रेटिंग : 3/5 \n", 229 | "\tरेटिंग : 2/5h rating 1 घंटा 50 मिनट >\n", 230 | "Saved 126 with rating रेटिंग : 3/5 \n", 231 | "Saved 127 with rating रेटिंग : 2.5/5 \n", 232 | "Saved 128 with rating रेटिंग : 3.5/5 \n", 233 | "Saved 129 with rating रेटिंग : 1.5/5 \n", 234 | "Saved 130 with rating रेटिंग : 3/5 \n", 235 | "Saved 131 with rating रेटिंग : 2/5 \n", 236 | "Saved 132 with rating रेटिंग : 3/5 \n", 237 | "Saved 133 with rating रेटिंग : 2.5/5 \n", 238 | "Saved 134 with rating रेटिंग : 2/5 \n", 239 | "Saved 135 with rating रेटिंग : 1.5/5 \n", 240 | "Saved 136 with rating रेटिंग : 3.5/5 \n", 241 | "Saved 137 with rating रेटिंग : 3/5\n", 242 | "Saved 138 with rating रेटिंग : 3/5\n", 243 | "Saved 139 with rating रेटिंग : 1/5 \n", 244 | "Saved 140 with rating रेटिंग : 2.5/5 \n", 245 | "Saved 141 with rating रेटिंग : 4/5\n", 246 | "Saved 142 with rating रेटिंग : 3/5 \n", 247 | "\t ved 143 with rating रेटिंग : 2/5\n", 248 | "Saved 144 with rating रेटिंग : 3.5/5\n", 249 | "Saved 145 with rating रेटिंग : 2/5 \n", 250 | "Saved 146 with rating रेटिंग : 2.5/5 \n", 251 | "Saved 147 with rating रेटिंग : 3.5/5 \n", 252 | "Saved 148 with rating रेटिंग : 3.5/5 \n", 253 | "Saved 149 with rating रेटिंग : 2/5 \n", 254 | "Saved 150 with rating रेटिंग : 1.5/5 \n", 255 | "Saved 151 with rating रेटिंग : 1/5\n", 256 | "Saved 152 with rating रेटिंग : 2.5/5 \n", 257 | "Saved 153 with rating रेटिंग : 0/5 
\n", 258 | "Saved 154 with rating रेटिंग : 3/5 \n", 259 | "Saved 155 with rating रेटिंग : 2/5 \n", 260 | "Saved 156 with rating रेटिंग : 2.5/5 \n", 261 | "Saved 157 with rating रेटिंग : 3.5/5 \n", 262 | "Saved 158 with rating रेटिंग : 2.5/5 \n", 263 | "Saved 159 with rating रेटिंग : 2/5 \n", 264 | "\t ved 160 with rating रेटिंग : 3.5/5\n", 265 | "Saved 161 with rating रेटिंग : 3.5/5 \n", 266 | "Saved 162 with rating रेटिंग : 2.5/5 \n", 267 | "Saved 163 with rating रेटिंग : 2/5 \n", 268 | "Saved 164 with rating रेटिंग : 4/5\n", 269 | "Saved 165 with rating रेटिंग : 2.5/5 \n", 270 | "Saved 166 with rating रेटिंग : 3/5 \n", 271 | "Saved 167 with rating रेटिंग : 2.5/5 \n", 272 | "Saved 168 with rating रेटिंग : 2/5 \n", 273 | "Saved 169 with rating रेटिंग : 3.5/5 \n", 274 | "Saved 170 with rating रेटिंग : 2/5 \n", 275 | "Saved 171 with rating रेटिंग : 2.5/5 \n", 276 | "Saved 172 with rating रेटिंग : 1.5/5 \n", 277 | "Saved 173 with rating रेटिंग : 1/5 \n", 278 | "\tरेटिंग : 2/5 rating कलाकार : जॉन अब्राहम, श्रुति हासन, अनिल कपूर, नाना पाटेकर, डिम्पल कपाड़िया, नसीरुद्दीन शाह, अंकिता श्रीवास्तव, परेश रावल, राजपाल यादव, रणजीत सेंसर सर्टिफिकेट : यूए * 2 घंटे 33 मिनट\n", 279 | "Saved 175 with rating रेटिंग : 2/5 \n", 280 | "Saved 176 with rating रेटिंग : 1/5 \n", 281 | "Saved 177 with rating रेटिंग : 3.5/ 5\n", 282 | "Saved 178 with rating रेटिंग : 2/5 \n", 283 | "Saved 179 with rating रेटिंग : 3.5/5 \n", 284 | "Saved 180 with rating रेटिंग : 1/5 \n", 285 | "Saved 181 with rating रेटिंग : 3/5\n", 286 | "Saved 182 with rating सेंसर सर्टिफिकेट : यूए * 2 घंटे 39 मिनट 19 सेकंड्स रेटिंग : 4/5 \n", 287 | "Saved 183 with rating रेटिंग : 3/5 \n", 288 | "Saved 184 with rating रेटिंग : 1.5/5 \n", 289 | "Saved 185 with rating रेटिंग : 2.5/5 \n", 290 | "Saved 186 with rating रेटिंग : 3/5 \n", 291 | "Saved 187 with rating रेटिंग : 1.5/5\n", 292 | "Saved 188 with rating रेटिंग : 2.5/5 \n", 293 | "Saved 189 with rating रेटिंग : 1/5\n", 294 | "Saved 190 with rating रेटिंग : 3.5/5 \n", 295 | "Saved 191 with rating रेटिंग : 2/5 \n", 296 | "Saved 192 with rating रेटिंग : 0.5/5 \n", 297 | "Saved 193 with rating रेटिंग : 3.5/5 \n", 298 | "Saved 194 with rating रेटिंग : 3/5 \n", 299 | "Saved 195 with rating रेटिंग : 3.5/5 \n", 300 | "Saved 196 with rating रेटिंग : 1/5\n", 301 | "Saved 197 with rating रेटिंग : 2.5/5 \n", 302 | "Saved 198 with rating रेटिंग : 1.5/5 \n", 303 | "Saved 199 with rating रेटिंग : 2.5/5 \n", 304 | "Saved 200 with rating रेटिंग : 3/5\n", 305 | "Saved 201 with rating रेटिंग : 2/5 \n", 306 | "Saved 202 with rating रेटिंग : 3.5/5 \n", 307 | "Saved 203 with rating रेटिंग : 1/5\n", 308 | "\tरेटिंग : 1.5/5rating कलाकार : नाना पाटेकर, गुल पनाग, आशुतोष राणा, विक्रम गोखले, मोहन आगाशे, गोविंद नामदेव, राज जुत्शी सेंसर सर्टिफिकेट : यूए * 1 घंटा 45 मिनट 40 सेकंड \n", 309 | "Saved 205 with rating रेटिंग : 3/5 \n", 310 | "Saved 206 with rating रेटिंग : 3/5\n", 311 | "Saved 207 with rating रेटिंग : 1.5/5 \n", 312 | "Saved 208 with rating रेटिंग : 2.5/5 \n", 313 | "Saved 209 with rating रेटिंग : 2.5/5 \n" 314 | ] 315 | }, 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "Saved 210 with rating रेटिंग : 1/5\n", 321 | "Saved 211 with rating रेटिंग : 2/5\n", 322 | "Saved 212 with rating रेटिंग : 3/5\n", 323 | "Saved 213 with rating रेटिंग : 2/5\n", 324 | "Saved 214 with rating रेटिंग : 2/5\n", 325 | "Saved 215 with rating रेटिंग :\n", 326 | "Saved 216 with rating रेटिंग : 4/5\n", 327 | "Saved 217 with rating रेटिंग :\n", 328 | "Saved 218 with rating रेटिंग \n", 329 | "Saved 219 with 
rating रेटिंग : 1.5/5\n", 330 | "Saved 220 with rating रेटिंग : 1/5\n", 331 | "Saved 221 with rating रेटिंग : 1/5\n", 332 | "Saved 222 with rating रेटिंग : 3/5 \n", 333 | "Saved 223 with rating रेटिंग : 1/5 \n", 334 | "Saved 224 with rating रेटिंग : 2/5\n", 335 | "Saved 225 with rating रेटिंग : 2/5\n", 336 | "Saved 226 with rating रेटिंग : 3.5/5\n", 337 | "Saved 227 with rating रेटिंग : 2.5/5\n", 338 | "Saved 228 with rating रेटिंग : 1/5\n", 339 | "Saved 229 with rating रेटिंग : 1/5\n", 340 | "\tरेटिंग : 2.5/5 ating सेंसर सर्टिफिकेट : यू * 2 घंटे 59 मिनट 50 सेकंड\n", 341 | "Saved 231 with rating रेटिंग : 1.5/5\n", 342 | "Saved 232 with rating रेटिंग : 0.5/5\n", 343 | "Saved 233 with rating रेटिंग : 3.5/5\n", 344 | "Saved 234 with rating रेटिंग : 3/5\n", 345 | "Saved 235 with rating रेटिंग : 1/5 \n", 346 | "Saved 236 with rating रेटिंग : 3/5 \n", 347 | "Saved 237 with rating रेटिंग : 3/5\n", 348 | "Saved 238 with rating रेटिंग : 1.5/5 \n", 349 | "Saved 239 with rating रेटिंग : 3/5\n", 350 | "Saved 240 with rating रेटिंग : \n", 351 | "Saved 241 with rating फिल्म समीक्षा का शेष भाग और रेटिंग अगले पेज पर... \n", 352 | "Saved 242 with rating रेटिंग : 2.5/5\n", 353 | "Saved 243 with rating रेटिंग : 3/5\n", 354 | "Saved 244 with rating रेटिंग : 3/5\n", 355 | "Saved 247 with rating रेटिंग : 2.5/5\n", 356 | "Saved 248 with rating रेटिंग : 3/5\n", 357 | "Saved 249 with rating फिल्म समीक्षा का शेष भाग और रेटिंग... अगले पेज पर\n", 358 | "Saved 250 with rating रेटिंग : 0.5/5\n", 359 | "Saved 251 with rating रेटिंग : 3/5\n", 360 | "Saved 252 with rating रेटिंग : 1/5\n", 361 | "Saved 253 with rating रेटिंग : 2/5\n", 362 | "Saved 254 with rating रेटिंग : 2.5/5\n", 363 | "Saved 255 with rating रेटिंग : 3/5\n", 364 | "Saved 256 with rating रेटिंग : 0.5/5\n", 365 | "Saved 257 with rating रेटिंग : 1/5\n", 366 | "Saved 258 with rating रेटिंग : 2.5/5\n", 367 | "Saved 259 with rating रेटिंग : 3.5/5\n", 368 | "Saved 260 with rating रेटिंग : 3/5\n", 369 | "Saved 261 with rating रेटिंग : 2/5\n", 370 | "Saved 262 with rating रेटिंग : 3/5\n", 371 | "Saved 263 with rating रेटिंग : 2.5/5\n", 372 | "Saved 264 with rating रेटिंग : 3.5/5\n", 373 | "Saved 265 with rating रेटिंग : 1/5\n", 374 | "Saved 266 with rating रेटिंग : 0.5/5\n", 375 | "Saved 267 with rating रेटिंग : 2/5\n", 376 | "Saved 268 with rating रेटिंग : 3/5\n", 377 | "Saved 272 with rating रेटिंग : 3.5/5\n", 378 | "Saved 273 with rating रेटिंग : 1/5\n", 379 | "Saved 275 with rating /5\n", 380 | "Saved 276 with rating रेटिंग : 0.5/5\n", 381 | "Saved 277 with rating रेटिंग : 2/5\n", 382 | "Saved 278 with rating रेटिंग : 1.5/5\n", 383 | "Saved 279 with rating रेटिंग : 3.5/5\n", 384 | "Saved 280 with rating रेटिंग : 3/5\n", 385 | "Saved 281 with rating रेटिंग : 1.5/5\n", 386 | "Saved 282 with rating रेटिंग : 3/5\n", 387 | "Saved 283 with rating रेटिंग : 2.5/5\n", 388 | "Saved 284 with rating रेटिंग : 3.5/5\n", 389 | "Saved 285 with rating रेटिंग : 1/5\n", 390 | "Saved 286 with rating रेटिंग : 2.5/5\n", 391 | "Saved 287 with rating रेटिंग : 0/5\n", 392 | "Saved 288 with rating रेटिंग : 1/5\n", 393 | "Saved 289 with rating रेटिंग : 3.5/5\n", 394 | "Saved 290 with rating रेटिंग : 5/5\n", 395 | "Saved 291 with rating रेटिंग : 3.5/5\n", 396 | "Saved 292 with rating रेटिंग : 1/5\n", 397 | "Saved 293 with rating रेटिंग : 2\n", 398 | "Saved 294 with rating रेटिंग : 2/5\n", 399 | "Saved 295 with rating रेटिंग : 1/5\n", 400 | "Saved 296 with rating \n", 401 | "Saved 297 with rating रेटिंग : 2.5/5\n", 402 | "Saved 299 with rating रेटिंग : 3/5\n", 403 | "Saved 300 with 
rating रेटिंग : 2/5\n", 404 | "Saved 301 with rating रेटिंग : 1.5/5\n", 405 | "Saved 302 with rating रेटिंग : 2.5/5\n", 406 | "Saved 303 with rating रेटिंग : 2/5\n", 407 | "Saved 304 with rating रेटिंग : 3.5/5\n", 408 | "Saved 305 with rating रेटिंग : 1.5/5\n", 409 | "Saved 306 with rating रेटिंग : 0/5\n", 410 | "Saved 307 with rating रेटिंग : 2 /5\n", 411 | "Saved 308 with rating फिल्म समीक्षा का शेष भाग और रेटिंग अगले पेज पर..\n", 412 | "Saved 309 with rating रेटिंग : 2/5\n", 413 | "Saved 310 with rating रेटिंग : 3.5/5\n", 414 | "Saved 311 with rating फिल्म समीक्षा का शेष भाग और रेटिंग अगले पेज पर... \n", 415 | "Saved 312 with rating रेटिंग : 2/5\n", 416 | "Saved 313 with rating रेटिंग : 3/5\n", 417 | "Saved 314 with rating रेटिंग : 1/5\n", 418 | "Saved 315 with rating रेटिंग : 1/5\n", 419 | "Saved 316 with rating रेटिंग : 1.5/5\n", 420 | "Saved 317 with rating रेटिंग : 2.5/5\n", 421 | "Saved 318 with rating /5\n", 422 | "Saved 319 with rating रेटिंग : 1.5/5\n", 423 | "Saved 320 with rating रेटिंग : 3.5/5\n", 424 | "Saved 321 with rating रेटिंग : 2.5/5\n", 425 | "Saved 322 with rating रेटिंग : 2.5/5\n", 426 | "Saved 323 with rating रेटिंग : 2.5/5\n", 427 | "Saved 324 with rating रेटिंग : 1/5\n", 428 | "Saved 325 with rating रेटिंग : 3/5\n", 429 | "Saved 327 with rating रेटिंग : 1.5/5\n", 430 | "Saved 328 with rating \n", 431 | "Saved 330 with rating रेटिंग : 2.5/5\n", 432 | "Saved 331 with rating रेटिंग : 1.5/5\n", 433 | "Saved 332 with rating रेटिंग : 2/5\n", 434 | "Saved 333 with rating रेटिंग : 3.5/5\n", 435 | "Saved 334 with rating रेटिंग : 3/5\n", 436 | "Saved 335 with rating /5\n", 437 | "Saved 336 with rating रेटिंग : 3/5\n", 438 | "Saved 337 with rating रेटिंग : 3/5\n", 439 | "Saved 338 with rating रेटिंग : 2/5\n", 440 | "Saved 339 with rating रेटिंग : 2/5\n", 441 | "Saved 340 with rating रेटिंग : 3/5\n", 442 | "Saved 341 with rating रेटिंग : 2.5/5\n", 443 | "Saved 342 with rating रेटिंग : 3/5\n", 444 | "Saved 343 with rating रेटिंग : 2/5\n", 445 | "Saved 344 with rating रेटिंग : 3/5\n", 446 | "Saved 345 with rating रेटिंग : 4/5\n", 447 | "Saved 346 with rating रेटिंग : 2/5\n", 448 | "Saved 347 with rating रेटिंग : 3.5/5\n", 449 | "Saved 348 with rating रेटिंग : 2/5\n", 450 | "Saved 349 with rating रेटिंग : 2.5/5\n", 451 | "Saved 350 with rating रेटिंग : 2.5/5\n", 452 | "Saved 351 with rating रेटिंग : 2.5/5\n", 453 | "Saved 352 with rating रेटिंग : 2.5/5\n", 454 | "Saved 353 with rating रेटिंग : 3/5\n", 455 | "Saved 354 with rating रेटिंग 3/5\n", 456 | "Saved 355 with rating रेटिंग : 2.5/5\n", 457 | "Saved 356 with rating रेटिंग : 3/5\n", 458 | "Saved 357 with rating रेटिंग : 2.5/5\n", 459 | "Saved 358 with rating रेटिंग : 3/5\n", 460 | "Saved 359 with rating रेटिंग : 4.5/5\n", 461 | "Saved 360 with rating रेटिंग : 3.5/5\n", 462 | "Saved 361 with rating रेटिंग : 3/5\n", 463 | "Saved 362 with rating रेटिंग : 2.5/5\n", 464 | "Saved 363 with rating रेटिंग : 3.5/5\n", 465 | "Saved 364 with rating रेटिंग : 3/5\n", 466 | "Saved 365 with rating रेटिंग : 2/5\n", 467 | "Saved 366 with rating रेटिंग : 3.5/5\n", 468 | "Saved 367 with rating रेटिंग : 1.5/5\n", 469 | "Saved 368 with rating रेटिंग : 1/5\n", 470 | "Saved 369 with rating रेटिंग : 1.5/5\n", 471 | "Saved 370 with rating /5\n", 472 | "Saved 371 with rating रेटिंग : 3/5\n", 473 | "Saved 372 with rating रेटिंग : 1/5\n", 474 | "Saved 373 with rating रेटिंग : 3.5/5\n", 475 | "Saved 375 with rating रेटिंग : 2/5\n", 476 | "Saved 376 with rating रेटिंग : 2.5/5\n", 477 | "Saved 377 with rating रेटिंग : 3/5\n", 478 | "Saved 378 with rating 
रेटिंग : 3/5\n", 479 | "Saved 379 with rating /5\n", 480 | "Saved 380 with rating रेटिंग : 3/5\n", 481 | "Saved 381 with rating रेटिंग : 3/5\n", 482 | "Saved 382 with rating रेटिंग : 3/5\n", 483 | "Saved 383 with rating रेटिंग : 3/5\n", 484 | "Saved 384 with rating रेटिंग : 1.5/5\n", 485 | "Saved 385 with rating रेटिंग : 2/5\n", 486 | "Saved 386 with rating रेटिंग : 2/5\n", 487 | "Saved 387 with rating रेटिंग : 4.5/5\n", 488 | "Saved 388 with rating रेटिंग : 2/5\n", 489 | "Saved 389 with rating रेटिंग : 3/5\n", 490 | "Saved 390 with rating रेटिंग : 1/5\n", 491 | "Saved 391 with rating रेटिंग : 2/5\n", 492 | "Saved 392 with rating रेटिंग : 3/5\n", 493 | "Saved 393 with rating रेटिंग : 1.5/5\n", 494 | "Saved 394 with rating रेटिंग : 2.5/5\n", 495 | "Saved 397 with rating रेटिंग : 3.5/5\n", 496 | "Saved 400 with rating रेटिंग : 3.5/5\n", 497 | "Saved 401 with rating रेटिंग : 3/5\n", 498 | "Saved 402 with rating रेटिंग : 1.5/5\n", 499 | "Saved 403 with rating रेटिंग : 1/5\n", 500 | "Saved 404 with rating रेटिंग : 3.5/5\n", 501 | "Saved 405 with rating रेटिंग : 2/5\n", 502 | "Saved 406 with rating रेटिंग : 3/5\n", 503 | "Saved 408 with rating रेटिंग : 0.5/5\n", 504 | "Saved 409 with rating रेटिंग : 1/5\n", 505 | "Saved 411 with rating रेटिंग : 3/5\n", 506 | "Saved 412 with rating रेटिंग : 3/5\n", 507 | "Saved 413 with rating रेटिंग : 2/5\n", 508 | "Saved 414 with rating रेटिंग : 3.5/5\n", 509 | "Saved 415 with rating रेटिंग : 2/5\n", 510 | "Saved 417 with rating रेटिंग : 3/5\n", 511 | "Saved 418 with rating /5\n", 512 | "Saved 419 with rating रेटिंग : 3/5\n", 513 | "Saved 421 with rating रेटिंग : 1/5\n", 514 | "Saved 422 with rating रेटिंग : 1/5\n", 515 | "Saved 424 with rating रेटिंग : 1.5/5\n", 516 | "Saved 425 with rating रेटिंग : 2.5/5\n", 517 | "Saved 426 with rating रेटिंग : 2.5/5\n", 518 | "Saved 427 with rating रेटिंग : 3.5/5\n", 519 | "Saved 428 with rating /5\n", 520 | "Saved 431 with rating रेटिंग : 0.5/5\n", 521 | "Saved 432 with rating रेटिंग : 3/5\n", 522 | "Saved 433 with rating रेटिंग : 2/5\n", 523 | "Saved 434 with rating रेटिंग : 2/5\n", 524 | "Saved 435 with rating /5\n", 525 | "Saved 436 with rating रेटिंग : 3.5/5\n", 526 | "Saved 437 with rating रेटिंग : 3.5/5\n", 527 | "Saved 438 with rating रेटिंग : 4/5\n", 528 | "Saved 439 with rating रेटिंग : 3/5\n", 529 | "Saved 441 with rating रेटिंग : 2/5\n", 530 | "Saved 442 with rating रेटिंग : 3/5\n", 531 | "Saved 443 with rating रेटिंग : 2/5\n", 532 | "Saved 444 with rating रेटिंग : 2/5\n", 533 | "Saved 445 with rating रेटिंग : 1.5/5\n", 534 | "Saved 446 with rating रेटिंग : 1/5\n", 535 | "Saved 447 with rating रेटिंग : 3/5\n", 536 | "Saved 449 with rating रेटिंग : 3/5\n", 537 | "Saved 450 with rating रेटिंग : 2.5/5\n", 538 | "Saved 451 with rating रेटिंग : 2/5\n", 539 | "Saved 453 with rating रेटिंग : 2.5/5\n", 540 | "Saved 455 with rating रेटिंग : 2.5/5\n", 541 | "Saved 456 with rating रेटिंग : 2/5\n", 542 | "Saved 457 with rating रेटिंग : 3/5\n", 543 | "Saved 458 with rating रेटिंग : 2.5/5\n", 544 | "Saved 459 with rating रेटिंग : 2.5/5\n", 545 | "Saved 460 with rating रेटिंग : 2/5\n", 546 | "Saved 461 with rating रेटिंग : 2.5/5\n", 547 | "Saved 462 with rating /5\n" 548 | ] 549 | }, 550 | { 551 | "name": "stdout", 552 | "output_type": "stream", 553 | "text": [ 554 | "Saved 463 with rating रेटिंग : 1/5\n", 555 | "Saved 465 with rating रेटिंग : 2.5/5\n", 556 | "Saved 466 with rating रेटिंग : 2/5\n", 557 | "Saved 467 with rating रेटिंग : 3/5\n", 558 | "Saved 468 with rating रेटिंग : 2.5/5\n", 559 | "Saved 469 with rating रेटिंग 
: 3/5\n", 560 | "Saved 470 with rating रेटिंग : 2/5\n", 561 | "Saved 471 with rating रेटिंग : 3/5\n", 562 | "Saved 472 with rating रेटिंग : 3/5\n", 563 | "Saved 473 with rating रेटिंग : 3/5\n", 564 | "Saved 474 with rating रेटिंग : 2/5\n", 565 | "Saved 475 with rating रेटिंग : 2/5\n", 566 | "Saved 476 with rating रेटिंग : 3/5\n", 567 | "Saved 477 with rating \n", 568 | "Saved 478 with rating रेटिंग : 2.5/5\n", 569 | "Saved 479 with rating रेटिंग : 2.5/5\n", 570 | "Saved 480 with rating रेटिंग : 3/5\n", 571 | "Saved 481 with rating रेटिंग : 1.5/5\n", 572 | "Saved 483 with rating रेटिंग : 3/5\n", 573 | "Saved 484 with rating रेटिंग : 2/5\n", 574 | "Saved 485 with rating रेटिंग : 3/5\n", 575 | "Saved 486 with rating \n", 576 | "Saved 487 with rating रेटिंग : 2.5/5\n", 577 | "Saved 488 with rating रेटिंग : 2/5\n", 578 | "Saved 489 with rating रेटिंग : 1.5/5\n", 579 | "Saved 490 with rating रेटिंग : 2/5\n", 580 | "Saved 491 with rating रेटिंग : 0.5/5\n", 581 | "Saved 492 with rating रेटिंग : 2.5/5\n", 582 | "Saved 494 with rating /5\n", 583 | "Saved 495 with rating रेटिंग :2/5\n", 584 | "Saved 496 with rating रेटिंग : 2/5\n", 585 | "Saved 498 with rating रेटिंग : 2.5/5\n", 586 | "Saved 499 with rating रेटिंग : 1/5\n", 587 | "Saved 500 with rating रेटिंग : 2.5/5\n", 588 | "Saved 501 with rating रेटिंग : 3.5/5\n", 589 | "Saved 502 with rating रेटिंग : 2.5/5\n", 590 | "Saved 503 with rating /5\n", 591 | "Saved 506 with rating रेटिंग : 1.5/5\n", 592 | "Saved 507 with rating रेटिंग : 2/5\n", 593 | "Saved 508 with rating रेटिंग : 3/5\n", 594 | "Saved 509 with rating रेटिंग : 3/5\n", 595 | "Saved 510 with rating रेटिंग : 2.5/5\n", 596 | "Saved 511 with rating रेटिंग :1/5\n", 597 | "Saved 512 with rating रेटिंग : 3/5\n", 598 | "Saved 513 with rating रेटिंग : 2.5/5\n", 599 | "Saved 514 with rating रेटिंग : 3/5\n", 600 | "Saved 515 with rating रेटिंग : 2/5\n", 601 | "Saved 516 with rating रेटिंग : 1.5/5\n", 602 | "Saved 517 with rating रेटिंग : 2.5/5\n", 603 | "Saved 519 with rating रेटिंग : 1.5/5\n", 604 | "Saved 521 with rating रेटिंग : 1.5/5\n", 605 | "Saved 523 with rating रेटिंग : 2/5\n", 606 | "Saved 524 with rating रेटिंग : 3/5\n", 607 | "Saved 525 with rating रेटिंग : 1.5/5\n", 608 | "Saved 526 with rating रेटिंग : 3/5\n", 609 | "Saved 529 with rating रेटिंग : 2.5/5\n", 610 | "Saved 530 with rating रेटिंग : 3/5\n", 611 | "Saved 531 with rating रेटिंग : 2/5\n", 612 | "Saved 532 with rating रेटिंग : 3/5\n", 613 | "Saved 533 with rating रेटिंग : 1.5/5\n", 614 | "Saved 534 with rating रेटिंग : 3.5/5\n", 615 | "Saved 538 with rating रेटिंग : 3/5\n", 616 | "Saved 539 with rating रेटिंग : 3/5\n", 617 | "Saved 540 with rating रेटिंग : 2.5/5\n", 618 | "Saved 541 with rating रेटिंग : 1/5\n", 619 | "Saved 543 with rating रेटिंग : 3.5/5\n", 620 | "Saved 544 with rating रेटिंग : 3.5/5\n", 621 | "Saved 545 with rating रेटिंग : 2.5/5\n", 622 | "Saved 546 with rating रेटिंग : 2/5\n", 623 | "Saved 547 with rating /5\n", 624 | "Saved 548 with rating रेटिंग : 2.5/5\n", 625 | "Saved 549 with rating रेटिंग : 2/5\n", 626 | "Saved 550 with rating रेटिंग : 3/5\n", 627 | "Saved 551 with rating रेटिंग : 2/5\n", 628 | "Saved 552 with rating रेटिंग : 2.5/5\n", 629 | "Saved 553 with rating रेटिंग : 3.5/5\n", 630 | "Saved 554 with rating रेटिंग : 2/5\n", 631 | "Saved 555 with rating रेटिंग : 2.5/5\n", 632 | "Saved 557 with rating रेटिंग : 1/5\n", 633 | "Saved 558 with rating रेटिंग : 2/5\n", 634 | "Saved 559 with rating रेटिंग : 3/5\n", 635 | "Saved 560 with rating रेटिंग : 2/5\n", 636 | "Saved 561 with rating रेटिंग : 
3/5\n", 637 | "Saved 562 with rating रेटिंग : 2.5/5\n", 638 | "Saved 563 with rating रेटिंग : 2.5/5\n", 639 | "Saved 564 with rating रेटिंग : 3/5\n", 640 | "Saved 568 with rating रेटिंग : 2/5\n", 641 | "Saved 569 with rating रेटिंग : 2/5\n", 642 | "Saved 572 with rating रेटिंग : 1.5/5\n", 643 | "Saved 573 with rating रेटिंग : 2.5/5\n", 644 | "Saved 575 with rating रेटिंग : 2.5/5\n", 645 | "Saved 577 with rating रेटिंग : 0/5\n", 646 | "Saved 578 with rating रेटिंग : 2.5/5\n", 647 | "Saved 579 with rating रेटिंग : 2/5\n", 648 | "Saved 583 with rating रेटिंग : 2.5/5\n", 649 | "Saved 584 with rating रेटिंग : 2/5\n", 650 | "Saved 585 with rating रेटिंग : 3/5\n", 651 | "Saved 586 with rating रेटिंग : 1.5/5\n", 652 | "Saved 587 with rating रेटिंग : 2.5/5\n", 653 | "Saved 588 with rating रेटिंग : 0.5/5\n", 654 | "Saved 590 with rating रेटिंग : 1.5/5\n", 655 | "Saved 592 with rating रेटिंग : 2/5\n", 656 | "Saved 594 with rating रेटिंग : 1/5\n", 657 | "Saved 596 with rating रेटिंग : 1/5\n", 658 | "Saved 600 with rating रेटिंग : 2/5\n", 659 | "Saved 601 with rating रेटिंग : 3/5\n", 660 | "Saved 603 with rating रेटिंग : 3/5\n", 661 | "Saved 606 with rating रेटिंग : 3.5/5\n", 662 | "Saved 607 with rating रेटिंग : 1/5\n", 663 | "Saved 608 with rating रेटिंग : 3/5\n", 664 | "Saved 611 with rating रेटिंग : 3/5\n", 665 | "Saved 615 with rating रेटिंग : 0.5/5\n", 666 | "Saved 617 with rating /5\n", 667 | "Saved 619 with rating रेटिंग : 2/5\n", 668 | "Saved 623 with rating रेटिंग : 2/5\n", 669 | "Saved 624 with rating रेटिंग : 3/5\n", 670 | "Saved 626 with rating रेटिंग : 3/5\n", 671 | "Saved 627 with rating रेटिंग : 3.5/5\n", 672 | "Saved 630 with rating रेटिंग : 1.5/5\n", 673 | "Saved 631 with rating रेटिंग : 3.5/5\n", 674 | "Saved 632 with rating रेटिंग : 1.5/5\n", 675 | "Saved 633 with rating रेटिंग : 2.5/5\n", 676 | "Saved 636 with rating रेटिंग : 2/5\n", 677 | "Saved 638 with rating रेटिंग : 2.5/5\n", 678 | "Saved 650 with rating रेटिंग : 3/5\n", 679 | "Saved 651 with rating रेटिंग : 2.5/5\n", 680 | "Saved 652 with rating रेटिंग : 2/5\n", 681 | "Saved 653 with rating रेटिंग : 3/5\n", 682 | "Saved 654 with rating रेटिंग : 2/5\n", 683 | "Saved 657 with rating रेटिंग : 3/5\n", 684 | "Saved 658 with rating रेटिंग : 0.5/5\n", 685 | "Saved 662 with rating रेटिंग : 2.5/5\n", 686 | "Saved 664 with rating रेटिंग : 3.5/5\n", 687 | "Saved 665 with rating रेटिंग : 2/5\n", 688 | "Saved 667 with rating रेटिंग : 1.5/5\n", 689 | "Saved 670 with rating रेटिंग : 0.5/5\n", 690 | "Saved 672 with rating रेटिंग : 1/5\n", 691 | "Saved 673 with rating रेटिंग : 2.5/5\n", 692 | "Saved 674 with rating रेटिंग : 2.5/5\n", 693 | "Saved 675 with rating रेटिंग 2.5/5\n", 694 | "Saved 676 with rating रेटिंग : 2.5/5\n", 695 | "Saved 678 with rating रेटिंग : 1.5\n", 696 | "Saved 681 with rating रेटिंग : * * 1/2\n", 697 | "Saved 683 with rating रेटिंग : \n", 698 | "Saved 684 with rating रेटिंग : \n", 699 | "Saved 685 with rating रेटिंग : 2.5/5\n", 700 | "Saved 687 with rating रेटिंग : 1.5/5\n", 701 | "Saved 688 with rating रेटिंग : 2.5/5\n", 702 | "Saved 690 with rating रेटिंग 3.5/5\n", 703 | "Saved 691 with rating रेटिंग : 2.5/5\n", 704 | "Saved 692 with rating रेटिंग : 0/5\n", 705 | "Saved 693 with rating रेटिंग : 2/5\n", 706 | "Saved 694 with rating रेटिंग : 3/5\n", 707 | "Saved 696 with rating \n", 708 | "Saved 698 with rating रेटिंग : 1/5\n", 709 | "Saved 699 with rating रेटिंग : 1.5/5\n", 710 | "Saved 700 with rating रेटिंग : 2.5/5\n", 711 | "Saved 701 with rating रेटिंग : 2/5\n", 712 | "Saved 703 with rating रेटिंग : 3.5/5\n", 713 
| "Saved 704 with rating रेटिंग : 2.5/5\n", 714 | "Saved 705 with rating रेटिंग : 2.5/5\n", 715 | "Saved 707 with rating रेटिंग : 2.5/5\n", 716 | "Saved 708 with rating रेटिंग : 3.5/5\n", 717 | "Saved 710 with rating रेटिंग : 2.5/5\n", 718 | "Saved 711 with rating रेटिंग : 1.5/5\n", 719 | "Saved 712 with rating रेटिंग : 2.5/5\n", 720 | "Saved 714 with rating रेटिंग : 1/5\n", 721 | "Saved 715 with rating \n", 722 | "Saved 716 with rating रेटिंग : 3/5\n", 723 | "Saved 717 with rating \n", 724 | "Saved 718 with rating \n", 725 | "Saved 719 with rating रेटिंग : 2.5/5\n", 726 | "Saved 720 with rating रेटिंग : 2/5\n", 727 | "Saved 721 with rating रेटिंग : 4/5\n", 728 | "Saved 722 with rating रेटिंग : 3/5\n", 729 | "Saved 723 with rating रेटिंग : 2.5/5\n", 730 | "Saved 724 with rating रेटिंग : 3/5\n", 731 | "Saved 725 with rating रेटिंग : 2/5\n", 732 | "Saved 726 with rating रेटिंग : 2/5\n", 733 | "Saved 727 with rating रेटिंग : 3/5\n", 734 | "Saved 728 with rating रेटिंग : 2/5\n", 735 | "Saved 729 with rating रेटिंग : 3/5\n", 736 | "Saved 730 with rating रेटिंग : 1.5/5\n", 737 | "Saved 731 with rating रेटिंग : 3/5\n", 738 | "Saved 732 with rating रेटिंग : 2.5/5\n", 739 | "Saved 733 with rating रेटिंग : 0.5/5\n", 740 | "Saved 734 with rating रेटिंग : 3/5\n", 741 | "Saved 735 with rating रेटिंग : 2/5\n", 742 | "Saved 736 with rating रेटिंग : 1.5/5\n", 743 | "Saved 738 with rating रेटिंग : 2.5/5\n", 744 | "Saved 740 with rating रेटिंग : 1.5/5\n", 745 | "Saved 741 with rating रेटिंग : 2/5\n", 746 | "Saved 742 with rating रेटिंग : 1/5\n", 747 | "Saved 743 with rating रेटिंग : 4/5\n", 748 | "Saved 746 with rating रेटिंग : 1.5/5\n" 749 | ] 750 | } 751 | ], 752 | "source": [ 753 | "# We have ratings from -17\n", 754 | "articlesAndRating = {}\n", 755 | "for c, url in enumerate(all_articles):\n", 756 | " html_doc = ''\n", 757 | " url = urllib.parse.urlsplit(url)\n", 758 | " url = list(url)\n", 759 | " url[2] = urllib.parse.quote(url[2])\n", 760 | " url = urllib.parse.urlunsplit(url)\n", 761 | " with urlopen(url) as response:\n", 762 | " for line in response:\n", 763 | " line = line.decode('utf-8')\n", 764 | " html_doc = html_doc + line.replace('\\n','')\n", 765 | " soup = BeautifulSoup(html_doc, 'html.parser')\n", 766 | " main_div = soup.find('div',{'id':'ram'})\n", 767 | " divs = main_div.find_all('div', {'class':None, 'id':None})\n", 768 | " divs = divs + soup.find_all('font')\n", 769 | " if len(divs) == 0:\n", 770 | " divs = soup.find_all('div')\n", 771 | " article = ''\n", 772 | " rating = ''\n", 773 | " for div in divs:\n", 774 | " if 'google' in div.text or ('रेटिंग' and '/5') in div.text:\n", 775 | " if ('रेटिंग' and '/5') or ('रेटिंग' and '/ 5') in div.text:\n", 776 | " rating = div.text.strip(' \\t\\n\\r').replace(u'\\xa0', u' ').replace(u'\\ufeff','')\n", 777 | " continue\n", 778 | " article = article + div.text.strip(' \\t\\n\\r') + '\\n'\n", 779 | " if rating is '':\n", 780 | " elems = soup(text=re.compile(r'रेटिंग'))\n", 781 | " if len(elems):\n", 782 | " rating = elems[0]\n", 783 | " else:\n", 784 | " # calling again with second url\n", 785 | " url = url[:-5] + str(2) + url[-4:]\n", 786 | " html_doc = ''\n", 787 | " url = urllib.parse.urlsplit(url)\n", 788 | " url = list(url)\n", 789 | " url[2] = urllib.parse.quote(url[2])\n", 790 | " url = urllib.parse.urlunsplit(url)\n", 791 | " try:\n", 792 | " with urlopen(url) as response:\n", 793 | " for line in response:\n", 794 | " line = line.decode('utf-8')\n", 795 | " html_doc = html_doc + line.replace('\\n','')\n", 796 | " except:\n", 797 | " 
continue\n", 798 | " soup = BeautifulSoup(html_doc, 'html.parser')\n", 799 | " main_div = soup.find('div',{'id':'ram'})\n", 800 | " divs = main_div.find_all('div', {'class':None, 'id':None})\n", 801 | " divs = divs + soup.find_all('font')\n", 802 | " if len(divs) == 0:\n", 803 | " divs = soup.find_all('div')\n", 804 | " rating = ''\n", 805 | " for div in divs:\n", 806 | " if 'google' in div.text or ('रेटिंग' and '/5') in div.text:\n", 807 | " if ('रेटिंग' and '/5') or ('रेटिंग' and '/ 5') in div.text:\n", 808 | " rating = div.text.strip(' \\t\\n\\r').replace(u'\\xa0', u' ').replace(u'\\ufeff','')\n", 809 | " continue\n", 810 | " article = article + div.text.strip(' \\t\\n\\r') + '\\n'\n", 811 | " if rating is '':\n", 812 | " elems = soup(text=re.compile(r'रेटिंग'))\n", 813 | " if len(elems):\n", 814 | " rating = elems[0]\n", 815 | " \n", 816 | " article = article.strip(u'\\u200b')\n", 817 | " article = re.sub(r'^https?:\\/\\/.*[\\r\\n]*', '', article)\n", 818 | " article = article.replace(u'\\ufeff','')\n", 819 | " article = article.replace(u'\\xa0', u' ')\n", 820 | " article = article.replace(' ', ' ');\n", 821 | " article = article.replace(' , ', ', ');\n", 822 | " articlesAndRating[c] = {\n", 823 | " 'url': url,\n", 824 | " 'article': article,\n", 825 | " 'rating': rating\n", 826 | " }\n", 827 | " print('Saved ' + str(c) + ' with rating ' + rating)" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 296, 833 | "metadata": {}, 834 | "outputs": [ 835 | { 836 | "data": { 837 | "text/plain": [ 838 | "{'article': \"\\n\\nविश्वरूप के पहले भाग में कहानी और पेश करने के तरीके में जो बिखराव था उसमें सुधार दूसरे भाग में भी नहीं हुआ। बल्कि यह कहना ठीक होगा कि दूसरे भाग में हालात और बिगड़ गए।\\n\\nविश्वरूप 2 पहले भाग का सीक्वल भी है और प्रिक्वल भी। यानी कि कहानी कई बार आगे-पीछे होती है। निर्देशक के रूप में कमल हासन ने कहानी को इस तरीके से पेश किया है दर्शक पूरी तरह कन्फ्यूज होते रहते हैं और ज्यादातर समय पल्ले ही नहीं पड़ता कि यह सब क्या हो रहा है और क्यों हो रहा है?\\n\\nपिछले भाग की तरह ही इस बार भी विज़ाम (कमल हासन) के निशाने पर उमर कुरैशी (राहुल बोस) और सलीम (जयदीप अहलावत) हैं जो न्यूयॉर्क से भाग निकले हैं और उनके निशाने पर भारत है जहां वे धमाकों के जरिये आतंक फैलाना चाहते हैं।\\n\\nलगभग ढाई घंटे की फिल्म है और अपनी बात पर यह आखिरी के 45 मिनट में आती है तब तक कहानी को खूब घुमाया फिराया गया है ताकि दर्शकों को बहलाया जा सके, लेकिन यह सब देखना बोरिंग है। फिल्म आपके सब्र की परीक्षा ले लेती है।\\n\\nकमल हासन की बड़ी असफलता यह है कि वे दर्शकों को फिल्म से बिलकुल भी नहीं जोड़ पाए। उन्होंने बात कहने में लंबा समय लिया है। लंबे सीन रचे हैं और फिल्म बेहद सुस्त रफ्तार से आगे बढ़ती है।\\n\\nकहने को तो यह थ्रिलर है, जिसमें स्काई डाइविंग है, पानी के अंदर की शूटिंग है, हेलिकॉप्टर हैं, फाइटिंग सीन हैं, बम-गोलियां हैं, लेकिन थ्रिल नहीं है। बिना ठोस कहानी के ये सब खोखले नजर आते हैं।\\n\\nइस एक्शन फिल्म में कमल हासन मिसफिट हैं। उन्हें देख लगता ही नहीं कि यह आदमी इतनी बढ़िया फाइटिंग या स्टंट्स कर सकता है। जब फिल्म के हीरो पर ही दर्शकों का विश्वास नहीं जम पाता तो कहानी पर कैसे होगा। इमोशनल सीन में कमल जरूर अपने अभिनय से प्रभावित करते हैं, लेकिन ध्यान रखने वाली बात यह है कि एक एक्शन से सजी थ्रिलर मूवी है।\\n\\nढंग की कहानी न होने पर परदे पर दिखाई तमाम मेहनत को बरबाद होते देखना तकलीफ भरा है। कई लोकेशन पर कहानी को फिल्माया गया है, पैसा खर्च किया गया है, हर सीन को को भव्य बनाने की कोशिश की गई है, लेकिन ढंग की कहानी नहीं चुनी गई।\\n\\nनिर्देशक के रूप में भी कमल हासन निराश करते हैं। गाने बिना सिचुएशन के डाल दिए गए हैं। मां-बेटा वाला ट्रेक इमोशनल जरूर करता है, लेकिन गौर से सोचा जाए तो यह महज फिल्म 
की लंबाई बढ़ाने के काम आता है। कमल ने बहुत सारी बातें फिल्म में कहने की कोशिश की है, लेकिन कहने का सलीका नहीं आया।\\n\\nउमर और सलीम को जितना खतरनाक बताया गया है उन्हें फिल्म में उतने फुटेज भी मिलने चाहिए थे, लेकिन वे फिल्म के आखिरी में ही नजर आते हैं जिससे वे उतने खतरनाक नहीं लगते।\\n\\nफिल्म की दोनों हीरोइनों पूजा कुमार और एंड्रिया जर्मिया का काम अच्छा है। बरसों बाद बड़े परदे पर वहीदा रहमान को देखना अच्छा लगता है। बाकी कलाकार दबे-दबे से रहे क्योंकि उनके रोल ठीक से नहीं लिखे गए थे।\\n\\n'विश्वरूप 2' देखने के बाद फिल्म से ज्यादा अफसोस कमल हासन के लिए होता है।\\n\\nबैनर : राजकमल फिल्म्स इंटरनेशनल, रोहित शेट्टी पिक्चर्स, रिलायंस एंटरटेनमेंट\\nनिर्माता : कमल हासन, चंद्रा हासन\\nनिर्देशक : कमल हासन\\nसंगीत : मोहम्मद घिब्रान\\nकलाकार : कमल हासन, राहुल बोस, पूजा कुमार, शेखर कपूर, जयदीप अहलावत, वहीदा रहमान\\nसेंसर सर्टिफिकेट : यूए * 2 घंटे 24 मिनट 43 सेकंड\\n\",\n", 839 | " 'rating': 'रेटिंग : 2/5',\n", 840 | " 'url': 'http://hindi.webdunia.com/bollywood-movie-review/vishwaroop-2-kamal-haasan-samay-tamrakar-review-in-hindi-118081000051_1.html'}" 841 | ] 842 | }, 843 | "execution_count": 296, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "articlesAndRating[0]" 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": 277, 855 | "metadata": {}, 856 | "outputs": [], 857 | "source": [ 858 | "soup.find('span', {'style':'color:#ff0000;'})" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 28, 864 | "metadata": {}, 865 | "outputs": [ 866 | { 867 | "data": { 868 | "text/plain": [ 869 | "'http://hindi.webdunia.com/bollywood-movie-review/एक-चालीस-की-लास्ट-लोकल-धीमा-सफर-107051800023_1.htm'" 870 | ] 871 | }, 872 | "execution_count": 28, 873 | "metadata": {}, 874 | "output_type": "execute_result" 875 | } 876 | ], 877 | "source": [ 878 | "all_articles[269]" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": 274, 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "s = soup.find('font', {'style':'font-size:11.5pt; color:#FF0000'})" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": 302, 893 | "metadata": {}, 894 | "outputs": [ 895 | { 896 | "data": { 897 | "text/plain": [ 898 | "['रेटिंग : 3.5/5']" 899 | ] 900 | }, 901 | "execution_count": 302, 902 | "metadata": {}, 903 | "output_type": "execute_result" 904 | } 905 | ], 906 | "source": [ 907 | "soup(text=re.compile(r'रेटिंग'))" 908 | ] 909 | }, 910 | { 911 | "cell_type": "code", 912 | "execution_count": 314, 913 | "metadata": {}, 914 | "outputs": [ 915 | { 916 | "data": { 917 | "text/plain": [ 918 | "'http://hindi.webdunia.com/bollywood-movie-review/%25E0%25A4%25AE%25E0%25A4%25BF%25E0%25A4%2595%25E0%25A5%2580-%25E0%25A4%25B5%25E0%25A4%25BE%25E0%25A4%25AF%25E0%25A4%25B0%25E0%25A4%25B8-%25E0%25A4%25AB%25E0%25A4%25BF%25E0%25A4%25B2%25E0%25A5%258D%25E0%25A4%25AE-%25E0%25A4%25B8%25E0%25A4%25AE%25E0%25A5%2580%25E0%25A4%2595%25E0%25A5%258D%25E0%25A4%25B7%25E0%25A4%25BE-113102500043_2.htm'" 919 | ] 920 | }, 921 | "execution_count": 314, 922 | "metadata": {}, 923 | "output_type": "execute_result" 924 | } 925 | ], 926 | "source": [ 927 | "url" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": 316, 933 | "metadata": {}, 934 | "outputs": [ 935 | { 936 | "data": { 937 | "text/plain": [ 938 | "" 939 | ] 940 | }, 941 | "execution_count": 316, 942 | "metadata": {}, 943 | "output_type": "execute_result" 944 | } 945 | ], 946 | "source": [ 947 | "response" 948 | ] 949 | }, 950 | { 951 | 
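A note on the rating detection in the scraping loop above: an expression of the form ('रेटिंग' and '/5') in div.text evaluates the and first, so only '/5' is actually tested for membership, and ('रेटिंग' and '/5') or ('रेटिंग' and '/ 5') in div.text is always truthy because the left operand of or is a non-empty string; the later check rating is '' also compares identity rather than equality. A minimal sketch of the membership test the loop appears to intend, reusing the divs, soup and re objects already built above and a hypothetical helper name:

def looks_like_rating(text):
    # hypothetical helper: treat a div as the rating line only if it mentions
    # 'रेटिंग' together with '/5' or '/ 5'
    return 'रेटिंग' in text and ('/5' in text or '/ 5' in text)

for div in divs:
    if 'google' in div.text or looks_like_rating(div.text):
        if looks_like_rating(div.text):
            rating = div.text.strip(' \t\n\r').replace('\xa0', ' ').replace('\ufeff', '')
        continue
    article = article + div.text.strip(' \t\n\r') + '\n'

if rating == '':               # equality check rather than 'is'
    elems = soup(text=re.compile(r'रेटिंग'))
    if len(elems):
        rating = elems[0]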
"cell_type": "code", 952 | "execution_count": 19, 953 | "metadata": {}, 954 | "outputs": [], 955 | "source": [ 956 | "import sys\n", 957 | "sys.setrecursionlimit(10000)\n", 958 | "with open('articlesRatingMapping','wb') as f:\n", 959 | " pickle.dump(articlesAndRating,f)" 960 | ] 961 | }, 962 | { 963 | "cell_type": "code", 964 | "execution_count": 25, 965 | "metadata": {}, 966 | "outputs": [ 967 | { 968 | "data": { 969 | "text/plain": [ 970 | "633" 971 | ] 972 | }, 973 | "execution_count": 25, 974 | "metadata": {}, 975 | "output_type": "execute_result" 976 | } 977 | ], 978 | "source": [ 979 | "len(articlesAndRating)" 980 | ] 981 | }, 982 | { 983 | "cell_type": "code", 984 | "execution_count": null, 985 | "metadata": {}, 986 | "outputs": [], 987 | "source": [] 988 | } 989 | ], 990 | "metadata": { 991 | "kernelspec": { 992 | "display_name": "Python 3", 993 | "language": "python", 994 | "name": "python3" 995 | }, 996 | "language_info": { 997 | "codemirror_mode": { 998 | "name": "ipython", 999 | "version": 3 1000 | }, 1001 | "file_extension": ".py", 1002 | "mimetype": "text/x-python", 1003 | "name": "python", 1004 | "nbconvert_exporter": "python", 1005 | "pygments_lexer": "ipython3", 1006 | "version": "3.6.7" 1007 | } 1008 | }, 1009 | "nbformat": 4, 1010 | "nbformat_minor": 2 1011 | } 1012 | -------------------------------------------------------------------------------- /language-model/Hindi_Language_Model_ULMFiT_172k.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from fastai.text import *\n", 21 | "import numpy as np\n", 22 | "from sklearn.model_selection import train_test_split\n", 23 | "import pickle\n", 24 | "import sentencepiece as spm" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "('1.0.57', '1.1.0')" 36 | ] 37 | }, 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "import fastai, torch\n", 45 | "fastai.__version__ , torch.__version__" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "Wed Oct 16 11:13:32 2019 \r\n", 58 | "+-----------------------------------------------------------------------------+\r\n", 59 | "| NVIDIA-SMI 390.116 Driver Version: 390.116 |\r\n", 60 | "|-------------------------------+----------------------+----------------------+\r\n", 61 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", 62 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", 63 | "|===============================+======================+======================|\r\n", 64 | "| 0 GeForce GTX 108... 
Off | 00000000:01:00.0 On | N/A |\r\n", 65 | "| 0% 54C P0 71W / 250W | 228MiB / 11177MiB | 0% Default |\r\n", 66 | "+-------------------------------+----------------------+----------------------+\r\n", 67 | " \r\n", 68 | "+-----------------------------------------------------------------------------+\r\n", 69 | "| Processes: GPU Memory |\r\n", 70 | "| GPU PID Type Process name Usage |\r\n", 71 | "|=============================================================================|\r\n", 72 | "| 0 1039 G /usr/lib/xorg/Xorg 130MiB |\r\n", 73 | "| 0 2165 G compiz 85MiB |\r\n", 74 | "+-----------------------------------------------------------------------------+\r\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "!nvidia-smi" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "torch.cuda.set_device(0)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "/home/gaurav/PycharmProjects/nlp-for-hindi/language-model\r\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "!pwd" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "path = Path('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 15, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# from inltk.tokenizer import HindiTokenizer" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 16, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# HindiTokenizer" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 10, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "PosixPath('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')" 144 | ] 145 | }, 146 | "execution_count": 10, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "path" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 11, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "class HindiTokenizer(BaseTokenizer):\n", 162 | " def __init__(self, lang:str):\n", 163 | " self.lang = lang\n", 164 | " self.sp = spm.SentencePieceProcessor()\n", 165 | " self.sp.Load(str(path/\"../tokenizer/hindi_lm_large.model\"))\n", 166 | " \n", 167 | " def tokenizer(self, t:str) -> List[str]:\n", 168 | " return self.sp.EncodeAsPieces(t)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 12, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "sp = spm.SentencePieceProcessor()\n", 178 | "sp.Load(str(path/\"../tokenizer/hindi_lm_large.model\"))\n", 179 | "itos = [sp.IdToPiece(int(i)) for i in range(30000)]" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 13, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "30000" 191 | ] 192 | }, 193 | "execution_count": 13, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "len(itos)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 14, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "['',\n", 211 | " '',\n", 212 | " '',\n", 
213 | " '▁के',\n", 214 | " '।',\n", 215 | " '▁में',\n", 216 | " '▁है',\n", 217 | " ',',\n", 218 | " '▁की',\n", 219 | " '▁',\n", 220 | " '▁और',\n", 221 | " '▁से',\n", 222 | " '▁का',\n", 223 | " '▁को',\n", 224 | " '▁हैं',\n", 225 | " '▁एक',\n", 226 | " '▁पर',\n", 227 | " '.',\n", 228 | " '-',\n", 229 | " '▁ने']" 230 | ] 231 | }, 232 | "execution_count": 14, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "itos[:20]" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 15, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "# 30,000 is the vocab size that we chose in sentencepiece\n", 248 | "hindi_vocab = Vocab(itos)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 16, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "tokenizer = Tokenizer(tok_func=HindiTokenizer, lang='hi')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 17, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "['xxunk',\n", 269 | " 'xxpad',\n", 270 | " 'xxbos',\n", 271 | " 'xxeos',\n", 272 | " 'xxfld',\n", 273 | " 'xxmaj',\n", 274 | " 'xxup',\n", 275 | " 'xxrep',\n", 276 | " 'xxwrep']" 277 | ] 278 | }, 279 | "execution_count": 17, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "tokenizer.special_cases" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 18, 291 | "metadata": {}, 292 | "outputs": [ 293 | { 294 | "data": { 295 | "text/plain": [ 296 | "PosixPath('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')" 297 | ] 298 | }, 299 | "execution_count": 18, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "path" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 19, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "data_lm = TextLMDataBunch.from_folder(path=path/'hindi_transformer', tokenizer=tokenizer, vocab=hindi_vocab)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 20, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "64" 326 | ] 327 | }, 328 | "execution_count": 20, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "data_lm.batch_size" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 21, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "data_lm.save()" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 22, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/html": [ 354 | "\n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | "
idxtext
0▁जे स्टन ▁जेम्स ▁बॉण्ड ▁1953 ▁में ▁अंग्रेज़ ▁लेखक ▁इयान ▁फ़्लेम िंग ▁द्वारा ▁रचित ▁एक ▁काल्पनिक ▁पात्र ▁है । ▁007 ▁के ▁गुप्त ▁नाम ▁से ▁प्रसिद्ध ▁यह ▁एजेंट ▁फ़्लेम िंग ▁की ▁बारह ▁पुस्तकों ▁व ▁दो ▁लघुकथा ओं ▁में ▁मौजूद ▁है । ▁1964 ▁में ▁फ़्लेम िंग ▁की ▁मृत्यु ▁के ▁पश्चात ▁छः ▁अन्य ▁लेखकों ▁ने ▁बॉण्ड ▁की ▁आधि कृत ▁पुस्तकें ▁लिखी ▁हैं , ▁जिनमें ▁किंग्स ले ▁ऐ मिस , ▁क्रिस्टोफ़र ▁वुड्स , ▁जॉन ▁गार्ड
1▁जो ▁क ल्हण ▁द्वारा ▁12 वीं ▁शताब्दी ▁ई . ▁में ▁लिखा ▁गया ▁था । ▁तब ▁तक ▁यहां ▁पूर्ण ▁हिन्दू ▁राज्य ▁रहा ▁था । यह ▁अशोक ▁महान ▁के ▁साम्राज्य ▁का ▁हिस्सा ▁भी ▁रहा । ▁लगभग ▁तीसरी ▁शताब्दी ▁में ▁अशोक ▁का ▁शासन ▁रहा ▁था । ▁तभी ▁यहां ▁बौद्ध ▁धर्म ▁का ▁आगमन ▁हुआ , ▁जो ▁आगे ▁चलकर ▁कुषाण ों ▁के ▁अधीन ▁सम ृ ध्द ▁हुआ ▁था । उ ज्ज ैन ▁के ▁महाराज ▁विक्रमादित्य
2▁पाकिस्तान ▁शामिल ▁हैं ▁जबकि ▁भारी ▁मात्रा ▁में ▁कैन ोला ▁ऑयल ▁और ▁मील ▁संयुक्त ▁राज्य ▁अमेरिका ▁जाता ▁है ▁और ▁इसकी ▁छोटी ▁मात्रा एं ▁मैक्सिको , ▁चीन ▁और ▁यूरोप ▁में ▁भेज ▁दी ▁जाती ▁हैं । ▁वर्ष ▁2002 - 2003 ▁के ▁मौसम ▁में ▁दुनिया ▁भर ▁में ▁लगभग ▁14 ▁मिलियन ▁मैट्रिक ▁टन ▁रे प सीड ▁ऑयल ▁का ▁उत्पादन ▁हुआ ▁था । ▁कैन ोला ▁को ▁पारंपरिक ▁पौध ▁प्रजनन ▁के ▁जरिये ▁रे प सीड ▁से ▁विकसित
3▁फिल्मों ▁में ▁दिखी ं । ▁पहले ▁अभिषेक ▁कपूर ▁की ▁फि तूर ▁में ▁जो ▁चार्ल्स ▁डिकेंस ▁के ▁उपन्यास ▁ग्रेट ▁एक्स्प ेक्ट ै शन ▁पर ▁आधारित ▁थी । ▁फिल्म ▁में ▁आदित्य ▁रॉय ▁कपूर ▁और ▁तब ु ▁भी ▁थे । ▁बाद ▁में ▁बार ▁बार ▁देखो ▁में ▁वह ▁सिद्धार्थ ▁मल्होत्रा ▁के ▁साथ ▁नज़र ▁आई । ▁दोनों ▁ही ▁फिल्म ▁सफल ▁नहीं ▁रही । ▁हिन्दुस्तान ▁के ▁विशाल ▁ठाकुर ▁ने ▁लिखा : ▁\" बार ▁बार ▁देखो ▁में
4▁नहीं ▁आते ▁हैं , ▁जबकि ▁मु वत् ता ▁के ▁सभी ▁हदीस ▁अन्य ▁सही ह ▁किताबों ▁में ▁शामिल ▁हैं । ▁सुन्नी ▁मुस्लिम ▁छह ▁प्रमुख ▁हदीस ▁संग्रह ों ▁को ▁उनके ▁सबसे ▁महत्वपूर्ण ▁मानते ▁हैं , ▁हालांकि ▁प्रामाणिकता ▁का ▁क्रम ▁मज़ ह ब ों ▁के ▁बीच ▁भिन्न ▁होता ▁है ▁इब्न ▁हज र ▁के ▁अनुसार , ▁पहले ▁दो , ▁जिसे ▁आमतौर ▁पर ▁दो ▁सह हि ह ▁के ▁रूप ▁में ▁जाना ▁जाता ▁है , ▁उनकी
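The ▁-prefixed fragments shown in the batches above are SentencePiece pieces from the hindi_lm_large model loaded earlier. A minimal sketch of encoding a raw sentence into pieces and vocabulary ids with that model (sample sentence and its split are illustrative only; the exact pieces depend on the trained model):

import sentencepiece as spm
from pathlib import Path

path = Path('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')  # as defined above
sp = spm.SentencePieceProcessor()
sp.Load(str(path / '../tokenizer/hindi_lm_large.model'))  # same model file used to build itos above
pieces = sp.EncodeAsPieces('भारत एक बड़ा देश है')           # e.g. ['▁भारत', '▁एक', ...]
ids = [sp.PieceToId(p) for p in pieces]                    # indices into the 30,000-piece vocab behind itos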
" 384 | ], 385 | "text/plain": [ 386 | "" 387 | ] 388 | }, 389 | "metadata": {}, 390 | "output_type": "display_data" 391 | } 392 | ], 393 | "source": [ 394 | "data_lm.show_batch()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 23, 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "??language_model_learner" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 24, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/plain": [ 414 | "30000" 415 | ] 416 | }, 417 | "execution_count": 24, 418 | "metadata": {}, 419 | "output_type": "execute_result" 420 | } 421 | ], 422 | "source": [ 423 | "len(data_lm.vocab.itos)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 25, 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "learn = language_model_learner(data_lm, AWD_LSTM, pretrained=False)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 26, 438 | "metadata": {}, 439 | "outputs": [ 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "1604" 444 | ] 445 | }, 446 | "execution_count": 26, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "gc.collect()" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 27, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "data": { 462 | "text/plain": [ 463 | "SequentialRNN(\n", 464 | " (0): AWD_LSTM(\n", 465 | " (encoder): Embedding(30000, 400, padding_idx=1)\n", 466 | " (encoder_dp): EmbeddingDropout(\n", 467 | " (emb): Embedding(30000, 400, padding_idx=1)\n", 468 | " )\n", 469 | " (rnns): ModuleList(\n", 470 | " (0): WeightDropout(\n", 471 | " (module): LSTM(400, 1152, batch_first=True)\n", 472 | " )\n", 473 | " (1): WeightDropout(\n", 474 | " (module): LSTM(1152, 1152, batch_first=True)\n", 475 | " )\n", 476 | " (2): WeightDropout(\n", 477 | " (module): LSTM(1152, 400, batch_first=True)\n", 478 | " )\n", 479 | " )\n", 480 | " (input_dp): RNNDropout()\n", 481 | " (hidden_dps): ModuleList(\n", 482 | " (0): RNNDropout()\n", 483 | " (1): RNNDropout()\n", 484 | " (2): RNNDropout()\n", 485 | " )\n", 486 | " )\n", 487 | " (1): LinearDecoder(\n", 488 | " (decoder): Linear(in_features=400, out_features=30000, bias=True)\n", 489 | " (output_dp): RNNDropout()\n", 490 | " )\n", 491 | ")" 492 | ] 493 | }, 494 | "execution_count": 27, 495 | "metadata": {}, 496 | "output_type": "execute_result" 497 | } 498 | ], 499 | "source": [ 500 | "learn.model" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 28, 506 | "metadata": {}, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/html": [], 511 | "text/plain": [ 512 | "" 513 | ] 514 | }, 515 | "metadata": {}, 516 | "output_type": "display_data" 517 | }, 518 | { 519 | "name": "stdout", 520 | "output_type": "stream", 521 | "text": [ 522 | "LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "learn.lr_find()" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 29, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX4AAAEGCAYAAABiq/5QAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAfBklEQVR4nO3deZScdZ3v8fe39+70ml6SdLYmCyCgCUmzyUURRdGjAnKdA6NnQL3DGecObgcd53BGHec444her3OdGQcVcBTxgrgAXgXGAXEBQickJERCEsjSnaS7k96702t97x/1dFI03ekl9TxV1fV5nVMnVb96qn7frnR/6qnf83t+Ze6OiIhkj5xUFyAiItFS8IuIZBkFv4hIllHwi4hkGQW/iEiWyUt1ATNRU1PjDQ0NqS5DRCSjbN68+ai7105sDy34zexO4N1Am7ufF7StB74FFAGjwF+6+6bpnquhoYGmpqawShURmZfMbP9k7WEO9dwNXDWh7SvA37n7euBzwW0REYlQaMHv7k8CHRObgfLgegVwKKz+RURkclGP8X8CeMTMvkr8TeeNEfcvIpL1op7V81Hgk+6+HPgk8N2pNjSzm82sycya2tvbIytQRGS+izr4bwR+Ely/H7hwqg3d/Q53b3T3xtra1xyUFhGROYo6+A8Bbw6uXwHsjrh/EZGsF+Z0znuBy4EaM2sGPg/8OfANM8sDBoGbw+pfREQmF1rwu/sNU9y1Maw+RUTmi9aeQb7/1H7et2Epq2pLk/rcWrJBRCQN7Tvazzcf38OhrsGkP7eCX0QkDXUOjABQtSA/6c+t4BcRSUOdA8MAVJUUJP25FfwiImlIwS8ikmW6BkYozMuhuCA36c+t4BcRSUOd/cOh7O2Dgl9EJC11DgxTtUDBLyKSNToHRqgqSf6MHlDwi4ikpc4BDfWIiGSVroERKrXHLyKSHWIxp2tgmIUa4xcRyQ49gyPEHCo11CMikh1OLNegoR4RkewQ5lm7oOAXEUk7XUHw6+CuiEiW6OiPD/Xo4K6ISJY4ucev4BcRyQqdA8Pk5hjlReF8SaKCX0QkzXQOjFBZnI+ZhfL8Cn4RkTTT2R/eAm2g4BcRSTvxdXrCmdEDCn4RkbQTX6dHe/wiIllDe/wiIlnE3ensH9EYv4hIthgYHmN4LBbacg2g4BcRSSsn1+nRUI+ISFboClbm1MFdEZEsEfbKnKDgFxFJKx398eBfuEBDPSIiWUFDPSIiWWZ8qKeyWHv8IiJZoWtghLKiPPJyw4tnBb+ISBrp6B8O7QtYxin4RUTSSOfAcKjj+6DgFxFJK10DI6GevAUKfhGRtBJfoE17/CIiWaOzf5hK7fGLiGSH4dEY/cNjLNQev4hIdugan8OfqbN6zOxOM2szsx0T2m8xs11m9oKZfSWs/kVEMk1ncNZuJh/cvRu4KrHBzN4CXA28wd3PBb4aYv8iIhkligXaIMTgd/cngY4JzR8FvuzuQ8E2bWH1LyKSaTr7Mzz4p3AmcJmZPWNmvzGzC6ba0MxuNrMmM2tqb2+PsEQRkdQ4MdQT4sqcEH3w5wFVwMXAp4H7zMwm29Dd73D3RndvrK2tjbJGEZGUyPihnik0Az/xuE1ADKiJuAYRkbTUNTBMUX4ORfm5ofYTdfD/DLgCwMzOBAqAoxHXICKSljr6R0Lf24f40EsozOxe4HKgxsyagc8DdwJ3BlM8h4Eb3d3DqkFEJJN0RbBcA4QY/O5+wxR3fTCsPkVEMlnnwHDoB3ZBZ+6KiKSNroGR0JdkBgW/iEjaONo3RHXIyzWAgl9EJC0MjozRMzhKXVlh6H0p+EVE0sDRviEAahX8IiLZob1XwS8iklVOBH9pUeh9KfhFRNJAu4Z6RESyy/gef3WpZvWIiGSFtt4hFi4oID83/FhW8IuIpIH23qFIpnKCgl9EJC209w5FMr4PCn4RkbTQ3jtEbamCX0QkK7g77X3a4xcRyRo9g6MMj8YU/CIi2SLKs3ZBwS8iknJtvYMAGuMXEckW43v8deUKfhGRrBDlOj2g4BcRSbn2viEKcnMoLw7t23BfRcEvIpJi4ydvmVkk/Sn4RURSrL13iJqIZvSAgl9EJOWiPGsXFPwiIil3NMKzdkHBLyKSUqNjMY71D0e2Mico+EVEUupY/zDu0Z21Cwp+EZGUinq5BlDwi4iklIJfRCTLnDxrV8EvIpIV2vu0xy8iklXae4coK8qjKD83sj4V/CIiKdTWOxjp3j4o+EVEUqq9dyjSOfyg4BcRSan4Am3RLMc8TsEvIpJCUa/TAwp+EZGU6R8apX94TGP8IiLZ4mgKpnKCgl9EJGVScdYuKPhFRFKmLQVn7UKIwW9md5pZm5ntmOS+W83MzawmrP5FRNLd+B5/Xfk8CX7gbuCqiY1mthy4EjgQYt8iImmvvXeI3ByjqqQg0n5DC353fxLomOSurwOfATysvkVEMsGh7uPUlRWSmxPNl6yPi3SM38zeC7S4+7YZbHuzmTWZWVN7e3sE1YmIRKu58zjLq0oi7zey4DezEuA24HMz2d7d73D3RndvrK2tDbc4EZEUaOk8zrKq4sj7nVHwm9lqMysMrl9uZh8zs8pZ9rUaOAPYZmb7gGXAFjNbPMvnERHJeCNjMQ53p3HwAw8AY2a2Bvgu8QD/4Ww6cvft7l7n7g3u3gA0Axvc/chsnkdEZD440j1IzGFZGg/1xNx9FLgW+N/u/klgyakeYGb3Ak8BZ5lZs5l95PRKFRGZPw52DgCkZI8/b4bbjZjZDcCNwHuCtvxTPcDdb5jm/oYZ9i0iMu80dx4H0nuP/0PAJcCX3P0VMzsD+EF4ZYmIzG/NncfJMVhcEe2SzDDDPX533wl8DMDMqoAyd/9ymIWJiMxnzZ0DLC4voiAv+pVzZjqr5wkzKzezhcA24C4z+1/hliYiMn81dx5PyTAPzHyop8Lde4D3AXe5+0bgbeGVJSIyv6VqDj/MPPjzzGwJ8CfAwyHWIyIy76VyDj/MPPi/CDwC7HX3Z81sFbA7vLJEROavVM7hh5kf3L0fuD/h9svAdWEVJSIyn6VyDj/M/ODuMjP7abC+fquZPWBmy8IuTkRkPkrlHH6Y+VDPXcCDQD2wFHgoaBMRkVlK5Rx+mHnw17r7Xe4+GlzuBrRkpojIHKRyDj/MPPiPmtkHzSw3uHwQOBZmYSIi81Uq5/DDzIP/w8Snch4BDgP/nfgyDiIiMkupnMMPMwx+dz/g7u9199pgaeVriJ/MJSIis5DqOfxwet/A9amkVSEikiVSPYcfTi/4o/12YBGReSDVc/jh9ILfk1aFiEiWSPUcfpjmzF0z62XygDcgdW9XIiIZKtVz+GGa4Hf3sqgKERHJBqmeww+nN9QjIiKzlOo5/KDgFxGJVKrn8IOCX0QkMukwhx8U/CIikUmHOfyg4BcRicz+Y8Ec/oXa4xcRyQq723oBWFuX2gmTCn4RkYjsbuujsiSfmtKClNah4BcRicie1j7W1pViltoVbxT8IiIRcHdeautlTYqHeUDBLyISiaN9w3
QNjLC2rjTVpSj4RUSiMH5g98xF2uMXEckKe9r6AFi7SHv8IiJZYXdrH2VFedSVFaa6FAW/iEgUdrf1psWMHlDwi4hEYk9bX8pP3Bqn4BcRCVlH/zBH+4bTYnwfFPwiIqEbP7C7Jg2mcoKCX0QkdC+1Bmv0pMFUTlDwi4iEbk9bHwsKcqlP4ffsJlLwi4iEbHdbL2sWlaXFjB5Q8IuIhG53sDhbuggt+M3sTjNrM7MdCW23m9mLZva8mf3UzCrD6l9EJB10D4zQ1juUHcEP3A1cNaHtMeA8d38D8BLwNyH2LyKScnvaxw/sZkHwu/uTQMeEtkfdfTS4+TSwLKz+RUTSwe7WYI2eNDl5C1I7xv9h4JdT3WlmN5tZk5k1tbe3R1iWiEjy7G7royg/h6WVqf2e3UQpCX4zuw0YBe6Zaht3v8PdG929sba2NrriRESS6KXWXtbUlZKTkx4zeiAFwW9mNwLvBj7g7h51/yIiUXF3dh3pTathHoC8KDszs6uAvwbe7O4DUfYtIhK1Q92DtPUOsX55ek1gDHM6573AU8BZZtZsZh8BvgmUAY+Z2VYz+1ZY/YuIpNqW/Z0AbFxZleJKXi20PX53v2GS5u+G1Z+ISLrZvL+T4vxczl6cXkM9OnNXRCQkWw50sm55BXm56RW16VWNiMg8cXx4jJ2HetJumAcU/CIioXi+uYvRmCv4RUSyxeYD8QO75y9X8IuIZIUt+ztZVbuAqgUFqS7lNRT8IiJJ5u5sOdDFxhXpt7cPCn4RkaTbd2yAjv7htBzfBwW/iEjSbQ5O3Nqg4BcRyQ6b93dSVpTHmtr0WYM/kYJfRCTJtuzvZMOKqrRakTORgl9EJIl6Bkd4qa03bcf3QcEvIpJUWw904Z5+C7MlUvCLiCRR0/5OcgzWpdlSzIkU/CIiSfT4i22sX15JaWGkX3cyKwp+EZEkOdx9nO0t3Vx5zuJUl3JKCn4RkST5z52tAFx5zqIUV3JqCn4RkSR5dGcrq2oWsKYuPefvj1Pwi4gkQc/gCE+/fCzt9/ZBwS8ikhS/2dXOyJgr+EVEssVjO1upXlDA+Wm6ImciBb+IyGkaGYvx+K42rji7jtw0XaYhkYJfROQ0PfNyB72DoxkxzAMKfhGR0/bYziMU5edw2draVJcyIwp+EZHT4O48trOV/7amluKC3FSXMyMKfhGR07C9pZtD3YO8PUOGeUDBLyJyWn707EGK8nN4x3npvUxDIgW/iMgc9Q2N8vPnWnj3G+qpKM5PdTkzpuAXEZmjn29toX94jA9ctCLVpcyKgl9EZA7cnXuePsDrlpSzPo3X3p+Mgl9EZA62NXez83APH7hoBWbpf9JWIgW/iMgc3PP0fkoKcrl6fX2qS5k1Bb+IyCx1Hx/hoecPcfX6esqKMueg7jgFv4jILP10SzODIzH+9MKVqS5lTtL3SyGT4AsPvsA9z+yf1WOMCMbqpuniVHfPZihx4s+S+NiJTzM+RmmTbGATt7HXttuJ9pPb2Il/LeF2cD1ozzHICe7MNSMnuD/HjNycV1/yc43cnBwKco3CvFwK83IozM+ltDCX2rJC6sqKqC0rZGV1CUsrizNu3FUyw1jM+f7T+3nDsgpev6wi1eXMybwO/kvX1FAyi1OoPcRaTvQxTSd+qipmUeDETT2h44k1+IT2xBpes23QkPgYx4N/Ex8TtE24/1W3Pd5TzCHmjrsTi8Wvx9wZizljDmOxGKNjzuBIjNHYGCOjMYZGxxgciTE0GqN3cISh0dir6iwtzGPtolLOWlTG65aUc259OWcvKU/rL8CWzHB/00H2tvfzrx/YkOpS5mxe/xVcec6ijFktT+bO3ekdGqW9d4jWnkFeOdrPS0d62dXayyMvHOFHzx48sW1DdQnn1ldwTn0559SXc159BbVlhSmsXjJJ39AoX330JRpXVvHODDpTd6J5HfySHcyM8qJ8yovyWV1byhtX15y4z91p7Rli5+Fudh7q4YVDPWxv6eYX2w+f2GZxeRGvX1bB65fGP7q/YWkF1aV6M5DX+vff7OVo3xDf/rONGT2UqOCXec3MWFxRxOKKIq44++Snv+7jI/zxcA87WrrZHlwe29l64v6llcWsX15JY0MVFzQs5OzFZeTlai5ENjvcfZxv//Zl3rOuPiO+ZetUQgt+M7sTeDfQ5u7nBW0Lgf8LNAD7gD9x986wahCZSkVxPhevqubiVdUn2noHR9jR0sP2li6eb+7muQNdJz4ZLCjI5eJV1Vx+dh1XnF3H0sriVJUuKXL7I7uIOXzmHWelupTTFuYe/93AN4H/SGj7LPBrd/+ymX02uP3XIdYgMmNlRflcsrqaS1affDNo6TpO074ONr3SwZO72/n1i238LXDWojKuOm8x71m3hDV1ZakrWiKxvbmbn2xp4S/evJrlC0tSXc5pM59umsnpPLlZA/Bwwh7/LuBydz9sZkuAJ9x92rfPxsZGb2pqCq1OkZlwd14+2s/jL7bx2M5WNu3rwB3OXlzGe9bVc835S/VJYB46PjzGNf/ye471D/Fft15OeQadsGVmm9298TXtEQd/l7tXJtzf6e6TDpaZ2c3AzQArVqzYuH//7Obji4StrXeQX24/wsPPH+LZfZ2YwRtXV3PdhmW887wlGfNtTHJqt96/jQe2NPO9D13Im87MjK9WHJdxwZ9Ie/yS7g52DPCTLS38eMtBDnYcp6wwj/eur+f6C1Zk7Ek+Avc9e5DPPPA8H3vrWj515ZmpLmfWpgr+qGf1tJrZkoShnraI+xcJxfKFJXz8bWu55Yo1bNrXwX1NB/nx5mbueeYA5ywp54aLVnBNhq7rkq12Hurhb3++g0vXVPPxt65NdTlJFfUe/+3AsYSDuwvd/TPTPY/2+CUTdR8f4cFth7j3mQPsPNxzYiXHP71wpT4FpLmO/mGu+7c/0D80yi8+dlnGnuQX+VCPmd0LXA7UAK3A54GfAfcBK4ADwPvdvWO651LwSyZzd7Y1d/PDZ/bz4LZDDI7EOLe+nOsvXMHV6+sz6mBhNth/rJ+b7nqWlq7j/OAjF3HhGQtTXdKcpWSMP1kU/DJfdB8f4cGtLdy76SA7D/dQnJ/Lu16/hOs2LuXiM6rJycncs0Hng+cOdPI/vtfEmDvf+bNGGhsyN/RBwS+SVtyd7S3d3LvpIA9vO0Tv0ChLK4u5bsNSrjl/KatqS2f0PIMjY2xv6WbTKx3sbe+jb3CUvqH4ZXTs5N92Tg4sKMijvDi+tEVVST71lcXUVxaztLKYhpqSrD/+8Mvth/nkfVupKyvi7g9dMOP/g3Sm4BdJU4MjYzzywhEe2NLC73a3E3NYt6yCq9cv5T3r6l8zvuzuPPJCK3f9/hWeO9jFcLAyaX1FEeXF+ZQW5rGgMI/8hCUmYu70DY3Sc3yE3sFRjvYNvWZF06WVxZy5qJQzF5exflklG1ZWsai8KPwXIMX2tvfxpV/8kf96sY11yyr47k0XUDNP1mpS8ItkgNaeQR7adoifbW1hR0sPOQZvXF3De9fX845zF7Nlfydfe2wXO1p6aKgu4e3nLqZxZRWND
QtZuKBgxv24Ox39wxzqGqSla4C97f281NrLriO97G3vYyT4tLC0spiNK6u4eFX8jOaG6pKMXpwsUUf/MP/y+B6+94d9FOXncssVa7jp0gYK8+bP+RcKfpEMs7u1lwe3HeLnWw9xoGOAHIt/d8HyhcV8/K1ncs36+lAWjhsaHeOFQz1s2d/Jcwe62LSvg/beIQCWVBRxyepqLltbw6Wra6jLsE8EYzHnt7vbub+pmUd3HmE05lx/wXI+deVZGTtz51QU/CIZanxW0CMvHGHFwhKu27CMgrzoVgodX6riqb3HeOrlY/xhz1E6B0YAOHNRKZcEnwYuOqOaqll86ojKWMxp2tfBr144wq92HOFw9yBVJflce/4ybrhwOWsXzd+1lhT8IpIUsZiz83APv9tzlN/vOcqz+zoYHIlhBmcvLueSVdVcvGohF51RTUVJ9AeMh0bHePFwL9uau9h6sIvf7GrnWP8wBbk5XLa2hvdtWMbbzqmbV0M6U1Hwi0gohkdjbGvuin8i2HuMLQc6GRqNvxGsrSvlvPoKzltawbn15SypKKa6tICSgtw5HysYHo1xrH+Itp4h2nqHONIzyIFj/ew7NsD+Y/3sOzrA8Fj8wHVNaQGXrK7hHecu4vKz6rLuqzcV/CISicGRMbYd7OLplzvY1tzFjpZu2oJjBOOK8nMoK8pnPPrNIMeMHLOE65y4PRaLz0rqHRx9zWwkgMK8HFZWl7CyegGrahewblkl65ZXUl9RNG8ORs9FuqzVIyLzXFF+LhetquaihC+5aesZ5MUjvbT1DnGsb4hj/cP0DsaPE7gHF5yYx6eeevDv+O0cM8qK8igrzKO0MI+FpQXUlRWxqLyQReVF1JYW6uS3WVDwi0jo6sqLMm4G0HymLxEVEckyCn4RkSyj4BcRyTIKfhGRLKPgFxHJMgp+EZEso+AXEckyCn4RkSyTEUs2mFk3sHuSuyqA7lO0TXd9/N8a4OgcSpus/5ncP7H9VLdV9/R1TXf/XOqerC3KumfSFmbd09U80xqnqnOq64ltYdQ909+RmdSaeD1df7dXunvtax7l7ml/Ae6YaXti23TXE/5tSmZds637VLdVd2rqnqItsrpn0hZm3dPVPNe6Z/o7Elbd2ZYlU10yZajnoVm0PzSL61M970xN9/iZ1n2q26p76v5mev9c6p7qZ5mLudQ9k7Yw657JY+dSd6b8jkxsy5S6Z1RHRgz1hM3MmnySFezSneqOluqOVibWnSk1Z8oef9juSHUBc6S6o6W6o5WJdWdEzdrjFxHJMtrjFxHJMgp+EZEsM++C38zuNLM2M9sxh8duNLPtZrbHzP7ZEr6zzcxuMbNdZvaCmX0luVWHU7eZfcHMWsxsa3B5VybUnXD/rWbmZlaTvIpPPHcYr/ffm9nzwWv9qJnVZ0DNt5vZi0HdPzWzymTWHGLd7w/+FmNmltSDqadT7xTPd6OZ7Q4uNya0n/L3P1RzmXOazhfgTcAGYMccHrsJuAQw4JfAO4P2twD/CRQGt+sypO4vALdm2usd3LcceATYD9RkQt1AecI2HwO+lQE1vx3IC67/E/BPGfJavw44C3gCaEyHeoNaGia0LQReDv6tCq5Xnepni+Iy7/b43f1JoCOxzcxWm9mvzGyzmf3WzM6e+DgzW0L8D/cpj/+v/AdwTXD3R4Evu/tQ0EdbhtQduhDr/jrwGSCU2Qdh1O3uPQmbLkh27SHV/Ki7jwabPg0sS2bNIdb9R3fflexaT6feKbwDeMzdO9y9E3gMuCrVf7fzLvincAdwi7tvBG4F/nWSbZYCzQm3m4M2gDOBy8zsGTP7jZldEGq1J51u3QB/FXyMv9PMqsIr9VVOq24zey/Q4u7bwi50gtN+vc3sS2Z2EPgA8LkQax2XjN+RcR8mvucZhWTWHYWZ1DuZpcDBhNvjP0NKf7Z5/2XrZlYKvBG4P2EIrXCyTSdpG99jyyP+Me1i4ALgPjNbFbxThyJJdf8b8PfB7b8Hvkb8jzs0p1u3mZUAtxEfgohMkl5v3P024DYz+xvgr4DPJ7nUk4UkqebguW4DRoF7klnjZJJZdxROVa+ZfQj4eNC2Bvh/ZjYMvOLu1zL1z5DSn23eBz/xTzVd7r4+sdHMcoHNwc0HiYdk4sfcZcCh4Hoz8JMg6DeZWYz4Ykzt6Vy3u7cmPO7bwMMh1jvudOteDZwBbAv+yJYBW8zsQnc/ksZ1T/RD4BeEGPwkqebggOO7gbeGuTOTINmvddgmrRfA3e8C7gIwsyeAm9x9X8ImzcDlCbeXET8W0Ewqf7aoDiZEeQEaSDgwA/wBeH9w3YB1UzzuWeJ79eMHW94VtP8F8MXg+pnEP7pZBtS9JGGbTwI/yoTXe8I2+wjh4G5Ir/fahG1uAX6cATVfBewEasN4jcP+HSGEg7tzrZepD+6+QnzEoCq4vnAmP1uo/x9RdRTZDwT3AoeBEeLvqh8hvgf5K2Bb8Ev+uSke2wjsAPYC3+Tkmc0FwA+C+7YAV2RI3d8HtgPPE9+DWpIJdU/YZh/hzOoJ4/V+IGh/nvhiWUszoOY9xHdktgaXpM5ECrHua4PnGgJagUdSXS+TBH/Q/uHgdd4DfGg2v/9hXbRkg4hIlsmWWT0iIhJQ8IuIZBkFv4hIllHwi4hkGQW/iEiWUfBLRjKzvoj7+46ZnZOk5xqz+AqeO8zsoelWxDSzSjP7y2T0LQL6Bi7JUGbW5+6lSXy+PD+5WFmoEms3s+8BL7n7l06xfQPwsLufF0V9Mv9pj1/mDTOrNbMHzOzZ4HJp0H6hmf3BzJ4L/j0raL/JzO43s4eAR83scjN7wsx+bPE16u8ZXyM9aG8MrvcFi7FtM7OnzWxR0L46uP2smX1xhp9KnuLk4nSlZvZrM9ti8XXarw62+TKwOviUcHuw7aeDfp43s79L4ssoWUDBL/PJN4Cvu/sFwHXAd4L2F4E3ufv5xFfM/IeEx1wC3OjuVwS3zwc+AZwDrAIunaSfBcDT7r4OeBL484T+vxH0P+26K8HaNG8lflY1wCBwrbtvIP4dEF8L3ng+C+x19/Xu/mkzezuwFrgQWA9sNLM3TdefyLhsWKRNssfbgHMSVlAsN7MyoAL4npmtJb4CYn7CYx5z98S11ze5ezOAmW0lvmbL7yb0M8zJBe82A1cG1y/h5JrqPwS+OkWdxQnPvZn4Gu0QX7PlH4IQjxH/JLBokse/Pbg8F9wuJf5G8OQU/Ym8ioJf5pMc4BJ3P57YaGb/B3jc3a8NxsufSLi7f8JzDCVcH2Pyv5ERP3lwbKptTuW4u683swribyD/E/hn4mv41wIb3X3EzPYBRZM83oB/dPd/n2W/IoCGemR+eZT4GvgAmNn4MroVQEtw/aYQ+3+a+BATwPXTbezu3cS/ovFWM8snXmdbEPpvAVYGm/YCZQkPfQT4cLBOPGa21MzqkvQzSBZQ8EumKjGz5oTLp4iHaGNwwHMn8eW0Ab4C/KOZ/R7IDbGmTwCfMrNNwBKge7oHuPtzxFd8vJ74l6A0mlkT8b3/F4NtjgG/D6Z/3u7ujxIfSnrKzLYD
P+bVbwwip6TpnCJJEnx72HF3dzO7HrjB3a+e7nEiUdMYv0jybAS+GczE6SLkr7kUmSvt8YuIZBmN8YuIZBkFv4hIllHwi4hkGQW/iEiWUfCLiGSZ/w90LzhYqON7MQAAAABJRU5ErkJggg==\n", 538 | "text/plain": [ 539 | "
" 540 | ] 541 | }, 542 | "metadata": { 543 | "needs_background": "light" 544 | }, 545 | "output_type": "display_data" 546 | } 547 | ], 548 | "source": [ 549 | "learn.recorder.plot()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 30, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/html": [ 560 | "\n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | "
epoch  train_loss  valid_loss  accuracy  time
0      5.751442    5.643702    0.205336  33:50
1      4.817427    4.793048    0.266742  33:58
2      4.612603    4.395641    0.295616  34:04
3      4.422366    4.196748    0.311338  34:08
4      4.239987    4.065018    0.322284  34:08
5      4.206244    3.981103    0.330692  34:12
6      4.113741    3.911579    0.337195  34:12
7      4.014261    3.851094    0.343927  34:11
8      4.032167    3.800962    0.349590  34:12
9      3.955506    3.765572    0.354403  34:12
10     3.858625    3.715029    0.360131  34:14
11     3.846198    3.674184    0.365293  34:11
12     3.882539    3.647813    0.369002  34:16
13     3.811650    3.613070    0.373721  34:15
14     3.869739    3.590897    0.376963  34:17
15     3.786655    3.563166    0.380753  34:21
16     3.758033    3.545945    0.383257  34:24
17     3.793671    3.535628    0.384710  34:24
18     3.753753    3.529626    0.385569  34:26
19     3.788864    3.528268    0.385755  34:35
" 713 | ], 714 | "text/plain": [ 715 | "" 716 | ] 717 | }, 718 | "metadata": {}, 719 | "output_type": "display_data" 720 | }, 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "Better model found at epoch 0 with accuracy value: 0.205336332321167.\n", 726 | "Better model found at epoch 1 with accuracy value: 0.2667420506477356.\n", 727 | "Better model found at epoch 2 with accuracy value: 0.2956162393093109.\n", 728 | "Better model found at epoch 3 with accuracy value: 0.3113378584384918.\n", 729 | "Better model found at epoch 4 with accuracy value: 0.3222842514514923.\n", 730 | "Better model found at epoch 5 with accuracy value: 0.33069172501564026.\n", 731 | "Better model found at epoch 6 with accuracy value: 0.3371947109699249.\n", 732 | "Better model found at epoch 7 with accuracy value: 0.3439274728298187.\n", 733 | "Better model found at epoch 8 with accuracy value: 0.3495900630950928.\n", 734 | "Better model found at epoch 9 with accuracy value: 0.3544030785560608.\n", 735 | "Better model found at epoch 10 with accuracy value: 0.36013102531433105.\n", 736 | "Better model found at epoch 11 with accuracy value: 0.3652926981449127.\n", 737 | "Better model found at epoch 12 with accuracy value: 0.3690016567707062.\n", 738 | "Better model found at epoch 13 with accuracy value: 0.3737213909626007.\n", 739 | "Better model found at epoch 14 with accuracy value: 0.3769630491733551.\n", 740 | "Better model found at epoch 15 with accuracy value: 0.38075295090675354.\n", 741 | "Better model found at epoch 16 with accuracy value: 0.3832567632198334.\n", 742 | "Better model found at epoch 17 with accuracy value: 0.38471028208732605.\n", 743 | "Better model found at epoch 18 with accuracy value: 0.3855690658092499.\n", 744 | "Better model found at epoch 19 with accuracy value: 0.3857552409172058.\n" 745 | ] 746 | } 747 | ], 748 | "source": [ 749 | "learn.fit_one_cycle(20, 1e-3, moms=(0.8,0.7), callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='model')])" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 31, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "TEXT = \"जिसके लिये उन्हें \"\n", 759 | "N_WORDS = 40\n", 760 | "N_SENTENCES = 2" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": 32, 766 | "metadata": {}, 767 | "outputs": [ 768 | { 769 | "name": "stdout", 770 | "output_type": "stream", 771 | "text": [ 772 | "जिसके लिये उन्हें ▁all ▁भ्रम ▁तो ▁एम्पायर ्स ▁को ▁बनाने ▁में ▁बचा ना ▁होगा ▁लेकिन ▁इसके ▁लिए ▁उन्हें ▁बादशाह ▁के ▁खिलाफ ▁ऐसी ▁शिकायत ▁सुन नी ▁चाहिये । ▁ऐसी ▁प्रवृति ▁नहीं ▁है ▁कि ▁प्रसिद्द ▁व्यक्तियों ▁की ▁तरह ▁काम ▁करने ▁के ▁लिए ▁बाहर ▁की ▁ओर\n", 773 | "जिसके लिये उन्हें ▁सर्वश्रेष्ठ ▁सहायक ▁अभिनेत्री ▁के ▁लिये ▁नोबेल ▁पुरस्कार ▁प्राप्त ▁हुआ ▁है , ▁उन्होंने ▁उन्हें ▁सर्वश्रेष्ठ ▁गीतकार ▁के ▁लिए ▁' का उ नन्दन ▁फिल्म ▁पुरस्कार ' ▁दिया ▁था । ▁उन्होंने ▁एक ▁लाख ▁से ▁अधिक ▁साधनों ▁से ▁बैठकर ▁ही ▁बतौर ▁अभिनेता ▁की ▁कहानी\n" 774 | ] 775 | } 776 | ], 777 | "source": [ 778 | "print(\"\\n\".join(learn.predict(TEXT, N_WORDS, temperature=0.9) for _ in range(N_SENTENCES)))" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 33, 784 | "metadata": {}, 785 | "outputs": [ 786 | { 787 | "data": { 788 | "text/plain": [ 789 | "34.064916056257296" 790 | ] 791 | }, 792 | "execution_count": 33, 793 | "metadata": {}, 794 | "output_type": "execute_result" 795 | } 796 | ], 797 | "source": [ 798 | "np.exp(3.528268)" 799 | ] 800 | }, 801 | { 
802 | "cell_type": "code", 803 | "execution_count": 34, 804 | "metadata": {}, 805 | "outputs": [], 806 | "source": [ 807 | "defaults.device = torch.device('cpu')\n", 808 | "learn.model.eval()\n", 809 | "learn.export()" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": 35, 815 | "metadata": {}, 816 | "outputs": [ 817 | { 818 | "data": { 819 | "text/plain": [ 820 | "PosixPath('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')" 821 | ] 822 | }, 823 | "execution_count": 35, 824 | "metadata": {}, 825 | "output_type": "execute_result" 826 | } 827 | ], 828 | "source": [ 829 | "path" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": 36, 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [ 838 | "# learn = load_learner(path / 'HindiDataset')" 839 | ] 840 | }, 841 | { 842 | "cell_type": "code", 843 | "execution_count": 37, 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "encoder = get_model(learn.model)[0]" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": 38, 853 | "metadata": {}, 854 | "outputs": [ 855 | { 856 | "data": { 857 | "text/plain": [ 858 | "torch.Size([30000, 400])" 859 | ] 860 | }, 861 | "execution_count": 38, 862 | "metadata": {}, 863 | "output_type": "execute_result" 864 | } 865 | ], 866 | "source": [ 867 | "encoder.state_dict()['encoder.weight'].shape" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 39, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "embeddings = encoder.state_dict()['encoder.weight']" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": 40, 882 | "metadata": {}, 883 | "outputs": [], 884 | "source": [ 885 | "embeddings = np.array(embeddings)" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 41, 891 | "metadata": {}, 892 | "outputs": [ 893 | { 894 | "data": { 895 | "text/plain": [ 896 | "(400,)" 897 | ] 898 | }, 899 | "execution_count": 41, 900 | "metadata": {}, 901 | "output_type": "execute_result" 902 | } 903 | ], 904 | "source": [ 905 | "embeddings[0].shape" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": 42, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "df = pd.DataFrame(embeddings)" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": 43, 920 | "metadata": {}, 921 | "outputs": [ 922 | { 923 | "data": { 924 | "text/plain": [ 925 | "(30000, 400)" 926 | ] 927 | }, 928 | "execution_count": 43, 929 | "metadata": {}, 930 | "output_type": "execute_result" 931 | } 932 | ], 933 | "source": [ 934 | "df.shape" 935 | ] 936 | }, 937 | { 938 | "cell_type": "code", 939 | "execution_count": 44, 940 | "metadata": {}, 941 | "outputs": [], 942 | "source": [ 943 | "df.to_csv('ulmfit_large_embeddings.tsv', sep='\\t', index=False, header=False)" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": 45, 949 | "metadata": {}, 950 | "outputs": [ 951 | { 952 | "data": { 953 | "text/html": [ 954 | "
\n", 955 | "\n", 968 | "\n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | "
0123456789...390391392393394395396397398399
00.0702600.161722-0.2838960.292114-0.033296-0.1386060.0086230.384469-0.056585-0.417674...-0.0130750.1866850.006372-0.027651-0.525675-0.067528-0.1884271.055574-0.360146-0.150507
10.0039900.034629-0.4067000.121944-0.0199290.0183090.0583610.0000140.0211540.013913...0.0350470.0272070.0796660.0672250.101667-0.001406-0.0602580.000648-0.029308-0.001614
20.0032570.053062-0.4069850.125515-0.0121830.0198950.0601970.0059500.0254590.019870...0.0300350.0208030.0801440.0575810.0946120.010723-0.059093-0.004867-0.027049-0.000550
30.1589010.116016-0.231982-0.141738-0.582064-0.1210100.0381290.0367280.7062860.113444...0.0261630.0619850.0720410.0552790.368510-0.168286-0.0136080.0102620.433321-0.062238
40.5153940.150388-0.560811-0.217824-0.0884850.2434840.0386390.2855780.639499-0.078303...-0.1416960.485272-0.2278860.1238810.7825690.233689-0.178229-0.279273-0.4877840.055523
\n", 1118 | "

5 rows × 400 columns

\n", 1119 | "
" 1120 | ], 1121 | "text/plain": [ 1122 | " 0 1 2 3 4 5 6 \\\n", 1123 | "0 0.070260 0.161722 -0.283896 0.292114 -0.033296 -0.138606 0.008623 \n", 1124 | "1 0.003990 0.034629 -0.406700 0.121944 -0.019929 0.018309 0.058361 \n", 1125 | "2 0.003257 0.053062 -0.406985 0.125515 -0.012183 0.019895 0.060197 \n", 1126 | "3 0.158901 0.116016 -0.231982 -0.141738 -0.582064 -0.121010 0.038129 \n", 1127 | "4 0.515394 0.150388 -0.560811 -0.217824 -0.088485 0.243484 0.038639 \n", 1128 | "\n", 1129 | " 7 8 9 ... 390 391 392 393 \\\n", 1130 | "0 0.384469 -0.056585 -0.417674 ... -0.013075 0.186685 0.006372 -0.027651 \n", 1131 | "1 0.000014 0.021154 0.013913 ... 0.035047 0.027207 0.079666 0.067225 \n", 1132 | "2 0.005950 0.025459 0.019870 ... 0.030035 0.020803 0.080144 0.057581 \n", 1133 | "3 0.036728 0.706286 0.113444 ... 0.026163 0.061985 0.072041 0.055279 \n", 1134 | "4 0.285578 0.639499 -0.078303 ... -0.141696 0.485272 -0.227886 0.123881 \n", 1135 | "\n", 1136 | " 394 395 396 397 398 399 \n", 1137 | "0 -0.525675 -0.067528 -0.188427 1.055574 -0.360146 -0.150507 \n", 1138 | "1 0.101667 -0.001406 -0.060258 0.000648 -0.029308 -0.001614 \n", 1139 | "2 0.094612 0.010723 -0.059093 -0.004867 -0.027049 -0.000550 \n", 1140 | "3 0.368510 -0.168286 -0.013608 0.010262 0.433321 -0.062238 \n", 1141 | "4 0.782569 0.233689 -0.178229 -0.279273 -0.487784 0.055523 \n", 1142 | "\n", 1143 | "[5 rows x 400 columns]" 1144 | ] 1145 | }, 1146 | "execution_count": 45, 1147 | "metadata": {}, 1148 | "output_type": "execute_result" 1149 | } 1150 | ], 1151 | "source": [ 1152 | "df.head()" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": 46, 1158 | "metadata": {}, 1159 | "outputs": [ 1160 | { 1161 | "data": { 1162 | "text/plain": [ 1163 | "(30000, 400)" 1164 | ] 1165 | }, 1166 | "execution_count": 46, 1167 | "metadata": {}, 1168 | "output_type": "execute_result" 1169 | } 1170 | ], 1171 | "source": [ 1172 | "df.shape" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": 47, 1178 | "metadata": {}, 1179 | "outputs": [ 1180 | { 1181 | "data": { 1182 | "text/plain": [ 1183 | "30000" 1184 | ] 1185 | }, 1186 | "execution_count": 47, 1187 | "metadata": {}, 1188 | "output_type": "execute_result" 1189 | } 1190 | ], 1191 | "source": [ 1192 | "len(itos)" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": 48, 1198 | "metadata": {}, 1199 | "outputs": [], 1200 | "source": [ 1201 | "df2 = pd.DataFrame(itos)" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": 49, 1207 | "metadata": {}, 1208 | "outputs": [ 1209 | { 1210 | "data": { 1211 | "text/html": [ 1212 | "
\n", 1213 | "\n", 1226 | "\n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | "
0
0<unk>
1<s>
2</s>
3▁के
4
\n", 1256 | "
" 1257 | ], 1258 | "text/plain": [ 1259 | " 0\n", 1260 | "0 \n", 1261 | "1 \n", 1262 | "2 \n", 1263 | "3 ▁के\n", 1264 | "4 ।" 1265 | ] 1266 | }, 1267 | "execution_count": 49, 1268 | "metadata": {}, 1269 | "output_type": "execute_result" 1270 | } 1271 | ], 1272 | "source": [ 1273 | "df2.head()" 1274 | ] 1275 | }, 1276 | { 1277 | "cell_type": "code", 1278 | "execution_count": 50, 1279 | "metadata": {}, 1280 | "outputs": [ 1281 | { 1282 | "data": { 1283 | "text/plain": [ 1284 | "(30000, 1)" 1285 | ] 1286 | }, 1287 | "execution_count": 50, 1288 | "metadata": {}, 1289 | "output_type": "execute_result" 1290 | } 1291 | ], 1292 | "source": [ 1293 | "df2.shape" 1294 | ] 1295 | }, 1296 | { 1297 | "cell_type": "code", 1298 | "execution_count": 51, 1299 | "metadata": {}, 1300 | "outputs": [], 1301 | "source": [ 1302 | "df2.to_csv('ulmfit_large_embeddings_metadata.tsv', sep='\\t', index=False, header=False)" 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "code", 1307 | "execution_count": 52, 1308 | "metadata": { 1309 | "scrolled": true 1310 | }, 1311 | "outputs": [ 1312 | { 1313 | "data": { 1314 | "text/plain": [ 1315 | "tensor([ 3.9896e-03, 3.4629e-02, -4.0670e-01, 1.2194e-01, -1.9929e-02,\n", 1316 | " 1.8309e-02, 5.8361e-02, 1.4350e-05, 2.1154e-02, 1.3913e-02,\n", 1317 | " -4.1637e-02, 3.5261e-02, 2.2660e-02, 4.5717e-03, -2.7113e-02,\n", 1318 | " -6.5313e-02, -1.1359e-03, -9.3003e-02, -1.2248e-02, -1.6794e-02,\n", 1319 | " 2.7482e-02, -1.4375e-02, 4.3184e-02, -4.9501e-02, 7.8045e-03,\n", 1320 | " 1.9203e-01, -9.0404e-02, 7.9326e-03, 9.6202e-03, 2.8788e-02,\n", 1321 | " 1.9584e-02, 7.5046e-02, -1.9784e-03, -4.2908e-02, 4.9432e-03,\n", 1322 | " -6.1185e-02, -3.6334e-02, 3.4995e-02, -3.8668e-02, 4.6059e-02,\n", 1323 | " -6.2501e-02, 2.9141e-02, -1.1699e-01, -3.2993e-02, -1.2340e-01,\n", 1324 | " -5.9029e-03, 6.7185e-02, -3.7538e-02, 5.7789e-02, -4.2478e-02,\n", 1325 | " 1.0205e-02, 8.6885e-02, -5.2492e-04, 4.3381e-02, -2.5761e-02,\n", 1326 | " 2.6653e-02, -6.8218e-02, -1.6502e-02, 8.1296e-03, -1.9546e-02,\n", 1327 | " -1.0692e-01, 8.3087e-02, 6.1522e-02, 1.0296e-02, 2.5809e-02,\n", 1328 | " 3.5692e-02, 3.6327e-02, 6.2493e-02, 1.5491e-01, -1.2330e-01,\n", 1329 | " 2.0124e-01, -7.0733e-03, 1.2579e-02, -3.7823e-02, 2.2434e-02,\n", 1330 | " 8.1680e-03, 3.2014e-02, -4.1190e-02, -2.6190e-02, 8.7246e-02,\n", 1331 | " -1.3912e-01, 1.1507e-02, -8.1481e-02, -4.6933e-02, -1.1786e-02,\n", 1332 | " 1.3372e-02, 3.2588e-02, 1.8975e-02, 1.0863e-01, 5.1077e-02,\n", 1333 | " -5.5736e-02, 2.0251e-03, -1.3002e-02, -5.9756e-02, 1.1953e-01,\n", 1334 | " 1.9101e-02, -4.0604e-02, -8.4240e-02, 4.7713e-02, 5.1612e-03,\n", 1335 | " 1.8735e-02, 1.4651e-02, 9.1230e-01, -5.4793e-02, -7.5600e-02,\n", 1336 | " 1.7903e-04, 1.1469e-01, -8.4688e-04, 4.0019e-02, 2.3708e-02,\n", 1337 | " 2.3580e-02, 3.5350e-02, -4.7755e-02, 3.1506e-02, 2.0875e-02,\n", 1338 | " -1.1744e-03, 7.1475e-02, 3.2980e-02, -5.4827e-02, -7.8449e-03,\n", 1339 | " 2.3295e-02, 4.4122e-02, -4.0129e-02, -2.0578e-02, 8.7522e-03,\n", 1340 | " 6.1318e-02, 2.2507e-02, -3.3308e-02, -5.4078e-02, 5.5343e-02,\n", 1341 | " 6.0475e-02, -3.1914e-03, -8.2345e-02, 2.7717e-02, -4.5604e-02,\n", 1342 | " 1.5436e-02, -3.7526e-02, -1.0993e-01, 5.0225e-02, 5.8796e-03,\n", 1343 | " 1.4176e-02, 4.7994e-02, 2.0361e-02, 5.7341e-02, -2.4026e+00,\n", 1344 | " -1.7308e-02, 1.4084e-02, -5.1157e-02, 1.4195e-02, 5.2165e-02,\n", 1345 | " -6.9561e-02, 2.6990e-02, 5.3683e-02, 1.0020e-01, 3.1138e-01,\n", 1346 | " -1.9321e-02, 1.9731e-02, -2.3378e-03, 3.0166e-02, 2.1067e-02,\n", 1347 | " -9.8012e-02, 
-6.6084e-02, -1.9933e-01, -4.9470e-02, -1.5310e-01,\n", 1348 | " -4.7517e-02, -1.2875e-02, -6.9725e-02, 2.4322e-02, -1.2482e-02,\n", 1349 | " 2.1977e-02, 2.2307e-02, 3.1355e-02, 3.2564e-02, -2.8850e-02,\n", 1350 | " -3.9165e-02, -2.7044e-02, -2.4999e-02, -6.2843e-03, 2.3731e-02,\n", 1351 | " 6.6348e-02, 1.5963e-02, 7.5836e-02, -3.6905e-03, -4.0696e-02,\n", 1352 | " 5.9066e-02, 6.8145e-02, 2.8232e-02, 1.5297e-02, -3.7014e-01,\n", 1353 | " 1.4036e-02, -2.8978e-02, -9.6864e-02, -5.1100e-02, -6.0850e-02,\n", 1354 | " 1.2948e-02, 2.4521e-03, -1.0369e-03, 2.6216e-02, -3.6271e-02,\n", 1355 | " -2.1348e-02, 3.9399e-03, 2.1349e-02, -1.2251e-02, -9.4905e-02,\n", 1356 | " 2.4837e-02, 8.5921e-02, 2.6679e-02, 1.1634e-03, -2.5319e-02,\n", 1357 | " -4.0517e-02, -1.7464e-02, 3.0301e-02, -1.1816e-02, 7.5057e-03,\n", 1358 | " 7.0751e-02, 1.8184e-02, -3.2228e-02, 3.9905e-02, 1.3867e-01,\n", 1359 | " -3.8270e-02, 2.8736e-02, -1.0112e-01, -4.4916e-02, -1.7799e-03,\n", 1360 | " -1.0043e-03, -3.4287e-03, 1.9268e-02, 1.1981e-02, 1.7961e-03,\n", 1361 | " 2.2495e-02, -3.1702e-02, -9.3024e-02, 4.9632e-02, 2.8290e-02,\n", 1362 | " 2.4110e-02, 5.9510e-02, 2.0343e-02, -6.9928e-02, -7.3729e-02,\n", 1363 | " 2.5439e-03, -2.9478e-03, -3.0492e-02, -2.9956e-02, 2.8754e-02,\n", 1364 | " 3.4454e-02, -4.8685e-02, -1.6420e-02, 4.0489e-02, 4.9390e-02,\n", 1365 | " -3.2069e-02, 4.1604e-02, -4.1826e-02, -4.7578e-02, -2.8571e-02,\n", 1366 | " -5.4877e-02, -8.2792e-02, -3.0598e-03, -2.5949e-02, -2.1504e-03,\n", 1367 | " 6.8280e-02, -1.8758e-02, -4.8444e-02, 2.5983e-02, 3.2601e-02,\n", 1368 | " -2.6121e-02, -3.9263e-02, 1.0144e-01, 5.0456e-03, -3.7807e-02,\n", 1369 | " -1.0898e-01, -1.5955e-01, 2.3768e-02, -1.3324e-01, 3.4804e-03,\n", 1370 | " -3.8621e-02, -1.1434e-03, 6.4886e-02, 1.0108e-03, 3.7277e-02,\n", 1371 | " 5.8445e-03, 2.6917e-02, -1.5601e-02, -2.3291e-02, 1.5676e-02,\n", 1372 | " -4.9895e-02, 1.9434e-02, 5.6187e-02, -1.4012e-02, -2.7559e-02,\n", 1373 | " -1.2770e-02, 1.0889e-01, 3.3884e-03, 3.4702e-02, -4.5949e-02,\n", 1374 | " 1.5612e-02, -9.3276e-03, 4.5600e-02, 4.9732e-02, 5.3435e-02,\n", 1375 | " -4.6817e-02, -2.7387e-02, -6.1836e-02, 1.2093e-02, -6.9735e-02,\n", 1376 | " 2.5105e-03, 7.0971e-04, 4.4027e-02, -4.6801e-02, -1.6921e-02,\n", 1377 | " 8.1619e-02, 7.2840e-03, 4.2603e-02, 9.1741e-02, -6.8875e-02,\n", 1378 | " -9.8295e-03, 3.6178e-03, 4.1215e-03, -2.2342e-02, -1.5675e-02,\n", 1379 | " 2.0410e-02, 2.3000e-02, -4.9163e-02, -2.6788e-02, -4.2531e-02,\n", 1380 | " 1.4527e-03, -2.7327e-02, -1.0790e-02, 2.5463e-03, -5.3925e-02,\n", 1381 | " 3.5149e-02, -5.7621e-02, -5.6419e-02, -8.6580e-02, 4.8375e-02,\n", 1382 | " -1.9785e-01, 3.4426e-02, -3.2230e-02, 1.0530e-02, -3.8754e-02,\n", 1383 | " -4.5052e-02, 1.5462e-02, 4.0886e-02, -2.7170e-02, -4.0465e-02,\n", 1384 | " 2.8688e-02, 3.4733e-02, -1.9942e-02, 2.3305e-02, -1.8977e-02,\n", 1385 | " 3.3000e-03, -1.1250e-02, 1.1707e-01, -3.0184e-02, -2.2679e+00,\n", 1386 | " 4.4809e-02, -4.3585e-02, 1.7188e-02, 6.3041e-02, -8.1576e-03,\n", 1387 | " -1.7810e-02, 1.7356e-02, 4.4350e-01, 6.4716e-02, -2.2541e-02,\n", 1388 | " -3.4297e-02, -9.7319e-03, 3.7758e-03, -5.0741e-02, 7.8162e-02,\n", 1389 | " 8.7522e-02, 1.5987e-02, -3.0472e-02, -4.5758e-02, 6.4824e-02,\n", 1390 | " -2.0536e-01, -1.3571e-02, -6.5024e-02, -8.4640e-03, -6.8738e-02,\n", 1391 | " -1.5000e-02, -4.5119e-02, -1.6405e-02, -6.8137e-03, 1.3170e-02,\n", 1392 | " 1.1337e-02, -2.7642e-03, 7.2331e-03, 1.3731e-02, 1.6689e-02,\n", 1393 | " 3.5047e-02, 2.7207e-02, 7.9666e-02, 6.7225e-02, 1.0167e-01,\n", 1394 | " -1.4059e-03, 
-6.0258e-02, 6.4771e-04, -2.9308e-02, -1.6137e-03],\n", 1395 | " device='cuda:0')" 1396 | ] 1397 | }, 1398 | "execution_count": 52, 1399 | "metadata": {}, 1400 | "output_type": "execute_result" 1401 | } 1402 | ], 1403 | "source": [ 1404 | "encoder.state_dict()['encoder.weight'][1]" 1405 | ] 1406 | }, 1407 | { 1408 | "cell_type": "code", 1409 | "execution_count": null, 1410 | "metadata": {}, 1411 | "outputs": [], 1412 | "source": [] 1413 | } 1414 | ], 1415 | "metadata": { 1416 | "kernelspec": { 1417 | "display_name": "Python 3", 1418 | "language": "python", 1419 | "name": "python3" 1420 | }, 1421 | "language_info": { 1422 | "codemirror_mode": { 1423 | "name": "ipython", 1424 | "version": 3 1425 | }, 1426 | "file_extension": ".py", 1427 | "mimetype": "text/x-python", 1428 | "name": "python", 1429 | "nbconvert_exporter": "python", 1430 | "pygments_lexer": "ipython3", 1431 | "version": "3.7.3" 1432 | } 1433 | }, 1434 | "nbformat": 4, 1435 | "nbformat_minor": 2 1436 | } 1437 | -------------------------------------------------------------------------------- /language-model/Hindi_Language_Model_TransformerXL_172k.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%reload_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from fastai.text import *\n", 21 | "import numpy as np\n", 22 | "from sklearn.model_selection import train_test_split\n", 23 | "import pickle\n", 24 | "import sentencepiece as spm" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "('1.0.57', '1.1.0')" 36 | ] 37 | }, 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "import fastai, torch\n", 45 | "fastai.__version__ , torch.__version__" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "Mon Oct 14 23:10:26 2019 \r\n", 58 | "+-----------------------------------------------------------------------------+\r\n", 59 | "| NVIDIA-SMI 390.116 Driver Version: 390.116 |\r\n", 60 | "|-------------------------------+----------------------+----------------------+\r\n", 61 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n", 62 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n", 63 | "|===============================+======================+======================|\r\n", 64 | "| 0 GeForce GTX 108... 
Off | 00000000:01:00.0 On | N/A |\r\n", 65 | "| 61% 66C P0 77W / 250W | 235MiB / 11177MiB | 0% Default |\r\n", 66 | "+-------------------------------+----------------------+----------------------+\r\n", 67 | " \r\n", 68 | "+-----------------------------------------------------------------------------+\r\n", 69 | "| Processes: GPU Memory |\r\n", 70 | "| GPU PID Type Process name Usage |\r\n", 71 | "|=============================================================================|\r\n", 72 | "| 0 1039 G /usr/lib/xorg/Xorg 110MiB |\r\n", 73 | "| 0 2165 G compiz 113MiB |\r\n", 74 | "+-----------------------------------------------------------------------------+\r\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "!nvidia-smi" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "torch.cuda.set_device(0)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "name": "stdout", 98 | "output_type": "stream", 99 | "text": [ 100 | "/home/gaurav/PycharmProjects/nlp-for-hindi/language-model\r\n" 101 | ] 102 | } 103 | ], 104 | "source": [ 105 | "!pwd" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 7, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "path = Path('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 10, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# train_files, test_files = train_test_split(files, test_size=0.2)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 11, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "# len(train_files), len(test_files)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 12, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# str(train_files[0]).split('/')[-1][:-4]" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 13, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# # Preparing dataset for fastai\n", 151 | "# for file in train_files:\n", 152 | "# with open(file, 'rb') as f:\n", 153 | "# text = pickle.load(f)\n", 154 | "# with open(path/'hindi_transformer'/'train'/(str(file).split('/')[-1][:-4]+'.txt'), \"w\") as text_file:\n", 155 | "# text_file.write(text)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 14, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "# for file in test_files:\n", 165 | "# with open(file, 'rb') as f:\n", 166 | "# text = pickle.load(f)\n", 167 | "# with open(path/'hindi_transformer'/'valid'/(str(file).split('/')[-1][:-4]+'.txt'), \"w\") as text_file:\n", 168 | "# text_file.write(text)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 15, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# from inltk.tokenizer import HindiTokenizer" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 16, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "# HindiTokenizer" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 17, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "text/plain": [ 197 | "PosixPath('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')" 198 | ] 199 | }, 200 | "execution_count": 17, 201 | "metadata": {}, 
202 | "output_type": "execute_result" 203 | } 204 | ], 205 | "source": [ 206 | "path" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 18, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "class HindiTokenizer(BaseTokenizer):\n", 216 | " def __init__(self, lang:str):\n", 217 | " self.lang = lang\n", 218 | " self.sp = spm.SentencePieceProcessor()\n", 219 | " self.sp.Load(str(path/\"../tokenizer/hindi_lm_large.model\"))\n", 220 | " \n", 221 | " def tokenizer(self, t:str) -> List[str]:\n", 222 | " return self.sp.EncodeAsPieces(t)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 19, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "sp = spm.SentencePieceProcessor()\n", 232 | "sp.Load(str(path/\"../tokenizer/hindi_lm_large.model\"))\n", 233 | "itos = [sp.IdToPiece(int(i)) for i in range(30000)]" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 20, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "30000" 245 | ] 246 | }, 247 | "execution_count": 20, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "len(itos)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 21, 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "['',\n", 265 | " '',\n", 266 | " '',\n", 267 | " '▁के',\n", 268 | " '।',\n", 269 | " '▁में',\n", 270 | " '▁है',\n", 271 | " ',',\n", 272 | " '▁की',\n", 273 | " '▁',\n", 274 | " '▁और',\n", 275 | " '▁से',\n", 276 | " '▁का',\n", 277 | " '▁को',\n", 278 | " '▁हैं',\n", 279 | " '▁एक',\n", 280 | " '▁पर',\n", 281 | " '.',\n", 282 | " '-',\n", 283 | " '▁ने']" 284 | ] 285 | }, 286 | "execution_count": 21, 287 | "metadata": {}, 288 | "output_type": "execute_result" 289 | } 290 | ], 291 | "source": [ 292 | "itos[:20]" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 22, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "# 30,000 is the vocab size that we chose in sentencepiece\n", 302 | "hindi_vocab = Vocab(itos)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 23, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "tokenizer = Tokenizer(tok_func=HindiTokenizer, lang='hi')" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 24, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "['xxunk',\n", 323 | " 'xxpad',\n", 324 | " 'xxbos',\n", 325 | " 'xxeos',\n", 326 | " 'xxfld',\n", 327 | " 'xxmaj',\n", 328 | " 'xxup',\n", 329 | " 'xxrep',\n", 330 | " 'xxwrep']" 331 | ] 332 | }, 333 | "execution_count": 24, 334 | "metadata": {}, 335 | "output_type": "execute_result" 336 | } 337 | ], 338 | "source": [ 339 | "tokenizer.special_cases" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 25, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "PosixPath('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')" 351 | ] 352 | }, 353 | "execution_count": 25, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "path" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 26, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "data_lm = TextLMDataBunch.from_folder(path=path/'hindi_transformer', tokenizer=tokenizer, 
vocab=hindi_vocab)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 27, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "64" 380 | ] 381 | }, 382 | "execution_count": 27, 383 | "metadata": {}, 384 | "output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "data_lm.batch_size" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 28, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "data_lm.save()" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 29, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "data": { 407 | "text/html": [ 408 | "\n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | "
idx    text
0▁जे स्टन ▁जेम्स ▁बॉण्ड ▁1953 ▁में ▁अंग्रेज़ ▁लेखक ▁इयान ▁फ़्लेम िंग ▁द्वारा ▁रचित ▁एक ▁काल्पनिक ▁पात्र ▁है । ▁007 ▁के ▁गुप्त ▁नाम ▁से ▁प्रसिद्ध ▁यह ▁एजेंट ▁फ़्लेम िंग ▁की ▁बारह ▁पुस्तकों ▁व ▁दो ▁लघुकथा ओं ▁में ▁मौजूद ▁है । ▁1964 ▁में ▁फ़्लेम िंग ▁की ▁मृत्यु ▁के ▁पश्चात ▁छः ▁अन्य ▁लेखकों ▁ने ▁बॉण्ड ▁की ▁आधि कृत ▁पुस्तकें ▁लिखी ▁हैं , ▁जिनमें ▁किंग्स ले ▁ऐ मिस , ▁क्रिस्टोफ़र ▁वुड्स , ▁जॉन ▁गार्ड
1▁जो ▁क ल्हण ▁द्वारा ▁12 वीं ▁शताब्दी ▁ई . ▁में ▁लिखा ▁गया ▁था । ▁तब ▁तक ▁यहां ▁पूर्ण ▁हिन्दू ▁राज्य ▁रहा ▁था । यह ▁अशोक ▁महान ▁के ▁साम्राज्य ▁का ▁हिस्सा ▁भी ▁रहा । ▁लगभग ▁तीसरी ▁शताब्दी ▁में ▁अशोक ▁का ▁शासन ▁रहा ▁था । ▁तभी ▁यहां ▁बौद्ध ▁धर्म ▁का ▁आगमन ▁हुआ , ▁जो ▁आगे ▁चलकर ▁कुषाण ों ▁के ▁अधीन ▁सम ृ ध्द ▁हुआ ▁था । उ ज्ज ैन ▁के ▁महाराज ▁विक्रमादित्य
2▁पाकिस्तान ▁शामिल ▁हैं ▁जबकि ▁भारी ▁मात्रा ▁में ▁कैन ोला ▁ऑयल ▁और ▁मील ▁संयुक्त ▁राज्य ▁अमेरिका ▁जाता ▁है ▁और ▁इसकी ▁छोटी ▁मात्रा एं ▁मैक्सिको , ▁चीन ▁और ▁यूरोप ▁में ▁भेज ▁दी ▁जाती ▁हैं । ▁वर्ष ▁2002 - 2003 ▁के ▁मौसम ▁में ▁दुनिया ▁भर ▁में ▁लगभग ▁14 ▁मिलियन ▁मैट्रिक ▁टन ▁रे प सीड ▁ऑयल ▁का ▁उत्पादन ▁हुआ ▁था । ▁कैन ोला ▁को ▁पारंपरिक ▁पौध ▁प्रजनन ▁के ▁जरिये ▁रे प सीड ▁से ▁विकसित
3▁फिल्मों ▁में ▁दिखी ं । ▁पहले ▁अभिषेक ▁कपूर ▁की ▁फि तूर ▁में ▁जो ▁चार्ल्स ▁डिकेंस ▁के ▁उपन्यास ▁ग्रेट ▁एक्स्प ेक्ट ै शन ▁पर ▁आधारित ▁थी । ▁फिल्म ▁में ▁आदित्य ▁रॉय ▁कपूर ▁और ▁तब ु ▁भी ▁थे । ▁बाद ▁में ▁बार ▁बार ▁देखो ▁में ▁वह ▁सिद्धार्थ ▁मल्होत्रा ▁के ▁साथ ▁नज़र ▁आई । ▁दोनों ▁ही ▁फिल्म ▁सफल ▁नहीं ▁रही । ▁हिन्दुस्तान ▁के ▁विशाल ▁ठाकुर ▁ने ▁लिखा : ▁\" बार ▁बार ▁देखो ▁में
4▁नहीं ▁आते ▁हैं , ▁जबकि ▁मु वत् ता ▁के ▁सभी ▁हदीस ▁अन्य ▁सही ह ▁किताबों ▁में ▁शामिल ▁हैं । ▁सुन्नी ▁मुस्लिम ▁छह ▁प्रमुख ▁हदीस ▁संग्रह ों ▁को ▁उनके ▁सबसे ▁महत्वपूर्ण ▁मानते ▁हैं , ▁हालांकि ▁प्रामाणिकता ▁का ▁क्रम ▁मज़ ह ब ों ▁के ▁बीच ▁भिन्न ▁होता ▁है ▁इब्न ▁हज र ▁के ▁अनुसार , ▁पहले ▁दो , ▁जिसे ▁आमतौर ▁पर ▁दो ▁सह हि ह ▁के ▁रूप ▁में ▁जाना ▁जाता ▁है , ▁उनकी
" 438 | ], 439 | "text/plain": [ 440 | "" 441 | ] 442 | }, 443 | "metadata": {}, 444 | "output_type": "display_data" 445 | } 446 | ], 447 | "source": [ 448 | "data_lm.show_batch()" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 35, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "??language_model_learner" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 30, 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "data": { 467 | "text/plain": [ 468 | "30000" 469 | ] 470 | }, 471 | "execution_count": 30, 472 | "metadata": {}, 473 | "output_type": "execute_result" 474 | } 475 | ], 476 | "source": [ 477 | "len(data_lm.vocab.itos)" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 31, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "learn = language_model_learner(data_lm, TransformerXL, pretrained=False)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 32, 492 | "metadata": {}, 493 | "outputs": [ 494 | { 495 | "data": { 496 | "text/plain": [ 497 | "20" 498 | ] 499 | }, 500 | "execution_count": 32, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "gc.collect()" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 33, 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "data": { 516 | "text/plain": [ 517 | "SequentialRNN(\n", 518 | " (0): TransformerXL(\n", 519 | " (encoder): Embedding(30000, 410)\n", 520 | " (pos_enc): PositionalEncoding()\n", 521 | " (drop_emb): Dropout(p=0.1)\n", 522 | " (layers): ModuleList(\n", 523 | " (0): DecoderLayer(\n", 524 | " (mhra): MultiHeadRelativeAttention(\n", 525 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 526 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 527 | " (drop_att): Dropout(p=0.1)\n", 528 | " (drop_res): Dropout(p=0.1)\n", 529 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 530 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 531 | " )\n", 532 | " (ff): SequentialEx(\n", 533 | " (layers): ModuleList(\n", 534 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 535 | " (1): ReLU(inplace)\n", 536 | " (2): Dropout(p=0.1)\n", 537 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 538 | " (4): Dropout(p=0.1)\n", 539 | " (5): MergeLayer()\n", 540 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 541 | " )\n", 542 | " )\n", 543 | " )\n", 544 | " (1): DecoderLayer(\n", 545 | " (mhra): MultiHeadRelativeAttention(\n", 546 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 547 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 548 | " (drop_att): Dropout(p=0.1)\n", 549 | " (drop_res): Dropout(p=0.1)\n", 550 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 551 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 552 | " )\n", 553 | " (ff): SequentialEx(\n", 554 | " (layers): ModuleList(\n", 555 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 556 | " (1): ReLU(inplace)\n", 557 | " (2): Dropout(p=0.1)\n", 558 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 559 | " (4): Dropout(p=0.1)\n", 560 | " (5): MergeLayer()\n", 561 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 562 | " )\n", 563 | " )\n", 564 | " )\n", 565 | " 
(2): DecoderLayer(\n", 566 | " (mhra): MultiHeadRelativeAttention(\n", 567 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 568 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 569 | " (drop_att): Dropout(p=0.1)\n", 570 | " (drop_res): Dropout(p=0.1)\n", 571 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 572 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 573 | " )\n", 574 | " (ff): SequentialEx(\n", 575 | " (layers): ModuleList(\n", 576 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 577 | " (1): ReLU(inplace)\n", 578 | " (2): Dropout(p=0.1)\n", 579 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 580 | " (4): Dropout(p=0.1)\n", 581 | " (5): MergeLayer()\n", 582 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 583 | " )\n", 584 | " )\n", 585 | " )\n", 586 | " (3): DecoderLayer(\n", 587 | " (mhra): MultiHeadRelativeAttention(\n", 588 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 589 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 590 | " (drop_att): Dropout(p=0.1)\n", 591 | " (drop_res): Dropout(p=0.1)\n", 592 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 593 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 594 | " )\n", 595 | " (ff): SequentialEx(\n", 596 | " (layers): ModuleList(\n", 597 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 598 | " (1): ReLU(inplace)\n", 599 | " (2): Dropout(p=0.1)\n", 600 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 601 | " (4): Dropout(p=0.1)\n", 602 | " (5): MergeLayer()\n", 603 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 604 | " )\n", 605 | " )\n", 606 | " )\n", 607 | " (4): DecoderLayer(\n", 608 | " (mhra): MultiHeadRelativeAttention(\n", 609 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 610 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 611 | " (drop_att): Dropout(p=0.1)\n", 612 | " (drop_res): Dropout(p=0.1)\n", 613 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 614 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 615 | " )\n", 616 | " (ff): SequentialEx(\n", 617 | " (layers): ModuleList(\n", 618 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 619 | " (1): ReLU(inplace)\n", 620 | " (2): Dropout(p=0.1)\n", 621 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 622 | " (4): Dropout(p=0.1)\n", 623 | " (5): MergeLayer()\n", 624 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 625 | " )\n", 626 | " )\n", 627 | " )\n", 628 | " (5): DecoderLayer(\n", 629 | " (mhra): MultiHeadRelativeAttention(\n", 630 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 631 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 632 | " (drop_att): Dropout(p=0.1)\n", 633 | " (drop_res): Dropout(p=0.1)\n", 634 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 635 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 636 | " )\n", 637 | " (ff): SequentialEx(\n", 638 | " (layers): ModuleList(\n", 639 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 640 | " (1): ReLU(inplace)\n", 641 | " (2): Dropout(p=0.1)\n", 642 | " (3): Linear(in_features=2100, out_features=410, 
bias=True)\n", 643 | " (4): Dropout(p=0.1)\n", 644 | " (5): MergeLayer()\n", 645 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 646 | " )\n", 647 | " )\n", 648 | " )\n", 649 | " (6): DecoderLayer(\n", 650 | " (mhra): MultiHeadRelativeAttention(\n", 651 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 652 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 653 | " (drop_att): Dropout(p=0.1)\n", 654 | " (drop_res): Dropout(p=0.1)\n", 655 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 656 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 657 | " )\n", 658 | " (ff): SequentialEx(\n", 659 | " (layers): ModuleList(\n", 660 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 661 | " (1): ReLU(inplace)\n", 662 | " (2): Dropout(p=0.1)\n", 663 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 664 | " (4): Dropout(p=0.1)\n", 665 | " (5): MergeLayer()\n", 666 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 667 | " )\n", 668 | " )\n", 669 | " )\n", 670 | " (7): DecoderLayer(\n", 671 | " (mhra): MultiHeadRelativeAttention(\n", 672 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 673 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 674 | " (drop_att): Dropout(p=0.1)\n", 675 | " (drop_res): Dropout(p=0.1)\n", 676 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 677 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 678 | " )\n", 679 | " (ff): SequentialEx(\n", 680 | " (layers): ModuleList(\n", 681 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 682 | " (1): ReLU(inplace)\n", 683 | " (2): Dropout(p=0.1)\n", 684 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 685 | " (4): Dropout(p=0.1)\n", 686 | " (5): MergeLayer()\n", 687 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 688 | " )\n", 689 | " )\n", 690 | " )\n", 691 | " (8): DecoderLayer(\n", 692 | " (mhra): MultiHeadRelativeAttention(\n", 693 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 694 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 695 | " (drop_att): Dropout(p=0.1)\n", 696 | " (drop_res): Dropout(p=0.1)\n", 697 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 698 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 699 | " )\n", 700 | " (ff): SequentialEx(\n", 701 | " (layers): ModuleList(\n", 702 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 703 | " (1): ReLU(inplace)\n", 704 | " (2): Dropout(p=0.1)\n", 705 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 706 | " (4): Dropout(p=0.1)\n", 707 | " (5): MergeLayer()\n", 708 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 709 | " )\n", 710 | " )\n", 711 | " )\n", 712 | " (9): DecoderLayer(\n", 713 | " (mhra): MultiHeadRelativeAttention(\n", 714 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 715 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 716 | " (drop_att): Dropout(p=0.1)\n", 717 | " (drop_res): Dropout(p=0.1)\n", 718 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 719 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 720 | " )\n", 721 | " (ff): SequentialEx(\n", 722 | " (layers): 
ModuleList(\n", 723 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 724 | " (1): ReLU(inplace)\n", 725 | " (2): Dropout(p=0.1)\n", 726 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 727 | " (4): Dropout(p=0.1)\n", 728 | " (5): MergeLayer()\n", 729 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 730 | " )\n", 731 | " )\n", 732 | " )\n", 733 | " (10): DecoderLayer(\n", 734 | " (mhra): MultiHeadRelativeAttention(\n", 735 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 736 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 737 | " (drop_att): Dropout(p=0.1)\n", 738 | " (drop_res): Dropout(p=0.1)\n", 739 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 740 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 741 | " )\n", 742 | " (ff): SequentialEx(\n", 743 | " (layers): ModuleList(\n", 744 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 745 | " (1): ReLU(inplace)\n", 746 | " (2): Dropout(p=0.1)\n", 747 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 748 | " (4): Dropout(p=0.1)\n", 749 | " (5): MergeLayer()\n", 750 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 751 | " )\n", 752 | " )\n", 753 | " )\n", 754 | " (11): DecoderLayer(\n", 755 | " (mhra): MultiHeadRelativeAttention(\n", 756 | " (attention): Linear(in_features=410, out_features=1230, bias=False)\n", 757 | " (out): Linear(in_features=410, out_features=410, bias=False)\n", 758 | " (drop_att): Dropout(p=0.1)\n", 759 | " (drop_res): Dropout(p=0.1)\n", 760 | " (ln): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 761 | " (r_attn): Linear(in_features=410, out_features=410, bias=False)\n", 762 | " )\n", 763 | " (ff): SequentialEx(\n", 764 | " (layers): ModuleList(\n", 765 | " (0): Linear(in_features=410, out_features=2100, bias=True)\n", 766 | " (1): ReLU(inplace)\n", 767 | " (2): Dropout(p=0.1)\n", 768 | " (3): Linear(in_features=2100, out_features=410, bias=True)\n", 769 | " (4): Dropout(p=0.1)\n", 770 | " (5): MergeLayer()\n", 771 | " (6): LayerNorm(torch.Size([410]), eps=1e-05, elementwise_affine=True)\n", 772 | " )\n", 773 | " )\n", 774 | " )\n", 775 | " )\n", 776 | " )\n", 777 | " (1): LinearDecoder(\n", 778 | " (decoder): Linear(in_features=410, out_features=30000, bias=True)\n", 779 | " (output_dp): RNNDropout()\n", 780 | " )\n", 781 | ")" 782 | ] 783 | }, 784 | "execution_count": 33, 785 | "metadata": {}, 786 | "output_type": "execute_result" 787 | } 788 | ], 789 | "source": [ 790 | "learn.model" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 34, 796 | "metadata": {}, 797 | "outputs": [ 798 | { 799 | "data": { 800 | "text/html": [], 801 | "text/plain": [ 802 | "" 803 | ] 804 | }, 805 | "metadata": {}, 806 | "output_type": "display_data" 807 | }, 808 | { 809 | "name": "stdout", 810 | "output_type": "stream", 811 | "text": [ 812 | "LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "learn.lr_find()" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 35, 823 | "metadata": {}, 824 | "outputs": [ 825 | { 826 | "data": { 827 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZgAAAEGCAYAAABYV4NmAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3deXxV1bn/8c+TmZAQIAmQBJAZcQIlTCrgUCesWmy9lVt7cURb7WRtb61t7e2otv1Va+tA62wd64SCAyqDAyhBQUZBECSEIYwJkDnr98fZwTRNIMLZ2WeffN+v13mdvfdZ+5xncUKerLX2Xsucc4iIiERbQtABiIhIfFKCERERXyjBiIiIL5RgRETEF0owIiLii6SgA4imnJwc16dPn6DDEBEJjYULF25zzuX68d5xlWD69OlDUVFR0GGIiISGma3367196yIzs/vNbKuZLW107A9mttLMPjKz58yscwvnnm1mH5vZJ2b2E79iFBER//g5BvMgcHaTYzOBY5xzxwGrgBubnmRmicDfgHOAo4BJZnaUj3GKiIgPfEswzrm5wI4mx15zztV6u/OBns2cOhL4xDm31jlXDTwBXOBXnCIi4o8gryK7HHi5meMFwIZG+8XesWaZ2RQzKzKzotLS0iiHKCIihyqQBGNmNwG1wD+be7mZYy1OmOacm+qcK3TOFebm+nIhhIiIHII2v4rMzCYDXwZOd83PtFkM9Gq03xMoaYvYREQketq0BWNmZwP/C5zvnNvXQrEFwEAz62tmKcDFwLS2ilFERKLDz8uUHwfmAYPNrNjMrgD+CmQCM81skZnd45XNN7MZAN5FANcBrwIrgKecc8v8ilNEJMxmLt/CPXPWBB1Gs3zrInPOTWrm8H0tlC0BJjTanwHM8Ck0EZG4MXP5Zuau2sY14/sHHcp/0FxkIiIhVl5ZS2ZabE7KogQjIhJiSjAiIuKL8soaMtOSgw6jWUowIiIhphaMiIj4oqyyVi0YERGJvvLKGjqpBSMiItFUXVtPVW29ushERCS6yitrANRFJiIi0VVWGVn9RC0YERGJKrVgRETEF+VqwYiIiB8+b8EowYiISBQ1jMF0UheZiIhEU7kSjIiI+KGhiyxDXWQiIhJN5ZW1dExJJDHBgg6lWUowIiIhFcszKYMSjIhIaMXyTMqgBCMiElpKMCIi4gt1kYmIiC/K1IIRERE/qAUjIiK+KKusjdnFxsDHBGNm95vZVjNb2ujYRWa2zMzqzazwAOeuM7MlZrbIzIr8ilFEJKyqauuojuHFxsDfFsyDwNlNji0FLgTmtuL8U51zw5xzLSYiEZH2av80MR1it4vMt9TnnJtrZn2aHFsBYBabd52KiIRFrE/VD7E7BuOA18xsoZlNOVBBM5tiZkVmVlRaWtpG4YmIBGv/VP2psduCidUEc5Jz7gTgHOBaMxvXUkHn3FTnXKFzrjA3N7ftIhQRCZBaMIfIOVfiPW8FngNGBhuRiEhsifXlkiEGE4yZdTSzzIZt4EwiFweIiIinrD23YMzscWAeMNjMis3sCjObaGbFwBhgupm96pXNN7MZ3qndgbfNbDHwPjDdOfeKX3GKiIRRrC82Bv5eRTaphZeea6ZsCTDB214LDPUrLhGReBDri41BDHaRiYjIwcX6YmOgBCMiEkplFbE9DxkowYiIhFKsrwUDSjAiIqFUXlWjBCMiItFXXlkb0/OQgRKMiEgoRbrIlGBERCTKIouNqYtMRESiLNaXSwYlGBGR0GlYbCyW7+IHJRgRkdAJw0zKoAQjIhI6SjAiIuKLMCw2BkowIiKhoxaMiIj4oqwi9hcbAyUYEZHQUQtGRER8UeaNwegyZRERiaqGFkwsLzYGSjAiIqFTXllLRmpSTC82BkowIiKhE4Z5yEAJRkQkdMKw2BgowYiIhE5ksbHYHuAHJRgRkdBp9y0YM7vfzLaa2dJGxy4ys2VmVm9mhQc492wz+9jMPjGzn/gVo4hIGIVhsTHwtwXzIHB2k2NLgQuBuS2dZGaJwN+Ac4CjgElmdpRPMYqIhE67H+R3zs0FdjQ5tsI59/FBTh0JfOKcW+ucqwaeAC7wKUwRkdAJw2JjEJtjMAXAhkb7xd6xZpnZFDMrMrOi0tJS34MTEQlSZU04FhuD2Ewwzd055Foq7Jyb6pwrdM4V5ubm+hiWiEjwwjIPGcRmgikGejXa7wmUBBSLiEhM2b8WjBLMIVkADDSzvmaWAlwMTAs4JhGRmLC/BRPji42Bv5cpPw7MAwabWbGZXWFmE82sGBgDTDezV72y+WY2A8A5VwtcB7wKrACecs4t8ytOEZEwaUgwnTrEfoLxrY3lnJvUwkvPNVO2BJjQaH8GMMOn0EREQktdZCIi4gsN8ouIiC/KKsOxXDIowYiIhMr+xcZS1YIREZEoCstiY6AEIyISKmGZhwyUYEREQiUsU/WDEoyISKiUVYZjsTFQghERCZWyyhqyQnCTJSjBiIiEyu4KJRgREfFBWUUtnTQGIyIi0VRf7yivrAnFPGSgBCMiEhp7qmupd6iLTEREoqusIjJNTBhWswQlGBGR0CiraJiqX2MwIiISRbsbWjDqIhMRkWhqmElZXWQiIhJVDWMwGuQXEZGoUheZiIj4oqyyFjPIDMFaMKAEIyISGmUVNWSkJpEQgrVgQAlGRCQ0ykI0DxkowYiIhEZZZU1oriADJRgRkdAoq6gNzU2W4GOCMbP7zWyrmS1tdKyrmc00s9Xec5cWzq0zs0XeY5pfMYqIhEmYpuoHf1swDwJnNzn2E+AN59xA4A1vvzkVzrlh3uN8H2MUEQkNdZF5nHNzgR1NDl8APORtPwR8xa/PFxGJN2UV4ZmqH9p+DKa7c24TgPfcrYVyaWZWZGbzzeyAScjMpnhli0pLS6Mdr4hITKipq2dvdV38tWDMrL+ZpXrbp5jZd82ss49x9XbOFQL/DdxuZv1bKuicm+qcK3TOFebm5voYkohIcMorIzMpZ8XhIP8zQJ2ZDQDuA/oCjx3C520xszwA73lrc4WccyXe81pgNnD8IXyWiEjcKAvZNDHQ+gRT75yrBSYCtzvnfgDkHcLnTQMme9uTgReaFjCzLo1aSznAScDyQ/gsEZG4sTtki41B6xNMjZlNIpIUXvKOHbCWZvY4MA8YbGbFZnYFcAtwhpmtBs7w9jGzQjP7h3fqEKDIzBYDs4BbnHNKMCLSrjVM1Z+VHp4E09rOvMuAa4DfOuc+NbO+wKMHOsE5N6mFl05vpmwRcKW3/S5wbCvjiooF63aQYEZKYgIpSQkkJxpJCQmYQUKCYYBZQ6ze8+ex738fMyPBwDASEiA1KZHUpARSkxIwC8fcQSISm/avZhmiFkyrEozXgvguRLqwgEzn3C1+BtaWvnnfe1TW1Pv6GSmJkUSTmpywP/GkJCWQmvx5EkpLTiQ9JZEOyYl08J7TUxLpkJJEekpkOzMtiY4pSWSkJZGRmkR6SuQ5LVlJTCSefT5Vf3gG+VsVqZnNBs73yi8CSs1sjnPueh9jazMPXDqS6rp6qm
u9R10ddfWR1olzUO8cDmj49d3we9zYvwEOHA3loc45qmvrqaqto6qmnsrauv3vX+U9qmvrqKyJlCmvrKW0vIrKmjr2VddRUV1HRU0dtfWumYj/U2KC0TElkcy0ZDLTkuiUlkynDkl07ZhC146pZHdMiWxnpJDdMYXsjMixtOTEaP9ziogP9neRhWiQv7WpMMs5V2ZmVwIPOOduNrOP/AysLY3pnx10CC2qrq2norqOvdW17KuuZU9VHXuratlTVcte79H4WFllDeWVtZRX1rBxVyVLNu5mx95qauqaT1SZqUnkZqaSk5FKbmYq3TqlkpeVRvdOafTolEZ+5w7kZaWRlKhp60SCVFZRQ1KC0SFEfxS2NsEkeZcV/xdwk4/xSBMpXlfa4QzsOefYU1XLjr3VbN9bzfY91ezYW8W2PdWUlldRuqeK0vIqlm8qY9bHleyrrvu38xMMenRKo2eXdHpnp9M3pyP9cjrSN7cjfbI7qhUk0gZ2e3fxh6krvLUJ5lfAq8A7zrkFZtYPWO1fWBJNZuZ1nSVzRHbHA5Z1zlFeVcuW3ZVsLqukZFcFxTsr2Lizgg079zF3VSn/Wli8v3yCwRHZHRnQLYOB3TIY3COTo/M70Tcng8SQLIokEgZllbWh6h6D1g/yPw083Wh/LfBVv4KS4JhZZPwmLZmB3TObLbOnqpZ12/aydtte1mzdw+qt5azesodZK7fuHzNKS07gyB6dOK5nFsOP6EJhn64UdO7QllURiStlFTV0SgvPAD+0fpC/J3AnkZseHfA28D3nXPEBT5S4lJGaxDEFWRxTkPVvx6tr61lTuoflJWUsKylj+abdPLOwmIfnrQcgLyuNwj5dGdm3K2P6daV/bkaomvsiQdodsokuofVdZA8QmRrmIm//Eu/YGX4EJeGUkpTAkLxODMnrxFeHR47V1TtWbi6jaN1OitbvZMGnO3hxcQkAORkpjOqbzUkDchg7MIdeXdMDjF4ktpVV1lDQJVy9AK1NMLnOuQca7T9oZt/3IyCJL4kJxtH5WRydn8XkE/vgnOOzHfuYv3Y7763dwbtrtjN9ySYA+mSnM3ZgLuMG5XJi/2w6poarO0DET2UVtaG6yRJan2C2mdklwOPe/iRguz8hSTwzM47I7sgR2R35+ojeOOdYU7qHt1Zv463V23jmg2Iemb+e5ESj8IiujB+cyymDcxncPVPdadJuOee8tWDC9UdXa6O9HPgr8GciYzDvEpk+RuSwmBkDumUyoFsml53Ul6raOhau28mcVaXMWVXKLS+v5JaXV9KjUxqneMnm5IG5ZKh1I+1IVW091XX1cXsV2WdE7uTfz+siu92PoKT9Sk1K5MQBOZw4IIcbJwxh8+5K5qzayuyPS5n+0SaeWLCB5ERjdL9svjSkO6cP6UbPLhq7kfhWFsKZlKH1LZjmXI8SjPisR1YaXx/Rm6+P6E1NXT0L1+/kzZVbeX35Fm6etoybpy3jmIJOTDg2j3OPzTvofT4iYbQ7hGvBwOElGHWIS5tKTkxgdL9sRvfL5qcThrCmdA+vL9/Cy0s3c9srH3PbKx8r2UhcCuM8ZHB4CaZ1szCK+KR/bgb9x2dw9fj+FO/cx8tLNjN9ySYlG4k7n0/VH66xxwNGa2blNJ9IDAjXBdkS13p2Seeqcf24aly/ZpPNsQVZnHtcJNnofhsJm7jsInPONT9XiEgMay7ZvLRk0/4r0ob2zOK8ofmce1weeVn6O0liX3vsIhOJeY2TzYYd+5i+ZBPTP9rEb6av4DfTVzCyT1fOG5bPl4/No0vHlKDDFWlWw1VkmfHURSYST3p1Teea8f25Znx/Pt22l5cWlzBtcQk/f34pv3pxGacM7sZXTyjg1CO7kZqkJQgkduyuqCHNWw03TJRgpF3qm9OR75w+kOtOG8CKTeU892Exzy8qYebyLWR1SGbi8QVcPLIXR/boFHSoIpRVhG+qflCCkXbOzDgqvxNH5R/F/559JO+s2c6/Fhbz2Huf8eC76xjWqzOTRvbivKH5pKfov4sEo6yyJnQ3WYISjMh+SYkJjB+Uy/hBuezYW82zHxTzxIIN/O8zS/jNSyuYeEIB/z2qt1o10ubCOFU/gK8LrZvZ/Wa21cyWNjrW1cxmmtlq77lLC+dO9sqsNrPJfsYp0lTXjilcObYfM38wjqevGcOXjurOEws2cPbtb3HhXe8w/aNN1NXrVjBpG2WVNaHsIvM1wQAPAmc3OfYT4A3n3EDgDW//35hZV+BmYBQwEri5pUQk4iczY0Sfrvz568N478bT+dm5Q9ixt5prH/uA0/80m8fe+4zKmrqgw5Q4F5mqP3wdTr4mGOfcXGBHk8MXAA952w8BX2nm1LOAmc65Hc65ncBM/jNRibSpLl6r5o0fnsLd3ziBTh2S+elzSxh72yymzl3DvuraoEOUOKUustbr7pzbBOA9d2umTAGwodF+sXdMJHCJCcY5x+bxwrUn8diVoxjcPZPfzVjJybfO4u7Za9hTpUQj0VNf7yhXF1lUNTeRZrMd3mY2xcyKzKyotLTU57BEPmdmnDggh0evHMUz3zqRYwuyuPWVlZx865vcNfsTKqrVdSaHb291LfUufFP1QzAJZouZ5QF4z1ubKVMM9Gq03xMoae7NnHNTnXOFzrnC3NzcqAcr0hrDj+jCQ5eP5PlrT+L4Xp257ZWPGfeHWTwyfz01dfVBhych9vk8ZBqDaY1pQMNVYZOBF5op8ypwppl18Qb3z/SOicS0Yb0688BlI3n6mjH0yU7n588v5fQ/zWH6R5twTledyRfXMJOyusiaMLPHgXnAYDMrNrMrgFuAM8xsNXCGt4+ZFZrZPwCcczuAXwMLvMevvGMioTCiT1eeunoMD1w6gvSURK597AMufWABn23fF3RoEjINE12GsYvM1zaXc25SCy+d3kzZIuDKRvv3A/f7FJqI78yMU4/sxrhBuTw8bx1/em0VZ/x5Dt89fSBXje1HSlKsDoFKLAnrVP0Qu4P8InEjMcG47KS+vH79eE47sht/ePVjzrvzbVZtKQ86NAmBhpmU1UUmIi3qkZXG3ZcM577JhWzfW8V5d77No/PXa2xGDqissmE1SyUYETmI04d0Z8b3xjKyb1d+9vxSvvXoB+zaVx10WBKjGrrIMnQnv4i0RrfMNB66bCQ/nXAkr6/YwoQ73uLDz3YGHZbEoLKKGjLTkkhMaO72wNimBCMSkIQEY8q4/jzzrRMxM/7r3nk8Mm+duszk34R1qn5QghEJ3NBenZn+3ZM5aUAOP39hGT94cpHmNZP9ykI6DxkowYjEhM7pKdw/eQTXnzGIFxaXMPFv77Jhh+6ZkfDOpAxKMCIxIyHB+O7pA3nospFs2l3BxLveZfGGXUGHJQHbua86lJcogxKMSMwZNyiXZ799ImnJCXx96jxeW7Y56JAkQJt3V5LfuUPQYRwSJRiRGDSgWybPffskBvfoxNWPLuS+tz8NOiQJwO6KGsqrasnvnBZ0KIdECUYkRuVmpvLEVaM586ju/Pql5fx2+nLqtUxzu1KyqwKAgs7pAUdyaJRgR
GJYh5RE7vrGcCaPOYK/v/UpP3x6sab/b0caEkxYWzDhvDRBpB1JTDB+ef7R5Gam8sfXVrFjbzV3X3IC6Sn67xvvNu5vwWgMRkR8YmZcd9pAbrnwWN5aXcqkv7/Hzr2aXibebdxVQUpiAjkZqUGHckiUYERC5OKRvbnnkuGs2FTGRffOY9PuiqBDEh9t3FlBXuc0EkI4TQwowYiEzplH9+Dhy0eyeXclX7t7HmtL9wQdkvikZFdFaLvHQAlGJJRG98vmiSmjqayp46J75rF04+6gQxIflOwK7z0woAQjElrHFGTx1DVjSEtOZNLU+bz/qVYVjyfVtfVsKVeCEZGA9M/N4OlrxpDbKZX/uf895qwqDTokiZItZZU4Bz2VYEQkKPmdO/DU1WPom5PBVQ8V8cpSTS0TDzbuvwdGCUZEApSTEbnr/5iCTlz72Ac892Fx0CHJYdq407sHposSjIgELCs9mUeuGMWovl25/qnFPPTuuqBDksPQcBd/XlY47+IHJRiRuNIxNYn7Lx3Bl4Z05+Zpy/j9yys0f1lIbdxVQU5GCmnJiUGHcsgCSTBm9j0zW2pmy8zs+828foqZ7TazRd7jF0HEKRJGacmJ3HPJcC4Z3Zt756zl+08uoqq2Luiw5AvaGPJ7YCCAucjM7BjgKmAkUA28YmbTnXOrmxR9yzn35baOTyQeJCYYv77gGAo6p3PrKyvZWl7Jvd8sDO3CVe1Rya4KBnXPDDqMwxJEC2YIMN85t885VwvMASYGEIdIXDMzvnVKf+64eBgL1+/kwrveYd22vUGHJa3gnIuLFkwQCWYpMM7Mss0sHZgA9Gqm3BgzW2xmL5vZ0S29mZlNMbMiMysqLdU9ACJNXTCsgEevGMWOvdVc8Ld3ePeTbUGHJAexc18NlTX1ob5EGQJIMM65FcCtwEzgFWAxUNuk2AfAEc65ocCdwPMHeL+pzrlC51xhbm6uT1GLhNuoftm8cO3JdMtM5X/uf59H568POiQ5gJI4uAcGAhrkd87d55w7wTk3DtgBrG7yeplzbo+3PQNINrOcAEIViRu9s9N59tsnMnZgDj97fim/eGGpFi+LUcXePTA9Q3wPDAR3FVk377k3cCHweJPXe5iZedsjicS5va3jFIk3mWnJ/GPyCKaM68fD89bzjb+/x9byyqDDkibUgjk8z5jZcuBF4Frn3E4zu8bMrvFe/xqw1MwWA38BLnbO6WJ+kShITDB+OmEId1w8jI827uK8O9/mg892Bh2WNLJxVwUdkhPpkh7uq/4CWXPVOTe2mWP3NNr+K/DXNg1KpJ25YFgBg7pncvUjC7n43vn88vyjmTSyF17ngQSoZFcF+Z3TQv9d6E5+kXZsSF4npl13EqP7Z/PT55bw/ScXsaeq6TU30tYiCSbc3WOgBCPS7nVOT+HBS0dww5mDeHFxCeff+TbLS8qCDqtd27irIvQD/KAEIyJAQoJx3WkDeeyq0eypquUrd73DP99bj4Y+215lTR3b9lSTn6UEIyJxZHS/bGZ8byyj+nblpueWctXDC9m2pyrosNqVTbsjV/WFeZr+BkowIvJvcjJSeeiykfzs3CHMXV3K2bfP5Y0VW4IOq91oWAdGYzAiEpcSEowrx/bjxetOJicjlSseKuLGZ5ewr1oXAPit4R6YsM9DBkowInIAg3tk8sJ1J3H1uH48seAzzv3L2yzasCvosOJa8a4KzKBHiBcaa6AEIyIHlJqUyI0ThvD4VaOprq3nq3e/yx2vr6ZW08z4omRXBd0z00hODP+v5/DXQETaRMMFAOcdl8efX1/FRffOY/WW8qDDijsluyriYoAflGBE5AvI6pDM7Rcfz18mHc+n2/Yy4S9v8cdXP6ayRitmRsvGOLnJEpRgROQQnD80nzeuH895Q/P566xPOOv2uby1WusxHa6de6vZsGMf/XI6Bh1KVCjBiMghyc5I5f/91zAeu3IUCWZ88773ufKhBazcrFkADtWcVaXUOzhlcHysbaUEIyKH5cQBObz8vbH86KzBvPfpDs654y2uf3IRG3bsCzq00Hlz5VayO6YwtGfnoEOJCiUYETlsacmJXHvqAN768alMGdeP6Us2cdqfZvObl5azu6Im6PBCobaunjmrShk/OJeEhHDPotxACUZEoqZzego3njOE2T86hYnHF3DfO59y6h9n88/31uuy5oP4cMMudlfUcNqR3YIOJWqUYEQk6vKyOnDb14by4nUnMyA3g5ueW8qX73ybOatKNYFmC95cuZXEBGPswPgYfwElGBHx0TEFWTx59Wju+sYJ7KmqZfL973Px1PksXK8VNJuatXIrI/p0IatDuFexbEwJRkR8ZWZMODaPN344nv87/2jWlO7lq3e/qyvOGtm4q4KVm8vjqnsMlGBEpI2kJiUy+cQ+zP3xKfzorMG8711xdsPTi/dP8NhezVq5FUAJRkTkcKSnJHlXnJ3GlLH9mLa4hFP/OJvfv7yC3fva5xVns1ZupVfXDvTPzQg6lKhSghGRQGSlJ3PjhCG8+cPxnHtsHlPnrmXsbW9y1+xP2tWyAJU1dbyzZhunDe6GWXxcntxACUZEAtWzSzr/7+vDeOk7J1PYpyu3vfIx426bzUPvrqOqNv7nOJu3djuVNfWcGmfdY6AEIyIx4uj8LO6/dAT/umYM/XI7cvO0ZYy9dRa3vLySNaV7gg7PN7NWbqVDciKj+2UHHUrUBZJgzOx7ZrbUzJaZ2febed3M7C9m9omZfWRmJwQRp4i0vcI+XXlyymgevnwkxxZk8fe31nL6n+Zw4V3v8MT7n1FRHT+tGuccb67cykkDsklLTgw6nKhLausPNLNjgKuAkUA18IqZTXfOrW5U7BxgoPcYBdztPYtIO2BmjBuUy7hBuWwtr+T5DzfydFExP3l2Cbe8spKLR/Tmf8YcEfpp7Reu30nxzgq+c9qAoEPxRRAtmCHAfOfcPudcLTAHmNikzAXAwy5iPtDZzPLaOlARCV63zDSmjOvPaz8Yx5NTRjO6bzZT565h7G2z+PY/FzJnVSl19eGcHeDR+evJTE3ivKH5QYfiizZvwQBLgd+aWTZQAUwAipqUKQA2NNov9o5tavpmZjYFmALQu3dvP+IVkRhgZozql82oftkU79zHI/PW82TRBmYs2UxeVhoXnlDARcN70Scka6ls31PFjCWbmTSyF+kpQfwq9l+b18o5t8LMbgVmAnuAxUDTaxKbu1av2T9RnHNTgakAhYWF4fwzRkS+kJ5d0rlxwhCuP3MQry/fytMLN3D37DX8bdYahh/RhQtPKODLx+aTlR670648VVRMdV09l4w+IuhQfBNI2nTO3QfcB2BmvyPSQmmsGOjVaL8nUNI20YlIWKQmJXLucXmce1wem3dX8tyHG3n2g2Juem4p/zdtOacd2Y2vHF/AqUfmkpoUO4Po9fWOx95fz6i+XRnYPTPocHwTSIIxs27Oua1m1hu4EBjTpMg04Doze4LI4P5u59x/dI+JiDTokZXGt07pzzXj+7GspIxnP9jItMUbeWXZZjLTkphwTB4XDMtndL/swNdb
mbO6lA07KvjxWUcGGoffgur4e8Ybg6kBrnXO7TSzawCcc/cAM4iMzXwC7AMuCyhOEQkZM+OYgiyOKcjipxOO5N0123l+0UZe+qiEJ4s2kJ+VxleH9+Rrw3tyRHYw4zX/nL+enIxUzjq6RyCf31YsntZmKCwsdEVFTa8XEBGJTMkyc/kW/rWwmLdWl1LvYGTfrpx3XB6nDelOQRtd8ly8cx9jb5vFtacM4IazBrfJZx6ImS10zhX68d7xeemCiEgTacmJnDc0n/OG5rN5dyXPfFDMMwuL+fkLy/j5C8s4skcmpw/pxtlH53FMQSff5gV7/P3PMGDSqPi/6lUtGBFpt5xzrN22lzdWbOGNFVspWr+TunpHv5yOnD8sn/OH5tMvijMcV9bUcfKtbzKsVxf+MdmXRsMXphaMiIgPzIz+uRn0z81gyrj+7NpXzavLNvP8hyXc8f2rmdsAAAmySURBVMZqbn99NUfnd+KMo7rzpSHdOTr/0Fs29fWOG55ezLY91Vxxct8o1yQ2qQUjItKMzbsreXFxCa8u28zCz3biHBR07sC4Qbkc37szx/fqTP/cjFZfkfb7l1dw75y13HjOkVw9vr/P0beeny0YJRgRkYPYtqeKN1duZebyLcxfu53yysi94ZmpSQzr3ZmTBuRw8oAcjsrr1GzCeWTeOn7+wjK+OfoIfnXB0TG17osSTCspwYiI3+rrI+M2izbsYtGGnSz4dCcfbykHoEt6MmP6ZzO0Z2eO7Rm5VPr9tTuY8kgRpw7uxr3fHE5SYmytkqIxGBGRGJGQYAzolsGAbhl8bXhPALaWVfLOmm28vXo789duZ8aSzfvLJyYYR+dnced/Hx9zycVvSjAiIoepW6c0Jh7fk4nHRxLOjr3VLNm4myXFu9i2p5pvn9o/bie0PJD2V2MREZ917ZjC+EG5jB+UG3QogWpf7TUREWkzSjAiIuILJRgREfGFEoyIiPhCCUZERHyhBCMiIr5QghEREV8owYiIiC/iai4yMysF1jc5nAXsPsixxvsH284Bth1GmM3F09oyX7QuTfcbtuOpLo23D6c+h1OXll7Tz9nnx/TdtC7Wg5Xx47sZ7JzLPHjYh8A5F9cPYOrBjjXeP9g2UBTteFpb5ovW5QB1iJu6RKs+h1MX/Zwd+OdM3038fjcHe7SHLrIXW3HsxS+4He14Wlvmi9al6f6LLZQ5VLFQl9bGcTCHU5eWXtPPWXTouznw8SC/mwOKqy6ytmBmRc6nqa3bWjzVBeKrPvFUF4iv+sRTXcDf+rSHFky0TQ06gCiKp7pAfNUnnuoC8VWfeKoL+FgftWBERMQXasGIiIgvlGBERMQX7TrBmNn9ZrbVzJYewrnDzWyJmX1iZn8xM2v02nfM7GMzW2Zmt0U36hbjiXpdzOyXZrbRzBZ5jwnRj7zFmHz5brzXbzAzZ2Y50Yv4gPH48d382sw+8r6X18wsP/qRNxuPH3X5g5mt9OrznJl1jn7kLcbkR30u8v7v15uZ7xcDHE4dWni/yWa22ntMbnT8gP+vmuXX9c9heADjgBOApYdw7vvAGMCAl4FzvOOnAq8Dqd5+txDX5ZfADfHy3Xiv9QJeJXJDbk5Y6wJ0alTmu8A9Ia7LmUCSt30rcGuYf86AIcBgYDZQGKt18OLr0+RYV2Ct99zF2+5yoPoe6NGuWzDOubnAjsbHzKy/mb1iZgvN7C0zO7LpeWaWR+Q/+DwX+Zd/GPiK9/K3gFucc1XeZ2z1txYRPtUlMD7W58/Aj4E2u7rFj7o458oaFe1IG9XHp7q85pyr9YrOB3r6W4vP+VSfFc65j9sifu/zDqkOLTgLmOmc2+Gc2wnMBM4+1N8T7TrBtGAq8B3n3HDgBuCuZsoUAMWN9ou9YwCDgLFm9p6ZzTGzEb5Ge2CHWxeA67yui/vNrIt/obbKYdXHzM4HNjrnFvsdaCsc9ndjZr81sw3AN4Bf+BjrwUTj56zB5UT+Og5SNOsTlNbUoTkFwIZG+w31OqT6JrXyQ9sFM8sATgSebtS9mNpc0WaONfwFmUSkaTkaGAE8ZWb9vKzfZqJUl7uBX3v7vwb+ROQXQJs73PqYWTpwE5HumEBF6bvBOXcTcJOZ3QhcB9wc5VAPKlp18d7rJqAW+Gc0Y/wiolmfoByoDmZ2GfA979gAYIaZVQOfOucm0nK9Dqm+SjD/LgHY5Zwb1vigmSUCC73daUR+8TZuxvcESrztYuBZL6G8b2b1RCbHK/Uz8GYcdl2cc1sanfd34CU/Az6Iw61Pf6AvsNj7T9cT+MDMRjrnNvsce1PR+Dlr7DFgOgEkGKJUF28w+cvA6W39x1gT0f5ugtBsHQCccw8ADwCY2WzgUufcukZFioFTGu33JDJWU8yh1NfvAahYfwB9aDQ4BrwLXORtGzC0hfMWEGmlNAx4TfCOXwP8ytseRKS5aSGtS16jMj8Angjzd9OkzDraaJDfp+9mYKMy3wH+FeK6nA0sB3Lb8ufL758z2miQ/1DrQMuD/J8S6YXp4m13bU19m40riC80Vh7A48AmoIZIhr6CyF+5rwCLvR/6X7RwbiGwFFgD/JXPZ0VIAR71XvsAOC3EdXkEWAJ8ROSvtry2qItf9WlSZh1tdxWZH9/NM97xj4hMXFgQ4rp8QuQPsUXeo02uiPOxPhO996oCtgCvxmIdaCbBeMcv976TT4DLDlbfAz00VYyIiPhCV5GJiIgvlGBERMQXSjAiIuILJRgREfGFEoyIiPhCCUbimpntaePP+4eZHRWl96qzyGzJS83sxYPNMmxmnc3s29H4bJFo0GXKEtfMbI9zLiOK75fkPp+Y0VeNYzezh4BVzrnfHqB8H+Al59wxbRGfyMGoBSPtjpnlmtkzZrbAe5zkHR9pZu+a2Yfe82Dv+KVm9rSZvQi8ZmanmNlsM/uXRdYx+WfD2hje8UJve483IeViM5tvZt294/29/QVm9qtWtrLm8fmknRlm9oaZfWCR9Tku8MrcAvT3Wj1/8Mr+yPucj8zs/6L4zyhyUEow0h7dAfzZOTcC+CrwD+/4SmCcc+54IrMT/67ROWOAyc6507z944HvA0cB/YCTmvmcjsB859xQYC5wVaPPv8P7/IPO5+TNg3U6kdkUACqBic65E4isP/QnL8H9BFjjnBvmnPuRmZ0JDARGAsOA4WY27mCfJxItmuxS2qMvAUc1mmm2k5llAlnAQ2Y2kMhMscmNzpnpnGu85sb7zrliADNbRGQuqLebfE41n08QuhA4w9sew+draTwG/LGFODs0eu+FRNbmgMhcUL/zkkU9kZZN92bOP9N7fOjtZxBJOHNb+DyRqFKCkfYoARjjnKtofNDM7gRmOecmeuMZsxu9vLfJe1Q12q6j+f9LNe7zQc6WyhxIhXNumJllEUlU1wJ/IbL+Sy4w3DlXY2brgLRmzjfg9865e7/g54pEhbrIpD16jcj6KQCYWcO05lnARm/7Uh8/fz6RrjmAiw9W2Dm3m8iyyDeYWTKROLd6yeVU4AivaDmQ2ejUV4HLvfVBMLMCM+sWpTqIHJQSjMS7dDMrbvS
4nsgv60Jv4Hs5kSUWAG4Dfm9m7wCJPsb0feB6M3sfyAN2H+wE59yHRGbGvZjIglyFZlZEpDWz0iuzHXjHu6z5D86514h0wc0zsyXAv/j3BCTiK12mLNLGvNU1K5xzzswuBiY55y442HkiYaMxGJG2Nxz4q3fl1y4CWoZaxG9qwYiIiC80BiMiIr5QghEREV8owYiIiC+UYERExBdKMCIi4ov/D1Ddt12IR23bAAAAAElFTkSuQmCC\n", 828 | "text/plain": [ 829 | "
" 830 | ] 831 | }, 832 | "metadata": { 833 | "needs_background": "light" 834 | }, 835 | "output_type": "display_data" 836 | } 837 | ], 838 | "source": [ 839 | "learn.recorder.plot()" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 36, 845 | "metadata": {}, 846 | "outputs": [ 847 | { 848 | "data": { 849 | "text/html": [ 850 | "\n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | "
epoch  train_loss  valid_loss  accuracy  time
    0    5.074192    5.015274  0.260868  1:25:33
    1    4.412691    4.317023  0.317931  1:25:35
    2    4.188562    4.081395  0.333240  1:25:59
    3    4.096096    4.047993  0.331070  1:26:10
    4    4.091798    4.032840  0.328847  1:26:03
    5    4.044764    4.007892  0.329269  1:25:37
    6    4.046127    3.945281  0.335464  1:25:58
    7    3.919419    3.886997  0.340124  1:26:16
    8    3.838495    3.820855  0.347243  1:26:22
    9    3.836525    3.766215  0.352727  1:26:20
   10    3.864718    3.716155  0.358196  1:26:24
   11    3.713393    3.625276  0.368409  1:26:22
   12    3.629655    3.567900  0.375502  1:26:36
   13    3.661837    3.490401  0.384952  1:26:39
   14    3.486028    3.425163  0.393710  1:26:38
   15    3.467291    3.363544  0.402195  1:26:46
   16    3.380324    3.318638  0.408734  1:26:33
   17    3.364862    3.282482  0.414005  1:26:45
   18    3.335473    3.266114  0.416475  1:26:48
   19    3.225191    3.261883  0.416972  1:26:50
" 1003 | ], 1004 | "text/plain": [ 1005 | "" 1006 | ] 1007 | }, 1008 | "metadata": {}, 1009 | "output_type": "display_data" 1010 | }, 1011 | { 1012 | "name": "stdout", 1013 | "output_type": "stream", 1014 | "text": [ 1015 | "Better model found at epoch 0 with accuracy value: 0.2608684301376343.\n", 1016 | "Better model found at epoch 1 with accuracy value: 0.31793108582496643.\n", 1017 | "Better model found at epoch 2 with accuracy value: 0.3332400619983673.\n", 1018 | "Better model found at epoch 6 with accuracy value: 0.3354644179344177.\n", 1019 | "Better model found at epoch 7 with accuracy value: 0.34012407064437866.\n", 1020 | "Better model found at epoch 8 with accuracy value: 0.34724289178848267.\n", 1021 | "Better model found at epoch 9 with accuracy value: 0.35272735357284546.\n", 1022 | "Better model found at epoch 10 with accuracy value: 0.35819557309150696.\n", 1023 | "Better model found at epoch 11 with accuracy value: 0.36840856075286865.\n", 1024 | "Better model found at epoch 12 with accuracy value: 0.37550199031829834.\n", 1025 | "Better model found at epoch 13 with accuracy value: 0.3849523663520813.\n", 1026 | "Better model found at epoch 14 with accuracy value: 0.3937096893787384.\n", 1027 | "Better model found at epoch 15 with accuracy value: 0.40219518542289734.\n", 1028 | "Better model found at epoch 16 with accuracy value: 0.4087342619895935.\n", 1029 | "Better model found at epoch 17 with accuracy value: 0.4140048027038574.\n", 1030 | "Better model found at epoch 18 with accuracy value: 0.4164750874042511.\n", 1031 | "Better model found at epoch 19 with accuracy value: 0.41697242856025696.\n" 1032 | ] 1033 | } 1034 | ], 1035 | "source": [ 1036 | "learn.fit_one_cycle(20, 1e-3, moms=(0.8,0.7), callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='model')])" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "code", 1041 | "execution_count": 37, 1042 | "metadata": {}, 1043 | "outputs": [], 1044 | "source": [ 1045 | "TEXT = \"जिसके लिये उन्हें \"\n", 1046 | "N_WORDS = 40\n", 1047 | "N_SENTENCES = 2" 1048 | ] 1049 | }, 1050 | { 1051 | "cell_type": "code", 1052 | "execution_count": 38, 1053 | "metadata": {}, 1054 | "outputs": [ 1055 | { 1056 | "name": "stdout", 1057 | "output_type": "stream", 1058 | "text": [ 1059 | "जिसके लिये उन्हें ▁बर्मी ▁अथवा ▁चो शि यु ▁भी ▁कहा ▁जाता ▁है । ▁x x bo s ▁साल ▁2007 ▁से ▁अब ▁तक ▁48 ▁मैचों ▁की ▁58 ▁सीरीज ▁हुई ▁हैं । ▁उसमें ▁श्रीलंका ▁ने ▁5 ▁सीरीज ▁जीते ▁हैं । ▁साल ▁1979 ▁से ▁अब ▁तक\n", 1060 | "जिसके लिये उन्हें ▁वर्ष ▁2010 ▁में ▁मरणोपरांत ▁साहित्य ▁अकादमी ▁पुरस्कार ▁से ▁सम्मानित ▁किया ▁गया । ▁x x bo s ▁ डो ग , खेत ▁खर सी या ▁मण्डल ▁में ▁भारत ▁के ▁छत्तीसगढ़ ▁राज्य ▁के ▁अन्तर्गत ▁रायगढ़ ▁जिले ▁का ▁एक ▁गाँव ▁है । ▁x\n" 1061 | ] 1062 | } 1063 | ], 1064 | "source": [ 1065 | "print(\"\\n\".join(learn.predict(TEXT, N_WORDS, temperature=0.9) for _ in range(N_SENTENCES)))" 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "code", 1070 | "execution_count": 39, 1071 | "metadata": {}, 1072 | "outputs": [ 1073 | { 1074 | "data": { 1075 | "text/plain": [ 1076 | "26.09863463173677" 1077 | ] 1078 | }, 1079 | "execution_count": 39, 1080 | "metadata": {}, 1081 | "output_type": "execute_result" 1082 | } 1083 | ], 1084 | "source": [ 1085 | "np.exp(3.261883)" 1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "execution_count": 40, 1091 | "metadata": {}, 1092 | "outputs": [], 1093 | "source": [ 1094 | "defaults.device = torch.device('cpu')\n", 1095 | "learn.model.eval()\n", 1096 | "learn.export()" 1097 | ] 1098 | 
}, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": 41, 1102 | "metadata": {}, 1103 | "outputs": [ 1104 | { 1105 | "data": { 1106 | "text/plain": [ 1107 | "PosixPath('/home/gaurav/PycharmProjects/nlp-for-hindi/language-model')" 1108 | ] 1109 | }, 1110 | "execution_count": 41, 1111 | "metadata": {}, 1112 | "output_type": "execute_result" 1113 | } 1114 | ], 1115 | "source": [ 1116 | "path" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": 42, 1122 | "metadata": {}, 1123 | "outputs": [], 1124 | "source": [ 1125 | "# learn = load_learner(path / 'HindiDataset')" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "code", 1130 | "execution_count": 43, 1131 | "metadata": {}, 1132 | "outputs": [], 1133 | "source": [ 1134 | "encoder = get_model(learn.model)[0]" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": 44, 1140 | "metadata": {}, 1141 | "outputs": [ 1142 | { 1143 | "data": { 1144 | "text/plain": [ 1145 | "torch.Size([30000, 410])" 1146 | ] 1147 | }, 1148 | "execution_count": 44, 1149 | "metadata": {}, 1150 | "output_type": "execute_result" 1151 | } 1152 | ], 1153 | "source": [ 1154 | "encoder.state_dict()['encoder.weight'].shape" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": 45, 1160 | "metadata": {}, 1161 | "outputs": [], 1162 | "source": [ 1163 | "embeddings = encoder.state_dict()['encoder.weight']" 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "code", 1168 | "execution_count": 46, 1169 | "metadata": {}, 1170 | "outputs": [], 1171 | "source": [ 1172 | "embeddings = np.array(embeddings)" 1173 | ] 1174 | }, 1175 | { 1176 | "cell_type": "code", 1177 | "execution_count": 47, 1178 | "metadata": {}, 1179 | "outputs": [ 1180 | { 1181 | "data": { 1182 | "text/plain": [ 1183 | "(410,)" 1184 | ] 1185 | }, 1186 | "execution_count": 47, 1187 | "metadata": {}, 1188 | "output_type": "execute_result" 1189 | } 1190 | ], 1191 | "source": [ 1192 | "embeddings[0].shape" 1193 | ] 1194 | }, 1195 | { 1196 | "cell_type": "code", 1197 | "execution_count": 48, 1198 | "metadata": {}, 1199 | "outputs": [], 1200 | "source": [ 1201 | "df = pd.DataFrame(embeddings)" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": 49, 1207 | "metadata": {}, 1208 | "outputs": [ 1209 | { 1210 | "data": { 1211 | "text/plain": [ 1212 | "(30000, 410)" 1213 | ] 1214 | }, 1215 | "execution_count": 49, 1216 | "metadata": {}, 1217 | "output_type": "execute_result" 1218 | } 1219 | ], 1220 | "source": [ 1221 | "df.shape" 1222 | ] 1223 | }, 1224 | { 1225 | "cell_type": "code", 1226 | "execution_count": 50, 1227 | "metadata": {}, 1228 | "outputs": [], 1229 | "source": [ 1230 | "df.to_csv('transformer3_embeddings.tsv', sep='\\t', index=False, header=False)" 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "code", 1235 | "execution_count": 51, 1236 | "metadata": {}, 1237 | "outputs": [ 1238 | { 1239 | "data": { 1240 | "text/html": [ 1241 | "
\n", 1242 | "\n", 1255 | "\n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | "
0123456789...400401402403404405406407408409
0-0.099972-0.164632-0.2615030.113243-0.0101160.008201-0.083398-0.439391-0.210696-0.042600...-0.4376570.091170-0.1013800.0959930.254158-0.1990730.176711-0.015598-0.0832390.054953
10.0506560.0132420.0234800.0052760.0565090.063262-0.037668-0.175472-0.001132-0.134797...-0.255351-0.030477-0.0370930.019064-0.031267-0.0722390.029152-0.178726-0.1673600.058221
20.0520350.0129450.0220100.0048500.0585040.062911-0.037709-0.173654-0.002538-0.134034...-0.255153-0.030363-0.0387120.020665-0.031761-0.0711560.030784-0.180084-0.1672330.058573
3-0.2342480.0608200.2076380.0969090.204999-0.002955-0.095316-0.5060150.017120-0.110036...-0.2162900.067957-0.0349870.0402470.227386-0.3853440.099810-0.569107-0.1307400.122291
40.3176530.194561-0.2754920.1764360.224074-0.2467350.113164-0.2121350.026025-0.478695...-0.653744-0.218545-0.1874020.2199130.126067-0.0251230.033977-0.4294420.0109870.399669
\n", 1405 | "

5 rows × 410 columns

\n", 1406 | "
" 1407 | ], 1408 | "text/plain": [ 1409 | " 0 1 2 3 4 5 6 \\\n", 1410 | "0 -0.099972 -0.164632 -0.261503 0.113243 -0.010116 0.008201 -0.083398 \n", 1411 | "1 0.050656 0.013242 0.023480 0.005276 0.056509 0.063262 -0.037668 \n", 1412 | "2 0.052035 0.012945 0.022010 0.004850 0.058504 0.062911 -0.037709 \n", 1413 | "3 -0.234248 0.060820 0.207638 0.096909 0.204999 -0.002955 -0.095316 \n", 1414 | "4 0.317653 0.194561 -0.275492 0.176436 0.224074 -0.246735 0.113164 \n", 1415 | "\n", 1416 | " 7 8 9 ... 400 401 402 403 \\\n", 1417 | "0 -0.439391 -0.210696 -0.042600 ... -0.437657 0.091170 -0.101380 0.095993 \n", 1418 | "1 -0.175472 -0.001132 -0.134797 ... -0.255351 -0.030477 -0.037093 0.019064 \n", 1419 | "2 -0.173654 -0.002538 -0.134034 ... -0.255153 -0.030363 -0.038712 0.020665 \n", 1420 | "3 -0.506015 0.017120 -0.110036 ... -0.216290 0.067957 -0.034987 0.040247 \n", 1421 | "4 -0.212135 0.026025 -0.478695 ... -0.653744 -0.218545 -0.187402 0.219913 \n", 1422 | "\n", 1423 | " 404 405 406 407 408 409 \n", 1424 | "0 0.254158 -0.199073 0.176711 -0.015598 -0.083239 0.054953 \n", 1425 | "1 -0.031267 -0.072239 0.029152 -0.178726 -0.167360 0.058221 \n", 1426 | "2 -0.031761 -0.071156 0.030784 -0.180084 -0.167233 0.058573 \n", 1427 | "3 0.227386 -0.385344 0.099810 -0.569107 -0.130740 0.122291 \n", 1428 | "4 0.126067 -0.025123 0.033977 -0.429442 0.010987 0.399669 \n", 1429 | "\n", 1430 | "[5 rows x 410 columns]" 1431 | ] 1432 | }, 1433 | "execution_count": 51, 1434 | "metadata": {}, 1435 | "output_type": "execute_result" 1436 | } 1437 | ], 1438 | "source": [ 1439 | "df.head()" 1440 | ] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "execution_count": 52, 1445 | "metadata": {}, 1446 | "outputs": [ 1447 | { 1448 | "data": { 1449 | "text/plain": [ 1450 | "(30000, 410)" 1451 | ] 1452 | }, 1453 | "execution_count": 52, 1454 | "metadata": {}, 1455 | "output_type": "execute_result" 1456 | } 1457 | ], 1458 | "source": [ 1459 | "df.shape" 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "code", 1464 | "execution_count": 53, 1465 | "metadata": {}, 1466 | "outputs": [ 1467 | { 1468 | "data": { 1469 | "text/plain": [ 1470 | "30000" 1471 | ] 1472 | }, 1473 | "execution_count": 53, 1474 | "metadata": {}, 1475 | "output_type": "execute_result" 1476 | } 1477 | ], 1478 | "source": [ 1479 | "len(itos)" 1480 | ] 1481 | }, 1482 | { 1483 | "cell_type": "code", 1484 | "execution_count": 54, 1485 | "metadata": {}, 1486 | "outputs": [], 1487 | "source": [ 1488 | "df2 = pd.DataFrame(itos)" 1489 | ] 1490 | }, 1491 | { 1492 | "cell_type": "code", 1493 | "execution_count": 55, 1494 | "metadata": {}, 1495 | "outputs": [ 1496 | { 1497 | "data": { 1498 | "text/html": [ 1499 | "
\n", 1500 | "\n", 1513 | "\n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | " \n", 1542 | "
0
0<unk>
1<s>
2</s>
3▁के
4
\n", 1543 | "
" 1544 | ], 1545 | "text/plain": [ 1546 | " 0\n", 1547 | "0 \n", 1548 | "1 \n", 1549 | "2 \n", 1550 | "3 ▁के\n", 1551 | "4 ।" 1552 | ] 1553 | }, 1554 | "execution_count": 55, 1555 | "metadata": {}, 1556 | "output_type": "execute_result" 1557 | } 1558 | ], 1559 | "source": [ 1560 | "df2.head()" 1561 | ] 1562 | }, 1563 | { 1564 | "cell_type": "code", 1565 | "execution_count": 56, 1566 | "metadata": {}, 1567 | "outputs": [ 1568 | { 1569 | "data": { 1570 | "text/plain": [ 1571 | "(30000, 1)" 1572 | ] 1573 | }, 1574 | "execution_count": 56, 1575 | "metadata": {}, 1576 | "output_type": "execute_result" 1577 | } 1578 | ], 1579 | "source": [ 1580 | "df2.shape" 1581 | ] 1582 | }, 1583 | { 1584 | "cell_type": "code", 1585 | "execution_count": 57, 1586 | "metadata": {}, 1587 | "outputs": [], 1588 | "source": [ 1589 | "df2.to_csv('transformer3_embeddings_metadata.tsv', sep='\\t', index=False, header=False)" 1590 | ] 1591 | }, 1592 | { 1593 | "cell_type": "code", 1594 | "execution_count": 58, 1595 | "metadata": { 1596 | "scrolled": true 1597 | }, 1598 | "outputs": [ 1599 | { 1600 | "data": { 1601 | "text/plain": [ 1602 | "tensor([ 5.0656e-02, 1.3242e-02, 2.3480e-02, 5.2757e-03, 5.6509e-02,\n", 1603 | " 6.3262e-02, -3.7668e-02, -1.7547e-01, -1.1316e-03, -1.3480e-01,\n", 1604 | " -7.2113e-03, -1.8429e-02, 6.4644e-02, 1.2132e-01, -2.8948e-02,\n", 1605 | " -3.2840e-02, 6.2042e-03, -2.8391e-02, 1.0298e-02, 3.1041e-03,\n", 1606 | " 7.5052e-02, 5.8731e-02, 2.2466e-02, -1.5483e-01, 7.0984e-02,\n", 1607 | " -1.7122e-02, 4.4589e-02, 3.4579e-02, 1.9023e-02, 2.5846e-02,\n", 1608 | " 2.4408e-02, -5.1587e-02, 5.7413e-03, -8.4907e-03, -6.2721e-02,\n", 1609 | " 3.4298e-02, -1.0657e-03, -2.1233e-03, 1.7872e-02, -4.6330e-02,\n", 1610 | " -9.7442e-02, -6.7839e-02, 1.4847e-01, -1.1888e-01, -8.5665e-02,\n", 1611 | " -2.7119e-02, 4.4531e-03, -2.2153e-03, -4.7003e-02, 2.1623e-01,\n", 1612 | " 5.0883e-03, 9.1632e-04, 7.6505e-02, -1.0731e-01, -1.2035e-01,\n", 1613 | " -6.8442e-02, -1.4023e-02, 8.4317e-02, 2.1350e-03, 7.3595e-03,\n", 1614 | " 1.1860e-04, 1.5694e-02, 3.8656e-03, 8.9918e-03, -4.1664e-02,\n", 1615 | " 4.9456e-02, 3.7580e-02, -1.8443e-02, 3.6612e-02, 1.1217e-01,\n", 1616 | " 2.2807e-01, -6.8999e-02, 5.0330e-02, -5.7284e-02, -8.3027e-02,\n", 1617 | " 7.8370e-03, -1.0025e-02, -6.3295e-02, 2.8806e-02, -5.1965e-02,\n", 1618 | " 6.3856e-02, -4.6325e-02, 5.7598e-02, -9.5437e-03, -2.0365e-02,\n", 1619 | " 6.6406e-02, 1.1256e-02, 2.1162e-02, -6.2357e-02, 3.7167e-02,\n", 1620 | " -8.4535e-02, 2.4254e-02, -3.9129e-02, -6.4247e-02, 3.1853e-02,\n", 1621 | " -2.7636e-02, -1.0975e-02, 2.4096e-02, 2.0454e-02, -2.4301e-02,\n", 1622 | " 1.2506e-02, 6.3875e-02, 7.1419e-02, 3.4079e-02, 6.8084e-02,\n", 1623 | " -2.8068e-02, -6.3978e-02, -7.1514e-03, -3.2478e-02, 2.5371e-02,\n", 1624 | " -6.2349e-02, -2.2309e-02, 3.5836e-05, -1.4245e-03, -1.1740e-02,\n", 1625 | " 1.0618e-02, -1.0656e-02, -5.5328e-02, -6.2446e-02, -1.2068e-01,\n", 1626 | " -1.8177e-01, -1.1060e-01, 8.0311e-02, 3.0907e-02, -1.6772e-03,\n", 1627 | " -2.0087e-02, -4.7168e-02, 1.0695e-02, -4.5706e-02, -3.1445e-03,\n", 1628 | " -9.7472e-02, -1.8877e-01, 3.5175e-01, -5.2590e-02, -4.8911e-02,\n", 1629 | " -1.5161e-01, 1.7336e-01, -4.6395e-03, 1.8659e-02, -1.1064e-01,\n", 1630 | " -2.4139e-01, 1.0921e-02, 1.6398e-02, -5.6421e-02, 3.5258e-02,\n", 1631 | " -2.4469e-02, 1.0934e-01, -2.2726e-02, -5.9306e-03, -1.2413e-05,\n", 1632 | " -1.5771e-02, 8.0107e-02, -1.6609e-02, 2.2906e-01, 3.6846e-02,\n", 1633 | " 1.4692e-02, -3.7585e-02, 3.8085e-01, 3.4497e-02, -1.6139e-01,\n", 1634 | " 
9.3505e-02, -2.6396e-03, -5.0270e-02, -7.3960e-02, 5.1344e-03,\n", 1635 | " -6.7529e-02, -2.0915e-01, -1.2576e-01, -2.1735e-04, -1.9994e-03,\n", 1636 | " -1.9039e-02, -8.7684e-02, 1.9307e-02, 3.2286e-02, -9.2228e-03,\n", 1637 | " -1.1982e-01, 5.4506e-02, 5.1111e-02, -1.0419e-02, 4.4231e-02,\n", 1638 | " 2.8039e-02, 3.4680e-02, 1.2178e-01, 1.5167e-02, -1.1005e-01,\n", 1639 | " -3.1951e-02, -3.8300e-03, 2.2859e-02, 3.8487e-02, 2.5735e-02,\n", 1640 | " 1.0575e-02, -4.7987e-02, -3.0104e-02, -8.5111e-02, 8.6246e-02,\n", 1641 | " 1.6918e-02, 7.5921e-02, 4.4339e-03, -6.6624e-02, 3.0378e-02,\n", 1642 | " 5.2167e-03, 3.2058e-01, 1.3249e-01, 4.6685e-02, -2.8499e-03,\n", 1643 | " 9.4871e-02, -1.1373e-01, 1.5399e-02, -7.2755e-02, 2.9355e-02,\n", 1644 | " -2.8283e-02, -6.7321e-02, 2.0032e-01, 9.9979e-02, 1.4967e-01,\n", 1645 | " -3.7655e-03, 2.1711e-01, -3.7825e-02, 3.4148e-02, -5.2607e-02,\n", 1646 | " -2.1382e-02, -1.4589e-02, 2.7698e-01, -6.0767e-02, -9.0150e-03,\n", 1647 | " 2.4664e-02, 7.9007e-02, -7.2796e-02, 4.5905e-02, -3.9751e-02,\n", 1648 | " 2.1217e-02, 2.3667e-02, 5.8100e-02, 6.7359e-02, -9.6938e-03,\n", 1649 | " -2.8555e-02, 8.4434e-03, -1.4891e-01, 1.0388e-01, 1.7537e-02,\n", 1650 | " 1.2317e-01, 9.0843e-02, 3.2604e-02, -6.9219e-02, 1.7922e-02,\n", 1651 | " -1.6663e-01, 1.3265e-01, -1.1027e-01, 1.0015e-02, -1.3478e-02,\n", 1652 | " -1.1931e-01, -6.4607e-02, 2.0124e-02, 1.5606e-02, 2.2370e-02,\n", 1653 | " 3.2464e-02, 8.9678e-02, -5.6609e-02, 2.0690e-02, -1.1040e-01,\n", 1654 | " -8.5594e-03, 1.4072e-02, -2.0003e-02, -1.2807e-02, 5.2172e-02,\n", 1655 | " -8.8147e-02, 1.0300e-01, -5.0730e-02, -6.8871e-02, 2.1444e-02,\n", 1656 | " 4.0150e-04, 2.3980e-02, -2.2633e-02, 1.6460e-02, 2.5711e-02,\n", 1657 | " 3.0610e-02, 3.7705e-02, 6.9931e-02, -4.3815e-02, 2.4061e-02,\n", 1658 | " 1.6624e-01, -1.3245e-02, 1.0356e-01, 2.4516e-02, -3.3962e-02,\n", 1659 | " 7.3025e-03, 3.5299e-02, -6.5993e-02, -2.7829e-02, 1.6398e-02,\n", 1660 | " 2.9198e-01, -5.5268e-02, 5.4428e-02, -1.4158e-01, -1.7320e-01,\n", 1661 | " 1.4138e-02, 4.5654e-02, 1.3337e-02, -5.1941e-02, -9.8113e-03,\n", 1662 | " -1.9837e-01, -1.9948e-02, 3.9828e-02, -3.2904e-02, -1.9715e-01,\n", 1663 | " 1.8128e-02, 1.1107e-02, 4.5440e-02, -8.2876e-02, -1.9173e-02,\n", 1664 | " 4.6941e-02, 9.7118e-02, -2.2928e-01, 3.6161e-02, 3.4942e-02,\n", 1665 | " -9.1640e-03, -2.4777e-02, -6.5125e-03, 1.9604e-02, -1.3599e-01,\n", 1666 | " -1.1477e-01, -1.1444e-01, 1.7434e-02, 8.6008e-02, 3.7520e-02,\n", 1667 | " -8.4861e-02, -1.2059e-01, 9.0891e-02, 1.0568e-01, 1.0005e-01,\n", 1668 | " 1.7307e-01, 5.1416e-02, -2.6329e-02, 2.1981e-02, -1.6127e-01,\n", 1669 | " -5.0700e-02, -4.2929e-03, -2.2049e-02, 1.5071e-01, 2.9565e-02,\n", 1670 | " 2.0342e-02, 7.9813e-03, -6.0186e-03, -1.3067e-01, -6.2830e-02,\n", 1671 | " 3.2539e-03, -1.3242e-01, 2.6112e-01, -2.0987e-01, -6.1793e-02,\n", 1672 | " 3.9484e-05, 1.4935e-01, -1.9232e-02, 2.4302e-02, -1.7242e-02,\n", 1673 | " 2.6441e-03, -1.1132e-02, -2.3149e-01, 8.5944e-03, 4.8183e-02,\n", 1674 | " -2.9147e-03, -9.7672e-02, 1.8745e-02, 1.3182e-01, 1.3713e-02,\n", 1675 | " 1.0864e-01, -1.1162e-01, 3.1873e-02, -8.0539e-02, -8.0032e-02,\n", 1676 | " -1.2489e-01, 3.6867e-03, -9.5043e-03, 1.6820e-02, -2.1751e-02,\n", 1677 | " 3.4390e-02, 2.7523e-02, -1.7693e-02, -7.1201e-03, -2.1822e-02,\n", 1678 | " -4.0029e-02, -2.7987e-02, -1.8678e-01, -4.3399e-02, -1.4886e-01,\n", 1679 | " 3.4930e-02, -1.7412e-02, 6.7505e-02, -6.8466e-02, -1.0606e-02,\n", 1680 | " -3.1098e-02, 3.8319e-02, -2.3795e-02, 1.2339e-01, 5.5776e-02,\n", 1681 | " 
8.8078e-02, -2.6969e-01, 2.1350e-02, -2.4553e-03, -9.6651e-02,\n", 1682 | " -2.5535e-01, -3.0477e-02, -3.7093e-02, 1.9064e-02, -3.1267e-02,\n", 1683 | " -7.2239e-02, 2.9152e-02, -1.7873e-01, -1.6736e-01, 5.8221e-02],\n", 1684 | " device='cuda:0')" 1685 | ] 1686 | }, 1687 | "execution_count": 58, 1688 | "metadata": {}, 1689 | "output_type": "execute_result" 1690 | } 1691 | ], 1692 | "source": [ 1693 | "encoder.state_dict()['encoder.weight'][1]" 1694 | ] 1695 | }, 1696 | { 1697 | "cell_type": "code", 1698 | "execution_count": null, 1699 | "metadata": {}, 1700 | "outputs": [], 1701 | "source": [] 1702 | } 1703 | ], 1704 | "metadata": { 1705 | "kernelspec": { 1706 | "display_name": "Python 3", 1707 | "language": "python", 1708 | "name": "python3" 1709 | }, 1710 | "language_info": { 1711 | "codemirror_mode": { 1712 | "name": "ipython", 1713 | "version": 3 1714 | }, 1715 | "file_extension": ".py", 1716 | "mimetype": "text/x-python", 1717 | "name": "python", 1718 | "nbconvert_exporter": "python", 1719 | "pygments_lexer": "ipython3", 1720 | "version": "3.7.3" 1721 | } 1722 | }, 1723 | "nbformat": 4, 1724 | "nbformat_minor": 2 1725 | } 1726 | --------------------------------------------------------------------------------
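
The cells above write the TransformerXL token embeddings and their SentencePiece vocabulary to transformer3_embeddings.tsv (30000 rows × 410 columns) and transformer3_embeddings_metadata.tsv (one piece per row). The sketch below is not part of the repository; it assumes both files sit in the current working directory exactly as produced by the df.to_csv(...) calls, and it uses '▁के' as a probe only because that piece appears at row 3 of the metadata preview. It loads the two files back, normalises the matrix once, and returns the nearest vocabulary pieces by cosine similarity. For reference, the np.exp(3.261883) cell earlier is the usual loss-to-perplexity conversion: exponentiating a cross-entropy of 3.261883 nats gives a perplexity of roughly 26.1.

import numpy as np
import pandas as pd

# Load the exported embedding matrix: 30000 vocabulary pieces x 410 dimensions.
emb = pd.read_csv('transformer3_embeddings.tsv', sep='\t', header=None).to_numpy()

# Load the matching vocabulary; keep_default_na=False stops pandas from turning
# pieces such as an empty string or 'NA' into NaN.
pieces = pd.read_csv('transformer3_embeddings_metadata.tsv', sep='\t', header=None,
                     keep_default_na=False)[0].astype(str).tolist()

assert emb.shape == (len(pieces), 410)

# Normalise rows once so every query is a single matrix-vector product.
unit = emb / (np.linalg.norm(emb, axis=1, keepdims=True) + 1e-8)

def nearest(piece, k=5):
    """Return the k vocabulary pieces closest to `piece` by cosine similarity."""
    i = pieces.index(piece)              # raises ValueError if the piece is unknown
    sims = unit @ unit[i]                # cosine similarity against every row
    order = np.argsort(-sims)            # best matches first (the piece itself is rank 0)
    top = [j for j in order[:k + 1] if j != i][:k]
    return [(pieces[j], float(sims[j])) for j in top]

# '▁के' is row 3 in the metadata preview above, so it is a safe example piece.
print(nearest('▁के'))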