├── requirements.txt ├── svm_model.sav ├── media ├── news.png └── result.gif ├── system_design.png ├── vectorizer.pickle ├── LICENSE ├── app.py ├── README.md ├── stopword.txt └── Burmese_News_Classification.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | streamlit 2 | matplotlib 3 | numpy 4 | scikit-learn 5 | pandas 6 | pyidaungsu 7 | -------------------------------------------------------------------------------- /svm_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThuraAung1601/Automatic-Myanmar-News-Classification/HEAD/svm_model.sav -------------------------------------------------------------------------------- /media/news.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThuraAung1601/Automatic-Myanmar-News-Classification/HEAD/media/news.png -------------------------------------------------------------------------------- /media/result.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThuraAung1601/Automatic-Myanmar-News-Classification/HEAD/media/result.gif -------------------------------------------------------------------------------- /system_design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThuraAung1601/Automatic-Myanmar-News-Classification/HEAD/system_design.png -------------------------------------------------------------------------------- /vectorizer.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThuraAung1601/Automatic-Myanmar-News-Classification/HEAD/vectorizer.pickle -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Thura Aung 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
"""app.py - Streamlit demo for the Myanmar news classifier.

Loads the stop-word list, the pickled TF-IDF vectorizer and the pickled SVM
model from the working directory, then serves a single page where the user
pastes a news article and gets back the predicted category.
"""
import streamlit as st
import pickle
import numpy as np
import pyidaungsu as pds
from sklearn.feature_extraction.text import TfidfVectorizer

# One stop word per line; trailing whitespace/newlines are stripped.
with open("./stopword.txt", encoding='utf8') as stopwordsfile:
    stopwordslist = [line.rstrip() for line in stopwordsfile]

# Frozen copy used for O(1) membership tests in stop_word().
_STOPWORDS = frozenset(stopwordslist)


def stop_word(sentence):
    """Return *sentence* with every whitespace-separated stop word removed.

    The remaining words are re-joined with single spaces.
    """
    return ' '.join(word for word in sentence.split() if word not in _STOPWORDS)


# NOTE(review): the training notebook pickles
# TfidfVectorizer(tokenizer=tokenize, ...). pickle stores functions by
# qualified name, so unpickling presumably needs a module-level `tokenize`
# to exist here - keep this name and define it before loading the vectorizer.
# NOTE(review): the notebook's `tokenize` at pickling time returned a token
# list, and its preprocessing also ran `clean_sentence`; this version returns
# a joined string and does no cleaning. Verify the features produced here
# match what the model was trained on.
def tokenize(line):
    """Word-tokenize *line* with pyidaungsu and drop stop words.

    Returns the surviving words joined by single spaces.
    """
    words = pds.tokenize(line, form="word")
    return stop_word(' '.join(str(word) for word in words))


def _load_pickle(path):
    """Unpickle the object stored at *path*, closing the file afterwards.

    SECURITY: pickle can execute arbitrary code while loading. Only load
    artifacts that ship with this repository, never user-supplied files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


filename = './svm_model.sav'
# load the model from disk
loaded_model = _load_pickle(filename)

vectorizer = _load_pickle("vectorizer.pickle")

st.title('Automatic News Classification System for Myanmar Language')
st.subheader("Input the News content below")
sentence = st.text_area("Enter your news Content Here", height=200)
predict_btt = st.button("Predict")
if predict_btt:
    # Tokenize only when a prediction is requested; Streamlit reruns the
    # whole script on every interaction.
    cleaned = tokenize(sentence) if sentence.strip() else ""
    if not cleaned:
        # An empty document becomes an all-zero vector and the classifier
        # would still return an arbitrary label, so refuse instead.
        st.warning("No classifiable content found. Please enter some news text.")
    else:
        data = vectorizer.transform([cleaned]).toarray()
        prediction = loaded_model.predict(data)
        # predict() returns one label per input row; exactly one row is sent.
        st.text(f"This is {prediction[0]} News")
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Automatic Myanmar News Classification 2 | 3 | ## Project Overview 4 | 5 | Automatic Myanmar News Classification System using Linear SVM. I also experimented with other machine learning algorithms - Logistic Regression, Multinomial Naive Bayes, Random Forest and Decision Tree. The weighted F1-score is highest when using Linear SVM. 6 | - A.H.Khine, K.T.Nwet and K.M.Soe, in "Automatic Myanmar News Classification", proposed a system based on Naive Bayes. I used their dataset for training the model.[1] 7 | - Tokenization is done using the pyidaungsu library, which is based on fasttext. 8 | - The vectorizer I used is TF-IDF. 9 | - N-gram for TF-IDF is Unigram + Bigram 10 | 11 | ## System Design 12 | 13 | - I use the system design proposed in Nwet, Khin & Darren, Seth, Machine Learning Algorithms for Myanmar News Classification [2] 14 | 15 | ![System](system_design.png) 16 | 17 | ## Dataset 18 | 19 | The dataset is taken from Aye Hnin Khine's [repository](https://github.com/ayehninnkhine/MyanmarNewsClassificationSystem) 20 | ![Dataset](media/news.png) 21 | 22 | ## Experiments 23 | 24 | - For feature extraction, vectorize text data using TF-IDF vectorizer available in scikit-learn 25 | - Then train on different machine learning models for classification 26 | 27 | | Model | F1-score | 28 | |:---------------------------:|:------------:| 29 | | Decision Tree | 67% | 30 | | Random Forest | 82% | 31 | | Multinomial Naive Bayes | 84% | 32 | | Logistic Regression | 86% | 33 | | **Linear SVM** | **88%** | 34 | 35 | ## Demonstration 36 | 37 | Demonstration available [HERE](https://share.streamlit.io/thuraaung1601/automatic-myanmar-news-classification/main/app.py) 38 | ![Demo](media/result.gif) 39 | 40 | ## How to run demo 41 | - Download this repository 42 | - Install requirements 43 | ```{r, engine='bash', 
count_lines} 44 | tra@thura-pc:~$ pip install -r requirements.txt 45 | ``` 46 | - Run the main notebook - Burmese_News_Classification.ipynb - for training 47 | - For Demo 48 | ```{r, engine='bash', count_lines} 49 | tra@thura-pc:~$ streamlit run app.py 50 | ``` 51 | ## Future Work 52 | - More data is needed 53 | - Test with Hybrid methods and Deep Learning Approaches 54 | 55 | ## References 56 | [1] A.H.Khine, K.T.Nwet, K.M.Soe, Automatic Myanmar News Classification, 15th Proceedings of International Conference on Computer Applications, February 2017, pp. 401-408 57 |
58 | [2] Nwet, Khin & Darren, Seth. (2019). MACHINE LEARNING ALGORITHMS FOR MYANMAR NEWS CLASSIFICATION. Journal of Natural Language Processing. 8. 17-24. 59 | 60 | 61 | -------------------------------------------------------------------------------- /stopword.txt: -------------------------------------------------------------------------------- 1 | က 2 | ကတည်းက 3 | ကတော့ 4 | ကပ 5 | ကဘာ 6 | ကရ 7 | ကလ 8 | ကလူ 9 | ကာ 10 | ကာလ 11 | ကား 12 | ကို 13 | ကိုယ့် 14 | ကိုယ်တိုင် 15 | ကုန် 16 | ကေ 17 | ကော 18 | ကောင်း 19 | ကဲ 20 | ကျ 21 | ကျန 22 | ကျပ် 23 | ကျော် 24 | ကျွန်တော့် 25 | ကျွန်တော် 26 | ကျွန်မ 27 | ကြ 28 | ကြည့် 29 | ကြာ 30 | ကြာတော့ 31 | ကြား 32 | ကြိမ်မြောက် 33 | ကြီး 34 | ကြောင့် 35 | ကြောင်း 36 | ခ 37 | ခင် 38 | ခဏခဏ 39 | ခန့် 40 | ခါ 41 | ခိုင်း 42 | ခု 43 | ခုချိန် 44 | ခေါ် 45 | ခဲ့ 46 | ခံ 47 | ချ 48 | ချက် 49 | ချင် 50 | ချင်း 51 | ချိန် 52 | ချုပ် 53 | ခြ 54 | ခြင်း 55 | ခွင့် 56 | စ 57 | စက 58 | စစ် 59 | စဉ် 60 | စတင် 61 | စရာ 62 | စသည့် 63 | စာ 64 | စား 65 | စိတ် 66 | စိုး 67 | စီ 68 | စီး 69 | စု 70 | စုံ 71 | စေ 72 | စေသော 73 | စွာ 74 | ဆ 75 | ဆက် 76 | ဆက်စပ် 77 | ဆန 78 | ဆို 79 | ဆိုင် 80 | ဆိုင်ရာ 81 | ဆိုပြီး 82 | ဆိုသည် 83 | ဆီ 84 | ည 85 | ညနေ 86 | ညီ 87 | တ 88 | တကယ် 89 | တက် 90 | တချို့ 91 | တခြား 92 | တင် 93 | တင်ပြ 94 | တစ် 95 | တစ်ဆင့် 96 | တစ်ဦး 97 | တည်း 98 | တတ် 99 | တန 100 | တယ် 101 | တာ 102 | တာကို 103 | တို 104 | တိုင်း 105 | တို့ 106 | တို့သည် 107 | တိုး 108 | တီ 109 | တုန်း 110 | တော 111 | တောင် 112 | တော့ 113 | တော် 114 | တော်တော်လေး 115 | တဲ့ 116 | တွင် 117 | တွေ 118 | တွေ့ 119 | ထ 120 | ထက် 121 | ထင် 122 | ထည့် 123 | ထပ် 124 | ထား 125 | ထားသည် 126 | ထိ 127 | ထို 128 | ထိုသို့ 129 | ထို့အပြင် 130 | ထုတ် 131 | ထူး 132 | ထောင် 133 | ထဲ 134 | ထဲက 135 | ထံ 136 | ထွက် 137 | ထွန်း 138 | ထွေ 139 | ထွေထွေထူးထူး 140 | ဒ 141 | ဒါ 142 | ဒါကို 143 | ဒါကြောင့် 144 | ဒါတွေ 145 | ဒါတွေက 146 | ဒါနဲ့ 147 | ဒါပေမဲ့ 148 | ဒါလေး 149 | ဒါ့အပြင် 150 | ဒီ 151 | ဒီထက် 152 | ဒီနေ့ 153 | ဒီမှာ 154 | ဒီလို 155 | ဒု 156 | န 157 | နက် 158 | နဂို 159 | နည်း 160 
| နား 161 | နိုင် 162 | နိုင်သည် 163 | နီ 164 | နေ 165 | နေကျ 166 | နောက် 167 | နောက်ဆုံး 168 | နောက်တစ်ခု 169 | နောက်ထပ် 170 | နောက်ပြီး 171 | နော့ 172 | နော် 173 | နေ့ 174 | နေ့စဉ် 175 | နဲ့ 176 | နှင့် 177 | နှင့်အတူ 178 | နှစ် 179 | ပ 180 | ပင 181 | ပတ 182 | ပတ်သက် 183 | ပါ 184 | ပါပ 185 | ပါဝင် 186 | ပါသည် 187 | ပိတ် 188 | ပို 189 | ပိုင်း 190 | ပိုမို 191 | ပို့ 192 | ပုံ 193 | ပေ 194 | ပေါ 195 | ပေါင်း 196 | ပေါ့ 197 | ပေါ် 198 | ပေး 199 | ပေးလိုက် 200 | ပဲ 201 | ပျော် 202 | ပြ 203 | ပြန် 204 | ပြီ 205 | ပြီး 206 | ပြီးခဲ့သည့် 207 | ပြီးတော့ 208 | ပြု 209 | ပြော 210 | ပြောကြား 211 | ပြောသည် 212 | ပြဲ 213 | ပွဲ 214 | ဖ 215 | ဖက် 216 | ဖို့ 217 | ဖူး 218 | ဖော် 219 | ဖြင့် 220 | ဖြစ် 221 | ဖြစ်ကြောင်း 222 | ဖြစ်တယ် 223 | ဖြစ်ပါတယ် 224 | ဖြစ်ပြီး 225 | ဖြစ်လာ 226 | ဖြစ်သည် 227 | ဖွ 228 | ဖွင့် 229 | ဖွဲ့ 230 | ဘ 231 | ဘက် 232 | ဘယ 233 | ဘယ်လို 234 | ဘဝ 235 | ဘာ 236 | ဘူး 237 | ဘဲ 238 | မ 239 | မက 240 | မင်း 241 | မစ 242 | မည့် 243 | မည် 244 | မည်သို့ 245 | မန 246 | မမ 247 | မယ့် 248 | မယ် 249 | မရှိ 250 | မရှိသလောက် 251 | မရှိသေး 252 | မသိ 253 | မဟုတ် 254 | မာ 255 | မိ 256 | မိမိ 257 | မေ 258 | မေး 259 | မဲ့ 260 | များ 261 | များသည် 262 | မျိုး 263 | မျှ 264 | မြင့် 265 | မြင် 266 | မှ 267 | မှစ၍ 268 | မှန် 269 | မှာ 270 | မှု 271 | ယ 272 | ယခင် 273 | ယခု 274 | ယခုခေတ် 275 | ယင်း 276 | ယင်းကဲ့သို့ 277 | ယင်းသို့ 278 | ယူ 279 | ယောက် 280 | ရ 281 | ရက် 282 | ရင် 283 | ရင်း 284 | ရတာ 285 | ရန် 286 | ရပ် 287 | ရရှိ 288 | ရရှိမည် 289 | ရလာ 290 | ရာ 291 | ရာတွင် 292 | ရာ၌ 293 | ရီ 294 | ရေ 295 | ရော 296 | ရောက် 297 | ရောက်ရှိ 298 | ရေး 299 | ရဲ့ 300 | ရွယ် 301 | ရှင့် 302 | ရှင် 303 | ရှင်း 304 | ရှာ 305 | ရှိ 306 | ရှိပါတယ် 307 | ရှိသည် 308 | ရှေ့ 309 | လ 310 | လက် 311 | လက်ရှိ 312 | လည်း 313 | လမ်း 314 | လာ 315 | လာမည့် 316 | လာရောက် 317 | လား 318 | လိမ့် 319 | လို 320 | လိုက် 321 | လိုက်ပါ 322 | လို့ 323 | လုပ် 324 | လုံး 325 | လုံးဝ 326 | လူ 327 | လောက် 328 | လောလောဆယ် 329 | လေး 330 | လဲ 331 | လဲဆို 332 | လျက် 333 | လျှင် 334 | လွန် 335 | ဝ 336 | ဝင် 337 | ဝင်း 
338 | သ 339 | သက် 340 | သက်ဆိုင်ရာ 341 | သင့် 342 | သစ် 343 | သည့် 344 | သည့်အတွက် 345 | သည် 346 | သည်နှင့် 347 | သတ် 348 | သဖြင့် 349 | သလို 350 | သာ 351 | သား 352 | သိ 353 | သိန်း 354 | သိပ် 355 | သိရ 356 | သိရသည် 357 | သိရှိ 358 | သို့ 359 | သို့မဟုတ် 360 | သို့သော် 361 | သုံး 362 | သူ 363 | သူမ 364 | သူများ 365 | သူ့ 366 | သေ 367 | သေသေချာချာ 368 | သော 369 | သောကြောင့် 370 | သော်လည်း 371 | သေး 372 | သွား 373 | ဟန် 374 | ဟာ 375 | ဟို 376 | ဟီး 377 | ဟု 378 | ဟုတ် 379 | ဟူ 380 | ဟော 381 | ဟဲဟဲ 382 | အ 383 | အက 384 | အကြိမ် 385 | အခ 386 | အခါ 387 | အခု 388 | အခုတလော 389 | အခုလို 390 | အခုဟာ 391 | အချက် 392 | အချို့ 393 | အခြား 394 | အင် 395 | အစ 396 | အစွမ်းကုန် 397 | အဆိုပါ 398 | အတူ 399 | အတွက် 400 | အတွင်း 401 | အထက်ပါ 402 | အထိ 403 | အထူး 404 | အထူးသဖြင့် 405 | အနေ 406 | အနေဖြင့် 407 | အပ 408 | အပါအဝင် 409 | အပေါ် 410 | အပြင် 411 | အဖြစ် 412 | အမ 413 | အမှတ် 414 | အရ 415 | အရင် 416 | အရင်က 417 | အရမ်း 418 | အလိုက် 419 | အလိုလို 420 | အား 421 | အားဖြင့် 422 | အားလုံး 423 | အို 424 | အောက် 425 | အောင် 426 | အေး 427 | အဲ 428 | အဲဒီ 429 | အဲဒီလို 430 | အဲ့ 431 | ဥပမာ 432 | ဦး 433 | ၌ 434 | ၍ 435 | ၎င်း 436 | ၏ 437 | -------------------------------------------------------------------------------- /Burmese_News_Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "News-Classification.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [ 9 | "sHY9s9gh0maj", 10 | "KaZwIKyk1eUo", 11 | "-JLcmb_l1pu8", 12 | "-Jd2eK2u1iRS" 13 | ] 14 | }, 15 | "kernelspec": { 16 | "name": "python3", 17 | "display_name": "Python 3" 18 | }, 19 | "language_info": { 20 | "name": "python" 21 | } 22 | }, 23 | "cells": [ 24 | { 25 | "cell_type": "markdown", 26 | "source": [ 27 | "Dataset : https://github.com/ayehninnkhine/MyanmarNewsClassificationSystem\n", 28 | "\n", 29 | "Demo Website : 
https://share.streamlit.io/thuraaung1601/automatic-myanmar-news-classification/main/app.py\n", 30 | "\n", 31 | "Github : https://github.com/ThuraAung1601/Automatic-Myanmar-News-Classification" 32 | ], 33 | "metadata": { 34 | "id": "hoFwvAmn21uP" 35 | } 36 | }, 37 | { 38 | "cell_type": "code", 39 | "source": [ 40 | "import pandas as pd\n", 41 | "import numpy as np\n", 42 | "from matplotlib import pyplot as plt\n", 43 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 44 | "from sklearn.model_selection import train_test_split" 45 | ], 46 | "metadata": { 47 | "id": "toQF-eMHkbW-" 48 | }, 49 | "execution_count": 1, 50 | "outputs": [] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "source": [ 55 | "### Data preparation" 56 | ], 57 | "metadata": { 58 | "id": "8IseFn5p1z3u" 59 | } 60 | }, 61 | { 62 | "cell_type": "code", 63 | "source": [ 64 | "dataset = \"./mm-news-classification-dataset.csv\"" 65 | ], 66 | "metadata": { 67 | "id": "Pjl6r4vpMYCx" 68 | }, 69 | "execution_count": 2, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "source": [ 75 | "data = pd.read_csv(dataset)\n", 76 | "print(data.head())" 77 | ], 78 | "metadata": { 79 | "colab": { 80 | "base_uri": "https://localhost:8080/" 81 | }, 82 | "id": "C5cEOwIjMTEI", 83 | "outputId": "56131687-27d1-42fd-af2d-36d36c62b68e" 84 | }, 85 | "execution_count": 3, 86 | "outputs": [ 87 | { 88 | "output_type": "stream", 89 | "name": "stdout", 90 | "text": [ 91 | " Unnamed: 0 News Category\n", 92 | "0 0 ဒီမိုကရေစီ_ရ_မှ_အမျှဝေ_ပါ_ဟု_မဝင်းမော်ဦး_တောင်... Politics\n", 93 | "1 1 ဒီမိုကရေစီ_အရေး_လူ့အခွင့်အရေး_တောင်းဆို_ဆန္ဒပြ... Politics\n", 94 | "2 2 ၂၀၁၅_ခုနှစ်_အထွေထွေရွေးကောက်ပွဲ_တွင်_အမျိုးသား... Politics\n", 95 | "3 3 လာမည့်_စက်တင်ဘာ_၁၉_ရက်_တွင်_မဝင်းမော်ဦး_ကျဆုံး... Politics\n", 96 | "4 4 တပ်မတော်ကာကွယ်ရေးဦးစီးချုပ်_ဗိုလ်ချုပ်မှူးကြီး... 
Politics\n" 97 | ] 98 | } 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "source": [ 104 | "data.isnull().sum()\n" 105 | ], 106 | "metadata": { 107 | "colab": { 108 | "base_uri": "https://localhost:8080/" 109 | }, 110 | "id": "7EvKVTDWNRI1", 111 | "outputId": "75a28cd4-f659-496d-a31b-06ebabceb2ac" 112 | }, 113 | "execution_count": 4, 114 | "outputs": [ 115 | { 116 | "output_type": "execute_result", 117 | "data": { 118 | "text/plain": [ 119 | "Unnamed: 0 0\n", 120 | "News 0\n", 121 | "Category 0\n", 122 | "dtype: int64" 123 | ] 124 | }, 125 | "metadata": {}, 126 | "execution_count": 4 127 | } 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "source": [ 133 | "data['category_id'] = data['Category'].factorize()[0]\n", 134 | "colslist = ['Index','News', 'Category', 'category_id']\n", 135 | "data.columns = colslist\n", 136 | "data.groupby('Category').Index.count().plot.bar(ylim=0)" 137 | ], 138 | "metadata": { 139 | "colab": { 140 | "base_uri": "https://localhost:8080/", 141 | "height": 359 142 | }, 143 | "id": "5GaEbXWJNY6I", 144 | "outputId": "2d4ccf5f-6b04-4ad7-cbbe-b90b50b62902" 145 | }, 146 | "execution_count": 5, 147 | "outputs": [ 148 | { 149 | "output_type": "execute_result", 150 | "data": { 151 | "text/plain": [ 152 | "" 153 | ] 154 | }, 155 | "metadata": {}, 156 | "execution_count": 5 157 | }, 158 | { 159 | "output_type": "display_data", 160 | "data": { 161 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAFECAYAAADLDO40AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAdZ0lEQVR4nO3de5wcZZ3v8c+XAIJCJCyzLCZg0A0ociTAiCgHRXG5qSDrDVwB0TW6goJ6dEGPL1APyvEuXmCjRMALiAssUVGIrMDKGmUSYwi3Q7gtycYwigsIyEL4nj/6GSniTDLT3Zmanvq+X69+TdVT1d2/7lfmO5WnnqpHtomIiGbYqO4CIiJi/CT0IyIaJKEfEdEgCf2IiAZJ6EdENMjGdRewPttss41nzpxZdxkRET1j0aJFv7XdN9y2CR/6M2fOZGBgoO4yIiJ6hqS7RtqW7p2IiAZJ6EdENEhCPyKiQRL6ERENktCPiGiQhH5ERIMk9CMiGiShHxHRIAn9iIgGmfBX5G4IM0/6Yd0lrNedp7+y7hIiYhLKkX5ERIMk9CMiGiShHxHRIOsNfUnbS/qppBsl3SDphNK+taQFkm4tP6eVdkk6Q9JySUsl7VF5rWPK/rdKOmbDfayIiBjOaE7kPga83/ZiSVsCiyQtAN4CXGn7dEknAScB/wgcDMwqjxcCZwIvlLQ1cArQD7i8znzbv+/2h4qIgAzaGM56Q9/2KmBVWX5A0k3AdOAwYL+y27nAVbRC/zDgPNsGFkraStJ2Zd8Ftu8FKH84DgLO7+LniXHWC79U0DujofJ9xoY2pj59STOB3YFfANuWPwgAvwG2LcvTgbsrT1tR2kZqH+595kgakDQwODg4lhIjImIdRh36krYALgJOtH1/dVs5qne3irI913a/7f6+vmFn/IqIiDaMKvQlbUIr8L9t++LSvLp021B+3lPaVwLbV54+o7SN1B4REeNkNKN3BJwN3GT7c5VN84GhETjHAJdW2o8uo3j2Bu4r3UCXAwdImlZG+hxQ2iIiYpyMZvTOPsBRwPWSlpS2DwGnAxdKehtwF/CGsu0y4BBgOfAQcCyA7XslfRy4ruz3saGTuhERMT5GM3rnZ4BG2Lz/MPsbOG6E15oHzBtLgRER0T25IjciokES+hERDZLQj4hokIR+RESDJPQjIhokoR8R0SAJ/YiIBknoR0Q0SEI/IqJBEvoREQ2S0I+IaJCEfkREgyT0IyIaJKEfEdEgCf2IiAZJ6EdENMhopkucJ+keScsqbd+VtKQ87hyaUUvSTEkPV7adVXnOnpKul7Rc0hllGsaIiBhHo5ku8Rzgy8B5Qw223zi0LOmzwH2V/W+zPXuY1zkTeDvwC1pTKh4E/GjsJUdERLvWe6Rv+xpg2Llsy9H6G4Dz1/UakrYDptpeWKZTPA94zdjLjYiITnTap78vsNr2rZW2HSX9StLVkvYtbdOBFZV9VpS2YUmaI2lA0sDg4GCHJUZExJBOQ/9InnyUvwrYwfbuwPuA70iaOtYXtT3Xdr/t/r6+vg5LjIiIIaPp0x+WpI2BvwX2HGqz/QjwSFleJOk2YCdgJTCj8vQZpS0iIsZRJ0f6rwButv2nbhtJfZKmlOVnAbOA222vAu6XtHc5D3A0cGkH7x0REW0YzZDN84GfAztLWiHpbWXTEfz5CdyXAEvLEM5/Bt5pe+gk8LuArwPLgdvIyJ2IiHG33u4d20eO0P6WYdouAi4aYf8BYNcx1hcREV2UK3IjIhokoR8R0SAJ/YiIBknoR0Q0SEI/IqJBEvoREQ2S0I+IaJCEfkREgyT0IyIaJKEfEdEgCf2IiAZJ6EdENEhCPyKiQRL6ERENktCPiGiQhH5ERIOMZuaseZLukbSs0naqpJWSlpTHIZVtJ0taLukWSQdW2g8qbcslndT9jxIREeszmiP9c4CDhmn/vO3Z5XEZgKRdaE2j+LzynK9KmlLmzf0KcDCwC3Bk2Tc
iIsbRaKZLvEbSzFG+3mHABbYfAe6QtBzYq2xbbvt2AEkXlH1vHHPFERHRtk769I+XtLR0/0wrbdOBuyv7rChtI7UPS9IcSQOSBgYHBzsoMSIiqtoN/TOBZwOzgVXAZ7tWEWB7ru1+2/19fX3dfOmIiEZbb/fOcGyvHlqW9DXgB2V1JbB9ZdcZpY11tEdExDhp60hf0naV1cOBoZE984EjJD1F0o7ALOCXwHXALEk7StqU1sne+e2XHRER7Vjvkb6k84H9gG0krQBOAfaTNBswcCfwDgDbN0i6kNYJ2seA42yvKa9zPHA5MAWYZ/uGrn+aiIhYp9GM3jlymOaz17H/acBpw7RfBlw2puoiIqKrckVuRESDJPQjIhokoR8R0SAJ/YiIBknoR0Q0SEI/IqJBEvoREQ2S0I+IaJCEfkREgyT0IyIaJKEfEdEgCf2IiAZJ6EdENEhCPyKiQRL6ERENktCPiGiQ9Ya+pHmS7pG0rNL2aUk3S1oq6RJJW5X2mZIelrSkPM6qPGdPSddLWi7pDEnaMB8pIiJGMpoj/XOAg9ZqWwDsavv5wP8DTq5su8327PJ4Z6X9TODttObNnTXMa0ZExAa23tC3fQ1w71ptV9h+rKwuBGas6zXKROpTbS+0beA84DXtlRwREe3qRp/+W4EfVdZ3lPQrSVdL2re0TQdWVPZZUdqGJWmOpAFJA4ODg10oMSIioMPQl/Rh4DHg26VpFbCD7d2B9wHfkTR1rK9re67tftv9fX19nZQYEREVG7f7RElvAV4F7F+6bLD9CPBIWV4k6TZgJ2AlT+4CmlHaIiJiHLV1pC/pIOCDwKG2H6q090maUpafReuE7e22VwH3S9q7jNo5Gri04+ojImJM1nukL+l8YD9gG0krgFNojdZ5CrCgjLxcWEbqvAT4mKRHgceBd9oeOgn8LlojgTandQ6geh4gIiLGwXpD3/aRwzSfPcK+FwEXjbBtANh1TNVFRERX5YrciIgGSehHRDRIQj8iokES+hERDZLQj4hokIR+RESDJPQjIhokoR8R0SAJ/YiIBknoR0Q0SEI/IqJBEvoREQ2S0I+IaJCEfkREgyT0IyIaJKEfEdEgowp9SfMk3SNpWaVta0kLJN1afk4r7ZJ0hqTlkpZK2qPynGPK/rdKOqb7HyciItZltEf65wAHrdV2EnCl7VnAlWUd4GBac+POAuYAZ0LrjwStqRZfCOwFnDL0hyIiIsbHqELf9jXAvWs1HwacW5bPBV5TaT/PLQuBrSRtBxwILLB9r+3fAwv48z8kERGxAXXSp7+t7VVl+TfAtmV5OnB3Zb8VpW2k9j8jaY6kAUkDg4ODHZQYERFVXTmRa9uAu/Fa5fXm2u633d/X19etl42IaLxOQn916bah/LyntK8Etq/sN6O0jdQeERHjpJPQnw8MjcA5Bri00n50GcWzN3Bf6Qa6HDhA0rRyAveA0hYREeNk49HsJOl8YD9gG0kraI3COR24UNLbgLuAN5TdLwMOAZYDDwHHAti+V9LHgevKfh+zvfbJ4YiI2IBGFfq2jxxh0/7D7GvguBFeZx4wb9TVRUREV+WK3IiIBknoR0Q0SEI/IqJBEvoREQ2S0I+IaJCEfkREgyT0IyIaJKEfEdEgCf2IiAZJ6EdENEhCPyKiQRL6ERENktCPiGiQhH5ERIMk9CMiGiShHxHRIG2HvqSdJS2pPO6XdKKkUyWtrLQfUnnOyZKWS7pF0oHd+QgRETFao5o5azi2bwFmA0iaQmuS80toTY/4edufqe4vaRfgCOB5wDOAn0jayfaadmuIiIix6Vb3zv7AbbbvWsc+hwEX2H7E9h205tDdq0vvHxERo9Ct0D8COL+yfrykpZLmSZpW2qYDd1f2WVHa/oykOZIGJA0MDg52qcSIiOg49CVtChwKfK80nQk8m1bXzyrgs2N9Tdtzbffb7u/r6+u0xIiIKLpxpH8wsNj2agDbq22vsf048DWe6MJZCWxfed6M0hY
REeOkG6F/JJWuHUnbVbYdDiwry/OBIyQ9RdKOwCzgl114/4iIGKW2R+8ASHoa8DfAOyrNn5I0GzBw59A22zdIuhC4EXgMOC4jdyIixldHoW/7QeAv1mo7ah37nwac1sl7RkRE+3JFbkREgyT0IyIaJKEfEdEgCf2IiAZJ6EdENEhCPyKiQRL6ERENktCPiGiQhH5ERIMk9CMiGiShHxHRIAn9iIgGSehHRDRIQj8iokES+hERDZLQj4hokG5MjH6npOslLZE0UNq2lrRA0q3l57TSLklnSFouaamkPTp9/4iIGL1uHem/zPZs2/1l/STgStuzgCvLOrQmUZ9VHnOAM7v0/hERMQobqnvnMODcsnwu8JpK+3luWQhstdZE6hERsQF1I/QNXCFpkaQ5pW1b26vK8m+AbcvydODuynNXlLYnkTRH0oCkgcHBwS6UGBER0OHE6MX/tL1S0l8CCyTdXN1o25I8lhe0PReYC9Df3z+m50ZExMg6PtK3vbL8vAe4BNgLWD3UbVN+3lN2XwlsX3n6jNIWERHjoKPQl/Q0SVsOLQMHAMuA+cAxZbdjgEvL8nzg6DKKZ2/gvko3UEREbGCddu9sC1wiaei1vmP7x5KuAy6U9DbgLuANZf/LgEOA5cBDwLEdvn9ERIxBR6Fv+3Zgt2HafwfsP0y7geM6ec+IiGhfrsiNiGiQhH5ERIMk9CMiGiShHxHRIAn9iIgGSehHRDRIQj8iokES+hERDZLQj4hokIR+RESDJPQjIhokoR8R0SAJ/YiIBknoR0Q0SEI/IqJBEvoREQ3SduhL2l7STyXdKOkGSSeU9lMlrZS0pDwOqTznZEnLJd0i6cBufICIiBi9TmbOegx4v+3FZZ7cRZIWlG2ft/2Z6s6SdgGOAJ4HPAP4iaSdbK/poIaIiBiDto/0ba+yvbgsPwDcBExfx1MOAy6w/YjtO2jNk7tXu+8fERFj15U+fUkzgd2BX5Sm4yUtlTRP0rTSNh24u/K0FYzwR0LSHEkDkgYGBwe7UWJERNCF0Je0BXARcKLt+4EzgWcDs4FVwGfH+pq259rut93f19fXaYkREVF0FPqSNqEV+N+2fTGA7dW219h+HPgaT3ThrAS2rzx9RmmLiIhx0snoHQFnAzfZ/lylfbvKbocDy8ryfOAISU+RtCMwC/hlu+8fERFj18nonX2Ao4DrJS0pbR8CjpQ0GzBwJ/AOANs3SLoQuJHWyJ/jMnInImJ8tR36tn8GaJhNl63jOacBp7X7nhER0ZlckRsR0SAJ/YiIBknoR0Q0SEI/IqJBEvoREQ2S0I+IaJCEfkREgyT0IyIaJKEfEdEgCf2IiAZJ6EdENEhCPyKiQRL6ERENktCPiGiQhH5ERIMk9CMiGmTcQ1/SQZJukbRc0knj/f4REU02rqEvaQrwFeBgYBdaUyvuMp41REQ02Xgf6e8FLLd9u+3/Bi4ADhvnGiIiGku2x+/NpNcBB9n++7J+FPBC28evtd8cYE5Z3Rm4ZdyKbM82wG/rLmISyffZXfk+u6sXvs9n2u4bbkPbE6NvSLbnAnPrrmO0JA3Y7q+7jski32d35fvsrl7/Pse7e2clsH1lfUZpi4iIcTDeoX8dMEvSjpI2BY4A5o9zDRERjTWu3Tu2H5N0PHA5MAWYZ/uG8axhA+mZrqgeke+zu/J9dldPf5/jeiI3IiLqlStyIyIaJKEfEdEgCf2IiAZJ6EdEjJKkaZKeX3cdnUjot0nSCZKmquVsSYslHVB3Xb1K0jdH0xajI+lpkjYqyztJOlTSJnXX1YskXVV+17cGFgNfk/S5uutqV0K/fW+1fT9wADANOAo4vd6Setrzqivl5nx71lTLZHANsJmk6cAVtP59nlNrRb3r6eV3/W+B82y/EHhFzTW1LaHfPpWfhwDfLNcbaB37xzAknSzpAeD5ku4vjweAe4BLay6vl8n2Q7SC6qu2X89af1hj1DaWtB3wBuAHdRfTqYR++xZJuoJW6F8
uaUvg8Zpr6jm2P2l7S+DTtqeWx5a2/8L2yXXX18Mk6UXA3wE/LG1Taqynl32U1gWly21fJ+lZwK0119S2XJzVptJfOhu43fZ/lf6+GbaX1lxazypdEc+kcqW47Wvqq6h3SXop8H7gWtv/twTVibbfU3NpPUfSPravXV9br0jot0nSPsAS2w9KejOwB/BF23fVXFpPknQ6rXsx3QisKc22fWh9VUWApMW291hfW6+YkLdW7hFnArtJ2o3WEdXXgfOAl9ZaVe86HNjZ9iN1FzIZSFoAvN72f5X1acAFtg+st7LeUbrHXgz0SXpfZdNUerirLH367XvMrf8mHQZ82fZXgC1rrqmX3Q5kSGH39A0FPoDt3wPb1lhPL9oU2ILWwfGWlcf9wOtqrKsjOdJv3wOSTqY1FG7f0sef0GrfQ8ASSVcCfzraTx9029ZI2sH2fwBIeiYZaDAmtq+W9DPg+bY/Wnc93ZLQb98bgTfRGq//G0k7AJ+uuaZeNp/MrdBNHwZ+JulqWkOJ9+WJKUhjlGyvkfSMuuvoppzI7UA5eppl+yeSngpMsf1A3XX1KkmbAzvYnuhzIvcESdsAe5fVhbYn+ryuE5KkM4HpwPeAB4fabV9cW1EdSJ9+myS9Hfhn4J9K03TgX+qrqLdJejWwBPhxWZ8tKUf+YyTpOeXnHsAOwH+Wxw6lLcZuM+B3wMuBV5fHq2qtqAM50m+TpCXAXsAvbO9e2q63/T/qraw3SVpE65fqqsr3ucz2rvVW1lskzbU9R9JPh9ls2y8f96JiQkmffvsesf3fUuvOC5I2BvIXtH2P2r5v6PsscuJxjGwP9dsfbPuP1W2SNquhpJ4naQbwJWCf0vRvwAm2V9RXVfvSvdO+qyV9CNhc0t/Q6u/7fs019bIbJL0JmCJplqQvAf9ed1E9bLjvLt9ne75Ba5DBM8rj+6WtJ6V7p01liObbaN1lU7TuzfF15wttSzkR/mGe/H1+fO2j1Vg3SX9F6/zSt2iNLhv6r9NU4Czbz6mrtl4laYnt2etr6xUJ/YhJRNIxwFuAfmCgsukB4JxeHXFSp3LtyDeA80vTkcCxtvevr6r2JfTbVO69cypP3CBMtE6UPavOunqVpH7gQ8BMnnzDtZ6epagukl5r+6K665gMytDsLwEvKk3XAu8ZuvCt1yT02yTpZuC9wCKeuEEYtn9XW1E9TNItwAeA66mcwM0N7MZG0pttf0vS+xlmYIHtnp3xKbojo3fad5/tH9VdxCQyaDvj8jv3tPJzi1qrmETKbam/SOtCNwM/B95r+/ZaC2tTjvTbVG4FPAW4mCffK2ZxbUX1MEn70+orXfveO+mDjlpJWgh8hSf69I8A3l2mTew5Cf025eKX7pL0LeA5wA080b1j22+tr6reI+mMdW3PDezGTtLStc8tSfq17d3qqqkT6d5pk+2X1V3DJPMC2zvXXcQksKjuAiahH0k6CbiAVvfOG4HLymx52L63zuLGKkf6Y1Q5Ufa+4bbnRFl7JH2D1jy5N9Zdy2QiaQsA23+ou5ZeJemOsjgUltXLxntuxF6O9Mdu6ERZJkzprr1p3U//Dlp9+kNDYDNksw2SdgW+CWzdWtUgcLTtG+qtrHdIegFwt+0dy/oxwGuBO4FTe+0If0iO9GNCKGOh/0yGbLZH0r8DH7b907K+H/AJ2y+utbAeImkx8Arb90p6Ca3unXcDs4Hn2u7J2bNy7502SfqUpKmSNpF0paTBMkF6tKGE+wrgUVr/jR56RHueNhT4ALav4on/pcboTKkczb8RmGv7ItsfAf66xro6ktBv3wG276d1X+07af0j+ECtFfUwSe8GVgMLgB+Wxw9qLaq33S7pI5Jmlsf/pjUPcYzelHL3XID9gX+tbOvZrvGeLXwCGPruXgl8b5jbAsfYnADsnCuau+atwEdpXUdiWrcDzvDXsTmf1t10fws8TOs7RNJfA/fVWVgnEvrt+0G5FcPDwD9I6gNyR8j23U0P/yJNFOW
e+e+k9T/P64H323603qp6k+3Tys3WtgOuqNxBdyNaffs9KSdyO1DG6d5XJk9+KjDV9m/qrqsXSTob2JlWt071itwMgR0DSd+ldV7k34CDgTttn1hvVTGR5Ei/TZKOrixXN503/tVMCv9RHpuWR7Rnl6EpO8sf0l/WXE9MMAn99r2gsrwZrRM9i0not8X2R+uuYZL4U1eO7cdyninWlu6dLpG0FXCB7YPqrqWXSPqC7RMlfZ/hbwV8aA1l9SxJa4AHh1aBzYGHeOJit6l11RYTQ470u+dBYMe6i+hB3yw/P1NrFZOE7Sl11xATW0K/TWsdmW4E7AJcWF9Fvcn2ovLz6rpriWiCdO+0SdJLK6uPAXfZXlFXPb1O0izgk7T+eG421N5rN7OKmOhypN+m6pGppG2AXFTUmW8ApwCfB14GHEuuGI/ouvxSjZGkvSVdJeliSbtLWgYsA1ZLyknc9m1u+0pa//u8y/aptK52joguypH+2H0Z+BDwdFr34jjY9kJJz6F12faP6yyuhz0iaSPgVknHAyvJPK8RXZc+/TGStMT27LJ8k+3nVrb9yvbu9VXXu8q9y28CtgI+DkwFPmX7F7UWFjHJpHtn7B6vLD+81rb8BW3fTNt/sL3C9rG2XwvsUHdREZNNjvTHqHLxS/XCF8r6ZrY3qau2XiZpse091tcWEZ1Jn/4Y5eKX7pJ0MHAIMF3SGZVNU2kNhY2ILkroR93+ExgADgUWVdofAN5bS0URk1i6d6J2kqYA37T9prpriZjsciI3amd7DbC9pNxSOWIDS/dOTBR3ANdKms8Td4nMJCoRXZbQj4nitvLYCNiy5loiJq306ceEIumpth9a/54R0Y706ceEIOlFkm4Ebi7ru0n6as1lRUw6Cf2YKL4AHEi5W6ntXwMvqbWiiEkooR8Thu2712paU0shEZNYTuTGRHG3pBcDlrQJcAKtG7BFRBflRG5MCGUimi8Cr6B1H6MrgPfYvrfWwiImmYR+TAiS9rF97fraIqIzCf2YEHKXzYjxkT79qJWkFwEvBvokva+yaSqQO5pGdFlCP+q2Ka1pETfmyVfi3g+8rpaKIiaxdO/EhCDpmbbvqruOiMkuR/oxUTxF0lxgJpV/l7ZfXltFEZNQjvRjQpD0a+AsWhOp/OmiLNuLRnxSRIxZQj8mBEmLbO9Zdx0Rk11CPyYESacC9wCXAI8MtefirIjuSujHhCDpjmGabftZ415MxCSW0I+IaJDcZTNqJemDleXXr7XtE+NfUcTkltCPuh1RWT55rW0HjWchEU2Q0I+6aYTl4dYjokMJ/aibR1gebj0iOpQTuVErSWuAB2kd1W8ODE2KLmAz25vUVVvEZJTQj4hokHTvREQ0SEI/IqJBEvrRCJL+StIFkm6TtEjSZZJ2GmHfrSS9a7xrjBgPCf2Y9CSJ1j19rrL97HJjt5OBbUd4ylbABg99Sbm1eYy7hH40wcuAR22fNdRg+9fAryRdKWmxpOslHVY2nw48W9ISSZ8GkPQBSddJWirpo0OvI+kjkm6R9DNJ50v6X6V9tqSFZf9LJE0r7VdJ+oKkAeDDku6QtEnZNrW6HrEh5EgjmmBXWvfpX9sfgcNt3y9pG2ChpPnAScCutmcDSDoAmAXsRWso6XxJLwEeBl4L7AZsAiyuvM95wLttXy3pY8ApwIll26a2+8trzwReCfwLrauTL7b9aBc/e8STJPSjyQR8ogT448B0hu/yOaA8flXWt6D1R2BL4FLbfwT+KOn7AJKeDmxl++qy/7nA9yqv993K8teBD9IK/WOBt3fhc0WMKKEfTXADw0+y/ndAH7Cn7Ucl3QlsNsx+Aj5p+5+e1CidOMy+o/Hg0ILtayXNlLQfMMX2sjZfM2JU0qcfTfCvtObgnTPUIOn5wDOBe0rgv6ysAzxA6yh+yOXAWyVtUZ47XdJfAtcCr5a0Wdn2KgDb9wG/l7Rvef5RwNWM7DzgO8A3OvycEeuVI/2Y9Gxb0uHAFyT
9I62+/DuBU4EzJF0PDAA3l/1/J+laScuAH9n+gKTnAj9vDQTiD8CbbV9XzgEsBVYD1wP3lbc9BjhL0lOB22l13Yzk28D/Ac7v4seOGFZuwxDRAUlb2P5DCfdrgDm2F4/xNV4HHGb7qA1SZERFjvQjOjNX0i60zgWc20bgfwk4GDhkQxQXsbYc6UdENEhO5EZENEhCPyKiQRL6ERENktCPiGiQhH5ERIP8f6Q+5HDmGfiMAAAAAElFTkSuQmCC\n", 162 | "text/plain": [ 163 | "
" 164 | ] 165 | }, 166 | "metadata": { 167 | "needs_background": "light" 168 | } 169 | } 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "source": [ 175 | "stopwordslist = []\n", 176 | "slist = []\n", 177 | "with open(\"/content/stopword.txt\", encoding = 'utf8') as stopwordsfile:\n", 178 | " stopwords = stopwordsfile.readlines()\n", 179 | " slist.extend(stopwords)\n", 180 | " for w in range(len(slist)):\n", 181 | " temp = slist[w]\n", 182 | " stopwordslist.append(temp.rstrip())\n" 183 | ], 184 | "metadata": { 185 | "id": "EeNVew5TZN2X" 186 | }, 187 | "execution_count": 6, 188 | "outputs": [] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "source": [ 193 | "!pip install pyidaungsu" 194 | ], 195 | "metadata": { 196 | "id": "1pjG3DQYUOS7" 197 | }, 198 | "execution_count": null, 199 | "outputs": [] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "source": [ 204 | "import pyidaungsu as pds\n", 205 | "import re\n", 206 | "CleanPattern = re.compile(r'\\d+|[၊။!-/:-@[-`{-~\\t ]|[A-za-z0-9]')\n", 207 | "def clean_sentence(sentence):\n", 208 | " sentence = sentence.replace(\"_\",\" \")\n", 209 | " sent = CleanPattern.sub(\" \",sentence)\n", 210 | " return sent\n", 211 | "\n", 212 | "def stop_word(sentence):\n", 213 | " new_sentence = []\n", 214 | " for word in sentence.split():\n", 215 | " if word not in stopwordslist:\n", 216 | " new_sentence.append(word)\n", 217 | " return(' '.join(new_sentence))\n", 218 | "\n", 219 | "def tokenize(line):\n", 220 | " line = clean_sentence(line)\n", 221 | " sentence = pds.tokenize(line,form=\"word\")\n", 222 | " sentence = ' '.join([str(elem) for elem in sentence])\n", 223 | " sentence = stop_word(sentence)\n", 224 | " return sentence\n", 225 | " \n", 226 | "data['News'] = data['News'].apply(tokenize)\n", 227 | "data.head(10)" 228 | ], 229 | "metadata": { 230 | "colab": { 231 | "base_uri": "https://localhost:8080/", 232 | "height": 363 233 | }, 234 | "id": "BeMZFKPtYSo7", 235 | "outputId": 
"28acc76b-369e-4e09-a8b0-bb0f2c383843" 236 | }, 237 | "execution_count": 7, 238 | "outputs": [ 239 | { 240 | "output_type": "execute_result", 241 | "data": { 242 | "text/html": [ 243 | "\n", 244 | "
\n", 245 | "
\n", 246 | "
\n", 247 | "\n", 260 | "\n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | "
IndexNewsCategorycategory_id
00ဒီမိုကရေစီ အမျှ ဝေ မော်ဦး တောင်းဆို အရေးတော်ပု...Politics0
11ဒီမိုကရေစီ အရေး လူ့ အခွင့် အရေး တောင်းဆို ဆန္ဒ...Politics0
22ခုနှစ် အထွေထွေ ရွေးကောက်ပွဲ အမျိုးသား ဒီမိုကရေ...Politics0
33စက်တင်ဘာ မဝင်း မော်ဦး ကျဆုံး ပြည့် ကျဆုံး ပတ်လ...Politics0
44တပ်မတော် ကာကွယ် ဦးစီးချုပ် ဗိုလ်ချုပ်မှူးကြီး ...Politics0
55တွေ့ဆုံ မြန်မာ နိုင်ငံ ငြိမ်းချမ်း ရာစုပင်လုံ ...Politics0
66ယနေ့ ခရီးစဉ် မြန်မာ နိုင်ငံ အမေရိကန် နိုင်ငံခြ...Politics0
77အလုပ်သမား နေပြည်တော် ဆက်လက်ချီတက်Politics0
88မန္တလေး ဒေသ ကြီးစဉ့် ကိုင် မြို့ဖါးလင်ပိုး ကျေ...Politics0
99ခုံသမာဓိကောင်စီ ကြားနာ စစ်ဆေး ခံယူ အလုပ်သမား လ...Politics0
\n", 343 | "
\n", 344 | " \n", 354 | " \n", 355 | " \n", 392 | "\n", 393 | " \n", 417 | "
\n", 418 | "
\n", 419 | " " 420 | ], 421 | "text/plain": [ 422 | " Index ... category_id\n", 423 | "0 0 ... 0\n", 424 | "1 1 ... 0\n", 425 | "2 2 ... 0\n", 426 | "3 3 ... 0\n", 427 | "4 4 ... 0\n", 428 | "5 5 ... 0\n", 429 | "6 6 ... 0\n", 430 | "7 7 ... 0\n", 431 | "8 8 ... 0\n", 432 | "9 9 ... 0\n", 433 | "\n", 434 | "[10 rows x 4 columns]" 435 | ] 436 | }, 437 | "metadata": {}, 438 | "execution_count": 7 439 | } 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 8, 445 | "metadata": { 446 | "id": "fcc99695" 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "data = data[[\"News\", \"Category\"]]\n", 451 | "\n", 452 | "x = np.array(data[\"News\"])\n", 453 | "y = np.array(data[\"Category\"])" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "source": [ 459 | "def tokenize(line):\n", 460 | " sentence = pds.tokenize(line,form=\"word\")\n", 461 | " return sentence\n", 462 | "\n", 463 | "vectorizer = TfidfVectorizer(tokenizer=tokenize,ngram_range=(1,2))\n", 464 | "X = vectorizer.fit_transform(x)" 465 | ], 466 | "metadata": { 467 | "id": "v0tB4xSlfeGN" 468 | }, 469 | "execution_count": 9, 470 | "outputs": [] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "source": [ 475 | "X.shape" 476 | ], 477 | "metadata": { 478 | "colab": { 479 | "base_uri": "https://localhost:8080/" 480 | }, 481 | "id": "5gzGmt11ZYup", 482 | "outputId": "827a0bf1-92ab-4de3-eb36-d6231c1d6a87" 483 | }, 484 | "execution_count": 10, 485 | "outputs": [ 486 | { 487 | "output_type": "execute_result", 488 | "data": { 489 | "text/plain": [ 490 | "(8115, 20178)" 491 | ] 492 | }, 493 | "metadata": {}, 494 | "execution_count": 10 495 | } 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "source": [ 501 | "import pickle\n", 502 | "pickle.dump(vectorizer, open(\"vectorizer.pickle\", \"wb\"))" 503 | ], 504 | "metadata": { 505 | "id": "3c3VEefWcubb" 506 | }, 507 | "execution_count": 11, 508 | "outputs": [] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "source": [ 513 | 
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 514 | ], 515 | "metadata": { 516 | "id": "ASJopf6UXlYx" 517 | }, 518 | "execution_count": 16, 519 | "outputs": [] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "source": [ 524 | "### Naive Bayes " 525 | ], 526 | "metadata": { 527 | "id": "B2IxNcGx0KA7" 528 | } 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 17, 533 | "metadata": { 534 | "id": "290c3389", 535 | "outputId": "90274a87-64b3-46a4-977a-ae617e9989a1", 536 | "colab": { 537 | "base_uri": "https://localhost:8080/" 538 | } 539 | }, 540 | "outputs": [ 541 | { 542 | "output_type": "execute_result", 543 | "data": { 544 | "text/plain": [ 545 | "MultinomialNB()" 546 | ] 547 | }, 548 | "metadata": {}, 549 | "execution_count": 17 550 | } 551 | ], 552 | "source": [ 553 | "from sklearn.naive_bayes import MultinomialNB\n", 554 | "\n", 555 | "model = MultinomialNB()\n", 556 | "model.fit(X_train,y_train)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "source": [ 562 | "# predict\n", 563 | "y_pred = model.predict(X_test)" 564 | ], 565 | "metadata": { 566 | "id": "2quuFvRuR98-" 567 | }, 568 | "execution_count": 18, 569 | "outputs": [] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "source": [ 574 | "# confusion matrix and accuracy\n", 575 | "\n", 576 | "from sklearn import metrics\n", 577 | "from sklearn.metrics import classification_report \n", 578 | "\n", 579 | "print(f\"Classification report for classifier {model}:\\n\"\n", 580 | " f\"{metrics.classification_report(y_test, y_pred)}\\n\")" 581 | ], 582 | "metadata": { 583 | "colab": { 584 | "base_uri": "https://localhost:8080/" 585 | }, 586 | "id": "UVLDJrpXRSsQ", 587 | "outputId": "e3287fd3-fdfe-489f-a710-5be057e71963" 588 | }, 589 | "execution_count": 19, 590 | "outputs": [ 591 | { 592 | "output_type": "stream", 593 | "name": "stdout", 594 | "text": [ 595 | "Classification report for classifier MultinomialNB():\n", 596 | " precision recall f1-score 
support\n", 597 | "\n", 598 | " Business 0.78 0.91 0.84 412\n", 599 | "Entertainment 0.89 0.80 0.84 405\n", 600 | " Politics 0.82 0.86 0.84 386\n", 601 | " Sports 0.93 0.83 0.88 420\n", 602 | "\n", 603 | " accuracy 0.85 1623\n", 604 | " macro avg 0.86 0.85 0.85 1623\n", 605 | " weighted avg 0.86 0.85 0.85 1623\n", 606 | "\n", 607 | "\n" 608 | ] 609 | } 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "source": [ 615 | "### Linear SVM" 616 | ], 617 | "metadata": { 618 | "id": "sHY9s9gh0maj" 619 | } 620 | }, 621 | { 622 | "cell_type": "code", 623 | "source": [ 624 | "# Import classifiers and performance metrics\n", 625 | "from sklearn import svm, metrics\n", 626 | "\n", 627 | "# linear kernel model\n", 628 | "\n", 629 | "svm_model = svm.SVC(kernel='linear')\n", 630 | "svm_model.fit(X_train, y_train)\n", 631 | "\n", 632 | "# predict\n", 633 | "y_pred = svm_model.predict(X_test)" 634 | ], 635 | "metadata": { 636 | "id": "PqfOxTQU9vIc" 637 | }, 638 | "execution_count": 20, 639 | "outputs": [] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "source": [ 644 | "# confusion matrix and accuracy\n", 645 | "\n", 646 | "from sklearn import metrics\n", 647 | "from sklearn.metrics import classification_report \n", 648 | "\n", 649 | "print(f\"Classification report for classifier {svm_model}:\\n\"\n", 650 | " f\"{metrics.classification_report(y_test, y_pred)}\\n\")" 651 | ], 652 | "metadata": { 653 | "colab": { 654 | "base_uri": "https://localhost:8080/" 655 | }, 656 | "id": "ysrSrhlY-vjG", 657 | "outputId": "9ec74602-83ef-48d4-e6c5-7badca2169bc" 658 | }, 659 | "execution_count": 21, 660 | "outputs": [ 661 | { 662 | "output_type": "stream", 663 | "name": "stdout", 664 | "text": [ 665 | "Classification report for classifier SVC(kernel='linear'):\n", 666 | " precision recall f1-score support\n", 667 | "\n", 668 | " Business 0.90 0.87 0.89 412\n", 669 | "Entertainment 0.77 0.92 0.84 405\n", 670 | " Politics 0.87 0.83 0.85 386\n", 671 | " Sports 0.91 0.81 0.86 420\n", 
672 | "\n", 673 | " accuracy 0.86 1623\n", 674 | " macro avg 0.86 0.86 0.86 1623\n", 675 | " weighted avg 0.86 0.86 0.86 1623\n", 676 | "\n", 677 | "\n" 678 | ] 679 | } 680 | ] 681 | }, 682 | { 683 | "cell_type": "markdown", 684 | "source": [ 685 | "### Random Forest " 686 | ], 687 | "metadata": { 688 | "id": "KaZwIKyk1eUo" 689 | } 690 | }, 691 | { 692 | "cell_type": "code", 693 | "source": [ 694 | "from sklearn.ensemble import RandomForestClassifier\n", 695 | "\n", 696 | "# Random forest classifier\n", 697 | "rf_model = RandomForestClassifier(n_estimators=100, n_jobs=1)\n", 698 | "rf_model.fit(X_train,y_train)\n", 699 | "\n", 700 | "# predict\n", 701 | "y_pred = rf_model.predict(X_test)" 702 | ], 703 | "metadata": { 704 | "id": "blkLzOMLFpL9" 705 | }, 706 | "execution_count": 22, 707 | "outputs": [] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "source": [ 712 | "# confusion matrix and accuracy\n", 713 | "\n", 714 | "from sklearn import metrics\n", 715 | "from sklearn.metrics import classification_report \n", 716 | "\n", 717 | "print(f\"Classification report for classifier {rf_model}:\\n\"\n", 718 | " f\"{metrics.classification_report(y_test, y_pred)}\\n\")" 719 | ], 720 | "metadata": { 721 | "colab": { 722 | "base_uri": "https://localhost:8080/" 723 | }, 724 | "id": "Lw9sJzlxHWBD", 725 | "outputId": "9ccd627b-63c1-47af-f547-577407106c14" 726 | }, 727 | "execution_count": 23, 728 | "outputs": [ 729 | { 730 | "output_type": "stream", 731 | "name": "stdout", 732 | "text": [ 733 | "Classification report for classifier RandomForestClassifier(n_jobs=1):\n", 734 | " precision recall f1-score support\n", 735 | "\n", 736 | " Business 0.87 0.79 0.83 412\n", 737 | "Entertainment 0.72 0.89 0.80 405\n", 738 | " Politics 0.79 0.80 0.79 386\n", 739 | " Sports 0.90 0.77 0.83 420\n", 740 | "\n", 741 | " accuracy 0.81 1623\n", 742 | " macro avg 0.82 0.81 0.81 1623\n", 743 | " weighted avg 0.82 0.81 0.81 1623\n", 744 | "\n", 745 | "\n" 746 | ] 747 | } 748 | ] 749 | }, 750 | { 
751 | "cell_type": "markdown", 752 | "source": [ 753 | "### Decision Tree" 754 | ], 755 | "metadata": { 756 | "id": "-JLcmb_l1pu8" 757 | } 758 | }, 759 | { 760 | "cell_type": "code", 761 | "source": [ 762 | "from sklearn.tree import DecisionTreeClassifier\n", 763 | "\n", 764 | "dt_model = DecisionTreeClassifier(random_state=0)\n", 765 | "dt_model.fit(X_train,y_train)\n", 766 | "\n", 767 | "# predict\n", 768 | "y_pred = dt_model.predict(X_test)" 769 | ], 770 | "metadata": { 771 | "id": "j48GgUZqHbBF" 772 | }, 773 | "execution_count": 24, 774 | "outputs": [] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "source": [ 779 | "# confusion matrix and accuracy\n", 780 | "\n", 781 | "from sklearn import metrics\n", 782 | "from sklearn.metrics import classification_report \n", 783 | "\n", 784 | "print(f\"Classification report for classifier {dt_model}:\\n\"\n", 785 | " f\"{metrics.classification_report(y_test, y_pred)}\\n\")" 786 | ], 787 | "metadata": { 788 | "colab": { 789 | "base_uri": "https://localhost:8080/" 790 | }, 791 | "id": "hwrlX85QH2Ig", 792 | "outputId": "3150e0c9-cf92-4f18-d059-7b8363b0ad4c" 793 | }, 794 | "execution_count": 25, 795 | "outputs": [ 796 | { 797 | "output_type": "stream", 798 | "name": "stdout", 799 | "text": [ 800 | "Classification report for classifier DecisionTreeClassifier(random_state=0):\n", 801 | " precision recall f1-score support\n", 802 | "\n", 803 | " Business 0.76 0.72 0.74 412\n", 804 | "Entertainment 0.66 0.82 0.73 405\n", 805 | " Politics 0.75 0.70 0.73 386\n", 806 | " Sports 0.81 0.69 0.75 420\n", 807 | "\n", 808 | " accuracy 0.74 1623\n", 809 | " macro avg 0.74 0.74 0.74 1623\n", 810 | " weighted avg 0.74 0.74 0.74 1623\n", 811 | "\n", 812 | "\n" 813 | ] 814 | } 815 | ] 816 | }, 817 | { 818 | "cell_type": "markdown", 819 | "source": [ 820 | "### KNN" 821 | ], 822 | "metadata": { 823 | "id": "-Jd2eK2u1iRS" 824 | } 825 | }, 826 | { 827 | "cell_type": "code", 828 | "source": [ 829 | "from sklearn.neighbors import 
KNeighborsClassifier\n", 830 | "error = []\n", 831 | "best_k = dict()\n", 832 | "\n", 833 | "# Calculating error for K values between 1 and 19\n", 834 | "for i in range(1, 20):\n", 835 | " knn = KNeighborsClassifier(n_neighbors=i, n_jobs=-1)\n", 836 | " knn.fit(X_train, y_train)\n", 837 | " pred_i = knn.predict(X_test)\n", 838 | " error.append(np.mean(pred_i != y_test))\n", 839 | " best_k[i] = np.mean(pred_i != y_test)\n", 840 | " \n", 841 | "best_k = sorted(best_k.items(), key=lambda k: k[1])[0][0]\n", 842 | "knn_classifier = KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1)\n", 843 | "knn_classifier.fit(X_train, y_train)\n", 844 | "y_pred = knn_classifier.predict(X_test)" 845 | ], 846 | "metadata": { 847 | "id": "w6tofWVrIQqz" 848 | }, 849 | "execution_count": 26, 850 | "outputs": [] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "source": [ 855 | "import matplotlib.pyplot as plt\n", 856 | "\n", 857 | "plt.figure(figsize=(12, 6))\n", 858 | "plt.plot(range(1, 20), error, color='red', linestyle='dashdot', marker='o',markerfacecolor='green', markersize=10)\n", 859 | "plt.title('Error Rate K Value')\n", 860 | "plt.xlabel('K Value')\n", 861 | "plt.ylabel('Mean Error')\n", 862 | "plt.show()" 863 | ], 864 | "metadata": { 865 | "colab": { 866 | "base_uri": "https://localhost:8080/", 867 | "height": 404 868 | }, 869 | "id": "-nyJcsNVInu1", 870 | "outputId": "9d3417bf-2723-4382-ccb1-dc7a3f59422a" 871 | }, 872 | "execution_count": 27, 873 | "outputs": [ 874 | { 875 | "output_type": "display_data", 876 | "data": { 877 | "image/png": "\n", 878 | "text/plain": [ 879 | "
" 880 | ] 881 | }, 882 | "metadata": { 883 | "needs_background": "light" 884 | } 885 | } 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "source": [ 891 | "# confusion matrix and accuracy\n", 892 | "\n", 893 | "from sklearn import metrics\n", 894 | "from sklearn.metrics import classification_report \n", 895 | "\n", 896 | "print(f\"Classification report for classifier {knn_classifier}:\\n\"\n", 897 | " f\"{metrics.classification_report(y_test, y_pred)}\\n\")" 898 | ], 899 | "metadata": { 900 | "colab": { 901 | "base_uri": "https://localhost:8080/" 902 | }, 903 | "id": "FkDutc8kIrqx", 904 | "outputId": "8c4f931f-abf1-4d57-c3d2-0404213f7fb6" 905 | }, 906 | "execution_count": null, 907 | "outputs": [ 908 | { 909 | "output_type": "stream", 910 | "name": "stdout", 911 | "text": [ 912 | "Classification report for classifier KNeighborsClassifier(n_jobs=-1, n_neighbors=2):\n", 913 | " precision recall f1-score support\n", 914 | "\n", 915 | " Business 0.65 0.60 0.62 412\n", 916 | "Entertainment 0.50 0.80 0.61 415\n", 917 | " Politics 0.66 0.58 0.62 383\n", 918 | " Sports 0.90 0.50 0.64 413\n", 919 | "\n", 920 | " accuracy 0.62 1623\n", 921 | " macro avg 0.68 0.62 0.62 1623\n", 922 | " weighted avg 0.68 0.62 0.62 1623\n", 923 | "\n", 924 | "\n" 925 | ] 926 | } 927 | ] 928 | }, 929 | { 930 | "cell_type": "markdown", 931 | "source": [ 932 | "### Post-processing" 933 | ], 934 | "metadata": { 935 | "id": "ejrxYOKN2Scy" 936 | } 937 | }, 938 | { 939 | "cell_type": "code", 940 | "source": [ 941 | "# save the model to disk\n", 942 | "import pickle\n", 943 | "\n", 944 | "filename = 'svm_model.sav'\n", 945 | "pickle.dump(svm_model, open(filename, 'wb'))\n", 946 | "\n", 947 | "# load the model from disk\n", 948 | "loaded_model = pickle.load(open(filename, 'rb'))" 949 | ], 950 | "metadata": { 951 | "id": "eqhzRO67IzQN" 952 | }, 953 | "execution_count": 28, 954 | "outputs": [] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "source": [ 959 | "line = \"ဒီကနေ့ ဒီဇင်ဘာလ ၂၁ ရက်နေ့ 
နေ့လယ်ပိုင်းမှာလည်း ရေဦးနဲ့ တန့်ဆည်မြို့နယ် နယ်နမိတ်ထိစပ်နေတဲ့နေရာနားက လိပ်ခြံရွာက ပီဒီအက်ဖ်စခန်းတွေလို့ ယူဆတဲ့နေရာတွေကို စစ်ကောင်စီတပ်ရဲ့ ရဟတ်ယာဉ်တွေက ပစ်ခတ်တာတွေ ဆက်လုပ်ခဲ့တယ်လို့ ဒေသခံတွေထံက သိရပါတယ်။\"" 960 | ], 961 | "metadata": { 962 | "id": "oaxUseLEviBk" 963 | }, 964 | "execution_count": 29, 965 | "outputs": [] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "source": [ 970 | "stopwordslist = []\n", 971 | "slist = []\n", 972 | "with open(\"./stopword.txt\", encoding = 'utf8') as stopwordsfile:\n", 973 | " stopwords = stopwordsfile.readlines()\n", 974 | " slist.extend(stopwords)\n", 975 | "\n", 976 | " for w in range(len(slist)):\n", 977 | " temp = slist[w]\n", 978 | " stopwordslist.append(temp.rstrip())\n" 979 | ], 980 | "metadata": { 981 | "id": "8m8FXyx8xkFE" 982 | }, 983 | "execution_count": 31, 984 | "outputs": [] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "source": [ 989 | "def stop_word(sentence):\n", 990 | " new_sentence = []\n", 991 | " for word in sentence.split():\n", 992 | " if word not in stopwordslist:\n", 993 | " new_sentence.append(word)\n", 994 | " return(' '.join(new_sentence))" 995 | ], 996 | "metadata": { 997 | "id": "UypfZVHfxkFG" 998 | }, 999 | "execution_count": 32, 1000 | "outputs": [] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "source": [ 1005 | "import pyidaungsu as pds\n", 1006 | "\n", 1007 | "def tokenize(line):\n", 1008 | " sentence = pds.tokenize(line,form=\"word\")\n", 1009 | " sentence = ' '.join([str(elem) for elem in sentence])\n", 1010 | " sentence = stop_word(sentence)\n", 1011 | " return sentence" 1012 | ], 1013 | "metadata": { 1014 | "id": "P1snC2NE2jsC" 1015 | }, 1016 | "execution_count": 33, 1017 | "outputs": [] 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "source": [ 1022 | "tokenize(line)" 1023 | ], 1024 | "metadata": { 1025 | "colab": { 1026 | "base_uri": "https://localhost:8080/", 1027 | "height": 53 1028 | }, 1029 | "id": "htTGYa8Ovlkk", 1030 | "outputId": 
"d52cc97e-a1ce-489a-f4d2-bd41c3e930f4" 1031 | }, 1032 | "execution_count": 34, 1033 | "outputs": [ 1034 | { 1035 | "output_type": "execute_result", 1036 | "data": { 1037 | "application/vnd.google.colaboratory.intrinsic+json": { 1038 | "type": "string" 1039 | }, 1040 | "text/plain": [ 1041 | "'ဒီဇင်ဘာ ၂၁ ရက်နေ့ နေ့လယ် ရေဦး တန့်ဆည် မြို့နယ် နယ်နမိတ် ထိစပ် နေရာ နားကလိပ်ခြံ ရွာ ပီဒီအက်ဖ်စခန်း ယူဆ နေရာ စစ်ကောင်စီ တပ် ရဟတ်ယာဉ် ပစ်ခတ် ဆက်လုပ် ဒေသခံ ။'" 1042 | ] 1043 | }, 1044 | "metadata": {}, 1045 | "execution_count": 34 1046 | } 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "code", 1051 | "source": [ 1052 | "user = input(\"Enter a Text: \")\n", 1053 | "user = tokenize(user)\n", 1054 | "data = vectorizer.transform([user]).toarray()\n", 1055 | "output = loaded_model.predict(data)\n", 1056 | "print(output)" 1057 | ], 1058 | "metadata": { 1059 | "id": "QFiTpHpivoce", 1060 | "colab": { 1061 | "base_uri": "https://localhost:8080/" 1062 | }, 1063 | "outputId": "d9b613ec-3d4d-41f7-e5d5-6a1989f408d7" 1064 | }, 1065 | "execution_count": 35, 1066 | "outputs": [ 1067 | { 1068 | "output_type": "stream", 1069 | "name": "stdout", 1070 | "text": [ 1071 | "Enter a Text: ဒီဇင်ဘာ ၂၁ ရက်နေ့ နေ့လယ် ရေဦး တန့်ဆည် မြို့နယ် နယ်နမိတ် ထိစပ် နေရာ နားကလိပ်ခြံ ရွာ ပီဒီအက်ဖ်စခန်း ယူဆ နေရာ စစ်ကောင်စီ တပ် ရဟတ်ယာဉ် ပစ်ခတ် ဆက်လုပ် ဒေသခံ\n", 1072 | "['Politics']\n" 1073 | ] 1074 | } 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "source": [ 1080 | "" 1081 | ], 1082 | "metadata": { 1083 | "id": "LMHLsUtDaLe3" 1084 | }, 1085 | "execution_count": null, 1086 | "outputs": [] 1087 | } 1088 | ] 1089 | } 1090 | --------------------------------------------------------------------------------