├── test.txt
├── README.md
├── Text Classifier
    ├── data
    │   └── .gitignore
    ├── test.ipynb
    └── .ipynb_checkpoints
    │   └── test-checkpoint.ipynb
├── Text Summarization
    ├── model
    │   └── model
    ├── Text Summarization.ipynb
    └── .ipynb_checkpoints
    │   └── Text Summarization-checkpoint.ipynb
├── Topic Words
    ├── Untitled.ipynb
    └── .ipynb_checkpoints
    │   └── Untitled-checkpoint.ipynb
└── Viblo Similarity Documents
    ├── .ipynb_checkpoints
        └── Similarity Documents-checkpoint.ipynb
    └── Similarity Documents.ipynb


/test.txt:
--------------------------------------------------------------------------------
1 | aaaaasd abhsv
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Natural-Language-Processing
2 | 


--------------------------------------------------------------------------------
/Text Classifier/data/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | */
3 | !.gitignore
4 | 


--------------------------------------------------------------------------------
/Text Summarization/model/model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thanhhau097/Natual-Language-Processing/HEAD/Text Summarization/model/model


--------------------------------------------------------------------------------
/Topic Words/Untitled.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Code thực hiện bài báo: http://www.aclweb.org/anthology/C10-2069?fbclid=IwAR0EHm40BYzdpoZ9BfkMZeJJgKZ6drbp3_-TgCKtLGwbymcYmGGxZbwJvU8"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "# Preprocessing Data"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 3,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from pyvi import ViTokenizer, ViPosTagger\n",
 24 |     "from tqdm import tqdm\n",
 25 |     "import numpy as np\n",
 26 |     "import gensim\n",
 27 |     "import numpy as np"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {
 34 |     "scrolled": true
 35 |    },
 36 |    "outputs": [
 37 |     {
 38 |      "name": "stderr",
 39 |      "output_type": "stream",
 40 |      "text": [
 41 |       "\r",
 42 |       " 33%|███▎      | 1004/3080 [00:30<01:02, 33.46it/s]"
 43 |      ]
 44 |     }
 45 |    ],
 46 |    "source": [
 47 |     "import os \n",
 48 |     "dir_path = os.path.dirname(os.path.realpath(os.getcwd()))\n",
 49 |     "dir_path = os.path.join(dir_path, 'Data')\n",
 50 |     "\n",
 51 |     "# Load data from dataset folder\n",
 52 |     "# VNTC-master/Data/10Topics/Ver1.1/Train_Full\n",
 53 |     "# VNTC-master/Data/10Topics/Ver1.1/Test_Full\n",
 54 |     "def get_data(folder_path):\n",
 55 |     "    \"\"\"Load the dataset stored under folder_path, one sub-folder per label.\n",
 56 |     "\n",
 57 |     "    Returns (texts, labels): each file's preprocessed text and its sub-folder name.\n",
 58 |     "    \"\"\"\n",
 59 |     "    texts, labels = [], []\n",
 60 |     "    for label in os.listdir(folder_path):\n",
 61 |     "        label_dir = os.path.join(folder_path, label)\n",
 62 |     "        for file_name in tqdm(os.listdir(label_dir)):\n",
 63 |     "            # The files are opened as UTF-16 text.\n",
 64 |     "            with open(os.path.join(label_dir, file_name), 'r', encoding=\"utf-16\") as f:\n",
 65 |     "                raw_text = ' '.join(f.readlines())\n",
 66 |     "            # simple_preprocess output is re-joined into one string before ViTokenizer runs.\n",
 67 |     "            cleaned = ' '.join(gensim.utils.simple_preprocess(raw_text))\n",
 68 |     "            texts.append(ViTokenizer.tokenize(cleaned))\n",
 69 |     "            labels.append(label)\n",
 70 |     "\n",
 71 |     "    return texts, labels\n",
 72 |     "\n",
 73 |     "train_path = os.path.join(dir_path, 'VNTC-master/Data/10Topics/Ver1.1/Train_Full')\n",
 74 |     "X_data, y_data = get_data(train_path)"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "import pickle\n",
 84 |     "with open('data/X_data.pkl', 'wb') as x_file, open('data/y_data.pkl', 'wb') as y_file:  # both files are closed on exit\n",
 85 |     "    pickle.dump(X_data, x_file)\n",
 86 |     "    pickle.dump(y_data, y_file)"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 5,
 92 |    "metadata": {
 93 |     "scrolled": true
 94 |    },
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "# test_path = os.path.join(dir_path, 'VNTC-master/Data/10Topics/Ver1.1/Test_Full')\n",
 98 |     "# X_test, y_test = get_data(test_path)"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 6,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# pickle.dump(X_test, open('data/X_test.pkl', 'wb'))\n",
108 |     "# pickle.dump(y_test, open('data/y_test.pkl', 'wb'))"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 8,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "#### Load data\n",
118 |     "import pickle\n",
119 |     "with open('data/X_data.pkl', 'rb') as x_file, open('data/y_data.pkl', 'rb') as y_file:  # both files are closed on exit\n",
120 |     "    X_data = pickle.load(x_file)\n",
121 |     "    y_data = pickle.load(y_file)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "## Bigram"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 9,
134 |    "metadata": {},
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/plain": [
139 |        "'ông đồ cuối_cùng trên đảo vua tần cỡi cọp chơi tám cõi trời xanh kiếm quang sáng chói tiên_nhân cầm đuốc khói nhẹ_nhàng mắt say nhòa lệ theo cung đàn ngồi trên bãi biển lộng_gió ông đồ võ_hiển đạt bất_ngờ đọc cho tôi nghe bài tần vương ẩm_tửu của lý_hạ người được mệnh_danh là thi quỉ đời đường tiếng_tăm và tác_phẩm truyền lưu hậu_thế không thua_kém thi tiên lý bạch thi phật vương_duy thi thánh đỗ_phủ rồi ông tâm_sự tôi sinh ra nhằm lúc nho_học mạt_vận may_mắn được học dăm chữ thánh_hiền bây_giờ thỉnh_thoảng tôi chỉ còn dùng mớ chữ_nghĩa một thời vang bóng này khi có bạn tâm_giao hoặc ai đó nhờ viết câu_đối xưa men theo con đường bãi biển rợp bóng dừa xanh tôi về nhà ông thôn tây xã an vĩnh huyện đảo lý_sơn quảng_ngãi thời_gian như dừng lại trong ngôi nhà_cổ của ông đồ cuối_cùng hòn đảo này những bức hoành_phi câu_đối được viết bằng mực_tàu bút_lông nét chữ như rồng múa phượng bay treo từ hàng cột bờ hiên vào tận bàn_thờ tổ_tiên giữa gian nhà chính bộ sập gụ đã trải hơn trăm năm sáng bóng_nước thời_gian đó là nơi ông thường ngồi xếp_bằng để đọc sách và múa bút khi có người xin chữ trang_trọng nhất trong ngôi nhà_cổ là chiếc tủ bằng gỗ lim lúc_nào cũng ấm mùi nhang trầm phảng_phất trên hàng linh vị thờ các vị tổ_tiên bút tự của ông đồ đạt có khắp_nơi trên đảo lý_sơn vượt biển ra lý_sơn tôi tình_cờ biết ông đồ võ_hiển đạt khi được chứng_kiến một buổi an_táng theo nghi_lễ_mộ gió các ngư_dân bất_hạnh phải gửi xác dưới lòng đại_dương thi_hài trong quan_tài nhỏ chỉ là hình_nhân đất_sét nhưng lễ_nghi vẫn nghiêm linh với lời tụng kinh cờ phướn cầu_siêu những người đã ra đi mãi_mãi không về anh cán_bộ văn_phòng ủy_ban huyện cho biết gần nửa thế_kỷ qua đảo này chỉ có một thầy đồ duy_nhất và cũng có_lẽ là thầy đồ cuối_cùng không chỉ các đình chùa mà tất_cả chữ_nho trong bất_cứ ngôi nhà nào trên đảo đều do một tay ông viết ông kể hồi còn nhỏ vùng này chỉ có một trường tiểu_học dạy tiếng pháp ai muốn học cao hơn phải ra huế hoặc vào qui nhơn cụ thân_sinh của ông 
là một đồ nho ẩn_dật nhưng phóng_khoáng cho con đi học tiếng pháp nhưng vẫn khuyên ông nên dùi mài kinh_sử với nho_học đến bây_giờ ông vẫn nhớ lời cha dạy chữ pháp giúp con mở_mang văn_minh văn_hóa nhưng chữ_nho mới đủ thâm sâu để dạy con đạo làm người con_người sinh ra_đời không đem đến được gì và chết cũng chẳng mang theo được cái chi cuối_cùng chỉ có cái đạo làm người là lại nghe lời cha_ông theo cả nho_học lẫn pháp ngữ nhưng lấy chữ_nho làm trọng ông may_mắn có nhiều người thầy khả_kính uyên_thâm cả chữ_nho lẫn chữ pháp chín năm theo các thầy thêm mười năm tự học từ sách_vở đến năm_tuổi võ_hiển đạt đã trở_thành một người dạy chữ và cho chữ thánh_hiền đó cũng là lúc đất_nước giao_thời hầu_hết lớp trẻ đã chuyển sang dùi mài quốc_ngữ hoặc tiếng pháp chữ_nho dần_dần trở_thành linh tự trên các bàn_thờ đình_chùa và trong câu_đối của các bậc lớn_tuổi giáo huấn lớp hậu_sinh gìn_giữ đạo_đức gia phong đến giờ ông vẫn còn gìn_giữ nguyên_vẹn bộ tứ_thư được in từ năm và nhiều đầu_sách cổ khác ngôi nhà_cổ của ông đồ trong mùa thu_hoạch tỏi không gặp thời ông nối bước cha_lui về ẩn_dật hòn đảo nhỏ giữa biển ban_ngày vừa làm nghề cá vừa trồng rau khoai kiếm sống thanh_nhàn chiều_tối mở lớp gõ đầu_trẻ tuy_nhiên cho_dù kính_trọng chữ thánh_hiền nhưng nhiều gia_đình cũng không muốn con_em mình theo nho_học chỉ còn một thời vang bóng vừa khó học vừa không_thể xin được việc_làm thế là lớp_học của ông cứ vắng dần vắng dần cho đến lúc không_thể tồn_tại được nữa không gõ đầu_trẻ nữa nhưng ông trở_thành một người cho chữ thánh_hiền nổi_tiếng lý_sơn dù tuổi mới ngoài tam_thập chữ của ông trầm_mặc buồn âm linh tự nơi thờ_cúng đội lính hoàng sa_kiêu hùng thuở xưa và những người đã bỏ_mạng giữa biển khơi chữ của ông trang_nghiêm như lời gia huấn trên liễn thờ các bậc tiền hiền chữ của ông cũng huyền_ảo vi_diệu trên bàn_thờ nam hải tướng quân vị thần tôn_kính của ngư_dân quanh năm treo mạng trên đầu sóng ngọn gió người vui cũng tìm đến ông để xin mấy chữ mới treo trên bàn_thờ tổ_tiên trong 
ngày cưới_xin cất nhà mới người buồn cũng nài ông vài chữ cho trọn đạo_nghĩa trong lễ tang ma lúc_nào ông cũng vui_vẻ viết tặng khi có người đến xin chữ bởi theo ông đó không chỉ là bổn_phận mà còn là cái đạo của người theo nho_học chọn nhân_nghĩa để sống đời đã sắp đến hồi về với tổ_tiên_ông nói không còn gì phải hối_tiếc nhưng chỉ buồn một điều là chưa có một người học_trò chân_truyền nào để gửi_gắm lại cái chữ_cái đạo của thánh_hiền kể_cả con_cái con ông người bám đảo làm nghề biển người vào sài_gòn bươn_chải mưu_sinh nhiều lúc tôi đành ngẫm_nghĩ hình_như cái gì cũng có vận thịnh_suy của nó nho_học cũng vậy thôi ông trầm_ngâm tâm_sự rồi mài mực trải giấy khai_bút tặng tôi một chữ tâm làm hành_trang trở về với đất_liền theo ông đồ đạt người cho chữ thánh_hiền ngày_nay tuy không còn khắt_khe như xưa nữa nhưng vẫn có một_số điều cần tôn_trọng gìn_giữ để bút_lực có_thể đạt đến sự tinh_tế đến cái hồn sâu thẳm của từng nét chữ ông không viết khi mưa_gió ông cũng không cầm bút khi trời mây_âm hay lúc tịch dương ngả bóng ảm_đạm ông thường đề bút vào buổi sáng thời_điểm mặt_trời bắt_đầu tỏa nắng ấm sau khi thắp nén hương trầm lên bàn_thờ và cạn mấy tách trà ông mới chấm bút_lông vào mực và lúc nét bút bắt_đầu lướt trên giấy đỏ tâm người viết thanh_tịnh như mặt_nước hồ thu không vọng danh_lợi không sợ không khinh cũng không vui không buồn tất_cả chỉ còn có chữ và tâm người viết sẽ bộc_lộ trên cái thần cái hồn của nét chữ'"
140 |       ]
141 |      },
142 |      "execution_count": 9,
143 |      "metadata": {},
144 |      "output_type": "execute_result"
145 |     }
146 |    ],
147 |    "source": [
148 |     "X_data[0]"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 24,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "# Join the documents with a space: plain concatenation glued the last word of one\n",
158 |     "# document to the first word of the next, and repeated += on a str is quadratic.\n",
159 |     "corpus = ' '.join(X_data)"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 25,
165 |    "metadata": {},
166 |    "outputs": [
167 |     {
168 |      "data": {
169 |       "text/plain": [
170 |        "69792128"
171 |       ]
172 |      },
173 |      "execution_count": 25,
174 |      "metadata": {},
175 |      "output_type": "execute_result"
176 |     }
177 |    ],
178 |    "source": [
179 |     "len(corpus)"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 26,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "from nltk.collocations import *\n",
189 |     "import nltk"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 29,
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "# Minimum number of co-occurrences a pair needs to survive the frequency filter;\n",
199 |     "# leave it at 1 to keep every pair.\n",
200 |     "min_frequent = 1\n",
201 |     "\n",
202 |     "# window_size=10: word pairs are collected from a 10-token window over the corpus.\n",
203 |     "finder = BigramCollocationFinder.from_words(corpus.split(), window_size=10)\n",
204 |     "finder.apply_freq_filter(min_frequent)\n",
205 |     "bigram_measures = nltk.collocations.BigramAssocMeasures()  # association measures for word pairs"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 33,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "# xem số lần xuất hiện cùng nhau của từng cặp\n",
215 |     "# for k,v in finder.ngram_fd.items():\n",
216 |     "#     print(k,v)"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": 34,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "# điểm PMI của từng cặp\n",
226 |     "pmi_results = finder.score_ngrams(bigram_measures.pmi)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 35,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": [
235 |     "# chuyển về dictionary để tìm kiếm\n",
236 |     "pmi_results_dict = dict(pmi_results)"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": 42,
242 |    "metadata": {},
243 |    "outputs": [
244 |     {
245 |      "data": {
246 |       "text/plain": [
247 |        "6.852442811586141"
248 |       ]
249 |      },
250 |      "execution_count": 42,
251 |      "metadata": {},
252 |      "output_type": "execute_result"
253 |     }
254 |    ],
255 |    "source": [
256 |     "pmi_results_dict[('cứ', 'dần')]"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "markdown",
261 |    "metadata": {},
262 |    "source": [
263 |     "# Điểm PMI cho một cặp từ\n",
264 |     "\n",
265 |     "$$ PMI(w_i, w_j) = \\log\\dfrac{P(w_i, w_j)}{P(w_i)P(w_j)} $$"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 45,
271 |    "metadata": {},
272 |    "outputs": [],
273 |    "source": [
274 |     "def get_pmi_of_pair(word1, word2):\n",
275 |     "    \"\"\"Return the PMI score stored for the ordered pair (word1, word2).\n",
276 |     "\n",
277 |     "    Pairs that are missing from pmi_results_dict get a score of 0.\n",
278 |     "    \"\"\"\n",
279 |     "    # dict.get only covers the missing-key case; the old bare except with a return\n",
280 |     "    # inside finally silently swallowed every other exception as well.\n",
281 |     "    return pmi_results_dict.get((word1, word2), 0)"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": 46,
287 |    "metadata": {},
288 |    "outputs": [
289 |     {
290 |      "data": {
291 |       "text/plain": [
292 |        "1.4601253888073815"
293 |       ]
294 |      },
295 |      "execution_count": 46,
296 |      "metadata": {},
297 |      "output_type": "execute_result"
298 |     }
299 |    ],
300 |    "source": [
301 |     "get_pmi_of_pair('tôi', \"một\")"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "markdown",
306 |    "metadata": {},
307 |    "source": [
308 |     "# Điểm PMI trung bình của một từ trong danh sách \n",
309 |     "Danh sách ở trong bài báo là 10 từ, nên 1/9 có nghĩa là so sánh với 9 từ còn lại\n",
310 |     "$$ \\text{avg-PMI}(w_i) = \\dfrac{1}{9}\\sum_{j \\neq i} PMI(w_i, w_j) $$"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 47,
316 |    "metadata": {},
317 |    "outputs": [],
318 |    "source": [
319 |     "def get_average_pmi_for_word(word, words_list):\n",
320 |     "    \"\"\"Average PMI of `word` against the other words of `words_list` (0 if there are none).\"\"\"\n",
321 |     "    # Skip `word` itself: a 10-word list is averaged over the 9 other words, not over all 10.\n",
322 |     "    others = [other for other in words_list if other != word]\n",
323 |     "    # Nothing to compare against (this also avoids dividing by zero).\n",
324 |     "    if not others:\n",
325 |     "        return 0\n",
326 |     "    return sum(get_pmi_of_pair(word, other) for other in others) / len(others)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 57,
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "data": {
336 |       "text/plain": [
337 |        "0.06825178101438283"
338 |       ]
339 |      },
340 |      "execution_count": 57,
341 |      "metadata": {},
342 |      "output_type": "execute_result"
343 |     }
344 |    ],
345 |    "source": [
346 |     "get_average_pmi_for_word(\"một\", [\"một\", \"hai\", \"không\", \"có\", \"đứng\", \"ngồi\", \"nằm\", \"nhiều\", \"ít\", \"nhỏ\"])"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "markdown",
351 |    "metadata": {},
352 |    "source": [
353 |     "# Tìm từ đại diện tốt nhất (topic word) trong danh sách các từ - words_list"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 54,
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "def get_best_word(words_list):\n",
363 |     "    \"\"\"Return the word of `words_list` with the highest average PMI (the topic word).\n",
364 |     "\n",
365 |     "    Ties keep the earliest word; an empty list raises IndexError.\n",
366 |     "    \"\"\"\n",
367 |     "    if not words_list:\n",
368 |     "        raise IndexError('words_list is empty')\n",
369 |     "    # Every word is scored, including the first one: it used to be kept only as a\n",
370 |     "    # fallback with an assumed score of 0, which is wrong when scores are negative\n",
371 |     "    # or when the first word is actually the best one.\n",
372 |     "    return max(words_list, key=lambda word: get_average_pmi_for_word(word, words_list))"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": 56,
378 |    "metadata": {},
379 |    "outputs": [
380 |     {
381 |      "data": {
382 |       "text/plain": [
383 |        "'không'"
384 |       ]
385 |      },
386 |      "execution_count": 56,
387 |      "metadata": {},
388 |      "output_type": "execute_result"
389 |     }
390 |    ],
391 |    "source": [
392 |     "get_best_word([\"một\", \"hai\", \"không\", \"có\", \"đứng\", \"ngồi\", \"nằm\", \"nhiều\", \"ít\", \"nhỏ\"])"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "markdown",
397 |    "metadata": {},
398 |    "source": [
399 |     "## Kết luận: chỉ cần chạy code từ đầu đến cuối, rồi dùng hàm cuối cùng, truyền một danh sách các từ vào, nó sẽ tìm ra topic word.\n",
400 |     "## Các công thức tính CP1, CP2 có vẻ không liên quan lắm, nên không cần tính ở đây. "
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": null,
406 |    "metadata": {},
407 |    "outputs": [],
408 |    "source": []
409 |   },
410 |   {
411 |    "cell_type": "code",
412 |    "execution_count": null,
413 |    "metadata": {},
414 |    "outputs": [],
415 |    "source": []
416 |   },
417 |   {
418 |    "cell_type": "code",
419 |    "execution_count": null,
420 |    "metadata": {},
421 |    "outputs": [],
422 |    "source": []
423 |   }
424 |  ],
425 |  "metadata": {
426 |   "kernelspec": {
427 |    "display_name": "Python 3",
428 |    "language": "python",
429 |    "name": "python3"
430 |   },
431 |   "language_info": {
432 |    "codemirror_mode": {
433 |     "name": "ipython",
434 |     "version": 3
435 |    },
436 |    "file_extension": ".py",
437 |    "mimetype": "text/x-python",
438 |    "name": "python",
439 |    "nbconvert_exporter": "python",
440 |    "pygments_lexer": "ipython3",
441 |    "version": "3.6.5"
442 |   }
443 |  },
444 |  "nbformat": 4,
445 |  "nbformat_minor": 2
446 | }
447 | 


--------------------------------------------------------------------------------
/Topic Words/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Code thực hiện bài báo: http://www.aclweb.org/anthology/C10-2069?fbclid=IwAR0EHm40BYzdpoZ9BfkMZeJJgKZ6drbp3_-TgCKtLGwbymcYmGGxZbwJvU8"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "# Preprocessing Data"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 3,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from pyvi import ViTokenizer, ViPosTagger\n",
 24 |     "from tqdm import tqdm\n",
 25 |     "import numpy as np\n",
 26 |     "import gensim\n",
 27 |     "import numpy as np"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {
 34 |     "scrolled": true
 35 |    },
 36 |    "outputs": [
 37 |     {
 38 |      "name": "stderr",
 39 |      "output_type": "stream",
 40 |      "text": [
 41 |       "\r",
 42 |       " 33%|███▎      | 1004/3080 [00:30<01:02, 33.46it/s]"
 43 |      ]
 44 |     }
 45 |    ],
 46 |    "source": [
 47 |     "import os \n",
 48 |     "dir_path = os.path.dirname(os.path.realpath(os.getcwd()))\n",
 49 |     "dir_path = os.path.join(dir_path, 'Data')\n",
 50 |     "\n",
 51 |     "# Load data from dataset folder\n",
 52 |     "# VNTC-master/Data/10Topics/Ver1.1/Train_Full\n",
 53 |     "# VNTC-master/Data/10Topics/Ver1.1/Test_Full\n",
 54 |     "def get_data(folder_path):\n",
 55 |     "    \"\"\"Load the dataset stored under folder_path, one sub-folder per label.\n",
 56 |     "\n",
 57 |     "    Returns (texts, labels): each file's preprocessed text and its sub-folder name.\n",
 58 |     "    \"\"\"\n",
 59 |     "    texts, labels = [], []\n",
 60 |     "    for label in os.listdir(folder_path):\n",
 61 |     "        label_dir = os.path.join(folder_path, label)\n",
 62 |     "        for file_name in tqdm(os.listdir(label_dir)):\n",
 63 |     "            # The files are opened as UTF-16 text.\n",
 64 |     "            with open(os.path.join(label_dir, file_name), 'r', encoding=\"utf-16\") as f:\n",
 65 |     "                raw_text = ' '.join(f.readlines())\n",
 66 |     "            # simple_preprocess output is re-joined into one string before ViTokenizer runs.\n",
 67 |     "            cleaned = ' '.join(gensim.utils.simple_preprocess(raw_text))\n",
 68 |     "            texts.append(ViTokenizer.tokenize(cleaned))\n",
 69 |     "            labels.append(label)\n",
 70 |     "\n",
 71 |     "    return texts, labels\n",
 72 |     "\n",
 73 |     "train_path = os.path.join(dir_path, 'VNTC-master/Data/10Topics/Ver1.1/Train_Full')\n",
 74 |     "X_data, y_data = get_data(train_path)"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": null,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "import pickle\n",
 84 |     "with open('data/X_data.pkl', 'wb') as x_file, open('data/y_data.pkl', 'wb') as y_file:  # both files are closed on exit\n",
 85 |     "    pickle.dump(X_data, x_file)\n",
 86 |     "    pickle.dump(y_data, y_file)"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 5,
 92 |    "metadata": {
 93 |     "scrolled": true
 94 |    },
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "# test_path = os.path.join(dir_path, 'VNTC-master/Data/10Topics/Ver1.1/Test_Full')\n",
 98 |     "# X_test, y_test = get_data(test_path)"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 6,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# pickle.dump(X_test, open('data/X_test.pkl', 'wb'))\n",
108 |     "# pickle.dump(y_test, open('data/y_test.pkl', 'wb'))"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 8,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "#### Load data\n",
118 |     "import pickle\n",
119 |     "with open('data/X_data.pkl', 'rb') as x_file, open('data/y_data.pkl', 'rb') as y_file:  # both files are closed on exit\n",
120 |     "    X_data = pickle.load(x_file)\n",
121 |     "    y_data = pickle.load(y_file)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "## Bigram"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 9,
134 |    "metadata": {},
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/plain": [
139 |        "'ông đồ cuối_cùng trên đảo vua tần cỡi cọp chơi tám cõi trời xanh kiếm quang sáng chói tiên_nhân cầm đuốc khói nhẹ_nhàng mắt say nhòa lệ theo cung đàn ngồi trên bãi biển lộng_gió ông đồ võ_hiển đạt bất_ngờ đọc cho tôi nghe bài tần vương ẩm_tửu của lý_hạ người được mệnh_danh là thi quỉ đời đường tiếng_tăm và tác_phẩm truyền lưu hậu_thế không thua_kém thi tiên lý bạch thi phật vương_duy thi thánh đỗ_phủ rồi ông tâm_sự tôi sinh ra nhằm lúc nho_học mạt_vận may_mắn được học dăm chữ thánh_hiền bây_giờ thỉnh_thoảng tôi chỉ còn dùng mớ chữ_nghĩa một thời vang bóng này khi có bạn tâm_giao hoặc ai đó nhờ viết câu_đối xưa men theo con đường bãi biển rợp bóng dừa xanh tôi về nhà ông thôn tây xã an vĩnh huyện đảo lý_sơn quảng_ngãi thời_gian như dừng lại trong ngôi nhà_cổ của ông đồ cuối_cùng hòn đảo này những bức hoành_phi câu_đối được viết bằng mực_tàu bút_lông nét chữ như rồng múa phượng bay treo từ hàng cột bờ hiên vào tận bàn_thờ tổ_tiên giữa gian nhà chính bộ sập gụ đã trải hơn trăm năm sáng bóng_nước thời_gian đó là nơi ông thường ngồi xếp_bằng để đọc sách và múa bút khi có người xin chữ trang_trọng nhất trong ngôi nhà_cổ là chiếc tủ bằng gỗ lim lúc_nào cũng ấm mùi nhang trầm phảng_phất trên hàng linh vị thờ các vị tổ_tiên bút tự của ông đồ đạt có khắp_nơi trên đảo lý_sơn vượt biển ra lý_sơn tôi tình_cờ biết ông đồ võ_hiển đạt khi được chứng_kiến một buổi an_táng theo nghi_lễ_mộ gió các ngư_dân bất_hạnh phải gửi xác dưới lòng đại_dương thi_hài trong quan_tài nhỏ chỉ là hình_nhân đất_sét nhưng lễ_nghi vẫn nghiêm linh với lời tụng kinh cờ phướn cầu_siêu những người đã ra đi mãi_mãi không về anh cán_bộ văn_phòng ủy_ban huyện cho biết gần nửa thế_kỷ qua đảo này chỉ có một thầy đồ duy_nhất và cũng có_lẽ là thầy đồ cuối_cùng không chỉ các đình chùa mà tất_cả chữ_nho trong bất_cứ ngôi nhà nào trên đảo đều do một tay ông viết ông kể hồi còn nhỏ vùng này chỉ có một trường tiểu_học dạy tiếng pháp ai muốn học cao hơn phải ra huế hoặc vào qui nhơn cụ thân_sinh của ông 
là một đồ nho ẩn_dật nhưng phóng_khoáng cho con đi học tiếng pháp nhưng vẫn khuyên ông nên dùi mài kinh_sử với nho_học đến bây_giờ ông vẫn nhớ lời cha dạy chữ pháp giúp con mở_mang văn_minh văn_hóa nhưng chữ_nho mới đủ thâm sâu để dạy con đạo làm người con_người sinh ra_đời không đem đến được gì và chết cũng chẳng mang theo được cái chi cuối_cùng chỉ có cái đạo làm người là lại nghe lời cha_ông theo cả nho_học lẫn pháp ngữ nhưng lấy chữ_nho làm trọng ông may_mắn có nhiều người thầy khả_kính uyên_thâm cả chữ_nho lẫn chữ pháp chín năm theo các thầy thêm mười năm tự học từ sách_vở đến năm_tuổi võ_hiển đạt đã trở_thành một người dạy chữ và cho chữ thánh_hiền đó cũng là lúc đất_nước giao_thời hầu_hết lớp trẻ đã chuyển sang dùi mài quốc_ngữ hoặc tiếng pháp chữ_nho dần_dần trở_thành linh tự trên các bàn_thờ đình_chùa và trong câu_đối của các bậc lớn_tuổi giáo huấn lớp hậu_sinh gìn_giữ đạo_đức gia phong đến giờ ông vẫn còn gìn_giữ nguyên_vẹn bộ tứ_thư được in từ năm và nhiều đầu_sách cổ khác ngôi nhà_cổ của ông đồ trong mùa thu_hoạch tỏi không gặp thời ông nối bước cha_lui về ẩn_dật hòn đảo nhỏ giữa biển ban_ngày vừa làm nghề cá vừa trồng rau khoai kiếm sống thanh_nhàn chiều_tối mở lớp gõ đầu_trẻ tuy_nhiên cho_dù kính_trọng chữ thánh_hiền nhưng nhiều gia_đình cũng không muốn con_em mình theo nho_học chỉ còn một thời vang bóng vừa khó học vừa không_thể xin được việc_làm thế là lớp_học của ông cứ vắng dần vắng dần cho đến lúc không_thể tồn_tại được nữa không gõ đầu_trẻ nữa nhưng ông trở_thành một người cho chữ thánh_hiền nổi_tiếng lý_sơn dù tuổi mới ngoài tam_thập chữ của ông trầm_mặc buồn âm linh tự nơi thờ_cúng đội lính hoàng sa_kiêu hùng thuở xưa và những người đã bỏ_mạng giữa biển khơi chữ của ông trang_nghiêm như lời gia huấn trên liễn thờ các bậc tiền hiền chữ của ông cũng huyền_ảo vi_diệu trên bàn_thờ nam hải tướng quân vị thần tôn_kính của ngư_dân quanh năm treo mạng trên đầu sóng ngọn gió người vui cũng tìm đến ông để xin mấy chữ mới treo trên bàn_thờ tổ_tiên trong 
ngày cưới_xin cất nhà mới người buồn cũng nài ông vài chữ cho trọn đạo_nghĩa trong lễ tang ma lúc_nào ông cũng vui_vẻ viết tặng khi có người đến xin chữ bởi theo ông đó không chỉ là bổn_phận mà còn là cái đạo của người theo nho_học chọn nhân_nghĩa để sống đời đã sắp đến hồi về với tổ_tiên_ông nói không còn gì phải hối_tiếc nhưng chỉ buồn một điều là chưa có một người học_trò chân_truyền nào để gửi_gắm lại cái chữ_cái đạo của thánh_hiền kể_cả con_cái con ông người bám đảo làm nghề biển người vào sài_gòn bươn_chải mưu_sinh nhiều lúc tôi đành ngẫm_nghĩ hình_như cái gì cũng có vận thịnh_suy của nó nho_học cũng vậy thôi ông trầm_ngâm tâm_sự rồi mài mực trải giấy khai_bút tặng tôi một chữ tâm làm hành_trang trở về với đất_liền theo ông đồ đạt người cho chữ thánh_hiền ngày_nay tuy không còn khắt_khe như xưa nữa nhưng vẫn có một_số điều cần tôn_trọng gìn_giữ để bút_lực có_thể đạt đến sự tinh_tế đến cái hồn sâu thẳm của từng nét chữ ông không viết khi mưa_gió ông cũng không cầm bút khi trời mây_âm hay lúc tịch dương ngả bóng ảm_đạm ông thường đề bút vào buổi sáng thời_điểm mặt_trời bắt_đầu tỏa nắng ấm sau khi thắp nén hương trầm lên bàn_thờ và cạn mấy tách trà ông mới chấm bút_lông vào mực và lúc nét bút bắt_đầu lướt trên giấy đỏ tâm người viết thanh_tịnh như mặt_nước hồ thu không vọng danh_lợi không sợ không khinh cũng không vui không buồn tất_cả chỉ còn có chữ và tâm người viết sẽ bộc_lộ trên cái thần cái hồn của nét chữ'"
140 |       ]
141 |      },
142 |      "execution_count": 9,
143 |      "metadata": {},
144 |      "output_type": "execute_result"
145 |     }
146 |    ],
147 |    "source": [
148 |     "X_data[0]"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 24,
154 |    "metadata": {},
155 |    "outputs": [],
156 |    "source": [
157 |     "# Join the documents with a space: plain concatenation glued the last word of one\n",
158 |     "# document to the first word of the next, and repeated += on a str is quadratic.\n",
159 |     "corpus = ' '.join(X_data)"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 25,
165 |    "metadata": {},
166 |    "outputs": [
167 |     {
168 |      "data": {
169 |       "text/plain": [
170 |        "69792128"
171 |       ]
172 |      },
173 |      "execution_count": 25,
174 |      "metadata": {},
175 |      "output_type": "execute_result"
176 |     }
177 |    ],
178 |    "source": [
179 |     "len(corpus)"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 26,
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "from nltk.collocations import *\n",
189 |     "import nltk"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 29,
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "# Minimum number of co-occurrences a pair needs to survive the frequency filter;\n",
199 |     "# leave it at 1 to keep every pair.\n",
200 |     "min_frequent = 1\n",
201 |     "\n",
202 |     "# window_size=10: word pairs are collected from a 10-token window over the corpus.\n",
203 |     "finder = BigramCollocationFinder.from_words(corpus.split(), window_size=10)\n",
204 |     "finder.apply_freq_filter(min_frequent)\n",
205 |     "bigram_measures = nltk.collocations.BigramAssocMeasures()  # association measures for word pairs"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 33,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "# xem số lần xuất hiện cùng nhau của từng cặp\n",
215 |     "# for k,v in finder.ngram_fd.items():\n",
216 |     "#     print(k,v)"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": 34,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "# điểm PMI của từng cặp\n",
226 |     "pmi_results = finder.score_ngrams(bigram_measures.pmi)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 35,
232 |    "metadata": {},
233 |    "outputs": [],
234 |    "source": [
235 |     "# chuyển về dictionary để tìm kiếm\n",
236 |     "pmi_results_dict = dict(pmi_results)"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": 42,
242 |    "metadata": {},
243 |    "outputs": [
244 |     {
245 |      "data": {
246 |       "text/plain": [
247 |        "6.852442811586141"
248 |       ]
249 |      },
250 |      "execution_count": 42,
251 |      "metadata": {},
252 |      "output_type": "execute_result"
253 |     }
254 |    ],
255 |    "source": [
256 |     "pmi_results_dict[('cứ', 'dần')]"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "markdown",
261 |    "metadata": {},
262 |    "source": [
263 |     "# Điểm PMI cho một cặp từ\n",
264 |     "\n",
265 |     "$$ PMI(w_i, w_j) = \\log\\dfrac{P(w_i, w_j)}{P(w_i)P(w_j)} $$"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 45,
271 |    "metadata": {},
272 |    "outputs": [],
273 |    "source": [
274 |     "def get_pmi_of_pair(word1, word2):\n",
275 |     "    \"\"\"Look up the PMI score of the ordered pair (word1, word2).\n",
276 |     "\n",
277 |     "    Reads the notebook-level `pmi_results_dict`. A pair that is not a key of\n",
278 |     "    that dictionary yields 0; the plain dict lookup replaces the former bare\n",
279 |     "    `except` + `return` in `finally`, which silently swallowed every error.\n",
280 |     "    \"\"\"\n",
281 |     "    return pmi_results_dict.get((word1, word2), 0)"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": 46,
287 |    "metadata": {},
288 |    "outputs": [
289 |     {
290 |      "data": {
291 |       "text/plain": [
292 |        "1.4601253888073815"
293 |       ]
294 |      },
295 |      "execution_count": 46,
296 |      "metadata": {},
297 |      "output_type": "execute_result"
298 |     }
299 |    ],
300 |    "source": [
301 |     "get_pmi_of_pair('tôi', \"một\")"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "markdown",
306 |    "metadata": {},
307 |    "source": [
308 |     "# Điểm PMI trung bình của một từ trong danh sách \n",
309 |     "Danh sách ở trong bài báo là 10 từ, nên 1/9 có nghĩa là so sánh với 9 từ còn lại\n",
310 |     "$$ \\text{avg-PMI}(w_i) = \\dfrac{1}{9}\\sum_{j \\neq i} PMI(w_i, w_j) $$"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 47,
316 |    "metadata": {},
317 |    "outputs": [],
318 |    "source": [
319 |     "def get_average_pmi_for_word(word, words_list):\n",
320 |     "    \"\"\"Mean PMI of `word` against the other entries of `words_list` (0 if there are none).\n",
321 |     "\n",
322 |     "    Entries equal to `word` are skipped, so 10 distinct words give an average over 9 pairs.\n",
323 |     "    \"\"\"\n",
324 |     "    others = [other for other in words_list if other != word]\n",
325 |     "    total = sum(get_pmi_of_pair(word, other) for other in others)\n",
326 |     "    return total / len(others) if others else 0"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 57,
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "data": {
336 |       "text/plain": [
337 |        "0.06825178101438283"
338 |       ]
339 |      },
340 |      "execution_count": 57,
341 |      "metadata": {},
342 |      "output_type": "execute_result"
343 |     }
344 |    ],
345 |    "source": [
346 |     "get_average_pmi_for_word(\"một\", [\"một\", \"hai\", \"không\", \"có\", \"đứng\", \"ngồi\", \"nằm\", \"nhiều\", \"ít\", \"nhỏ\"])"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "markdown",
351 |    "metadata": {},
352 |    "source": [
353 |     "# Tìm từ đại diện tốt nhất (topic word) trong danh sách các từ - words_list"
354 |    ]
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": 54,
359 |    "metadata": {},
360 |    "outputs": [],
361 |    "source": [
362 |     "def get_best_word(words_list):\n",
363 |     "    \"\"\"Return the entry of `words_list` with the highest average PMI (the topic word).\n",
364 |     "\n",
365 |     "    The first word is scored like every other one; ties keep the earliest word.\n",
366 |     "    \"\"\"\n",
367 |     "    topic_word = words_list[0]  # IndexError on an empty list\n",
368 |     "    max_score = get_average_pmi_for_word(topic_word, words_list)\n",
369 |     "    for word in words_list[1:]:\n",
370 |     "        score = get_average_pmi_for_word(word, words_list)\n",
371 |     "        if score > max_score:\n",
372 |     "            topic_word, max_score = word, score\n",
373 |     "    return topic_word"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": 56,
378 |    "metadata": {},
379 |    "outputs": [
380 |     {
381 |      "data": {
382 |       "text/plain": [
383 |        "'không'"
384 |       ]
385 |      },
386 |      "execution_count": 56,
387 |      "metadata": {},
388 |      "output_type": "execute_result"
389 |     }
390 |    ],
391 |    "source": [
392 |     "get_best_word([\"một\", \"hai\", \"không\", \"có\", \"đứng\", \"ngồi\", \"nằm\", \"nhiều\", \"ít\", \"nhỏ\"])"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "markdown",
397 |    "metadata": {},
398 |    "source": [
399 |     "## Kết luận: chỉ cần chạy code từ đầu đến cuối, rồi dùng hàm cuối cùng, truyền một danh sách các từ vào, nó sẽ tìm ra topic word.\n",
400 |     "## Các công thức tính CP1, CP2 có vẻ không liên quan lắm, nên không cần tính ở đây. "
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": null,
406 |    "metadata": {},
407 |    "outputs": [],
408 |    "source": []
409 |   },
410 |   {
411 |    "cell_type": "code",
412 |    "execution_count": null,
413 |    "metadata": {},
414 |    "outputs": [],
415 |    "source": []
416 |   },
417 |   {
418 |    "cell_type": "code",
419 |    "execution_count": null,
420 |    "metadata": {},
421 |    "outputs": [],
422 |    "source": []
423 |   }
424 |  ],
425 |  "metadata": {
426 |   "kernelspec": {
427 |    "display_name": "Python 3",
428 |    "language": "python",
429 |    "name": "python3"
430 |   },
431 |   "language_info": {
432 |    "codemirror_mode": {
433 |     "name": "ipython",
434 |     "version": 3
435 |    },
436 |    "file_extension": ".py",
437 |    "mimetype": "text/x-python",
438 |    "name": "python",
439 |    "nbconvert_exporter": "python",
440 |    "pygments_lexer": "ipython3",
441 |    "version": "3.6.5"
442 |   }
443 |  },
444 |  "nbformat": 4,
445 |  "nbformat_minor": 2
446 | }
447 | 


--------------------------------------------------------------------------------
/Text Classifier/test.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Word2Vec"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 3,
 13 |    "metadata": {},
 14 |    "outputs": [
 15 |     {
 16 |      "name": "stderr",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n",
 20 |       "  \n",
 21 |       "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n",
 22 |       "  if __name__ == '__main__':\n"
 23 |      ]
 24 |     }
 25 |    ],
 26 |    "source": [
 27 |     "\n",
 28 |     "import os\n",
 29 |     "from gensim.models import KeyedVectors \n",
 30 |     "dir_path = os.path.dirname(os.path.realpath(os.getcwd()))\n",
 31 |     "word2vec_model_path = os.path.join(dir_path, \"Data/vi/vi.vec\")\n",
 32 |     "# load_word2vec_format returns the KeyedVectors itself; its `.wv` alias is deprecated.\n",
 33 |     "w2v = KeyedVectors.load_word2vec_format(word2vec_model_path)\n",
 34 |     "vocab = w2v.vocab\n",
 35 |     "wv = w2v"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 6,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "def word_vector(word):\n",
 45 |     "    \"\"\"Return the vector stored for `word` in the notebook-level `wv` lookup.\"\"\"\n",
 46 |     "    vector = wv[word]\n",
 47 |     "    return vector"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 29,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "topic = ['giao_thông', 'bóng_đá', 'tài_chính', 'thị_trường', 'sức_khoẻ', 'thế_giới', 'thể_thao', 'ẩm_thực']"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 44,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "list_word = ['tai_nạn', 'đường_bộ', 'du_lịch', 'ông', 'đau']"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 45,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "import numpy as np\n",
 73 |     "def get_best_topic(list_word, list_topic):\n",
 74 |     "    \"\"\"Return the topic whose summed similarity to the words of `list_word` is highest.\n",
 75 |     "\n",
 76 |     "    Scores come from `wv.similarity` and may be negative, so the running best\n",
 77 |     "    starts at -inf; with the former start value 0, `best_topic` was never bound\n",
 78 |     "    when no topic scored above 0. Ties keep the earliest topic. Returns None\n",
 79 |     "    when `list_topic` is empty.\n",
 80 |     "    \"\"\"\n",
 81 |     "    best_topic, best_score = None, float('-inf')\n",
 82 |     "    for topic in list_topic:\n",
 83 |     "        topic_score = sum(wv.similarity(word, topic) for word in list_word)\n",
 84 |     "        if topic_score > best_score:\n",
 85 |     "            best_topic, best_score = topic, topic_score\n",
 86 |     "    return best_topic"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 46,
 92 |    "metadata": {},
 93 |    "outputs": [
 94 |     {
 95 |      "name": "stderr",
 96 |      "output_type": "stream",
 97 |      "text": [
 98 |       "/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
 99 |       "  if np.issubdtype(vec.dtype, np.int):\n"
100 |      ]
101 |     },
102 |     {
103 |      "data": {
104 |       "text/plain": [
105 |        "'giao_thông'"
106 |       ]
107 |      },
108 |      "execution_count": 46,
109 |      "metadata": {},
110 |      "output_type": "execute_result"
111 |     }
112 |    ],
113 |    "source": [
114 |     "get_best_topic(list_word, topic)"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {},
120 |    "source": [
121 |     "# Test Naive Bayes"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 1,
127 |    "metadata": {},
128 |    "outputs": [
129 |     {
130 |      "name": "stderr",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n",
134 |       "  from numpy.core.umath_tests import inner1d\n",
135 |       "/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
136 |       "  from ._conv import register_converters as _register_converters\n",
137 |       "Using TensorFlow backend.\n"
138 |      ]
139 |     }
140 |    ],
141 |    "source": [
142 |     "from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm\n",
143 |     "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
144 |     "from sklearn import decomposition, ensemble\n",
145 |     "\n",
146 |     "import pandas, xgboost, numpy, textblob, string\n",
147 |     "from keras.preprocessing import text, sequence\n",
148 |     "from keras import layers, models, optimizers\n",
149 |     "from keras.layers import *"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 2,
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": [
158 |     "from pyvi import ViTokenizer, ViPosTagger\n",
159 |     "from tqdm import tqdm\n",
160 |     "import numpy as np\n",
161 |     "import gensim\n",
162 |     "import numpy as np"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "# Xử lý dữ liệu"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 3,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "def preprocessing_doc(doc):\n",
179 |     "    \"\"\"Clean one raw document and word-segment it.\n",
180 |     "\n",
181 |     "    The tokens from gensim's simple_preprocess are re-joined with single spaces and the\n",
182 |     "    resulting string is passed through ViTokenizer.tokenize, whose result is returned.\n",
183 |     "    \"\"\"\n",
184 |     "    return ViTokenizer.tokenize(' '.join(gensim.utils.simple_preprocess(doc)))"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 5,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "import pickle\n",
193 |     "\n",
194 |     "with open('data/X_data.pkl', 'rb') as x_file, open('data/y_data.pkl', 'rb') as y_file:  # both files are closed on exit\n",
195 |     "    X_data, y_data = pickle.load(x_file), pickle.load(y_file)  # NOTE: only unpickle files from a trusted source\n",
196 |     "\n",
197 |     "# X_test = pickle.load(open('data/X_test.pkl', 'rb'))\n",
198 |     "# y_test = pickle.load(open('data/y_test.pkl', 'rb'))"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "# Word-level TF-IDF; the vocabulary is capped at 30000 terms instead of all words (100k+ words).\n",
208 |     "tfidf_vect = TfidfVectorizer(max_features=30000, analyzer='word')\n",
209 |     "# fit_transform learns the vocabulary and idf from X_data and vectorizes it in one call\n",
210 |     "X_data_tfidf = tfidf_vect.fit_transform(X_data)"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": 7,
216 |    "metadata": {},
217 |    "outputs": [],
218 |    "source": [
219 |     "from sklearn.decomposition import TruncatedSVD"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 8,
225 |    "metadata": {},
226 |    "outputs": [
227 |     {
228 |      "data": {
229 |       "text/plain": [
230 |        "TruncatedSVD(algorithm='randomized', n_components=300, n_iter=5,\n",
231 |        "       random_state=42, tol=0.0)"
232 |       ]
233 |      },
234 |      "execution_count": 8,
235 |      "metadata": {},
236 |      "output_type": "execute_result"
237 |     }
238 |    ],
239 |    "source": [
240 |     "svd = TruncatedSVD(n_components=300, random_state=42)\n",
241 |     "svd.fit(X_data_tfidf)"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 9,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "X_data_tfidf_svd = svd.transform(X_data_tfidf)"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 37,
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "# from gensim.models import KeyedVectors \n",
260 |     "# import os \n",
261 |     "# dir_path = os.path.dirname(os.path.realpath(os.getcwd()))\n",
262 |     "# word2vec_model_path = os.path.join(dir_path, \"Data/vi/vi.vec\")\n",
263 |     "\n",
264 |     "# w2v = KeyedVectors.load_word2vec_format(word2vec_model_path)\n",
265 |     "# vocab = w2v.wv.vocab\n",
266 |     "# wv = w2v.wv"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 36,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "# def get_word2vec_data(X):\n",
276 |     "#     word2vec_data = []\n",
277 |     "#     for x in X:\n",
278 |     "#         sentence = []\n",
279 |     "#         for word in x.split(\" \"):\n",
280 |     "#             if word in vocab:\n",
281 |     "# #                 print(word)\n",
282 |     "#                 sentence.append(wv[word])\n",
283 |     "\n",
284 |     "#         word2vec_data.append(sentence)\n",
285 |     "# #         break\n",
286 |     "#     return word2vec_data\n",
287 |     "\n",
288 |     "# X_data_w2v = get_word2vec_data(X_data)\n",
289 |     "# # X_test_w2v = get_word2vec_data(X_test)"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 13,
295 |    "metadata": {},
296 |    "outputs": [],
297 |    "source": [
298 |     "encoder = preprocessing.LabelEncoder()\n",
299 |     "y_data_n = encoder.fit_transform(y_data)"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": 14,
305 |    "metadata": {},
306 |    "outputs": [
307 |     {
308 |      "data": {
309 |       "text/plain": [
310 |        "array(['Chinh tri Xa hoi', 'Doi song', 'Khoa hoc', 'Kinh doanh',\n",
311 |        "       'Phap luat', 'Suc khoe', 'The gioi', 'The thao', 'Van hoa',\n",
312 |        "       'Vi tinh'], dtype='<U16')"
313 |       ]
314 |      },
315 |      "execution_count": 14,
316 |      "metadata": {},
317 |      "output_type": "execute_result"
318 |     }
319 |    ],
320 |    "source": [
321 |     "encoder.classes_"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "markdown",
326 |    "metadata": {},
327 |    "source": [
328 |     "# Huấn luyện mô hình"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": 15,
334 |    "metadata": {},
335 |    "outputs": [],
336 |    "source": [
337 |     "from sklearn.model_selection import train_test_split"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": 18,
343 |    "metadata": {},
344 |    "outputs": [],
345 |    "source": [
346 |     "def train_model(classifier, X_data, y_data, X_test=None, y_test=None, is_neuralnet=False, n_epochs=3):\n",
347 |     "    \"\"\"Fit `classifier` on 90% of the data and print its accuracy on the held-out 10%.\n",
348 |     "\n",
349 |     "    Args:\n",
350 |     "        classifier: model exposing `fit` and `predict`.\n",
351 |     "        X_data, y_data: features and labels, split 90/10 with random_state=42.\n",
352 |     "        X_test, y_test: optional test set; its accuracy is printed only when both are given.\n",
353 |     "        is_neuralnet: if True, `fit` also receives validation_data, epochs and batch_size=512,\n",
354 |     "            and predictions are reduced to class indices with argmax over the last axis.\n",
355 |     "        n_epochs: number of epochs for the neural-network branch.\n",
356 |     "    \"\"\"\n",
357 |     "    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.1, random_state=42)\n",
358 |     "\n",
359 |     "    if is_neuralnet:\n",
360 |     "        classifier.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=n_epochs, batch_size=512)\n",
361 |     "    else:\n",
362 |     "        classifier.fit(X_train, y_train)\n",
363 |     "\n",
364 |     "    def predict_labels(features):\n",
365 |     "        predictions = classifier.predict(features)\n",
366 |     "        return predictions.argmax(axis=-1) if is_neuralnet else predictions\n",
367 |     "\n",
368 |     "    print(\"Validation accuracy: \", metrics.accuracy_score(y_val, predict_labels(X_val)))\n",
369 |     "    # The test set is optional: predicting on the default X_test=None used to crash the\n",
370 |     "    # neural-network branch, and the result was never used.\n",
371 |     "    if X_test is not None and y_test is not None:\n",
372 |     "        print(\"Test accuracy: \", metrics.accuracy_score(y_test, predict_labels(X_test)))"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "code",
369 |    "execution_count": 20,
370 |    "metadata": {},
371 |    "outputs": [
372 |     {
373 |      "name": "stdout",
374 |      "output_type": "stream",
375 |      "text": [
376 |       "Validation accuracy:  0.8690758293838863\n"
377 |      ]
378 |     }
379 |    ],
380 |    "source": [
381 |     "model = naive_bayes.MultinomialNB()\n",
382 |     "train_model(model, X_data_tfidf, y_data, is_neuralnet=False)"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "markdown",
387 |    "metadata": {},
388 |    "source": [
389 |     "# Test"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "code",
394 |    "execution_count": 55,
395 |    "metadata": {},
396 |    "outputs": [],
397 |    "source": [
398 |     "test_doc = '''Tiếp đón ĐT Việt Nam trên sân nhà ở bán kết lượt đi AFF Cup 2018, những sai lầm nơi hàng thủ đã khiến ĐT Philippines nhận thất bại cay đắng 1-2. Sau trận, một số cầu thủ liên tục đăng đàn thể hiện sự tiếc nuối với kết quả này, thậm chí tuyên bố đội nhà xứng đáng giành chiến thắng hơn. \n",
399 |     "\n",
400 |     "Philippines thua Việt Nam: Nội bộ lục đục, báo châu Á khó tin phép màu - 1\n",
401 |     "\n",
402 |     "Patrick Reichelt chỉ trích thái độ thi đấu thiếu quyết tâm của các đồng đội\n",
403 |     "\n",
404 |     "Tuy nhiên khá bất ngờ khi Patrick Reichelt - tác giả bàn gỡ 1-1 lại lên tiếng chỉ trích thái độ thi đấu của các đồng đội giữa thời điểm nhạy cảm. Phản ứng này khiến dư luận nghi ngờ về tình trạng lục đục nội bộ ở Philippines.\n",
405 |     "\n",
406 |     "\"Các cầu thủ chỉ chơi với 80-90% phong độ, điều đó không đủ giúp Philippines chiến thắng. Tôi không hề muốn dừng bước trong lần thứ 3 lọt vào bán kết AFF Cup nhưng nếu toàn đội thi đấu hết mình, tôi sẽ không cảm thấy hối tiếc dù thất bại. Philippines đã có sự chuẩn bị rất tốt, vấn đề nằm ở thái độ thi đấu\", trích lời Reichelt trên Fox Sport Asia.\n",
407 |     "\n",
408 |     "Trong khi đó, chuyên gia bóng đá Đông Nam Á nổi tiếng Gabriel Tan cũng phân tích khá chi tiết những điểm mạnh, điểm yếu của Philppines ở bài viết: \"AFF Cup: Philippines vẫn còn cơ hội sống sót hay Việt Nam đã đặt một chân vào chung kết?\".\n",
409 |     "\n",
410 |     "\"Philippines phần nào tái hiện được tinh thần và lối chơi từng giúp họ cầm hòa ĐKVĐ Thái Lan 1-1 ở vòng bảng. Thầy trò Sven-Goran Eriksson cũng gây ra nhiều khó khăn cho Việt Nam suốt 90 phút, thậm chí trở thành đội đầu tiên chọc thủng lưới Đặng Văn Lâm ở AFF Cup 2018\".\n",
411 |     "\n",
412 |     "Tuy nhiên, Gabriel Tan lại bỏ ngỏ khả năng thầy trò Eriksson lội ngược dòng khi hành quân tới Hà Nội vào ngày 6/12 tới và chỉ gợi lại kỉ niệm đẹp tại SVĐ Mỹ Đình 8 năm trước - thời điểm Philippines đánh bại Việt Nam 2-0:\n",
413 |     "\n",
414 |     "\"Philippines có thể lội ngược dòng? Không có gì đảm bảo cả. Việt Nam vẫn còn nhiều phương án chiến thuật, nhân sự cho khả năng tấn công biên, trong khi The Azkals chỉ còn 18 cầu thủ. Tới Hà Nội, HLV Eriksson chỉ biết hy vọng các học trò thể hiện tinh thần quyết tâm như trận hòa Thái Lan và tái hiện phép màu Hà Nội cách đây 8 năm\".\n",
415 |     "\n",
416 |     "Philippines thua Việt Nam: Nội bộ lục đục, báo châu Á khó tin phép màu - 2\n",
417 |     "\"Phép màu Hà Nội 2010\" là yếu tố để giới chuyên môn lẫn các cầu thủ Philippines bấu víu ở trận bán kết lượt về\n",
418 |     "\n",
419 |     "Về màn trình diễn của ĐT Việt Nam, Gabriel Tan đánh giá rất cao HLV Park Hang Seo với những điều chỉnh chiến thuật, nhân sự cực kì táo bạo, nhạy bén: \n",
420 |     "\n",
421 |     "\"Chiến thắng của Việt Nam ấn tượng hơn cả bởi HLV Park Hang Seo thậm chí chưa tung ra Văn Quyết, Xuân Trường, trong khi Công Phượng chỉ vào sân 10 phút cuối. Thay vào đó, Đức Huy và Hùng Dũng - những cầu thủ mới đá chính ở lượt trận cuối vòng bảng gặp Campuchia - được lựa chọn cho vị trí tiền vệ trung tâm.\n",
422 |     "\n",
423 |     "Nhiều người cho rằng họ vào sân chỉ để giúp Xuân Trường dưỡng sức, tạo điều kiện cho Quang Hải trở về vị trí đá cánh sở trường, nhưng chiến lược gia người Hàn Quốc lại nghĩ khác. Ông không e ngại đặt niềm tin vào những cầu thủ trẻ. Mặt khác, hàng thủ với bộ ba hậu vệ, Đặng Văn Lâm và đôi cánh Trọng Hoàng - Văn Hậu tiếp tục cho thấy sự ăn ý đáng kinh ngạc\".\n",
424 |     "\n",
425 |     "'''"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 56,
431 |    "metadata": {},
432 |    "outputs": [],
433 |    "source": [
434 |     "test_doc = preprocessing_doc(test_doc)\n",
435 |     "# test_vec = get_word2vec_data([test_doc])"
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": 63,
441 |    "metadata": {},
442 |    "outputs": [
443 |     {
444 |      "name": "stdout",
445 |      "output_type": "stream",
446 |      "text": [
447 |       "(1, 30000)\n"
448 |      ]
449 |     }
450 |    ],
451 |    "source": [
452 |     "test_doc_tfidf = tfidf_vect.transform([test_doc])\n",
453 |     "print(np.shape(test_doc_tfidf))\n",
454 |     "test_doc_svd = svd.transform(test_doc_tfidf)"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": 68,
460 |    "metadata": {},
461 |    "outputs": [
462 |     {
463 |      "data": {
464 |       "text/plain": [
465 |        "array(['The thao'], dtype='<U16')"
466 |       ]
467 |      },
468 |      "execution_count": 68,
469 |      "metadata": {},
470 |      "output_type": "execute_result"
471 |     }
472 |    ],
473 |    "source": [
474 |     "model.predict(test_doc_tfidf)"
475 |    ]
476 |   },
477 |   {
478 |    "cell_type": "code",
479 |    "execution_count": null,
480 |    "metadata": {},
481 |    "outputs": [],
482 |    "source": []
483 |   }
484 |  ],
485 |  "metadata": {
486 |   "kernelspec": {
487 |    "display_name": "Python 3",
488 |    "language": "python",
489 |    "name": "python3"
490 |   },
491 |   "language_info": {
492 |    "codemirror_mode": {
493 |     "name": "ipython",
494 |     "version": 3
495 |    },
496 |    "file_extension": ".py",
497 |    "mimetype": "text/x-python",
498 |    "name": "python",
499 |    "nbconvert_exporter": "python",
500 |    "pygments_lexer": "ipython3",
501 |    "version": "3.6.5"
502 |   }
503 |  },
504 |  "nbformat": 4,
505 |  "nbformat_minor": 2
506 | }
507 | 


--------------------------------------------------------------------------------
/Text Classifier/.ipynb_checkpoints/test-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Word2Vec"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 3,
 13 |    "metadata": {},
 14 |    "outputs": [
 15 |     {
 16 |      "name": "stderr",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n",
 20 |       "  \n",
 21 |       "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:9: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n",
 22 |       "  if __name__ == '__main__':\n"
 23 |      ]
 24 |     }
 25 |    ],
 26 |    "source": [
 27 |     "\n",
 28 |     "import os\n",
 29 |     "from gensim.models import KeyedVectors \n",
 30 |     "dir_path = os.path.dirname(os.path.realpath(os.getcwd()))\n",
 31 |     "word2vec_model_path = os.path.join(dir_path, \"Data/vi/vi.vec\")\n",
 32 |     "# load_word2vec_format returns the KeyedVectors itself; its `.wv` alias is deprecated.\n",
 33 |     "w2v = KeyedVectors.load_word2vec_format(word2vec_model_path)\n",
 34 |     "vocab = w2v.vocab\n",
 35 |     "wv = w2v"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 6,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "def word_vector(word):\n",
 45 |     "    \"\"\"Return the vector stored for `word` in the notebook-level `wv` lookup.\"\"\"\n",
 46 |     "    vector = wv[word]\n",
 47 |     "    return vector"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 29,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "topic = ['giao_thông', 'bóng_đá', 'tài_chính', 'thị_trường', 'sức_khoẻ', 'thế_giới', 'thể_thao', 'ẩm_thực']"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 44,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "list_word = ['tai_nạn', 'đường_bộ', 'du_lịch', 'ông', 'đau']"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 45,
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "import numpy as np\n",
 73 |     "def get_best_topic(list_word, list_topic):\n",
 74 |     "    \"\"\"Return the topic whose summed similarity to the words of `list_word` is highest.\n",
 75 |     "\n",
 76 |     "    Scores come from `wv.similarity` and may be negative, so the running best\n",
 77 |     "    starts at -inf; with the former start value 0, `best_topic` was never bound\n",
 78 |     "    when no topic scored above 0. Ties keep the earliest topic. Returns None\n",
 79 |     "    when `list_topic` is empty.\n",
 80 |     "    \"\"\"\n",
 81 |     "    best_topic, best_score = None, float('-inf')\n",
 82 |     "    for topic in list_topic:\n",
 83 |     "        topic_score = sum(wv.similarity(word, topic) for word in list_word)\n",
 84 |     "        if topic_score > best_score:\n",
 85 |     "            best_topic, best_score = topic, topic_score\n",
 86 |     "    return best_topic"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 46,
 92 |    "metadata": {},
 93 |    "outputs": [
 94 |     {
 95 |      "name": "stderr",
 96 |      "output_type": "stream",
 97 |      "text": [
 98 |       "/anaconda3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
 99 |       "  if np.issubdtype(vec.dtype, np.int):\n"
100 |      ]
101 |     },
102 |     {
103 |      "data": {
104 |       "text/plain": [
105 |        "'giao_thông'"
106 |       ]
107 |      },
108 |      "execution_count": 46,
109 |      "metadata": {},
110 |      "output_type": "execute_result"
111 |     }
112 |    ],
113 |    "source": [
114 |     "get_best_topic(list_word, topic)"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {},
120 |    "source": [
121 |     "# Test Naive Bayes"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 1,
127 |    "metadata": {},
128 |    "outputs": [
129 |     {
130 |      "name": "stderr",
131 |      "output_type": "stream",
132 |      "text": [
133 |       "/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.\n",
134 |       "  from numpy.core.umath_tests import inner1d\n",
135 |       "/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
136 |       "  from ._conv import register_converters as _register_converters\n",
137 |       "Using TensorFlow backend.\n"
138 |      ]
139 |     }
140 |    ],
141 |    "source": [
142 |     "from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm\n",
143 |     "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
144 |     "from sklearn import decomposition, ensemble\n",
145 |     "\n",
146 |     "import pandas, xgboost, numpy, textblob, string\n",
147 |     "from keras.preprocessing import text, sequence\n",
148 |     "from keras import layers, models, optimizers\n",
149 |     "from keras.layers import *"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 2,
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": [
158 |     "from pyvi import ViTokenizer, ViPosTagger\n",
159 |     "from tqdm import tqdm\n",
160 |     "import numpy as np\n",
161 |     "import gensim\n",
162 |     "import numpy as np"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "# Xử lý dữ liệu"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 3,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "def preprocessing_doc(doc):\n",
179 |     "    \"\"\"Clean one raw document and word-segment it.\n",
180 |     "\n",
181 |     "    The tokens from gensim's simple_preprocess are re-joined with single spaces and the\n",
182 |     "    resulting string is passed through ViTokenizer.tokenize, whose result is returned.\n",
183 |     "    \"\"\"\n",
184 |     "    return ViTokenizer.tokenize(' '.join(gensim.utils.simple_preprocess(doc)))"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 5,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "import pickle\n",
193 |     "\n",
194 |     "with open('data/X_data.pkl', 'rb') as x_file, open('data/y_data.pkl', 'rb') as y_file:  # both files are closed on exit\n",
195 |     "    X_data, y_data = pickle.load(x_file), pickle.load(y_file)  # NOTE: only unpickle files from a trusted source\n",
196 |     "\n",
197 |     "# X_test = pickle.load(open('data/X_test.pkl', 'rb'))\n",
198 |     "# y_test = pickle.load(open('data/y_test.pkl', 'rb'))"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {},
205 |    "outputs": [],
206 |    "source": [
207 |     "# word level - cap the vocabulary at the 30000 most frequent words instead of all words (100k+ words)\n",
208 |     "tfidf_vect = TfidfVectorizer(analyzer='word', max_features=30000)\n",
209 |     "# learn vocabulary and idf from the training set and vectorize it in a single pass\n",
210 |     "X_data_tfidf = tfidf_vect.fit_transform(X_data)"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": 7,
216 |    "metadata": {},
217 |    "outputs": [],
218 |    "source": [
219 |     "from sklearn.decomposition import TruncatedSVD"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 8,
225 |    "metadata": {},
226 |    "outputs": [
227 |     {
228 |      "data": {
229 |       "text/plain": [
230 |        "TruncatedSVD(algorithm='randomized', n_components=300, n_iter=5,\n",
231 |        "       random_state=42, tol=0.0)"
232 |       ]
233 |      },
234 |      "execution_count": 8,
235 |      "metadata": {},
236 |      "output_type": "execute_result"
237 |     }
238 |    ],
239 |    "source": [
240 |     "svd = TruncatedSVD(n_components=300, random_state=42)\n",
241 |     "svd.fit(X_data_tfidf)"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 9,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "X_data_tfidf_svd = svd.transform(X_data_tfidf)"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 37,
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "# from gensim.models import KeyedVectors \n",
260 |     "# import os \n",
261 |     "# dir_path = os.path.dirname(os.path.realpath(os.getcwd()))\n",
262 |     "# word2vec_model_path = os.path.join(dir_path, \"Data/vi/vi.vec\")\n",
263 |     "\n",
264 |     "# w2v = KeyedVectors.load_word2vec_format(word2vec_model_path)\n",
265 |     "# vocab = w2v.wv.vocab\n",
266 |     "# wv = w2v.wv"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 36,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "# def get_word2vec_data(X):\n",
276 |     "#     word2vec_data = []\n",
277 |     "#     for x in X:\n",
278 |     "#         sentence = []\n",
279 |     "#         for word in x.split(\" \"):\n",
280 |     "#             if word in vocab:\n",
281 |     "# #                 print(word)\n",
282 |     "#                 sentence.append(wv[word])\n",
283 |     "\n",
284 |     "#         word2vec_data.append(sentence)\n",
285 |     "# #         break\n",
286 |     "#     return word2vec_data\n",
287 |     "\n",
288 |     "# X_data_w2v = get_word2vec_data(X_data)\n",
289 |     "# # X_test_w2v = get_word2vec_data(X_test)"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 13,
295 |    "metadata": {},
296 |    "outputs": [],
297 |    "source": [
298 |     "encoder = preprocessing.LabelEncoder()\n",
299 |     "y_data_n = encoder.fit_transform(y_data)"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": 14,
305 |    "metadata": {},
306 |    "outputs": [
307 |     {
308 |      "data": {
309 |       "text/plain": [
310 |        "array(['Chinh tri Xa hoi', 'Doi song', 'Khoa hoc', 'Kinh doanh',\n",
311 |        "       'Phap luat', 'Suc khoe', 'The gioi', 'The thao', 'Van hoa',\n",
312 |        "       'Vi tinh'], dtype='<U16')"
313 |       ]
314 |      },
315 |      "execution_count": 14,
316 |      "metadata": {},
317 |      "output_type": "execute_result"
318 |     }
319 |    ],
320 |    "source": [
321 |     "encoder.classes_"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "markdown",
326 |    "metadata": {},
327 |    "source": [
328 |     "# Huấn luyện mô hình"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": 15,
334 |    "metadata": {},
335 |    "outputs": [],
336 |    "source": [
337 |     "from sklearn.model_selection import train_test_split"
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": 18,
343 |    "metadata": {},
344 |    "outputs": [],
345 |    "source": [
346 |     "def train_model(classifier, X_data, y_data, X_test=None, y_test=None, is_neuralnet=False, n_epochs=3):\n",
347 |     "    \"\"\"Fit `classifier` on a 90/10 split; print validation accuracy, plus test accuracy if X_test/y_test are given.\"\"\"\n",
348 |     "    X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, test_size=0.1, random_state=42)\n",
349 |     "\n",
350 |     "    if is_neuralnet:\n",
351 |     "        classifier.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=n_epochs, batch_size=512)\n",
352 |     "    else:\n",
353 |     "        classifier.fit(X_train, y_train)\n",
354 |     "\n",
355 |     "    def predict_labels(features):\n",
356 |     "        # Neural nets emit per-class scores, so argmax converts them to label indices\n",
357 |     "        predictions = classifier.predict(features)\n",
358 |     "        return predictions.argmax(axis=-1) if is_neuralnet else predictions\n",
359 |     "\n",
360 |     "    print(\"Validation accuracy: \", metrics.accuracy_score(y_val, predict_labels(X_val)))\n",
361 |     "\n",
362 |     "    # Score the test set only when the caller supplies one (predicting on None would fail)\n",
363 |     "    if X_test is not None and y_test is not None:\n",
364 |     "        print(\"Test accuracy: \", metrics.accuracy_score(y_test, predict_labels(X_test)))"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "code",
369 |    "execution_count": 20,
370 |    "metadata": {},
371 |    "outputs": [
372 |     {
373 |      "name": "stdout",
374 |      "output_type": "stream",
375 |      "text": [
376 |       "Validation accuracy:  0.8690758293838863\n"
377 |      ]
378 |     }
379 |    ],
380 |    "source": [
381 |     "model = naive_bayes.MultinomialNB()\n",
382 |     "train_model(model, X_data_tfidf, y_data, is_neuralnet=False)"
383 |    ]
384 |   },
385 |   {
386 |    "cell_type": "markdown",
387 |    "metadata": {},
388 |    "source": [
389 |     "# Test"
390 |    ]
391 |   },
392 |   {
393 |    "cell_type": "code",
394 |    "execution_count": 55,
395 |    "metadata": {},
396 |    "outputs": [],
397 |    "source": [
398 |     "test_doc = '''Tiếp đón ĐT Việt Nam trên sân nhà ở bán kết lượt đi AFF Cup 2018, những sai lầm nơi hàng thủ đã khiến ĐT Philippines nhận thất bại cay đắng 1-2. Sau trận, một số cầu thủ liên tục đăng đàn thể hiện sự tiếc nuối với kết quả này, thậm chí tuyên bố đội nhà xứng đáng giành chiến thắng hơn. \n",
399 |     "\n",
400 |     "Philippines thua Việt Nam: Nội bộ lục đục, báo châu Á khó tin phép màu - 1\n",
401 |     "\n",
402 |     "Patrick Reichelt chỉ trích thái độ thi đấu thiếu quyết tâm của các đồng đội\n",
403 |     "\n",
404 |     "Tuy nhiên khá bất ngờ khi Patrick Reichelt - tác giả bàn gỡ 1-1 lại lên tiếng chỉ trích thái độ thi đấu của các đồng đội giữa thời điểm nhạy cảm. Phản ứng này khiến dư luận nghi ngờ về tình trạng lục đục nội bộ ở Philippines.\n",
405 |     "\n",
406 |     "\"Các cầu thủ chỉ chơi với 80-90% phong độ, điều đó không đủ giúp Philippines chiến thắng. Tôi không hề muốn dừng bước trong lần thứ 3 lọt vào bán kết AFF Cup nhưng nếu toàn đội thi đấu hết mình, tôi sẽ không cảm thấy hối tiếc dù thất bại. Philippines đã có sự chuẩn bị rất tốt, vấn đề nằm ở thái độ thi đấu\", trích lời Reichelt trên Fox Sport Asia.\n",
407 |     "\n",
408 |     "Trong khi đó, chuyên gia bóng đá Đông Nam Á nổi tiếng Gabriel Tan cũng phân tích khá chi tiết những điểm mạnh, điểm yếu của Philppines ở bài viết: \"AFF Cup: Philippines vẫn còn cơ hội sống sót hay Việt Nam đã đặt một chân vào chung kết?\".\n",
409 |     "\n",
410 |     "\"Philippines phần nào tái hiện được tinh thần và lối chơi từng giúp họ cầm hòa ĐKVĐ Thái Lan 1-1 ở vòng bảng. Thầy trò Sven-Goran Eriksson cũng gây ra nhiều khó khăn cho Việt Nam suốt 90 phút, thậm chí trở thành đội đầu tiên chọc thủng lưới Đặng Văn Lâm ở AFF Cup 2018\".\n",
411 |     "\n",
412 |     "Tuy nhiên, Gabriel Tan lại bỏ ngỏ khả năng thầy trò Eriksson lội ngược dòng khi hành quân tới Hà Nội vào ngày 6/12 tới và chỉ gợi lại kỉ niệm đẹp tại SVĐ Mỹ Đình 8 năm trước - thời điểm Philippines đánh bại Việt Nam 2-0:\n",
413 |     "\n",
414 |     "\"Philippines có thể lội ngược dòng? Không có gì đảm bảo cả. Việt Nam vẫn còn nhiều phương án chiến thuật, nhân sự cho khả năng tấn công biên, trong khi The Azkals chỉ còn 18 cầu thủ. Tới Hà Nội, HLV Eriksson chỉ biết hy vọng các học trò thể hiện tinh thần quyết tâm như trận hòa Thái Lan và tái hiện phép màu Hà Nội cách đây 8 năm\".\n",
415 |     "\n",
416 |     "Philippines thua Việt Nam: Nội bộ lục đục, báo châu Á khó tin phép màu - 2\n",
417 |     "\"Phép màu Hà Nội 2010\" là yếu tố để giới chuyên môn lẫn các cầu thủ Philippines bấu víu ở trận bán kết lượt về\n",
418 |     "\n",
419 |     "Về màn trình diễn của ĐT Việt Nam, Gabriel Tan đánh giá rất cao HLV Park Hang Seo với những điều chỉnh chiến thuật, nhân sự cực kì táo bạo, nhạy bén: \n",
420 |     "\n",
421 |     "\"Chiến thắng của Việt Nam ấn tượng hơn cả bởi HLV Park Hang Seo thậm chí chưa tung ra Văn Quyết, Xuân Trường, trong khi Công Phượng chỉ vào sân 10 phút cuối. Thay vào đó, Đức Huy và Hùng Dũng - những cầu thủ mới đá chính ở lượt trận cuối vòng bảng gặp Campuchia - được lựa chọn cho vị trí tiền vệ trung tâm.\n",
422 |     "\n",
423 |     "Nhiều người cho rằng họ vào sân chỉ để giúp Xuân Trường dưỡng sức, tạo điều kiện cho Quang Hải trở về vị trí đá cánh sở trường, nhưng chiến lược gia người Hàn Quốc lại nghĩ khác. Ông không e ngại đặt niềm tin vào những cầu thủ trẻ. Mặt khác, hàng thủ với bộ ba hậu vệ, Đặng Văn Lâm và đôi cánh Trọng Hoàng - Văn Hậu tiếp tục cho thấy sự ăn ý đáng kinh ngạc\".\n",
424 |     "\n",
425 |     "'''"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 56,
431 |    "metadata": {},
432 |    "outputs": [],
433 |    "source": [
434 |     "test_doc = preprocessing_doc(test_doc)\n",
435 |     "# test_vec = get_word2vec_data([test_doc])"
436 |    ]
437 |   },
438 |   {
439 |    "cell_type": "code",
440 |    "execution_count": 63,
441 |    "metadata": {},
442 |    "outputs": [
443 |     {
444 |      "name": "stdout",
445 |      "output_type": "stream",
446 |      "text": [
447 |       "(1, 30000)\n"
448 |      ]
449 |     }
450 |    ],
451 |    "source": [
452 |     "test_doc_tfidf = tfidf_vect.transform([test_doc])\n",
453 |     "print(np.shape(test_doc_tfidf))\n",
454 |     "test_doc_svd = svd.transform(test_doc_tfidf)"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": 68,
460 |    "metadata": {},
461 |    "outputs": [
462 |     {
463 |      "data": {
464 |       "text/plain": [
465 |        "array(['The thao'], dtype='<U16')"
466 |       ]
467 |      },
468 |      "execution_count": 68,
469 |      "metadata": {},
470 |      "output_type": "execute_result"
471 |     }
472 |    ],
473 |    "source": [
474 |     "model.predict(test_doc_tfidf)"
475 |    ]
476 |   },
477 |   {
478 |    "cell_type": "code",
479 |    "execution_count": null,
480 |    "metadata": {},
481 |    "outputs": [],
482 |    "source": []
483 |   }
484 |  ],
485 |  "metadata": {
486 |   "kernelspec": {
487 |    "display_name": "Python 3",
488 |    "language": "python",
489 |    "name": "python3"
490 |   },
491 |   "language_info": {
492 |    "codemirror_mode": {
493 |     "name": "ipython",
494 |     "version": 3
495 |    },
496 |    "file_extension": ".py",
497 |    "mimetype": "text/x-python",
498 |    "name": "python",
499 |    "nbconvert_exporter": "python",
500 |    "pygments_lexer": "ipython3",
501 |    "version": "3.6.5"
502 |   }
503 |  },
504 |  "nbformat": 4,
505 |  "nbformat_minor": 2
506 | }
507 | 


--------------------------------------------------------------------------------
/Viblo Similarity Documents/.ipynb_checkpoints/Similarity Documents-checkpoint.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Abstract"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "In this tutorial, we will implement some algorithms for finding similar documents. The document dataset was crawled from the Viblo website: $\\textbf{viblo.asia}$"
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "markdown",
  19 |    "metadata": {},
  20 |    "source": [
  21 |     "## Algorithms\n",
  22 |     "1. TF-IDF + SVD\n",
  23 |     "2. DOC2VEC"
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "markdown",
  28 |    "metadata": {},
  29 |    "source": [
  30 |     "# Preprocessing Data"
  31 |    ]
  32 |   },
  33 |   {
  34 |    "cell_type": "code",
  35 |    "execution_count": 29,
  36 |    "metadata": {},
  37 |    "outputs": [],
  38 |    "source": [
  39 |     "import gensim\n",
  40 |     "import os\n",
  41 |     "import collections\n",
  42 |     "import smart_open\n",
  43 |     "import random\n",
  44 |     "import pickle\n",
  45 |     "from pyvi import ViTokenizer, ViPosTagger\n",
  46 |     "from tqdm import tqdm\n",
  47 |     "import numpy as np"
  48 |    ]
  49 |   },
  50 |   {
  51 |    "cell_type": "markdown",
  52 |    "metadata": {},
  53 |    "source": [
  54 |     "We will load the data from the data folder. The file \"contents_markdown_viblo.pkl\" contains a list of documents."
  55 |    ]
  56 |   },
  57 |   {
  58 |    "cell_type": "code",
  59 |    "execution_count": 2,
  60 |    "metadata": {},
  61 |    "outputs": [],
  62 |    "source": [
  63 |     "with open('data/contents_markdown_viblo.pkl', 'rb') as f:\n",
  64 |     "    content = pickle.load(f)"
  65 |    ]
  66 |   },
  67 |   {
  68 |    "cell_type": "code",
  69 |    "execution_count": 6,
  70 |    "metadata": {
  71 |     "scrolled": false
  72 |    },
  73 |    "outputs": [
  74 |     {
  75 |      "name": "stdout",
  76 |      "output_type": "stream",
  77 |      "text": [
  78 |       "<class 'list'>\n",
  79 |       "14157\n"
  80 |      ]
  81 |     }
  82 |    ],
  83 |    "source": [
  84 |     "print(type(content))\n",
  85 |     "print(len(content))"
  86 |    ]
  87 |   },
  88 |   {
  89 |    "cell_type": "code",
  90 |    "execution_count": 40,
  91 |    "metadata": {},
  92 |    "outputs": [
  93 |     {
  94 |      "name": "stdout",
  95 |      "output_type": "stream",
  96 |      "text": [
  97 |       "### Trong phần này tôi sẽ giúp các bạn mới học `React Native` hiểu qua phần cài đặt project đã tồn tại và cùng nhau tìm hiểu khái niệm cơ bản nhất của React Native.\n",
  98 |       "# I. Một số chia sẻ cá nhân\n",
  99 |       "\n",
 100 |       "## 1. Yêu cầu thứ 1\n",
 101 |       "Yêu cầu để thực hiện theo bài viết này, bạn hãy cài đặt môi trường theo bài viết dưới đây\n",
 102 |       "\n",
 103 |       "### [Học React Native từ cơ bản đến nâng cao - Phần 1 Hướng dẫn cài đặt và chạy \"Hello world\"](https://viblo.asia/p/hoc-react-native-tu-co-ban-den-nang-cao-phan-1-huong-dan-cai-dat-va-chay-hello-world-RQqKLYW0Z7z)\n",
 104 |       "\n",
 105 |       "## 2. Yêu cầu thứ 2\n",
 106 |       "Khi muốn code React Native bạn sẽ phải chọn cho mình 1 Editors phù hợp cho mình\n",
 107 |       "\n",
 108 |       "Bạn có thể tham khảo các Editors theo link dưới đây\n",
 109 |       "\n",
 110 |       "### [Top 10 Editors For React Native Mobile App Development](https://www.icicletech.com/blog/top-10-editors-for-react-native)\n",
 111 |       "\n",
 112 |       "Còn lựa chọn của tôi là [**`Visual Studio Code`**](https://code.visualstudio.com/) vì đơn giản là dùng khá nhẹ và tiện dụng\n",
 113 |       "\n",
 114 |       "![](https://i.imgur.com/ZfAI0VK.png)\n",
 115 |       "\n",
 116 |       "Nếu bạn cũng chọn **`Visual Studio Code`** như tôi thì tiếp tục cài các Extension theo bài viết dưới đây để giúp việc code React Native trở nên dễ dàng hơn\n",
 117 |       "\n",
 118 |       "### VSCode for React Native: \n",
 119 |       "[https://medium.com/react-native-training/vscode-for-react-native-526ec4a368ce](https://medium.com/react-native-training/vscode-for-react-native-526ec4a368ce)\n",
 120 |       "\n",
 121 |       "## 3. Yêu cầu thứ 3\n",
 122 |       "\n",
 123 |       "Kéo code ví dụ React Native về máy bạn:\n",
 124 |       "\n",
 125 |       "- Bạn mở terminal \n",
 126 |       "\n",
 127 |       "    **`git clone https://github.com/oTranThanhNghia/LearnReactNative.git`**\n",
 128 |       "\n",
 129 |       "- Khi đã kéo code về xong vào trong folder `LearnReactNative` mà vừa kéo về\n",
 130 |       "\n",
 131 |       "    gõ lệnh sau để cấu hình project: **`npm install react-native@0.55.4`**\n",
 132 |       "\n",
 133 |       "- Cấu hình lại Android SDK trong file `local.properties` trong folder của Android `LearnReactNative\\android` ở  như sau: \n",
 134 |       "```\n",
 135 |       "sdk.dir=D\\:\\\\Android\\\\Sdk\n",
 136 |       "```\n",
 137 |       "- Chạy android gõ: `react-native run-android`\n",
 138 |       "    Khi nào terminal hiển thị là bạn đã thành công. \n",
 139 |       "    \n",
 140 |       "![](https://i.imgur.com/ianKmsQ.png)\n",
 141 |       "\n",
 142 |       "Nếu bạn gặp phải lỗi sau thì hãy kiểm tra lại `port 8081` có đang sử dụng không. Nếu có thì tắt nó đi và gõ lại lệnh `npm start` để khởi động lại Metro Bundler\n",
 143 |       "\n",
 144 |       "![](https://i.imgur.com/GjvmD4f.png)\n",
 145 |       "\n",
 146 |       "* Ngoài ra bạn hãy vào index.js để chọn example để chạy nhé:\n",
 147 |       "\n",
 148 |       "![](https://i.imgur.com/WSeVoA8.png)\n",
 149 |       "\n",
 150 |       "# II. Khái niệm cơ bản trong React Native\n",
 151 |       "## 1. Props\n",
 152 |       "Hầu hết các thành phần có thể custom được khi được tạo ra với các tham số khác nhau. Các tham số này được gọi là `props`\n",
 153 |       "Ví dụ về `Image` sẽ giúp bạn sử dụng prop `source` để hiển thị ảnh\n",
 154 |       "\n",
 155 |       "```javascript\n",
 156 |       "import React, {Component} from 'react';\n",
 157 |       "import {Image} from 'react-native';\n",
 158 |       "\n",
 159 |       "export default class Bananas extends Component {\n",
 160 |       "    render() {\n",
 161 |       "      let pic = {\n",
 162 |       "        uri: 'https://upload.wikimedia.org/wikipedia/commons/d/de/Bananavarieties.jpg'\n",
 163 |       "      };\n",
 164 |       "      return (\n",
 165 |       "        <Image source={pic} style={{width: 193, height: 110}}/>\n",
 166 |       "      );\n",
 167 |       "    }\n",
 168 |       "}\n",
 169 |       "```\n",
 170 |       "\n",
 171 |       "\n",
 172 |       "![](https://i.imgur.com/BLVZLjk.png)\n",
 173 |       "\n",
 174 |       "Components bạn viết ra cũng có thể sử dụng `props`. Ví dụ dưới đây sẽ mô tả cách sử dụng `props` trong Component riêng biệt\n",
 175 |       "\n",
 176 |       "``` javascript\n",
 177 |       "import React, { Component } from 'react';\n",
 178 |       "import { Text, View } from 'react-native';\n",
 179 |       "\n",
 180 |       "class Greeting extends Component {\n",
 181 |       "  render() {\n",
 182 |       "    return (\n",
 183 |       "      <Text>Hello {this.props.name}!</Text>\n",
 184 |       "    );\n",
 185 |       "  }\n",
 186 |       "}\n",
 187 |       "\n",
 188 |       "export default class LotsOfGreetings extends Component {\n",
 189 |       "  render() {\n",
 190 |       "    return (\n",
 191 |       "      <View style={{alignItems: 'center'}}>\n",
 192 |       "        <Greeting name='Rexxar' />\n",
 193 |       "        <Greeting name='Jaina' />\n",
 194 |       "        <Greeting name='Valeera' />\n",
 195 |       "      </View>\n",
 196 |       "    );\n",
 197 |       "  }\n",
 198 |       "}\n",
 199 |       "```\n",
 200 |       "\n",
 201 |       "![](https://i.imgur.com/ThWUrYc.png)\n",
 202 |       "\n",
 203 |       "Sử dụng prop `name` giúp bạn có thể custom `Greeting` component 1 cách dễ dàng và giúp tái sử dụng lại component ở nhiều nơi\n",
 204 |       "\n",
 205 |       "## 2. State\n",
 206 |       "Trong khi `props` là không thể thay đổi thì `state` là kiểu dữ liệu có thể update được trong tương lai\n",
 207 |       "\n",
 208 |       "Nghe có vẻ khó hiểu nhưng bạn xem ví dụ dưới đây cho rõ ràng\n",
 209 |       "\n",
 210 |       "```javascript\n",
 211 |       "import React, { Component } from 'react';\n",
 212 |       "import { Text, View } from 'react-native';\n",
 213 |       "\n",
 214 |       "class Blink extends Component {\n",
 215 |       "  constructor(props) {\n",
 216 |       "    super(props);\n",
 217 |       "    this.state = {isShowingText: true};\n",
 218 |       "\n",
 219 |       "    // Toggle the state every second\n",
 220 |       "    setInterval(() => {\n",
 221 |       "      this.setState(previousState => {\n",
 222 |       "        return { isShowingText: !previousState.isShowingText };\n",
 223 |       "      });\n",
 224 |       "    }, 1000);\n",
 225 |       "  }\n",
 226 |       "\n",
 227 |       "  render() {\n",
 228 |       "    let display = this.state.isShowingText ? this.props.text : ' ';\n",
 229 |       "    return (\n",
 230 |       "      <Text>{display}</Text>\n",
 231 |       "    );\n",
 232 |       "  }\n",
 233 |       "}\n",
 234 |       "\n",
 235 |       "export default class BlinkApp extends Component {\n",
 236 |       "  render() {\n",
 237 |       "    return (\n",
 238 |       "      <View>\n",
 239 |       "        <Blink text='I love to blink' />\n",
 240 |       "        <Blink text='Yes blinking is so great' />\n",
 241 |       "        <Blink text='Why did they ever take this out of HTML' />\n",
 242 |       "        <Blink text='Look at me look at me look at me' />\n",
 243 |       "      </View>\n",
 244 |       "    );\n",
 245 |       "  }\n",
 246 |       "}\n",
 247 |       "```\n",
 248 |       "\n",
 249 |       "![](https://i.imgur.com/fyuYkCx.gif)\n",
 250 |       "\n",
 251 |       "Trong demo ở trên là ví dụ về sau 1 giây sẽ nhấp nháy chữ.\n",
 252 |       "\n",
 253 |       "Trong đó:\n",
 254 |       " - `props`: `text` trong `Blink` component\n",
 255 |       " - `state`: `isShowingText` là cờ để hiển thị\n",
 256 |       "\n",
 257 |       "\n",
 258 |       "\n",
 259 |       "## 3. Style\n",
 260 |       " `Style` sẽ gợi nhắc các bạn nghĩ ngay đến style của Web. Tất cả cấu trúc, biến, giá trị hầu hết đều giống với CSS\n",
 261 |       " Bạn hãy xem ví dụ dưới đây \n",
 262 |       " \n",
 263 |       "```javascript\n",
 264 |       "import React, { Component } from 'react';\n",
 265 |       "import { StyleSheet, Text, View } from 'react-native';\n",
 266 |       "\n",
 267 |       "export default class LotsOfStyles extends Component {\n",
 268 |       "  render() {\n",
 269 |       "    return (\n",
 270 |       "      <View>\n",
 271 |       "        <Text style={styles.red}>just red</Text>\n",
 272 |       "        <Text style={styles.bigblue}>just bigblue</Text>\n",
 273 |       "        <Text style={[styles.bigblue, styles.red]}>bigblue, then red</Text>\n",
 274 |       "        <Text style={[styles.red, styles.bigblue]}>red, then bigblue</Text>\n",
 275 |       "      </View>\n",
 276 |       "    );\n",
 277 |       "  }\n",
 278 |       "}\n",
 279 |       "\n",
 280 |       "const styles = StyleSheet.create({\n",
 281 |       "  bigblue: {\n",
 282 |       "    color: 'blue',\n",
 283 |       "    fontWeight: 'bold',\n",
 284 |       "    fontSize: 30,\n",
 285 |       "  },\n",
 286 |       "  red: {\n",
 287 |       "    color: 'red',\n",
 288 |       "  },\n",
 289 |       "});\n",
 290 |       "```\n",
 291 |       "\n",
 292 |       "Trong đó:\n",
 293 |       "- set 1 phần tử: `style={styles.red}` \n",
 294 |       "- set 2 phần tử trở lên sẽ phải để trong `[]` ví dụ như `style={[styles.red, styles.bigblue]}`\n",
 295 |       "\n",
 296 |       "## 4. Handling Touches\n",
 297 |       "\n",
 298 |       "Các phần trên là mới chỉ đáp ứng được việc hiển thị lên app. Còn trong phần này sẽ giúp cho User tương tác được với App. Đơn giản nhất là sự kiện click vào 1 nút\n",
 299 |       "\n",
 300 |       "Để dễ dàng nhất bạn hãy dùng cấu trúc như sau:\n",
 301 |       "\n",
 302 |       "```xml\n",
 303 |       "<View>\n",
 304 |       "    <Button \n",
 305 |       "     onPress={callFunction}       \n",
 306 |       "     />\n",
 307 |       "</View>\n",
 308 |       "```\n",
 309 |       "\n",
 310 |       "Để chi tiết hơn nữa bạn xem ví dụ sau:\n",
 311 |       "\n",
 312 |       "```javascript\n",
 313 |       "import React, { Component } from 'react';\n",
 314 |       "import { Alert, Button, StyleSheet, View } from 'react-native';\n",
 315 |       "\n",
 316 |       "export default class ButtonBasics extends Component {\n",
 317 |       "  _onPressButton() {\n",
 318 |       "    Alert.alert('You tapped the button!')\n",
 319 |       "  }\n",
 320 |       "\n",
 321 |       "  render() {\n",
 322 |       "    return (\n",
 323 |       "      <View style={styles.container}>\n",
 324 |       "        <View style={styles.buttonContainer}>\n",
 325 |       "          <Button\n",
 326 |       "            onPress={this._onPressButton}\n",
 327 |       "            title=\"Press Me\"\n",
 328 |       "          />\n",
 329 |       "        </View>\n",
 330 |       "        <View style={styles.buttonContainer}>\n",
 331 |       "          <Button\n",
 332 |       "            onPress={this._onPressButton}\n",
 333 |       "            title=\"Press Me\"\n",
 334 |       "            color=\"#841584\"\n",
 335 |       "          />\n",
 336 |       "        </View>\n",
 337 |       "        <View style={styles.alternativeLayoutButtonContainer}>\n",
 338 |       "          <Button\n",
 339 |       "            onPress={this._onPressButton}\n",
 340 |       "            title=\"This looks great!\"\n",
 341 |       "          />\n",
 342 |       "          <Button\n",
 343 |       "            onPress={this._onPressButton}\n",
 344 |       "            title=\"OK!\"\n",
 345 |       "            color=\"#841584\"\n",
 346 |       "          />\n",
 347 |       "        </View>\n",
 348 |       "      </View>\n",
 349 |       "    );\n",
 350 |       "  }\n",
 351 |       "}\n",
 352 |       "\n",
 353 |       "const styles = StyleSheet.create({\n",
 354 |       "  container: {\n",
 355 |       "   flex: 1,\n",
 356 |       "   justifyContent: 'center',\n",
 357 |       "  },\n",
 358 |       "  buttonContainer: {\n",
 359 |       "    margin: 20\n",
 360 |       "  },\n",
 361 |       "  alternativeLayoutButtonContainer: {\n",
 362 |       "    margin: 20,\n",
 363 |       "    flexDirection: 'row',\n",
 364 |       "    justifyContent: 'space-between'\n",
 365 |       "  }\n",
 366 |       "})\n",
 367 |       "```\n",
 368 |       "\n",
 369 |       "![](https://i.imgur.com/YlyCGn2.png)\n",
 370 |       "\n",
 371 |       "## 5. List Views\n",
 372 |       "\n",
 373 |       "React Native cung cấp 2 loại view để hiển thị list. Đó là: `FlatList` và `SectionList`\n",
 374 |       "\n",
 375 |       "\n",
 376 |       "`FlatList` làm việc khá tốt và mượn mà với những list dài, số lượng item có thể thay đổi được. Không giống như `ScrollView` thì `FlatList` chỉ render những thành phần nào được hiển thị trên màn hình, chứ không phải là hiển thị tất cả các thành phần trong List lên cùng 1 lúc.\n",
 377 |       "\n",
 378 |       "`FlatList` yêu cầu 2 props cơ bản: `data`và `renderItem`\n",
 379 |       "Trong đó:\n",
 380 |       "- `data` là dữ liệu để hiển thị lên list\n",
 381 |       "- `renderItem` là cách trình bày, màu, font, cỡ chữ trên ứng dụng\n",
 382 |       "\n",
 383 |       "Ví dụ:\n",
 384 |       "\n",
 385 |       "```javascript\n",
 386 |       "import React, { Component } from 'react';\n",
 387 |       "import { FlatList, StyleSheet, Text, View } from 'react-native';\n",
 388 |       "\n",
 389 |       "export default class FlatListBasics extends Component {\n",
 390 |       "  render() {\n",
 391 |       "    return (\n",
 392 |       "      <View style={styles.container}>\n",
 393 |       "        <FlatList\n",
 394 |       "          data={[\n",
 395 |       "            {key: 'Devin'},\n",
 396 |       "            {key: 'Jackson'},\n",
 397 |       "            {key: 'James'},\n",
 398 |       "            {key: 'Joel'},\n",
 399 |       "            {key: 'John'},\n",
 400 |       "            {key: 'Jillian'},\n",
 401 |       "            {key: 'Jimmy'},\n",
 402 |       "            {key: 'Julie'},\n",
 403 |       "\n",
 404 |       "            {key: 'Devin1'},\n",
 405 |       "            {key: 'Jackson1'},\n",
 406 |       "            {key: 'James1'},\n",
 407 |       "            {key: 'Joel1'},\n",
 408 |       "            {key: 'John1'},\n",
 409 |       "            {key: 'Jillian1'},\n",
 410 |       "            {key: 'Jimmy1'},\n",
 411 |       "            {key: 'Julie1'},\n",
 412 |       "\n",
 413 |       "            {key: 'Devin2'},\n",
 414 |       "            {key: 'Jackson2'},\n",
 415 |       "            {key: 'James2'},\n",
 416 |       "            {key: 'Joel2'},\n",
 417 |       "            {key: 'John2'},\n",
 418 |       "            {key: 'Jillian2'},\n",
 419 |       "            {key: 'Jimmy2'},\n",
 420 |       "            {key: 'Julie2'},\n",
 421 |       "          ]}\n",
 422 |       "          renderItem={({item}) => <Text style={styles.item}>{item.key}</Text>}\n",
 423 |       "        />\n",
 424 |       "      </View>\n",
 425 |       "    );\n",
 426 |       "  }\n",
 427 |       "}\n",
 428 |       "\n",
 429 |       "const styles = StyleSheet.create({\n",
 430 |       "  container: {\n",
 431 |       "   flex: 1,\n",
 432 |       "   paddingTop: 22\n",
 433 |       "  },\n",
 434 |       "  item: {\n",
 435 |       "    padding: 10,\n",
 436 |       "    fontSize: 18,\n",
 437 |       "    height: 44,\n",
 438 |       "  },\n",
 439 |       "})\n",
 440 |       "```\n",
 441 |       "\n",
 442 |       "![](https://i.imgur.com/RZFZcsT.gif)\n",
 443 |       "\n",
 444 |       "Nếu bạn muốn chia list hiển thị thành các nhóm khác nhau thì React Native cũng cung cấp cho bạn `SectionList` để làm việc này\n",
 445 |       "\n",
 446 |       "Ví dụ:\n",
 447 |       "\n",
 448 |       "```javascript\n",
 449 |       "import React, { Component } from 'react';\n",
 450 |       "import { SectionList, StyleSheet, Text, View } from 'react-native';\n",
 451 |       "\n",
 452 |       "export default class SectionListBasics extends Component {\n",
 453 |       "  render() {\n",
 454 |       "    return (\n",
 455 |       "      <View style={styles.container}>\n",
 456 |       "        <SectionList\n",
 457 |       "          sections={[\n",
 458 |       "            {title: 'D', data: ['Devin']},\n",
 459 |       "            {title: 'J', data: ['Jackson', 'James', 'Jillian', 'Jimmy', 'Joel', 'John', 'Julie']},\n",
 460 |       "          ]}\n",
 461 |       "          renderItem={({item}) => <Text style={styles.item}>{item}</Text>}\n",
 462 |       "          renderSectionHeader={({section}) => <Text style={styles.sectionHeader}>{section.title}</Text>}\n",
 463 |       "          keyExtractor={(item, index) => index}\n",
 464 |       "        />\n",
 465 |       "      </View>\n",
 466 |       "    );\n",
 467 |       "  }\n",
 468 |       "}\n",
 469 |       "\n",
 470 |       "const styles = StyleSheet.create({\n",
 471 |       "  container: {\n",
 472 |       "   flex: 1,\n",
 473 |       "   paddingTop: 22\n",
 474 |       "  },\n",
 475 |       "  sectionHeader: {\n",
 476 |       "    paddingTop: 2,\n",
 477 |       "    paddingLeft: 10,\n",
 478 |       "    paddingRight: 10,\n",
 479 |       "    paddingBottom: 2,\n",
 480 |       "    fontSize: 14,\n",
 481 |       "    fontWeight: 'bold',\n",
 482 |       "    backgroundColor: 'rgba(247,247,247,1.0)',\n",
 483 |       "  },\n",
 484 |       "  item: {\n",
 485 |       "    padding: 10,\n",
 486 |       "    fontSize: 18,\n",
 487 |       "    height: 44,\n",
 488 |       "  },\n",
 489 |       "})\n",
 490 |       "```\n",
 491 |       "\n",
 492 |       "![](https://i.imgur.com/9dLhnEq.png)\n",
 493 |       "# Kết luận\n",
 494 |       "Chúc bạn chạy được example mà không gặp phải lỗi nào nhé. Nếu có vấn đề gì thì bạn có thể comment dưới đây để mình support nhé\n",
 495 |       "\n",
 496 |       "Nguồn tham khảo: https://facebook.github.io/react-native/docs/tutorial.html\n"
 497 |      ]
 498 |     }
 499 |    ],
 500 |    "source": [
 501 |     "print(content[0])"
 502 |    ]
 503 |   },
 504 |   {
 505 |    "cell_type": "markdown",
 506 |    "metadata": {},
 507 |    "source": [
 508 |     "We will remove documents whose text length is at most 1000 characters, because many of these short documents do not have any meaningful content."
 509 |    ]
 510 |   },
 511 |   {
 512 |    "cell_type": "code",
 513 |    "execution_count": 7,
 514 |    "metadata": {},
 515 |    "outputs": [],
 516 |    "source": [
 517 |     "documents = []\n",
 518 |     "for c in content:\n",
 519 |     "    if len(c) > 1000:\n",
 520 |     "        documents.append(c)"
 521 |    ]
 522 |   },
 523 |   {
 524 |    "cell_type": "code",
 525 |    "execution_count": 8,
 526 |    "metadata": {},
 527 |    "outputs": [
 528 |     {
 529 |      "name": "stdout",
 530 |      "output_type": "stream",
 531 |      "text": [
 532 |       "13887\n"
 533 |      ]
 534 |     }
 535 |    ],
 536 |    "source": [
 537 |     "print(len(documents))"
 538 |    ]
 539 |   },
 540 |   {
 541 |    "cell_type": "markdown",
 542 |    "metadata": {},
 543 |    "source": [
 544 |     "# Implementation"
 545 |    ]
 546 |   },
 547 |   {
 548 |    "cell_type": "markdown",
 549 |    "metadata": {},
 550 |    "source": [
 551 |     "## TF-IDF"
 552 |    ]
 553 |   },
 554 |   {
 555 |    "cell_type": "code",
 556 |    "execution_count": 14,
 557 |    "metadata": {},
 558 |    "outputs": [],
 559 |    "source": [
 560 |     "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer"
 561 |    ]
 562 |   },
 563 |   {
 564 |    "cell_type": "markdown",
 565 |    "metadata": {},
 566 |    "source": [
 567 |     "#### Get the corpus used in TfIdf model"
 568 |    ]
 569 |   },
 570 |   {
 571 |    "cell_type": "code",
 572 |    "execution_count": 10,
 573 |    "metadata": {},
 574 |    "outputs": [],
 575 |    "source": [
 576 |     "def get_corpus(documents):\n",
 577 |     "    \"\"\"Return one word-segmented, space-separated string per document.\"\"\"\n",
 578 |     "    corpus = []\n",
 579 |     "    for doc in tqdm(documents):\n",
 580 |     "        # lowercase the raw text and split it into word tokens with gensim\n",
 581 |     "        tokens = gensim.utils.simple_preprocess(doc)\n",
 582 |     "        # the documents are mostly written in Vietnamese, so the words are\n",
 583 |     "        # segmented with pyvi, a library that supports Vietnamese\n",
 584 |     "        sentence = ViTokenizer.tokenize(' '.join(tokens))\n",
 585 |     "        # tokenize() returns a str here (get_training_corpus calls .split on its\n",
 586 |     "        # result), so it is stored as is: ' '.join(str) would put a space between\n",
 587 |     "        # every character and my_tokenizer would then yield single letters\n",
 588 |     "        corpus.append(sentence)\n",
 589 |     "    return corpus"
 590 |    ]
 591 |   },
 592 |   {
 593 |    "cell_type": "code",
 594 |    "execution_count": 11,
 595 |    "metadata": {
 596 |     "scrolled": true
 597 |    },
 598 |    "outputs": [
 599 |     {
 600 |      "name": "stderr",
 601 |      "output_type": "stream",
 602 |      "text": [
 603 |       "100%|██████████| 13887/13887 [05:11<00:00, 44.63it/s]\n"
 604 |      ]
 605 |     }
 606 |    ],
 607 |    "source": [
 608 |     "corpus = get_corpus(documents)"
 609 |    ]
 610 |   },
 611 |   {
 612 |    "cell_type": "markdown",
 613 |    "metadata": {},
 614 |    "source": [
 615 |     "#### Customize the tokenizer used by TfidfVectorizer()"
 616 |    ]
 617 |   },
 618 |   {
 619 |    "cell_type": "code",
 620 |    "execution_count": 15,
 621 |    "metadata": {},
 622 |    "outputs": [],
 623 |    "source": [
 624 |     "def my_tokenizer(doc):\n",
 625 |     "#     doc = ViTokenizer.tokenize(doc)\n",
 626 |     "    doc = doc.split(\" \")\n",
 627 |     "    return doc"
 628 |    ]
 629 |   },
 630 |   {
 631 |    "cell_type": "markdown",
 632 |    "metadata": {},
 633 |    "source": [
 634 |     "#### Create a vectorizer object with max features (number of words in the dictionary) set to 20000"
 635 |    ]
 636 |   },
 637 |   {
 638 |    "cell_type": "code",
 639 |    "execution_count": 16,
 640 |    "metadata": {},
 641 |    "outputs": [],
 642 |    "source": [
 643 |     "vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, max_features=20000)"
 644 |    ]
 645 |   },
 646 |   {
 647 |    "cell_type": "markdown",
 648 |    "metadata": {},
 649 |    "source": [
 650 |     "#### Train model"
 651 |    ]
 652 |   },
 653 |   {
 654 |    "cell_type": "code",
 655 |    "execution_count": 17,
 656 |    "metadata": {},
 657 |    "outputs": [],
 658 |    "source": [
 659 |     "res_corpus = vectorizer.fit_transform(corpus)"
 660 |    ]
 661 |   },
 662 |   {
 663 |    "cell_type": "code",
 664 |    "execution_count": 19,
 665 |    "metadata": {},
 666 |    "outputs": [
 667 |     {
 668 |      "name": "stdout",
 669 |      "output_type": "stream",
 670 |      "text": [
 671 |       "20000\n"
 672 |      ]
 673 |     }
 674 |    ],
 675 |    "source": [
 676 |     "print(len(vectorizer.get_feature_names()))\n"
 677 |    ]
 678 |   },
 679 |   {
 680 |    "cell_type": "markdown",
 681 |    "metadata": {},
 682 |    "source": [
 683 |     "## SVD"
 684 |    ]
 685 |   },
 686 |   {
 687 |    "cell_type": "markdown",
 688 |    "metadata": {},
 689 |    "source": [
 690 |     "#### Reduce document vector's dimensions"
 691 |    ]
 692 |   },
 693 |   {
 694 |    "cell_type": "code",
 695 |    "execution_count": 20,
 696 |    "metadata": {},
 697 |    "outputs": [],
 698 |    "source": [
 699 |     "from sklearn.decomposition import TruncatedSVD\n",
 700 |     "svd = TruncatedSVD(n_components=300, random_state=42)"
 701 |    ]
 702 |   },
 703 |   {
 704 |    "cell_type": "code",
 705 |    "execution_count": 21,
 706 |    "metadata": {},
 707 |    "outputs": [],
 708 |    "source": [
 709 |     "truncated_corpus = svd.fit_transform(res_corpus)"
 710 |    ]
 711 |   },
 712 |   {
 713 |    "cell_type": "code",
 714 |    "execution_count": 22,
 715 |    "metadata": {},
 716 |    "outputs": [
 717 |     {
 718 |      "name": "stdout",
 719 |      "output_type": "stream",
 720 |      "text": [
 721 |       "(13887, 300)\n"
 722 |      ]
 723 |     }
 724 |    ],
 725 |    "source": [
 726 |     "print(truncated_corpus.shape)"
 727 |    ]
 728 |   },
 729 |   {
 730 |    "cell_type": "markdown",
 731 |    "metadata": {},
 732 |    "source": [
 733 |     "#### Save model"
 734 |    ]
 735 |   },
 736 |   {
 737 |    "cell_type": "code",
 738 |    "execution_count": 24,
 739 |    "metadata": {},
 740 |    "outputs": [
 741 |     {
 742 |      "name": "stdout",
 743 |      "output_type": "stream",
 744 |      "text": [
 745 |       "SAVED SUCESS\n"
 746 |      ]
 747 |     }
 748 |    ],
 749 |    "source": [
 750 |     "filename = 'tfidf_svd_20k_to_128.pkl'\n",
 751 |     "pickle.dump(svd, open(filename, 'wb'))\n",
 752 |     "print(\"SAVED SUCESS\")"
 753 |    ]
 754 |   },
 755 |   {
 756 |    "cell_type": "markdown",
 757 |    "metadata": {},
 758 |    "source": [
 759 |     "#### Calculate similarity values between documents"
 760 |    ]
 761 |   },
 762 |   {
 763 |    "cell_type": "code",
 764 |    "execution_count": 25,
 765 |    "metadata": {},
 766 |    "outputs": [],
 767 |    "source": [
 768 |     "from sklearn.metrics.pairwise import cosine_similarity"
 769 |    ]
 770 |   },
 771 |   {
 772 |    "cell_type": "code",
 773 |    "execution_count": 26,
 774 |    "metadata": {},
 775 |    "outputs": [],
 776 |    "source": [
 777 |     "similarity_matrix = cosine_similarity(truncated_corpus, truncated_corpus)"
 778 |    ]
 779 |   },
 780 |   {
 781 |    "cell_type": "markdown",
 782 |    "metadata": {},
 783 |    "source": [
 784 |     "#### Test on the first document: get the 10 most similar documents"
 785 |    ]
 786 |   },
 787 |   {
 788 |    "cell_type": "code",
 789 |    "execution_count": 34,
 790 |    "metadata": {},
 791 |    "outputs": [
 792 |     {
 793 |      "data": {
 794 |       "text/plain": [
 795 |        "array([ 3515,   752,  9364, 12871, 11940,  1489,  2265, 13167, 11466,\n",
 796 |        "           0])"
 797 |       ]
 798 |      },
 799 |      "execution_count": 34,
 800 |      "metadata": {},
 801 |      "output_type": "execute_result"
 802 |     }
 803 |    ],
 804 |    "source": [
 805 |     "np.argsort(similarity_matrix[0])[-10:]"
 806 |    ]
 807 |   },
 808 |   {
 809 |    "cell_type": "code",
 810 |    "execution_count": 39,
 811 |    "metadata": {},
 812 |    "outputs": [
 813 |     {
 814 |      "name": "stdout",
 815 |      "output_type": "stream",
 816 |      "text": [
 817 |       "react native là gì react native là một frameworkework cho phép bạn xây_dựng một ứng_dụng trên native platforms sử_dụng kinh_nghiệm của lập_trình_viên bằng javascript và react http_facebook github io react trọng_tâm của react native là hiệu_quả của nhà phát_triển trên tất_cả các nền_tảng mà bạn quan_tâm học một lần viết trên bất_cứ nền_tảng nào facebook sử_dụng react native trên nhiều ứng_dụng và tiếp_tục đầu_tư vào react native native components các thành_phần native với react native bạn có_thể sử_dụng các thành_phần cơ_bản mặc_định của nền_tảng như uitabbar trên ios và drawer trên android điều này cho phép ứng_dụng có sự nhất_quán với phần còn lại của nền_tảng và giữ cho chất_lượng sản_phẩm cao các thành_phần này có_thể dễ_dàng tích_hợp vào ứng_dụng của bạn sử_dụng react component counterparts chẳng_hạn như tabbarios and javascript ios import react component tabbarios navigatorios from react native class app extends component render return tabbarios tabbarios item title react native selected true navigatorios initialroute title react native tabbarios item tabbarios javascript android import react component text from react native class app extends component render return text react native text asynchronous execution xử_lý không đồng_bộ mọi hoạt_động giữa javascript code và native platform đều được thực_hiện không đồng_bộ và các native module cũng có_thể sử_dụng thêm các threads khác tốt điều này có nghĩa_là chúng_ta có_thể giải_mã ảnh không nằm tren main thread lưu vào bộ_nhớ trong luồng background tính_toán và sắp_xếp giao_diện sẽ không làm đơ giao_diện và còn nhiều hơn thế nữa kết_quả là ứng_dụng được viết bằng react native hoạt_động và đáp_ứng giống như app được viết bằng native các phương_án giao_tiếp cũng được hỗ_trợ đầy_đủ cho phép bạn sử_dụng chrome developer tool để debug javascript khi chạy các ứng_dụng hoàn_chỉnh trên công_cụ mô_phỏng hoặc thiết_bị thật png uploads ea_da ac afc png touch handling xử_lý tương_tác chạm react native đã 
thực_hiện một hệ_thống mạnh_mẽ để xử_lý các thao_tác chạm trên giao_diện và cung_cấp các công_cụ như tích_hợp các giao_diện có_thể cuộn được và nhiều phần_tử khác mà không cần cấu_hình gì thêm javascipt ios android import react component scrollview text from react native class touchdemo extends component render return scrollview onpress console log pressed text proper touch handling text scrollview flexbox and styling sắp_xếp bố_trí giao_diện cần được thực_hiện một các dễ_dàng đó là lý_do vì sao react native sử_dụng chế_độ flexbox layout từ web vào react native flexbox khiến việc xây_dựng hầu_hết các giao_diện cơ_bản trở thàn công_việc dễ_dàng như giao_diện stacked và nested boxes với margin và padding react native cũng hỗ_trợ các web styles phổ_biến như fontweight và cung_cấp một các tối_ưu để giải_quyết các vấn_đề về thiết_kế giao_diện javascript ios android var react component image stylesheet text view from react native class reactnative extends component render return view style styles row image source uri http_facebook github io react img logo_og png style styles image view style styles text text style styles title react native text text style styles subtitle build high quality mobile apps using react text view view var styles stylesheet create row flexdirection row margin image width height marginright text flex justifycontent center title fontsize fontweight bold subtitle fontsize extensibility khả_năng mở_rộng điều tuyệt_vời là bạn có_thể xây_dựng một ứng_dụng bằng react native mà không cần viết cho mỗi nền_tảng tuy_nhiên react native cũng được thiết_kế để dễ_dàng mở_rộng với các views và modules native đã được tuỳ biến điều này có nghĩa_là bạn có_thể tái sử_dụng tất_cả những gì bạn đã xây_dựng trước đó hoặc có_thể sử_dụng các thư_viện native mà bạn thích tạo modules cho ios để tạo modules trên ios bạn tạo một class kế_thừa rctbridgemodule prototcol và viết hàm bạn muốn có bên javascript trong hàm ngoài_ra chính class phải được exported rõ_ràng với 
objective objective import rctbridgemodule interface mycustommodule nsobject rctbridgemodule end implementation mycustommodule available as nativemodules mycustommodule processstring processstring nsstring input callback callback callback input goodbye withstring hello end javascript import react component nativemodules text from react native class message extends component constructor props super props this state text goodbye world nativemodules mycustommodule processstring this state text text this setstate text render return text this state text text tạo ios views_tuỳ chỉnh ios views có_thể kế_thừa class rctviewmanager thực_hiện hàm uiview và đăng_ký các thuộc_tính với macro objective objective import rctviewmanager interface rctviewmanager end implementation uiview view return mycustomview alloc init nsstring end javascript javascript import react component from react native var mycustomview mycustomview export default class mycustomview extends component static proptypes react proptypes oneof render return this props\n"
 818 |      ]
 819 |     }
 820 |    ],
 821 |    "source": [
 822 |     "print(corpus[11466])"
 823 |    ]
 824 |   },
 825 |   {
 826 |    "cell_type": "markdown",
 827 |    "metadata": {},
 828 |    "source": [
 829 |     "## Doc2Vec"
 830 |    ]
 831 |   },
 832 |   {
 833 |    "cell_type": "markdown",
 834 |    "metadata": {},
 835 |    "source": [
 836 |     "#### Create the corpus used to train the Doc2Vec model"
 837 |    ]
 838 |   },
 839 |   {
 840 |    "cell_type": "code",
 841 |    "execution_count": 9,
 842 |    "metadata": {},
 843 |    "outputs": [],
 844 |    "source": [
 845 |     "def get_training_corpus(documents):\n",
 846 |     "    \"\"\"Turn each document into a TaggedDocument tagged with its list index.\"\"\"\n",
 847 |     "    tagged_corpus = []\n",
 848 |     "    for tag, text in enumerate(tqdm(documents)):\n",
 849 |     "        # lowercase the raw text and split it into word tokens with gensim\n",
 850 |     "        tokens = gensim.utils.simple_preprocess(text)\n",
 851 |     "        # the documents are mostly written in Vietnamese, so the words are\n",
 852 |     "        # segmented with pyvi, a library that supports Vietnamese\n",
 853 |     "        segmented = ViTokenizer.tokenize(' '.join(tokens))\n",
 854 |     "        # wrap the word list in the format expected by Doc2Vec below\n",
 855 |     "        tagged_corpus.append(\n",
 856 |     "            gensim.models.doc2vec.TaggedDocument(segmented.split(\" \"), [tag])\n",
 857 |     "        )\n",
 858 |     "    return tagged_corpus"
 859 |    ]
 860 |   },
 861 |   {
 862 |    "cell_type": "code",
 863 |    "execution_count": null,
 864 |    "metadata": {},
 865 |    "outputs": [],
 866 |    "source": [
 867 |     "train_corpus = get_training_corpus(documents)"
 868 |    ]
 869 |   },
 870 |   {
 871 |    "cell_type": "markdown",
 872 |    "metadata": {},
 873 |    "source": [
 874 |     "#### Build Doc2Vec Model"
 875 |    ]
 876 |   },
 877 |   {
 878 |    "cell_type": "code",
 879 |    "execution_count": null,
 880 |    "metadata": {},
 881 |    "outputs": [],
 882 |    "source": [
 883 |     "# build a Doc2Vec model with vector size 300\n",
 884 |     "# remove all the words which occur less than 2 times\n",
 885 |     "# training in 40 epochs\n",
 886 |     "model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40)\n",
 887 |     "model.build_vocab(train_corpus)"
 888 |    ]
 889 |   },
 890 |   {
 891 |    "cell_type": "code",
 892 |    "execution_count": null,
 893 |    "metadata": {},
 894 |    "outputs": [],
 895 |    "source": [
 896 |     "# train model and get the training time\n",
 897 |     "%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)"
 898 |    ]
 899 |   },
 900 |   {
 901 |    "cell_type": "markdown",
 902 |    "metadata": {},
 903 |    "source": [
 904 |     "#### Test for first document"
 905 |    ]
 906 |   },
 907 |   {
 908 |    "cell_type": "code",
 909 |    "execution_count": null,
 910 |    "metadata": {},
 911 |    "outputs": [],
 912 |    "source": [
 913 |     "# get the vector for first document\n",
 914 |     "vector = model.infer_vector(train_corpus[0].words)"
 915 |    ]
 916 |   },
 917 |   {
 918 |    "cell_type": "code",
 919 |    "execution_count": null,
 920 |    "metadata": {},
 921 |    "outputs": [],
 922 |    "source": [
 923 |     "# get the 100 documents most similar to the first document\n",
 924 |     "sims = model.docvecs.most_similar([vector], topn=100)"
 925 |    ]
 926 |   },
 927 |   {
 928 |    "cell_type": "code",
 929 |    "execution_count": null,
 930 |    "metadata": {
 931 |     "scrolled": true
 932 |    },
 933 |    "outputs": [],
 934 |    "source": [
 935 |     "for i in range(10):\n",
 936 |     "    print(i, \" : \", sims[i][0])"
 937 |    ]
 938 |   },
 939 |   {
 940 |    "cell_type": "code",
 941 |    "execution_count": null,
 942 |    "metadata": {},
 943 |    "outputs": [],
 944 |    "source": []
 945 |   },
 946 |   {
 947 |    "cell_type": "code",
 948 |    "execution_count": null,
 949 |    "metadata": {},
 950 |    "outputs": [],
 951 |    "source": []
 952 |   },
 953 |   {
 954 |    "cell_type": "code",
 955 |    "execution_count": null,
 956 |    "metadata": {},
 957 |    "outputs": [],
 958 |    "source": []
 959 |   },
 960 |   {
 961 |    "cell_type": "code",
 962 |    "execution_count": null,
 963 |    "metadata": {},
 964 |    "outputs": [],
 965 |    "source": []
 966 |   },
 967 |   {
 968 |    "cell_type": "code",
 969 |    "execution_count": null,
 970 |    "metadata": {},
 971 |    "outputs": [],
 972 |    "source": []
 973 |   },
 974 |   {
 975 |    "cell_type": "code",
 976 |    "execution_count": null,
 977 |    "metadata": {},
 978 |    "outputs": [],
 979 |    "source": []
 980 |   },
 981 |   {
 982 |    "cell_type": "code",
 983 |    "execution_count": null,
 984 |    "metadata": {},
 985 |    "outputs": [],
 986 |    "source": []
 987 |   },
 988 |   {
 989 |    "cell_type": "code",
 990 |    "execution_count": null,
 991 |    "metadata": {},
 992 |    "outputs": [],
 993 |    "source": []
 994 |   },
 995 |   {
 996 |    "cell_type": "code",
 997 |    "execution_count": null,
 998 |    "metadata": {},
 999 |    "outputs": [],
1000 |    "source": []
1001 |   },
1002 |   {
1003 |    "cell_type": "code",
1004 |    "execution_count": null,
1005 |    "metadata": {},
1006 |    "outputs": [],
1007 |    "source": []
1008 |   }
1009 |  ],
1010 |  "metadata": {
1011 |   "kernelspec": {
1012 |    "display_name": "Python 3",
1013 |    "language": "python",
1014 |    "name": "python3"
1015 |   },
1016 |   "language_info": {
1017 |    "codemirror_mode": {
1018 |     "name": "ipython",
1019 |     "version": 3
1020 |    },
1021 |    "file_extension": ".py",
1022 |    "mimetype": "text/x-python",
1023 |    "name": "python",
1024 |    "nbconvert_exporter": "python",
1025 |    "pygments_lexer": "ipython3",
1026 |    "version": "3.6.5"
1027 |   }
1028 |  },
1029 |  "nbformat": 4,
1030 |  "nbformat_minor": 2
1031 | }
1032 | 


--------------------------------------------------------------------------------
/Viblo Similarity Documents/Similarity Documents.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Abstract"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "In this tutorial, we will implement some algorithms for finding similar documents. The document dataset was crawled from the Viblo website: $\\textbf{viblo.asia}$"
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "markdown",
  19 |    "metadata": {},
  20 |    "source": [
  21 |     "## Algorithms\n",
  22 |     "1. TF-IDF + SVD\n",
  23 |     "2. DOC2VEC"
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "markdown",
  28 |    "metadata": {},
  29 |    "source": [
  30 |     "# Preprocessing Data"
  31 |    ]
  32 |   },
  33 |   {
  34 |    "cell_type": "code",
  35 |    "execution_count": 1,
  36 |    "metadata": {},
  37 |    "outputs": [],
  38 |    "source": [
  39 |     "import gensim\n",
  40 |     "import os\n",
  41 |     "import collections\n",
  42 |     "import smart_open\n",
  43 |     "import random\n",
  44 |     "import pickle\n",
  45 |     "from pyvi import ViTokenizer, ViPosTagger\n",
  46 |     "from tqdm import tqdm\n",
  47 |     "import numpy as np"
  48 |    ]
  49 |   },
  50 |   {
  51 |    "cell_type": "markdown",
  52 |    "metadata": {},
  53 |    "source": [
  54 |     "We will load the data from the data folder. The file \"contents_markdown_viblo.pkl\" contains a list of documents."
  55 |    ]
  56 |   },
  57 |   {
  58 |    "cell_type": "code",
  59 |    "execution_count": 2,
  60 |    "metadata": {},
  61 |    "outputs": [
  62 |     {
  63 |      "ename": "FileNotFoundError",
  64 |      "evalue": "[Errno 2] No such file or directory: 'data/contents_markdown_viblo.pkl'",
  65 |      "output_type": "error",
  66 |      "traceback": [
  67 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
  68 |       "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
  69 |       "\u001b[0;32m<ipython-input-2-84c55f04dd8a>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'data/contents_markdown_viblo.pkl'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'rb'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m     \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpickle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
  70 |       "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'data/contents_markdown_viblo.pkl'"
  71 |      ]
  72 |     }
  73 |    ],
  74 |    "source": [
  75 |     "with open('data/contents_markdown_viblo.pkl', 'rb') as f:\n",
  76 |     "    content = pickle.load(f)"
  77 |    ]
  78 |   },
  79 |   {
  80 |    "cell_type": "code",
  81 |    "execution_count": 6,
  82 |    "metadata": {
  83 |     "scrolled": false
  84 |    },
  85 |    "outputs": [
  86 |     {
  87 |      "name": "stdout",
  88 |      "output_type": "stream",
  89 |      "text": [
  90 |       "<class 'list'>\n",
  91 |       "14157\n"
  92 |      ]
  93 |     }
  94 |    ],
  95 |    "source": [
  96 |     "print(type(content))\n",
  97 |     "print(len(content))"
  98 |    ]
  99 |   },
 100 |   {
 101 |    "cell_type": "code",
 102 |    "execution_count": 40,
 103 |    "metadata": {},
 104 |    "outputs": [
 105 |     {
 106 |      "name": "stdout",
 107 |      "output_type": "stream",
 108 |      "text": [
 109 |       "### Trong phần này tôi sẽ giúp các bạn mới học `React Native` hiểu qua phần cài đặt project đã tồn tại và cùng nhau tìm hiểu khái niệm cơ bản nhất của React Native.\n",
 110 |       "# I. Một số chia sẻ cá nhân\n",
 111 |       "\n",
 112 |       "## 1. Yêu cầu thứ 1\n",
 113 |       "Yêu cầu để thực hiện theo bài viết này, bạn hãy cài đặt môi trường theo bài viết dưới đây\n",
 114 |       "\n",
 115 |       "### [Học React Native từ cơ bản đến nâng cao - Phần 1 Hướng dẫn cài đặt và chạy \"Hello world\"](https://viblo.asia/p/hoc-react-native-tu-co-ban-den-nang-cao-phan-1-huong-dan-cai-dat-va-chay-hello-world-RQqKLYW0Z7z)\n",
 116 |       "\n",
 117 |       "## 2. Yêu cầu thứ 2\n",
 118 |       "Khi muốn code React Native bạn sẽ phải chọn cho mình 1 Editors phù hợp cho mình\n",
 119 |       "\n",
 120 |       "Bạn có thể tham khảo các Editors theo link dưới đây\n",
 121 |       "\n",
 122 |       "### [Top 10 Editors For React Native Mobile App Development](https://www.icicletech.com/blog/top-10-editors-for-react-native)\n",
 123 |       "\n",
 124 |       "Còn lựa chọn của tôi là [**`Visual Studio Code`**](https://code.visualstudio.com/) vì đơn giản là dùng khá nhẹ và tiện dụng\n",
 125 |       "\n",
 126 |       "![](https://i.imgur.com/ZfAI0VK.png)\n",
 127 |       "\n",
 128 |       "Nếu bạn cũng chọn **`Visual Studio Code`** như tôi thì tiếp tục cài các Extension theo bài viết dưới đây để giúp việc code React Native trở nên dễ dàng hơn\n",
 129 |       "\n",
 130 |       "### VSCode for React Native: \n",
 131 |       "[https://medium.com/react-native-training/vscode-for-react-native-526ec4a368ce](https://medium.com/react-native-training/vscode-for-react-native-526ec4a368ce)\n",
 132 |       "\n",
 133 |       "## 3. Yêu cầu thứ 3\n",
 134 |       "\n",
 135 |       "Kéo code ví dụ React Native về máy bạn:\n",
 136 |       "\n",
 137 |       "- Bạn mở terminal \n",
 138 |       "\n",
 139 |       "    **`git clone https://github.com/oTranThanhNghia/LearnReactNative.git`**\n",
 140 |       "\n",
 141 |       "- Khi đã kéo code về xong vào trong folder `LearnReactNative` mà vừa kéo về\n",
 142 |       "\n",
 143 |       "    gõ lệnh sau để cấu hình project: **`npm install react-native@0.55.4`**\n",
 144 |       "\n",
 145 |       "- Cấu hình lại Android SDK trong file `local.properties` trong folder của Android `LearnReactNative\\android` ở  như sau: \n",
 146 |       "```\n",
 147 |       "sdk.dir=D\\:\\\\Android\\\\Sdk\n",
 148 |       "```\n",
 149 |       "- Chạy android gõ: `react-native run-android`\n",
 150 |       "    Khi nào terminal hiển thị là bạn đã thành công. \n",
 151 |       "    \n",
 152 |       "![](https://i.imgur.com/ianKmsQ.png)\n",
 153 |       "\n",
 154 |       "Nếu bạn gặp phải lỗi sau thì hãy kiểm tra lại `port 8081` có đang sử dụng không. Nếu có thì tắt nó đi và gõ lại lệnh `npm start` để khởi động lại Metro Bundler\n",
 155 |       "\n",
 156 |       "![](https://i.imgur.com/GjvmD4f.png)\n",
 157 |       "\n",
 158 |       "* Ngoài ra bạn hãy vào index.js để chọn example để chạy nhé:\n",
 159 |       "\n",
 160 |       "![](https://i.imgur.com/WSeVoA8.png)\n",
 161 |       "\n",
 162 |       "# II. Khái niệm cơ bản trong React Native\n",
 163 |       "## 1. Props\n",
 164 |       "Hầu hết các thành phần có thể custom được khi được tạo ra với các tham số khác nhau. Các tham số này được gọi là `props`\n",
 165 |       "Ví dụ về `Image` sẽ giúp bạn sử dụng prop `source` để hiển thị ảnh\n",
 166 |       "\n",
 167 |       "```javascript\n",
 168 |       "import React, {Component} from 'react';\n",
 169 |       "import {Image} from 'react-native';\n",
 170 |       "\n",
 171 |       "export default class Bananas extends Component {\n",
 172 |       "    render() {\n",
 173 |       "      let pic = {\n",
 174 |       "        uri: 'https://upload.wikimedia.org/wikipedia/commons/d/de/Bananavarieties.jpg'\n",
 175 |       "      };\n",
 176 |       "      return (\n",
 177 |       "        <Image source={pic} style={{width: 193, height: 110}}/>\n",
 178 |       "      );\n",
 179 |       "    }\n",
 180 |       "}\n",
 181 |       "```\n",
 182 |       "\n",
 183 |       "\n",
 184 |       "![](https://i.imgur.com/BLVZLjk.png)\n",
 185 |       "\n",
 186 |       "Components bạn viết ra cũng có thể sử dụng `props`. Ví dụ dưới đây sẽ mô tả cách sử dụng `props` trong Component riêng biệt\n",
 187 |       "\n",
 188 |       "``` javascript\n",
 189 |       "import React, { Component } from 'react';\n",
 190 |       "import { Text, View } from 'react-native';\n",
 191 |       "\n",
 192 |       "class Greeting extends Component {\n",
 193 |       "  render() {\n",
 194 |       "    return (\n",
 195 |       "      <Text>Hello {this.props.name}!</Text>\n",
 196 |       "    );\n",
 197 |       "  }\n",
 198 |       "}\n",
 199 |       "\n",
 200 |       "export default class LotsOfGreetings extends Component {\n",
 201 |       "  render() {\n",
 202 |       "    return (\n",
 203 |       "      <View style={{alignItems: 'center'}}>\n",
 204 |       "        <Greeting name='Rexxar' />\n",
 205 |       "        <Greeting name='Jaina' />\n",
 206 |       "        <Greeting name='Valeera' />\n",
 207 |       "      </View>\n",
 208 |       "    );\n",
 209 |       "  }\n",
 210 |       "}\n",
 211 |       "```\n",
 212 |       "\n",
 213 |       "![](https://i.imgur.com/ThWUrYc.png)\n",
 214 |       "\n",
 215 |       "Sử dụng prop `name` giúp bạn có thể custom `Greeting` component 1 cách dễ dàng và giúp tái sử dụng lại component ở nhiều nơi\n",
 216 |       "\n",
 217 |       "## 2. State\n",
 218 |       "Trong khi `props` là không thể thay đổi thì `state` là kiểu dữ liệu có thể update được trong tương lai\n",
 219 |       "\n",
 220 |       "Nghe có vẻ khó hiểu nhưng bạn xem ví dụ dưới đây cho rõ ràng\n",
 221 |       "\n",
 222 |       "```javascript\n",
 223 |       "import React, { Component } from 'react';\n",
 224 |       "import { Text, View } from 'react-native';\n",
 225 |       "\n",
 226 |       "class Blink extends Component {\n",
 227 |       "  constructor(props) {\n",
 228 |       "    super(props);\n",
 229 |       "    this.state = {isShowingText: true};\n",
 230 |       "\n",
 231 |       "    // Toggle the state every second\n",
 232 |       "    setInterval(() => {\n",
 233 |       "      this.setState(previousState => {\n",
 234 |       "        return { isShowingText: !previousState.isShowingText };\n",
 235 |       "      });\n",
 236 |       "    }, 1000);\n",
 237 |       "  }\n",
 238 |       "\n",
 239 |       "  render() {\n",
 240 |       "    let display = this.state.isShowingText ? this.props.text : ' ';\n",
 241 |       "    return (\n",
 242 |       "      <Text>{display}</Text>\n",
 243 |       "    );\n",
 244 |       "  }\n",
 245 |       "}\n",
 246 |       "\n",
 247 |       "export default class BlinkApp extends Component {\n",
 248 |       "  render() {\n",
 249 |       "    return (\n",
 250 |       "      <View>\n",
 251 |       "        <Blink text='I love to blink' />\n",
 252 |       "        <Blink text='Yes blinking is so great' />\n",
 253 |       "        <Blink text='Why did they ever take this out of HTML' />\n",
 254 |       "        <Blink text='Look at me look at me look at me' />\n",
 255 |       "      </View>\n",
 256 |       "    );\n",
 257 |       "  }\n",
 258 |       "}\n",
 259 |       "```\n",
 260 |       "\n",
 261 |       "![](https://i.imgur.com/fyuYkCx.gif)\n",
 262 |       "\n",
 263 |       "Trong demo ở trên là ví dụ về sau 1 giây sẽ nhấp nháy chữ.\n",
 264 |       "\n",
 265 |       "Trong đó:\n",
 266 |       " - `props`: `text` trong `Blink` component\n",
 267 |       " - `state`: `isShowingText` là cờ để hiển thị\n",
 268 |       "\n",
 269 |       "\n",
 270 |       "\n",
 271 |       "## 3. Style\n",
 272 |       " `Style` sẽ gợi nhắc các bạn nghĩ ngay đến style của Web. Tất cả cấu trúc, biến, giá trị hầu hết đều giống với CSS\n",
 273 |       " Bạn hãy xem ví dụ dưới đây \n",
 274 |       " \n",
 275 |       "```javascript\n",
 276 |       "import React, { Component } from 'react';\n",
 277 |       "import { StyleSheet, Text, View } from 'react-native';\n",
 278 |       "\n",
 279 |       "export default class LotsOfStyles extends Component {\n",
 280 |       "  render() {\n",
 281 |       "    return (\n",
 282 |       "      <View>\n",
 283 |       "        <Text style={styles.red}>just red</Text>\n",
 284 |       "        <Text style={styles.bigblue}>just bigblue</Text>\n",
 285 |       "        <Text style={[styles.bigblue, styles.red]}>bigblue, then red</Text>\n",
 286 |       "        <Text style={[styles.red, styles.bigblue]}>red, then bigblue</Text>\n",
 287 |       "      </View>\n",
 288 |       "    );\n",
 289 |       "  }\n",
 290 |       "}\n",
 291 |       "\n",
 292 |       "const styles = StyleSheet.create({\n",
 293 |       "  bigblue: {\n",
 294 |       "    color: 'blue',\n",
 295 |       "    fontWeight: 'bold',\n",
 296 |       "    fontSize: 30,\n",
 297 |       "  },\n",
 298 |       "  red: {\n",
 299 |       "    color: 'red',\n",
 300 |       "  },\n",
 301 |       "});\n",
 302 |       "```\n",
 303 |       "\n",
 304 |       "Trong đó:\n",
 305 |       "- set 1 phần tử: `style={styles.red}` \n",
 306 |       "- set 2 phần tử trở lên sẽ phải để trong `[]` ví dụ như `style={[styles.red, styles.bigblue]}`\n",
 307 |       "\n",
 308 |       "## 4. Handling Touches\n",
 309 |       "\n",
 310 |       "Các phần trên là mới chỉ đáp ứng được việc hiển thị lên app. Còn trong phần này sẽ giúp cho User tương tác được với App. Đơn giản nhất là sự kiện click vào 1 nút\n",
 311 |       "\n",
 312 |       "Để dễ dàng nhất bạn hãy dùng cấu trúc như sau:\n",
 313 |       "\n",
 314 |       "```xml\n",
 315 |       "<View>\n",
 316 |       "    <Button \n",
 317 |       "     onPress={callFunction}       \n",
 318 |       "     />\n",
 319 |       "</View>\n",
 320 |       "```\n",
 321 |       "\n",
 322 |       "Để chi tiết hơn nữa bạn xem ví dụ sau:\n",
 323 |       "\n",
 324 |       "```javascript\n",
 325 |       "import React, { Component } from 'react';\n",
 326 |       "import { Alert, Button, StyleSheet, View } from 'react-native';\n",
 327 |       "\n",
 328 |       "export default class ButtonBasics extends Component {\n",
 329 |       "  _onPressButton() {\n",
 330 |       "    Alert.alert('You tapped the button!')\n",
 331 |       "  }\n",
 332 |       "\n",
 333 |       "  render() {\n",
 334 |       "    return (\n",
 335 |       "      <View style={styles.container}>\n",
 336 |       "        <View style={styles.buttonContainer}>\n",
 337 |       "          <Button\n",
 338 |       "            onPress={this._onPressButton}\n",
 339 |       "            title=\"Press Me\"\n",
 340 |       "          />\n",
 341 |       "        </View>\n",
 342 |       "        <View style={styles.buttonContainer}>\n",
 343 |       "          <Button\n",
 344 |       "            onPress={this._onPressButton}\n",
 345 |       "            title=\"Press Me\"\n",
 346 |       "            color=\"#841584\"\n",
 347 |       "          />\n",
 348 |       "        </View>\n",
 349 |       "        <View style={styles.alternativeLayoutButtonContainer}>\n",
 350 |       "          <Button\n",
 351 |       "            onPress={this._onPressButton}\n",
 352 |       "            title=\"This looks great!\"\n",
 353 |       "          />\n",
 354 |       "          <Button\n",
 355 |       "            onPress={this._onPressButton}\n",
 356 |       "            title=\"OK!\"\n",
 357 |       "            color=\"#841584\"\n",
 358 |       "          />\n",
 359 |       "        </View>\n",
 360 |       "      </View>\n",
 361 |       "    );\n",
 362 |       "  }\n",
 363 |       "}\n",
 364 |       "\n",
 365 |       "const styles = StyleSheet.create({\n",
 366 |       "  container: {\n",
 367 |       "   flex: 1,\n",
 368 |       "   justifyContent: 'center',\n",
 369 |       "  },\n",
 370 |       "  buttonContainer: {\n",
 371 |       "    margin: 20\n",
 372 |       "  },\n",
 373 |       "  alternativeLayoutButtonContainer: {\n",
 374 |       "    margin: 20,\n",
 375 |       "    flexDirection: 'row',\n",
 376 |       "    justifyContent: 'space-between'\n",
 377 |       "  }\n",
 378 |       "})\n",
 379 |       "```\n",
 380 |       "\n",
 381 |       "![](https://i.imgur.com/YlyCGn2.png)\n",
 382 |       "\n",
 383 |       "## 5. List Views\n",
 384 |       "\n",
 385 |       "React Native cung cấp 2 loại view để hiển thị list. Đó là: `FlatList` và `SectionList`\n",
 386 |       "\n",
 387 |       "\n",
 388 |       "`FlatList` làm việc khá tốt và mượn mà với những list dài, số lượng item có thể thay đổi được. Không giống như `ScrollView` thì `FlatList` chỉ render những thành phần nào được hiển thị trên màn hình, chứ không phải là hiển thị tất cả các thành phần trong List lên cùng 1 lúc.\n",
 389 |       "\n",
 390 |       "`FlatList` yêu cầu 2 props cơ bản: `data`và `renderItem`\n",
 391 |       "Trong đó:\n",
 392 |       "- `data` là dữ liệu để hiển thị lên list\n",
 393 |       "- `renderItem` là cách trình bày, màu, font, cỡ chữ trên ứng dụng\n",
 394 |       "\n",
 395 |       "Ví dụ:\n",
 396 |       "\n",
 397 |       "```javascript\n",
 398 |       "import React, { Component } from 'react';\n",
 399 |       "import { FlatList, StyleSheet, Text, View } from 'react-native';\n",
 400 |       "\n",
 401 |       "export default class FlatListBasics extends Component {\n",
 402 |       "  render() {\n",
 403 |       "    return (\n",
 404 |       "      <View style={styles.container}>\n",
 405 |       "        <FlatList\n",
 406 |       "          data={[\n",
 407 |       "            {key: 'Devin'},\n",
 408 |       "            {key: 'Jackson'},\n",
 409 |       "            {key: 'James'},\n",
 410 |       "            {key: 'Joel'},\n",
 411 |       "            {key: 'John'},\n",
 412 |       "            {key: 'Jillian'},\n",
 413 |       "            {key: 'Jimmy'},\n",
 414 |       "            {key: 'Julie'},\n",
 415 |       "\n",
 416 |       "            {key: 'Devin1'},\n",
 417 |       "            {key: 'Jackson1'},\n",
 418 |       "            {key: 'James1'},\n",
 419 |       "            {key: 'Joel1'},\n",
 420 |       "            {key: 'John1'},\n",
 421 |       "            {key: 'Jillian1'},\n",
 422 |       "            {key: 'Jimmy1'},\n",
 423 |       "            {key: 'Julie1'},\n",
 424 |       "\n",
 425 |       "            {key: 'Devin2'},\n",
 426 |       "            {key: 'Jackson2'},\n",
 427 |       "            {key: 'James2'},\n",
 428 |       "            {key: 'Joel2'},\n",
 429 |       "            {key: 'John2'},\n",
 430 |       "            {key: 'Jillian2'},\n",
 431 |       "            {key: 'Jimmy2'},\n",
 432 |       "            {key: 'Julie2'},\n",
 433 |       "          ]}\n",
 434 |       "          renderItem={({item}) => <Text style={styles.item}>{item.key}</Text>}\n",
 435 |       "        />\n",
 436 |       "      </View>\n",
 437 |       "    );\n",
 438 |       "  }\n",
 439 |       "}\n",
 440 |       "\n",
 441 |       "const styles = StyleSheet.create({\n",
 442 |       "  container: {\n",
 443 |       "   flex: 1,\n",
 444 |       "   paddingTop: 22\n",
 445 |       "  },\n",
 446 |       "  item: {\n",
 447 |       "    padding: 10,\n",
 448 |       "    fontSize: 18,\n",
 449 |       "    height: 44,\n",
 450 |       "  },\n",
 451 |       "})\n",
 452 |       "```\n",
 453 |       "\n",
 454 |       "![](https://i.imgur.com/RZFZcsT.gif)\n",
 455 |       "\n",
 456 |       "Nếu bạn muốn chia list hiển thị thành các nhóm khác nhau thì React Native cũng cung cấp cho bạn `SectionList` để làm việc này\n",
 457 |       "\n",
 458 |       "Ví dụ:\n",
 459 |       "\n",
 460 |       "```javascript\n",
 461 |       "import React, { Component } from 'react';\n",
 462 |       "import { SectionList, StyleSheet, Text, View } from 'react-native';\n",
 463 |       "\n",
 464 |       "export default class SectionListBasics extends Component {\n",
 465 |       "  render() {\n",
 466 |       "    return (\n",
 467 |       "      <View style={styles.container}>\n",
 468 |       "        <SectionList\n",
 469 |       "          sections={[\n",
 470 |       "            {title: 'D', data: ['Devin']},\n",
 471 |       "            {title: 'J', data: ['Jackson', 'James', 'Jillian', 'Jimmy', 'Joel', 'John', 'Julie']},\n",
 472 |       "          ]}\n",
 473 |       "          renderItem={({item}) => <Text style={styles.item}>{item}</Text>}\n",
 474 |       "          renderSectionHeader={({section}) => <Text style={styles.sectionHeader}>{section.title}</Text>}\n",
 475 |       "          keyExtractor={(item, index) => index}\n",
 476 |       "        />\n",
 477 |       "      </View>\n",
 478 |       "    );\n",
 479 |       "  }\n",
 480 |       "}\n",
 481 |       "\n",
 482 |       "const styles = StyleSheet.create({\n",
 483 |       "  container: {\n",
 484 |       "   flex: 1,\n",
 485 |       "   paddingTop: 22\n",
 486 |       "  },\n",
 487 |       "  sectionHeader: {\n",
 488 |       "    paddingTop: 2,\n",
 489 |       "    paddingLeft: 10,\n",
 490 |       "    paddingRight: 10,\n",
 491 |       "    paddingBottom: 2,\n",
 492 |       "    fontSize: 14,\n",
 493 |       "    fontWeight: 'bold',\n",
 494 |       "    backgroundColor: 'rgba(247,247,247,1.0)',\n",
 495 |       "  },\n",
 496 |       "  item: {\n",
 497 |       "    padding: 10,\n",
 498 |       "    fontSize: 18,\n",
 499 |       "    height: 44,\n",
 500 |       "  },\n",
 501 |       "})\n",
 502 |       "```\n",
 503 |       "\n",
 504 |       "![](https://i.imgur.com/9dLhnEq.png)\n",
 505 |       "# Kết luận\n",
 506 |       "Chúc bạn chạy được example mà không gặp phải lỗi nào nhé. Nếu có vấn đề gì thì bạn có thể comment dưới đây để mình support nhé\n",
 507 |       "\n",
 508 |       "Nguồn tham khảo: https://facebook.github.io/react-native/docs/tutorial.html\n"
 509 |      ]
 510 |     }
 511 |    ],
 512 |    "source": [
 513 |     "print(content[0])"
 514 |    ]
 515 |   },
 516 |   {
 517 |    "cell_type": "markdown",
 518 |    "metadata": {},
 519 |    "source": [
 520 |     "We will remove documents whose text is not longer than 1000 characters, because many of these short documents carry no meaningful content."
 521 |    ]
 522 |   },
 523 |   {
 524 |    "cell_type": "code",
 525 |    "execution_count": 7,
 526 |    "metadata": {},
 527 |    "outputs": [],
 528 |    "source": [
 529 |     "documents = []\n",
 530 |     "for c in content:\n",
 531 |     "    if len(c) > 1000:\n",
 532 |     "        documents.append(c)"
 533 |    ]
 534 |   },
 535 |   {
 536 |    "cell_type": "code",
 537 |    "execution_count": 8,
 538 |    "metadata": {},
 539 |    "outputs": [
 540 |     {
 541 |      "name": "stdout",
 542 |      "output_type": "stream",
 543 |      "text": [
 544 |       "13887\n"
 545 |      ]
 546 |     }
 547 |    ],
 548 |    "source": [
 549 |     "print(len(documents))"
 550 |    ]
 551 |   },
 552 |   {
 553 |    "cell_type": "markdown",
 554 |    "metadata": {},
 555 |    "source": [
 556 |     "# Implementation"
 557 |    ]
 558 |   },
 559 |   {
 560 |    "cell_type": "markdown",
 561 |    "metadata": {},
 562 |    "source": [
 563 |     "## TF-IDF"
 564 |    ]
 565 |   },
 566 |   {
 567 |    "cell_type": "code",
 568 |    "execution_count": 14,
 569 |    "metadata": {},
 570 |    "outputs": [],
 571 |    "source": [
 572 |     "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer"
 573 |    ]
 574 |   },
 575 |   {
 576 |    "cell_type": "markdown",
 577 |    "metadata": {},
 578 |    "source": [
 579 |     "#### Build the corpus used by the TF-IDF model"
 580 |    ]
 581 |   },
 582 |   {
 583 |    "cell_type": "code",
 584 |    "execution_count": 10,
 585 |    "metadata": {},
 586 |    "outputs": [],
 587 |    "source": [
 588 |     "def get_corpus(documents):\n",
 589 |     "    \"\"\"Return one cleaned, word-segmented string per input document.\n",
 590 |     "\n",
 591 |     "    Each document is normalised with gensim's simple_preprocess and then\n",
 592 |     "    segmented with pyvi's ViTokenizer (needed because the documents are\n",
 593 |     "    mostly Vietnamese), which returns a single space-separated string.\n",
 594 |     "    \"\"\"\n",
 595 |     "    corpus = []\n",
 596 |     "    for doc in tqdm(documents):\n",
 597 |     "        cleaned = ' '.join(gensim.utils.simple_preprocess(doc))\n",
 598 |     "        # ViTokenizer.tokenize already returns a str; joining it with\n",
 599 |     "        # ' ' again would split the text into single characters.\n",
 600 |     "        corpus.append(ViTokenizer.tokenize(cleaned))\n",
 601 |     "    return corpus"
 602 |    ]
 603 |   },
 604 |   {
 605 |    "cell_type": "code",
 606 |    "execution_count": 11,
 607 |    "metadata": {
 608 |     "scrolled": true
 609 |    },
 610 |    "outputs": [
 611 |     {
 612 |      "name": "stderr",
 613 |      "output_type": "stream",
 614 |      "text": [
 615 |       "100%|██████████| 13887/13887 [05:11<00:00, 44.63it/s]\n"
 616 |      ]
 617 |     }
 618 |    ],
 619 |    "source": [
 620 |     "corpus = get_corpus(documents)"
 621 |    ]
 622 |   },
 623 |   {
 624 |    "cell_type": "markdown",
 625 |    "metadata": {},
 626 |    "source": [
 627 |     "#### Customize the tokenizer used by TfidfVectorizer()"
 628 |    ]
 629 |   },
 630 |   {
 631 |    "cell_type": "code",
 632 |    "execution_count": 15,
 633 |    "metadata": {},
 634 |    "outputs": [],
 635 |    "source": [
 636 |     "def my_tokenizer(doc):\n",
 637 |     "    # The corpus is already word-segmented by ViTokenizer in get_corpus,\n",
 638 |     "    # so splitting on single spaces is enough to recover the tokens.\n",
 639 |     "    return doc.split(\" \")"
 640 |    ]
 641 |   },
 642 |   {
 643 |    "cell_type": "markdown",
 644 |    "metadata": {},
 645 |    "source": [
 646 |     "#### Create a vectorizer object with max features (number of words in the dictionary) set to 20000"
 647 |    ]
 648 |   },
 649 |   {
 650 |    "cell_type": "code",
 651 |    "execution_count": 16,
 652 |    "metadata": {},
 653 |    "outputs": [],
 654 |    "source": [
 655 |     "vectorizer = TfidfVectorizer(tokenizer=my_tokenizer, max_features=20000)"
 656 |    ]
 657 |   },
 658 |   {
 659 |    "cell_type": "markdown",
 660 |    "metadata": {},
 661 |    "source": [
 662 |     "#### Train model"
 663 |    ]
 664 |   },
 665 |   {
 666 |    "cell_type": "code",
 667 |    "execution_count": 17,
 668 |    "metadata": {},
 669 |    "outputs": [],
 670 |    "source": [
 671 |     "res_corpus = vectorizer.fit_transform(corpus)"
 672 |    ]
 673 |   },
 674 |   {
 675 |    "cell_type": "code",
 676 |    "execution_count": 19,
 677 |    "metadata": {},
 678 |    "outputs": [
 679 |     {
 680 |      "name": "stdout",
 681 |      "output_type": "stream",
 682 |      "text": [
 683 |       "20000\n"
 684 |      ]
 685 |     }
 686 |    ],
 687 |    "source": [
 688 |     "print(len(vectorizer.get_feature_names()))\n"
 689 |    ]
 690 |   },
 691 |   {
 692 |    "cell_type": "markdown",
 693 |    "metadata": {},
 694 |    "source": [
 695 |     "## SVD"
 696 |    ]
 697 |   },
 698 |   {
 699 |    "cell_type": "markdown",
 700 |    "metadata": {},
 701 |    "source": [
 702 |     "#### Reduce document vector's dimensions"
 703 |    ]
 704 |   },
 705 |   {
 706 |    "cell_type": "code",
 707 |    "execution_count": 20,
 708 |    "metadata": {},
 709 |    "outputs": [],
 710 |    "source": [
 711 |     "from sklearn.decomposition import TruncatedSVD\n",
 712 |     "svd = TruncatedSVD(n_components=300, random_state=42)"
 713 |    ]
 714 |   },
 715 |   {
 716 |    "cell_type": "code",
 717 |    "execution_count": 21,
 718 |    "metadata": {},
 719 |    "outputs": [],
 720 |    "source": [
 721 |     "truncated_corpus = svd.fit_transform(res_corpus)"
 722 |    ]
 723 |   },
 724 |   {
 725 |    "cell_type": "code",
 726 |    "execution_count": 22,
 727 |    "metadata": {},
 728 |    "outputs": [
 729 |     {
 730 |      "name": "stdout",
 731 |      "output_type": "stream",
 732 |      "text": [
 733 |       "(13887, 300)\n"
 734 |      ]
 735 |     }
 736 |    ],
 737 |    "source": [
 738 |     "print(truncated_corpus.shape)"
 739 |    ]
 740 |   },
 741 |   {
 742 |    "cell_type": "markdown",
 743 |    "metadata": {},
 744 |    "source": [
 745 |     "#### Save model"
 746 |    ]
 747 |   },
 748 |   {
 749 |    "cell_type": "code",
 750 |    "execution_count": 24,
 751 |    "metadata": {},
 752 |    "outputs": [
 753 |     {
 754 |      "name": "stdout",
 755 |      "output_type": "stream",
 756 |      "text": [
 757 |       "SAVED SUCESS\n"
 758 |      ]
 759 |     }
 760 |    ],
 761 |    "source": [
 762 |     "filename = 'tfidf_svd_20k_to_128.pkl'\n",
 763 |     "pickle.dump(svd, open(filename, 'wb'))\n",
 764 |     "print(\"SAVED SUCESS\")"
 765 |    ]
 766 |   },
 767 |   {
 768 |    "cell_type": "markdown",
 769 |    "metadata": {},
 770 |    "source": [
 771 |     "#### Calculate similarity values between documents"
 772 |    ]
 773 |   },
 774 |   {
 775 |    "cell_type": "code",
 776 |    "execution_count": 25,
 777 |    "metadata": {},
 778 |    "outputs": [],
 779 |    "source": [
 780 |     "from sklearn.metrics.pairwise import cosine_similarity"
 781 |    ]
 782 |   },
 783 |   {
 784 |    "cell_type": "code",
 785 |    "execution_count": 26,
 786 |    "metadata": {},
 787 |    "outputs": [],
 788 |    "source": [
 789 |     "similarity_matrix = cosine_similarity(truncated_corpus, truncated_corpus)"
 790 |    ]
 791 |   },
 792 |   {
 793 |    "cell_type": "markdown",
 794 |    "metadata": {},
 795 |    "source": [
 796 |     "#### Test on the first document: get the 10 most similar documents"
 797 |    ]
 798 |   },
 799 |   {
 800 |    "cell_type": "code",
 801 |    "execution_count": 34,
 802 |    "metadata": {},
 803 |    "outputs": [
 804 |     {
 805 |      "data": {
 806 |       "text/plain": [
 807 |        "array([ 3515,   752,  9364, 12871, 11940,  1489,  2265, 13167, 11466,\n",
 808 |        "           0])"
 809 |       ]
 810 |      },
 811 |      "execution_count": 34,
 812 |      "metadata": {},
 813 |      "output_type": "execute_result"
 814 |     }
 815 |    ],
 816 |    "source": [
 817 |     "np.argsort(similarity_matrix[0])[-10:]"
 818 |    ]
 819 |   },
 820 |   {
 821 |    "cell_type": "code",
 822 |    "execution_count": 39,
 823 |    "metadata": {},
 824 |    "outputs": [
 825 |     {
 826 |      "name": "stdout",
 827 |      "output_type": "stream",
 828 |      "text": [
 829 |       "react native là gì react native là một frameworkework cho phép bạn xây_dựng một ứng_dụng trên native platforms sử_dụng kinh_nghiệm của lập_trình_viên bằng javascript và react http_facebook github io react trọng_tâm của react native là hiệu_quả của nhà phát_triển trên tất_cả các nền_tảng mà bạn quan_tâm học một lần viết trên bất_cứ nền_tảng nào facebook sử_dụng react native trên nhiều ứng_dụng và tiếp_tục đầu_tư vào react native native components các thành_phần native với react native bạn có_thể sử_dụng các thành_phần cơ_bản mặc_định của nền_tảng như uitabbar trên ios và drawer trên android điều này cho phép ứng_dụng có sự nhất_quán với phần còn lại của nền_tảng và giữ cho chất_lượng sản_phẩm cao các thành_phần này có_thể dễ_dàng tích_hợp vào ứng_dụng của bạn sử_dụng react component counterparts chẳng_hạn như tabbarios and javascript ios import react component tabbarios navigatorios from react native class app extends component render return tabbarios tabbarios item title react native selected true navigatorios initialroute title react native tabbarios item tabbarios javascript android import react component text from react native class app extends component render return text react native text asynchronous execution xử_lý không đồng_bộ mọi hoạt_động giữa javascript code và native platform đều được thực_hiện không đồng_bộ và các native module cũng có_thể sử_dụng thêm các threads khác tốt điều này có nghĩa_là chúng_ta có_thể giải_mã ảnh không nằm tren main thread lưu vào bộ_nhớ trong luồng background tính_toán và sắp_xếp giao_diện sẽ không làm đơ giao_diện và còn nhiều hơn thế nữa kết_quả là ứng_dụng được viết bằng react native hoạt_động và đáp_ứng giống như app được viết bằng native các phương_án giao_tiếp cũng được hỗ_trợ đầy_đủ cho phép bạn sử_dụng chrome developer tool để debug javascript khi chạy các ứng_dụng hoàn_chỉnh trên công_cụ mô_phỏng hoặc thiết_bị thật png uploads ea_da ac afc png touch handling xử_lý tương_tác chạm react native đã 
thực_hiện một hệ_thống mạnh_mẽ để xử_lý các thao_tác chạm trên giao_diện và cung_cấp các công_cụ như tích_hợp các giao_diện có_thể cuộn được và nhiều phần_tử khác mà không cần cấu_hình gì thêm javascipt ios android import react component scrollview text from react native class touchdemo extends component render return scrollview onpress console log pressed text proper touch handling text scrollview flexbox and styling sắp_xếp bố_trí giao_diện cần được thực_hiện một các dễ_dàng đó là lý_do vì sao react native sử_dụng chế_độ flexbox layout từ web vào react native flexbox khiến việc xây_dựng hầu_hết các giao_diện cơ_bản trở thàn công_việc dễ_dàng như giao_diện stacked và nested boxes với margin và padding react native cũng hỗ_trợ các web styles phổ_biến như fontweight và cung_cấp một các tối_ưu để giải_quyết các vấn_đề về thiết_kế giao_diện javascript ios android var react component image stylesheet text view from react native class reactnative extends component render return view style styles row image source uri http_facebook github io react img logo_og png style styles image view style styles text text style styles title react native text text style styles subtitle build high quality mobile apps using react text view view var styles stylesheet create row flexdirection row margin image width height marginright text flex justifycontent center title fontsize fontweight bold subtitle fontsize extensibility khả_năng mở_rộng điều tuyệt_vời là bạn có_thể xây_dựng một ứng_dụng bằng react native mà không cần viết cho mỗi nền_tảng tuy_nhiên react native cũng được thiết_kế để dễ_dàng mở_rộng với các views và modules native đã được tuỳ biến điều này có nghĩa_là bạn có_thể tái sử_dụng tất_cả những gì bạn đã xây_dựng trước đó hoặc có_thể sử_dụng các thư_viện native mà bạn thích tạo modules cho ios để tạo modules trên ios bạn tạo một class kế_thừa rctbridgemodule prototcol và viết hàm bạn muốn có bên javascript trong hàm ngoài_ra chính class phải được exported rõ_ràng với 
objective objective import rctbridgemodule interface mycustommodule nsobject rctbridgemodule end implementation mycustommodule available as nativemodules mycustommodule processstring processstring nsstring input callback callback callback input goodbye withstring hello end javascript import react component nativemodules text from react native class message extends component constructor props super props this state text goodbye world nativemodules mycustommodule processstring this state text text this setstate text render return text this state text text tạo ios views_tuỳ chỉnh ios views có_thể kế_thừa class rctviewmanager thực_hiện hàm uiview và đăng_ký các thuộc_tính với macro objective objective import rctviewmanager interface rctviewmanager end implementation uiview view return mycustomview alloc init nsstring end javascript javascript import react component from react native var mycustomview mycustomview export default class mycustomview extends component static proptypes react proptypes oneof render return this props\n"
 830 |      ]
 831 |     }
 832 |    ],
 833 |    "source": [
 834 |     "print(corpus[11466])"
 835 |    ]
 836 |   },
 837 |   {
 838 |    "cell_type": "markdown",
 839 |    "metadata": {},
 840 |    "source": [
 841 |     "## Doc2Vec"
 842 |    ]
 843 |   },
 844 |   {
 845 |    "cell_type": "markdown",
 846 |    "metadata": {},
 847 |    "source": [
 848 |     "#### Create the corpus used to train the Doc2Vec model"
 849 |    ]
 850 |   },
 851 |   {
 852 |    "cell_type": "code",
 853 |    "execution_count": 9,
 854 |    "metadata": {},
 855 |    "outputs": [],
 856 |    "source": [
 857 |     "def get_training_corpus(documents):\n",
 858 |     "    \"\"\"Build the list of TaggedDocument objects used to train Doc2Vec.\n",
 859 |     "\n",
 860 |     "    Every document is run through gensim's simple_preprocess and pyvi's\n",
 861 |     "    ViTokenizer, then tagged with its position in `documents`.\n",
 862 |     "    \"\"\"\n",
 863 |     "    tagged_docs = []\n",
 864 |     "    for index, raw_text in enumerate(tqdm(documents)):\n",
 865 |     "        cleaned = ' '.join(gensim.utils.simple_preprocess(raw_text))\n",
 866 |     "        # ViTokenizer returns a space-separated string; split it into tokens\n",
 867 |     "        tokens = ViTokenizer.tokenize(cleaned).split(\" \")\n",
 868 |     "        tagged = gensim.models.doc2vec.TaggedDocument(tokens, [index])\n",
 869 |     "        tagged_docs.append(tagged)\n",
 870 |     "    return tagged_docs"
 871 |    ]
 872 |   },
 873 |   {
 874 |    "cell_type": "code",
 875 |    "execution_count": null,
 876 |    "metadata": {},
 877 |    "outputs": [],
 878 |    "source": [
 879 |     "train_corpus = get_training_corpus(documents)"
 880 |    ]
 881 |   },
 882 |   {
 883 |    "cell_type": "markdown",
 884 |    "metadata": {},
 885 |    "source": [
 886 |     "#### Build Doc2Vec Model"
 887 |    ]
 888 |   },
 889 |   {
 890 |    "cell_type": "code",
 891 |    "execution_count": null,
 892 |    "metadata": {},
 893 |    "outputs": [],
 894 |    "source": [
 895 |     "# build a Doc2Vec model with vector size 300\n",
 896 |     "# remove all the words which occur less than 2 times\n",
 897 |     "# training in 40 epochs\n",
 898 |     "model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40)\n",
 899 |     "model.build_vocab(train_corpus)"
 900 |    ]
 901 |   },
 902 |   {
 903 |    "cell_type": "code",
 904 |    "execution_count": null,
 905 |    "metadata": {},
 906 |    "outputs": [],
 907 |    "source": [
 908 |     "# train model and get the training time\n",
 909 |     "%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)"
 910 |    ]
 911 |   },
 912 |   {
 913 |    "cell_type": "markdown",
 914 |    "metadata": {},
 915 |    "source": [
 916 |     "#### Test for first document"
 917 |    ]
 918 |   },
 919 |   {
 920 |    "cell_type": "code",
 921 |    "execution_count": null,
 922 |    "metadata": {},
 923 |    "outputs": [],
 924 |    "source": [
 925 |     "# get the vector for first document\n",
 926 |     "vector = model.infer_vector(train_corpus[0].words)"
 927 |    ]
 928 |   },
 929 |   {
 930 |    "cell_type": "code",
 931 |    "execution_count": null,
 932 |    "metadata": {},
 933 |    "outputs": [],
 934 |    "source": [
 935 |     "# get first 100 similar documents to first document\n",
 936 |     "sims = model.docvecs.most_similar([vector], topn=100)"
 937 |    ]
 938 |   },
 939 |   {
 940 |    "cell_type": "code",
 941 |    "execution_count": null,
 942 |    "metadata": {
 943 |     "scrolled": true
 944 |    },
 945 |    "outputs": [],
 946 |    "source": [
 947 |     "for i in range(10):\n",
 948 |     "    print(i, \" : \", sims[i][0])"
 949 |    ]
 950 |   },
 951 |   {
 952 |    "cell_type": "code",
 953 |    "execution_count": null,
 954 |    "metadata": {},
 955 |    "outputs": [],
 956 |    "source": []
 957 |   },
 958 |   {
 959 |    "cell_type": "code",
 960 |    "execution_count": null,
 961 |    "metadata": {},
 962 |    "outputs": [],
 963 |    "source": []
 964 |   },
 965 |   {
 966 |    "cell_type": "code",
 967 |    "execution_count": null,
 968 |    "metadata": {},
 969 |    "outputs": [],
 970 |    "source": []
 971 |   },
 972 |   {
 973 |    "cell_type": "code",
 974 |    "execution_count": null,
 975 |    "metadata": {},
 976 |    "outputs": [],
 977 |    "source": []
 978 |   },
 979 |   {
 980 |    "cell_type": "code",
 981 |    "execution_count": null,
 982 |    "metadata": {},
 983 |    "outputs": [],
 984 |    "source": []
 985 |   },
 986 |   {
 987 |    "cell_type": "code",
 988 |    "execution_count": null,
 989 |    "metadata": {},
 990 |    "outputs": [],
 991 |    "source": []
 992 |   },
 993 |   {
 994 |    "cell_type": "code",
 995 |    "execution_count": null,
 996 |    "metadata": {},
 997 |    "outputs": [],
 998 |    "source": []
 999 |   },
1000 |   {
1001 |    "cell_type": "code",
1002 |    "execution_count": null,
1003 |    "metadata": {},
1004 |    "outputs": [],
1005 |    "source": []
1006 |   },
1007 |   {
1008 |    "cell_type": "code",
1009 |    "execution_count": null,
1010 |    "metadata": {},
1011 |    "outputs": [],
1012 |    "source": []
1013 |   },
1014 |   {
1015 |    "cell_type": "code",
1016 |    "execution_count": null,
1017 |    "metadata": {},
1018 |    "outputs": [],
1019 |    "source": []
1020 |   }
1021 |  ],
1022 |  "metadata": {
1023 |   "kernelspec": {
1024 |    "display_name": "Python 3",
1025 |    "language": "python",
1026 |    "name": "python3"
1027 |   },
1028 |   "language_info": {
1029 |    "codemirror_mode": {
1030 |     "name": "ipython",
1031 |     "version": 3
1032 |    },
1033 |    "file_extension": ".py",
1034 |    "mimetype": "text/x-python",
1035 |    "name": "python",
1036 |    "nbconvert_exporter": "python",
1037 |    "pygments_lexer": "ipython3",
1038 |    "version": "3.6.5"
1039 |   }
1040 |  },
1041 |  "nbformat": 4,
1042 |  "nbformat_minor": 2
1043 | }
1044 | 


--------------------------------------------------------------------------------
/Text Summarization/Text Summarization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Data was published at https://github.com/duyvuleo/VNTC"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "# Introduction"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "In this tutorial, we will implement several algorithms for the text summarization problem."
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## What is Text Summarization?"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "Text summarization is the problem of creating a short, accurate, and fluent summary of a longer text document.\n",
 36 |     "\n",
 37 |     "Automatic text summarization methods are greatly needed to address the ever-growing amount of text data available online to both better help discover relevant information and to consume relevant information faster."
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "## What will we do in this tutorial?"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "In this tutorial, we will solve Text Summarization for Vietnamese newspapers, using the algorithms below:\n",
 52 |     "1. Extractive Text Summarization\n",
 53 |     "    - Doc2Vec\n",
 54 |     "    - Text Rank\n",
 55 |     "2. Abstractive Text Summarization\n",
 56 |     "    - Google textsum\n",
 57 |     "\n",
 58 |     "\n",
 59 |     "We only address the \"**Single-document summarization**\" problem in this tutorial; the related \"**Multi-document summarization**\" problem will be discussed another time."
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "# Extractive Text Summarization"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "## Doc2Vec"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "example: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "### Basic idea\n",
 88 |     "The idea of using Doc2Vec algorithm for text summarization problem is described as follows:\n",
 89 |     "1. From every document, extract the sentences individually.\n",
 90 |     "2. Represent each sentence as a vector, using the Doc2Vec model.\n",
 91 |     "3. Use the K-Means algorithm to find the most representative sentences."
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 8,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "from sklearn import preprocessing\n",
101 |     "from sklearn.feature_extraction.text import TfidfVectorizer"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": 9,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "from pyvi import ViTokenizer, ViPosTagger\n",
111 |     "from tqdm import tqdm\n",
112 |     "import numpy as np\n",
113 |     "import gensim\n",
114 |     "import numpy as np"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 10,
120 |    "metadata": {
121 |     "scrolled": true
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "import os \n",
126 |     "# The corpus is expected in a 'Data' folder inside the parent of the current working directory.\n",
127 |     "dir_path = os.path.join(os.path.dirname(os.path.realpath(os.getcwd())), 'Data')\n",
128 |     "\n",
129 |     "sentences = []"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 11,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "import pickle\n",
139 |     "\n",
140 |     "def get_data(folder):\n",
141 |     "    \"\"\"Read every UTF-16 file in `folder` and return its preprocessed sentences.\n",
142 |     "\n",
143 |     "    Each line is split on '.', fragments of 10 characters or fewer are dropped, and\n",
144 |     "    every kept fragment is normalised with gensim's simple_preprocess, re-joined with\n",
145 |     "    spaces and passed through ViTokenizer.tokenize.\n",
146 |     "    \"\"\"\n",
147 |     "    collected = []\n",
148 |     "    for name in os.listdir(folder):\n",
149 |     "        with open(os.path.join(folder, name), 'r', encoding=\"utf-16\") as handle:\n",
150 |     "            for raw_line in handle.readlines():\n",
151 |     "                for fragment in raw_line.split('.'):\n",
152 |     "                    if len(fragment) <= 10:\n",
153 |     "                        continue\n",
154 |     "                    tokens = gensim.utils.simple_preprocess(fragment)\n",
155 |     "                    collected.append(ViTokenizer.tokenize(' '.join(tokens)))\n",
156 |     "\n",
157 |     "    return collected"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": []
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 5,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": []
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": 6,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "# sens = test_doc.split('.')\n",
181 |     "# for sen in sens:\n",
182 |     "#     if len(sen) > 10:\n",
183 |     "#         sen = gensim.utils.simple_preprocess(sen)\n",
184 |     "#         sen = ' '.join(sen)\n",
185 |     "#         sen = ViTokenizer.tokenize(sen)\n",
186 |     "#         sentences.append(sen)"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 7,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "# sentences"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {},
201 |    "source": [
202 |     "You could use multiprocessing here, but we do not, to keep the code easy to understand."
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": []
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 10,
215 |    "metadata": {
216 |     "scrolled": false
217 |    },
218 |    "outputs": [],
219 |    "source": [
220 |     "# from multiprocessing import Pool\n",
221 |     "# sentences = []\n",
222 |     "# train_paths = [os.path.join(dir_path, 'VNTC-master/Data/10Topics/Ver1.1/Train_Full'), \n",
223 |     "#                os.path.join(dir_path, 'VNTC-master/Data/10Topics/Ver1.1/Test_Full'),\n",
224 |     "#                os.path.join(dir_path, 'VNTC-master/Data/27Topics/Ver1.1/new train'),\n",
225 |     "#                os.path.join(dir_path, 'VNTC-master/Data/27Topics/Ver1.1/new test')]\n",
226 |     "\n",
227 |     "# dirs = []\n",
228 |     "# for path in train_paths:\n",
229 |     "#     for p in os.listdir(path):\n",
230 |     "#         dirs.append(os.path.join(path, p))\n",
231 |     "\n",
232 |     "# for d in tqdm(dirs):\n",
233 |     "#     sens = get_data(d)\n",
234 |     "#     sentences = sentences + sens\n",
235 |     "\n",
236 |     "# # with Pool(8) as pool:\n",
237 |     "# #     pool.map(get_data, tqdm(dirs))\n",
238 |     "\n"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 12,
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "with open('./sentences.pkl', 'rb') as f:  # cache originally written with pickle.dump(sentences, <file opened 'wb'>)\n",
248 |     "    sentences = pickle.load(f)"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": 13,
254 |    "metadata": {},
255 |    "outputs": [
256 |     {
257 |      "data": {
258 |       "text/plain": [
259 |        "'ông đồ cuối_cùng trên đảo'"
260 |       ]
261 |      },
262 |      "execution_count": 13,
263 |      "metadata": {},
264 |      "output_type": "execute_result"
265 |     }
266 |    ],
267 |    "source": [
268 |     "sentences[0]"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 14,
274 |    "metadata": {},
275 |    "outputs": [],
276 |    "source": [
277 |     "def get_corpus(sentences):\n",
278 |     "    \"\"\"Wrap every sentence in a gensim TaggedDocument tagged with its list index.\n",
279 |     "\n",
280 |     "    Each sentence string is split on single spaces to obtain its words, and the\n",
281 |     "    result holds one TaggedDocument per input sentence, in input order.\n",
282 |     "    \"\"\"\n",
283 |     "    corpus = []\n",
284 |     "    for position in tqdm(range(len(sentences))):\n",
285 |     "        words = sentences[position].split(' ')\n",
286 |     "        corpus.append(gensim.models.doc2vec.TaggedDocument(words, [position]))\n",
287 |     "\n",
288 |     "    return corpus"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 15,
294 |    "metadata": {},
295 |    "outputs": [
296 |     {
297 |      "name": "stderr",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "100%|██████████| 2385532/2385532 [00:34<00:00, 69769.71it/s]\n"
301 |      ]
302 |     }
303 |    ],
304 |    "source": [
305 |     "train_corpus = get_corpus(sentences)"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 11,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "from sklearn.utils import shuffle\n",
315 |     "# Fixed seed so the shuffled order (and the training subset sliced from it) is reproducible.\n",
316 |     "train_corpus = shuffle(train_corpus, random_state=0)"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "markdown",
321 |    "metadata": {},
322 |    "source": [
323 |     "#### Build Doc2Vec model"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 11,
329 |    "metadata": {},
330 |    "outputs": [],
331 |    "source": [
332 |     "model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40)\n",
333 |     "model.build_vocab(train_corpus)"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 13,
339 |    "metadata": {
340 |     "scrolled": true
341 |    },
342 |    "outputs": [
343 |     {
344 |      "name": "stderr",
345 |      "output_type": "stream",
346 |      "text": [
347 |       "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `iter` (Attribute will be removed in 4.0.0, use self.epochs instead).\n",
348 |       "  import sys\n"
349 |      ]
350 |     }
351 |    ],
352 |    "source": [
353 |     "# max_epochs = 40\n",
354 |     "\n",
355 |     "# for epoch in tqdm(range(max_epochs)):\n",
356 |     "#     print('iteration {0}'.format(epoch))\n",
357 |     "train_subset = train_corpus[:50000]  # train on a subset only\n",
358 |     "# total_examples must match what is passed to train(); `epochs` replaces the deprecated `iter` attribute\n",
359 |     "model.train(train_subset, total_examples=len(train_subset), epochs=model.epochs)\n",
360 |     "    \n",
361 |     "#     # decrease the learning rate\n",
362 |     "#     model.alpha -= 0.0002\n",
363 |     "#     # fix the learning rate, no decay\n",
364 |     "#     model.min_alpha = model.alpha\n",
365 |     "\n",
366 |     "# %time model.train(train_corpus[:50000], total_examples=model.corpus_count, epochs=model.epochs)"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 15,
372 |    "metadata": {},
373 |    "outputs": [],
374 |    "source": [
375 |     "model.save('model/model')"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "code",
380 |    "execution_count": 12,
381 |    "metadata": {},
382 |    "outputs": [],
383 |    "source": [
384 |     "model = gensim.models.doc2vec.Doc2Vec.load('model/model')"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": 21,
390 |    "metadata": {
391 |     "scrolled": true
392 |    },
393 |    "outputs": [
394 |     {
395 |      "data": {
396 |       "text/plain": [
397 |        "array([-0.04828287,  0.25527653,  1.1613333 , -0.43151897, -0.9858117 ,\n",
398 |        "        0.10932952,  0.20315444, -0.48530903,  0.24952224, -0.11833256,\n",
399 |        "       -0.0337567 , -0.3887124 , -0.39426357,  0.4454976 ,  0.64964545,\n",
400 |        "       -0.5074249 ,  0.2037328 ,  0.32153234, -0.62261915,  0.8188216 ,\n",
401 |        "        0.5820815 , -0.09879603, -0.44826344,  0.1201525 ,  0.236654  ,\n",
402 |        "        0.13032307, -0.46023956,  0.19788027, -0.34569028, -0.21599784,\n",
403 |        "        0.42319658, -0.106575  , -0.24495657, -0.00839793, -0.11475623,\n",
404 |        "       -0.5559897 , -0.12046688,  0.18673038, -0.16149993, -0.02872676,\n",
405 |        "        0.42999822,  0.46070522,  0.50624824, -0.15866163, -0.11092521,\n",
406 |        "        0.30938515,  0.23203233,  0.11736044, -0.7434822 , -0.78674805,\n",
407 |        "        0.27668393,  0.25058967, -0.15513541,  0.05721006, -0.62895125,\n",
408 |        "       -0.3618494 ,  0.48457113, -0.16074707,  0.32852057, -0.63208133,\n",
409 |        "       -0.45503548, -0.373764  ,  0.6417061 , -0.15453526,  0.828889  ,\n",
410 |        "        0.4040729 , -0.13313939,  0.20088702, -0.36382645,  0.3100666 ,\n",
411 |        "        0.02355373,  0.5920582 , -0.2271741 , -0.30618507, -0.23971866,\n",
412 |        "        0.91544545, -0.51666105, -0.05829609, -0.43708014,  0.35457942,\n",
413 |        "        0.50872976, -0.24838248,  0.44898847,  0.11512683,  0.34157744,\n",
414 |        "       -0.47279087, -0.02090802,  0.23195563, -0.14476988,  0.5966468 ,\n",
415 |        "        0.25278485,  0.70205003, -0.16960798, -0.09220067,  1.387285  ,\n",
416 |        "        0.5248568 ,  0.33318955, -0.33651793,  0.41348195, -0.94656795,\n",
417 |        "       -0.56593996,  0.6216159 ,  0.3179036 ,  0.31106716,  0.14830516,\n",
418 |        "        0.535672  ,  0.695546  ,  0.28968796,  0.4329898 , -0.6800865 ,\n",
419 |        "       -0.6313374 ,  0.36142987,  0.3392832 , -0.3685879 ,  1.0465527 ,\n",
420 |        "       -0.31610152,  0.26410806, -0.75767416, -0.0933219 , -0.10084625,\n",
421 |        "        0.11192366, -0.63711953,  0.6878306 ,  0.20774055,  0.37814376,\n",
422 |        "       -0.38910306, -0.29257646,  0.32447788,  1.4432929 ,  0.42116693,\n",
423 |        "        0.10012217, -0.54671454,  0.15930349, -0.04576634,  0.11046711,\n",
424 |        "        0.4345503 ,  0.5950319 ,  0.10390531,  0.00534402, -0.05976183,\n",
425 |        "        1.0111569 ,  0.14526764,  0.0051693 , -0.55909073,  0.18523502,\n",
426 |        "       -0.59934396,  0.24894848, -0.18078412,  0.5796731 , -0.44970104,\n",
427 |        "        0.81793183, -0.5046711 , -0.16381589,  0.14662668,  0.21144816,\n",
428 |        "        0.08799265, -0.25188333, -0.39610714, -0.46737796,  0.06498595,\n",
429 |        "       -0.24232577,  0.08590741, -0.34991795, -0.7811069 ,  0.05049568,\n",
430 |        "       -0.44203833, -0.04051779, -0.93674725,  0.7014623 ,  0.43860036,\n",
431 |        "        1.0785912 ,  0.4614321 ,  0.9178922 ,  0.01267096,  0.08151802,\n",
432 |        "       -0.21591717, -0.389159  , -0.4332839 ,  0.06478307, -0.549585  ,\n",
433 |        "        0.24735504, -0.15430401, -0.10635387,  0.9497028 , -0.5208101 ,\n",
434 |        "       -0.25834572,  0.5067593 , -0.3163417 , -0.45160556, -1.0110141 ,\n",
435 |        "       -0.11357957,  0.3088588 ,  0.67771375,  0.5347725 , -0.08545431,\n",
436 |        "       -0.6260072 ,  0.37074357,  0.3511689 ,  0.03659426, -0.5359085 ,\n",
437 |        "       -0.22255394, -0.4841223 , -0.31908542,  0.6693267 , -0.43263623,\n",
438 |        "        0.17883465,  0.76907945,  0.3865581 , -0.27964267,  0.5833102 ,\n",
439 |        "        0.10791489,  0.4569784 , -0.0223736 ,  0.48295155, -0.00460218,\n",
440 |        "       -0.47181183, -0.48191187,  0.1006198 , -0.30717742,  0.62139356,\n",
441 |        "        0.28134045,  0.29010874, -0.26925838,  0.8383542 , -0.18886985,\n",
442 |        "        0.18526816, -0.57650745, -0.59799755,  0.19990733,  0.22144596,\n",
443 |        "        0.70591587, -0.76111233,  0.13711332, -0.7318054 ,  0.02516509,\n",
444 |        "       -0.3590674 , -0.6440488 , -0.5580956 , -0.5993928 , -0.32801956,\n",
445 |        "       -0.4644991 ,  0.89624447, -0.39741072, -0.52681875, -0.29390556,\n",
446 |        "       -0.3324342 , -0.62701875,  0.12948091,  0.9591448 , -0.21732959,\n",
447 |        "       -0.6216343 , -0.04387471, -0.22252487,  0.27053964,  0.17134936,\n",
448 |        "        0.69296885,  0.39905074,  0.3307731 , -0.38610834,  0.05903669,\n",
449 |        "        0.40507847, -0.53825825,  0.08011609, -0.27195254, -0.296355  ,\n",
450 |        "        0.27324116,  0.5513492 ,  0.77330786, -0.6397054 , -0.24681841,\n",
451 |        "        0.2817206 ,  0.37891504,  0.03597298,  0.42222285, -0.06389087,\n",
452 |        "        0.39442137,  0.07020057, -0.24582939,  0.279675  ,  0.00950517,\n",
453 |        "       -0.60586107, -1.0425315 , -0.2628614 ,  0.20990998,  0.25524455,\n",
454 |        "       -0.27130723,  0.51966363, -0.14886895,  0.8109764 ,  0.258794  ,\n",
455 |        "       -0.05932726, -0.10472207,  0.06371555,  0.04762143,  0.02594266,\n",
456 |        "       -1.0294654 , -0.5873498 ,  0.60305655, -0.07507906, -0.3711069 ],\n",
457 |        "      dtype=float32)"
458 |       ]
459 |      },
460 |      "execution_count": 21,
461 |      "metadata": {},
462 |      "output_type": "execute_result"
463 |     }
464 |    ],
465 |    "source": [
466 |     "model.infer_vector(train_corpus[100000].words)"
467 |    ]
468 |   },
469 |   {
470 |    "cell_type": "markdown",
471 |    "metadata": {},
472 |    "source": [
473 |     "#### Test with new document"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": 1,
479 |    "metadata": {},
480 |    "outputs": [],
481 |    "source": [
482 |     "test_doc = '''Trong trận bán kết lượt về AFF Cup 2018 diễn ra trên sân vận động Mỹ Đình tối 6/12, đội tuyển Việt Nam đã vượt qua đội tuyển Philippines với tỉ số 2-1. Qua đó, nâng tổng tỉ số sau hai lượt trận bán kết là 4-2.\n",
483 |     "\n",
484 |     "Đội tuyển Việt Nam đã xuất sắc giành quyền vào chơi trận chung kết AFF Cup sau tròn 10 năm chờ đợi. Đối thủ của chúng ta là đội tuyển Malaysia.\n",
485 |     "\n",
486 |     "Hai cầu thủ ghi bàn thắng trên sân Mỹ Đình tối qua là Quang Hải và Công Phượng. Đáng chú ý, bàn thắng của Công Phượng được ghi chỉ sau vài phút anh được HLV Park Hang Seo tung vào sân thay người ở những phút cuối cùng của trận đấu.\n",
487 |     "\n",
488 |     "Bàn thắng của Công Phượng không khỏi khiến nhiều người nhớ đến pha bỏ lỡ “không tưởng” của cầu thủ này ở trận bán kết lượt đi trên sân của đội tuyển Philippines hôm 2/12.\n",
489 |     "\n",
490 |     "Trong trận đấu ấy, Công Phượng cũng được HLV trưởng người Hàn Quốc tung vào sân ở những phút cuối trận đấu. Anh thực hiện một pha đi bóng qua hàng loạt cầu thủ hậu vệ Philippines. Thế nhưng, khi đối mặt với khung thành rộng lớn, anh lại sút bóng chệch cột dọc.\n",
491 |     "\n",
492 |     "Sau tình huống bỏ lỡ ấy, cộng đồng mạng Việt Nam thi nhau chế ảnh Công Phượng. Họ cho rằng, Công Phượng không chỉ lừa qua hàng loạt hậu vệ Philippines mà còn lừa luôn cả hàng triệu fan hâm mộ đội nhà.\n",
493 |     "\n",
494 |     "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 2\n",
495 |     "\n",
496 |     "Công Phượng đã không còn lừa người hâm mộ khi ghi bàn trong trận bán kết lượt về AFF Cup 2018.\n",
497 |     "\n",
498 |     "Chính vì vậy, trước trận đấu bán kết lượt về hôm qua, Công Phượng đã đăng tải một tấm hình lên mạng xã hội Facebook với tựa đề: “Ngày mai rồi đấy”.\n",
499 |     "\n",
500 |     "Dòng trạng thái ấy thể hiện quyết tâm của tiền đạo xứ Nghệ. Anh mong chờ được ra sân trong trận bán kết lượt về với Philippines để khẳng định mình và lấy lại niềm tin nơi người hâm mộ. Và cuối cùng, Công Phượng cũng đã làm được điều mình mong muốn.\n",
501 |     "\n",
502 |     "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 3\n",
503 |     "\n",
504 |     "Status trước hôm bán kết thể hiện sự quyết tâm của Công Phương.\n",
505 |     "\n",
506 |     "Ngay sau trận bán kết lượt về kết thúc, Công Phượng lại tiếp tục đăng một status: “Lần này không lừa cả nhà nữa nhé. Thắng rồi bà con ơi”. Với bàn thắng ghi được ở những phút cuối trận đấu, Công Phượng đã giúp đội tuyển Việt Nam chắc chắn vào chơi trận chung kết AFF Cup 2018.\n",
507 |     "\n",
508 |     "Cũng sau trận đấu bán kết lượt về khi đội tuyển Việt Nam vượt qua đội tuyển Philippines, cầu thủ Văn Toàn đã chia sẻ trạng thái: “Trở lại thôi”. Dòng trạng thái này của Văn Toàn như một thông điệp gửi tới người hâm mộ rằng, anh đã bình phục chấn thương và sẵn sàng trở lại ở trận chung kết.\n",
509 |     "\n",
510 |     "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 4\n",
511 |     "\n",
512 |     "Văn Toàn đăng status mang thông điệp đã bình phục chấn thương và sẵn sàng trở lại.\n",
513 |     "\n",
514 |     "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 5\n",
515 |     "\n",
516 |     "Người hâm mộ động viên tinh thần khi biết Văn Toàn sắp trở lại.\n",
517 |     "\n",
518 |     "Trước đó, Văn Toàn đã bị chấn thương sụn chêm ở đầu gối sau một pha va chạm với đồng đội Văn Quyết trong buổi tập trước trận đấu với đội tuyển Campuchia ở vòng bảng AFF Cup 2018.\n",
519 |     "\n",
520 |     "Rất may, chấn thương của Văn Toàn không quá nặng và không phải phẫu thuật nên bình phục nhanh chóng. Ban đầu, các bác sĩ của đội tuyển Việt Nam dự đoán Văn Toàn có thể trở lại ở trận bán kết lượt về. Tuy nhiên, chấn thương chưa bình phục hẳn nên Văn Toàn phải đợi đến chung kết để có cơ hội được ra sân.\n",
521 |     "\n",
522 |     "Những cầu thủ khác như Nguyễn Quang Hải, Phan Văn Đức, Phạm Đức Huy cũng có những chia sẻ lên Facebook cá nhân sau trận đấu. Các cầu thủ thầm cảm ơn những người thân, người hâm mộ đã luôn bên họ và chứng kiến họ trưởng thành.\n",
523 |     "'''"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": null,
529 |    "metadata": {},
530 |    "outputs": [],
531 |    "source": []
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": 9,
536 |    "metadata": {},
537 |    "outputs": [
538 |     {
539 |      "data": {
540 |       "text/plain": [
541 |        "'Trong trận bán kết lượt về AFF Cup 2018 diễn ra trên sân vận động Mỹ Đình tối 6/12, đội tuyển Việt Nam đã vượt qua đội tuyển Philippines với tỉ số 2-1. Qua đó, nâng tổng tỉ số sau hai lượt trận bán kết là 4-2.\\n\\nĐội tuyển Việt Nam đã xuất sắc giành quyền vào chơi trận chung kết AFF Cup sau tròn 10 năm chờ đợi. Đối thủ của chúng ta là đội tuyển Malaysia.\\n\\nHai cầu thủ ghi bàn thắng trên sân Mỹ Đình tối qua là Quang Hải và Công Phượng. Đáng chú ý, bàn thắng của Công Phượng được ghi chỉ sau vài phút anh được HLV Park Hang Seo tung vào sân thay người ở những phút cuối cùng của trận đấu.\\n\\nBàn thắng của Công Phượng không khỏi khiến nhiều người nhớ đến pha bỏ lỡ “không tưởng” của cầu thủ này ở trận bán kết lượt đi trên sân của đội tuyển Philippines hôm 2/12.\\n\\nTrong trận đấu ấy, Công Phượng cũng được HLV trưởng người Hàn Quốc tung vào sân ở những phút cuối trận đấu. Anh thực hiện một pha đi bóng qua hàng loạt cầu thủ hậu vệ Philippines. Thế nhưng, khi đối mặt với khung thành rộng lớn, anh lại sút bóng chệch cột dọc.\\n\\nSau tình huống bỏ lỡ ấy, cộng đồng mạng Việt Nam thi nhau chế ảnh Công Phượng. Họ cho rằng, Công Phượng không chỉ lừa qua hàng loạt hậu vệ Philippines mà còn lừa luôn cả hàng triệu fan hâm mộ đội nhà.\\n\\nThắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 2\\n\\nCông Phượng đã không còn lừa người hâm mộ khi ghi bàn trong trận bán kết lượt về AFF Cup 2018.\\n\\nChính vì vậy, trước trận đấu bán kết lượt về hôm qua, Công Phượng đã đăng tải một tấm hình lên mạng xã hội Facebook với tựa đề: “Ngày mai rồi đấy”.\\n\\nDòng trạng thái ấy thể hiện quyết tâm của tiền đạo xứ Nghệ. Anh mong chờ được ra sân trong trận bán kết lượt về với Philippines để khẳng định mình và lấy lại niềm tin nơi người hâm mộ. 
Và cuối cùng, Công Phượng cũng đã làm được điều mình mong muốn.\\n\\nThắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 3\\n\\nStatus trước hôm bán kết thể hiện sự quyết tâm của Công Phương.\\n\\nNgay sau trận bán kết lượt về kết thúc, Công Phượng lại tiếp tục đăng một status: “Lần này không lừa cả nhà nữa nhé. Thắng rồi bà con ơi”. Với bàn thắng ghi được ở những phút cuối trận đấu, Công Phượng đã giúp đội tuyển Việt Nam chắc chắn vào chơi trận chung kết AFF Cup 2018.\\n\\nCũng sau trận đấu bán kết lượt về khi đội tuyển Việt Nam vượt qua đội tuyển Philippines, cầu thủ Văn Toàn đã chia sẻ trạng thái: “Trở lại thôi”. Dòng trạng thái này của Văn Toàn như một thông điệp gửi tới người hâm mộ rằng, anh đã bình phục chấn thương và sẵn sàng trở lại ở trận chung kết.\\n\\nThắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 4\\n\\nVăn Toàn đăng status mang thông điệp đã bình phục chấn thương và sẵn sàng trở lại.\\n\\nThắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 5\\n\\nNgười hâm mộ động viên tinh thần khi biết Văn Toàn sắp trở lại.\\n\\nTrước đó, Văn Toàn đã bị chấn thương sụn chêm ở đầu gối sau một pha va chạm với đồng đội Văn Quyết trong buổi tập trước trận đấu với đội tuyển Campuchia ở vòng bảng AFF Cup 2018.\\n\\nRất may, chấn thương của Văn Toàn không quá nặng và không phải phẫu thuật nên bình phục nhanh chóng. Ban đầu, các bác sĩ của đội tuyển Việt Nam dự đoán Văn Toàn có thể trở lại ở trận bán kết lượt về. Tuy nhiên, chấn thương chưa bình phục hẳn nên Văn Toàn phải đợi đến chung kết để có cơ hội được ra sân.\\n\\nNhững cầu thủ khác như Nguyễn Quang Hải, Phan Văn Đức, Phạm Đức Huy cũng có những chia sẻ lên Facebook cá nhân sau trận đấu. Các cầu thủ thầm cảm ơn những người thân, người hâm mộ đã luôn bên họ và chứng kiến họ trưởng thành.\\n'"
542 |       ]
543 |      },
544 |      "execution_count": 9,
545 |      "metadata": {},
546 |      "output_type": "execute_result"
547 |     }
548 |    ],
549 |    "source": [
550 |     "test_doc"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "code",
555 |    "execution_count": 44,
556 |    "metadata": {},
557 |    "outputs": [],
558 |    "source": [
559 |     "def get_list_sentence_vectors_from_document(doc, model):\n",
560 |     "    \"\"\"Infer one Doc2Vec vector per sentence of `doc`.\n",
561 |     "\n",
562 |     "    `doc` is split on '.'; fragments of 10 characters or fewer are skipped.\n",
563 |     "    Returns `(vectors, kept)`, where row i of the `vectors` array belongs to\n",
564 |     "    `kept[i]`, the original text of the i-th fragment that was not skipped.\n",
565 |     "    \"\"\"\n",
566 |     "    vectors, kept = [], []\n",
567 |     "    for fragment in doc.split('.'):\n",
568 |     "        if len(fragment) > 10:\n",
569 |     "            words = ' '.join(gensim.utils.simple_preprocess(fragment))\n",
570 |     "            vectors.append(model.infer_vector(ViTokenizer.tokenize(words).split(' ')))\n",
571 |     "            kept.append(fragment)\n",
572 |     "    return np.array(vectors), kept"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "code",
577 |    "execution_count": 45,
578 |    "metadata": {},
579 |    "outputs": [
580 |     {
581 |      "ename": "NameError",
582 |      "evalue": "name 'model' is not defined",
583 |      "output_type": "error",
584 |      "traceback": [
585 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
586 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
587 |       "\u001b[0;32m<ipython-input-45-2e6a5b890abc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msen_vectors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_list_sentence_vectors_from_document\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_doc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
588 |       "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
589 |      ]
590 |     }
591 |    ],
592 |    "source": [
593 |     "sen_vectors, sens = get_list_sentence_vectors_from_document(test_doc, model=model)"
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": 46,
599 |    "metadata": {},
600 |    "outputs": [
601 |     {
602 |      "ename": "NameError",
603 |      "evalue": "name 'sen_vectors' is not defined",
604 |      "output_type": "error",
605 |      "traceback": [
606 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
607 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
608 |       "\u001b[0;32m<ipython-input-46-8f05907afce0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msen_vectors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
609 |       "\u001b[0;31mNameError\u001b[0m: name 'sen_vectors' is not defined"
610 |      ]
611 |     }
612 |    ],
613 |    "source": [
614 |     "sen_vectors.shape"
615 |    ]
616 |   },
617 |   {
618 |    "cell_type": "code",
619 |    "execution_count": 59,
620 |    "metadata": {},
621 |    "outputs": [],
622 |    "source": [
623 |     "X = sen_vectors"
624 |    ]
625 |   },
626 |   {
627 |    "cell_type": "code",
628 |    "execution_count": 62,
629 |    "metadata": {},
630 |    "outputs": [
631 |     {
632 |      "data": {
633 |       "text/plain": [
634 |        "GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,\n",
635 |        "        means_init=None, n_components=2, n_init=1, precisions_init=None,\n",
636 |        "        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,\n",
637 |        "        verbose_interval=10, warm_start=False, weights_init=None)"
638 |       ]
639 |      },
640 |      "execution_count": 62,
641 |      "metadata": {},
642 |      "output_type": "execute_result"
643 |     }
644 |    ],
645 |    "source": [
646 |     "from sklearn.cluster import KMeans\n",
647 |     "from sklearn.mixture import GaussianMixture\n",
648 |     "n_clusters = 2\n",
649 |     "\n",
650 |     "# The summary cell below reads kmeans.labels_ and kmeans.cluster_centers_, so KMeans must be fitted here.\n",
651 |     "kmeans = KMeans(n_clusters=n_clusters).fit(X)\n",
652 |     "gm = GaussianMixture(n_components=n_clusters)\n",
653 |     "gm.fit(X)"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "code",
658 |    "execution_count": 64,
659 |    "metadata": {},
660 |    "outputs": [
661 |     {
662 |      "data": {
663 |       "text/plain": [
664 |        "array([0.07272727, 0.92727273])"
665 |       ]
666 |      },
667 |      "execution_count": 64,
668 |      "metadata": {},
669 |      "output_type": "execute_result"
670 |     }
671 |    ],
672 |    "source": [
673 |     "gm.weights_"
674 |    ]
675 |   },
676 |   {
677 |    "cell_type": "code",
678 |    "execution_count": 63,
679 |    "metadata": {},
680 |    "outputs": [
681 |     {
682 |      "name": "stdout",
683 |      "output_type": "stream",
684 |      "text": [
685 |       "\n",
686 |       "\n",
687 |       "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 5\n",
688 |       "\n",
689 |       "Người hâm mộ động viên tinh thần khi biết Văn Toàn sắp trở lại\n",
690 |       "\n",
691 |       "\n",
692 |       "Hai cầu thủ ghi bàn thắng trên sân Mỹ Đình tối qua là Quang Hải và Công Phượng\n"
693 |      ]
694 |     }
695 |    ],
696 |    "source": [
697 |     "from sklearn.metrics import pairwise_distances_argmin_min\n",
698 |     "# Build the extractive summary: one sentence per cluster, clusters ordered by where their members sit in X.\n",
699 |     "avg = []\n",
700 |     "for j in range(n_clusters):\n",
701 |     "    idx = np.where(kmeans.labels_ == j)[0]  # row positions in X assigned to cluster j\n",
702 |     "    avg.append(np.mean(idx))\n",
703 |     "closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)  # per centre: index of the nearest row of X\n",
704 |     "ordering = sorted(range(n_clusters), key=lambda k: avg[k])  # cluster ids by ascending mean member position\n",
705 |     "summary = [sens[closest[idx]] for idx in ordering]  # NOTE(review): correct only if sens[i] is the sentence behind X[i]\n",
706 |     "\n",
707 |     "for sen in summary:\n",
708 |     "    print(sen)"
709 |    ]
710 |   },
711 |   {
712 |    "cell_type": "markdown",
713 |    "metadata": {},
714 |    "source": [
715 |     "## Text Rank"
716 |    ]
717 |   },
718 |   {
719 |    "cell_type": "code",
720 |    "execution_count": 2,
721 |    "metadata": {},
722 |    "outputs": [],
723 |    "source": [
724 |     "def build_index(links):\n",
725 |     "    \"\"\"Map every key of `links` to its position in the mapping's key order.\"\"\"\n",
726 |     "    return {page: position for position, page in enumerate(links.keys())}\n",
727 |     " \n",
728 |     "    "
729 |    ]
730 |   },
731 |   {
732 |    "cell_type": "code",
733 |    "execution_count": 3,
734 |    "metadata": {},
735 |    "outputs": [],
736 |    "source": [
737 |     "import numpy as np\n",
738 |     " \n",
739 |     "def build_transition_matrix(links, index):\n",
740 |     "    \"\"\"Return matrix A where A[i][j] is the probability of moving from page i to page j.\n",
741 |     "\n",
742 |     "    `links` maps each page to the pages it links to; `index` maps each page to its row/column.\n",
743 |     "    Linked pages share the probability equally; a page without links moves uniformly to all pages.\n",
744 |     "    \"\"\"\n",
745 |     "    n_pages = len(index)\n",
746 |     "    A = np.zeros((n_pages, n_pages))\n",
747 |     "    for webpage, targets in links.items():\n",
748 |     "        if targets:\n",
749 |     "            A[index[webpage], [index[dest] for dest in targets]] = 1.0 / len(targets)\n",
750 |     "        else:  # dangling page\n",
751 |     "            A[index[webpage]] = 1.0 / n_pages\n",
752 |     "    return A"
753 |    ]
754 |   },
755 |   {
756 |    "cell_type": "code",
757 |    "execution_count": 4,
758 |    "metadata": {},
759 |    "outputs": [],
760 |    "source": [
761 |     "def pagerank(A, eps=0.0001, d=0.85, max_iter=1000):\n",
762 |     "    \"\"\"Power-iterate PageRank on transition matrix A (damping d) until the L1 change is <= eps or max_iter steps ran.\"\"\"\n",
763 |     "    P = np.ones(len(A)) / len(A)\n",
764 |     "    for _ in range(max_iter):  # capped: a non-converging input (e.g. NaN entries) must not loop forever\n",
765 |     "        P, previous = np.ones(len(A)) * (1 - d) / len(A) + d * A.T.dot(P), P\n",
766 |     "        if abs(P - previous).sum() <= eps:\n",
767 |     "            break\n",
768 |     "    return P"
769 |    ]
770 |   },
771 |   {
772 |    "cell_type": "code",
773 |    "execution_count": 5,
774 |    "metadata": {},
775 |    "outputs": [],
776 |    "source": [
777 |     "from nltk.corpus import brown, stopwords\n",
778 |     "from nltk.cluster.util import cosine_distance\n",
779 |     " \n",
780 |     "def sentence_similarity(sent1, sent2, stopwords=None):\n",
781 |     "    \"\"\"Cosine similarity between the bag-of-words vectors of two sentences.\n",
782 |     "\n",
783 |     "    Args:\n",
784 |     "        sent1, sent2: sequences of word tokens; compared case-insensitively.\n",
785 |     "        stopwords: optional collection of (lower-case) words left out of the vectors.\n",
786 |     "\n",
787 |     "    Returns:\n",
788 |     "        The similarity as a float; 0.0 when either sentence has no countable word,\n",
789 |     "        because the cosine of an all-zero vector is undefined (0/0).\n",
790 |     "    \"\"\"\n",
791 |     "    ignored = set(stopwords) if stopwords else set()\n",
792 |     "\n",
793 |     "    sent1 = [w.lower() for w in sent1]\n",
794 |     "    sent2 = [w.lower() for w in sent2]\n",
795 |     "\n",
796 |     "    # word -> vector position; a dict lookup replaces the linear list.index() scan\n",
797 |     "    position = {w: i for i, w in enumerate(set(sent1 + sent2))}\n",
798 |     "\n",
799 |     "    vector1 = [0] * len(position)\n",
800 |     "    vector2 = [0] * len(position)\n",
801 |     "    for w in sent1:\n",
802 |     "        if w not in ignored:\n",
803 |     "            vector1[position[w]] += 1\n",
804 |     "    for w in sent2:\n",
805 |     "        if w not in ignored:\n",
806 |     "            vector2[position[w]] += 1\n",
807 |     "\n",
808 |     "    if not any(vector1) or not any(vector2):\n",
809 |     "        return 0.0\n",
810 |     "\n",
811 |     "    return 1 - cosine_distance(vector1, vector2)"
805 |    ]
806 |   },
807 |   {
808 |    "cell_type": "code",
809 |    "execution_count": 6,
810 |    "metadata": {},
811 |    "outputs": [],
812 |    "source": [
813 |     "def build_similarity_matrix(sentences, stopwords=None):\n",
814 |     "    \"\"\"Build the row-normalised sentence-to-sentence similarity matrix.\n",
815 |     "\n",
816 |     "    Args:\n",
817 |     "        sentences: list of tokenised sentences (each a list of words).\n",
818 |     "        stopwords: optional words passed on to sentence_similarity.\n",
819 |     "\n",
820 |     "    Returns:\n",
821 |     "        (n, n) array; row i holds the similarities of sentence i to the others, scaled\n",
822 |     "        to sum to 1. A sentence that is similar to nothing gets a uniform row instead.\n",
823 |     "    \"\"\"\n",
824 |     "    n = len(sentences)\n",
825 |     "    S = np.zeros((n, n))\n",
826 |     "\n",
827 |     "    for idx1 in range(n):\n",
828 |     "        for idx2 in range(n):\n",
829 |     "            if idx1 != idx2:\n",
830 |     "                # use the argument, not the notebook-level `stop_words` global\n",
831 |     "                S[idx1, idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stopwords)\n",
832 |     "\n",
833 |     "    # normalize the matrix row-wise\n",
834 |     "    for idx in range(n):\n",
835 |     "        total = S[idx].sum()\n",
836 |     "        if total > 0:\n",
837 |     "            S[idx] /= total\n",
838 |     "        else:\n",
839 |     "            # dividing by 0 would fill the row with NaN; fall back to a uniform row\n",
840 |     "            S[idx] = 1.0 / n\n",
841 |     "\n",
842 |     "    return S"
830 |    ]
831 |   },
832 |   {
833 |    "cell_type": "code",
834 |    "execution_count": 7,
835 |    "metadata": {},
836 |    "outputs": [],
837 |    "source": [
838 |     "def get_list_of_sentences(doc, min_length=10):\n",
839 |     "    \"\"\"Split a document into tokenised sentences.\n",
840 |     "\n",
841 |     "    The text is cut on '.', fragments of at most `min_length` characters are dropped, and\n",
842 |     "    each remaining fragment goes through gensim.utils.simple_preprocess and then\n",
843 |     "    ViTokenizer.tokenize.\n",
844 |     "\n",
845 |     "    Args:\n",
846 |     "        doc: raw document text.\n",
847 |     "        min_length: a fragment is kept only if it is longer than this many characters.\n",
848 |     "\n",
849 |     "    Returns:\n",
850 |     "        List of sentences, each a list of tokens.\n",
851 |     "    \"\"\"\n",
852 |     "    sentences = []\n",
853 |     "    for fragment in doc.split('.'):\n",
854 |     "        if len(fragment) <= min_length:\n",
855 |     "            continue\n",
856 |     "        words = gensim.utils.simple_preprocess(fragment)\n",
857 |     "        if not words:\n",
858 |     "            # nothing survived preprocessing; skip rather than tokenise an empty string\n",
859 |     "            continue\n",
860 |     "        tokens = ViTokenizer.tokenize(' '.join(words)).split(' ')\n",
861 |     "        sentences.append(tokens)\n",
862 |     "\n",
863 |     "    return sentences"
851 |    ]
852 |   },
853 |   {
854 |    "cell_type": "code",
855 |    "execution_count": 8,
856 |    "metadata": {},
857 |    "outputs": [
858 |     {
859 |      "ename": "NameError",
860 |      "evalue": "name 'gensim' is not defined",
861 |      "output_type": "error",
862 |      "traceback": [
863 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
864 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
865 |       "\u001b[0;32m<ipython-input-8-601637eeac0e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msentences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_list_of_sentences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_doc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
866 |       "\u001b[0;32m<ipython-input-7-db93cc69b7d6>\u001b[0m in \u001b[0;36mget_list_of_sentences\u001b[0;34m(doc)\u001b[0m\n\u001b[1;32m      4\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0msen\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msens\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msen\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m             \u001b[0msen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msimple_preprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m             \u001b[0msen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m' '\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m             \u001b[0msen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mViTokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
867 |       "\u001b[0;31mNameError\u001b[0m: name 'gensim' is not defined"
868 |      ]
869 |     }
870 |    ],
871 |    "source": [
872 |     "sentences = get_list_of_sentences(test_doc)"
873 |    ]
874 |   },
875 |   {
876 |    "cell_type": "code",
877 |    "execution_count": null,
878 |    "metadata": {},
879 |    "outputs": [],
880 |    "source": [
881 |     "len(sentences)"
882 |    ]
883 |   },
884 |   {
885 |    "cell_type": "code",
886 |    "execution_count": 55,
887 |    "metadata": {},
888 |    "outputs": [],
889 |    "source": [
890 |     "stop_words = []\n",
891 |     "S = build_similarity_matrix(sentences, stop_words)    \n",
892 |     "# print(S)"
893 |    ]
894 |   },
895 |   {
896 |    "cell_type": "code",
897 |    "execution_count": 56,
898 |    "metadata": {},
899 |    "outputs": [],
900 |    "source": [
901 |     "from operator import itemgetter "
902 |    ]
903 |   },
904 |   {
905 |    "cell_type": "code",
906 |    "execution_count": 57,
907 |    "metadata": {},
908 |    "outputs": [
909 |     {
910 |      "name": "stdout",
911 |      "output_type": "stream",
912 |      "text": [
913 |       "1. bàn thắng của công phượng không khỏi khiến nhiều người nhớ đến pha bỏ lỡ không_tưởng của cầu_thủ này trận bán_kết lượt đi trên sân của đội_tuyển philippines hôm\n",
914 |       "2. thắng bán_kết aff cup công phượng hết lừa fan văn_toàn hứa_hẹn trở_lại công phượng đã không còn lừa người hâm_mộ khi ghi_bàn trong trận bán_kết lượt về aff cup\n"
915 |      ]
916 |     }
917 |    ],
918 |    "source": [
919 |     "def textrank(sentences, top_n=5, stopwords=None):\n",
920 |     "    \"\"\"Pick the `top_n` highest-ranked sentences, kept in document order.\n",
921 |     "\n",
922 |     "    Args:\n",
923 |     "        sentences: list of tokenised sentences.\n",
924 |     "        top_n: maximum number of sentences to return.\n",
925 |     "        stopwords: optional words ignored when comparing sentences.\n",
926 |     "\n",
927 |     "    Returns:\n",
928 |     "        Tuple of the selected sentences (empty when there is nothing to select).\n",
929 |     "    \"\"\"\n",
930 |     "    # forward the argument; the notebook-level `stop_words` global was used here by mistake\n",
931 |     "    S = build_similarity_matrix(sentences, stopwords)\n",
932 |     "    sentence_ranks = pagerank(S)\n",
933 |     "\n",
934 |     "    # indexes sorted by decreasing rank, then the best `top_n` put back in document order\n",
935 |     "    ranked_sentence_indexes = sorted(range(len(sentence_ranks)), key=lambda i: -sentence_ranks[i])\n",
936 |     "    selected_sentences = sorted(ranked_sentence_indexes[:top_n])\n",
937 |     "    # itemgetter(*idx) returns a bare item for one index and fails for none, so index directly\n",
938 |     "    return tuple(sentences[i] for i in selected_sentences)\n",
928 |     " \n",
929 |     "for idx, sentence in enumerate(textrank(sentences, top_n=2, stopwords=[])):\n",
930 |     "    print(\"%s. %s\" % ((idx + 1), ' '.join(sentence)))"
931 |    ]
932 |   },
933 |   {
934 |    "cell_type": "code",
935 |    "execution_count": null,
936 |    "metadata": {},
937 |    "outputs": [],
938 |    "source": []
939 |   }
940 |  ],
941 |  "metadata": {
942 |   "kernelspec": {
943 |    "display_name": "Python 3",
944 |    "language": "python",
945 |    "name": "python3"
946 |   },
947 |   "language_info": {
948 |    "codemirror_mode": {
949 |     "name": "ipython",
950 |     "version": 3
951 |    },
952 |    "file_extension": ".py",
953 |    "mimetype": "text/x-python",
954 |    "name": "python",
955 |    "nbconvert_exporter": "python",
956 |    "pygments_lexer": "ipython3",
957 |    "version": "3.6.5"
958 |   }
959 |  },
960 |  "nbformat": 4,
961 |  "nbformat_minor": 2
962 | }
963 | 


--------------------------------------------------------------------------------
/Text Summarization/.ipynb_checkpoints/Text Summarization-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "Data was published at https://github.com/duyvuleo/VNTC"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "# Introduction"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "In this tutorial, we will implement several algorithms for the text summarization problem."
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## What is Text Summarization?"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "Text summarization is the problem of creating a short, accurate, and fluent summary of a longer text document.\n",
 36 |     "\n",
 37 |     "Automatic text summarization methods are greatly needed to address the ever-growing amount of text data available online to both better help discover relevant information and to consume relevant information faster."
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "## What will we do in this tutorial?"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "In this tutorial, we will solve text summarization for Vietnamese newspapers using the algorithms below:\n",
 52 |     "1. Extractive Text Summarization\n",
 53 |     "    - Doc2Vec\n",
 54 |     "    - Text Rank\n",
 55 |     "2. Abstractive Text Summarization\n",
 56 |     "    - Google textsum\n",
 57 |     "\n",
 58 |     "\n",
 59 |     "We only implement the \"**Single document summarization**\" problem in this tutorial; the related \"**Multi-document summarization**\" problem will be discussed another time."
 60 |    ]
 61 |   },
 62 |   {
 63 |    "cell_type": "markdown",
 64 |    "metadata": {},
 65 |    "source": [
 66 |     "# Extractive Text Summarization"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "## Doc2Vec"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "example: https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "### Basic idea\n",
 88 |     "The idea of using the Doc2Vec algorithm for the text summarization problem is as follows:\n",
 89 |     "1. In all documents, we will extract sentences separately.\n",
 90 |     "2. Each sentence will be represented by a vector, via doc2vec model\n",
 91 |     "3. Use the K-Means algorithm to find the most representative sentences."
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 8,
 97 |    "metadata": {},
 98 |    "outputs": [],
 99 |    "source": [
100 |     "from sklearn import preprocessing\n",
101 |     "from sklearn.feature_extraction.text import TfidfVectorizer"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": 9,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "from pyvi import ViTokenizer, ViPosTagger\n",
111 |     "from tqdm import tqdm\n",
112 |     "import numpy as np\n",
113 |     "import gensim\n",
114 |     "import numpy as np"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 10,
120 |    "metadata": {
121 |     "scrolled": true
122 |    },
123 |    "outputs": [],
124 |    "source": [
125 |     "import os \n",
126 |     "dir_path = os.path.dirname(os.path.realpath(os.getcwd()))\n",
127 |     "dir_path = os.path.join(dir_path, 'Data')\n",
128 |     "\n",
129 |     "sentences = []"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 11,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "import pickle\n",
139 |     "\n",
140 |     "def get_data(folder):\n",
141 |     "    \"\"\"Read every UTF-16 text file in `folder` and return its preprocessed sentences.\n",
142 |     "\n",
143 |     "    Each line is cut on '.', fragments of 10 characters or fewer are ignored, and the rest\n",
144 |     "    go through gensim.utils.simple_preprocess and ViTokenizer.tokenize.\n",
145 |     "    Returns a flat list of sentence strings.\n",
146 |     "    \"\"\"\n",
147 |     "    collected = []\n",
148 |     "    for name in os.listdir(folder):\n",
149 |     "        with open(os.path.join(folder, name), 'r', encoding=\"utf-16\") as handle:\n",
150 |     "            lines = handle.readlines()\n",
151 |     "        for line in lines:\n",
152 |     "            for fragment in line.split('.'):\n",
153 |     "                if len(fragment) <= 10:\n",
154 |     "                    continue\n",
155 |     "                words = gensim.utils.simple_preprocess(fragment)\n",
156 |     "                collected.append(ViTokenizer.tokenize(' '.join(words)))\n",
157 |     "    return collected"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": []
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 5,
170 |    "metadata": {},
171 |    "outputs": [],
172 |    "source": []
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": 6,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "# sens = test_doc.split('.')\n",
181 |     "# for sen in sens:\n",
182 |     "#     if len(sen) > 10:\n",
183 |     "#         sen = gensim.utils.simple_preprocess(sen)\n",
184 |     "#         sen = ' '.join(sen)\n",
185 |     "#         sen = ViTokenizer.tokenize(sen)\n",
186 |     "#         sentences.append(sen)"
187 |    ]
188 |   },
189 |   {
190 |    "cell_type": "code",
191 |    "execution_count": 7,
192 |    "metadata": {},
193 |    "outputs": [],
194 |    "source": [
195 |     "# sentences"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {},
201 |    "source": [
202 |     "You could use multiprocessing here, but we do not, to keep the code easy to understand."
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": []
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 10,
215 |    "metadata": {
216 |     "scrolled": false
217 |    },
218 |    "outputs": [],
219 |    "source": [
220 |     "# from multiprocessing import Pool\n",
221 |     "# sentences = []\n",
222 |     "# train_paths = [os.path.join(dir_path, 'VNTC-master/Data/10Topics/Ver1.1/Train_Full'), \n",
223 |     "#                os.path.join(dir_path, 'VNTC-master/Data/10Topics/Ver1.1/Test_Full'),\n",
224 |     "#                os.path.join(dir_path, 'VNTC-master/Data/27Topics/Ver1.1/new train'),\n",
225 |     "#                os.path.join(dir_path, 'VNTC-master/Data/27Topics/Ver1.1/new test')]\n",
226 |     "\n",
227 |     "# dirs = []\n",
228 |     "# for path in train_paths:\n",
229 |     "#     for p in os.listdir(path):\n",
230 |     "#         dirs.append(os.path.join(path, p))\n",
231 |     "\n",
232 |     "# for d in tqdm(dirs):\n",
233 |     "#     sens = get_data(d)\n",
234 |     "#     sentences = sentences + sens\n",
235 |     "\n",
236 |     "# # with Pool(8) as pool:\n",
237 |     "# #     pool.map(get_data, tqdm(dirs))\n",
238 |     "\n"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": 12,
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "# pickle.dump(sentences, open('./sentences.pkl', 'wb'))\n",
248 |     "# Only load pickle files you produced yourself: unpickling can execute arbitrary code.\n",
249 |     "with open('./sentences.pkl', 'rb') as f:\n",
250 |     "    sentences = pickle.load(f)"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": 13,
254 |    "metadata": {},
255 |    "outputs": [
256 |     {
257 |      "data": {
258 |       "text/plain": [
259 |        "'ông đồ cuối_cùng trên đảo'"
260 |       ]
261 |      },
262 |      "execution_count": 13,
263 |      "metadata": {},
264 |      "output_type": "execute_result"
265 |     }
266 |    ],
267 |    "source": [
268 |     "sentences[0]"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 14,
274 |    "metadata": {},
275 |    "outputs": [],
276 |    "source": [
277 |     "def get_corpus(sentences):\n",
278 |     "    \"\"\"Wrap each sentence string in a gensim TaggedDocument tagged with its list position.\n",
279 |     "\n",
280 |     "    The word list of a sentence is obtained by splitting it on single spaces.\n",
281 |     "    \"\"\"\n",
282 |     "    return [\n",
283 |     "        gensim.models.doc2vec.TaggedDocument(sentences[i].split(' '), [i])\n",
284 |     "        for i in tqdm(range(len(sentences)))\n",
285 |     "    ]"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 15,
294 |    "metadata": {},
295 |    "outputs": [
296 |     {
297 |      "name": "stderr",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "100%|██████████| 2385532/2385532 [00:34<00:00, 69769.71it/s]\n"
301 |      ]
302 |     }
303 |    ],
304 |    "source": [
305 |     "train_corpus = get_corpus(sentences)"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 11,
311 |    "metadata": {},
312 |    "outputs": [],
313 |    "source": [
314 |     "from sklearn.utils import shuffle\n",
315 |     "\n",
316 |     "train_corpus = shuffle(train_corpus)"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "markdown",
321 |    "metadata": {},
322 |    "source": [
323 |     "#### Build Doc2Vec model"
324 |    ]
325 |   },
326 |   {
327 |    "cell_type": "code",
328 |    "execution_count": 11,
329 |    "metadata": {},
330 |    "outputs": [],
331 |    "source": [
332 |     "model = gensim.models.doc2vec.Doc2Vec(vector_size=300, min_count=2, epochs=40)\n",
333 |     "model.build_vocab(train_corpus)"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 13,
339 |    "metadata": {
340 |     "scrolled": true
341 |    },
342 |    "outputs": [
343 |     {
344 |      "name": "stderr",
345 |      "output_type": "stream",
346 |      "text": [
347 |       "/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: DeprecationWarning: Call to deprecated `iter` (Attribute will be removed in 4.0.0, use self.epochs instead).\n",
348 |       "  import sys\n"
349 |      ]
350 |     }
351 |    ],
352 |    "source": [
353 |     "# Train on the first 50,000 documents only.\n",
354 |     "train_subset = train_corpus[:50000]\n",
355 |     "model.train(train_subset,\n",
356 |     "            # must match the number of documents actually passed in; `model.corpus_count`\n",
357 |     "            # is the size of the corpus given to build_vocab, not of this slice\n",
358 |     "            total_examples=len(train_subset),\n",
359 |     "            # `model.iter` is deprecated (see the DeprecationWarning); `epochs` replaces it\n",
360 |     "            epochs=model.epochs)"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 15,
372 |    "metadata": {},
373 |    "outputs": [],
374 |    "source": [
375 |     "model.save('model/model')"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "code",
380 |    "execution_count": 12,
381 |    "metadata": {},
382 |    "outputs": [],
383 |    "source": [
384 |     "model = gensim.models.doc2vec.Doc2Vec.load('model/model')"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": 21,
390 |    "metadata": {
391 |     "scrolled": true
392 |    },
393 |    "outputs": [
394 |     {
395 |      "data": {
396 |       "text/plain": [
397 |        "array([-0.04828287,  0.25527653,  1.1613333 , -0.43151897, -0.9858117 ,\n",
398 |        "        0.10932952,  0.20315444, -0.48530903,  0.24952224, -0.11833256,\n",
399 |        "       -0.0337567 , -0.3887124 , -0.39426357,  0.4454976 ,  0.64964545,\n",
400 |        "       -0.5074249 ,  0.2037328 ,  0.32153234, -0.62261915,  0.8188216 ,\n",
401 |        "        0.5820815 , -0.09879603, -0.44826344,  0.1201525 ,  0.236654  ,\n",
402 |        "        0.13032307, -0.46023956,  0.19788027, -0.34569028, -0.21599784,\n",
403 |        "        0.42319658, -0.106575  , -0.24495657, -0.00839793, -0.11475623,\n",
404 |        "       -0.5559897 , -0.12046688,  0.18673038, -0.16149993, -0.02872676,\n",
405 |        "        0.42999822,  0.46070522,  0.50624824, -0.15866163, -0.11092521,\n",
406 |        "        0.30938515,  0.23203233,  0.11736044, -0.7434822 , -0.78674805,\n",
407 |        "        0.27668393,  0.25058967, -0.15513541,  0.05721006, -0.62895125,\n",
408 |        "       -0.3618494 ,  0.48457113, -0.16074707,  0.32852057, -0.63208133,\n",
409 |        "       -0.45503548, -0.373764  ,  0.6417061 , -0.15453526,  0.828889  ,\n",
410 |        "        0.4040729 , -0.13313939,  0.20088702, -0.36382645,  0.3100666 ,\n",
411 |        "        0.02355373,  0.5920582 , -0.2271741 , -0.30618507, -0.23971866,\n",
412 |        "        0.91544545, -0.51666105, -0.05829609, -0.43708014,  0.35457942,\n",
413 |        "        0.50872976, -0.24838248,  0.44898847,  0.11512683,  0.34157744,\n",
414 |        "       -0.47279087, -0.02090802,  0.23195563, -0.14476988,  0.5966468 ,\n",
415 |        "        0.25278485,  0.70205003, -0.16960798, -0.09220067,  1.387285  ,\n",
416 |        "        0.5248568 ,  0.33318955, -0.33651793,  0.41348195, -0.94656795,\n",
417 |        "       -0.56593996,  0.6216159 ,  0.3179036 ,  0.31106716,  0.14830516,\n",
418 |        "        0.535672  ,  0.695546  ,  0.28968796,  0.4329898 , -0.6800865 ,\n",
419 |        "       -0.6313374 ,  0.36142987,  0.3392832 , -0.3685879 ,  1.0465527 ,\n",
420 |        "       -0.31610152,  0.26410806, -0.75767416, -0.0933219 , -0.10084625,\n",
421 |        "        0.11192366, -0.63711953,  0.6878306 ,  0.20774055,  0.37814376,\n",
422 |        "       -0.38910306, -0.29257646,  0.32447788,  1.4432929 ,  0.42116693,\n",
423 |        "        0.10012217, -0.54671454,  0.15930349, -0.04576634,  0.11046711,\n",
424 |        "        0.4345503 ,  0.5950319 ,  0.10390531,  0.00534402, -0.05976183,\n",
425 |        "        1.0111569 ,  0.14526764,  0.0051693 , -0.55909073,  0.18523502,\n",
426 |        "       -0.59934396,  0.24894848, -0.18078412,  0.5796731 , -0.44970104,\n",
427 |        "        0.81793183, -0.5046711 , -0.16381589,  0.14662668,  0.21144816,\n",
428 |        "        0.08799265, -0.25188333, -0.39610714, -0.46737796,  0.06498595,\n",
429 |        "       -0.24232577,  0.08590741, -0.34991795, -0.7811069 ,  0.05049568,\n",
430 |        "       -0.44203833, -0.04051779, -0.93674725,  0.7014623 ,  0.43860036,\n",
431 |        "        1.0785912 ,  0.4614321 ,  0.9178922 ,  0.01267096,  0.08151802,\n",
432 |        "       -0.21591717, -0.389159  , -0.4332839 ,  0.06478307, -0.549585  ,\n",
433 |        "        0.24735504, -0.15430401, -0.10635387,  0.9497028 , -0.5208101 ,\n",
434 |        "       -0.25834572,  0.5067593 , -0.3163417 , -0.45160556, -1.0110141 ,\n",
435 |        "       -0.11357957,  0.3088588 ,  0.67771375,  0.5347725 , -0.08545431,\n",
436 |        "       -0.6260072 ,  0.37074357,  0.3511689 ,  0.03659426, -0.5359085 ,\n",
437 |        "       -0.22255394, -0.4841223 , -0.31908542,  0.6693267 , -0.43263623,\n",
438 |        "        0.17883465,  0.76907945,  0.3865581 , -0.27964267,  0.5833102 ,\n",
439 |        "        0.10791489,  0.4569784 , -0.0223736 ,  0.48295155, -0.00460218,\n",
440 |        "       -0.47181183, -0.48191187,  0.1006198 , -0.30717742,  0.62139356,\n",
441 |        "        0.28134045,  0.29010874, -0.26925838,  0.8383542 , -0.18886985,\n",
442 |        "        0.18526816, -0.57650745, -0.59799755,  0.19990733,  0.22144596,\n",
443 |        "        0.70591587, -0.76111233,  0.13711332, -0.7318054 ,  0.02516509,\n",
444 |        "       -0.3590674 , -0.6440488 , -0.5580956 , -0.5993928 , -0.32801956,\n",
445 |        "       -0.4644991 ,  0.89624447, -0.39741072, -0.52681875, -0.29390556,\n",
446 |        "       -0.3324342 , -0.62701875,  0.12948091,  0.9591448 , -0.21732959,\n",
447 |        "       -0.6216343 , -0.04387471, -0.22252487,  0.27053964,  0.17134936,\n",
448 |        "        0.69296885,  0.39905074,  0.3307731 , -0.38610834,  0.05903669,\n",
449 |        "        0.40507847, -0.53825825,  0.08011609, -0.27195254, -0.296355  ,\n",
450 |        "        0.27324116,  0.5513492 ,  0.77330786, -0.6397054 , -0.24681841,\n",
451 |        "        0.2817206 ,  0.37891504,  0.03597298,  0.42222285, -0.06389087,\n",
452 |        "        0.39442137,  0.07020057, -0.24582939,  0.279675  ,  0.00950517,\n",
453 |        "       -0.60586107, -1.0425315 , -0.2628614 ,  0.20990998,  0.25524455,\n",
454 |        "       -0.27130723,  0.51966363, -0.14886895,  0.8109764 ,  0.258794  ,\n",
455 |        "       -0.05932726, -0.10472207,  0.06371555,  0.04762143,  0.02594266,\n",
456 |        "       -1.0294654 , -0.5873498 ,  0.60305655, -0.07507906, -0.3711069 ],\n",
457 |        "      dtype=float32)"
458 |       ]
459 |      },
460 |      "execution_count": 21,
461 |      "metadata": {},
462 |      "output_type": "execute_result"
463 |     }
464 |    ],
465 |    "source": [
466 |     "model.infer_vector(train_corpus[100000].words)"
467 |    ]
468 |   },
469 |   {
470 |    "cell_type": "markdown",
471 |    "metadata": {},
472 |    "source": [
473 |     "#### Test with new document"
474 |    ]
475 |   },
476 |   {
477 |    "cell_type": "code",
478 |    "execution_count": 1,
479 |    "metadata": {},
480 |    "outputs": [],
481 |    "source": [
482 |     "test_doc = '''Trong trận bán kết lượt về AFF Cup 2018 diễn ra trên sân vận động Mỹ Đình tối 6/12, đội tuyển Việt Nam đã vượt qua đội tuyển Philippines với tỉ số 2-1. Qua đó, nâng tổng tỉ số sau hai lượt trận bán kết là 4-2.\n",
483 |     "\n",
484 |     "Đội tuyển Việt Nam đã xuất sắc giành quyền vào chơi trận chung kết AFF Cup sau tròn 10 năm chờ đợi. Đối thủ của chúng ta là đội tuyển Malaysia.\n",
485 |     "\n",
486 |     "Hai cầu thủ ghi bàn thắng trên sân Mỹ Đình tối qua là Quang Hải và Công Phượng. Đáng chú ý, bàn thắng của Công Phượng được ghi chỉ sau vài phút anh được HLV Park Hang Seo tung vào sân thay người ở những phút cuối cùng của trận đấu.\n",
487 |     "\n",
488 |     "Bàn thắng của Công Phượng không khỏi khiến nhiều người nhớ đến pha bỏ lỡ “không tưởng” của cầu thủ này ở trận bán kết lượt đi trên sân của đội tuyển Philippines hôm 2/12.\n",
489 |     "\n",
490 |     "Trong trận đấu ấy, Công Phượng cũng được HLV trưởng người Hàn Quốc tung vào sân ở những phút cuối trận đấu. Anh thực hiện một pha đi bóng qua hàng loạt cầu thủ hậu vệ Philippines. Thế nhưng, khi đối mặt với khung thành rộng lớn, anh lại sút bóng chệch cột dọc.\n",
491 |     "\n",
492 |     "Sau tình huống bỏ lỡ ấy, cộng đồng mạng Việt Nam thi nhau chế ảnh Công Phượng. Họ cho rằng, Công Phượng không chỉ lừa qua hàng loạt hậu vệ Philippines mà còn lừa luôn cả hàng triệu fan hâm mộ đội nhà.\n",
493 |     "\n",
494 |     "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 2\n",
495 |     "\n",
496 |     "Công Phượng đã không còn lừa người hâm mộ khi ghi bàn trong trận bán kết lượt về AFF Cup 2018.\n",
497 |     "\n",
498 |     "Chính vì vậy, trước trận đấu bán kết lượt về hôm qua, Công Phượng đã đăng tải một tấm hình lên mạng xã hội Facebook với tựa đề: “Ngày mai rồi đấy”.\n",
499 |     "\n",
500 |     "Dòng trạng thái ấy thể hiện quyết tâm của tiền đạo xứ Nghệ. Anh mong chờ được ra sân trong trận bán kết lượt về với Philippines để khẳng định mình và lấy lại niềm tin nơi người hâm mộ. Và cuối cùng, Công Phượng cũng đã làm được điều mình mong muốn.\n",
501 |     "\n",
502 |     "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 3\n",
503 |     "\n",
504 |     "Status trước hôm bán kết thể hiện sự quyết tâm của Công Phương.\n",
505 |     "\n",
506 |     "Ngay sau trận bán kết lượt về kết thúc, Công Phượng lại tiếp tục đăng một status: “Lần này không lừa cả nhà nữa nhé. Thắng rồi bà con ơi”. Với bàn thắng ghi được ở những phút cuối trận đấu, Công Phượng đã giúp đội tuyển Việt Nam chắc chắn vào chơi trận chung kết AFF Cup 2018.\n",
507 |     "\n",
508 |     "Cũng sau trận đấu bán kết lượt về khi đội tuyển Việt Nam vượt qua đội tuyển Philippines, cầu thủ Văn Toàn đã chia sẻ trạng thái: “Trở lại thôi”. Dòng trạng thái này của Văn Toàn như một thông điệp gửi tới người hâm mộ rằng, anh đã bình phục chấn thương và sẵn sàng trở lại ở trận chung kết.\n",
509 |     "\n",
510 |     "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 4\n",
511 |     "\n",
512 |     "Văn Toàn đăng status mang thông điệp đã bình phục chấn thương và sẵn sàng trở lại.\n",
513 |     "\n",
514 |     "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 5\n",
515 |     "\n",
516 |     "Người hâm mộ động viên tinh thần khi biết Văn Toàn sắp trở lại.\n",
517 |     "\n",
518 |     "Trước đó, Văn Toàn đã bị chấn thương sụn chêm ở đầu gối sau một pha va chạm với đồng đội Văn Quyết trong buổi tập trước trận đấu với đội tuyển Campuchia ở vòng bảng AFF Cup 2018.\n",
519 |     "\n",
520 |     "Rất may, chấn thương của Văn Toàn không quá nặng và không phải phẫu thuật nên bình phục nhanh chóng. Ban đầu, các bác sĩ của đội tuyển Việt Nam dự đoán Văn Toàn có thể trở lại ở trận bán kết lượt về. Tuy nhiên, chấn thương chưa bình phục hẳn nên Văn Toàn phải đợi đến chung kết để có cơ hội được ra sân.\n",
521 |     "\n",
522 |     "Những cầu thủ khác như Nguyễn Quang Hải, Phan Văn Đức, Phạm Đức Huy cũng có những chia sẻ lên Facebook cá nhân sau trận đấu. Các cầu thủ thầm cảm ơn những người thân, người hâm mộ đã luôn bên họ và chứng kiến họ trưởng thành.\n",
523 |     "'''"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": null,
529 |    "metadata": {},
530 |    "outputs": [],
531 |    "source": []
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": 9,
536 |    "metadata": {},
537 |    "outputs": [
538 |     {
539 |      "data": {
540 |       "text/plain": [
541 |        "'Trong trận bán kết lượt về AFF Cup 2018 diễn ra trên sân vận động Mỹ Đình tối 6/12, đội tuyển Việt Nam đã vượt qua đội tuyển Philippines với tỉ số 2-1. Qua đó, nâng tổng tỉ số sau hai lượt trận bán kết là 4-2.\\n\\nĐội tuyển Việt Nam đã xuất sắc giành quyền vào chơi trận chung kết AFF Cup sau tròn 10 năm chờ đợi. Đối thủ của chúng ta là đội tuyển Malaysia.\\n\\nHai cầu thủ ghi bàn thắng trên sân Mỹ Đình tối qua là Quang Hải và Công Phượng. Đáng chú ý, bàn thắng của Công Phượng được ghi chỉ sau vài phút anh được HLV Park Hang Seo tung vào sân thay người ở những phút cuối cùng của trận đấu.\\n\\nBàn thắng của Công Phượng không khỏi khiến nhiều người nhớ đến pha bỏ lỡ “không tưởng” của cầu thủ này ở trận bán kết lượt đi trên sân của đội tuyển Philippines hôm 2/12.\\n\\nTrong trận đấu ấy, Công Phượng cũng được HLV trưởng người Hàn Quốc tung vào sân ở những phút cuối trận đấu. Anh thực hiện một pha đi bóng qua hàng loạt cầu thủ hậu vệ Philippines. Thế nhưng, khi đối mặt với khung thành rộng lớn, anh lại sút bóng chệch cột dọc.\\n\\nSau tình huống bỏ lỡ ấy, cộng đồng mạng Việt Nam thi nhau chế ảnh Công Phượng. Họ cho rằng, Công Phượng không chỉ lừa qua hàng loạt hậu vệ Philippines mà còn lừa luôn cả hàng triệu fan hâm mộ đội nhà.\\n\\nThắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 2\\n\\nCông Phượng đã không còn lừa người hâm mộ khi ghi bàn trong trận bán kết lượt về AFF Cup 2018.\\n\\nChính vì vậy, trước trận đấu bán kết lượt về hôm qua, Công Phượng đã đăng tải một tấm hình lên mạng xã hội Facebook với tựa đề: “Ngày mai rồi đấy”.\\n\\nDòng trạng thái ấy thể hiện quyết tâm của tiền đạo xứ Nghệ. Anh mong chờ được ra sân trong trận bán kết lượt về với Philippines để khẳng định mình và lấy lại niềm tin nơi người hâm mộ. 
Và cuối cùng, Công Phượng cũng đã làm được điều mình mong muốn.\\n\\nThắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 3\\n\\nStatus trước hôm bán kết thể hiện sự quyết tâm của Công Phương.\\n\\nNgay sau trận bán kết lượt về kết thúc, Công Phượng lại tiếp tục đăng một status: “Lần này không lừa cả nhà nữa nhé. Thắng rồi bà con ơi”. Với bàn thắng ghi được ở những phút cuối trận đấu, Công Phượng đã giúp đội tuyển Việt Nam chắc chắn vào chơi trận chung kết AFF Cup 2018.\\n\\nCũng sau trận đấu bán kết lượt về khi đội tuyển Việt Nam vượt qua đội tuyển Philippines, cầu thủ Văn Toàn đã chia sẻ trạng thái: “Trở lại thôi”. Dòng trạng thái này của Văn Toàn như một thông điệp gửi tới người hâm mộ rằng, anh đã bình phục chấn thương và sẵn sàng trở lại ở trận chung kết.\\n\\nThắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 4\\n\\nVăn Toàn đăng status mang thông điệp đã bình phục chấn thương và sẵn sàng trở lại.\\n\\nThắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 5\\n\\nNgười hâm mộ động viên tinh thần khi biết Văn Toàn sắp trở lại.\\n\\nTrước đó, Văn Toàn đã bị chấn thương sụn chêm ở đầu gối sau một pha va chạm với đồng đội Văn Quyết trong buổi tập trước trận đấu với đội tuyển Campuchia ở vòng bảng AFF Cup 2018.\\n\\nRất may, chấn thương của Văn Toàn không quá nặng và không phải phẫu thuật nên bình phục nhanh chóng. Ban đầu, các bác sĩ của đội tuyển Việt Nam dự đoán Văn Toàn có thể trở lại ở trận bán kết lượt về. Tuy nhiên, chấn thương chưa bình phục hẳn nên Văn Toàn phải đợi đến chung kết để có cơ hội được ra sân.\\n\\nNhững cầu thủ khác như Nguyễn Quang Hải, Phan Văn Đức, Phạm Đức Huy cũng có những chia sẻ lên Facebook cá nhân sau trận đấu. Các cầu thủ thầm cảm ơn những người thân, người hâm mộ đã luôn bên họ và chứng kiến họ trưởng thành.\\n'"
542 |       ]
543 |      },
544 |      "execution_count": 9,
545 |      "metadata": {},
546 |      "output_type": "execute_result"
547 |     }
548 |    ],
549 |    "source": [
550 |     "test_doc"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "code",
555 |    "execution_count": 44,
556 |    "metadata": {},
557 |    "outputs": [],
558 |    "source": [
559 |     "def get_list_sentence_vectors_from_document(doc, model):\n",
560 |     "    \"\"\"Infer one vector for every sufficiently long sentence of `doc`.\n",
561 |     "\n",
562 |     "    The text is split on '.', fragments of 10 characters or fewer are skipped,\n",
563 |     "    and every remaining fragment is cleaned with gensim.utils.simple_preprocess,\n",
564 |     "    word-segmented with ViTokenizer and passed to model.infer_vector.\n",
565 |     "\n",
566 |     "    Args:\n",
567 |     "        doc: document text.\n",
568 |     "        model: object exposing infer_vector(tokens).\n",
569 |     "\n",
570 |     "    Returns:\n",
571 |     "        (vectors, sentences): np.array with one inferred vector per kept\n",
572 |     "        fragment, and the list of those raw fragments in the same order, so\n",
573 |     "        vectors[i] always belongs to sentences[i].\n",
574 |     "    \"\"\"\n",
575 |     "    vectors = []\n",
576 |     "    kept_sentences = []\n",
577 |     "    for raw_sen in doc.split('.'):\n",
578 |     "        if len(raw_sen) <= 10:\n",
579 |     "            continue\n",
580 |     "        tokens = gensim.utils.simple_preprocess(raw_sen)\n",
581 |     "        tokens = ViTokenizer.tokenize(' '.join(tokens)).split(' ')\n",
582 |     "        vectors.append(model.infer_vector(tokens))\n",
583 |     "        # Keep only fragments that received a vector: returning every split\n",
584 |     "        # fragment would shift the indices whenever a short one is skipped.\n",
585 |     "        kept_sentences.append(raw_sen)\n",
586 |     "\n",
587 |     "    return np.array(vectors), kept_sentences"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "code",
577 |    "execution_count": 45,
578 |    "metadata": {},
579 |    "outputs": [
580 |     {
581 |      "ename": "NameError",
582 |      "evalue": "name 'model' is not defined",
583 |      "output_type": "error",
584 |      "traceback": [
585 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
586 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
587 |       "\u001b[0;32m<ipython-input-45-2e6a5b890abc>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msen_vectors\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msens\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_list_sentence_vectors_from_document\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_doc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
588 |       "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined"
589 |      ]
590 |     }
591 |    ],
592 |    "source": [
593 |     "# `model` and `test_doc` must already be defined when this cell runs;\n",
594 |     "# otherwise the call below raises NameError.\n",
595 |     "sen_vectors, sens = get_list_sentence_vectors_from_document(test_doc, model=model)"
594 |    ]
595 |   },
596 |   {
597 |    "cell_type": "code",
598 |    "execution_count": 46,
599 |    "metadata": {},
600 |    "outputs": [
601 |     {
602 |      "ename": "NameError",
603 |      "evalue": "name 'sen_vectors' is not defined",
604 |      "output_type": "error",
605 |      "traceback": [
606 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
607 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
608 |       "\u001b[0;32m<ipython-input-46-8f05907afce0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msen_vectors\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
609 |       "\u001b[0;31mNameError\u001b[0m: name 'sen_vectors' is not defined"
610 |      ]
611 |     }
612 |    ],
613 |    "source": [
614 |     "sen_vectors.shape"
615 |    ]
616 |   },
617 |   {
618 |    "cell_type": "code",
619 |    "execution_count": 59,
620 |    "metadata": {},
621 |    "outputs": [],
622 |    "source": [
623 |     "X = sen_vectors"
624 |    ]
625 |   },
626 |   {
627 |    "cell_type": "code",
628 |    "execution_count": 62,
629 |    "metadata": {},
630 |    "outputs": [
631 |     {
632 |      "data": {
633 |       "text/plain": [
634 |        "GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,\n",
635 |        "        means_init=None, n_components=2, n_init=1, precisions_init=None,\n",
636 |        "        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,\n",
637 |        "        verbose_interval=10, warm_start=False, weights_init=None)"
638 |       ]
639 |      },
640 |      "execution_count": 62,
641 |      "metadata": {},
642 |      "output_type": "execute_result"
643 |     }
644 |    ],
645 |    "source": [
646 |     "from sklearn.cluster import KMeans\n",
647 |     "from sklearn.mixture import GaussianMixture\n",
648 |     "\n",
649 |     "n_clusters = 2\n",
650 |     "\n",
651 |     "# The summary cell reads kmeans.labels_ and kmeans.cluster_centers_, so\n",
652 |     "# K-Means has to be fitted here rather than left commented out.\n",
653 |     "kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)\n",
654 |     "\n",
655 |     "# Reuse n_clusters instead of a literal and fix the seed so that\n",
656 |     "# re-running the cell reproduces the same mixture.\n",
657 |     "gm = GaussianMixture(n_components=n_clusters, random_state=0)\n",
658 |     "gm.fit(X)"
654 |    ]
655 |   },
656 |   {
657 |    "cell_type": "code",
658 |    "execution_count": 64,
659 |    "metadata": {},
660 |    "outputs": [
661 |     {
662 |      "data": {
663 |       "text/plain": [
664 |        "array([0.07272727, 0.92727273])"
665 |       ]
666 |      },
667 |      "execution_count": 64,
668 |      "metadata": {},
669 |      "output_type": "execute_result"
670 |     }
671 |    ],
672 |    "source": [
673 |     "gm.weights_"
674 |    ]
675 |   },
676 |   {
677 |    "cell_type": "code",
678 |    "execution_count": 63,
679 |    "metadata": {},
680 |    "outputs": [
681 |     {
682 |      "name": "stdout",
683 |      "output_type": "stream",
684 |      "text": [
685 |       "\n",
686 |       "\n",
687 |       "Thắng bán kết AFF Cup 2018, Công Phượng hết &#34;lừa&#34; fan, Văn Toàn hứa hẹn trở lại - 5\n",
688 |       "\n",
689 |       "Người hâm mộ động viên tinh thần khi biết Văn Toàn sắp trở lại\n",
690 |       "\n",
691 |       "\n",
692 |       "Hai cầu thủ ghi bàn thắng trên sân Mỹ Đình tối qua là Quang Hải và Công Phượng\n"
693 |      ]
694 |     }
695 |    ],
696 |    "source": [
697 |     "from sklearn.metrics import pairwise_distances_argmin_min\n",
698 |     "\n",
699 |     "# Average sentence position of every cluster; the summary is ordered by it.\n",
700 |     "avg = [np.mean(np.where(kmeans.labels_ == j)[0]) for j in range(n_clusters)]\n",
701 |     "\n",
702 |     "# Row index in X of the vector nearest to each cluster centre.\n",
703 |     "closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)\n",
704 |     "\n",
705 |     "ordering = sorted(range(n_clusters), key=avg.__getitem__)\n",
706 |     "summary = []\n",
707 |     "for cluster in ordering:\n",
708 |     "    summary.append(sens[closest[cluster]])\n",
709 |     "\n",
710 |     "for sen in summary:\n",
711 |     "    print(sen)"
709 |    ]
710 |   },
711 |   {
712 |    "cell_type": "markdown",
713 |    "metadata": {},
714 |    "source": [
715 |     "## Text Rank"
716 |    ]
717 |   },
718 |   {
719 |    "cell_type": "code",
720 |    "execution_count": 2,
721 |    "metadata": {},
722 |    "outputs": [],
723 |    "source": [
724 |     "def build_index(links):\n",
725 |     "    \"\"\"Return a dict mapping each key of `links` to its position in iteration order.\"\"\"\n",
726 |     "    index = {}\n",
727 |     "    for position, website in enumerate(links.keys()):\n",
728 |     "        index[website] = position\n",
729 |     "    return index"
729 |    ]
730 |   },
731 |   {
732 |    "cell_type": "code",
733 |    "execution_count": 3,
734 |    "metadata": {},
735 |    "outputs": [],
736 |    "source": [
737 |     "import numpy as np\n",
738 |     " \n",
739 |     "def build_transition_matrix(links, index):\n",
740 |     "    \"\"\"Build the row-stochastic transition matrix of a link graph.\n",
741 |     "\n",
742 |     "    Args:\n",
743 |     "        links: dict mapping each page to the list of pages it links to.\n",
744 |     "        index: dict mapping each page to its row/column number.\n",
745 |     "\n",
746 |     "    Returns:\n",
747 |     "        Square np.ndarray A where A[i][j] is the probability of moving from\n",
748 |     "        page i to page j.\n",
749 |     "    \"\"\"\n",
750 |     "    n_pages = len(index)\n",
751 |     "    A = np.zeros((n_pages, n_pages))\n",
752 |     "    for webpage, destinations in links.items():\n",
753 |     "        row = index[webpage]\n",
754 |     "        if not destinations:\n",
755 |     "            # Dangling page: jump to any page with equal probability.\n",
756 |     "            A[row] = np.ones(n_pages) / n_pages\n",
757 |     "            continue\n",
758 |     "        weight = 1.0 / len(destinations)\n",
759 |     "        for dest_webpage in destinations:\n",
760 |     "            # Accumulate rather than assign so that a destination listed\n",
761 |     "            # several times keeps its full share and the row still sums to 1.\n",
762 |     "            A[row][index[dest_webpage]] += weight\n",
763 |     "\n",
764 |     "    return A"
753 |    ]
754 |   },
755 |   {
756 |    "cell_type": "code",
757 |    "execution_count": 4,
758 |    "metadata": {},
759 |    "outputs": [],
760 |    "source": [
761 |     "def pagerank(A, eps=0.0001, d=0.85, max_iter=1000):\n",
762 |     "    \"\"\"Rank the nodes of a transition matrix by power iteration.\n",
763 |     "\n",
764 |     "    Args:\n",
765 |     "        A: square np.ndarray; A[i][j] is the probability of moving from i to j.\n",
766 |     "        eps: stop once the L1 change between two iterations is <= eps.\n",
767 |     "        d: damping factor.\n",
768 |     "        max_iter: upper bound on the number of iterations, so that a matrix\n",
769 |     "            that never converges (for example one containing NaN) cannot\n",
770 |     "            hang the kernel.\n",
771 |     "\n",
772 |     "    Returns:\n",
773 |     "        np.ndarray with one score per row of A.\n",
774 |     "    \"\"\"\n",
775 |     "    n = len(A)\n",
776 |     "    teleport = np.ones(n) * (1 - d) / n\n",
777 |     "    P = np.ones(n) / n\n",
778 |     "    for _ in range(max_iter):\n",
779 |     "        new_P = teleport + d * A.T.dot(P)\n",
780 |     "        delta = np.abs(new_P - P).sum()\n",
781 |     "        P = new_P\n",
782 |     "        if delta <= eps:\n",
783 |     "            break\n",
784 |     "    return P"
769 |    ]
770 |   },
771 |   {
772 |    "cell_type": "code",
773 |    "execution_count": 5,
774 |    "metadata": {},
775 |    "outputs": [],
776 |    "source": [
777 |     "from nltk.corpus import brown, stopwords\n",
778 |     "from nltk.cluster.util import cosine_distance\n",
779 |     " \n",
780 |     "def sentence_similarity(sent1, sent2, stopwords=None):\n",
781 |     "    \"\"\"Cosine similarity of the word-count vectors of two token lists.\n",
782 |     "\n",
783 |     "    Tokens are lower-cased and tokens found in `stopwords` are ignored.\n",
784 |     "\n",
785 |     "    Args:\n",
786 |     "        sent1, sent2: lists of string tokens.\n",
787 |     "        stopwords: optional collection of lower-case tokens to ignore.\n",
788 |     "\n",
789 |     "    Returns:\n",
790 |     "        Float similarity; 0.0 when either sentence has no countable token\n",
791 |     "        (the cosine is undefined there and would otherwise be NaN).\n",
792 |     "    \"\"\"\n",
793 |     "    from collections import Counter\n",
794 |     "    from math import sqrt\n",
795 |     "\n",
796 |     "    ignored = set(stopwords) if stopwords else set()\n",
797 |     "    counts1 = Counter(w for w in (t.lower() for t in sent1) if w not in ignored)\n",
798 |     "    counts2 = Counter(w for w in (t.lower() for t in sent2) if w not in ignored)\n",
799 |     "\n",
800 |     "    norm1 = sqrt(sum(c * c for c in counts1.values()))\n",
801 |     "    norm2 = sqrt(sum(c * c for c in counts2.values()))\n",
802 |     "    if norm1 == 0 or norm2 == 0:\n",
803 |     "        return 0.0\n",
804 |     "\n",
805 |     "    # Counter gives O(1) lookups; a word missing from counts2 counts as 0.\n",
806 |     "    dot = sum(c * counts2[w] for w, c in counts1.items())\n",
807 |     "    return dot / (norm1 * norm2)"
805 |    ]
806 |   },
807 |   {
808 |    "cell_type": "code",
809 |    "execution_count": 6,
810 |    "metadata": {},
811 |    "outputs": [],
812 |    "source": [
813 |     "def build_similarity_matrix(sentences, stopwords=None):\n",
814 |     "    \"\"\"Build the row-normalised sentence similarity matrix.\n",
815 |     "\n",
816 |     "    Args:\n",
817 |     "        sentences: list of tokenised sentences (lists of words).\n",
818 |     "        stopwords: optional collection of words passed to sentence_similarity.\n",
819 |     "\n",
820 |     "    Returns:\n",
821 |     "        np.ndarray S of shape (n, n); S[i][j] is the similarity between\n",
822 |     "        sentences i and j divided by the total similarity of row i.\n",
823 |     "    \"\"\"\n",
824 |     "    n = len(sentences)\n",
825 |     "    S = np.zeros((n, n))\n",
826 |     "\n",
827 |     "    for idx1 in range(n):\n",
828 |     "        for idx2 in range(n):\n",
829 |     "            if idx1 == idx2:\n",
830 |     "                continue\n",
831 |     "            # Use the `stopwords` argument, not the notebook-level `stop_words`.\n",
832 |     "            S[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stopwords)\n",
833 |     "\n",
834 |     "    # Normalise the matrix row-wise.\n",
835 |     "    for idx in range(n):\n",
836 |     "        total = S[idx].sum()\n",
837 |     "        if total > 0:\n",
838 |     "            S[idx] /= total\n",
839 |     "        else:\n",
840 |     "            # No usable similarity in this row: spread the weight evenly\n",
841 |     "            # instead of dividing by zero and filling the row with NaN.\n",
842 |     "            S[idx] = np.ones(n) / n\n",
843 |     "\n",
844 |     "    return S"
830 |    ]
831 |   },
832 |   {
833 |    "cell_type": "code",
834 |    "execution_count": 7,
835 |    "metadata": {},
836 |    "outputs": [],
837 |    "source": [
838 |     "def get_list_of_sentences(doc):\n",
839 |     "    \"\"\"Split `doc` on '.' and tokenise every fragment longer than 10 characters.\n",
840 |     "\n",
841 |     "    Each kept fragment is cleaned with gensim.utils.simple_preprocess,\n",
842 |     "    word-segmented with ViTokenizer and returned as a list of tokens.\n",
843 |     "    \"\"\"\n",
844 |     "    tokenized = []\n",
845 |     "    for fragment in doc.split('.'):\n",
846 |     "        if len(fragment) <= 10:\n",
847 |     "            continue\n",
848 |     "        cleaned = ' '.join(gensim.utils.simple_preprocess(fragment))\n",
849 |     "        tokenized.append(ViTokenizer.tokenize(cleaned).split(' '))\n",
850 |     "\n",
851 |     "    return tokenized"
851 |    ]
852 |   },
853 |   {
854 |    "cell_type": "code",
855 |    "execution_count": 8,
856 |    "metadata": {},
857 |    "outputs": [
858 |     {
859 |      "ename": "NameError",
860 |      "evalue": "name 'gensim' is not defined",
861 |      "output_type": "error",
862 |      "traceback": [
863 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
864 |       "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
865 |       "\u001b[0;32m<ipython-input-8-601637eeac0e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0msentences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_list_of_sentences\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_doc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
866 |       "\u001b[0;32m<ipython-input-7-db93cc69b7d6>\u001b[0m in \u001b[0;36mget_list_of_sentences\u001b[0;34m(doc)\u001b[0m\n\u001b[1;32m      4\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0msen\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msens\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msen\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m             \u001b[0msen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msimple_preprocess\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m             \u001b[0msen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m' '\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m             \u001b[0msen\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mViTokenizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
867 |       "\u001b[0;31mNameError\u001b[0m: name 'gensim' is not defined"
868 |      ]
869 |     }
870 |    ],
871 |    "source": [
872 |     "# `gensim` and `ViTokenizer` must be imported and `test_doc` defined before\n",
873 |     "# this cell runs; otherwise get_list_of_sentences raises NameError.\n",
874 |     "sentences = get_list_of_sentences(test_doc)"
873 |    ]
874 |   },
875 |   {
876 |    "cell_type": "code",
877 |    "execution_count": null,
878 |    "metadata": {},
879 |    "outputs": [],
880 |    "source": [
881 |     "len(sentences)"
882 |    ]
883 |   },
884 |   {
885 |    "cell_type": "code",
886 |    "execution_count": 55,
887 |    "metadata": {},
888 |    "outputs": [],
889 |    "source": [
890 |     "stop_words = []\n",
891 |     "S = build_similarity_matrix(sentences, stop_words)    \n",
892 |     "# print(S)"
893 |    ]
894 |   },
895 |   {
896 |    "cell_type": "code",
897 |    "execution_count": 56,
898 |    "metadata": {},
899 |    "outputs": [],
900 |    "source": [
901 |     "from operator import itemgetter "
902 |    ]
903 |   },
904 |   {
905 |    "cell_type": "code",
906 |    "execution_count": 57,
907 |    "metadata": {},
908 |    "outputs": [
909 |     {
910 |      "name": "stdout",
911 |      "output_type": "stream",
912 |      "text": [
913 |       "1. bàn thắng của công phượng không khỏi khiến nhiều người nhớ đến pha bỏ lỡ không_tưởng của cầu_thủ này trận bán_kết lượt đi trên sân của đội_tuyển philippines hôm\n",
914 |       "2. thắng bán_kết aff cup công phượng hết lừa fan văn_toàn hứa_hẹn trở_lại công phượng đã không còn lừa người hâm_mộ khi ghi_bàn trong trận bán_kết lượt về aff cup\n"
915 |      ]
916 |     }
917 |    ],
918 |    "source": [
919 |     "def textrank(sentences, top_n=5, stopwords=None):\n",
920 |     "    \"\"\"Pick the `top_n` highest-ranked sentences, kept in document order.\n",
921 |     "\n",
922 |     "    Args:\n",
923 |     "        sentences: list of tokenised sentences (lists of words).\n",
924 |     "        top_n: number of sentences to keep.\n",
925 |     "        stopwords: optional collection of words ignored when comparing sentences.\n",
926 |     "\n",
927 |     "    Returns:\n",
928 |     "        Tuple with the selected sentences in their original order.\n",
929 |     "    \"\"\"\n",
930 |     "    # Pass the caller's stopwords on; the global `stop_words` may not exist.\n",
931 |     "    S = build_similarity_matrix(sentences, stopwords)\n",
932 |     "    sentence_ranks = pagerank(S)\n",
933 |     "\n",
934 |     "    # Sentence indexes from highest to lowest rank.\n",
935 |     "    ranked_sentence_indexes = sorted(range(len(sentence_ranks)), key=lambda i: -sentence_ranks[i])\n",
936 |     "    selected_sentences = sorted(ranked_sentence_indexes[:top_n])\n",
937 |     "    # Always build a tuple: itemgetter() hands back a bare item for a single\n",
938 |     "    # index and raises for none, which breaks top_n=1 and empty selections.\n",
939 |     "    return tuple(sentences[i] for i in selected_sentences)\n",
940 |     "\n",
941 |     "\n",
942 |     "for idx, sentence in enumerate(textrank(sentences, top_n=2, stopwords=[])):\n",
943 |     "    print(\"%s. %s\" % ((idx + 1), ' '.join(sentence)))"
931 |    ]
932 |   },
933 |   {
934 |    "cell_type": "code",
935 |    "execution_count": null,
936 |    "metadata": {},
937 |    "outputs": [],
938 |    "source": []
939 |   }
940 |  ],
941 |  "metadata": {
942 |   "kernelspec": {
943 |    "display_name": "Python 3",
944 |    "language": "python",
945 |    "name": "python3"
946 |   },
947 |   "language_info": {
948 |    "codemirror_mode": {
949 |     "name": "ipython",
950 |     "version": 3
951 |    },
952 |    "file_extension": ".py",
953 |    "mimetype": "text/x-python",
954 |    "name": "python",
955 |    "nbconvert_exporter": "python",
956 |    "pygments_lexer": "ipython3",
957 |    "version": "3.6.5"
958 |   }
959 |  },
960 |  "nbformat": 4,
961 |  "nbformat_minor": 2
962 | }
963 | 


--------------------------------------------------------------------------------