├── pythainlp
├── script_fairseq_eval_for_n_epochs.sh
├── run_fairseq.sh
├── tokenize.py
├── .gitignore
├── README.md
├── sandbox.ipynb
└── notebooks
    └── preprocess_opensubtitle_with_bpe.ipynb

/pythainlp:
--------------------------------------------------------------------------------
1 | /root/pythainlp/pythainlp
--------------------------------------------------------------------------------
/script_fairseq_eval_for_n_epochs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | start=$1
3 | end=$2
4 | step=$3
5 | data_path=$4
6 | checkpoint_prefix=$5
7 | beam=$6
8 | max_tokens=$7
9 | opts=$8
10 | 
11 | for (( i=$start; i<=$end; i+=$step ))
12 | do
13 |     echo "Evaluating BLEU at epoch $i of the model checkpoint: $checkpoint_prefix/checkpoint$i.pt";
14 |     echo "beam_size=$beam, max_tokens=$max_tokens";
15 | 
16 |     fairseq-generate $data_path \
17 |         --path ${checkpoint_prefix}/checkpoint${i}.pt \
18 |         --quiet \
19 |         --beam $beam \
20 |         --max-tokens $max_tokens $opts > ${checkpoint_prefix}/result_checkpoint_${i}.txt
21 | 
22 |     echo "Done evaluating epoch $i";
23 | 
24 | 
25 | done
--------------------------------------------------------------------------------
/run_fairseq.sh:
--------------------------------------------------------------------------------
1 | fairseq-preprocess --source-lang en --target-lang th \
2 |     --trainpref data/opensubtitles_tok/train \
3 |     --validpref data/opensubtitles_tok/valid \
4 |     --testpref data/opensubtitles_tok/test \
5 |     --destdir data/opensubtitles_bin
6 | 
7 | fairseq-train \
8 |     data/opensubtitles_bin \
9 |     --arch transformer_iwslt_de_en --max-epoch 10 \
10 |     --share-decoder-input-output-embed \
11 |     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
12 |     --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
13 |     --dropout 0.3 --weight-decay 0.0001 \
14 |     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
15 |     --max-tokens 2048 \
16 |     --bpe sentencepiece \
17 |     --memory-efficient-fp16
18 | 
19 | fairseq-generate data/opensubtitles_bin \
20 |     --path checkpoints/checkpoint_best.pt \
21 |     --remove-bpe --beam 5 --max-tokens 2048
22 | 
23 | # | Translated 328154 sentences (2773243 tokens) in 1517.1s (216.31 sentences/s, 1828.01 tokens/s)
24 | # | Generate test with beam=5: BLEU4 = 10.80, 36.6/15.3/7.2/3.4 (BP=1.000, ratio=1.029, syslen=2445089, reflen=2376306)
--------------------------------------------------------------------------------
/tokenize.py:
--------------------------------------------------------------------------------
1 | # usage: python tokenize.py data/opensubtitles/OpenSubtitles.en-th.en data/opensubtitles/OpenSubtitles.en-th.th data/opensubtitles_tok/
2 | 
3 | from pythainlp.tokenize import word_tokenize
4 | # from pythainlp.ulmfit import *
5 | from tqdm import tqdm
6 | import random
7 | import sys
8 | 
9 | # open files; sys.argv[0] is the script name, so the input files start at argv[1]
10 | with open(sys.argv[1], 'r') as f:
11 |     en = f.readlines()
12 | print('English raw:', len(en), en[:3])
13 | 
14 | with open(sys.argv[2], 'r') as f:
15 |     th = f.readlines()
16 | print('Thai raw:', len(th), th[:3])
17 | 
18 | # tokenize (plain tqdm here; tqdm_notebook is only for notebooks)
19 | en_tok = []
20 | for e in tqdm(en):
21 |     en_tok.append(' '.join(word_tokenize(e, keep_whitespace=False)))
22 | 
23 | th_tok = []
24 | for t in tqdm(th):
25 |     th_tok.append(' '.join(word_tokenize(t)))
26 |     # th_tok.append(' '.join(process_thai(t)))
27 | 
28 | # train-valid-test split 80/10/10
29 | n = len(th_tok)
30 | idx = list(range(n))
31 | random.shuffle(idx)
32 | train_idx, valid_idx, test_idx = idx[:int(n*0.8)], idx[int(n*0.8):int(n*0.9)], idx[int(n*0.9):]
33 | print('train/valid/test:', len(train_idx), len(valid_idx), len(test_idx))
34 | 
35 | # build the splits for both languages
36 | en_train = [en_tok[i] for i in train_idx]
37 | th_train = [th_tok[i] for i in train_idx]
38 | print('English tokenized train', len(en_train), en_train[:10])
39 | print('Thai tokenized train', len(th_train), th_train[:10])
40 | en_valid = [en_tok[i] for i in valid_idx]
41 | th_valid = [th_tok[i] for i in valid_idx]
42 | en_test = [en_tok[i] for i in test_idx]
43 | th_test = [th_tok[i] for i in test_idx]
44 | 
45 | # save the tokenized splits; the output directory is argv[3]
46 | with open(f'{sys.argv[3]}/train.en', 'w') as f:
47 |     for e in en_train:
48 |         f.write(e)
49 | with open(f'{sys.argv[3]}/train.th', 'w') as f:
50 |     for t in th_train:
51 |         f.write(t)
52 | with open(f'{sys.argv[3]}/valid.en', 'w') as f:
53 |     for e in en_valid:
54 |         f.write(e)
55 | with open(f'{sys.argv[3]}/valid.th', 'w') as f:
56 |     for t in th_valid:
57 |         f.write(t)
58 | with open(f'{sys.argv[3]}/test.en', 'w') as f:
59 |     for e in en_test:
60 |         f.write(e)
61 | with open(f'{sys.argv[3]}/test.th', 'w') as f:
62 |     for t in th_test:
63 |         f.write(t)
64 | print(f'saved to {sys.argv[3]}')
--------------------------------------------------------------------------------
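Before running `fairseq-preprocess` (see `run_fairseq.sh` above) on the files written by `tokenize.py`, it is worth confirming that each split is still line-aligned across the two languages, since fairseq pairs source and target sentences by line number. A minimal sanity-check sketch, assuming the `data/opensubtitles_tok` output directory used throughout this repo:

```python
# Check that each split has the same number of lines in both languages;
# fairseq-preprocess pairs source/target sentences by line number.
from pathlib import Path

tok_dir = Path('data/opensubtitles_tok')  # output dir passed to tokenize.py
for split in ('train', 'valid', 'test'):
    with open(tok_dir / f'{split}.en', encoding='utf-8') as f:
        n_en = sum(1 for _ in f)
    with open(tok_dir / f'{split}.th', encoding='utf-8') as f:
        n_th = sum(1 for _ in f)
    assert n_en == n_th, f'{split}: {n_en} en lines != {n_th} th lines'
    print(split, n_en, 'sentence pairs')
```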
/.gitignore:
--------------------------------------------------------------------------------
1 | #data
2 | data/
3 | 
4 | #subword-nmt
5 | subword-nmt/
6 | 
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 | 
12 | # C extensions
13 | *.so
14 | 
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | pip-wheel-metadata/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 | 
36 | # PyInstaller
37 | #  Usually these files are written by a python script from a template
38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 | 
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 | 
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | .hypothesis/
57 | .pytest_cache/
58 | 
59 | # Translations
60 | *.mo
61 | *.pot
62 | 
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 | 
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 | 
73 | # Scrapy stuff:
74 | .scrapy
75 | 
76 | # Sphinx documentation
77 | docs/_build/
78 | 
79 | # PyBuilder
80 | target/
81 | 
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 | 
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 | 
89 | # pyenv
90 | .python-version
91 | 
92 | # pipenv
93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | #   install all needed dependencies.
97 | #Pipfile.lock
98 | 
99 | # celery beat schedule file
100 | celerybeat-schedule
101 | 
102 | # SageMath parsed files
103 | *.sage.py
104 | 
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 | 
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 | 
118 | # Rope project settings
119 | .ropeproject
120 | 
121 | # mkdocs documentation
122 | /site
123 | 
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 | 
129 | # Pyre type checker
130 | .pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mt-opus
2 | English-Thai Machine Translation with OPUS data
3 | 
4 | ## Data
5 | We used 9 datasets from [OPUS](http://opus.nlpl.eu/index.php) to train and validate our models within and across domains (5.4M sentence pairs in total; 68.8M English tokens and 53.1M Thai tokens).
6 | 
7 | | dataset | sentence pairs | EN tokens | TH tokens | description | reference |
8 | |---------|----------------|-----------|-----------|-------------|-----------|
9 | | [OpenSubtitles v2018](http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/en-th.txt.zip) | 3.5M | 28.4M | 7.8M | crowdsourced subtitles | [1] |
10 | | [JW300 v1](http://opus.nlpl.eu/JW300-v1.php) [en](https://object.pouta.csc.fi/OPUS-JW300/v1/raw/en.zip) [th](https://object.pouta.csc.fi/OPUS-JW300/v1/raw/th.zip) | 0.8M | 14.9M | 34.6M | Jehovah's Witnesses website | [2], [3] |
11 | | [GNOME v1](https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-th.txt.zip) | 0.5M | 2.3M | 3.5M | GNOME documentation | [2] |
12 | | [QED v2.0a](https://object.pouta.csc.fi/OPUS-QED/v2.0a/moses/en-th.txt.zip) | 0.3M | 4.7M | 1.2M | crowdsourced educational subtitles | [2] |
13 | | [bible-uedin v1](https://object.pouta.csc.fi/OPUS-bible-uedin/v1/moses/en-th.txt.zip) | 0.1M | 3.6M | 2.1M | the Bible | [2], [4] |
14 | | [Tanzil v1](https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-th.txt.zip) | 93.5k | 2.8M | 3.4M | the Quran | [2] |
15 | | [KDE4 v2](https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-th.txt.zip) | 92.0k | 0.5M | 0.2M | KDE4 documentation | [2] |
16 | | [Ubuntu v14.10](https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-th.txt.zip) | 46.6k | 0.4M | 0.2M | Ubuntu documentation | [2] |
17 | | [Tatoeba v20190709](https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-th.txt.zip) | 1.1k | 6k | 1.7k | crowdsourced translations | [2] |
18 | 
19 | ## Models
20 | 
21 | ## Results
22 | 
23 | # References
24 | * [1] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016)
25 | * [2] J. Tiedemann, 2012, Parallel Data, Tools and Interfaces in OPUS. In Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)
26 | * [3] Ž. Agić and I. Vulić, 2019, JW300: A Wide-Coverage Parallel Corpus for Low-Resource Languages. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL 2019)
27 | * [4] C. Christodoulopoulos and M. Steedman, 2015, A Massively Parallel Corpus: the Bible in 100 Languages. In Language Resources and Evaluation, 49
--------------------------------------------------------------------------------
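The moses-format zip files linked in the table above can be fetched and unpacked programmatically. A minimal sketch, assuming the OpenSubtitles URL from the table serves the archive directly and that the local `data/opensubtitles` directory used by the notebooks is the target:

```python
# Download and unpack one OPUS moses-format zip (URL taken from the table above).
import io
import urllib.request
import zipfile
from pathlib import Path

url = 'http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/en-th.txt.zip'
out_dir = Path('data/opensubtitles')
out_dir.mkdir(parents=True, exist_ok=True)

with urllib.request.urlopen(url) as resp:
    archive = zipfile.ZipFile(io.BytesIO(resp.read()))
archive.extractall(out_dir)
# Expect OpenSubtitles.en-th.en / OpenSubtitles.en-th.th plus metadata files.
print(archive.namelist())
```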
141 | }, 142 | "execution_count": 9, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "#train-valid-test split 80/10/10\n", 149 | "n = len(th_tok)\n", 150 | "idx = list(range(n))\n", 151 | "random.shuffle(idx)\n", 152 | "train_idx, valid_idx, test_idx = idx[:int(n*0.8)], idx[int(n*0.8):int(n*0.9)], idx[int(n*0.9):]\n", 153 | "len(train_idx),len(valid_idx),len(test_idx)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 11, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "(2625226,\n", 165 | " ['ไปให้พ้น ไอ้ ลูก หมา \\n',\n", 166 | " 'โชค ดีแล้ว นะ \\n',\n", 167 | " 'ไม่เอา น่า ทำ อะไร หน่อย สิ ! \\n',\n", 168 | " 'หันไป \\n',\n", 169 | " 'หงุดหงิด อย่างแรง \\n',\n", 170 | " 'ฉัน ตรวจดู หมาย เลขที่ โทร มา \\n',\n", 171 | " 'ไม่ สามารถ มี สี่ คน ที่ อาศัย อยู่ ที่นี่ \\n',\n", 172 | " 'เขา เป็น กษัตริย์ ของ เรา \\n',\n", 173 | " 'ไม่ได้ บอก ที่ บ้าน ใช่ไหม ว่า ถึง แล้ว \\n',\n", 174 | " 'เรา ต้อง มีทาง อื่น ที่จะ เข้าไป ได้ สิ น่า \\n'])" 175 | ] 176 | }, 177 | "execution_count": 11, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "th_train = [th_tok[i] for i in train_idx]\n", 184 | "len(th_train), th_train[:10]" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 12, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "(2625226,\n", 196 | " ['Go away , son of a bitch \\n',\n", 197 | " 'Well , good for them . \\n',\n", 198 | " 'Come on , do something ! \\n',\n", 199 | " 'Turn around . \\n',\n", 200 | " 'Talk about a killer cappuccino . What is this thing ? \\n',\n", 201 | " \"I ' ve checked the call record . \\n\",\n", 202 | " \"There ' s like a hundred houses . \\n\",\n", 203 | " \"He ' s our king . \\n\",\n", 204 | " \"You don ' t have to call home about arriving ? \\n\",\n", 205 | " \"There ' s gotta be some other way we can get in . \\n\"])" 206 | ] 207 | }, 208 | "execution_count": 12, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "en_train = [en_tok[i] for i in train_idx]\n", 215 | "len(en_train), en_train[:10]" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 13, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "with open('data/opensubtitles_tok/train.en','w') as f:\n", 225 | " for e in en_train:\n", 226 | " f.write(e)\n", 227 | "with open('data/opensubtitles_tok/train.th','w') as f:\n", 228 | " for t in th_train:\n", 229 | " f.write(t)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 14, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "(328153,\n", 241 | " ['ผม ต้อง แถลงการณ์ \\n',\n", 242 | " 'แล้ วจะ ให้ รัก ผม มั้ย ล่ะ \\n',\n", 243 | " 'ก็ นะ มัน เข้ากับ คุณ ดี \\n',\n", 244 | " 'เดี๋ยวก่อน นะ นั่น มัน บ้าน ของ ครอบครัว ฉัน \\n',\n", 245 | " 'หลังจาก 2 - 3 ชม. พวก มัน จะ เริ่ม เหนียว ข้น และ แห้ง \\n',\n", 246 | " 'อย่า เพิ่ง . 
\\n',\n", 247 | " 'เยี่ยม \\n',\n", 248 | " 'สัปดาห์ แรก ของ กันยายน มะเขือเทศ ใน เบ เก อร ์ฟิลด์ \\n',\n", 249 | " 'บำบัด โรค โดย การ สะกดจิต \\n',\n", 250 | " 'ผม เริ่ม จาก กระเป๋า เศษ เหรียญ \\\\ แล้ว มัน ก็ เพิ่มขึ้น เรื่อยๆ นับ จากนั้น \\n'])" 251 | ] 252 | }, 253 | "execution_count": 14, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "th_valid = [th_tok[i] for i in valid_idx]\n", 260 | "len(th_valid), th_valid[:10]" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 15, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "(328153,\n", 272 | " ['I need to make a statement . \\n',\n", 273 | " \"Gonna give me some lovin '? \\n\",\n", 274 | " \"Well , it ' s working for you . \\n\",\n", 275 | " 'Wait a minute . \\n',\n", 276 | " 'After a few hours , they begin to get cloudy and wilt . \\n',\n", 277 | " 'Notyet . \\n',\n", 278 | " '- Excellent . \\n',\n", 279 | " 'First week of september , tomatoes in bakersfield . \\n',\n", 280 | " 'A hypnotherapist . \\n',\n", 281 | " 'It started with coin purses and sort of went on from there , really . \\n'])" 282 | ] 283 | }, 284 | "execution_count": 15, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "en_valid = [en_tok[i] for i in valid_idx]\n", 291 | "len(en_valid), en_valid[:10]" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 16, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "with open('data/opensubtitles_tok/valid.en','w') as f:\n", 301 | " for e in en_valid:\n", 302 | " f.write(e)\n", 303 | "with open('data/opensubtitles_tok/valid.th','w') as f:\n", 304 | " for t in th_valid:\n", 305 | " f.write(t)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 17, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "(328154,\n", 317 | " ['พวก นั้น กำลัง มา แม่ \\n',\n", 318 | " 'เอา ห น่า แฮ รี่ นาย มี แล้ว นิ มังกร ไง ถ้า นาย คิด ว่า จะ หาคู่ เดท ได้ นะ ฉัน คิด ว่า ฉัน ไป กับ มังกร ดีกว่า \\n',\n", 319 | " 'ฉัน เสียใจ \\n',\n", 320 | " 'มัน เรื่อง อะไร ของ แก ? \\n',\n", 321 | " 'แต่ เรา จะ ลอง ทำ ดู \\n',\n", 322 | " 'ไป กัน เถอะ \\n',\n", 323 | " 'ชาร์ท ที่ 200 เม ก เม ก \\n',\n", 324 | " 'ตกลง มั้ย ? \\n',\n", 325 | " 'เรา มี ความสัมพันธ์ ค่อนข้างจะ ซับซ้อน น่ะ \\n',\n", 326 | " '100 ศพ ดึง ออก มาจาก ที่เกิดเหตุ \\n'])" 327 | ] 328 | }, 329 | "execution_count": 17, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "th_test = [th_tok[i] for i in test_idx]\n", 336 | "len(th_test), th_test[:10]" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 18, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "(328154,\n", 348 | " [\"They ' re coming , mother . No . \\n\",\n", 349 | " \"Come on , Harry , you have slain a dragon lf you want to get a date you can I think I ' ll take the dragon right now \\n\",\n", 350 | " \"I ' m sorry . \\n\",\n", 351 | " 'WHAT THE HELL IS YOUR PROBLEM ? \\n',\n", 352 | " \"But we ' ll give it a try . \\n\",\n", 353 | " \"Let ' s go . \\n\",\n", 354 | " 'Megan ? Megan ! \\n',\n", 355 | " 'Okay ? \\n',\n", 356 | " 'We have a ... very complicated relationship . \\n',\n", 357 | " '100 more bodies pulled from the arena . 
\\n'])" 358 | ] 359 | }, 360 | "execution_count": 18, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "en_test = [en_tok[i] for i in test_idx]\n", 367 | "len(en_test), en_test[:10]" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 19, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "with open('data/opensubtitles_tok/test.en','w') as f:\n", 377 | " for e in en_test:\n", 378 | " f.write(e)\n", 379 | " \n", 380 | "with open('data/opensubtitles_tok/test.th','w') as f:\n", 381 | " for t in th_test:\n", 382 | " f.write(t)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 91, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "# !fairseq-preprocess --source-lang en --target-lang th \\\n", 392 | "# --trainpref data/opensubtitles_tok/train \\\n", 393 | "# --validpref data/opensubtitles_tok/valid \\\n", 394 | "# --testpref data/opensubtitles_tok/test \\\n", 395 | "# --destdir data/opensubtitles_bin\n" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 90, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# | [en] data/opensubtitles_tok/train.en: 2625226 sents, 25945418 tokens, 0.0% replaced by \n", 405 | "# | [en] Dictionary: 173623 types\n", 406 | "# | [en] data/opensubtitles_tok/valid.en: 328153 sents, 3238427 tokens, 0.304% replaced by \n", 407 | "# | [en] Dictionary: 173623 types\n", 408 | "# | [en] data/opensubtitles_tok/test.en: 328154 sents, 3235608 tokens, 0.314% replaced by \n", 409 | "# | [th] Dictionary: 116495 types\n", 410 | "# | [th] data/opensubtitles_tok/train.th: 2625226 sents, 21658577 tokens, 0.0% replaced by \n", 411 | "# | [th] Dictionary: 116495 types\n", 412 | "# | [th] data/opensubtitles_tok/valid.th: 328153 sents, 2705475 tokens, 0.262% replaced by \n", 413 | "# | [th] Dictionary: 116495 types\n", 414 | "# | [th] data/opensubtitles_tok/test.th: 328154 sents, 2701605 tokens, 0.257% replaced by " 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 98, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "!fairseq-train \\\n", 424 | " data/opensubtitles_bin \\\n", 425 | " --arch transformer_iwslt_de_en --share-decoder-input-output-embed \\\n", 426 | " --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \\\n", 427 | " --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \\\n", 428 | " --dropout 0.3 --weight-decay 0.0001 \\\n", 429 | " --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \\\n", 430 | " --max-tokens 2048 \\\n", 431 | " --bpe sentencepiece \\\n", 432 | " --memory-efficient-fp16\n", 433 | " --save-dir data/opensubtitles_model/transformers" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# fairseq-generate data/opensubtitles_bin \\\n", 443 | "# --path data/opensubtitles_model/transformers/checkpoint_best.pt \\\n", 444 | "# --beam 5 --remove-bpe" 445 | ] 446 | } 447 | ], 448 | "metadata": { 449 | "kernelspec": { 450 | "display_name": "Python 3", 451 | "language": "python", 452 | "name": "python3" 453 | }, 454 | "language_info": { 455 | "codemirror_mode": { 456 | "name": "ipython", 457 | "version": 3 458 | }, 459 | "file_extension": ".py", 460 | "mimetype": "text/x-python", 461 | "name": "python", 462 | "nbconvert_exporter": "python", 463 | "pygments_lexer": "ipython3", 464 | "version": "3.6.8" 465 | } 466 | }, 467 | 
"nbformat": 4, 468 | "nbformat_minor": 2 469 | } 470 | -------------------------------------------------------------------------------- /notebooks/preprocess_opensubtitle_with_bpe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 83, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "2.1\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# coding=utf-8\n", 18 | "import sys\n", 19 | "sys.path.append('..')\n", 20 | "\n", 21 | "import os\n", 22 | "import io\n", 23 | "import random\n", 24 | "import copy \n", 25 | "import re\n", 26 | "import html\n", 27 | "\n", 28 | "from time import time \n", 29 | "from multiprocessing import Pool\n", 30 | "from collections import Counter\n", 31 | "\n", 32 | "from functools import partial\n", 33 | "\n", 34 | "from tqdm import tqdm_notebook\n", 35 | "import pythainlp\n", 36 | "from pythainlp.util import *\n", 37 | "from pythainlp.tokenize import word_tokenize\n", 38 | "from pythainlp.ulmfit import *\n", 39 | "\n", 40 | "# subword-nmt\n", 41 | "from subword_nmt import learn_bpe as learner\n", 42 | "from subword_nmt import apply_bpe as subword_tokenizer\n", 43 | "\n", 44 | "import fairseq \n", 45 | "from datetime import timedelta\n", 46 | "from tqdm import tqdm, tqdm_notebook\n", 47 | "from pythainlp.tokenize import DEFAULT_DICT_TRIE\n", 48 | "\n", 49 | "from pythainlp.corpus import thai_words\n", 50 | "\n", 51 | "print(pythainlp.__version__)\n", 52 | "assert pythainlp.__version__ == '2.1'\n", 53 | "\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 84, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# เขียนใหม่ เอาแบบ initializer รับ wordlist มา\n", 63 | "class Trie:\n", 64 | " class Node(object):\n", 65 | " __slots__ = 'end', 'children'\n", 66 | " def __init__(self):\n", 67 | " self.end = False \n", 68 | " self.children = {}\n", 69 | " \n", 70 | " def __init__(self, words):\n", 71 | " self.words = words\n", 72 | " self.root = Trie.Node()\n", 73 | " for word in words:\n", 74 | " cur = self.root\n", 75 | " for ch in word: \n", 76 | " node = cur.children.get(ch)\n", 77 | " if not node: \n", 78 | " node = Trie.Node() \n", 79 | " cur.children[ch] = node \n", 80 | " cur = node\n", 81 | " cur.end = True \n", 82 | " def prefixes(self, text):\n", 83 | " res = []\n", 84 | " cur = self.root\n", 85 | " for i, ch in enumerate(text):\n", 86 | " node = cur.children.get(ch)\n", 87 | " if not node: break\n", 88 | " if node.end:\n", 89 | " res.append(text[:i+1])\n", 90 | " cur = node\n", 91 | " return res\n", 92 | " \n", 93 | " def __contains__(self, key):\n", 94 | " return key in self.words\n", 95 | " def __iter__(self):\n", 96 | " yield from self.words" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 85, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "323 ms ± 802 µs per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n"
109 |      ]
110 |     }
111 |    ],
112 |    "source": [
113 |     "%%timeit\n",
114 |     "pt = Trie(thai_words())\n",
115 |     "\n"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 86,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "data": {
125 |       "text/plain": [
126 |        "['กา', 'กาก', 'กากี']"
127 |       ]
128 |      },
129 |      "execution_count": 86,
130 |      "metadata": {},
131 |      "output_type": "execute_result"
132 |     }
133 |    ],
134 |    "source": [
135 |     "pt.prefixes('กากี่')"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 87,
141 |    "metadata": {},
142 |    "outputs": [
143 |     {
144 |      "name": "stdout",
145 |      "output_type": "stream",
146 |      "text": [
147 |       "\u001b[33mYou are using pip version 19.0.3, however version 19.2.3 is available.\r\n",
148 |       "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\r\n"
149 |      ]
150 |     }
151 |    ],
152 |    "source": [
153 |     "# install BPEmb (BPE embeddings)\n",
154 |     "\n",
155 |     "!pip install --q bpemb"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 88,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "from bpemb import BPEmb\n",
165 |     "\n",
166 |     "bpemb_pretrained = {\n",
167 |     "    'th': {\n",
168 |     "        '25000': BPEmb(lang=\"th\", vs=25000)\n",
169 |     "    },\n",
170 |     "    'en': {\n",
171 |     "        '25000': BPEmb(lang=\"en\", vs=25000)\n",
172 |     "    }\n",
173 |     "}\n"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 89,
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "data": {
183 |       "text/plain": [
184 |        "(3281534,\n",
185 |        " ['Slave in the Magic Mirror, come from the farthest space.',\n",
186 |        "  'Through wind and darkness, I summon thee.',\n",
187 |        "  'Speak!'])"
188 |       ]
189 |      },
190 |      "execution_count": 89,
191 |      "metadata": {},
192 |      "output_type": "execute_result"
193 |     }
194 |    ],
195 |    "source": [
196 |     "with open('../data/opensubtitles/OpenSubtitles.en-th.en','r', encoding='utf-8') as f:\n",
197 |     "    en = f.read().split('\\n')\n",
198 |     "len(en),en[:3]\n"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": 90,
204 |    "metadata": {},
205 |    "outputs": [
206 |     {
207 |      "data": {
208 |       "text/plain": [
209 |        "(3281534,\n",
210 |        " ['ทาสในกระจกวิเศษ, มาจากพื้นที่ที่ไกลที่สุด',\n",
211 |        "  'ผ่านลมและความมืดฉันเรียกเจ้า',\n",
212 |        "  'พูด!'])"
213 |       ]
214 |      },
215 |      "execution_count": 90,
216 |      "metadata": {},
217 |      "output_type": "execute_result"
218 |     }
219 |    ],
220 |    "source": [
221 |     "with open('../data/opensubtitles/OpenSubtitles.en-th.th','r', encoding='utf-8') as f:\n",
222 |     "    th = f.read().split('\\n')\n",
223 |     "    \n",
224 |     "len(th),th[:3]"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 91,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "# preprocess text\n",
234 |     "\n",
235 |     "LIST_OF_UNKNOWN_TOKENS = [b'\\x98\\xc2', b'\\xae\\xc2', b'\\x99\\xc2', b'\\xb1\\xc2', b'\\xc2\\xb7', b'\\xc3\\x83']\n",
236 |     "LIST_OF_TOKENS_TO_REPLACE = ['™', '„', '​', '”', '–', '“', '…"
237 |     "', '—', 'โ€', '​',\n",
238 |     "                             '♪', '{\\ cHFFFFFF }', '§', 'font color = \"# 808080 \"']\n",
239 |     "def unescape_string(text):\n",
240 |     "    return html.unescape(text)\n",
241 |     "\n",
242 |     "def sentences_filter(sentences, lang):\n",
243 |     "    indices = []\n",
244 |     "    for index, sentence in tqdm_notebook(enumerate(sentences), total=len(sentences)):\n",
245 |     "        for token in LIST_OF_UNKNOWN_TOKENS:\n",
246 |     "            if token in sentence.encode('utf-8'):\n",
indices.append(index)\n", 248 | " break\n", 249 | " if len(sentence) <= 1:\n", 250 | " indices.append(index)\n", 251 | " continue\n", 252 | " if lang == 'th' and countthai(sentence, ignore_chars='') == 0.0:\n", 253 | " indices.append(index)\n", 254 | " continue\n", 255 | " return indices\n", 256 | "\n", 257 | "def clean_sentence(sentence):\n", 258 | " for token in LISY_OF_TOKENS_TO_REPLACE:\n", 259 | " sentence = sentence.replace(token, '')\n", 260 | " sentence = unescape_string(sentence)\n", 261 | " \n", 262 | " return sentence" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 92, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "\n", 272 | "def tokenize_worker(sentence, lang, trie):\n", 273 | " \n", 274 | " _tokenizer_newmm = partial(pythainlp.tokenize.word_tokenize, engine='newmm',\n", 275 | " keep_whitespace=False,\n", 276 | " custom_dict=(trie if trie != None else DEFAULT_DICT_TRIE))\n", 277 | " return ' '.join(_tokenizer_newmm(sentence))\n", 278 | " \n", 279 | "def tokenize_handler(sentences, lang, trie=None):\n", 280 | " toks = []\n", 281 | " p = Pool(12)\n", 282 | " t = time()\n", 283 | " _tokenize_worker = partial(tokenize_worker, lang=lang, trie=trie)\n", 284 | " toks = p.map(_tokenize_worker, sentences)\n", 285 | " \n", 286 | " p.close()\n", 287 | " p.join() # call Pool.join() to wait for the worker processes to terminate.\n", 288 | "\n", 289 | " print('{} s'.format(time() -t))\n", 290 | "\n", 291 | " return toks\n", 292 | " " 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 100, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "def write_spaced_tokens_to_file(data, folder_name, filename):\n", 302 | " with open('/root/mt-opus/data/{}/{}'.format(folder_name, filename),'w') as f:\n", 303 | " for item in data:\n", 304 | " f.write(item + '\\n')\n", 305 | " \n", 306 | " " 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 101, 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "application/vnd.jupyter.widget-view+json": { 317 | "model_id": "60a71371634346f08d65cb6cacf94b02", 318 | "version_major": 2, 319 | "version_minor": 0 320 | }, 321 | "text/plain": [ 322 | "HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))" 323 | ] 324 | }, 325 | "metadata": {}, 326 | "output_type": "display_data" 327 | }, 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "\n" 333 | ] 334 | }, 335 | { 336 | "data": { 337 | "application/vnd.jupyter.widget-view+json": { 338 | "model_id": "425025bfbc3e46e193943c9ff6f05568", 339 | "version_major": 2, 340 | "version_minor": 0 341 | }, 342 | "text/plain": [ 343 | "HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))" 344 | ] 345 | }, 346 | "metadata": {}, 347 | "output_type": "display_data" 348 | }, 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "\n", 354 | "76544\n", 355 | "4392\n", 356 | "111.4701018333435 seconds\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "t = time()\n", 362 | "indices_to_filter_out_th = sentences_filter(th, lang='th')\n", 363 | "indices_to_filter_out_en = sentences_filter(en, lang='en')\n", 364 | "\n", 365 | "print(len(indices_to_filter_out_th))\n", 366 | "print(len(indices_to_filter_out_en))\n", 367 | "\n", 368 | "indices_to_filter_out = indices_to_filter_out_th + indices_to_filter_out_en\n", 369 | "indices_to_filter_out = set(indices_to_filter_out)\n", 370 | "\n", 371 | "\n", 372 | "filtered_th = 
[clean_sentence(x) for i, x in enumerate(th) if i not in indices_to_filter_out]\n",
373 |     "filtered_en = [clean_sentence(x) for i, x in enumerate(en) if i not in indices_to_filter_out]\n",
374 |     "\n",
375 |     "print('{} seconds'.format(time() - t))\n"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "code",
380 |    "execution_count": 95,
381 |    "metadata": {},
382 |    "outputs": [
383 |     {
384 |      "data": {
385 |       "text/plain": [
386 |        "'ทาสในกระจกวิเศษ, มาจากพื้นที่ที่ไกลที่สุด'"
387 |       ]
388 |      },
389 |      "execution_count": 95,
390 |      "metadata": {},
391 |      "output_type": "execute_result"
392 |     }
393 |    ],
394 |    "source": [
395 |     "filtered_th[0]"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": null,
401 |    "metadata": {},
402 |    "outputs": [],
403 |    "source": []
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 102,
408 |    "metadata": {},
409 |    "outputs": [],
410 |    "source": [
411 |     "toks = {\n",
412 |     "    'th': {\n",
413 |     "        'sentencepiece': [],\n",
414 |     "        'newmm':[]\n",
415 |     "    },\n",
416 |     "    'en': {\n",
417 |     "        'sentencepiece': [],\n",
418 |     "        'newmm':[]\n",
419 |     "    }\n",
420 |     "}"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "markdown",
425 |    "metadata": {},
426 |    "source": [
427 |     "## 1a Segment texts into tokens with `newmm`"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": null,
433 |    "metadata": {},
434 |    "outputs": [],
435 |    "source": []
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": 103,
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "# toks['th']['newmm'] = tokenize_handler(filtered_th[:10000], lang='th')\n",
444 |     "\n",
445 |     "# toks['en']['newmm'] = tokenize_handler(filtered_en[:10000], lang='en')"
446 |    ]
447 |   },
448 |   {
449 |    "cell_type": "code",
450 |    "execution_count": 104,
451 |    "metadata": {},
452 |    "outputs": [],
453 |    "source": [
454 |     "# # test with Python Trie\n",
455 |     "# t = time()\n",
456 |     "# for sent in filtered_th[:100000]:\n",
457 |     "#     toks = pythainlp.tokenize.word_tokenize(text=sent,\n",
458 |     "#                                             engine='newmm',\n",
459 |     "#                                             keep_whitespace=False,\n",
460 |     "#                                             custom_dict=pt)\n",
461 |     "    \n",
462 |     "# print('{} s'.format(time() - t))\n",
463 |     "# # toks['th']['newmm'] = tokenize_handler(filtered_th[:10000], lang='th', trie=pt)\n",
464 |     "\n",
465 |     "# # toks['en']['newmm'] = tokenize_handler(filtered_en[:10000], lang='en', trie=pt)"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": 105,
471 |    "metadata": {},
472 |    "outputs": [],
473 |    "source": [
474 |     "# # test with Marisa Trie\n",
475 |     "# t = time()\n",
476 |     "# for sent in filtered_th[:100000]:\n",
477 |     "#     toks = pythainlp.tokenize.word_tokenize(text=sent,\n",
478 |     "#                                             engine='newmm',\n",
479 |     "#                                             keep_whitespace=False)\n",
480 |     "# print('{} s'.format(time() - t))\n",
481 |     "# # toks['th']['newmm'] = tokenize_handler(filtered_th[:10000], lang='th', trie=pt)\n",
482 |     "\n",
483 |     "# # toks['en']['newmm'] = tokenize_handler(filtered_en[:10000], lang='en', trie=pt)"
484 |    ]
485 |   },
486 |   {
487 |    "cell_type": "code",
488 |    "execution_count": 106,
489 |    "metadata": {},
490 |    "outputs": [
491 |     {
492 |      "name": "stdout",
493 |      "output_type": "stream",
494 |      "text": [
495 |       "38.81156301498413 s\n",
496 |       "39.77449178695679 s\n"
497 |      ]
498 |     }
499 |    ],
500 |    "source": [
501 |     "toks['th']['newmm'] = tokenize_handler(filtered_th, lang='th')\n",
502 |     "toks['en']['newmm'] = tokenize_handler(filtered_en, lang='en')\n"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": 107,
508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "(['ทาส ใน กระจก วิเศษ , มาจาก พื้นที่ ที่ ไกล ที่สุด',\n", 514 | " 'ผ่าน ลม และ ความมืด ฉัน เรียก เจ้า',\n", 515 | " 'พูด !',\n", 516 | " 'ให้ ฉัน เห็น พระพักตร์ ของ พระองค์',\n", 517 | " 'สิ่ง ที่ เจ้า จะ รู้ ว่า สมเด็จ พระราชินี ของ ฉัน ได้ อย่างไร',\n", 518 | " 'กระจก วิเศษ บน ผนัง ผู้ ที่ เป็น สังขาร หนึ่ง ทั้งหมด หรือไม่',\n", 519 | " 'ที่ มีชื่อเสียง เป็น ความงาม ของ เจ้า พระ บาท สมเด็จ พระเจ้าอยู่หัว',\n", 520 | " 'แต่ ถือเป็น แม่บ้าน ที่ น่ารัก ที่ ฉัน เห็น',\n", 521 | " 'ยาจก ไม่ สามารถ ซ่อน พระคุณ อ่อนโยน ของ เธอ',\n", 522 | " 'อนิจจา เธอ มี ความเป็นธรรม มากขึ้น กว่า เจ้า'],\n", 523 | " ['Slave in the Magic Mirror , come from the farthest space .',\n", 524 | " 'Through wind and darkness , I summon thee .',\n", 525 | " 'Speak !',\n", 526 | " 'Let me see thy face .',\n", 527 | " 'What wouldst thou know , my Queen ?',\n", 528 | " 'Magic Mirror on the wall , who is the fairest one of all ?',\n", 529 | " 'Famed is thy beauty , Majesty .',\n", 530 | " 'But hold , a lovely maid I see .',\n", 531 | " 'Rags cannot hide her gentle grace .',\n", 532 | " 'Alas , she is more fair than thee .'])" 533 | ] 534 | }, 535 | "execution_count": 107, 536 | "metadata": {}, 537 | "output_type": "execute_result" 538 | } 539 | ], 540 | "source": [ 541 | "toks['th']['newmm'][0:10], toks['en']['newmm'][0:10]" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "## 1b Segment texts into BPE tokens with SentencePiece (BPEmb)\n" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 108, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "def encode_bpe(sentences, lang, n_vocab=25000):\n", 558 | " \"\"\"Return a list of bpe tokens give a list of sentences\"\"\"\n", 559 | " segmented_sentences = []\n", 560 | " for sentence in tqdm_notebook(sentences, total=len(sentences)):\n", 561 | "# print(sentence)\n", 562 | " bpe_tokens = bpemb_pretrained[lang]['{}'.format(n_vocab)].encode(sentence)\n", 563 | " segmented_sentences.append(' '.join(bpe_tokens))\n", 564 | " \n", 565 | " return segmented_sentences" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "### 1.1 Thai language" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 109, 578 | "metadata": {}, 579 | "outputs": [ 580 | { 581 | "data": { 582 | "application/vnd.jupyter.widget-view+json": { 583 | "model_id": "1ce4eb5e8cbe469886307235a01f1dcc", 584 | "version_major": 2, 585 | "version_minor": 0 586 | }, 587 | "text/plain": [ 588 | "HBox(children=(IntProgress(value=0, max=3202751), HTML(value='')))" 589 | ] 590 | }, 591 | "metadata": {}, 592 | "output_type": "display_data" 593 | }, 594 | { 595 | "name": "stdout", 596 | "output_type": "stream", 597 | "text": [ 598 | "\n", 599 | "['▁ท าส ใน กระจก วิเศษ , ▁มาจาก พื้นที่ ที่ ไกล ที่สุด', '▁ผ่าน ลม และความ มืด ฉัน เรียก เจ้า', '▁พูด !', '▁ให้ ฉัน เห็น พระพักตร์ ของ ▁พระองค์', '▁สิ่งที่ เจ้า จะ รู้ว่า สมเด็จพระราชินี ▁ของ ฉัน ได้อย่างไร', '▁กระจ ก วิเศษ บน ผนัง ▁ผู้ ที่เป็น สัง ขาร หนึ่ง ทั้งหมด ▁หรือไม่', '▁ที่มีชื่อเสียง เป็น ความงาม ของ ▁เจ้า พระบาทสมเด็จพระ เจ้าอยู่หัว', '▁แต่ ถือเป็น แม่ บ้าน ที่น ่ารัก ที่ ฉัน ▁เห็น', '▁ยา จก ไม่สามารถ ซ่อน พระคุณ ▁อ่อน โยน ของเธอ', '▁อน ิจ จา เธอ มีความเป็น ธรรม ▁มาก ขึ้น กว่า เจ้า']\n" 600 | ] 601 | } 602 | ], 603 | "source": [ 604 | "toks['th']['sentencepiece'] = encode_bpe(filtered_th, 'th', 25000)\n", 605 | "\n", 606 | 
"print(toks['th']['sentencepiece'][0:10])" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "### 1.2 English language" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 110, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "application/vnd.jupyter.widget-view+json": { 624 | "model_id": "4a0ac2a9e16c4fe0814a82c713b07a92", 625 | "version_major": 2, 626 | "version_minor": 0 627 | }, 628 | "text/plain": [ 629 | "HBox(children=(IntProgress(value=0, max=3202751), HTML(value='')))" 630 | ] 631 | }, 632 | "metadata": {}, 633 | "output_type": "display_data" 634 | }, 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | "\n", 640 | "['▁slave ▁in ▁the ▁magic ▁mirror , ▁come ▁from ▁the ▁fart hest ▁space .', '▁through ▁wind ▁and ▁darkness , ▁i ▁summon ▁the e .', '▁speak !', '▁let ▁me ▁see ▁thy ▁face .', '▁what ▁would st ▁thou ▁know , ▁my ▁queen ?', '▁magic ▁mirror ▁on ▁the ▁wall , ▁who ▁is ▁the ▁fa ire st ▁one ▁of ▁all ?', '▁famed ▁is ▁thy ▁beauty , ▁majesty .', '▁but ▁hold , ▁a ▁lov ely ▁maid ▁i ▁see .', '▁ra gs ▁cannot ▁hide ▁her ▁gentle ▁grace .', '▁al as , ▁she ▁is ▁more ▁fair ▁than ▁the e .']\n" 641 | ] 642 | } 643 | ], 644 | "source": [ 645 | "toks['en']['sentencepiece'] = encode_bpe(filtered_en, 'en', 25000)\n", 646 | "print(toks['en']['sentencepiece'][0:10])" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "## 2. Split train-valid-test " 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 111, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "N = 3202751\n" 671 | ] 672 | }, 673 | { 674 | "data": { 675 | "text/plain": [ 676 | "(2562200, 320275, 320276)" 677 | ] 678 | }, 679 | "execution_count": 111, 680 | "metadata": {}, 681 | "output_type": "execute_result" 682 | } 683 | ], 684 | "source": [ 685 | "#train-valid-test split 80/10/10\n", 686 | "\n", 687 | "n = len(toks['th']['newmm'])\n", 688 | "\n", 689 | "print('N = ',n)\n", 690 | "idx = list(range(n))\n", 691 | "\n", 692 | "random.seed(1234) # Set SEED\n", 693 | "random.shuffle(idx)\n", 694 | "\n", 695 | "train_idx, valid_idx, test_idx = idx[:int(n*0.8)], idx[int(n*0.8):int(n*0.9)], idx[int(n*0.9):]\n", 696 | "\n", 697 | "dataset_split = {}\n", 698 | "dataset_split['train'] = train_idx\n", 699 | "dataset_split['valid'] = valid_idx\n", 700 | "dataset_split['test'] = test_idx\n", 701 | "\n", 702 | "\n", 703 | "len(train_idx),len(valid_idx),len(test_idx)\n", 704 | "\n" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 112, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [ 713 | "dataset = {\n", 714 | " 'train': {\n", 715 | " 'en': {\n", 716 | " 'sentencepiece': [],\n", 717 | " 'newmm':[]\n", 718 | " },\n", 719 | " 'th': {\n", 720 | " 'sentencepiece': [],\n", 721 | " 'newmm':[]\n", 722 | " }\n", 723 | " },\n", 724 | " 'valid': {\n", 725 | " 'en': {\n", 726 | " 'sentencepiece': [],\n", 727 | " 'newmm':[]\n", 728 | " },\n", 729 | " 'th': {\n", 730 | " 'sentencepiece': [],\n", 731 | " 'newmm':[]\n", 732 | " }\n", 733 | " },\n", 734 | " 'test': {\n", 735 | " 'en': {\n", 736 | " 'sentencepiece': [],\n", 737 | " 'newmm':[]\n", 738 | " },\n", 739 | " 'th': {\n", 740 | " 'sentencepiece': [],\n", 741 | " 'newmm':[]\n", 742 | " }\n", 743 | " }\n", 
744 | "}\n", 745 | "\n", 746 | "for split_name in ['train', 'valid', 'test']:\n", 747 | " for lang in ['th', 'en']:\n", 748 | " for tok_type in ['sentencepiece', 'newmm']:\n", 749 | "\n", 750 | " dataset[split_name][lang][tok_type] = [toks[lang][tok_type][i] for i in dataset_split[split_name]] \n" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 113, 756 | "metadata": {}, 757 | "outputs": [ 758 | { 759 | "name": "stdout", 760 | "output_type": "stream", 761 | "text": [ 762 | "['เบค กี้ เธอ ทำท่า แปลก ๆ เมื่อกี้ ใน ห้อง', 'อยู่ กับ เธอ แอน นา จะ นำทาง คุณ ผม จะ กลับ ไป'] \n", 763 | "\n", 764 | "['Becky , um , you were acting particularly strange in there just now .', \"Stay with her so Anna can guide you . I ' m going back .\"] \n", 765 | "\n", 766 | "['▁เบ ค กี้ ▁เธอ ทํา ท่า แปลก ๆ ▁เมื่อ กี้ ▁ในห้อง', '▁ อยู่กับ เธอ ▁แอนนา จะนํา ทาง คุณ ▁ผม จะ กลับไป'] \n", 767 | "\n", 768 | "['▁bec ky , ▁um , ▁you ▁were ▁acting ▁particularly ▁strange ▁in ▁there ▁just ▁now .', \"▁stay ▁with ▁her ▁so ▁anna ▁can ▁guide ▁you . ▁i ' m ▁going ▁back .\"] \n", 769 | "\n" 770 | ] 771 | } 772 | ], 773 | "source": [ 774 | "print(dataset['train']['th']['newmm'][0:2],'\\n')\n", 775 | "print(dataset['train']['en']['newmm'][0:2],'\\n')\n", 776 | "print(dataset['train']['th']['sentencepiece'][0:2],'\\n')\n", 777 | "print(dataset['train']['en']['sentencepiece'][0:2],'\\n')" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": 119, 783 | "metadata": {}, 784 | "outputs": [ 785 | { 786 | "name": "stdout", 787 | "output_type": "stream", 788 | "text": [ 789 | "Counter({'en_train_n_toks': 92383739, 'th_train_n_toks': 86683223, 'en_valid_n_toks': 11536351, 'en_test_n_toks': 11535798, 'th_test_n_toks': 10833242, 'th_valid_n_toks': 10826042})\n" 790 | ] 791 | } 792 | ], 793 | "source": [ 794 | "# Counting number of tokens for train, valid, test\n", 795 | "counter = Counter( )\n", 796 | "for dataset_type in ['train', 'valid', 'test']:\n", 797 | " for th_sent_toks in dataset[dataset_type]['th']['newmm']:\n", 798 | " counter['th_{}_n_toks'.format(dataset_type)] += len(th_sent_toks)\n", 799 | " for en_sent_toks in dataset[dataset_type]['en']['newmm']:\n", 800 | " counter['en_{}_n_toks'.format(dataset_type)] += len(en_sent_toks)\n", 801 | "\n", 802 | "print(counter) " 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": 124, 808 | "metadata": {}, 809 | "outputs": [ 810 | { 811 | "name": "stdout", 812 | "output_type": "stream", 813 | "text": [ 814 | "create directories: \n", 815 | "dir: ../data/opensubtitles_tok/sentencepiece-sentencepiece/th-en\n", 816 | "dir: ../data/opensubtitles_bin/sentencepiece-sentencepiece/th-en\n", 817 | "create directories: \n", 818 | "dir: ../data/opensubtitles_tok/sentencepiece-sentencepiece/en-th\n", 819 | "dir: ../data/opensubtitles_bin/sentencepiece-sentencepiece/en-th\n", 820 | "create directories: \n", 821 | "dir: ../data/opensubtitles_tok/sentencepiece-newmm/th-en\n", 822 | "dir: ../data/opensubtitles_bin/sentencepiece-newmm/th-en\n", 823 | "create directories: \n", 824 | "dir: ../data/opensubtitles_tok/sentencepiece-newmm/en-th\n", 825 | "dir: ../data/opensubtitles_bin/sentencepiece-newmm/en-th\n", 826 | "create directories: \n", 827 | "dir: ../data/opensubtitles_tok/newmm-sentencepiece/th-en\n", 828 | "dir: ../data/opensubtitles_bin/newmm-sentencepiece/th-en\n", 829 | "create directories: \n", 830 | "dir: ../data/opensubtitles_tok/newmm-sentencepiece/en-th\n", 831 | "dir: 
../data/opensubtitles_bin/newmm-sentencepiece/en-th\n", 832 | "create directories: \n", 833 | "dir: ../data/opensubtitles_tok/newmm-newmm/th-en\n", 834 | "dir: ../data/opensubtitles_bin/newmm-newmm/th-en\n", 835 | "create directories: \n", 836 | "dir: ../data/opensubtitles_tok/newmm-newmm/en-th\n", 837 | "dir: ../data/opensubtitles_bin/newmm-newmm/en-th\n" 838 | ] 839 | } 840 | ], 841 | "source": [ 842 | "\n", 843 | "for tok_type_src in ['sentencepiece', 'newmm']:\n", 844 | " for tok_type_tgt in ['sentencepiece', 'newmm']:\n", 845 | " langs = ['th', 'en']\n", 846 | " for lang in langs:\n", 847 | " src_lang = lang\n", 848 | " tgt_lang = 'en' if lang =='th' else 'th'\n", 849 | " FOLDER_NAME = \"opensubtitles_tok/{}-{}/{}-{}\".format(tok_type_src, tok_type_tgt, src_lang, tgt_lang )\n", 850 | " FOLDER_NAME_BIN = \"opensubtitles_bin/{}-{}/{}-{}\".format(tok_type_src, tok_type_tgt, src_lang, tgt_lang)\n", 851 | " \n", 852 | " \n", 853 | " # Create directories\n", 854 | " print('create directories: ')\n", 855 | " print('dir: ../data/{}'.format(FOLDER_NAME))\n", 856 | " print('dir: ../data/{}'.format(FOLDER_NAME_BIN))\n", 857 | "\n", 858 | " !mkdir -p ../data/{FOLDER_NAME}\n", 859 | " !mkdir -p ../data/{FOLDER_NAME_BIN}\n", 860 | "\n", 861 | " for split_name in ['train', 'valid', 'test']:\n", 862 | " \n", 863 | " write_spaced_tokens_to_file(dataset[split_name][src_lang][tok_type_src],\n", 864 | " FOLDER_NAME, '{}.{}'.format(split_name, src_lang))\n", 865 | " \n", 866 | " write_spaced_tokens_to_file(dataset[split_name][tgt_lang][tok_type_tgt],\n", 867 | " FOLDER_NAME, '{}.{}'.format(split_name, tgt_lang))\n" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 125, 873 | "metadata": {}, 874 | "outputs": [ 875 | { 876 | "name": "stdout", 877 | "output_type": "stream", 878 | "text": [ 879 | "▁bec ky , ▁um , ▁you ▁were ▁acting ▁particularly ▁strange ▁in ▁there ▁just ▁now .\r\n", 880 | "▁stay ▁with ▁her ▁so ▁anna ▁can ▁guide ▁you . ▁i ' m ▁going ▁back .\r\n", 881 | "▁look .\r\n", 882 | "▁oh , ▁no , ▁it ' s ▁the ▁other ▁way ▁around , ▁dr . ▁lewis .\r\n", 883 | "▁sort ▁of .\r\n", 884 | "▁bart ender , ▁something ▁really ▁strong , ▁please .\r\n", 885 | "▁yes , ▁obviously .\r\n", 886 | "▁la ' s ▁so ▁nice .\r\n", 887 | "▁i ' m ▁going ▁to ▁fix ▁it .\r\n", 888 | "▁i ▁get ▁b ored .\r\n" 889 | ] 890 | } 891 | ], 892 | "source": [ 893 | "!head ../data/opensubtitles_tok/newmm-sentencepiece/th-en/train.en\n" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": 126, 899 | "metadata": {}, 900 | "outputs": [ 901 | { 902 | "name": "stdout", 903 | "output_type": "stream", 904 | "text": [ 905 | "เบค กี้ เธอ ทำท่า แปลก ๆ เมื่อกี้ ใน ห้อง\r\n", 906 | "อยู่ กับ เธอ แอน นา จะ นำทาง คุณ ผม จะ กลับ ไป\r\n", 907 | "ฟัง นะ\r\n", 908 | "พอดี เลย ดร. 
ลี วิ ส\r\n", 909 | "แบบ ว่า\r\n", 910 | "เอ่อ บาร์ เท็น เด อร ์ ขอ อะไร ที่\r\n", 911 | "ก็ ใช่ ห น่ะ สิ\r\n", 912 | "แอลเอ สวย เนอะ\r\n", 913 | "ฉัน กำลังจะ แก้ ไขมัน\r\n", 914 | "ฉัน เบื่อ ละ\r\n" 915 | ] 916 | } 917 | ], 918 | "source": [ 919 | "!head ../data/opensubtitles_tok/newmm-sentencepiece/th-en/train.th" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": null, 925 | "metadata": {}, 926 | "outputs": [], 927 | "source": [] 928 | } 929 | ], 930 | "metadata": { 931 | "kernelspec": { 932 | "display_name": "Python 3", 933 | "language": "python", 934 | "name": "python3" 935 | }, 936 | "language_info": { 937 | "codemirror_mode": { 938 | "name": "ipython", 939 | "version": 3 940 | }, 941 | "file_extension": ".py", 942 | "mimetype": "text/x-python", 943 | "name": "python", 944 | "nbconvert_exporter": "python", 945 | "pygments_lexer": "ipython3", 946 | "version": "3.6.8" 947 | } 948 | }, 949 | "nbformat": 4, 950 | "nbformat_minor": 2 951 | } 952 | --------------------------------------------------------------------------------
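The SentencePiece-style output above marks word boundaries with '▁', and the files written by `write_spaced_tokens_to_file` hold one space-separated token sequence per line. Before computing BLEU against raw references, or showing translations to users, those subword tokens have to be joined back into plain text (this is also what `--remove-bpe` does on the fairseq side). A minimal sketch of that reversal; the helper name is ours:

```python
# Reverse BPE/SentencePiece segmentation: drop the spaces between subwords,
# then turn the '▁' word-boundary markers back into spaces.
def decode_sentencepiece(line: str) -> str:
    return line.replace(' ', '').replace('▁', ' ').strip()

print(decode_sentencepiece('▁let ▁me ▁see ▁thy ▁face .'))
# -> 'let me see thy face.'
```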