├── pythainlp
├── script_fairseq_eval_for_n_epochs.sh
├── run_fairseq.sh
├── tokenize.py
├── .gitignore
├── README.md
├── sandbox.ipynb
└── notebooks
    └── preprocess_opensubtitle_with_bpe.ipynb

/pythainlp:
--------------------------------------------------------------------------------
1 | /root/pythainlp/pythainlp
--------------------------------------------------------------------------------
/script_fairseq_eval_for_n_epochs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | start=$1
3 | end=$2
4 | step=$3
5 | data_path=$4
6 | checkpoint_prefix=$5
7 | beam=$6
8 | max_tokens=$7
9 | opts=$8
10 | 
11 | for (( i=$start; i<=$end; i+=$step ))
12 | do
13 |     echo "Evaluating BLEU at epoch $i of the model checkpoint: $checkpoint_prefix/checkpoint$i.pt";
14 |     echo "beam_size=$beam, max_tokens=$max_tokens";
15 | 
16 |     fairseq-generate $data_path \
17 |         --path ${checkpoint_prefix}/checkpoint${i}.pt \
18 |         --quiet \
19 |         --beam $beam \
20 |         --max-tokens $max_tokens $opts > ${checkpoint_prefix}/result_checkpoint_${i}.txt
21 | 
22 |     echo "Done evaluating epoch $i";
23 | 
24 | 
25 | done
--------------------------------------------------------------------------------
/run_fairseq.sh:
--------------------------------------------------------------------------------
1 | fairseq-preprocess --source-lang en --target-lang th \
2 |     --trainpref data/opensubtitles_tok/train \
3 |     --validpref data/opensubtitles_tok/valid \
4 |     --testpref data/opensubtitles_tok/test \
5 |     --destdir data/opensubtitles_bin
6 | 
7 | fairseq-train \
8 |     data/opensubtitles_bin \
9 |     --arch transformer_iwslt_de_en --max-epoch 10 \
10 |     --share-decoder-input-output-embed \
11 |     --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \
12 |     --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \
13 |     --dropout 0.3 --weight-decay 0.0001 \
14 |     --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \
15 |     --max-tokens 2048 \
16 |     --bpe sentencepiece \
17 |     --memory-efficient-fp16
18 | 
19 | fairseq-generate data/opensubtitles_bin \
20 |     --path checkpoints/checkpoint_best.pt \
21 |     --remove-bpe --beam 5 --max-tokens 2048
22 | 
23 | # | Translated 328154 sentences (2773243 tokens) in 1517.1s (216.31 sentences/s, 1828.01 tokens/s)
24 | # | Generate test with beam=5: BLEU4 = 10.80, 36.6/15.3/7.2/3.4 (BP=1.000, ratio=1.029, syslen=2445089, reflen=2376306)
--------------------------------------------------------------------------------
/tokenize.py:
--------------------------------------------------------------------------------
1 | # usage: python tokenize.py data/opensubtitles/OpenSubtitles.en-th.en data/opensubtitles/OpenSubtitles.en-th.th data/opensubtitles_tok/
2 | 
3 | from pythainlp.tokenize import word_tokenize
4 | # from pythainlp.ulmfit import *
5 | from tqdm import tqdm
6 | import random
7 | import sys
8 | 
9 | # open files; sys.argv[0] is the script name, so the input files start at argv[1]
10 | with open(sys.argv[1], 'r') as f:
11 |     en = f.readlines()
12 | print('English raw:', len(en), en[:3])
13 | 
14 | with open(sys.argv[2], 'r') as f:
15 |     th = f.readlines()
16 | print('Thai raw:', len(th), th[:3])
17 | 
18 | # tokenize (plain tqdm here; tqdm_notebook is only for notebooks)
19 | en_tok = []
20 | for e in tqdm(en):
21 |     en_tok.append(' '.join(word_tokenize(e, keep_whitespace=False)))
22 | 
23 | th_tok = []
24 | for t in tqdm(th):
25 |     th_tok.append(' '.join(word_tokenize(t)))
26 |     # th_tok.append(' '.join(process_thai(t)))
27 | 
28 | # train-valid-test split 80/10/10
29 | n = len(th_tok)
30 | idx = list(range(n))
31 | random.shuffle(idx)
32 | train_idx, valid_idx, test_idx = idx[:int(n*0.8)], idx[int(n*0.8):int(n*0.9)], idx[int(n*0.9):]
33 | print('train/valid/test:', len(train_idx), len(valid_idx), len(test_idx))
34 | 
35 | # build the splits for both languages
36 | en_train = [en_tok[i] for i in train_idx]
37 | th_train = [th_tok[i] for i in train_idx]
38 | print('English tokenized train', len(en_train), en_train[:10])
39 | print('Thai tokenized train', len(th_train), th_train[:10])
40 | en_valid = [en_tok[i] for i in valid_idx]
41 | th_valid = [th_tok[i] for i in valid_idx]
42 | en_test = [en_tok[i] for i in test_idx]
43 | th_test = [th_tok[i] for i in test_idx]
44 | 
45 | # save the tokenized splits; the output directory is argv[3]
46 | with open(f'{sys.argv[3]}/train.en', 'w') as f:
47 |     for e in en_train:
48 |         f.write(e)
49 | with open(f'{sys.argv[3]}/train.th', 'w') as f:
50 |     for t in th_train:
51 |         f.write(t)
52 | with open(f'{sys.argv[3]}/valid.en', 'w') as f:
53 |     for e in en_valid:
54 |         f.write(e)
55 | with open(f'{sys.argv[3]}/valid.th', 'w') as f:
56 |     for t in th_valid:
57 |         f.write(t)
58 | with open(f'{sys.argv[3]}/test.en', 'w') as f:
59 |     for e in en_test:
60 |         f.write(e)
61 | with open(f'{sys.argv[3]}/test.th', 'w') as f:
62 |     for t in th_test:
63 |         f.write(t)
64 | print(f'saved to {sys.argv[3]}')
--------------------------------------------------------------------------------
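Before running `fairseq-preprocess` (see `run_fairseq.sh` above) on the files written by `tokenize.py`, it is worth confirming that each split is still line-aligned across the two languages, since fairseq pairs source and target sentences by line number. A minimal sanity-check sketch, assuming the `data/opensubtitles_tok` output directory used throughout this repo:

```python
# Check that each split has the same number of lines in both languages;
# fairseq-preprocess pairs source/target sentences by line number.
from pathlib import Path

tok_dir = Path('data/opensubtitles_tok')  # output dir passed to tokenize.py
for split in ('train', 'valid', 'test'):
    with open(tok_dir / f'{split}.en', encoding='utf-8') as f:
        n_en = sum(1 for _ in f)
    with open(tok_dir / f'{split}.th', encoding='utf-8') as f:
        n_th = sum(1 for _ in f)
    assert n_en == n_th, f'{split}: {n_en} en lines != {n_th} th lines'
    print(split, n_en, 'sentence pairs')
```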
/.gitignore:
--------------------------------------------------------------------------------
1 | #data
2 | data/
3 | 
4 | #subword-nmt
5 | subword-nmt/
6 | 
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 | 
12 | # C extensions
13 | *.so
14 | 
15 | # Distribution / packaging
16 | .Python
17 | build/
18 | develop-eggs/
19 | dist/
20 | downloads/
21 | eggs/
22 | .eggs/
23 | lib/
24 | lib64/
25 | parts/
26 | sdist/
27 | var/
28 | wheels/
29 | pip-wheel-metadata/
30 | share/python-wheels/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 | MANIFEST
35 | 
36 | # PyInstaller
37 | #  Usually these files are written by a python script from a template
38 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 | 
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 | 
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | .hypothesis/
57 | .pytest_cache/
58 | 
59 | # Translations
60 | *.mo
61 | *.pot
62 | 
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 | 
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 | 
73 | # Scrapy stuff:
74 | .scrapy
75 | 
76 | # Sphinx documentation
77 | docs/_build/
78 | 
79 | # PyBuilder
80 | target/
81 | 
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 | 
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 | 
89 | # pyenv
90 | .python-version
91 | 
92 | # pipenv
93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | #   install all needed dependencies.
97 | #Pipfile.lock
98 | 
99 | # celery beat schedule file
100 | celerybeat-schedule
101 | 
102 | # SageMath parsed files
103 | *.sage.py
104 | 
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 | 
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 | 
118 | # Rope project settings
119 | .ropeproject
120 | 
121 | # mkdocs documentation
122 | /site
123 | 
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 | 
129 | # Pyre type checker
130 | .pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mt-opus
2 | English-Thai Machine Translation with OPUS data
3 | 
4 | ## Data
5 | We used 9 datasets from [OPUS](http://opus.nlpl.eu/index.php) to train and validate our models within and across domains (5.4M sentence pairs in total; 68.8M English tokens and 53.1M Thai tokens).
6 | 
7 | | dataset | sentence pairs | EN tokens | TH tokens | description | reference |
8 | |---------|----------------|-----------|-----------|-------------|-----------|
9 | | [OpenSubtitles v2018](http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/en-th.txt.zip) | 3.5M | 28.4M | 7.8M | crowdsourced subtitles | [1] |
10 | | [JW300 v1](http://opus.nlpl.eu/JW300-v1.php) [en](https://object.pouta.csc.fi/OPUS-JW300/v1/raw/en.zip) [th](https://object.pouta.csc.fi/OPUS-JW300/v1/raw/th.zip) | 0.8M | 14.9M | 34.6M | Jehovah's Witnesses website | [2], [3] |
11 | | [GNOME v1](https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-th.txt.zip) | 0.5M | 2.3M | 3.5M | GNOME documentation | [2] |
12 | | [QED v2.0a](https://object.pouta.csc.fi/OPUS-QED/v2.0a/moses/en-th.txt.zip) | 0.3M | 4.7M | 1.2M | crowdsourced educational subtitles | [2] |
13 | | [bible-uedin v1](https://object.pouta.csc.fi/OPUS-bible-uedin/v1/moses/en-th.txt.zip) | 0.1M | 3.6M | 2.1M | the Bible | [2], [4] |
14 | | [Tanzil v1](https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-th.txt.zip) | 93.5k | 2.8M | 3.4M | the Quran | [2] |
15 | | [KDE4 v2](https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-th.txt.zip) | 92.0k | 0.5M | 0.2M | KDE4 documentation | [2] |
16 | | [Ubuntu v14.10](https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-th.txt.zip) | 46.6k | 0.4M | 0.2M | Ubuntu documentation | [2] |
17 | | [Tatoeba v20190709](https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/moses/en-th.txt.zip) | 1.1k | 6k | 1.7k | crowdsourced translations | [2] |
18 | 
19 | ## Models
20 | 
21 | ## Results
22 | 
23 | # References
24 | * [1] P. Lison and J. Tiedemann, 2016, OpenSubtitles2016: Extracting Large Parallel Corpora from Movie and TV Subtitles. In Proceedings of the 10th International Conference on Language Resources and Evaluation (LREC 2016)
25 | * [2] J. Tiedemann, 2012, Parallel Data, Tools and Interfaces in OPUS. In Proceedings of the 8th International Conference on Language Resources and Evaluation (LREC 2012)
26 | * [3] Ž. Agić and I. Vulić, 2019, JW300: A Wide-Coverage Parallel Corpus for Low-Resource Languages. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics (ACL 2019)
27 | * [4] C. Christodoulopoulos and M. Steedman, 2015, A Massively Parallel Corpus: the Bible in 100 Languages. In Language Resources and Evaluation, 49
--------------------------------------------------------------------------------
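The moses-format zip files linked in the table above can be fetched and unpacked programmatically. A minimal sketch, assuming the OpenSubtitles URL from the table serves the archive directly and that the local `data/opensubtitles` directory used by the notebooks is the target:

```python
# Download and unpack one OPUS moses-format zip (URL taken from the table above).
import io
import urllib.request
import zipfile
from pathlib import Path

url = 'http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/en-th.txt.zip'
out_dir = Path('data/opensubtitles')
out_dir.mkdir(parents=True, exist_ok=True)

with urllib.request.urlopen(url) as resp:
    archive = zipfile.ZipFile(io.BytesIO(resp.read()))
archive.extractall(out_dir)
# Expect OpenSubtitles.en-th.en / OpenSubtitles.en-th.th plus metadata files.
print(archive.namelist())
```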
141 | }, 142 | "execution_count": 9, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "#train-valid-test split 80/10/10\n", 149 | "n = len(th_tok)\n", 150 | "idx = list(range(n))\n", 151 | "random.shuffle(idx)\n", 152 | "train_idx, valid_idx, test_idx = idx[:int(n*0.8)], idx[int(n*0.8):int(n*0.9)], idx[int(n*0.9):]\n", 153 | "len(train_idx),len(valid_idx),len(test_idx)" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 11, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "(2625226,\n", 165 | " ['ไปให้พ้น ไอ้ ลูก หมา \\n',\n", 166 | " 'โชค ดีแล้ว นะ \\n',\n", 167 | " 'ไม่เอา น่า ทำ อะไร หน่อย สิ ! \\n',\n", 168 | " 'หันไป \\n',\n", 169 | " 'หงุดหงิด อย่างแรง \\n',\n", 170 | " 'ฉัน ตรวจดู หมาย เลขที่ โทร มา \\n',\n", 171 | " 'ไม่ สามารถ มี สี่ คน ที่ อาศัย อยู่ ที่นี่ \\n',\n", 172 | " 'เขา เป็น กษัตริย์ ของ เรา \\n',\n", 173 | " 'ไม่ได้ บอก ที่ บ้าน ใช่ไหม ว่า ถึง แล้ว \\n',\n", 174 | " 'เรา ต้อง มีทาง อื่น ที่จะ เข้าไป ได้ สิ น่า \\n'])" 175 | ] 176 | }, 177 | "execution_count": 11, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "th_train = [th_tok[i] for i in train_idx]\n", 184 | "len(th_train), th_train[:10]" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 12, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "(2625226,\n", 196 | " ['Go away , son of a bitch \\n',\n", 197 | " 'Well , good for them . \\n',\n", 198 | " 'Come on , do something ! \\n',\n", 199 | " 'Turn around . \\n',\n", 200 | " 'Talk about a killer cappuccino . What is this thing ? \\n',\n", 201 | " \"I ' ve checked the call record . \\n\",\n", 202 | " \"There ' s like a hundred houses . \\n\",\n", 203 | " \"He ' s our king . \\n\",\n", 204 | " \"You don ' t have to call home about arriving ? \\n\",\n", 205 | " \"There ' s gotta be some other way we can get in . \\n\"])" 206 | ] 207 | }, 208 | "execution_count": 12, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "en_train = [en_tok[i] for i in train_idx]\n", 215 | "len(en_train), en_train[:10]" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 13, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "with open('data/opensubtitles_tok/train.en','w') as f:\n", 225 | " for e in en_train:\n", 226 | " f.write(e)\n", 227 | "with open('data/opensubtitles_tok/train.th','w') as f:\n", 228 | " for t in th_train:\n", 229 | " f.write(t)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 14, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "(328153,\n", 241 | " ['ผม ต้อง แถลงการณ์ \\n',\n", 242 | " 'แล้ วจะ ให้ รัก ผม มั้ย ล่ะ \\n',\n", 243 | " 'ก็ นะ มัน เข้ากับ คุณ ดี \\n',\n", 244 | " 'เดี๋ยวก่อน นะ นั่น มัน บ้าน ของ ครอบครัว ฉัน \\n',\n", 245 | " 'หลังจาก 2 - 3 ชม. พวก มัน จะ เริ่ม เหนียว ข้น และ แห้ง \\n',\n", 246 | " 'อย่า เพิ่ง . 
\\n',\n", 247 | " 'เยี่ยม \\n',\n", 248 | " 'สัปดาห์ แรก ของ กันยายน มะเขือเทศ ใน เบ เก อร ์ฟิลด์ \\n',\n", 249 | " 'บำบัด โรค โดย การ สะกดจิต \\n',\n", 250 | " 'ผม เริ่ม จาก กระเป๋า เศษ เหรียญ \\\\ แล้ว มัน ก็ เพิ่มขึ้น เรื่อยๆ นับ จากนั้น \\n'])" 251 | ] 252 | }, 253 | "execution_count": 14, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "th_valid = [th_tok[i] for i in valid_idx]\n", 260 | "len(th_valid), th_valid[:10]" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 15, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "(328153,\n", 272 | " ['I need to make a statement . \\n',\n", 273 | " \"Gonna give me some lovin '? \\n\",\n", 274 | " \"Well , it ' s working for you . \\n\",\n", 275 | " 'Wait a minute . \\n',\n", 276 | " 'After a few hours , they begin to get cloudy and wilt . \\n',\n", 277 | " 'Notyet . \\n',\n", 278 | " '- Excellent . \\n',\n", 279 | " 'First week of september , tomatoes in bakersfield . \\n',\n", 280 | " 'A hypnotherapist . \\n',\n", 281 | " 'It started with coin purses and sort of went on from there , really . \\n'])" 282 | ] 283 | }, 284 | "execution_count": 15, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "en_valid = [en_tok[i] for i in valid_idx]\n", 291 | "len(en_valid), en_valid[:10]" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 16, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "with open('data/opensubtitles_tok/valid.en','w') as f:\n", 301 | " for e in en_valid:\n", 302 | " f.write(e)\n", 303 | "with open('data/opensubtitles_tok/valid.th','w') as f:\n", 304 | " for t in th_valid:\n", 305 | " f.write(t)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 17, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "(328154,\n", 317 | " ['พวก นั้น กำลัง มา แม่ \\n',\n", 318 | " 'เอา ห น่า แฮ รี่ นาย มี แล้ว นิ มังกร ไง ถ้า นาย คิด ว่า จะ หาคู่ เดท ได้ นะ ฉัน คิด ว่า ฉัน ไป กับ มังกร ดีกว่า \\n',\n", 319 | " 'ฉัน เสียใจ \\n',\n", 320 | " 'มัน เรื่อง อะไร ของ แก ? \\n',\n", 321 | " 'แต่ เรา จะ ลอง ทำ ดู \\n',\n", 322 | " 'ไป กัน เถอะ \\n',\n", 323 | " 'ชาร์ท ที่ 200 เม ก เม ก \\n',\n", 324 | " 'ตกลง มั้ย ? \\n',\n", 325 | " 'เรา มี ความสัมพันธ์ ค่อนข้างจะ ซับซ้อน น่ะ \\n',\n", 326 | " '100 ศพ ดึง ออก มาจาก ที่เกิดเหตุ \\n'])" 327 | ] 328 | }, 329 | "execution_count": 17, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "th_test = [th_tok[i] for i in test_idx]\n", 336 | "len(th_test), th_test[:10]" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 18, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "(328154,\n", 348 | " [\"They ' re coming , mother . No . \\n\",\n", 349 | " \"Come on , Harry , you have slain a dragon lf you want to get a date you can I think I ' ll take the dragon right now \\n\",\n", 350 | " \"I ' m sorry . \\n\",\n", 351 | " 'WHAT THE HELL IS YOUR PROBLEM ? \\n',\n", 352 | " \"But we ' ll give it a try . \\n\",\n", 353 | " \"Let ' s go . \\n\",\n", 354 | " 'Megan ? Megan ! \\n',\n", 355 | " 'Okay ? \\n',\n", 356 | " 'We have a ... very complicated relationship . \\n',\n", 357 | " '100 more bodies pulled from the arena . 
\\n'])" 358 | ] 359 | }, 360 | "execution_count": 18, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "en_test = [en_tok[i] for i in test_idx]\n", 367 | "len(en_test), en_test[:10]" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 19, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "with open('data/opensubtitles_tok/test.en','w') as f:\n", 377 | " for e in en_test:\n", 378 | " f.write(e)\n", 379 | " \n", 380 | "with open('data/opensubtitles_tok/test.th','w') as f:\n", 381 | " for t in th_test:\n", 382 | " f.write(t)" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 91, 388 | "metadata": {}, 389 | "outputs": [], 390 | "source": [ 391 | "# !fairseq-preprocess --source-lang en --target-lang th \\\n", 392 | "# --trainpref data/opensubtitles_tok/train \\\n", 393 | "# --validpref data/opensubtitles_tok/valid \\\n", 394 | "# --testpref data/opensubtitles_tok/test \\\n", 395 | "# --destdir data/opensubtitles_bin\n" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 90, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# | [en] data/opensubtitles_tok/train.en: 2625226 sents, 25945418 tokens, 0.0% replaced by \n", 405 | "# | [en] Dictionary: 173623 types\n", 406 | "# | [en] data/opensubtitles_tok/valid.en: 328153 sents, 3238427 tokens, 0.304% replaced by \n", 407 | "# | [en] Dictionary: 173623 types\n", 408 | "# | [en] data/opensubtitles_tok/test.en: 328154 sents, 3235608 tokens, 0.314% replaced by \n", 409 | "# | [th] Dictionary: 116495 types\n", 410 | "# | [th] data/opensubtitles_tok/train.th: 2625226 sents, 21658577 tokens, 0.0% replaced by \n", 411 | "# | [th] Dictionary: 116495 types\n", 412 | "# | [th] data/opensubtitles_tok/valid.th: 328153 sents, 2705475 tokens, 0.262% replaced by \n", 413 | "# | [th] Dictionary: 116495 types\n", 414 | "# | [th] data/opensubtitles_tok/test.th: 328154 sents, 2701605 tokens, 0.257% replaced by " 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 98, 420 | "metadata": {}, 421 | "outputs": [], 422 | "source": [ 423 | "!fairseq-train \\\n", 424 | " data/opensubtitles_bin \\\n", 425 | " --arch transformer_iwslt_de_en --share-decoder-input-output-embed \\\n", 426 | " --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \\\n", 427 | " --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \\\n", 428 | " --dropout 0.3 --weight-decay 0.0001 \\\n", 429 | " --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \\\n", 430 | " --max-tokens 2048 \\\n", 431 | " --bpe sentencepiece \\\n", 432 | " --memory-efficient-fp16\n", 433 | " --save-dir data/opensubtitles_model/transformers" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# fairseq-generate data/opensubtitles_bin \\\n", 443 | "# --path data/opensubtitles_model/transformers/checkpoint_best.pt \\\n", 444 | "# --beam 5 --remove-bpe" 445 | ] 446 | } 447 | ], 448 | "metadata": { 449 | "kernelspec": { 450 | "display_name": "Python 3", 451 | "language": "python", 452 | "name": "python3" 453 | }, 454 | "language_info": { 455 | "codemirror_mode": { 456 | "name": "ipython", 457 | "version": 3 458 | }, 459 | "file_extension": ".py", 460 | "mimetype": "text/x-python", 461 | "name": "python", 462 | "nbconvert_exporter": "python", 463 | "pygments_lexer": "ipython3", 464 | "version": "3.6.8" 465 | } 466 | }, 467 | 
"nbformat": 4, 468 | "nbformat_minor": 2 469 | } 470 | -------------------------------------------------------------------------------- /notebooks/preprocess_opensubtitle_with_bpe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 83, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "2.1\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# coding=utf-8\n", 18 | "import sys\n", 19 | "sys.path.append('..')\n", 20 | "\n", 21 | "import os\n", 22 | "import io\n", 23 | "import random\n", 24 | "import copy \n", 25 | "import re\n", 26 | "import html\n", 27 | "\n", 28 | "from time import time \n", 29 | "from multiprocessing import Pool\n", 30 | "from collections import Counter\n", 31 | "\n", 32 | "from functools import partial\n", 33 | "\n", 34 | "from tqdm import tqdm_notebook\n", 35 | "import pythainlp\n", 36 | "from pythainlp.util import *\n", 37 | "from pythainlp.tokenize import word_tokenize\n", 38 | "from pythainlp.ulmfit import *\n", 39 | "\n", 40 | "# subword-nmt\n", 41 | "from subword_nmt import learn_bpe as learner\n", 42 | "from subword_nmt import apply_bpe as subword_tokenizer\n", 43 | "\n", 44 | "import fairseq \n", 45 | "from datetime import timedelta\n", 46 | "from tqdm import tqdm, tqdm_notebook\n", 47 | "from pythainlp.tokenize import DEFAULT_DICT_TRIE\n", 48 | "\n", 49 | "from pythainlp.corpus import thai_words\n", 50 | "\n", 51 | "print(pythainlp.__version__)\n", 52 | "assert pythainlp.__version__ == '2.1'\n", 53 | "\n" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 84, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# เขียนใหม่ เอาแบบ initializer รับ wordlist มา\n", 63 | "class Trie:\n", 64 | " class Node(object):\n", 65 | " __slots__ = 'end', 'children'\n", 66 | " def __init__(self):\n", 67 | " self.end = False \n", 68 | " self.children = {}\n", 69 | " \n", 70 | " def __init__(self, words):\n", 71 | " self.words = words\n", 72 | " self.root = Trie.Node()\n", 73 | " for word in words:\n", 74 | " cur = self.root\n", 75 | " for ch in word: \n", 76 | " node = cur.children.get(ch)\n", 77 | " if not node: \n", 78 | " node = Trie.Node() \n", 79 | " cur.children[ch] = node \n", 80 | " cur = node\n", 81 | " cur.end = True \n", 82 | " def prefixes(self, text):\n", 83 | " res = []\n", 84 | " cur = self.root\n", 85 | " for i, ch in enumerate(text):\n", 86 | " node = cur.children.get(ch)\n", 87 | " if not node: break\n", 88 | " if node.end:\n", 89 | " res.append(text[:i+1])\n", 90 | " cur = node\n", 91 | " return res\n", 92 | " \n", 93 | " def __contains__(self, key):\n", 94 | " return key in self.words\n", 95 | " def __iter__(self):\n", 96 | " yield from self.words" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 85, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "323 ms ± 802 µs per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n"
109 |      ]
110 |     }
111 |    ],
112 |    "source": [
113 |     "%%timeit\n",
114 |     "pt = Trie(thai_words())\n",
115 |     "\n"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 86,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "data": {
125 |       "text/plain": [
126 |        "['กา', 'กาก', 'กากี']"
127 |       ]
128 |      },
129 |      "execution_count": 86,
130 |      "metadata": {},
131 |      "output_type": "execute_result"
132 |     }
133 |    ],
134 |    "source": [
135 |     "pt.prefixes('กากี่')"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 87,
141 |    "metadata": {},
142 |    "outputs": [
143 |     {
144 |      "name": "stdout",
145 |      "output_type": "stream",
146 |      "text": [
147 |       "\u001b[33mYou are using pip version 19.0.3, however version 19.2.3 is available.\r\n",
148 |       "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\r\n"
149 |      ]
150 |     }
151 |    ],
152 |    "source": [
153 |     "# install BPEmb (BPE embeddings)\n",
154 |     "\n",
155 |     "!pip install --q bpemb"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 88,
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": [
164 |     "from bpemb import BPEmb\n",
165 |     "\n",
166 |     "bpemb_pretrained = {\n",
167 |     "    'th': {\n",
168 |     "        '25000': BPEmb(lang=\"th\", vs=25000)\n",
169 |     "    },\n",
170 |     "    'en': {\n",
171 |     "        '25000': BPEmb(lang=\"en\", vs=25000)\n",
172 |     "    }\n",
173 |     "}\n"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 89,
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "data": {
183 |       "text/plain": [
184 |        "(3281534,\n",
185 |        " ['Slave in the Magic Mirror, come from the farthest space.',\n",
186 |        "  'Through wind and darkness, I summon thee.',\n",
187 |        "  'Speak!'])"
188 |       ]
189 |      },
190 |      "execution_count": 89,
191 |      "metadata": {},
192 |      "output_type": "execute_result"
193 |     }
194 |    ],
195 |    "source": [
196 |     "with open('../data/opensubtitles/OpenSubtitles.en-th.en','r', encoding='utf-8') as f:\n",
197 |     "    en = f.read().split('\\n')\n",
198 |     "len(en),en[:3]\n"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": 90,
204 |    "metadata": {},
205 |    "outputs": [
206 |     {
207 |      "data": {
208 |       "text/plain": [
209 |        "(3281534,\n",
210 |        " ['ทาสในกระจกวิเศษ, มาจากพื้นที่ที่ไกลที่สุด',\n",
211 |        "  'ผ่านลมและความมืดฉันเรียกเจ้า',\n",
212 |        "  'พูด!'])"
213 |       ]
214 |      },
215 |      "execution_count": 90,
216 |      "metadata": {},
217 |      "output_type": "execute_result"
218 |     }
219 |    ],
220 |    "source": [
221 |     "with open('../data/opensubtitles/OpenSubtitles.en-th.th','r', encoding='utf-8') as f:\n",
222 |     "    th = f.read().split('\\n')\n",
223 |     "    \n",
224 |     "len(th),th[:3]"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 91,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "# preprocess text\n",
234 |     "\n",
235 |     "LIST_OF_UNKNOWN_TOKENS = [b'\\x98\\xc2', b'\\xae\\xc2', b'\\x99\\xc2', b'\\xb1\\xc2', b'\\xc2\\xb7', b'\\xc3\\x83']\n",
236 |     "LIST_OF_TOKENS_TO_REPLACE = ['™', '„', '​', '”', '–', '“', '…"
237 |     "', '—', 'โ€', '​',\n",
238 |     "                             '♪', '{\\ cHFFFFFF }', '§', 'font color = \"# 808080 \"']\n",
239 |     "def unescape_string(text):\n",
240 |     "    return html.unescape(text)\n",
241 |     "\n",
242 |     "def sentences_filter(sentences, lang):\n",
243 |     "    indices = []\n",
244 |     "    for index, sentence in tqdm_notebook(enumerate(sentences), total=len(sentences)):\n",
245 |     "        for token in LIST_OF_UNKNOWN_TOKENS:\n",
246 |     "            if token in sentence.encode('utf-8'):\n",
indices.append(index)\n", 248 | " break\n", 249 | " if len(sentence) <= 1:\n", 250 | " indices.append(index)\n", 251 | " continue\n", 252 | " if lang == 'th' and countthai(sentence, ignore_chars='') == 0.0:\n", 253 | " indices.append(index)\n", 254 | " continue\n", 255 | " return indices\n", 256 | "\n", 257 | "def clean_sentence(sentence):\n", 258 | " for token in LISY_OF_TOKENS_TO_REPLACE:\n", 259 | " sentence = sentence.replace(token, '')\n", 260 | " sentence = unescape_string(sentence)\n", 261 | " \n", 262 | " return sentence" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 92, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "\n", 272 | "def tokenize_worker(sentence, lang, trie):\n", 273 | " \n", 274 | " _tokenizer_newmm = partial(pythainlp.tokenize.word_tokenize, engine='newmm',\n", 275 | " keep_whitespace=False,\n", 276 | " custom_dict=(trie if trie != None else DEFAULT_DICT_TRIE))\n", 277 | " return ' '.join(_tokenizer_newmm(sentence))\n", 278 | " \n", 279 | "def tokenize_handler(sentences, lang, trie=None):\n", 280 | " toks = []\n", 281 | " p = Pool(12)\n", 282 | " t = time()\n", 283 | " _tokenize_worker = partial(tokenize_worker, lang=lang, trie=trie)\n", 284 | " toks = p.map(_tokenize_worker, sentences)\n", 285 | " \n", 286 | " p.close()\n", 287 | " p.join() # call Pool.join() to wait for the worker processes to terminate.\n", 288 | "\n", 289 | " print('{} s'.format(time() -t))\n", 290 | "\n", 291 | " return toks\n", 292 | " " 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 100, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "def write_spaced_tokens_to_file(data, folder_name, filename):\n", 302 | " with open('/root/mt-opus/data/{}/{}'.format(folder_name, filename),'w') as f:\n", 303 | " for item in data:\n", 304 | " f.write(item + '\\n')\n", 305 | " \n", 306 | " " 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 101, 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "application/vnd.jupyter.widget-view+json": { 317 | "model_id": "60a71371634346f08d65cb6cacf94b02", 318 | "version_major": 2, 319 | "version_minor": 0 320 | }, 321 | "text/plain": [ 322 | "HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))" 323 | ] 324 | }, 325 | "metadata": {}, 326 | "output_type": "display_data" 327 | }, 328 | { 329 | "name": "stdout", 330 | "output_type": "stream", 331 | "text": [ 332 | "\n" 333 | ] 334 | }, 335 | { 336 | "data": { 337 | "application/vnd.jupyter.widget-view+json": { 338 | "model_id": "425025bfbc3e46e193943c9ff6f05568", 339 | "version_major": 2, 340 | "version_minor": 0 341 | }, 342 | "text/plain": [ 343 | "HBox(children=(IntProgress(value=0, max=3281534), HTML(value='')))" 344 | ] 345 | }, 346 | "metadata": {}, 347 | "output_type": "display_data" 348 | }, 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "\n", 354 | "76544\n", 355 | "4392\n", 356 | "111.4701018333435 seconds\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "t = time()\n", 362 | "indices_to_filter_out_th = sentences_filter(th, lang='th')\n", 363 | "indices_to_filter_out_en = sentences_filter(en, lang='en')\n", 364 | "\n", 365 | "print(len(indices_to_filter_out_th))\n", 366 | "print(len(indices_to_filter_out_en))\n", 367 | "\n", 368 | "indices_to_filter_out = indices_to_filter_out_th + indices_to_filter_out_en\n", 369 | "indices_to_filter_out = set(indices_to_filter_out)\n", 370 | "\n", 371 | "\n", 372 | "filtered_th = 
[clean_sentence(x) for i, x in enumerate(th) if i not in indices_to_filter_out]\n",
373 |     "filtered_en = [clean_sentence(x) for i, x in enumerate(en) if i not in indices_to_filter_out]\n",
374 |     "\n",
375 |     "print('{} seconds'.format(time() - t))\n"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "code",
380 |    "execution_count": 95,
381 |    "metadata": {},
382 |    "outputs": [
383 |     {
384 |      "data": {
385 |       "text/plain": [
386 |        "'ทาสในกระจกวิเศษ, มาจากพื้นที่ที่ไกลที่สุด'"
387 |       ]
388 |      },
389 |      "execution_count": 95,
390 |      "metadata": {},
391 |      "output_type": "execute_result"
392 |     }
393 |    ],
394 |    "source": [
395 |     "filtered_th[0]"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": null,
401 |    "metadata": {},
402 |    "outputs": [],
403 |    "source": []
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 102,
408 |    "metadata": {},
409 |    "outputs": [],
410 |    "source": [
411 |     "toks = {\n",
412 |     "    'th': {\n",
413 |     "        'sentencepiece': [],\n",
414 |     "        'newmm':[]\n",
415 |     "    },\n",
416 |     "    'en': {\n",
417 |     "        'sentencepiece': [],\n",
418 |     "        'newmm':[]\n",
419 |     "    }\n",
420 |     "}"
421 |    ]
422 |   },
423 |   {
424 |    "cell_type": "markdown",
425 |    "metadata": {},
426 |    "source": [
427 |     "## 1a Segment texts into tokens with `newmm`"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "code",
432 |    "execution_count": null,
433 |    "metadata": {},
434 |    "outputs": [],
435 |    "source": []
436 |   },
437 |   {
438 |    "cell_type": "code",
439 |    "execution_count": 103,
440 |    "metadata": {},
441 |    "outputs": [],
442 |    "source": [
443 |     "# toks['th']['newmm'] = tokenize_handler(filtered_th[:10000], lang='th')\n",
444 |     "\n",
445 |     "# toks['en']['newmm'] = tokenize_handler(filtered_en[:10000], lang='en')"
446 |    ]
447 |   },
448 |   {
449 |    "cell_type": "code",
450 |    "execution_count": 104,
451 |    "metadata": {},
452 |    "outputs": [],
453 |    "source": [
454 |     "# # test with Python Trie\n",
455 |     "# t = time()\n",
456 |     "# for sent in filtered_th[:100000]:\n",
457 |     "#     toks = pythainlp.tokenize.word_tokenize(text=sent,\n",
458 |     "#                                             engine='newmm',\n",
459 |     "#                                             keep_whitespace=False,\n",
460 |     "#                                             custom_dict=pt)\n",
461 |     "    \n",
462 |     "# print('{} s'.format(time() - t))\n",
463 |     "# # toks['th']['newmm'] = tokenize_handler(filtered_th[:10000], lang='th', trie=pt)\n",
464 |     "\n",
465 |     "# # toks['en']['newmm'] = tokenize_handler(filtered_en[:10000], lang='en', trie=pt)"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": 105,
471 |    "metadata": {},
472 |    "outputs": [],
473 |    "source": [
474 |     "# # test with Marisa Trie\n",
475 |     "# t = time()\n",
476 |     "# for sent in filtered_th[:100000]:\n",
477 |     "#     toks = pythainlp.tokenize.word_tokenize(text=sent,\n",
478 |     "#                                             engine='newmm',\n",
479 |     "#                                             keep_whitespace=False)\n",
480 |     "# print('{} s'.format(time() - t))\n",
481 |     "# # toks['th']['newmm'] = tokenize_handler(filtered_th[:10000], lang='th', trie=pt)\n",
482 |     "\n",
483 |     "# # toks['en']['newmm'] = tokenize_handler(filtered_en[:10000], lang='en', trie=pt)"
484 |    ]
485 |   },
486 |   {
487 |    "cell_type": "code",
488 |    "execution_count": 106,
489 |    "metadata": {},
490 |    "outputs": [
491 |     {
492 |      "name": "stdout",
493 |      "output_type": "stream",
494 |      "text": [
495 |       "38.81156301498413 s\n",
496 |       "39.77449178695679 s\n"
497 |      ]
498 |     }
499 |    ],
500 |    "source": [
501 |     "toks['th']['newmm'] = tokenize_handler(filtered_th, lang='th')\n",
502 |     "toks['en']['newmm'] = tokenize_handler(filtered_en, lang='en')\n"
503 |    ]
504 |   },
505 |   {
506 |    "cell_type": "code",
507 |    "execution_count": 107,
508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "(['ทาส ใน กระจก วิเศษ , มาจาก พื้นที่ ที่ ไกล ที่สุด',\n", 514 | " 'ผ่าน ลม และ ความมืด ฉัน เรียก เจ้า',\n", 515 | " 'พูด !',\n", 516 | " 'ให้ ฉัน เห็น พระพักตร์ ของ พระองค์',\n", 517 | " 'สิ่ง ที่ เจ้า จะ รู้ ว่า สมเด็จ พระราชินี ของ ฉัน ได้ อย่างไร',\n", 518 | " 'กระจก วิเศษ บน ผนัง ผู้ ที่ เป็น สังขาร หนึ่ง ทั้งหมด หรือไม่',\n", 519 | " 'ที่ มีชื่อเสียง เป็น ความงาม ของ เจ้า พระ บาท สมเด็จ พระเจ้าอยู่หัว',\n", 520 | " 'แต่ ถือเป็น แม่บ้าน ที่ น่ารัก ที่ ฉัน เห็น',\n", 521 | " 'ยาจก ไม่ สามารถ ซ่อน พระคุณ อ่อนโยน ของ เธอ',\n", 522 | " 'อนิจจา เธอ มี ความเป็นธรรม มากขึ้น กว่า เจ้า'],\n", 523 | " ['Slave in the Magic Mirror , come from the farthest space .',\n", 524 | " 'Through wind and darkness , I summon thee .',\n", 525 | " 'Speak !',\n", 526 | " 'Let me see thy face .',\n", 527 | " 'What wouldst thou know , my Queen ?',\n", 528 | " 'Magic Mirror on the wall , who is the fairest one of all ?',\n", 529 | " 'Famed is thy beauty , Majesty .',\n", 530 | " 'But hold , a lovely maid I see .',\n", 531 | " 'Rags cannot hide her gentle grace .',\n", 532 | " 'Alas , she is more fair than thee .'])" 533 | ] 534 | }, 535 | "execution_count": 107, 536 | "metadata": {}, 537 | "output_type": "execute_result" 538 | } 539 | ], 540 | "source": [ 541 | "toks['th']['newmm'][0:10], toks['en']['newmm'][0:10]" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "## 1b Segment texts into BPE tokens with SentencePiece (BPEmb)\n" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 108, 554 | "metadata": {}, 555 | "outputs": [], 556 | "source": [ 557 | "def encode_bpe(sentences, lang, n_vocab=25000):\n", 558 | " \"\"\"Return a list of bpe tokens give a list of sentences\"\"\"\n", 559 | " segmented_sentences = []\n", 560 | " for sentence in tqdm_notebook(sentences, total=len(sentences)):\n", 561 | "# print(sentence)\n", 562 | " bpe_tokens = bpemb_pretrained[lang]['{}'.format(n_vocab)].encode(sentence)\n", 563 | " segmented_sentences.append(' '.join(bpe_tokens))\n", 564 | " \n", 565 | " return segmented_sentences" 566 | ] 567 | }, 568 | { 569 | "cell_type": "markdown", 570 | "metadata": {}, 571 | "source": [ 572 | "### 1.1 Thai language" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 109, 578 | "metadata": {}, 579 | "outputs": [ 580 | { 581 | "data": { 582 | "application/vnd.jupyter.widget-view+json": { 583 | "model_id": "1ce4eb5e8cbe469886307235a01f1dcc", 584 | "version_major": 2, 585 | "version_minor": 0 586 | }, 587 | "text/plain": [ 588 | "HBox(children=(IntProgress(value=0, max=3202751), HTML(value='')))" 589 | ] 590 | }, 591 | "metadata": {}, 592 | "output_type": "display_data" 593 | }, 594 | { 595 | "name": "stdout", 596 | "output_type": "stream", 597 | "text": [ 598 | "\n", 599 | "['▁ท าส ใน กระจก วิเศษ , ▁มาจาก พื้นที่ ที่ ไกล ที่สุด', '▁ผ่าน ลม และความ มืด ฉัน เรียก เจ้า', '▁พูด !', '▁ให้ ฉัน เห็น พระพักตร์ ของ ▁พระองค์', '▁สิ่งที่ เจ้า จะ รู้ว่า สมเด็จพระราชินี ▁ของ ฉัน ได้อย่างไร', '▁กระจ ก วิเศษ บน ผนัง ▁ผู้ ที่เป็น สัง ขาร หนึ่ง ทั้งหมด ▁หรือไม่', '▁ที่มีชื่อเสียง เป็น ความงาม ของ ▁เจ้า พระบาทสมเด็จพระ เจ้าอยู่หัว', '▁แต่ ถือเป็น แม่ บ้าน ที่น ่ารัก ที่ ฉัน ▁เห็น', '▁ยา จก ไม่สามารถ ซ่อน พระคุณ ▁อ่อน โยน ของเธอ', '▁อน ิจ จา เธอ มีความเป็น ธรรม ▁มาก ขึ้น กว่า เจ้า']\n" 600 | ] 601 | } 602 | ], 603 | "source": [ 604 | "toks['th']['sentencepiece'] = encode_bpe(filtered_th, 'th', 25000)\n", 605 | "\n", 606 | 
"print(toks['th']['sentencepiece'][0:10])" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "### 1.2 English language" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 110, 619 | "metadata": {}, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "application/vnd.jupyter.widget-view+json": { 624 | "model_id": "4a0ac2a9e16c4fe0814a82c713b07a92", 625 | "version_major": 2, 626 | "version_minor": 0 627 | }, 628 | "text/plain": [ 629 | "HBox(children=(IntProgress(value=0, max=3202751), HTML(value='')))" 630 | ] 631 | }, 632 | "metadata": {}, 633 | "output_type": "display_data" 634 | }, 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | "\n", 640 | "['▁slave ▁in ▁the ▁magic ▁mirror , ▁come ▁from ▁the ▁fart hest ▁space .', '▁through ▁wind ▁and ▁darkness , ▁i ▁summon ▁the e .', '▁speak !', '▁let ▁me ▁see ▁thy ▁face .', '▁what ▁would st ▁thou ▁know , ▁my ▁queen ?', '▁magic ▁mirror ▁on ▁the ▁wall , ▁who ▁is ▁the ▁fa ire st ▁one ▁of ▁all ?', '▁famed ▁is ▁thy ▁beauty , ▁majesty .', '▁but ▁hold , ▁a ▁lov ely ▁maid ▁i ▁see .', '▁ra gs ▁cannot ▁hide ▁her ▁gentle ▁grace .', '▁al as , ▁she ▁is ▁more ▁fair ▁than ▁the e .']\n" 641 | ] 642 | } 643 | ], 644 | "source": [ 645 | "toks['en']['sentencepiece'] = encode_bpe(filtered_en, 'en', 25000)\n", 646 | "print(toks['en']['sentencepiece'][0:10])" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": {}, 652 | "source": [ 653 | "## 2. Split train-valid-test " 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 111, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "name": "stdout", 668 | "output_type": "stream", 669 | "text": [ 670 | "N = 3202751\n" 671 | ] 672 | }, 673 | { 674 | "data": { 675 | "text/plain": [ 676 | "(2562200, 320275, 320276)" 677 | ] 678 | }, 679 | "execution_count": 111, 680 | "metadata": {}, 681 | "output_type": "execute_result" 682 | } 683 | ], 684 | "source": [ 685 | "#train-valid-test split 80/10/10\n", 686 | "\n", 687 | "n = len(toks['th']['newmm'])\n", 688 | "\n", 689 | "print('N = ',n)\n", 690 | "idx = list(range(n))\n", 691 | "\n", 692 | "random.seed(1234) # Set SEED\n", 693 | "random.shuffle(idx)\n", 694 | "\n", 695 | "train_idx, valid_idx, test_idx = idx[:int(n*0.8)], idx[int(n*0.8):int(n*0.9)], idx[int(n*0.9):]\n", 696 | "\n", 697 | "dataset_split = {}\n", 698 | "dataset_split['train'] = train_idx\n", 699 | "dataset_split['valid'] = valid_idx\n", 700 | "dataset_split['test'] = test_idx\n", 701 | "\n", 702 | "\n", 703 | "len(train_idx),len(valid_idx),len(test_idx)\n", 704 | "\n" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 112, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [ 713 | "dataset = {\n", 714 | " 'train': {\n", 715 | " 'en': {\n", 716 | " 'sentencepiece': [],\n", 717 | " 'newmm':[]\n", 718 | " },\n", 719 | " 'th': {\n", 720 | " 'sentencepiece': [],\n", 721 | " 'newmm':[]\n", 722 | " }\n", 723 | " },\n", 724 | " 'valid': {\n", 725 | " 'en': {\n", 726 | " 'sentencepiece': [],\n", 727 | " 'newmm':[]\n", 728 | " },\n", 729 | " 'th': {\n", 730 | " 'sentencepiece': [],\n", 731 | " 'newmm':[]\n", 732 | " }\n", 733 | " },\n", 734 | " 'test': {\n", 735 | " 'en': {\n", 736 | " 'sentencepiece': [],\n", 737 | " 'newmm':[]\n", 738 | " },\n", 739 | " 'th': {\n", 740 | " 'sentencepiece': [],\n", 741 | " 'newmm':[]\n", 742 | " }\n", 743 | " }\n", 
744 | "}\n", 745 | "\n", 746 | "for split_name in ['train', 'valid', 'test']:\n", 747 | " for lang in ['th', 'en']:\n", 748 | " for tok_type in ['sentencepiece', 'newmm']:\n", 749 | "\n", 750 | " dataset[split_name][lang][tok_type] = [toks[lang][tok_type][i] for i in dataset_split[split_name]] \n" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 113, 756 | "metadata": {}, 757 | "outputs": [ 758 | { 759 | "name": "stdout", 760 | "output_type": "stream", 761 | "text": [ 762 | "['เบค กี้ เธอ ทำท่า แปลก ๆ เมื่อกี้ ใน ห้อง', 'อยู่ กับ เธอ แอน นา จะ นำทาง คุณ ผม จะ กลับ ไป'] \n", 763 | "\n", 764 | "['Becky , um , you were acting particularly strange in there just now .', \"Stay with her so Anna can guide you . I ' m going back .\"] \n", 765 | "\n", 766 | "['▁เบ ค กี้ ▁เธอ ทํา ท่า แปลก ๆ ▁เมื่อ กี้ ▁ในห้อง', '▁ อยู่กับ เธอ ▁แอนนา จะนํา ทาง คุณ ▁ผม จะ กลับไป'] \n", 767 | "\n", 768 | "['▁bec ky , ▁um , ▁you ▁were ▁acting ▁particularly ▁strange ▁in ▁there ▁just ▁now .', \"▁stay ▁with ▁her ▁so ▁anna ▁can ▁guide ▁you . ▁i ' m ▁going ▁back .\"] \n", 769 | "\n" 770 | ] 771 | } 772 | ], 773 | "source": [ 774 | "print(dataset['train']['th']['newmm'][0:2],'\\n')\n", 775 | "print(dataset['train']['en']['newmm'][0:2],'\\n')\n", 776 | "print(dataset['train']['th']['sentencepiece'][0:2],'\\n')\n", 777 | "print(dataset['train']['en']['sentencepiece'][0:2],'\\n')" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": 119, 783 | "metadata": {}, 784 | "outputs": [ 785 | { 786 | "name": "stdout", 787 | "output_type": "stream", 788 | "text": [ 789 | "Counter({'en_train_n_toks': 92383739, 'th_train_n_toks': 86683223, 'en_valid_n_toks': 11536351, 'en_test_n_toks': 11535798, 'th_test_n_toks': 10833242, 'th_valid_n_toks': 10826042})\n" 790 | ] 791 | } 792 | ], 793 | "source": [ 794 | "# Counting number of tokens for train, valid, test\n", 795 | "counter = Counter( )\n", 796 | "for dataset_type in ['train', 'valid', 'test']:\n", 797 | " for th_sent_toks in dataset[dataset_type]['th']['newmm']:\n", 798 | " counter['th_{}_n_toks'.format(dataset_type)] += len(th_sent_toks)\n", 799 | " for en_sent_toks in dataset[dataset_type]['en']['newmm']:\n", 800 | " counter['en_{}_n_toks'.format(dataset_type)] += len(en_sent_toks)\n", 801 | "\n", 802 | "print(counter) " 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": 124, 808 | "metadata": {}, 809 | "outputs": [ 810 | { 811 | "name": "stdout", 812 | "output_type": "stream", 813 | "text": [ 814 | "create directories: \n", 815 | "dir: ../data/opensubtitles_tok/sentencepiece-sentencepiece/th-en\n", 816 | "dir: ../data/opensubtitles_bin/sentencepiece-sentencepiece/th-en\n", 817 | "create directories: \n", 818 | "dir: ../data/opensubtitles_tok/sentencepiece-sentencepiece/en-th\n", 819 | "dir: ../data/opensubtitles_bin/sentencepiece-sentencepiece/en-th\n", 820 | "create directories: \n", 821 | "dir: ../data/opensubtitles_tok/sentencepiece-newmm/th-en\n", 822 | "dir: ../data/opensubtitles_bin/sentencepiece-newmm/th-en\n", 823 | "create directories: \n", 824 | "dir: ../data/opensubtitles_tok/sentencepiece-newmm/en-th\n", 825 | "dir: ../data/opensubtitles_bin/sentencepiece-newmm/en-th\n", 826 | "create directories: \n", 827 | "dir: ../data/opensubtitles_tok/newmm-sentencepiece/th-en\n", 828 | "dir: ../data/opensubtitles_bin/newmm-sentencepiece/th-en\n", 829 | "create directories: \n", 830 | "dir: ../data/opensubtitles_tok/newmm-sentencepiece/en-th\n", 831 | "dir: 
../data/opensubtitles_bin/newmm-sentencepiece/en-th\n", 832 | "create directories: \n", 833 | "dir: ../data/opensubtitles_tok/newmm-newmm/th-en\n", 834 | "dir: ../data/opensubtitles_bin/newmm-newmm/th-en\n", 835 | "create directories: \n", 836 | "dir: ../data/opensubtitles_tok/newmm-newmm/en-th\n", 837 | "dir: ../data/opensubtitles_bin/newmm-newmm/en-th\n" 838 | ] 839 | } 840 | ], 841 | "source": [ 842 | "\n", 843 | "for tok_type_src in ['sentencepiece', 'newmm']:\n", 844 | " for tok_type_tgt in ['sentencepiece', 'newmm']:\n", 845 | " langs = ['th', 'en']\n", 846 | " for lang in langs:\n", 847 | " src_lang = lang\n", 848 | " tgt_lang = 'en' if lang =='th' else 'th'\n", 849 | " FOLDER_NAME = \"opensubtitles_tok/{}-{}/{}-{}\".format(tok_type_src, tok_type_tgt, src_lang, tgt_lang )\n", 850 | " FOLDER_NAME_BIN = \"opensubtitles_bin/{}-{}/{}-{}\".format(tok_type_src, tok_type_tgt, src_lang, tgt_lang)\n", 851 | " \n", 852 | " \n", 853 | " # Create directories\n", 854 | " print('create directories: ')\n", 855 | " print('dir: ../data/{}'.format(FOLDER_NAME))\n", 856 | " print('dir: ../data/{}'.format(FOLDER_NAME_BIN))\n", 857 | "\n", 858 | " !mkdir -p ../data/{FOLDER_NAME}\n", 859 | " !mkdir -p ../data/{FOLDER_NAME_BIN}\n", 860 | "\n", 861 | " for split_name in ['train', 'valid', 'test']:\n", 862 | " \n", 863 | " write_spaced_tokens_to_file(dataset[split_name][src_lang][tok_type_src],\n", 864 | " FOLDER_NAME, '{}.{}'.format(split_name, src_lang))\n", 865 | " \n", 866 | " write_spaced_tokens_to_file(dataset[split_name][tgt_lang][tok_type_tgt],\n", 867 | " FOLDER_NAME, '{}.{}'.format(split_name, tgt_lang))\n" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 125, 873 | "metadata": {}, 874 | "outputs": [ 875 | { 876 | "name": "stdout", 877 | "output_type": "stream", 878 | "text": [ 879 | "▁bec ky , ▁um , ▁you ▁were ▁acting ▁particularly ▁strange ▁in ▁there ▁just ▁now .\r\n", 880 | "▁stay ▁with ▁her ▁so ▁anna ▁can ▁guide ▁you . ▁i ' m ▁going ▁back .\r\n", 881 | "▁look .\r\n", 882 | "▁oh , ▁no , ▁it ' s ▁the ▁other ▁way ▁around , ▁dr . ▁lewis .\r\n", 883 | "▁sort ▁of .\r\n", 884 | "▁bart ender , ▁something ▁really ▁strong , ▁please .\r\n", 885 | "▁yes , ▁obviously .\r\n", 886 | "▁la ' s ▁so ▁nice .\r\n", 887 | "▁i ' m ▁going ▁to ▁fix ▁it .\r\n", 888 | "▁i ▁get ▁b ored .\r\n" 889 | ] 890 | } 891 | ], 892 | "source": [ 893 | "!head ../data/opensubtitles_tok/newmm-sentencepiece/th-en/train.en\n" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": 126, 899 | "metadata": {}, 900 | "outputs": [ 901 | { 902 | "name": "stdout", 903 | "output_type": "stream", 904 | "text": [ 905 | "เบค กี้ เธอ ทำท่า แปลก ๆ เมื่อกี้ ใน ห้อง\r\n", 906 | "อยู่ กับ เธอ แอน นา จะ นำทาง คุณ ผม จะ กลับ ไป\r\n", 907 | "ฟัง นะ\r\n", 908 | "พอดี เลย ดร. 
ลี วิ ส\r\n", 909 | "แบบ ว่า\r\n", 910 | "เอ่อ บาร์ เท็น เด อร ์ ขอ อะไร ที่\r\n", 911 | "ก็ ใช่ ห น่ะ สิ\r\n", 912 | "แอลเอ สวย เนอะ\r\n", 913 | "ฉัน กำลังจะ แก้ ไขมัน\r\n", 914 | "ฉัน เบื่อ ละ\r\n" 915 | ] 916 | } 917 | ], 918 | "source": [ 919 | "!head ../data/opensubtitles_tok/newmm-sentencepiece/th-en/train.th" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": null, 925 | "metadata": {}, 926 | "outputs": [], 927 | "source": [] 928 | } 929 | ], 930 | "metadata": { 931 | "kernelspec": { 932 | "display_name": "Python 3", 933 | "language": "python", 934 | "name": "python3" 935 | }, 936 | "language_info": { 937 | "codemirror_mode": { 938 | "name": "ipython", 939 | "version": 3 940 | }, 941 | "file_extension": ".py", 942 | "mimetype": "text/x-python", 943 | "name": "python", 944 | "nbconvert_exporter": "python", 945 | "pygments_lexer": "ipython3", 946 | "version": "3.6.8" 947 | } 948 | }, 949 | "nbformat": 4, 950 | "nbformat_minor": 2 951 | } 952 | --------------------------------------------------------------------------------
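The SentencePiece-style output above marks word boundaries with '▁', and the files written by `write_spaced_tokens_to_file` hold one space-separated token sequence per line. Before computing BLEU against raw references, or showing translations to users, those subword tokens have to be joined back into plain text (this is also what `--remove-bpe` does on the fairseq side). A minimal sketch of that reversal; the helper name is ours:

```python
# Reverse BPE/SentencePiece segmentation: drop the spaces between subwords,
# then turn the '▁' word-boundary markers back into spaces.
def decode_sentencepiece(line: str) -> str:
    return line.replace(' ', '').replace('▁', ' ').strip()

print(decode_sentencepiece('▁let ▁me ▁see ▁thy ▁face .'))
# -> 'let me see thy face.'
```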