├── models
    ├── __init__.py
    ├── transformer_utils
    │   ├── __init__.py
    │   ├── mask.py
    │   ├── basic_layer.py
    │   ├── models.py
    │   ├── optim.py
    │   ├── lightweight_layer.py
    │   ├── utils.py
    │   └── evolved_layer.py
    ├── lstm.py
    ├── transformer.py
    └── model_factory.py
├── ngram_model
    ├── requirements.txt
    ├── train_2gram.sh
    ├── train_3gram.sh
    ├── test_2gram.sh
    ├── test_3gram.sh
    ├── result.png
    ├── tokenizer_standarized.h5
    ├── train.py
    ├── Readme.md
    ├── test_2gram.py
    ├── test_3gram.py
    └── accent_utils.py
├── run_bi_lstm.sh
├── run_bi_lstm_large.sh
├── run_transformer_evolved.sh
├── train_bi_lstm.sh
├── train_transformer.sh
├── train_bi_lstm_large.sh
├── test_bi_lstm_large.sh
├── train_transformer_evolved.sh
├── test_transformer_evolved.sh
├── model_config.json
├── dataloader.py
├── preprocess_data
    ├── split_data.py
    └── Preprocess_data_Wikipedia.ipynb
├── translate.py
├── .gitignore
├── README.md
├── test.py
├── train.py
├── utils.py
└── accent_utils.py


/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/models/transformer_utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ngram_model/requirements.txt:
--------------------------------------------------------------------------------
1 | nltk==3.5
2 | 


--------------------------------------------------------------------------------
/ngram_model/train_2gram.sh:
--------------------------------------------------------------------------------
1 | python train.py train.tone 2gram_model.pkl --ngram 2


--------------------------------------------------------------------------------
/ngram_model/train_3gram.sh:
--------------------------------------------------------------------------------
1 | python train.py train.tone 3gram_model.pkl --ngram 3


--------------------------------------------------------------------------------
/ngram_model/test_2gram.sh:
--------------------------------------------------------------------------------
1 | python test_2gram.py val.notone val.tone 2gram_model.pkl


--------------------------------------------------------------------------------
/ngram_model/test_3gram.sh:
--------------------------------------------------------------------------------
1 | python test_3gram.py val.notone val.tone 3gram_model.pkl


--------------------------------------------------------------------------------
/ngram_model/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VNOpenAI/vn-accent/HEAD/ngram_model/result.png


--------------------------------------------------------------------------------
/ngram_model/tokenizer_standarized.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/VNOpenAI/vn-accent/HEAD/ngram_model/tokenizer_standarized.h5


--------------------------------------------------------------------------------
/run_bi_lstm.sh:
--------------------------------------------------------------------------------
1 | python translate.py 'data/tokenizer.h5' 'experiments/bi_lstm/weights/epoch_02.h5' \
2 | --model_name 'bi_lstm'


--------------------------------------------------------------------------------
/run_bi_lstm_large.sh:
--------------------------------------------------------------------------------
1 | python translate.py 'data/tokenizer.h5' 'experiments/bi_lstm_large/weights/epoch_18.h5' \
2 | --model_name 'bi_lstm_large'


--------------------------------------------------------------------------------
/run_transformer_evolved.sh:
--------------------------------------------------------------------------------
1 | python translate.py 'data/tokenizer.h5' 'experiments/transformer_evolved/weights/epoch_14.h5' \
2 | --model_name 'transformer_evolved'


--------------------------------------------------------------------------------
/train_bi_lstm.sh:
--------------------------------------------------------------------------------
1 | python3 train.py 'data/tokenizer.h5' 'data/train' 'data/val' \
2 | --num_epochs 20 \
3 | --cuda \
4 | --learning_rate 0.001 \
5 | --model_name 'bi_lstm' \
6 | --experiment_name 'bi_lstm'
7 | 


--------------------------------------------------------------------------------
/train_transformer.sh:
--------------------------------------------------------------------------------
1 | python3 train.py 'data/tokenizer.h5' 'data/train' 'data/val' \
2 | --num_epochs 50 \
3 | --cuda \
4 | --learning_rate 0.0003 \
5 | --model_name 'transformer' \
6 | --experiment_name 'transformer'
7 | 


--------------------------------------------------------------------------------
/train_bi_lstm_large.sh:
--------------------------------------------------------------------------------
1 | python3 train.py 'data/tokenizer.h5' 'data/train' 'data/val' \
2 | --num_epochs 20 \
3 | --cuda \
4 | --learning_rate 0.0001 \
5 | --model_name 'bi_lstm_large' \
6 | --experiment_name 'bi_lstm_large'
7 | 


--------------------------------------------------------------------------------
/test_bi_lstm_large.sh:
--------------------------------------------------------------------------------
1 | python test.py 'data/tokenizer.h5' 'experiments/bi_lstm_large/weights/epoch_18.h5' \
2 | --model_name 'bi_lstm_large' \
3 | --test_data_file 'data/test.notone' \
4 | --ground_truth_file 'data/test.tone' \
5 | --cuda


--------------------------------------------------------------------------------
/train_transformer_evolved.sh:
--------------------------------------------------------------------------------
1 | python3 train.py 'data/tokenizer.h5' 'data/train' 'data/val' \
2 | --num_epochs 20 \
3 | --cuda \
4 | --learning_rate 0.0003 \
5 | --model_name 'transformer_evolved' \
6 | --experiment_name 'transformer_evolved'
7 | 


--------------------------------------------------------------------------------
/test_transformer_evolved.sh:
--------------------------------------------------------------------------------
1 | python test.py 'data/tokenizer.h5' 'experiments/transformer_evolved/weights/epoch_14.h5' \
2 | --model_name 'transformer_evolved' \
3 | --test_data_file 'data/test.notone' \
4 | --ground_truth_file 'data/test.tone' \
5 | --cuda


--------------------------------------------------------------------------------
/models/lstm.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | import torch.nn.functional as F
 3 | 
 4 | class LSTM(nn.Module):
 5 |     def __init__(self, src_vocab_size, trg_vocab_size, 
 6 |             d_model, bidirectional=False):
 7 |         super().__init__()
 8 |         self.embed = nn.Embedding(src_vocab_size, d_model)
 9 |         num_directions = 2 if bidirectional else 1
10 |         self.lstm = nn.LSTM(d_model, d_model, bidirectional=bidirectional, batch_first=True)
11 |         self.out = nn.Linear(d_model * num_directions, trg_vocab_size)
12 |     def forward(self, src):
13 |         embedded = self.embed(src)
14 |         lstm_output, hidden = self.lstm(embedded)
15 |         output = self.out(lstm_output)
16 |         return output
17 | 


--------------------------------------------------------------------------------
/model_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "transformer_evolved": {
 3 |         "model_type": "TRANSFORMER_ENCODER_EVOLVED",
 4 |         "d_model": 512,
 5 |         "d_ff": 2048,
 6 |         "num_layers": 3,
 7 |         "num_heads": 8,
 8 |         "dropout": 0.1,
 9 |         "use_mask": true
10 |     },
11 |     "transformer": {
12 |         "model_type": "TRANSFORMER_ENCODER_BASE",
13 |         "d_model": 512,
14 |         "d_ff": 2048,
15 |         "num_layers": 3,
16 |         "num_heads": 8,
17 |         "dropout": 0.1,
18 |         "use_mask": true
19 |     },
20 |     "lstm": {
21 |         "model_type": "LSTM",
22 |         "d_model": 256,
23 |         "use_mask": false
24 |     },
25 |     "bi_lstm": {
26 |         "model_type": "LSTM_BIDIRECTIONAL",
27 |         "d_model": 256,
28 |         "use_mask": false
29 |     },
30 |     "bi_lstm_large": {
31 |         "model_type": "LSTM_BIDIRECTIONAL",
32 |         "d_model": 512,
33 |         "use_mask": false
34 |     }
35 | }


--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from torch.utils import data
 3 | from keras.preprocessing.sequence import pad_sequences
 4 | 
 5 | class Dataset(data.Dataset):
 6 |     def __init__(self, src_tokenizer, trg_tokenizer, src_corpus_path, trg_corpus_path, pad_len=200):
 7 |         with open(src_corpus_path) as f:
 8 |             src_data = f.read().split('\n')[:-1]
 9 |         with open(trg_corpus_path) as f:
10 |             trg_data = f.read().split('\n')[:-1]
11 |         
12 |         src_tokens = src_tokenizer.texts_to_sequences(src_data)
13 |         trg_tokens = trg_tokenizer.texts_to_sequences(trg_data)
14 | 
15 |         src_tokens = pad_sequences(src_tokens, pad_len)
16 |         trg_tokens = pad_sequences(trg_tokens, pad_len)
17 | 
18 |         self.src = src_tokens.astype(np.int64)
19 |         self.trg = trg_tokens.astype(np.int64)
20 | 
21 |     def __len__(self):
22 |         return len(self.src)
23 | 
24 |     def __getitem__(self, ix):
25 |         return self.src[ix], self.trg[ix]


--------------------------------------------------------------------------------
/models/transformer_utils/mask.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | 
 5 | def nopeak_mask(size):
 6 |     np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
 7 |     np_mask = np_mask == 0
 8 |     np_mask =  torch.from_numpy(np_mask.astype('uint8'))
 9 |     return np_mask
10 | 
def create_src_mask(src, pad_token):
    """Mark the non-padding positions of ``src``.

    src: BxS tensor of token ids
    pad_token: id used for padding

    Returns a Bx1xS mask (true where ``src`` differs from ``pad_token``)
    that broadcasts to BxSxS.
    """
    not_pad = src.ne(pad_token)
    return not_pad.unsqueeze(dim=-2)
19 | 
def create_trg_mask(trg, pad_token):
    """Combine the padding mask of ``trg`` with the causal mask.

    trg: tensor of token ids whose dimension 1 is the sequence length
         (assumed BxS, like ``create_src_mask`` — confirm with callers)
    pad_token: id used for padding

    Returns the element-wise AND of the Bx1xS padding mask and the 1xSxS
    causal mask from ``nopeak_mask``.
    """
    trg_mask = (trg != pad_token).unsqueeze(-2)
    size = trg.size(1)  # sequence length of the square causal mask
    # Put the causal mask on the same device as ``trg``. ``.cuda()`` would
    # use the current CUDA device, which may differ from trg's device.
    np_mask = nopeak_mask(size).to(trg.device)
    return trg_mask & np_mask
28 | 
def create_mask(src, trg, src_pad_token, trg_pad_token):
    """Return ``(src_mask, trg_mask)`` for a source/target batch pair."""
    return (
        create_src_mask(src, src_pad_token),
        create_trg_mask(trg, trg_pad_token),
    )


--------------------------------------------------------------------------------
/preprocess_data/split_data.py:
--------------------------------------------------------------------------------
 1 | from accent_utils import *
 2 | import random
 3 | 
 4 | 
 5 | data = []
 6 | 
 7 | with open("data/wikipedia.txt", 'r', encoding='utf-8') as f:
 8 |     wiki_data = f.read().split("\n")
 9 |     print("Wiki: {} sentences".format(len(wiki_data)))
10 |     data += wiki_data
11 | 
12 | with open("data/yhoc.txt", 'r', encoding='utf-8') as f:
13 |     yhoc_data = f.read().split("\n")
14 |     print("YHoc: {} sentences".format(len(yhoc_data)))
15 |     data += yhoc_data
16 | 
17 | random.seed(42)
18 | random.shuffle(data)
19 | 
20 | X = data
21 | y = [remove_tone_line(x) for x in X]
22 | 
23 | X_val = X[:10000]
24 | y_val = y[:10000]
25 | X_test = X[10000:20000]
26 | y_test = y[10000:20000]
27 | X_train = X[20000:]
28 | y_train = y[20000:]
29 | 
30 | with open("data/train.tone", 'w', encoding='utf-8') as f:
31 |     f.write("\n".join(X_train))
32 | with open("data/train.notone", 'w', encoding='utf-8') as f:
33 |     f.write("\n".join(y_train))
34 | print("Train: {} samples".format(len(X_train)))
35 | 
36 | with open("data/val.tone", 'w', encoding='utf-8') as f:
37 |     f.write("\n".join(X_val))
38 | with open("data/val.notone", 'w', encoding='utf-8') as f:
39 |     f.write("\n".join(y_val))
40 | print("Val: {} samples".format(len(X_val)))
41 | 
42 | with open("data/test.tone", 'w', encoding='utf-8') as f:
43 |     f.write("\n".join(X_test))
44 | with open("data/test.notone", 'w', encoding='utf-8') as f:
45 |     f.write("\n".join(y_test))
46 | print("Test: {} samples".format(len(X_test)))
47 | 


--------------------------------------------------------------------------------
/models/transformer.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | from .transformer_utils.models import Decoder, Encoder
 4 | from .transformer_utils import basic_layer
 5 | 
 6 | 
 7 | class TransformerEncoder(nn.Module):
 8 |     def __init__(self, src_vocab_size, trg_vocab_size, 
 9 |             d_model, d_ff, num_layers, num_heads, dropout, 
10 |             layer_type=basic_layer):
11 |         super().__init__()
12 |         self.encoder = Encoder(src_vocab_size, d_model, d_ff, num_layers, num_heads, dropout, layer_type.EncoderLayer)
13 |         self.out = nn.Linear(d_model, trg_vocab_size)
14 |     def forward(self, src, src_mask):
15 |         e_outputs = self.encoder(src, src_mask)
16 |         output = self.out(e_outputs)
17 |         return output
18 | 
19 | 
class Transformer(nn.Module):
    """Encoder-decoder model followed by a linear projection.

    ``layer_type`` is a module (``basic_layer`` by default) providing the
    ``EncoderLayer`` and ``DecoderLayer`` classes used to build the stacks.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 d_model, d_ff, num_layers, num_heads, dropout,
                 layer_type=basic_layer):
        super().__init__()
        stack_args = (d_model, d_ff, num_layers, num_heads, dropout)
        self.encoder = Encoder(src_vocab_size, *stack_args, layer_type.EncoderLayer)
        self.decoder = Decoder(trg_vocab_size, *stack_args, layer_type.DecoderLayer)
        self.out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg, src_mask, trg_mask):
        """Encode ``src``, decode ``trg`` against it and project the result."""
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.out(decoded)
33 | 


--------------------------------------------------------------------------------
/ngram_model/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from torch import load
 3 | from tqdm import tqdm
 4 | import pickle
 5 | from nltk.tokenize import word_tokenize
 6 | from nltk.lm.preprocessing import padded_everygram_pipeline
 7 | from nltk.lm import KneserNeyInterpolated
 8 | import time
 9 | 
10 | 
def get_arg():
    """Parse the command line.

    Positional arguments are ``doc_dir`` (the training corpus file) and
    ``model_dir`` (where the pickled model is written); ``--ngram`` is the
    integer n-gram order and defaults to 3.
    """
    parser = argparse.ArgumentParser()
    for positional in ('doc_dir', 'model_dir'):
        parser.add_argument(positional)
    parser.add_argument('--ngram', default=3, type=int)
    return parser.parse_args()
17 | 
def tokenize(doc):
    """Tokenize every sentence of ``doc``, mapping unknown words to 'unknown'.

    The vocabulary is the ``word_index`` of the 'tone' entry stored in
    'tokenizer_standarized.h5', resolved relative to the working directory.
    Returns one list of tokens per input sentence.
    """
    tokenizers = load('tokenizer_standarized.h5')
    known_words = tokenizers['tone'].word_index
    corpus = []
    for sentence in tqdm(doc):
        corpus.append([
            token if token in known_words else 'unknown'
            for token in word_tokenize(sentence)
        ])
    print('tokenize done')
    return corpus
31 |     
32 | 
if __name__=='__main__':
    args = get_arg()

    # Read the training corpus and tokenize it; the raw lines are released
    # as soon as they are no longer needed.
    with open(args.doc_dir, 'r', encoding='utf-8') as corpus_file:
        sentences = corpus_file.readlines()
    tokenized = tokenize(sentences)
    del sentences

    ngram_stream, vocab_stream = padded_everygram_pipeline(args.ngram, tokenized)
    del tokenized

    # Fit a Kneser-Ney interpolated model of the requested order and time it.
    language_model = KneserNeyInterpolated(args.ngram)
    fit_started = time.time()
    language_model.fit(ngram_stream, vocab_stream)
    elapsed = time.time() - fit_started
    print('train %s-gram model in %d s'%(args.ngram, elapsed))
    print('length of vocab = %s'%(len(language_model.vocab)))

    # Persist the fitted model as a pickle.
    with open(args.model_dir, 'wb') as model_file:
        pickle.dump(language_model, model_file)
    print('save model successfully!')
53 |   
54 | 
55 | 
56 |   


--------------------------------------------------------------------------------
/ngram_model/Readme.md:
--------------------------------------------------------------------------------
 1 | # Giới thiệu
 2 | Mã nguồn này được phát triển dựa trên bài viết tại [đây](https://viblo.asia/p/language-modeling-mo-hinh-ngon-ngu-va-bai-toan-them-dau-cau-trong-tieng-viet-1VgZveV2KAw?fbclid=IwAR27jOzmETv8zUj-idE5uMh7BGuKQqvOMtnTRpAdqVhhVwDHXbuTfYy59J0). 
 3 | **So với bài viết trên, tác giả đã thay đổi:**
 4 | - [x] Sử dụng dữ liệu từ wiki + báo cáo y tế.
 5 | - [x] Thêm metric + code test.
 6 | - [x] Thêm code tiền/hậu xử lý cho chuỗi đầu vào.
 7 | - [x] Khảo sát độ chính xác model và tốc độ với các tham số khác nhau.
 8 | Xem nhanh file hướng dẫn demo ở [đây](https://colab.research.google.com/drive/1_kNLhHi9Kc4d__Y6DfFuh7popS1A5DmT?usp=sharing)
 9 | ## Tiền xử lý
10 | Ở đây chúng ta không dùng tokenizer tạo từ các tập data trên mà dùng word list từ [vietnamese-wordlist](https://github.com/duyetdev/vietnamese-wordlist) để tạo bộ tokenizer bằng cách split các từ trong word list và chọn lại các từ đơn.
11 | Các từ không nằm trong word list sẽ được gán thành 'unknown'. Điều này làm giảm đi rất nhiều kích thước model và làm tăng độ chính xác.
12 | ## Model
13 | Triển khai model 2grams và 3grams. 
14 | Mô hình sau khi huấn luyện tại [đây](https://drive.google.com/drive/folders/1I0uzjSMQOb07I0nnbWzCsgwCmdmjRD9c?usp=sharing)
15 | ## Train model
16 | ```sh
17 | python train.py train.tone 2gram_model.pkl --ngram 2
18 | ```
19 | ```sh
20 | python train.py train.tone 3gram_model.pkl --ngram 3
21 | ```
22 | ## Test model
23 | ```sh
24 | python test_2gram.py val.notone val.tone 2gram_model.pkl
25 | ```
26 | ```sh
27 | python test_3gram.py val.notone val.tone 3gram_model.pkl
28 | ```
29 | ## Kết quả
30 | 
31 | | ngram | nbeam | val_score | runtime(500 sen) | test_score | model_size |
32 | | -- | -- | -- | -- | --| -- |
33 | | 2gram | 1 | 0.8231 | 8.5 |  | 41Mb |
34 | ||2|0.8905|14.65|||
35 | ||3|0.9138|15|||
36 | ||4|0.9216|16.5|||
37 | ||5|0.9258|18.3|0.9221||
38 | ||6|0.9273|18.5|||
39 | ||7|0.9282|18.9|||
40 | |3gram|1|0.8840|9||441Mb|
41 | ||2|0.9128|15.5|||
42 | ||3|0.9309|14.35|||
43 | ||4|0.9369|19.5|||
44 | ||5|0.9411|25.5|0.9401||
45 | ||6|0.9438|26.5|||
46 | ||7|0.9453|31.9|||
47 | 
48 | ![result](result.png)
49 | 


--------------------------------------------------------------------------------
/models/transformer_utils/basic_layer.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | from .utils import *
 4 | 
 5 | class EncoderLayer(nn.Module):
 6 |     def __init__(self, d_model, d_ff, heads, dropout=0.1):
 7 |         super().__init__()
 8 |         self.norm_1 = Norm(d_model)
 9 |         self.norm_2 = Norm(d_model)
10 |         self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
11 |         self.ff = FeedForward(d_model, d_ff, dropout=dropout)
12 |         self.dropout_1 = nn.Dropout(dropout)
13 |         self.dropout_2 = nn.Dropout(dropout)
14 |         
15 |     def forward(self, x, mask):
16 |         residual = x
17 |         x = self.norm_1(x)
18 |         x = self.attn(x,x,x,mask)
19 |         x = self.dropout_1(x)
20 |         x = residual + x
21 | 
22 |         residual = x
23 |         x = self.norm_2(x)
24 |         x = self.ff(x)
25 |         x = self.dropout_2(x)
26 |         x = residual + x
27 |         
28 |         return x
29 | 
30 | 
class DecoderLayer(nn.Module):
    """Decoder block: self-attention, attention over encoder outputs, feed-forward.

    Each sub-layer is applied to a normalised copy of its input, passed
    through dropout and added back to the un-normalised input.
    """

    def __init__(self, d_model, d_ff, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)

        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)

        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, d_ff, dropout=dropout)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        # Self-attention over the target sequence, restricted by trg_mask.
        normed = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(normed, normed, normed, trg_mask))

        # Attention with the encoder outputs as keys and values.
        query = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(query, e_outputs, e_outputs, src_mask))

        # Position-wise feed-forward.
        x = x + self.dropout_3(self.ff(self.norm_3(x)))
        return x
66 | 
67 | 
68 | 


--------------------------------------------------------------------------------
/models/transformer_utils/models.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | 
 3 | import torch.nn as nn
 4 | 
 5 | from .basic_layer import * 
 6 | 
 7 | 
 8 | def get_clones(module, N):
 9 |     return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
10 | 
class Encoder(nn.Module):
    """Stack of ``N`` encoder layers on top of an embedder and positional encoder.

    ``encoder_layer`` is the layer class; it is instantiated once with
    ``(d_model, d_ff, heads, dropout)`` and deep-copied ``N`` times.
    """

    def __init__(self, vocab_size, d_model, d_ff, N, heads, dropout, encoder_layer):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        prototype = encoder_layer(d_model, d_ff, heads, dropout)
        self.layers = get_clones(prototype, N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        """Embed ``src``, run it through the first ``N`` layers and normalise."""
        hidden = self.pe(self.embed(src))
        for index in range(self.N):
            layer = self.layers[index]
            hidden = layer(hidden, mask)
        return self.norm(hidden)
25 | 
class Decoder(nn.Module):
    """Stack of ``N`` decoder layers on top of an embedder and positional encoder.

    ``decoder_layer`` is the layer class; it is instantiated once with
    ``(d_model, d_ff, heads, dropout)`` and deep-copied ``N`` times.
    """

    def __init__(self, vocab_size, d_model, d_ff, N, heads, dropout, decoder_layer):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model, dropout=dropout)
        prototype = decoder_layer(d_model, d_ff, heads, dropout)
        self.layers = get_clones(prototype, N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        """Embed ``trg``, run it through the first ``N`` layers and normalise."""
        hidden = self.pe(self.embed(trg))
        for index in range(self.N):
            layer = self.layers[index]
            hidden = layer(hidden, e_outputs, src_mask, trg_mask)
        return self.norm(hidden)
40 | 
class Transformer(nn.Module):
    """Encoder-decoder model built from explicit layer classes.

    ``encoder_layer`` and ``decoder_layer`` are the classes used for the
    two stacks; a final linear layer maps ``d_model`` to ``trg_vocab_size``.
    """

    def __init__(self, src_vocab_size, trg_vocab_size,
                 d_model, d_ff, N, heads, dropout,
                 encoder_layer=EncoderLayer, decoder_layer=DecoderLayer):
        super().__init__()
        stack_args = (d_model, d_ff, N, heads, dropout)
        self.encoder = Encoder(src_vocab_size, *stack_args, encoder_layer)
        self.decoder = Decoder(trg_vocab_size, *stack_args, decoder_layer)
        self.out = nn.Linear(d_model, trg_vocab_size)

    def forward(self, src, trg, src_mask, trg_mask):
        """Encode ``src``, decode ``trg`` against it and project the result."""
        memory = self.encoder(src, src_mask)
        decoded = self.decoder(trg, memory, src_mask, trg_mask)
        return self.out(decoded)
54 | 
55 |   


--------------------------------------------------------------------------------
/translate.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | import argparse
 4 | 
 5 | import torch
 6 | 
 7 | from models.model_factory import get_model
 8 | from utils import translate
 9 | 
def get_arg():
    """Parse the command line of the interactive translation script.

    Positional arguments:
        vocab_path: file holding the tokenizers.
        weight_file: file holding the model weights.

    Options:
        --config_file: JSON file describing the available models.
        --model_name: key of the model to build from the config file.
        --batch_size: integer, parsed but not used by this script's main block.
        --cuda: run on the GPU when one is available.

    Returns:
        The parsed ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('vocab_path')
    parser.add_argument('weight_file')
    parser.add_argument('--config_file', default='model_config.json')
    # The default must be a key of model_config.json. The previous default
    # ('big_evolved') is not defined there, so it always failed with
    # "Invalid model name".
    parser.add_argument('--model_name', default='transformer_evolved')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--cuda', action='store_true', default=False)

    return parser.parse_args()
24 | 
25 | 
if __name__=='__main__':
    args = get_arg()

    # Load tokenizer
    # NOTE: torch.load unpickles arbitrary objects; only use trusted files.
    print("Load tokenizer")
    tokenizer = torch.load(args.vocab_path)
    src_tokenizer = tokenizer['notone']
    trg_tokenizer = tokenizer['tone']
    src_pad_token = 0
    trg_pad_token = 0

    # Load model
    print("Init model")
    with open(args.config_file) as f:
        config = json.load(f)

    if args.model_name not in config:
        raise Exception("Invalid model name")
    model_param = config[args.model_name]

    # Index 0 is the padding id, hence the +1 on both vocabulary sizes.
    model_param['src_vocab_size'] = len(src_tokenizer.word_index) + 1
    model_param['trg_vocab_size'] = len(trg_tokenizer.word_index) + 1

    model = get_model(model_param)
    device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
    print("Using", device.type)
    model = model.to(device)

    if not os.path.isfile(args.weight_file):
        raise Exception("Invalid weight path")

    print("Load model")
    # map_location lets weights saved on a GPU load on a CPU-only host.
    state = torch.load(args.weight_file, map_location=device)
    # A checkpoint may wrap the weights under 'model'. A bare state_dict is
    # also a dict, so test for the key rather than only for the type.
    if isinstance(state, dict) and 'model' in state:
        state = state['model']
    model.load_state_dict(state)
    # Inference only: disable dropout and other training-time behaviour.
    model.eval()

    while True:
        try:
            sents = input("String: ")
        except (EOFError, KeyboardInterrupt):
            # End the session cleanly on Ctrl-D / Ctrl-C.
            print()
            break
        res = translate(model, sents, src_tokenizer, trg_tokenizer, use_mask=model_param["use_mask"], device=device)
        print(res)
70 | 
71 | 
72 | 


--------------------------------------------------------------------------------
/models/transformer_utils/optim.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | 
 5 | class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
 6 |     """
 7 |     Cosine annealing with restarts.
 8 | 
 9 |     Parameters
10 |     ----------
11 |     optimizer : torch.optim.Optimizer
12 | 
13 |     T_max : int
14 |         The maximum number of iterations within the first cycle.
15 | 
16 |     eta_min : float, optional (default: 0)
17 |         The minimum learning rate.
18 | 
19 |     last_epoch : int, optional (default: -1)
20 |         The index of the last epoch.
21 | 
22 |     """
23 | 
24 |     def __init__(self,
25 |                  optimizer: torch.optim.Optimizer,
26 |                  T_max: int,
27 |                  eta_min: float = 0.,
28 |                  last_epoch: int = -1,
29 |                  factor: float = 1.) -> None:
30 |         # pylint: disable=invalid-name
31 |         self.T_max = T_max
32 |         self.eta_min = eta_min
33 |         self.factor = factor
34 |         self._last_restart = 0
35 |         self._cycle_counter = 0
36 |         self._cycle_factor = 1.0
37 |         self._updated_cycle_len = T_max
38 |         self._initialized = False
39 |         super(CosineWithRestarts, self).__init__(optimizer, last_epoch)
40 | 
41 |     def get_lr(self):
42 |         """Get updated learning rate."""
43 |         # HACK: We need to check if this is the first time get_lr() was called, since
44 |         # we want to start with step = 0, but _LRScheduler calls get_lr with
45 |         # last_epoch + 1 when initialized.
46 |         if not self._initialized:
47 |             self._initialized = True
48 |             return self.base_lrs
49 | 
50 |         step = self.last_epoch + 1
51 |         self._cycle_counter = step - self._last_restart
52 | 
53 |         lrs = [
54 |             (
55 |                 self.eta_min + ((lr - self.eta_min) / 2) *
56 |                 (
57 |                     np.cos(
58 |                         np.pi *
59 |                         ((self._cycle_counter) % self._updated_cycle_len) /
60 |                         self._updated_cycle_len
61 |                     ) + 1
62 |                 )
63 |             ) for lr in self.base_lrs
64 |         ]
65 | 
66 |         if self._cycle_counter % self._updated_cycle_len == 0:
67 |             # Adjust the cycle length.
68 |             self._cycle_factor *= self.factor
69 |             self._cycle_counter = 0
70 |             self._updated_cycle_len = int(self._cycle_factor * self.T_max)
71 |             self._last_restart = step
72 | 
73 |         return lrs


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # Ipython
  7 | .ipynb_checkpoints/
  8 | 
  9 | # Log and weight
 10 | log.txt
 11 | weight/
 12 | 
 13 | # MacOS
 14 | .DS_Store
 15 | 
 16 | # Data, Test dir
 17 | data/
 18 | test/
 19 | 
 20 | # C extensions
 21 | *.so
 22 | 
 23 | # Distribution / packaging
 24 | .Python
 25 | build/
 26 | develop-eggs/
 27 | dist/
 28 | downloads/
 29 | eggs/
 30 | .eggs/
 31 | lib/
 32 | lib64/
 33 | parts/
 34 | sdist/
 35 | var/
 36 | wheels/
 37 | pip-wheel-metadata/
 38 | share/python-wheels/
 39 | *.egg-info/
 40 | .installed.cfg
 41 | *.egg
 42 | MANIFEST
 43 | 
 44 | # PyInstaller
 45 | #  Usually these files are written by a python script from a template
 46 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 47 | *.manifest
 48 | *.spec
 49 | 
 50 | # Installer logs
 51 | pip-log.txt
 52 | pip-delete-this-directory.txt
 53 | 
 54 | # Unit test / coverage reports
 55 | htmlcov/
 56 | .tox/
 57 | .nox/
 58 | .coverage
 59 | .coverage.*
 60 | .cache
 61 | nosetests.xml
 62 | coverage.xml
 63 | *.cover
 64 | .hypothesis/
 65 | .pytest_cache/
 66 | 
 67 | # Translations
 68 | *.mo
 69 | *.pot
 70 | 
 71 | # Django stuff:
 72 | *.log
 73 | local_settings.py
 74 | db.sqlite3
 75 | db.sqlite3-journal
 76 | 
 77 | # Flask stuff:
 78 | instance/
 79 | .webassets-cache
 80 | 
 81 | # Scrapy stuff:
 82 | .scrapy
 83 | 
 84 | # Sphinx documentation
 85 | docs/_build/
 86 | 
 87 | # PyBuilder
 88 | target/
 89 | 
 90 | # Jupyter Notebook
 91 | .ipynb_checkpoints
 92 | 
 93 | # IPython
 94 | profile_default/
 95 | ipython_config.py
 96 | 
 97 | # pyenv
 98 | .python-version
 99 | 
100 | # pipenv
101 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
102 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
103 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
104 | #   install all needed dependencies.
105 | #Pipfile.lock
106 | 
107 | # celery beat schedule file
108 | celerybeat-schedule
109 | 
110 | # SageMath parsed files
111 | *.sage.py
112 | 
113 | # Environments
114 | .env
115 | .venv
116 | env/
117 | venv/
118 | ENV/
119 | env.bak/
120 | venv.bak/
121 | 
122 | # Spyder project settings
123 | .spyderproject
124 | .spyproject
125 | 
126 | # Rope project settings
127 | .ropeproject
128 | 
129 | # mkdocs documentation
130 | /site
131 | 
132 | # mypy
133 | .mypy_cache/
134 | .dmypy.json
135 | dmypy.json
136 | 
137 | # Pyre type checker
138 | .pyre/
139 | 
140 | 
141 | /experiments
142 | 
143 | /.vscode


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Giới thiệu
 2 | 
 3 | Mã nguồn này được xây dựng dựa trên bộ mã nguồn tại [đây](https://github.com/vudaoanhtuan/vietnamese-tone-prediction), với mục đích thêm dấu tiếng Việt cho báo cáo y tế.
 4 | 
 5 | **Các thay đổi đã/sẽ thực hiện:**
 6 | 
 7 | - [x] Refactor code + Thêm model LSTM + Linear.
 8 | - [x] Sử dụng dữ liệu từ wiki + báo y tế.
 9 | - [x] Thêm metric + code test.
10 | - [x] Thêm code tiền/hậu xử lý cho chuỗi đầu vào.
11 | - [ ] Thử nghiệm mô hình đoán dấu thay vì đoán từ.
12 | 
13 | ## Model
14 | 
15 | Repo này triển khai các mô hình dựa trên kiến trúc LSTM và Transformer. Trong đó với Transformer, do cần dự đoán một từ có dấu tương ứng với một từ không có dấu nên chỉ cần dùng Encoder là đủ.  
16 | 
17 | ## Data
18 | 
19 | ### Nguồn data
20 | 
21 | - [Wikipedia](https://dumps.wikimedia.org/viwiki/latest/viwiki-latest-pages-articles.xml.bz2). File dữ liệu tải sẵn tại bài viết này: <https://phamdinhkhanh.github.io/2020/05/28/TransformerThemDauTV.html>.
22 | - Dữ liệu tin tức và bài viết y tế tự crawl.
23 | ### Kết quả
24 | - Dữ liệu tổng hợp được tại [đây](https://drive.google.com/drive/folders/1Ik_oK5_AeU60LZ2cx3nOAycM-HG2BsGp?fbclid=IwAR3x-rbGZRDLaC_tTccJvF2H2S2zsAlZxQ_1RwRS4iQXZdGQD5qKAYMtT7Q)
25 | ### Một số bước tiền xử lí với 2 tập này:
26 | 
27 | - Với tập wiki thì định dạng của nó là xml nên cần dùng tool [wikiextractor](https://github.com/attardi/wikiextractor) để lấy nội dung text của các bài viết. Sau đó tách các câu dựa vào các dấu chấm câu như `.!;:`
28 | - Với tập các bài báo cũng tách các câu như vậy
29 | - Với mỗi câu, loại bỏ số, các dấu chấm câu và đưa về chữ thường
30 | - Loại bỏ các câu có dưới 10 từ hoặc nhiều hơn 200 từ rồi ghi ra file text, mỗi câu một dòng; kết quả là một file khoảng 5300000 dòng
31 | - Dùng script để tạo file không có dấu từ file trên và chia thành 2 tập training và validation (tỉ lệ 85-15)
32 | 
33 | ## Tokenizer
34 | 
35 | Ở đây chúng ta không dùng tokenizer tạo từ các tập data trên mà dùng word list từ [vietnamese-wordlist](https://github.com/duyetdev/vietnamese-wordlist) để tạo bộ tokenizer bằng cách split các từ trong word list và chọn lại các từ đơn. 
36 | Kết quả được khoảng 9000 từ có dấu và 3000 từ không có dấu.
37 | 
38 | ## Demo
39 | 
40 | - Transformer Evolved:
41 | 
42 | ```
43 | sh run_transformer_evolved.sh
44 | ```
45 | 
46 | ## Train
47 | 
48 | - Transformer Evolved:
49 | 
50 | ```
51 | sh train_transformer_evolved.sh
52 | ```
53 | 
54 | ## Test
55 | 
56 | - Transformer Evolved:
57 | 
58 | ```
59 | sh test_transformer_evolved.sh
60 | ```
61 | 
62 | ## Tham khảo
63 | 
64 | Mã nguồn được xây dựng trên mã nguồn của tác giả Vũ Đào Anh Tuấn tại [đây](https://github.com/vudaoanhtuan/vietnamese-tone-prediction).
65 | 


--------------------------------------------------------------------------------
/models/model_factory.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | import torch.nn as nn
 4 | 
 5 | from .transformer_utils import evolved_layer, basic_layer
 6 | 
 7 | from .transformer import TransformerEncoder, Transformer
 8 | from .lstm import LSTM
 9 | 
10 | 
def get_model(model_param):
    """Build the network selected by ``model_param`` and initialise its weights.

    ``model_param`` is a mapping that provides ``model_type``,
    ``src_vocab_size`` and ``trg_vocab_size`` plus the optional
    hyper-parameters ``d_model`` (default 512), ``d_ff`` (2048),
    ``num_layers`` (6), ``num_heads`` (8) and ``dropout`` (0.0).

    Every parameter with more than one dimension is re-initialised with
    Xavier-uniform before the model is returned.

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    model_type = model_param.get("model_type")
    src_vocab_size = model_param.get("src_vocab_size")
    trg_vocab_size = model_param.get("trg_vocab_size")
    d_model = model_param.get("d_model", 512)
    d_ff = model_param.get("d_ff", 2048)
    num_layers = model_param.get("num_layers", 6)
    num_heads = model_param.get("num_heads", 8)
    dropout = model_param.get("dropout", 0.0)

    # assert d_model % num_heads == 0
    assert dropout < 1

    # Arguments shared by every Transformer flavour; only the class and the
    # layer implementation differ between them.
    transformer_args = (src_vocab_size, trg_vocab_size,
                        d_model, d_ff, num_layers, num_heads,
                        dropout)

    # (name, factory) pairs; the factories are lazy so that only the
    # requested architecture is ever instantiated.
    builders = (
        ("TRANSFORMER_BASE",
         lambda: Transformer(*transformer_args, layer_type=basic_layer)),
        ("TRANSFORMER_ENCODER_BASE",
         lambda: TransformerEncoder(*transformer_args, layer_type=basic_layer)),
        ("TRANSFORMER_EVOLVED",
         lambda: Transformer(*transformer_args, layer_type=evolved_layer)),
        ("TRANSFORMER_ENCODER_EVOLVED",
         lambda: TransformerEncoder(*transformer_args, layer_type=evolved_layer)),
        ("LSTM",
         lambda: LSTM(src_vocab_size, trg_vocab_size, d_model, bidirectional=False)),
        ("LSTM_BIDIRECTIONAL",
         lambda: LSTM(src_vocab_size, trg_vocab_size, d_model, bidirectional=True)),
    )

    for name, build in builders:
        if model_type == name:
            model = build()
            break
    else:
        raise ValueError("Wrong model type: {}".format(model_type))

    # Biases and other 1-D parameters keep their default initialisation.
    for weight in model.parameters():
        if weight.dim() > 1:
            nn.init.xavier_uniform_(weight)

    return model
53 | 
def load_model(model, optim=None, sched=None, path=''):
    """Restore a training checkpoint into ``model`` (and optionally more).

    The checkpoint is expected to be a dict with a ``'model'`` entry and,
    when the corresponding argument is given, ``'optim'`` and ``'sched'``
    entries holding the respective state dicts.

    Args:
        model: module whose ``load_state_dict`` receives ``state['model']``.
        optim: optional optimizer restored from ``state['optim']``.
        sched: optional scheduler restored from ``state['sched']``.
        path: checkpoint file to read.

    Raises:
        FileNotFoundError: if ``path`` is not an existing file.
    """
    if not os.path.isfile(path):
        raise FileNotFoundError("Invalid path: {!r}".format(path))

    # Deserialise onto the CPU so that a checkpoint written on a GPU machine
    # can also be restored on a CPU-only host; load_state_dict copies the
    # values onto whatever device the target parameters live on.
    state = torch.load(path, map_location='cpu')
    model.load_state_dict(state['model'])
    if optim is not None:
        optim.load_state_dict(state['optim'])
    if sched is not None:
        sched.load_state_dict(state['sched'])


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import os
  4 | 
  5 | import numpy as np
  6 | import torch
  7 | from tqdm import trange
  8 | 
  9 | from accent_utils import process_line
 10 | from utils import translate
 11 | 
 12 | from models.model_factory import get_model
 13 | 
 14 | 
 15 | def get_arg():
 16 |     parser = argparse.ArgumentParser()
 17 |     parser.add_argument('vocab_path')
 18 |     parser.add_argument('weight_file')
 19 |     parser.add_argument('--test_data_file', default="data/test.notone")
 20 |     parser.add_argument('--ground_truth_file', default="data/test.tone")
 21 |     parser.add_argument('--config_file', default='model_config.json')
 22 |     parser.add_argument('--model_name', default='big_evolved')
 23 |     parser.add_argument('--batch_size', type=int, default=32)
 24 |     parser.add_argument('--cuda', action='store_true', default=False)
 25 |     args = parser.parse_args()
 26 |     return args
 27 | 
 28 | def evaluate(pred, label):
 29 |     _, pred_punc = process_line(pred)
 30 |     _, label_punc = process_line(label)
 31 | 
 32 |     pred_punc = np.array(pred_punc)
 33 |     label_punc = np.array(label_punc)
 34 | 
 35 |     true_values = np.sum(pred_punc==label_punc)
 36 |     n_values = len(pred_punc)
 37 | 
 38 |     return true_values, n_values
 39 | 
 40 | if __name__=='__main__':
 41 |     args = get_arg()
 42 | 
 43 |     # Load tokenizer
 44 |     print("Load tokenizer")
 45 |     tokenizer = torch.load(args.vocab_path)
 46 |     src_tokenizer = tokenizer['notone']
 47 |     trg_tokenizer = tokenizer['tone']
 48 |     src_pad_token = 0
 49 |     trg_pad_token = 0
 50 | 
 51 |     # Load model
 52 |     print("Init model")
 53 |     with open(args.config_file) as f:
 54 |         config = json.load(f)
 55 |     
 56 |     if args.model_name in config:
 57 |         model_param = config[args.model_name]
 58 |     else:
 59 |         raise Exception("Invalid model name")
 60 |     
 61 |     model_param['src_vocab_size'] = len(src_tokenizer.word_index) + 1
 62 |     model_param['trg_vocab_size'] = len(trg_tokenizer.word_index) + 1
 63 | 
 64 |     model = get_model(model_param)
 65 |     device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
 66 |     print("Using", device.type)
 67 |     if device.type=='cuda':
 68 |         model = model.cuda()
 69 | 
 70 |     if os.path.isfile(args.weight_file):
 71 |         print("Load model")
 72 |         state = torch.load(args.weight_file)
 73 |         if isinstance(state, dict):
 74 |             model.load_state_dict(state['model'])
 75 |         else:
 76 |             model.load_state_dict(state)
 77 |     else:
 78 |         raise Exception("Invalid weight path")
 79 | 
 80 | 
 81 |     test_data_lines = None
 82 |     ground_truth_lines = None
 83 |     with open(args.test_data_file, "r", encoding='utf-8') as f:
 84 |         test_data_lines = f.readlines()
 85 |     with open(args.ground_truth_file, "r", encoding='utf-8') as f:
 86 |         ground_truth_lines = f.readlines()
 87 | 
 88 |     total_true_values = 0
 89 |     total_values = 0
 90 |     t = trange(len(test_data_lines), desc='', leave=True)
 91 |     for i in t:
 92 |         line = test_data_lines[i]
 93 |         line_gt = ground_truth_lines[i]
 94 |         line_pr = translate(model, line, src_tokenizer, trg_tokenizer, use_mask=model_param["use_mask"], device=device)
 95 |         true_values, n_values = evaluate(line_pr, line_gt)
 96 |         total_true_values += true_values
 97 |         total_values += n_values
 98 |         t.set_description("Accuracy: {:.4f}".format(total_true_values / total_values))
 99 |         t.refresh() # to show immediately the update
100 | 
101 |     print("Avg. Accuracy: {}".format(total_true_values / total_values))
102 | 


--------------------------------------------------------------------------------
/models/transformer_utils/lightweight_layer.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | from fairseq.modules.lightweight_convolution import LightweightConv1dTBC
  3 | 
  4 | from .utils import *
  5 | 
  6 | 
  7 | class LightweightConvLayer(nn.Module):
  8 |     def __init__(self, d_model, conv_dim, kernel_size, weight_softmax, num_heads, weight_dropout):
  9 |         super().__init__()
 10 |         padding_l = kernel_size // 2 if kernel_size % 2 == 1 else ((kernel_size - 1) // 2, kernel_size // 2)
 11 |         self.linear_1 = nn.Linear(d_model, conv_dim*2)
 12 |         self.activation = nn.GLU()
 13 |         self.conv = LightweightConv1dTBC(conv_dim, kernel_size, padding_l=padding_l,
 14 |                                             weight_softmax=weight_softmax,
 15 |                                             num_heads=num_heads,
 16 |                                             weight_dropout=weight_dropout)
 17 |         self.linear_2 = nn.Linear(conv_dim, d_model)
 18 | 
 19 |     def forward(self, x, mask):
 20 |         x = self.linear_1(x)
 21 |         x = self.activation(x)
 22 |         conv_mask = mask[:,-1,:] # BxS
 23 |         conv_mask = conv_mask.unsqueeze(-1) # BxSx1 => BxSxD
 24 |         x = x.masked_fill(conv_mask==0, 0)
 25 |         x = x.transpose(0, 1) # SxBxH
 26 |         x = self.conv(x.contiguous())
 27 |         x = x.transpose(0, 1)
 28 |         x = self.linear_2(x)
 29 |         return x
 30 | 
 31 | 
 32 | 
 33 | class EncoderLayer(nn.Module):
 34 |     def __init__(self, d_model, d_ff, heads, dropout=0.1, weight_softmax=True, weight_dropout=0.):
 35 |         super().__init__()
 36 |         self.norm_1 = Norm(d_model)
 37 |         self.norm_2 = Norm(d_model)
 38 | 
 39 |         conv_dim = d_model
 40 |         kernel_size = 3
 41 |         self.conv = LightweightConvLayer(d_model, conv_dim, kernel_size,
 42 |                                             weight_softmax=weight_softmax,
 43 |                                             num_heads=heads,
 44 |                                             weight_dropout=weight_dropout)
 45 | 
 46 |         self.ff = FeedForward(d_model, d_ff, dropout=dropout)
 47 | 
 48 |         self.dropout_1 = nn.Dropout(dropout)
 49 |         self.dropout_2 = nn.Dropout(dropout)
 50 |         
 51 |     def forward(self, x, mask):
 52 |         # x: BxSxH
 53 | 
 54 |         residual = x
 55 |         x = self.norm_1(x)
 56 |         x = self.conv(x, mask)
 57 |         x = self.dropout_1(x)
 58 |         x = residual + x
 59 | 
 60 |         residual = x
 61 |         x = self.norm_2(x)
 62 |         x = self.ff(x)
 63 |         x = self.dropout_2(x)
 64 |         x = residual + x
 65 | 
 66 |         return x
 67 | 
 68 | 
 69 | class DecoderLayer(nn.Module):
 70 |     def __init__(self, d_model, d_ff, heads, dropout=0.1, weight_softmax=True, weight_dropout=0.):
 71 |         super().__init__()
 72 |         self.norm_1 = Norm(d_model)
 73 |         self.norm_2 = Norm(d_model)
 74 |         self.norm_3 = Norm(d_model)
 75 |         
 76 |         self.dropout_1 = nn.Dropout(dropout)
 77 |         self.dropout_2 = nn.Dropout(dropout)
 78 |         self.dropout_3 = nn.Dropout(dropout)
 79 | 
 80 |         self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
 81 |         self.ff = FeedForward(d_model, d_ff, dropout=dropout)
 82 | 
 83 |         conv_dim = d_model
 84 |         kernel_size = 3
 85 |         self.conv = LightweightConvLayer(d_model, conv_dim, kernel_size,
 86 |                                             weight_softmax=weight_softmax,
 87 |                                             num_heads=heads,
 88 |                                             weight_dropout=weight_dropout)
 89 | 
 90 |     def forward(self, x, e_outputs, src_mask, trg_mask):
 91 |         residual = x
 92 |         x = self.norm_1(x)
 93 |         x = self.conv(x, trg_mask)
 94 |         x = self.dropout_1(x)
 95 |         x = residual + x
 96 | 
 97 |         residual = x
 98 |         x = self.norm_2(x)
 99 |         x = self.attn(x,e_outputs,e_outputs,src_mask)
100 |         x = self.dropout_2(x)
101 |         x = residual + x
102 | 
103 |         residual = x
104 |         x = self.norm_3(x)
105 |         x = self.ff(x)
106 |         x = self.dropout_3(x)
107 |         x = residual + x
108 | 
109 |         return x
110 | 
111 | 
112 | 


--------------------------------------------------------------------------------
/ngram_model/test_2gram.py:
--------------------------------------------------------------------------------
  1 | import pickle
  2 | from tqdm import trange
  3 | from tqdm import tqdm
  4 | from nltk.tokenize.treebank import TreebankWordDetokenizer
  5 | detokenize = TreebankWordDetokenizer().detokenize
  6 | import re 
  7 | import argparse
  8 | from accent_utils import process_line
  9 | import numpy as np
 10 | from multiprocessing import Pool
 11 | from torch import load
 12 | 
 13 | def get_arg():
 14 |     parse = argparse.ArgumentParser()
 15 |     parse.add_argument('test_data_file')
 16 |     parse.add_argument('ground_truth_file')
 17 |     parse.add_argument('model_dir')
 18 |     parse.add_argument('--nbeam', type=int, default=3)
 19 |     parse.add_argument('--npool', type=int, default=2)
 20 |     return parse.parse_args()
 21 | 
 22 | def remove_vn_accent(word):
 23 |     word = re.sub('[áàảãạăắằẳẵặâấầẩẫậ]', 'a', word)
 24 |     word = re.sub('[éèẻẽẹêếềểễệ]', 'e', word)
 25 |     word = re.sub('[óòỏõọôốồổỗộơớờởỡợ]', 'o', word)
 26 |     word = re.sub('[íìỉĩị]', 'i', word)
 27 |     word = re.sub('[úùủũụưứừửữự]', 'u', word)
 28 |     word = re.sub('[ýỳỷỹỵ]', 'y', word)
 29 |     word = re.sub('đ', 'd', word)
 30 |     return word
 31 |     
 32 | def vn_dict(vn_word):
 33 |     vndict = {}
 34 |     for word in vnword:
 35 |         no_accent = remove_vn_accent(word)
 36 |         if not no_accent in vndict.keys():
 37 |             vndict.setdefault(no_accent, [word])
 38 |         else:
 39 |             vndict[no_accent].append(word)
 40 |     return vndict
 41 | 
# Build the Vietnamese candidate dictionary at import time: the words of the
# 'tone' tokenizer stored in tokenizer_standarized.h5 (path relative to the
# working directory) are grouped by their accent-free spelling.
tokenizer = load('tokenizer_standarized.h5')
vnword = list(tokenizer['tone'].word_index.keys())
vndict = vn_dict(vnword)
 46 | 
 47 | def gen_accents(word):
 48 |     word = remove_vn_accent(word.lower())
 49 |     if word in vndict:
 50 |         return vndict[word]
 51 |     else:
 52 |         return [word]
 53 | 
 54 | # beam search
 55 | def beam_search(words, model, k=3):
 56 |     sequences = []
 57 |     for idx, word in enumerate(words):
 58 |         if idx == 0:
 59 |             sequences = [([x], 0.0) for x in gen_accents(word)]
 60 |         else:
 61 |             all_sequences = []
 62 |             for seq in sequences:
 63 |                 for next_word in gen_accents(word):
 64 |                     current_word = seq[0][-1]
 65 |                     score = model.logscore(next_word, [current_word])
 66 |                     new_seq = seq[0].copy()
 67 |                     new_seq.append(next_word)
 68 |                     all_sequences.append((new_seq, seq[1] + score))
 69 |             all_sequences = sorted(all_sequences,key=lambda x: x[1], reverse=True)
 70 |             sequences = all_sequences[:k]
 71 |     return sequences
 72 | 
 73 | def translate(sent, model_sent, k):
 74 |     sent = sent.replace('\n','')
 75 |     result = beam_search(sent.lower().split(), model_sent, k)
 76 |     return detokenize(result[0][0])
 77 | 
 78 | def evaluate(pred, label):
 79 |     _, pred_punc = process_line(pred)
 80 |     _, label_punc = process_line(label)
 81 | 
 82 |     pred_punc = np.array(pred_punc)
 83 |     label_punc = np.array(label_punc)
 84 | 
 85 |     true_values = np.sum(pred_punc==label_punc)
 86 |     n_values = len(pred_punc)
 87 | 
 88 |     return true_values, n_values
 89 | 
# load model
# The command line is parsed and the model is unpickled at module level, so
# both happen whenever this module is imported, not only when run as a script.
args = get_arg()
# SECURITY: pickle.load can execute arbitrary code; only load model files
# that come from a trusted source.
with open(args.model_dir, 'rb') as fin:
    model = pickle.load(fin)
    print('load model done')

# Beam width shared by all worker calls.
nbeam = args.nbeam
def translate1(sent):
    """Restore accents for one sentence with the module-level model and beam width.

    Single-argument wrapper around ``translate`` that is handed to
    ``Pool.imap`` in ``pool_handler``.
    """
    return translate(sent, model, nbeam)
 99 | 
# Number of worker processes used by pool_handler.
npool = args.npool
def pool_handler(data):
    """Translate every sentence in ``data`` with a pool of worker processes.

    Results come back in input order; a tqdm bar reports progress.
    """
    print('process with %d pool, nbeam = %d'%(npool, nbeam))
    with Pool(npool) as workers:
        progress = tqdm(workers.imap(translate1, data), total=len(data))
        return list(progress)
107 |   
108 | 
if __name__=='__main__':
    # Script entry point: translate the test sentences, then score them
    # line by line against the ground truth.
    with open(args.test_data_file, "r", encoding='utf-8') as f:
        test_data_lines = f.readlines()

    print('load data done')

    # NOTE(review): only the first 500 sentences are evaluated here, while
    # test_3gram.py evaluates the whole file -- confirm the cap is intended.
    temp_result = pool_handler(test_data_lines[:500])
    del test_data_lines

    with open(args.ground_truth_file, "r", encoding='utf-8') as f:
        ground_truth_lines = f.readlines()

    total_true_values = 0
    total_values = 0
    for i, prediction in enumerate(temp_result):
        true_values, n_values = evaluate(prediction, ground_truth_lines[i])
        total_true_values += true_values
        total_values += n_values

    print("Avg. Accuracy: {}".format(total_true_values / total_values))
131 | 
132 | 
133 | 
134 |   
135 |   
136 | 


--------------------------------------------------------------------------------
/models/transformer_utils/utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | 
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | 
  7 | ##############   Sublayer  ##########################
  8 | 
  9 | class Norm(nn.Module):
 10 |     def __init__(self, d_model, eps = 1e-6):
 11 |         super().__init__()
 12 |     
 13 |         self.size = d_model
 14 |         
 15 |         # create two learnable parameters to calibrate normalisation
 16 |         self.alpha = nn.Parameter(torch.ones(self.size))
 17 |         self.bias = nn.Parameter(torch.zeros(self.size))
 18 |         
 19 |         self.eps = eps
 20 |     
 21 |     def forward(self, x):
 22 |         norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
 23 |         / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
 24 |         return norm
 25 | 
 26 | def attention(q, k, v, d_k, mask=None, dropout=None):
 27 |     scores = torch.matmul(q, k.transpose(-2, -1)) /  np.sqrt(d_k)
 28 |     
 29 |     if mask is not None:
 30 |         mask = mask.unsqueeze(1)
 31 |         scores = scores.masked_fill(mask == 0, -1e9)
 32 |     
 33 |     scores = F.softmax(scores, dim=-1)
 34 |     
 35 |     if dropout is not None:
 36 |         scores = dropout(scores)
 37 |         
 38 |     output = torch.matmul(scores, v)
 39 |     return output
 40 | 
 41 | class MultiHeadAttention(nn.Module):
 42 |     def __init__(self, heads, d_model, dropout = 0.1):
 43 |         super().__init__()
 44 |         
 45 |         self.d_model = d_model
 46 |         self.d_k = d_model // heads
 47 |         self.h = heads
 48 |         
 49 |         self.q_linear = nn.Linear(d_model, d_model)
 50 |         self.v_linear = nn.Linear(d_model, d_model)
 51 |         self.k_linear = nn.Linear(d_model, d_model)
 52 |         
 53 |         self.dropout = nn.Dropout(dropout)
 54 |         self.out = nn.Linear(d_model, d_model)
 55 |     
 56 |     def forward(self, q, k, v, mask=None):
 57 |         
 58 |         bs = q.size(0)
 59 |         
 60 |         
 61 |         # perform linear operation and split into N heads
 62 |         k = self.k_linear(k).view(bs, -1, self.h, self.d_k)
 63 |         q = self.q_linear(q).view(bs, -1, self.h, self.d_k)
 64 |         v = self.v_linear(v).view(bs, -1, self.h, self.d_k)
 65 |         
 66 |         # transpose to get dimensions bs * N * sl * d_model
 67 |         k = k.transpose(1,2)
 68 |         q = q.transpose(1,2)
 69 |         v = v.transpose(1,2)
 70 |         
 71 | 
 72 |         # calculate attention using function we will define next
 73 |         scores = attention(q, k, v, self.d_k, mask, self.dropout)
 74 |         # concatenate heads and put through final linear layer
 75 |         concat = scores.transpose(1,2).contiguous()\
 76 |         .view(bs, -1, self.d_model)
 77 |         output = self.out(concat)
 78 |     
 79 |         return output
 80 | 
 81 | class FeedForward(nn.Module):
 82 |     def __init__(self, d_model, d_ff, dropout = 0.1, activation=F.relu):
 83 |         super().__init__() 
 84 |     
 85 |         # We set d_ff as a default to 2048
 86 |         self.linear_1 = nn.Linear(d_model, d_ff)
 87 |         self.dropout = nn.Dropout(dropout)
 88 |         self.linear_2 = nn.Linear(d_ff, d_model)
 89 |         self.activation = activation
 90 |     
 91 |     def forward(self, x):
 92 |         x = self.dropout(self.activation(self.linear_1(x)))
 93 |         x = self.linear_2(x)
 94 |         return x
 95 | 
 96 | 
 97 | # Embeding #
 98 | 
 99 | class Embedder(nn.Module):
100 |     def __init__(self, vocab_size, d_model):
101 |         super().__init__()
102 |         self.d_model = d_model
103 |         self.embed = nn.Embedding(vocab_size, d_model)
104 |     def forward(self, x):
105 |         return self.embed(x)
106 | 
107 | 
class PositionalEncoder(nn.Module):
    """Add a fixed sinusoidal position signal to scaled embeddings.

    A constant table ``pe`` of shape ``(1, max_seq_len, d_model)`` is stored
    as a buffer. Column ``c`` at position ``pos`` holds
    ``sin(pos / 10000 ** (2 * c / d_model))`` for even ``c`` and the
    corresponding ``cos`` for odd ``c``.

    Args:
        d_model: embedding width (odd widths are supported).
        max_seq_len: number of positions in the table.
        dropout: dropout probability applied to the output.
    """

    def __init__(self, d_model, max_seq_len = 5000, dropout = 0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        # Build the whole table with tensor operations (float64 for the
        # intermediate angles) instead of a Python loop over every cell.
        positions = torch.arange(max_seq_len, dtype=torch.float64).unsqueeze(1)
        columns = torch.arange(d_model, dtype=torch.float64)
        angles = positions / (10000.0 ** (2 * columns / d_model))

        pe = torch.zeros(max_seq_len, d_model)
        pe[:, 0::2] = torch.sin(angles[:, 0::2])
        pe[:, 1::2] = torch.cos(angles[:, 1::2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Scale ``x`` by ``sqrt(d_model)``, add the position table, apply dropout."""
        # make embeddings relatively larger
        x = x * np.sqrt(self.d_model)
        seq_len = x.size(1)
        # Follow the input's device instead of assuming the default GPU.
        pe = self.pe[:,:seq_len].to(x.device)
        return self.dropout(x + pe)


--------------------------------------------------------------------------------
/ngram_model/test_3gram.py:
--------------------------------------------------------------------------------
  1 | import pickle
  2 | from tqdm import trange
  3 | from tqdm import tqdm
  4 | from nltk.tokenize.treebank import TreebankWordDetokenizer
  5 | detokenize = TreebankWordDetokenizer().detokenize
  6 | import re 
  7 | import argparse
  8 | from accent_utils import process_line
  9 | import numpy as np
 10 | from multiprocessing import Pool
 11 | from torch import load
 12 | 
 13 | def get_arg():
 14 |     parse = argparse.ArgumentParser()
 15 |     parse.add_argument('test_data_file')
 16 |     parse.add_argument('ground_truth_file')
 17 |     parse.add_argument('model_dir')
 18 |     parse.add_argument('--nbeam', type=int, default=3)
 19 |     return parse.parse_args()
 20 | 
 21 | def remove_vn_accent(word):
 22 |     word = re.sub('[áàảãạăắằẳẵặâấầẩẫậ]', 'a', word)
 23 |     word = re.sub('[éèẻẽẹêếềểễệ]', 'e', word)
 24 |     word = re.sub('[óòỏõọôốồổỗộơớờởỡợ]', 'o', word)
 25 |     word = re.sub('[íìỉĩị]', 'i', word)
 26 |     word = re.sub('[úùủũụưứừửữự]', 'u', word)
 27 |     word = re.sub('[ýỳỷỹỵ]', 'y', word)
 28 |     word = re.sub('đ', 'd', word)
 29 |     return word
 30 |     
 31 | def vn_dict(vn_word):
 32 |     vndict = {}
 33 |     for word in vnword:
 34 |         no_accent = remove_vn_accent(word)
 35 |         if not no_accent in vndict.keys():
 36 |             vndict.setdefault(no_accent, [word])
 37 |         else:
 38 |             vndict[no_accent].append(word)
 39 |     return vndict
 40 | 
# Build the Vietnamese candidate dictionary at import time: the words of the
# 'tone' tokenizer stored in tokenizer_standarized.h5 (path relative to the
# working directory) are grouped by their accent-free spelling.
tokenizer = load('tokenizer_standarized.h5')
vnword = list(tokenizer['tone'].word_index.keys())
vndict = vn_dict(vnword)
 45 | 
 46 | def gen_accents(word):
 47 |     word = remove_vn_accent(word.lower())
 48 |     if word in vndict:
 49 |         return vndict[word]
 50 |     else:
 51 |         return [word]
 52 | 
 53 | # beam search
 54 | def beam_search(words, model, k=3):
 55 |     sequences = []
 56 |     for idx, word in enumerate(words):
 57 |         if idx == 0:
 58 |             sequences = [([x], 0.0) for x in gen_accents(word)]
 59 |         else:
 60 |             all_sequences = []
 61 |             for seq in sequences:
 62 |                 for next_word in gen_accents(word):
 63 |                     current_word = seq[0][-1]
 64 |                     try:
 65 |                         previous_word = seq[0][-2]
 66 |                         score = model.logscore(next_word, [previous_word, current_word])
 67 |                     except:
 68 |                         score = model.logscore(next_word, [current_word])
 69 |                     new_seq = seq[0].copy()
 70 |                     new_seq.append(next_word)
 71 |                     all_sequences.append((new_seq, seq[1] + score))
 72 |             all_sequences = sorted(all_sequences,key=lambda x: x[1], reverse=True)
 73 |             sequences = all_sequences[:k]
 74 |     return sequences
 75 | 
 76 | def translate(sent, model_sent, k):
 77 |     sent = sent.replace('\n','')
 78 |     result = beam_search(sent.lower().split(), model_sent, k)
 79 |     return detokenize(result[0][0])
 80 | 
 81 | def evaluate(pred, label):
 82 |     _, pred_punc = process_line(pred)
 83 |     _, label_punc = process_line(label)
 84 | 
 85 |     pred_punc = np.array(pred_punc)
 86 |     label_punc = np.array(label_punc)
 87 | 
 88 |     true_values = np.sum(pred_punc==label_punc)
 89 |     n_values = len(pred_punc)
 90 | 
 91 |     return true_values, n_values
 92 | 
# load model
# The command line is parsed and the model is unpickled at module level, so
# both happen whenever this module is imported, not only when run as a script.
args = get_arg()
# SECURITY: pickle.load can execute arbitrary code; only load model files
# that come from a trusted source.
with open(args.model_dir, 'rb') as fin:
    model = pickle.load(fin)
    print('load model done')

# Beam width shared by all worker calls.
nbeam = args.nbeam
100 | 
# config to use multiprocessing 
def translate1(sent):
    """Restore accents for one sentence with the module-level model and beam width.

    Single-argument wrapper around ``translate`` that is handed to
    ``Pool.imap`` in ``pool_handler``.
    """
    return translate(sent, model, nbeam)
104 | 
def pool_handler(data, npool=3):
    """Translate every sentence in ``data`` with a pool of worker processes.

    Args:
        data: list of accent-free sentences.
        npool: number of worker processes (defaults to 3, the value that was
            previously hard-coded).

    Returns:
        List of translated sentences in input order.
    """
    print('process with %d pool, nbeam = %d'%(npool, nbeam))
    with Pool(npool) as p:
        # imap keeps input order; tqdm shows progress while results arrive.
        return list(tqdm(p.imap(translate1, data), total=len(data)))
110 |   
111 | 
if __name__=='__main__':
    # Script entry point: translate the test sentences, then score them
    # line by line against the ground truth.

    # load data
    with open(args.test_data_file, "r", encoding='utf-8') as f:
        test_data_lines = f.readlines()
    print('load data done')

    # run model to get the result
    temp_result = pool_handler(test_data_lines)
    del test_data_lines

    # load ground truth
    with open(args.ground_truth_file, "r", encoding='utf-8') as f:
        ground_truth_lines = f.readlines()

    # compute accuracy
    total_true_values = 0
    total_values = 0
    for i, prediction in enumerate(temp_result):
        true_values, n_values = evaluate(prediction, ground_truth_lines[i])
        total_true_values += true_values
        total_values += n_values

    print("Avg. Accuracy: {}".format(total_true_values / total_values))
138 | 
139 | 
140 | 
141 |   
142 |   
143 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import json
  4 | import logging
  5 | from pathlib import Path
  6 | 
  7 | import torch
  8 | from torch.utils import data
  9 | 
 10 | from models.model_factory import get_model
 11 | from dataloader import Dataset
 12 | from utils import train_model, evaluate_model
 13 | 
 14 | 
 15 | def get_arg():
 16 |     parser = argparse.ArgumentParser()
 17 |     parser.add_argument('vocab_path')
 18 |     parser.add_argument('train_path')
 19 |     parser.add_argument('val_path')
 20 |     parser.add_argument('--experiment_name')
 21 |     parser.add_argument('--src_postfix', default='.notone')
 22 |     parser.add_argument('--trg_postfix', default='.tone')
 23 |     parser.add_argument('--config_file', default='model_config.json')
 24 |     parser.add_argument('--model_name', default='big_evolved')
 25 |     parser.add_argument('--batch_size', type=int, default=32)
 26 |     parser.add_argument('--cuda', action='store_true', default=False)
 27 |     parser.add_argument('--learning_rate', type=float, default=0.0001)
 28 |     parser.add_argument('--num_epochs', type=int, default=1)
 29 |     parser.add_argument('--restore_file', default=None)
 30 |     parser.add_argument('--initial_epoch', default=1, type=int)
 31 | 
 32 |     args = parser.parse_args()
 33 | 
 34 |     return args
 35 | 
 36 | if __name__=='__main__':
 37 |     args = get_arg()
 38 | 
 39 |     # Init experiment folder
 40 |     experiment_folder = os.path.join("experiments", args.experiment_name)
 41 |     Path(experiment_folder).mkdir(parents=True, exist_ok=True)
 42 | 
 43 |     # Init Log
 44 |     log_file = os.path.join(experiment_folder, "logs.txt")
 45 |     for handler in logging.root.handlers[:]:
 46 |         logging.root.removeHandler(handler)
 47 |     logging.basicConfig(filename=log_file, 
 48 |                         filemode='a',
 49 |                         level=logging.INFO, 
 50 |                         format="%(levelname)s - %(asctime)s: %(message)s")
 51 |     logger=logging.getLogger(__name__)
 52 | 
 53 |     # Load tokenizer
 54 |     print("Load tokenizer")
 55 |     tokenizer = torch.load(args.vocab_path)
 56 |     src_tokenizer = tokenizer['notone']
 57 |     trg_tokenizer = tokenizer['tone']
 58 |     src_pad_token = 0
 59 |     trg_pad_token = 0
 60 | 
 61 |     # Load data
 62 |     print("Load data")
 63 |     train_src_file = args.train_path + args.src_postfix
 64 |     train_trg_file = args.train_path + args.trg_postfix
 65 |     train_dataset = Dataset(src_tokenizer, trg_tokenizer, train_src_file, train_trg_file)
 66 |     train_iter = data.dataloader.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
 67 | 
 68 |     val_src_file = args.val_path + args.src_postfix
 69 |     val_trg_file = args.val_path + args.trg_postfix
 70 |     val_dataset = Dataset(src_tokenizer, trg_tokenizer, val_src_file, val_trg_file)
 71 |     val_iter = data.dataloader.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False)
 72 | 
 73 |     # Model config
 74 |     with open(args.config_file) as f:
 75 |         config = json.load(f)
 76 |     
 77 |     if args.model_name in config:
 78 |         model_param = config[args.model_name]
 79 |     else:
 80 |         raise Exception("Invalid model name")
 81 |     
 82 |     model_param['src_vocab_size'] = len(src_tokenizer.word_index) + 1
 83 |     model_param['trg_vocab_size'] = len(trg_tokenizer.word_index) + 1
 84 | 
 85 |     # Device 
 86 |     print("Init model")
 87 |     device = torch.device('cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
 88 | 
 89 |     # Init model
 90 |     model = get_model(model_param)
 91 |     if device.type=='cuda':
 92 |         model = model.cuda()
 93 |     optim = torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(0.9, 0.98), eps=1e-9)
 94 |     print("Using", device.type)
 95 | 
 96 |     # Load weight
 97 |     if args.restore_file is not None:
 98 |         if os.path.isfile(args.restore_file):
 99 |             print("Load model")
100 |             state = torch.load(args.restore_file)
101 |             model.load_state_dict(state['model'])
102 |             optim.load_state_dict(state['optim'])
103 |         else:
104 |             raise Exception("Invalid weight path")
105 |     
106 |     # Init weight dir
107 |     weight_folder = os.path.join(experiment_folder, "weights")
108 |     Path(weight_folder).mkdir(parents=True, exist_ok=True)
109 | 
110 |     # Train model
111 |     print("Start training %d epochs" % args.num_epochs)
112 |     for e in range(args.initial_epoch, args.num_epochs+1):
113 |         logger.info("Epoch %02d/%02d" % (e, args.num_epochs))
114 |         logger.info("Start training")
115 |         print("\nEpoch %02d/%02d" % (e, args.num_epochs), flush=True)
116 |         save_file = os.path.join(weight_folder, 'epoch_%02d.h5' % e)
117 |         train_loss = train_model(model, optim, train_iter, src_pad_token, use_mask=model_param["use_mask"], device=device, save_path=save_file)
118 |         logger.info("End training")
119 |         logger.info("train_loss = %.8f" % train_loss)
120 |         val_loss = evaluate_model(model, val_iter, src_pad_token, use_mask=model_param["use_mask"], device=device)
121 |         logger.info("val_loss   = %.8f\n" % val_loss)


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from tqdm import tqdm
  5 | from tensorflow.keras.preprocessing.sequence import pad_sequences
  6 | from accent_utils import extract_words, remove_tone_line
  7 | from models.transformer_utils.mask import create_src_mask
  8 | 
  9 | def forward(model, src, src_pad_token=0, use_mask=True):
 10 |     if use_mask:
 11 |         src_mask = create_src_mask(src, pad_token=src_pad_token)
 12 |         logit = model(src, src_mask)
 13 |     else:
 14 |         logit = model(src)
 15 |     return logit
 16 | 
 17 | 
 18 | def forward_and_loss(model, src, trg, loss_fn, src_pad_token=0, use_mask=True):
 19 |     if use_mask:
 20 |         src_mask = create_src_mask(src, pad_token=src_pad_token)
 21 |         preds = model(src, src_mask)
 22 |     else:
 23 |         preds = model(src)
 24 |     ys = trg.contiguous().view(-1)
 25 |     loss = loss_fn(preds.view(-1, preds.size(-1)), ys, ignore_index=src_pad_token)
 26 |     return preds, loss
 27 | 
 28 | 
 29 | def train_model(model, optimizer, train_iter, src_pad_token, use_mask=True, save_path=None, device=None):
 30 |     total_loss = 0.0
 31 |     total_item = 0
 32 | 
 33 |     model.train()
 34 |     
 35 |     with tqdm(total=len(train_iter)) as pbar:
 36 |         for src, trg in train_iter: 
 37 |             if device is not None and device.type=='cuda':
 38 |                 src = src.cuda()
 39 |                 trg = trg.cuda()
 40 | 
 41 |             optimizer.zero_grad()
 42 |             _, loss = forward_and_loss(model, src, trg, F.cross_entropy, src_pad_token=src_pad_token, use_mask=use_mask)
 43 |             
 44 |             loss.backward()
 45 |             optimizer.step()
 46 |             
 47 |             total_loss += loss.item()
 48 |             total_item += trg.size(0)
 49 | 
 50 |             pbar.update(1)
 51 |             pbar.set_description("loss     = %.8f" % (total_loss/total_item))
 52 |             
 53 |     # Save model
 54 |     if save_path is not None:
 55 |         state = {
 56 |             "model": model.state_dict(),
 57 |             "optim": optimizer.state_dict(),
 58 |         }
 59 |         
 60 |         torch.save(state, save_path)
 61 |     
 62 |     return total_loss/total_item
 63 | 
 64 | 
 65 | def evaluate_model(model, val_iter, src_pad_token, use_mask=True, device=None):
 66 |     model.eval()
 67 |     with torch.no_grad(), tqdm(total=len(val_iter)) as pbar:
 68 |         total_loss = 0.0
 69 |         total_item = 0
 70 |         for src, trg in val_iter:
 71 |             if device is not None and device.type=='cuda':
 72 |                 src = src.cuda()
 73 |                 trg = trg.cuda()
 74 | 
 75 |             _, loss = forward_and_loss(model, src, trg, F.cross_entropy, use_mask=use_mask, src_pad_token=src_pad_token)
 76 |             
 77 |             total_loss += loss.item()
 78 |             total_item += src.size(0)
 79 | 
 80 |             pbar.update(1)
 81 |             pbar.set_description("val_loss = %.8f" % (total_loss/total_item))
 82 | 
 83 |     return total_loss/total_item
 84 | 
 85 | def translate(model, sents, src_tokenizer, trg_tokenizer, maxlen=200, use_mask=True, device=None):
 86 |     
 87 |     words, word_indices = extract_words(sents)
 88 |     lower_words = [x.lower() for x in words]
 89 | 
 90 |     # Tokenize words
 91 |     known_word_mask = [] # Same size as words - True if word is in word list, otherwise False
 92 |     seqs = []
 93 |     for word in lower_words:
 94 |         if word in src_tokenizer.word_index:
 95 |             seqs.append(src_tokenizer.word_index[word])
 96 |             known_word_mask.append(True)
 97 |         else:
 98 |             seqs.append(1)
 99 |             known_word_mask.append(False)
100 |     seqs = [seqs]
101 | 
102 |     # Model inference
103 |     seqs = pad_sequences(seqs, maxlen, padding='post')
104 |     seqs = torch.tensor(seqs).long()
105 |     if device is not None and device.type=='cuda':
106 |         seqs = seqs.cuda()
107 |     with torch.no_grad():
108 |         probs = forward(model, seqs, 0, use_mask=use_mask)
109 |     probs = probs.cpu().detach().numpy()
110 |     
111 |     # Add tone
112 |     output = sents
113 |     probs = probs[0]
114 |     prob_indices = probs.argsort(axis=-1)[:, ::-1]
115 |     prob_indices = prob_indices[:, :100]
116 |     for i, word in enumerate(lower_words):
117 |         
118 |         # Skip unknown words
119 |         if not known_word_mask[i]:
120 |             continue
121 | 
122 |         # Find the best solution
123 |         for idx in prob_indices[i, :]:
124 |             target_word = trg_tokenizer.sequences_to_texts([[idx]])[0]
125 |             if remove_tone_line(target_word.lower()) == word:
126 |                 begin_idx, end_idx = word_indices[i]
127 | 
128 |                 # Correct lower / upper case
129 |                 corrected_word = ""
130 |                 for ic, char in enumerate(words[i]):
131 |                     if char.islower():
132 |                         corrected_word += target_word[ic].lower()
133 |                     else:
134 |                         corrected_word += target_word[ic].upper()
135 | 
136 |                 output = output[:begin_idx] + corrected_word + output[end_idx:]
137 |                 break
138 | 
139 |     return output


--------------------------------------------------------------------------------
/models/transformer_utils/evolved_layer.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | 
  5 | from .utils import *
  6 | 
  7 | class SeparableConv1D(nn.Module):
  8 |     """ Input: (batch_size, in_channel, length)
  9 |         Output: (batch_size, out_channel, length)
 10 |     """
 11 |     def __init__(self, in_channel, out_channel, kernel_size=1, padding=0):
 12 |         super().__init__()
 13 |         self.deep_wise = nn.Conv1d(in_channel, in_channel, kernel_size=kernel_size, padding=padding, groups=in_channel)
 14 |         self.point_wise = nn.Conv1d(in_channel, out_channel, kernel_size=1)
 15 | 
 16 |     def forward(self, x):
 17 |         x = self.deep_wise(x)
 18 |         x = self.point_wise(x)
 19 |         return x
 20 | 
 21 | class EncoderLayer(nn.Module):
 22 |     def __init__(self, d_model, d_ff, heads, dropout=0.1):
 23 |         super().__init__()
 24 |         self.d_model = d_model
 25 | 
 26 |         # GLU
 27 |         self.norm_glu = Norm(d_model)
 28 |         self.glu_ff1 = nn.Linear(d_model, d_model)
 29 |         self.glu_ff2 = nn.Linear(d_model, d_model)
 30 |         
 31 |         # Conv
 32 |         self.norm_conv1 = Norm(d_model)
 33 |         self.norm_conv2 = Norm(d_model*4)
 34 | 
 35 |         self.left_conv = nn.Linear(d_model, d_model * 4)
 36 |         self.left_dropout = nn.Dropout(dropout)
 37 | 
 38 |         self.right_conv = nn.Conv1d(d_model, d_model//2, kernel_size=3, padding=1)
 39 |         self.right_dropout = nn.Dropout(dropout)
 40 | 
 41 |         self.sep_conv = SeparableConv1D(d_model*4, d_model//2, kernel_size=9, padding=4)
 42 | 
 43 |         # Self-attention
 44 |         self.norm_attn = Norm(d_model)
 45 |         self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
 46 |         self.attn_dropout = nn.Dropout(dropout)
 47 | 
 48 |         # Fully connected
 49 |         self.norm_ff = Norm(d_model)
 50 |         self.ff = FeedForward(d_model, d_ff, dropout=dropout)
 51 | 
 52 | 
 53 |     def forward(self, x, mask):
 54 |         # GLU: 512
 55 |         residual = x
 56 |         x = self.norm_glu(x)
 57 |         values = self.glu_ff1(x)
 58 |         gates = torch.sigmoid(self.glu_ff2(x))
 59 |         hiddent_state = values * gates
 60 |         x = residual + hiddent_state
 61 | 
 62 |         # Conv: 512
 63 |         conv_mask = mask[:,-1,:] # BxS
 64 |         conv_mask = conv_mask.unsqueeze(-1) # BxSx1 => BxSxD
 65 | 
 66 |         residual = x
 67 |         x = self.norm_conv1(x)
 68 |         x = x.masked_fill(conv_mask==0, 0)
 69 | 
 70 |         left_state = self.left_conv(x)
 71 |         left_state = F.relu(left_state)
 72 |         left_state = self.left_dropout(left_state) # 2048
 73 | 
 74 |         right_state = self.right_conv(x.transpose(-1,-2)).transpose(-1,-2)
 75 |         right_state = F.relu(right_state)
 76 |         right_state = self.right_dropout(right_state) # 256
 77 | 
 78 |         right_state = F.pad(right_state, (0, self.d_model*4 - self.d_model//2))
 79 |         hiddent_state = left_state + right_state # 2048
 80 | 
 81 |         hiddent_state = self.norm_conv2(hiddent_state) 
 82 |         hiddent_state = hiddent_state.masked_fill(conv_mask==0, 0)
 83 | 
 84 |         hiddent_state = self.sep_conv(hiddent_state.transpose(-1,-2)).transpose(-1,-2) # 256
 85 |         hiddent_state = F.pad(hiddent_state, (0, self.d_model//2)) # 512
 86 | 
 87 |         x = residual + hiddent_state # 512
 88 | 
 89 |         # Self-attention: 512
 90 |         residual = x
 91 |         x = self.norm_attn(x)
 92 |         attn = self.attn(x, x, x, mask)
 93 |         attn = self.attn_dropout(attn)
 94 |         x = residual + attn
 95 | 
 96 |         # Fully connected: 512
 97 |         residual = x
 98 |         x = self.norm_ff(x)
 99 |         hiddent_state = self.ff(x)
100 |         x = residual + hiddent_state
101 | 
102 |         return x
103 | 
104 | 
105 | 
class DecoderLayer(nn.Module):
    """Decoder layer made of five pre-normalised residual sub-blocks.

    In order: parallel self-/encoder-attention, two-branch causal separable
    convolution, self-attention, encoder attention and feed-forward.
    """
    def __init__(self, d_model, d_ff, heads, dropout=0.1):
        super().__init__()
        # NOTE(review): norm_1..3, dropout_1..3, attn_1 and attn_2 are never
        # used in forward(), and this first `ff` is overwritten at the end of
        # __init__. As sub-modules they are still registered on the layer, so
        # deleting them would change its state_dict keys - confirm against
        # existing checkpoints before removing.
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        self.attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.ff = FeedForward(d_model, d_ff, dropout=dropout)

        self.heads = heads
        self.d_model = d_model

        # Attention 1: self-attention (twice the heads) in parallel with
        # encoder attention
        self.norm_attn_1 = Norm(d_model)
        self.self_attn_1 = MultiHeadAttention(heads*2, d_model, dropout=dropout)
        self.self_attn_dropout_1 = nn.Dropout(dropout)
        self.enc_attn_1 = MultiHeadAttention(heads, d_model, dropout=dropout)
        self.enc_attn_dropout_1 = nn.Dropout(dropout)

        # Conv: no built-in padding; forward() left-pads the input instead
        self.norm_conv1 = Norm(d_model)
        self.norm_conv2 = Norm(d_model*2)
        
        self.left_sep_conv = SeparableConv1D(d_model, d_model*2, kernel_size=11)
        self.right_sep_conv = SeparableConv1D(d_model, d_model//2, kernel_size=7)
        self.sep_conv = SeparableConv1D(d_model*2, d_model, kernel_size=7)

        # Attention 2
        self.norm_attn_2 = Norm(d_model)
        self.self_attn_2 = MultiHeadAttention(heads, d_model, dropout=dropout)

        # Attention 3
        self.norm_attn_3 = Norm(d_model)
        self.enc_attn_3 = MultiHeadAttention(heads, d_model, dropout=dropout)

        # Feed Forward with x * sigmoid(x) as activation
        self.norm_ff = Norm(d_model)
        self.ff = FeedForward(d_model, d_ff, dropout=dropout, activation=lambda x: x*torch.sigmoid(x))

    def forward(self, x, e_outputs, src_mask, trg_mask):
        """Apply the layer to ``x`` given the encoder output ``e_outputs``.

        ``trg_mask`` is indexed as ``trg_mask[:, -1, :]``, so it is expected
        to be 3-D; its last row serves as the per-position padding mask.
        # assumes x is (batch, seq, d_model) - TODO confirm
        """
        # Attention 1: d_model features
        residual = x
        x = self.norm_attn_1(x)
        
        self_attn = self.self_attn_1(x, x, x, trg_mask)
        self_attn = self.self_attn_dropout_1(self_attn)

        enc_attn = self.enc_attn_1(x, e_outputs, e_outputs, src_mask)
        enc_attn = self.enc_attn_dropout_1(enc_attn)

        hiddent_state = self_attn + enc_attn
        x = residual + hiddent_state

        # Conv: d_model features
        conv_mask = trg_mask[:,-1,:] # BxS
        conv_mask = conv_mask.unsqueeze(-1) # BxSx1 => BxSxD

        residual = x
        x = self.norm_conv1(x)
        x = x.masked_fill(conv_mask==0, 0)

        # Left-padding by kernel_size - 1 keeps the length unchanged and lets
        # each output position see only itself and earlier positions.
        x_pad = F.pad(x.transpose(-1,-2), (10, 0))
        left_state = self.left_sep_conv(x_pad).transpose(-1,-2) # d_model*2
        left_state = F.relu(left_state)
        
        x_pad = F.pad(x.transpose(-1,-2), (6, 0))
        right_state = self.right_sep_conv(x_pad).transpose(-1,-2) # d_model//2

        # Zero-pad the narrow branch on the feature axis so both can be summed
        right_state = F.pad(right_state, (0, self.d_model*2 - self.d_model//2)) # d_model*2
        hiddent_state = left_state + right_state # d_model*2

        hiddent_state = self.norm_conv2(hiddent_state) # still d_model*2
        hiddent_state = hiddent_state.masked_fill(conv_mask==0, 0)
        hiddent_state_pad = F.pad(hiddent_state.transpose(-1,-2), (6, 0))
        hiddent_state = self.sep_conv(hiddent_state_pad).transpose(-1,-2) # d_model

        x = residual + hiddent_state # d_model
        
        # Attention 2: self-attention (no dropout on this branch)
        residual = x
        x = self.norm_attn_2(x)
        # x = x.masked_fill(conv_mask==0, 0)

        self_attn = self.self_attn_2(x, x, x, trg_mask)

        x = residual + self_attn

        # Attention 3: attention over the encoder output
        residual = x
        x = self.norm_attn_3(x)

        enc_attn = self.enc_attn_3(x, e_outputs, e_outputs, src_mask)

        x = residual + enc_attn

        # Feed Forward
        residual = x
        x = self.norm_ff(x)
        hiddent_state = self.ff(x)
        x = residual + hiddent_state

        return x


--------------------------------------------------------------------------------
/accent_utils.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import csv
  3 | import re
  4 | 
  5 | def remove_tone_file(in_path, out_path):
  6 |     with codecs.open(in_path, 'r', encoding='utf-8') as in_file,\
  7 |             codecs.open(out_path, 'w', encoding='utf-8') as out_file:
  8 |         for line in in_file:
  9 |             utf8_line = line.encode('utf-8')
 10 |             no_tone_line = remove_tone_line(utf8_line)
 11 |             try:
 12 |                 out_file.write(no_tone_line)
 13 |             except UnicodeDecodeError as e:
 14 |                 print ('Line with decode error:')
 15 | 
 16 | 
 17 | def decompose_predicted_test_file(in_path, out_no_tone_path=None, out_simplified_path=None):
 18 |     """
 19 |     Convert a predicted test file to two files:
 20 |         1. a csv file with line_and_word_id and no tone word
 21 |         2. a csv file with line_and_word_id and simplified word
 22 |     :param in_path: path to in put file
 23 |     :return: None, write to files
 24 |     """
 25 |     removed_ext_path = in_path.rsplit('.', 1)[0]
 26 |     if out_no_tone_path is None:
 27 |         out_no_tone_path = removed_ext_path + '_no_tone.csv'
 28 |     if out_simplified_path is None:
 29 |         out_simplified_path = removed_ext_path + '_simplified.csv'
 30 | 
 31 |     no_tone_header = ['id', 'no_tone']
 32 |     simplified_header = ['id', 'label']
 33 |     with codecs.open(in_path, 'r', encoding='utf-8') as in_file,\
 34 |             open(out_no_tone_path, 'w') as out_no_tone_file,\
 35 |             open(out_simplified_path, 'w') as out_simplified_file:
 36 | 
 37 |         out_no_tone_writer = csv.writer(out_no_tone_file, delimiter=',')
 38 |         out_simplified_writer = csv.writer(out_simplified_file, delimiter=',')
 39 | 
 40 |         out_no_tone_writer.writerow(no_tone_header)
 41 |         out_simplified_writer.writerow(simplified_header)
 42 | 
 43 |         for line in in_file:
 44 |             no_tone_words, simplified_words = process_line(line)
 45 |             if len(simplified_words) < 1000:
 46 |                 write_to_test_label(out_no_tone_writer, no_tone_words[0], no_tone_words[1:])
 47 |                 write_to_test_label(out_simplified_writer, no_tone_words[0], simplified_words[1:])
 48 | 
 49 |     assert count_lines(out_simplified_path) == count_lines(out_no_tone_path)
 50 | 
 51 | 
 52 | def remove_tone_line(utf8_str):
 53 |     intab_l = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
 54 |     intab_u = "ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"
 55 |     intab = [ch for ch in str(intab_l+intab_u)]
 56 | 
 57 |     outtab_l = "a"*17 + "o"*17 + "e"*11 + "u"*11 + "i"*5 + "y"*5 + "d"
 58 |     outtab_u = "A"*17 + "O"*17 + "E"*11 + "U"*11 + "I"*5 + "Y"*5 + "D"
 59 |     outtab = outtab_l + outtab_u
 60 | 
 61 |     r = re.compile("|".join(intab))
 62 |     replaces_dict = dict(zip(intab, outtab))
 63 | 
 64 |     return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)
 65 | 
 66 | 
 67 | def normalize_tone_line(utf8_str):
 68 |     intab_l = "áàảãạâấầẩẫậăắằẳẵặđèéẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵ"
 69 |     intab_u = "ÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶĐÈÉẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴ"
 70 |     intab = [ch for ch in str(intab_l + intab_u)]
 71 | 
 72 |     outtab_l = [
 73 |         "a1", "a2", "a3", "a4", "a5",
 74 |         "a6", "a61", "a62", "a63", "a64", "a65",
 75 |         "a8", "a81", "a82", "a83", "a84", "a85",
 76 |         "d9",
 77 |         "e1", "e2", "e3", "e4", "e5",
 78 |         "e6", "e61", "e62", "e63", "e64", "e65",
 79 |         "i1", "i2", "i3", "i4", "i5",
 80 |         "o1", "o2", "o3", "o4", "o5",
 81 |         "o6", "a61", "o62", "o63", "o64", "o65",
 82 |         "o7", "o71", "o72", "o73", "o74", "o75",
 83 |         "u1", "u2", "u3", "u4", "u5",
 84 |         "u7", "u71", "u72", "u73", "u74", "u75",
 85 |         "y1", "y2", "y3", "y4", "y5",
 86 |     ]
 87 | 
 88 |     outtab_u = [
 89 |         "A1", "A2", "A3", "A4", "A5",
 90 |         "A6", "A61", "A62", "A63", "A64", "A65",
 91 |         "A8", "A81", "A82", "A83", "A84", "A85",
 92 |         "D9",
 93 |         "E1", "E2", "E3", "E4", "E5",
 94 |         "E6", "E61", "E62", "E63", "E64", "E65",
 95 |         "I1", "I2", "I3", "I4", "I5",
 96 |         "O1", "O2", "O3", "O4", "O5",
 97 |         "O6", "O61", "O62", "O63", "O64", "O65",
 98 |         "O7", "O71", "O72", "O73", "O74", "O75",
 99 |         "U1", "U2", "U3", "U4", "U5",
100 |         "U7", "U71", "U72", "U73", "U74", "U75",
101 |         "Y1", "Y2", "Y3", "Y4", "Y5",
102 |     ]
103 | 
104 |     r = re.compile("|".join(intab))
105 |     replaces_dict = dict(zip(intab, outtab_l + outtab_u))
106 | 
107 |     return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)
108 | 
109 | 
110 | def _remove_special_chars_and_numbers(unicode_line):
111 |     removed_special_chars = re.sub('[^a-zA-Z\d\\\\]', ' ', repr(unicode_line))[1:]
112 |     removed_numbers = re.sub(r'\b\d+\b', '', removed_special_chars)
113 |     return removed_numbers
114 | 
115 | 
def write_to_test_label(label_writer, line_id, words):
    """Write one ``[id, word]`` row per word to ``label_writer``.

    The row id is ``line_id`` followed by the word position, zero-padded to
    three digits.
    """
    make_id = '{}{:03}'.format
    for position, word in enumerate(words):
        label_writer.writerow([make_id(line_id, position), word])
120 | 
121 | 
def process_line(line):
    """
    Split a line into its tone-free words and their simplified tone labels.
    Words that are not purely alphabetic once the tones are removed are
    dropped from both lists.
    :param line: one line of text (a trailing newline is stripped)
    :return: filtered_no_tone_words, simplified_words (same length)
    """
    stripped = line.strip('\n')

    no_tone_words, _ = extract_words(remove_tone_line(stripped), include_digits=True)
    normalized_words, _ = extract_words(normalize_tone_line(stripped), include_digits=True)

    # Both views of the line must split into the same words.
    assert len(no_tone_words) == len(normalized_words)

    kept = [
        (plain, simplify(coded))
        for plain, coded in zip(no_tone_words, normalized_words)
        if plain.isalpha()
    ]
    filtered_no_tone_words = [plain for plain, _ in kept]
    simplified_words = [label for _, label in kept]

    return filtered_no_tone_words, simplified_words
149 | 
150 | 
def simplify(word):
    """
    Reduce a digit-coded word to its digits only:
    * '0' when the word is purely alphabetic
    * digits 6-9 are kept in order, the single tone digit (1-5) goes last
    * '#' when more than one tone digit is found (word to be ignored)
    """
    if word.isalpha():
        return '0'

    marks = []
    tone = ''
    for ch in word:
        if ch in '12345':
            if tone:
                return '#'  # ignore this word
            tone = ch
        elif ch in '6789':
            marks.append(ch)
    return ''.join(marks) + tone
172 | 
173 | 
def count_lines(thefilepath):
    """Return the number of lines in the text file at ``thefilepath``.

    ``file.xreadlines()`` only exists on Python 2; iterating over the file
    object is the Python 3 equivalent, and the ``with`` block closes the
    handle, which the old loop never did.
    """
    with open(thefilepath) as handle:
        return sum(1 for _ in handle)
179 | 
180 | 
def get_ids(file_path):
    """Return the set of ids found in the UTF-8 file ``file_path``.

    The id of a line is its first three characters.  The set used to be
    built and then dropped (the function returned ``None``), which made
    ``compare_ids`` fail on ``ids1 - ids2``.
    """
    with codecs.open(file_path, 'r', encoding='utf-8') as f:
        return {line[:3] for line in f}
186 | 
187 | 
def compare_ids(file1, file2):
    """
    Print the ids found in only one of the two files: first those in
    file1 but not in file2, then the other way round.
    """
    ids1, ids2 = get_ids(file1), get_ids(file2)
    header = 'ids in {} but not in {}:'

    print (header.format(file1, file2))
    print (ids1 - ids2)
    print (header.format(file2, file1))
    print (ids2 - ids1)
199 | 
200 | 
def extract_words(sentence, include_digits=False):
    """Find the words of ``sentence`` and where they are.

    A word is a maximal run of Latin / Vietnamese letters, matched
    case-insensitively; digits count as word characters as well when
    ``include_digits`` is true.

    Returns:
        ``(words, indices)`` - the matched strings and, for each of them,
        its ``(start, end)`` offsets in ``sentence``.
    """
    additional_pattern = '0-9' if include_digits else ''
    pattern = (
        '[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊ'
        'OÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬ'
        'ĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴ'
        'AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢ'
        'UƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊ'
        'OÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐ'
        'EÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴ'
        'AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢ'
        'UƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴA-Z' + additional_pattern + ']+'
    )

    matches = list(re.finditer(pattern, sentence, re.IGNORECASE))
    words = [m.group(0) for m in matches]
    indices = [m.span(0) for m in matches]
    return words, indices


--------------------------------------------------------------------------------
/ngram_model/accent_utils.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import csv
  3 | import re
  4 | 
  5 | def remove_tone_file(in_path, out_path):
  6 |     with codecs.open(in_path, 'r', encoding='utf-8') as in_file,\
  7 |             codecs.open(out_path, 'w', encoding='utf-8') as out_file:
  8 |         for line in in_file:
  9 |             utf8_line = line.encode('utf-8')
 10 |             no_tone_line = remove_tone_line(utf8_line)
 11 |             try:
 12 |                 out_file.write(no_tone_line)
 13 |             except UnicodeDecodeError as e:
 14 |                 print ('Line with decode error:')
 15 | 
 16 | 
 17 | def decompose_predicted_test_file(in_path, out_no_tone_path=None, out_simplified_path=None):
 18 |     """
 19 |     Convert a predicted test file to two files:
 20 |         1. a csv file with line_and_word_id and no tone word
 21 |         2. a csv file with line_and_word_id and simplified word
 22 |     :param in_path: path to in put file
 23 |     :return: None, write to files
 24 |     """
 25 |     removed_ext_path = in_path.rsplit('.', 1)[0]
 26 |     if out_no_tone_path is None:
 27 |         out_no_tone_path = removed_ext_path + '_no_tone.csv'
 28 |     if out_simplified_path is None:
 29 |         out_simplified_path = removed_ext_path + '_simplified.csv'
 30 | 
 31 |     no_tone_header = ['id', 'no_tone']
 32 |     simplified_header = ['id', 'label']
 33 |     with codecs.open(in_path, 'r', encoding='utf-8') as in_file,\
 34 |             open(out_no_tone_path, 'w') as out_no_tone_file,\
 35 |             open(out_simplified_path, 'w') as out_simplified_file:
 36 | 
 37 |         out_no_tone_writer = csv.writer(out_no_tone_file, delimiter=',')
 38 |         out_simplified_writer = csv.writer(out_simplified_file, delimiter=',')
 39 | 
 40 |         out_no_tone_writer.writerow(no_tone_header)
 41 |         out_simplified_writer.writerow(simplified_header)
 42 | 
 43 |         for line in in_file:
 44 |             no_tone_words, simplified_words = process_line(line)
 45 |             if len(simplified_words) < 1000:
 46 |                 write_to_test_label(out_no_tone_writer, no_tone_words[0], no_tone_words[1:])
 47 |                 write_to_test_label(out_simplified_writer, no_tone_words[0], simplified_words[1:])
 48 | 
 49 |     assert count_lines(out_simplified_path) == count_lines(out_no_tone_path)
 50 | 
 51 | 
 52 | def remove_tone_line(utf8_str):
 53 |     intab_l = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"
 54 |     intab_u = "ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ"
 55 |     intab = [ch for ch in str(intab_l+intab_u)]
 56 | 
 57 |     outtab_l = "a"*17 + "o"*17 + "e"*11 + "u"*11 + "i"*5 + "y"*5 + "d"
 58 |     outtab_u = "A"*17 + "O"*17 + "E"*11 + "U"*11 + "I"*5 + "Y"*5 + "D"
 59 |     outtab = outtab_l + outtab_u
 60 | 
 61 |     r = re.compile("|".join(intab))
 62 |     replaces_dict = dict(zip(intab, outtab))
 63 | 
 64 |     return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)
 65 | 
 66 | 
 67 | def normalize_tone_line(utf8_str):
 68 |     intab_l = "áàảãạâấầẩẫậăắằẳẵặđèéẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵ"
 69 |     intab_u = "ÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶĐÈÉẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴ"
 70 |     intab = [ch for ch in str(intab_l + intab_u)]
 71 | 
 72 |     outtab_l = [
 73 |         "a1", "a2", "a3", "a4", "a5",
 74 |         "a6", "a61", "a62", "a63", "a64", "a65",
 75 |         "a8", "a81", "a82", "a83", "a84", "a85",
 76 |         "d9",
 77 |         "e1", "e2", "e3", "e4", "e5",
 78 |         "e6", "e61", "e62", "e63", "e64", "e65",
 79 |         "i1", "i2", "i3", "i4", "i5",
 80 |         "o1", "o2", "o3", "o4", "o5",
 81 |         "o6", "a61", "o62", "o63", "o64", "o65",
 82 |         "o7", "o71", "o72", "o73", "o74", "o75",
 83 |         "u1", "u2", "u3", "u4", "u5",
 84 |         "u7", "u71", "u72", "u73", "u74", "u75",
 85 |         "y1", "y2", "y3", "y4", "y5",
 86 |     ]
 87 | 
 88 |     outtab_u = [
 89 |         "A1", "A2", "A3", "A4", "A5",
 90 |         "A6", "A61", "A62", "A63", "A64", "A65",
 91 |         "A8", "A81", "A82", "A83", "A84", "A85",
 92 |         "D9",
 93 |         "E1", "E2", "E3", "E4", "E5",
 94 |         "E6", "E61", "E62", "E63", "E64", "E65",
 95 |         "I1", "I2", "I3", "I4", "I5",
 96 |         "O1", "O2", "O3", "O4", "O5",
 97 |         "O6", "O61", "O62", "O63", "O64", "O65",
 98 |         "O7", "O71", "O72", "O73", "O74", "O75",
 99 |         "U1", "U2", "U3", "U4", "U5",
100 |         "U7", "U71", "U72", "U73", "U74", "U75",
101 |         "Y1", "Y2", "Y3", "Y4", "Y5",
102 |     ]
103 | 
104 |     r = re.compile("|".join(intab))
105 |     replaces_dict = dict(zip(intab, outtab_l + outtab_u))
106 | 
107 |     return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)
108 | 
109 | 
110 | def _remove_special_chars_and_numbers(unicode_line):
111 |     removed_special_chars = re.sub('[^a-zA-Z\d\\\\]', ' ', repr(unicode_line))[1:]
112 |     removed_numbers = re.sub(r'\b\d+\b', '', removed_special_chars)
113 |     return removed_numbers
114 | 
115 | 
def write_to_test_label(label_writer, line_id, words):
    """
    Write one labelled row per word through ``label_writer``.

    The row id is ``line_id`` followed by the word's position in ``words``,
    zero-padded to three digits.

    :param label_writer: object exposing ``writerow(row)``
    :param line_id: prefix used for every row id
    :param words: words to write, in order
    """
    for position, token in enumerate(words):
        row_id = '{}{:03}'.format(line_id, position)
        label_writer.writerow([row_id, token])
120 | 
121 | 
def process_line(line):
    """
    Split a line into its tone-free words and their tone codes.

    Newline characters are stripped from both ends of ``line``. The text is
    then converted twice (``remove_tone_line`` and ``normalize_tone_line``)
    and both results are tokenised with
    ``extract_words(..., include_digits=True)``. Tokens whose tone-free form
    is not purely alphabetic are dropped.

    :param line: input text line
    :return: ``(filtered_no_tone_words, simplified_words)`` -- two lists of
        equal length: the kept tone-free words and ``simplify()`` applied to
        the matching normalized words
    """
    text = line.strip('\n')

    bare_words, _ = extract_words(remove_tone_line(text), include_digits=True)
    marked_words, _ = extract_words(normalize_tone_line(text), include_digits=True)

    # Both tokenisations must line up one-to-one for the index pairing below.
    assert len(bare_words) == len(marked_words)

    kept_words = []
    tone_codes = []
    for position, bare in enumerate(bare_words):
        if bare.isalpha():
            tone_codes.append(simplify(marked_words[position]))
            kept_words.append(bare)

    return kept_words, tone_codes
149 | 
150 | 
def simplify(word):
    """
    Reduce a digit-coded word to its digits, with the tone digit last.

    * a purely alphabetic word yields ``'0'``
    * digits 6-9 are kept in their original order
    * the single digit in 1-5 (the tone) is moved to the end
    * a second digit in 1-5 yields ``'#'`` so the caller can ignore the word
    """
    if word.isalpha():
        return '0'

    marks = []
    tone = ''
    for ch in word:
        if ch in '12345':
            if tone:
                return '#'  # two tone digits: ignore this word
            tone = ch
        elif ch in '6789':
            marks.append(ch)
    return ''.join(marks) + tone
172 | 
173 | 
def count_lines(thefilepath):
    """
    Count the lines of a file.

    :param thefilepath: path of the file to read
    :return: number of lines in the file (0 for an empty file)
    """
    # file.xreadlines() no longer exists on Python 3; iterating the file
    # object streams the lines instead. Binary mode is enough for counting
    # and avoids depending on the platform default encoding. The context
    # manager closes the handle, which the previous version never did.
    with open(thefilepath, 'rb') as f:
        return sum(1 for _ in f)
179 | 
180 | 
def get_ids(file_path):
    """
    Collect the distinct 3-character prefixes of the lines of a UTF-8 file.

    :param file_path: path of the file to read
    :return: set containing ``line[:3]`` for every line of the file
    """
    with codecs.open(file_path, 'r', encoding='utf-8') as f:
        # The set has to be returned: the previous version built it and then
        # fell off the end of the function, handing callers ``None``.
        return {line[:3] for line in f}
186 | 
187 | 
def compare_ids(file1, file2):
    """
    Print the ids that appear in only one of two files.

    The ids of each file come from ``get_ids``. The ids found in ``file1``
    but not in ``file2`` are printed first, then the reverse.
    """
    first_ids, second_ids = get_ids(file1), get_ids(file2)

    directions = (
        ((file1, first_ids), (file2, second_ids)),
        ((file2, second_ids), (file1, first_ids)),
    )
    for (name_a, ids_a), (name_b, ids_b) in directions:
        print('ids in {} but not in {}:'.format(name_a, name_b))
        print(ids_a - ids_b)
199 | 
200 | 
def extract_words(sentence, include_digits=False):
    """
    Find the words of a sentence together with their positions.

    A word is a maximal run of characters from the class built below
    (Vietnamese letters plus ``A-Z``, matched case-insensitively, and
    optionally ``0-9``).

    :param sentence: text to scan
    :param include_digits: when true, digits also count as word characters
    :return: ``(words, indices)`` -- the matched strings and, for each of
        them, its ``(start, end)`` offsets in ``sentence``
    """
    digit_range = '0-9' if include_digits else ''

    # NOTE(review): the same alphabet appears several times in this class.
    # The literals are kept exactly as they were because the repetitions may
    # differ in Unicode form -- confirm before deduplicating.
    pattern = (
        '[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊ'
        'OÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬ'
        'ĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴ'
        'AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢ'
        'UƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊ'
        'OÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐ'
        'EÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴ'
        'AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢ'
        'UƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴA-Z' + digit_range + ']+'
    )

    matches = list(re.finditer(pattern, sentence, re.IGNORECASE))
    words = [m.group(0) for m in matches]
    indices = [m.span(0) for m in matches]
    return words, indices


--------------------------------------------------------------------------------
/preprocess_data/Preprocess_data_Wikipedia.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "id": "gpoZc5wzuHrd"
  7 |    },
  8 |    "source": [
  9 |     "## Tiền xử lý dữ liệu cho dữ liệu từ Wikipedia\r\n",
 10 |     "\r\n",
 11 |     "- https://github.com/VNOpenAI/vn-accent\r\n",
 12 |     "\r\n",
 13 |     "- Lấy dữ liệu đã xử lý tại: https://phamdinhkhanh.github.io/2020/05/28/TransformerThemDauTV.html"
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 1,
 19 |    "metadata": {
 20 |     "colab": {
 21 |      "base_uri": "https://localhost:8080/"
 22 |     },
 23 |     "id": "z9-h72VjcBYo",
 24 |     "outputId": "8818ccf1-2105-4df6-936b-99c130cf26d8"
 25 |    },
 26 |    "outputs": [
 27 |     {
 28 |      "name": "stdout",
 29 |      "output_type": "stream",
 30 |      "text": [
 31 |       "Downloading...\n",
 32 |       "From: https://drive.google.com/uc?id=1-7lERkqCoID1691yCXLAOyZoJqYPqhGq\n",
 33 |       "To: /home/vietanhdev/Works/AIDr/vn-aidr/model_utils/vn_accent/preprocess_data/train_tieng_viet.txt\n",
 34 |       "776MB [03:08, 4.11MB/s] \n"
 35 |      ]
 36 |     }
 37 |    ],
 38 |    "source": [
 39 |     "!gdown --id \"1bEMOe2TooEuhmzRv16f4aouC5mNg3EOr\""
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {
 45 |     "id": "VQ7snyj1I2Od"
 46 |    },
 47 |    "source": [
 48 |     "## 1. Làm sạch dữ liệu"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 2,
 54 |    "metadata": {
 55 |     "colab": {
 56 |      "base_uri": "https://localhost:8080/"
 57 |     },
 58 |     "id": "tXJ1cwKzyu3b",
 59 |     "outputId": "bb3531d4-ddde-4f4f-86d6-205a10a59f8c"
 60 |    },
 61 |    "outputs": [
 62 |     {
 63 |      "name": "stdout",
 64 |      "output_type": "stream",
 65 |      "text": [
 66 |       "Trang Chính\n",
 67 |       "Internet Society\n",
 68 |       "Internet Society hay ISOC là một tổ chức quốc tế hoạt động phi lợi nhuận, phi chính phủ và bao gồm các thành viên có trình độ chuyên ngành. Tổ chức này chú trọng đến: tiêu chuẩn, giáo dục và các vấn đề về chính sách. Với trên 145 tổ chức thành viên và 65.000 thành viên cá nhân, ISOC bao gồm những con người cụ thể trong cộng đồng Internet. Mọi chi tiết có thể tìm thấy tại website của ISOC.\n",
 69 |       "Internet Society nằm ở gần thủ đô Washington, DC, Hoa Kỳ và Geneva, Thụy Sĩ. Số hội viên của nó bao gồm hơn 145 tổ chức thành viên và hơn 65.000 cá nhân. Thành viên còn có thể tự lập một chi nhánh của tổ chức tùy theo vị trí hoặc sở thích. Hiện nay tổ chức có tới 90 chi nhánh trên toàn thế giới.\n",
 70 |       "Bảo đảm, cổ vũ cho sự phát triển, mở rộng và sử dụng Internet được thuận lợi nhất cho mọi người trên toàn thế giới.\n",
 71 |       "Tiếng Việt\n",
 72 |       "Tiếng Việt được chính thức ghi nhận trong Hiến pháp nước Cộng hòa xã hội chủ nghĩa Việt Nam 2013, tại Chương I Điều 5 Mục 3, là ngôn ngữ quốc gia của Việt Nam . Tiếng Việt bao gồm cách phát âm tiếng Việt và chữ Quốc ngữ để viết. Tuy nhiên, hiện chưa có bất kỳ văn bản nào ở cấp nhà nước quy định \"giọng chuẩn\" và \"quốc tự\" của tiếng Việt . Hiện nay phần lớn các văn bản trong nước được viết theo những \"\"Quy định về chính tả tiếng Việt và về thuật ngữ tiếng Việt\" áp dụng cho các sách giáo khoa, báo và văn bản của ngành giáo dục\" nêu tại Quyết định của Bộ Giáo dục số 240/QĐ ngày 5 tháng 3 năm 1984 do những người thụ hưởng giáo dục đó sau này ra làm việc trong mọi lĩnh vực xã hội.\n",
 73 |       "Tiếng Việt cũng đã được công nhận là một ngôn ngữ dân tộc thiểu số tại Cộng hòa Séc.\n",
 74 |       "Tổ chức tiêu chuẩn hóa quốc tế đặt mã ngôn ngữ hai chữ cái cho tiếng Việt là \"vi\" (tiêu chuẩn ISO 639-1) và đặt mã ngôn ngữ ba chữ cái cho tiếng Việt là \"vie\" (tiêu chuẩn ISO 639-2) .\n",
 75 |       "Những ngôn ngữ này có chung một số từ vựng căn bản. Thí dụ, từ \"tay\" trong tiếng Việt tương đương trong tiếng Mường là \"thay\", trong tiếng Khmer là \"đay\" và trong tiếng Môn là \"tai\".\n",
 76 |       "3624432\n"
 77 |      ]
 78 |     }
 79 |    ],
 80 |    "source": [
 81 |     "with open('raw_wikipedia.txt', 'r', encoding='utf-8') as f:\n",
 82 |     "    data = f.readlines()\n",
 83 |     "for i in range(len(data)):\n",
 84 |     "    data[i] = data[i].split(\"\\t\")[1].strip()\n",
 85 |     "print(\"\\n\".join(data[:10]))\n",
 86 |     "print(len(data))"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 3,
 92 |    "metadata": {
 93 |     "colab": {
 94 |      "base_uri": "https://localhost:8080/"
 95 |     },
 96 |     "id": "dlConB6IzqzC",
 97 |     "outputId": "94b7efa7-20e5-4d69-af0f-3f0916a817af"
 98 |    },
 99 |    "outputs": [
100 |     {
101 |      "name": "stdout",
102 |      "output_type": "stream",
103 |      "text": [
104 |       "Trang Chính\n",
105 |       "Internet Society\n",
106 |       "Internet Society hay ISOC là một tổ chức quốc tế hoạt động phi lợi nhuận, phi chính phủ và bao gồm các thành viên có trình độ chuyên ngành. Tổ chức này chú trọng đến: tiêu chuẩn, giáo dục và các vấn đề về chính sách. Với trên 145 tổ chức thành viên và 65.000 thành viên cá nhân, ISOC bao gồm những con người cụ thể trong cộng đồng Internet. Mọi chi tiết có thể tìm thấy tại website của ISOC.\n",
107 |       "Internet Society nằm ở gần thủ đô Washington, DC, Hoa Kỳ và Geneva, Thụy Sĩ. Số hội viên của nó bao gồm hơn 145 tổ chức thành viên và hơn 65.000 cá nhân. Thành viên còn có thể tự lập một chi nhánh của tổ chức tùy theo vị trí hoặc sở thích. Hiện nay tổ chức có tới 90 chi nhánh trên toàn thế giới.\n",
108 |       "Bảo đảm, cổ vũ cho sự phát triển, mở rộng và sử dụng Internet được thuận lợi nhất cho mọi người trên toàn thế giới.\n",
109 |       "Tiếng Việt\n",
110 |       "Tiếng Việt được chính thức ghi nhận trong Hiến pháp nước Cộng hòa xã hội chủ nghĩa Việt Nam 2013, tại Chương I Điều 5 Mục 3, là ngôn ngữ quốc gia của Việt Nam . Tiếng Việt bao gồm cách phát âm tiếng Việt và chữ Quốc ngữ để viết. Tuy nhiên, hiện chưa có bất kỳ văn bản nào ở cấp nhà nước quy định \"giọng chuẩn\" và \"quốc tự\" của tiếng Việt . Hiện nay phần lớn các văn bản trong nước được viết theo những \"\"Quy định về chính tả tiếng Việt và về thuật ngữ tiếng Việt\" áp dụng cho các sách giáo khoa, báo và văn bản của ngành giáo dục\" nêu tại Quyết định của Bộ Giáo dục số 240/QĐ ngày 5 tháng 3 năm 1984 do những người thụ hưởng giáo dục đó sau này ra làm việc trong mọi lĩnh vực xã hội.\n",
111 |       "Tiếng Việt cũng đã được công nhận là một ngôn ngữ dân tộc thiểu số tại Cộng hòa Séc.\n",
112 |       "Tổ chức tiêu chuẩn hóa quốc tế đặt mã ngôn ngữ hai chữ cái cho tiếng Việt là \"vi\" (tiêu chuẩn ISO 639-1) và đặt mã ngôn ngữ ba chữ cái cho tiếng Việt là \"vie\" (tiêu chuẩn ISO 639-2) .\n",
113 |       "Những ngôn ngữ này có chung một số từ vựng căn bản. Thí dụ, từ \"tay\" trong tiếng Việt tương đương trong tiếng Mường là \"thay\", trong tiếng Khmer là \"đay\" và trong tiếng Môn là \"tai\".\n",
114 |       "3624432\n"
115 |      ]
116 |     }
117 |    ],
118 |    "source": [
119 |     "# Xoá dòng trống\n",
120 |     "data = [line for line in data if len(line) > 0]\n",
121 |     "print(\"\\n\".join(data[:10]))\n",
122 |     "print(len(data))"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 4,
128 |    "metadata": {
129 |     "colab": {
130 |      "base_uri": "https://localhost:8080/"
131 |     },
132 |     "id": "nrQ5koXuv6e-",
133 |     "outputId": "f7794676-3dea-4144-9ee1-a009c6a87ccf"
134 |    },
135 |    "outputs": [
136 |     {
137 |      "name": "stdout",
138 |      "output_type": "stream",
139 |      "text": [
140 |       "Trang Chính\n",
141 |       "Internet Society\n",
142 |       "Internet Society hay ISOC là một tổ chức quốc tế hoạt động phi lợi nhuận, phi chính phủ và bao gồm các thành viên có trình độ chuyên ngành.\n",
143 |       "Tổ chức này chú trọng đến: tiêu chuẩn, giáo dục và các vấn đề về chính sách.\n",
144 |       "Với trên 145 tổ chức thành viên và 65.\n",
145 |       "000 thành viên cá nhân, ISOC bao gồm những con người cụ thể trong cộng đồng Internet.\n",
146 |       "Mọi chi tiết có thể tìm thấy tại website của ISOC.\n",
147 |       "Internet Society nằm ở gần thủ đô Washington, DC, Hoa Kỳ và Geneva, Thụy Sĩ.\n",
148 |       "Số hội viên của nó bao gồm hơn 145 tổ chức thành viên và hơn 65.\n",
149 |       "000 cá nhân.\n",
150 |       "7314542\n"
151 |      ]
152 |     }
153 |    ],
154 |    "source": [
155 |     "# Tách câu\r\n",
156 |     "import re\r\n",
157 |     "new_data = []\r\n",
158 |     "for i in range(len(data)):\r\n",
159 |     "    if data[i] is not None:\r\n",
160 |     "        new_data += re.split('((?<=[.?!]\")|((?<=[.?!])(?!\")))\\s*', data[i])\r\n",
161 |     "data = new_data\r\n",
162 |     "# Xoá dòng trống\r\n",
163 |     "data = [line for line in data if line is not None and len(line) > 0]\r\n",
164 |     "print(\"\\n\".join(data[:10]))\r\n",
165 |     "print(len(data))"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 5,
171 |    "metadata": {
172 |     "colab": {
173 |      "base_uri": "https://localhost:8080/"
174 |     },
175 |     "id": "BRqgxxJbvUtP",
176 |     "outputId": "5d61d61b-8aa5-4fb5-f02a-2277aaf3863d"
177 |    },
178 |    "outputs": [
179 |     {
180 |      "name": "stdout",
181 |      "output_type": "stream",
182 |      "text": [
183 |       "Trang Chính\n",
184 |       "Internet Society\n",
185 |       "Internet Society hay ISOC là một tổ chức quốc tế hoạt động phi lợi nhuận phi chính phủ và bao gồm các thành viên có trình độ chuyên ngành\n",
186 |       "Tổ chức này chú trọng đến tiêu chuẩn giáo dục và các vấn đề về chính sách\n",
187 |       "Với trên tổ chức thành viên và\n",
188 |       "thành viên cá nhân ISOC bao gồm những con người cụ thể trong cộng đồng Internet\n",
189 |       "Mọi chi tiết có thể tìm thấy tại website của ISOC\n",
190 |       "Internet Society nằm ở gần thủ đô Washington DC Hoa Kỳ và Geneva Thụy Sĩ\n",
191 |       "Số hội viên của nó bao gồm hơn tổ chức thành viên và hơn\n",
192 |       "cá nhân\n",
193 |       "7167310\n"
194 |      ]
195 |     }
196 |    ],
197 |    "source": [
198 |     "# Chỉ giữ lại từ trong mỗi câu\r\n",
199 |     "def extract_words(sentence, include_digits=False):\r\n",
200 |     "    additional_pattern = ''\r\n",
201 |     "    if include_digits:\r\n",
202 |     "        additional_pattern = '0-9'\r\n",
203 |     "    pattern = '[AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊ'+ \\\r\n",
204 |     "            'OÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬ'+ \\\r\n",
205 |     "            'ĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴ'+ \\\r\n",
206 |     "            'AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢ'+ \\\r\n",
207 |     "            'UƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊ'+ \\\r\n",
208 |     "            'OÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴAĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐ'+ \\\r\n",
209 |     "            'EÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢUƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴ'+ \\\r\n",
210 |     "            'AĂÂÁẮẤÀẰẦẢẲẨÃẴẪẠẶẬĐEÊÉẾÈỀẺỂẼỄẸỆIÍÌỈĨỊOÔƠÓỐỚÒỒỜỎỔỞÕỖỠỌỘỢ'+ \\\r\n",
211 |     "            'UƯÚỨÙỪỦỬŨỮỤỰYÝỲỶỸỴA-Z' + additional_pattern + ']+'\r\n",
212 |     "    indices = []\r\n",
213 |     "    words = []\r\n",
214 |     "    for m in re.finditer(pattern, sentence,  re.IGNORECASE):\r\n",
215 |     "        words.append(m.group(0))\r\n",
216 |     "        indices.append((m.start(0), m.end(0)))\r\n",
217 |     "    return words, indices\r\n",
218 |     "\r\n",
219 |     "for i in range(len(data)):\r\n",
220 |     "    words, indices = extract_words(data[i], include_digits=False)\r\n",
 221 |     "    data[i] = \" \".join(words)\r\n",
223 |     "# Xoá dòng trống\r\n",
224 |     "data = [line for line in data if len(line) > 0]\r\n",
225 |     "print(\"\\n\".join(data[:10]))\r\n",
226 |     "print(len(data))"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 6,
232 |    "metadata": {
233 |     "colab": {
234 |      "base_uri": "https://localhost:8080/"
235 |     },
236 |     "id": "8bxFXjq9viSQ",
237 |     "outputId": "10c9963f-b149-4e18-b0de-274d3184f136"
238 |    },
239 |    "outputs": [
240 |     {
241 |      "name": "stdout",
242 |      "output_type": "stream",
243 |      "text": [
244 |       "trang chính\n",
245 |       "internet society\n",
246 |       "internet society hay isoc là một tổ chức quốc tế hoạt động phi lợi nhuận phi chính phủ và bao gồm các thành viên có trình độ chuyên ngành\n",
247 |       "tổ chức này chú trọng đến tiêu chuẩn giáo dục và các vấn đề về chính sách\n",
248 |       "với trên tổ chức thành viên và\n",
249 |       "thành viên cá nhân isoc bao gồm những con người cụ thể trong cộng đồng internet\n",
250 |       "mọi chi tiết có thể tìm thấy tại website của isoc\n",
251 |       "internet society nằm ở gần thủ đô washington dc hoa kỳ và geneva thụy sĩ\n",
252 |       "số hội viên của nó bao gồm hơn tổ chức thành viên và hơn\n",
253 |       "cá nhân\n",
254 |       "7167310\n"
255 |      ]
256 |     }
257 |    ],
258 |    "source": [
259 |     "# Chuyển sang chữ thường\r\n",
260 |     "data = [line.lower() for line in data]\r\n",
261 |     "print(\"\\n\".join(data[:10]))\r\n",
262 |     "print(len(data))"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": 7,
268 |    "metadata": {
269 |     "colab": {
270 |      "base_uri": "https://localhost:8080/"
271 |     },
272 |     "id": "8sg7UNgayNHJ",
273 |     "outputId": "2bcc3d7d-e88a-40b1-c724-4023ce22f9ed"
274 |    },
275 |    "outputs": [
276 |     {
277 |      "name": "stdout",
278 |      "output_type": "stream",
279 |      "text": [
280 |       "internet society hay isoc là một tổ chức quốc tế hoạt động phi lợi nhuận phi chính phủ và bao gồm các thành viên có trình độ chuyên ngành\n",
281 |       "tổ chức này chú trọng đến tiêu chuẩn giáo dục và các vấn đề về chính sách\n",
282 |       "thành viên cá nhân isoc bao gồm những con người cụ thể trong cộng đồng internet\n",
283 |       "mọi chi tiết có thể tìm thấy tại website của isoc\n",
284 |       "internet society nằm ở gần thủ đô washington dc hoa kỳ và geneva thụy sĩ\n",
285 |       "số hội viên của nó bao gồm hơn tổ chức thành viên và hơn\n",
286 |       "thành viên còn có thể tự lập một chi nhánh của tổ chức tùy theo vị trí hoặc sở thích\n",
287 |       "hiện nay tổ chức có tới chi nhánh trên toàn thế giới\n",
288 |       "bảo đảm cổ vũ cho sự phát triển mở rộng và sử dụng internet được thuận lợi nhất cho mọi người trên toàn thế giới\n",
289 |       "tiếng việt được chính thức ghi nhận trong hiến pháp nước cộng hòa xã hội chủ nghĩa việt nam tại chương i điều mục là ngôn ngữ quốc gia của việt nam\n",
290 |       "4315334\n"
291 |      ]
292 |     }
293 |    ],
294 |    "source": [
295 |     "new_data = []\r\n",
296 |     "for i in range(len(data)):\r\n",
297 |     "    n_words = len(data[i].split(\" \"))\r\n",
298 |     "    if n_words >= 10 and n_words <= 200:\r\n",
299 |     "        new_data.append(data[i])\r\n",
300 |     "data = new_data\r\n",
301 |     "print(\"\\n\".join(data[:10]))\r\n",
302 |     "print(len(data))"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "markdown",
307 |    "metadata": {
308 |     "id": "3tS7-h8uNJh8"
309 |    },
310 |    "source": [
311 |     "## 2. Chuẩn hoá dấu câu"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": 8,
317 |    "metadata": {
318 |     "colab": {
319 |      "base_uri": "https://localhost:8080/"
320 |     },
321 |     "id": "OdE9domENRTC",
322 |     "outputId": "49c6562b-20b9-4559-dc87-eecc355394e2"
323 |    },
324 |    "outputs": [
325 |     {
326 |      "name": "stdout",
327 |      "output_type": "stream",
328 |      "text": [
329 |       "Downloading...\n",
330 |       "From: https://drive.google.com/uc?id=1M2tCWPD2aCo8OXfOEuEmF5XHzE1OJDWK\n",
331 |       "To: /home/vietanhdev/Works/AIDr/vn-aidr/model_utils/vn_accent/preprocess_data/tone_dict.json\n",
332 |       "100%|████████████████████████████████████████| 885k/885k [00:00<00:00, 3.26MB/s]\n"
333 |      ]
334 |     }
335 |    ],
336 |    "source": [
337 |     "!gdown --id \"1M2tCWPD2aCo8OXfOEuEmF5XHzE1OJDWK\""
338 |    ]
339 |   },
340 |   {
341 |    "cell_type": "code",
342 |    "execution_count": 9,
343 |    "metadata": {
344 |     "id": "5Y-JnVL5Pa7I"
345 |    },
346 |    "outputs": [],
347 |    "source": [
348 |     "import json\r\n",
349 |     "\r\n",
350 |     "def remove_tone_line(utf8_str):\r\n",
351 |     "    intab_l = \"ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ\"\r\n",
352 |     "    intab_u = \"ẠẢÃÀÁÂẬẦẤẨẪĂẮẰẶẲẴÓÒỌÕỎÔỘỔỖỒỐƠỜỚỢỞỠÉÈẺẸẼÊẾỀỆỂỄÚÙỤỦŨƯỰỮỬỪỨÍÌỊỈĨÝỲỶỴỸĐ\"\r\n",
353 |     "    intab = [ch for ch in str(intab_l+intab_u)]\r\n",
354 |     "\r\n",
355 |     "    outtab_l = \"a\"*17 + \"o\"*17 + \"e\"*11 + \"u\"*11 + \"i\"*5 + \"y\"*5 + \"d\"\r\n",
356 |     "    outtab_u = \"A\"*17 + \"O\"*17 + \"E\"*11 + \"U\"*11 + \"I\"*5 + \"Y\"*5 + \"D\"\r\n",
357 |     "    outtab = outtab_l + outtab_u\r\n",
358 |     "\r\n",
359 |     "    r = re.compile(\"|\".join(intab))\r\n",
360 |     "    replaces_dict = dict(zip(intab, outtab))\r\n",
361 |     "\r\n",
362 |     "    return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)\r\n",
363 |     "\r\n",
364 |     "\r\n",
365 |     "def normalize_tone_line(utf8_str):\r\n",
366 |     "    intab_l = \"áàảãạâấầẩẫậăắằẳẵặđèéẻẽẹêếềểễệíìỉĩịóòỏõọôốồổỗộơớờởỡợúùủũụưứừửữựýỳỷỹỵ\"\r\n",
367 |     "    intab_u = \"ÁÀẢÃẠÂẤẦẨẪẬĂẮẰẲẴẶĐÈÉẺẼẸÊẾỀỂỄỆÍÌỈĨỊÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢÚÙỦŨỤƯỨỪỬỮỰÝỲỶỸỴ\"\r\n",
368 |     "    intab = [ch for ch in str(intab_l + intab_u)]\r\n",
369 |     "\r\n",
370 |     "    outtab_l = [\r\n",
371 |     "        \"a1\", \"a2\", \"a3\", \"a4\", \"a5\",\r\n",
372 |     "        \"a6\", \"a61\", \"a62\", \"a63\", \"a64\", \"a65\",\r\n",
373 |     "        \"a8\", \"a81\", \"a82\", \"a83\", \"a84\", \"a85\",\r\n",
374 |     "        \"d9\",\r\n",
375 |     "        \"e1\", \"e2\", \"e3\", \"e4\", \"e5\",\r\n",
376 |     "        \"e6\", \"e61\", \"e62\", \"e63\", \"e64\", \"e65\",\r\n",
377 |     "        \"i1\", \"i2\", \"i3\", \"i4\", \"i5\",\r\n",
378 |     "        \"o1\", \"o2\", \"o3\", \"o4\", \"o5\",\r\n",
379 |     "        \"o6\", \"a61\", \"o62\", \"o63\", \"o64\", \"o65\",\r\n",
380 |     "        \"o7\", \"o71\", \"o72\", \"o73\", \"o74\", \"o75\",\r\n",
381 |     "        \"u1\", \"u2\", \"u3\", \"u4\", \"u5\",\r\n",
382 |     "        \"u7\", \"u71\", \"u72\", \"u73\", \"u74\", \"u75\",\r\n",
383 |     "        \"y1\", \"y2\", \"y3\", \"y4\", \"y5\",\r\n",
384 |     "    ]\r\n",
385 |     "\r\n",
386 |     "    outtab_u = [\r\n",
387 |     "        \"A1\", \"A2\", \"A3\", \"A4\", \"A5\",\r\n",
388 |     "        \"A6\", \"A61\", \"A62\", \"A63\", \"A64\", \"A65\",\r\n",
389 |     "        \"A8\", \"A81\", \"A82\", \"A83\", \"A84\", \"A85\",\r\n",
390 |     "        \"D9\",\r\n",
391 |     "        \"E1\", \"E2\", \"E3\", \"E4\", \"E5\",\r\n",
392 |     "        \"E6\", \"E61\", \"E62\", \"E63\", \"E64\", \"E65\",\r\n",
393 |     "        \"I1\", \"I2\", \"I3\", \"I4\", \"I5\",\r\n",
394 |     "        \"O1\", \"O2\", \"O3\", \"O4\", \"O5\",\r\n",
395 |     "        \"O6\", \"O61\", \"O62\", \"O63\", \"O64\", \"O65\",\r\n",
396 |     "        \"O7\", \"O71\", \"O72\", \"O73\", \"O74\", \"O75\",\r\n",
397 |     "        \"U1\", \"U2\", \"U3\", \"U4\", \"U5\",\r\n",
398 |     "        \"U7\", \"U71\", \"U72\", \"U73\", \"U74\", \"U75\",\r\n",
399 |     "        \"Y1\", \"Y2\", \"Y3\", \"Y4\", \"Y5\",\r\n",
400 |     "    ]\r\n",
401 |     "\r\n",
402 |     "    r = re.compile(\"|\".join(intab))\r\n",
403 |     "    replaces_dict = dict(zip(intab, outtab_l + outtab_u))\r\n",
404 |     "\r\n",
405 |     "    return r.sub(lambda m: replaces_dict[m.group(0)], utf8_str)\r\n",
406 |     "\r\n",
407 |     "def simplify(word):\r\n",
408 |     "    \"\"\"\r\n",
409 |     "    normalize and simplify a vni word:\r\n",
410 |     "    * move tone digit to the end\r\n",
411 |     "    * return only digits\r\n",
412 |     "    * return 0 if there is no digit\r\n",
413 |     "    \"\"\"\r\n",
414 |     "    if word.isalpha(): \r\n",
415 |     "        return '0'\r\n",
416 |     "    ret = ''\r\n",
417 |     "    tone = ''\r\n",
418 |     "    for letter in word:\r\n",
419 |     "        if '1' <= letter <= '9':\r\n",
420 |     "            if '1' <= letter <= '5':\r\n",
421 |     "                # assert len(tone) == 0, '{}, {}'.format(tone, word)\r\n",
422 |     "                if tone != '':\r\n",
423 |     "                    return '#'  # ignore this word\r\n",
424 |     "                tone = letter\r\n",
425 |     "            else:\r\n",
426 |     "                ret += letter\r\n",
427 |     "    return ret + tone\r\n",
428 |     "\r\n",
429 |     "\r\n",
430 |     "def extract_tone(line):\r\n",
431 |     "    \"\"\"\r\n",
432 |     "    Process a line\r\n",
433 |     "    :param line:\r\n",
434 |     "    :return: no_tone_line, no_tone_words, simplified_words\r\n",
435 |     "    \"\"\"\r\n",
436 |     "    utf8_line = line.strip('\\n')\r\n",
437 |     "\r\n",
438 |     "    no_tone_line_pre = remove_tone_line(utf8_line)\r\n",
439 |     "    normalized_line_pre = normalize_tone_line(utf8_line)\r\n",
440 |     "\r\n",
441 |     "    no_tone_words, _ = extract_words(no_tone_line_pre, include_digits=True)\r\n",
442 |     "    normalized_words, _ = extract_words(normalized_line_pre, include_digits=True)\r\n",
443 |     "\r\n",
444 |     "    assert len(no_tone_words) == len(normalized_words)\r\n",
445 |     "\r\n",
446 |     "    filtered_no_tone_words = []\r\n",
447 |     "    simplified_words = []\r\n",
448 |     "    for i, word in enumerate(no_tone_words):\r\n",
449 |     "        if not word.isalpha():\r\n",
450 |     "            continue\r\n",
451 |     "        simplified_word = simplify(normalized_words[i])\r\n",
452 |     "        filtered_no_tone_words.append(word)\r\n",
453 |     "        simplified_words.append(simplified_word)\r\n",
454 |     "\r\n",
455 |     "    return filtered_no_tone_words, simplified_words\r\n",
456 |     "\r\n",
457 |     "\r\n",
458 |     "def standarize_tone_word(word, tone_dict):\r\n",
459 |     "    notone_word, tone = extract_tone(word)\r\n",
460 |     "    notone_word = notone_word[0]\r\n",
461 |     "    tone = tone[0]\r\n",
462 |     "    std_form = notone_word + tone\r\n",
463 |     "    if std_form in tone_dict:\r\n",
464 |     "        return tone_dict[std_form]\r\n",
465 |     "    else:\r\n",
466 |     "        # print(\"MISSING:\", word)\r\n",
467 |     "        return word\r\n",
468 |     "\r\n",
469 |     "with open(\"tone_dict.json\", \"r\") as f:\r\n",
470 |     "    tone_dict = json.load(f)\r\n",
471 |     "def standarize_tone_str(text):\r\n",
472 |     "    text = text.lower()\r\n",
473 |     "    words = text.split(\" \")\r\n",
474 |     "    for i in range(len(words)):\r\n",
475 |     "        words[i] = standarize_tone_word(words[i], tone_dict)\r\n",
476 |     "    text = \" \".join(words)\r\n",
477 |     "    return text"
478 |    ]
479 |   },
480 |   {
481 |    "cell_type": "code",
482 |    "execution_count": 10,
483 |    "metadata": {
484 |     "colab": {
485 |      "base_uri": "https://localhost:8080/"
486 |     },
487 |     "id": "bdpefphjNtqu",
488 |     "outputId": "95fe6781-7aef-44d7-ff18-6cb16940ba28"
489 |    },
490 |    "outputs": [
491 |     {
492 |      "name": "stderr",
493 |      "output_type": "stream",
494 |      "text": [
495 |       "100%|██████████| 4315334/4315334 [24:17<00:00, 2960.64it/s]\n"
496 |      ]
497 |     },
498 |     {
499 |      "name": "stdout",
500 |      "output_type": "stream",
501 |      "text": [
502 |       "internet society hay isoc là một tổ chức quốc tế hoạt động phi lợi nhuận phi chính phủ và bao gồm các thành viên có trình độ chuyên ngành\n",
503 |       "tổ chức này chú trọng đến tiêu chuẩn giáo dục và các vấn đề về chính sách\n",
504 |       "thành viên cá nhân isoc bao gồm những con người cụ thể trong cộng đồng internet\n",
505 |       "mọi chi tiết có thể tìm thấy tại website của isoc\n",
506 |       "internet society nằm ở gần thủ đô washington dc hoa kỳ và geneva thụy sĩ\n",
507 |       "số hội viên của nó bao gồm hơn tổ chức thành viên và hơn\n",
508 |       "thành viên còn có thể tự lập một chi nhánh của tổ chức tùy theo vị trí hoặc sở thích\n",
509 |       "hiện nay tổ chức có tới chi nhánh trên toàn thế giới\n",
510 |       "bảo đảm cổ vũ cho sự phát triển mở rộng và sử dụng internet được thuận lợi nhất cho mọi người trên toàn thế giới\n",
511 |       "tiếng việt được chính thức ghi nhận trong hiến pháp nước cộng hòa xã hội chủ nghĩa việt nam tại chương i điều mục là ngôn ngữ quốc gia của việt nam\n",
512 |       "4315334\n"
513 |      ]
514 |     }
515 |    ],
516 |    "source": [
517 |     "import json\r\n",
518 |     "from tqdm import tqdm\r\n",
519 |     "from multiprocessing import Pool\r\n",
520 |     "\r\n",
521 |     "# Chuẩn hoá dấu\r\n",
522 |     "with Pool(12) as p:\r\n",
523 |     "    data = list(tqdm(p.imap(standarize_tone_str, data), total=len(data)))\r\n",
524 |     "print(\"\\n\".join(data[:10]))\r\n",
525 |     "print(len(data))"
526 |    ]
527 |   },
528 |   {
529 |    "cell_type": "markdown",
530 |    "metadata": {
531 |     "id": "KYEcjlOcNOMT"
532 |    },
533 |    "source": [
534 |     "## 3. Ghi ra output"
535 |    ]
536 |   },
537 |   {
538 |    "cell_type": "code",
539 |    "execution_count": 11,
540 |    "metadata": {
541 |     "id": "TydcCpF7yZz2"
542 |    },
543 |    "outputs": [
544 |     {
545 |      "name": "stdout",
546 |      "output_type": "stream",
547 |      "text": [
548 |       "4315334\n"
549 |      ]
550 |     }
551 |    ],
552 |    "source": [
553 |     "with open('wikipedia.txt', 'w', encoding='utf-8') as f:\r\n",
554 |     "    print(len(data))\r\n",
555 |     "    f.write(\"\\n\".join(data))"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": null,
561 |    "metadata": {
562 |     "id": "9EMlKElzy-Jt"
563 |    },
564 |    "outputs": [],
565 |    "source": [
566 |     "!cp wikipedia.txt /content/drive/MyDrive"
567 |    ]
568 |   },
569 |   {
570 |    "cell_type": "code",
571 |    "execution_count": null,
572 |    "metadata": {
573 |     "id": "4AA49I9z1twu"
574 |    },
575 |    "outputs": [],
576 |    "source": []
577 |   }
578 |  ],
579 |  "metadata": {
580 |   "colab": {
581 |    "collapsed_sections": [],
582 |    "name": "Preprocess Thêm dấu tiếng Việt - Wikipedia.ipynb",
583 |    "provenance": [],
584 |    "toc_visible": true
585 |   },
586 |   "kernelspec": {
587 |    "display_name": "Python 3",
588 |    "language": "python",
589 |    "name": "python3"
590 |   },
591 |   "language_info": {
592 |    "codemirror_mode": {
593 |     "name": "ipython",
594 |     "version": 3
595 |    },
596 |    "file_extension": ".py",
597 |    "mimetype": "text/x-python",
598 |    "name": "python",
599 |    "nbconvert_exporter": "python",
600 |    "pygments_lexer": "ipython3",
601 |    "version": "3.7.0"
602 |   }
603 |  },
604 |  "nbformat": 4,
605 |  "nbformat_minor": 4
606 | }
607 | 


--------------------------------------------------------------------------------