├── Baselines
├── ConvSeq2Seq
│ ├── Dataframes
│ │ └── dfs.txt
│ ├── Checkpoints
│ │ └── checkpoints.txt
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── errors.py
│ ├── README.md
│ ├── pipeline.py
│ ├── utils.py
│ ├── metrics.py
│ ├── models.py
│ └── main.py
├── DCSpell
│ ├── Dataframes
│ │ └── dfs.txt
│ ├── Checkpoints
│ │ └── checkpoints.txt
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── README.md
│ ├── process.py
│ ├── pipeline.py
│ ├── corrector.py
│ ├── utils.py
│ └── metrics.py
├── DTransformer
│ ├── Dataframes
│ │ └── dfs.txt
│ ├── Checkpoints
│ │ └── checkpoints.txt
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── README.md
│ ├── process.py
│ ├── focalLoss.py
│ ├── pipeline.py
│ ├── utils.py
│ ├── dtransformer.py
│ └── metrics.py
├── GRUSeq2Seq
│ ├── Corrections
│ │ └── corpora.txt
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── Checkpoints
│ │ └── temp.txt
│ ├── README.md
│ ├── metrics.py
│ ├── check.py
│ ├── decoding.py
│ ├── focalLoss.py
│ ├── inference.py
│ ├── pipeline.py
│ ├── models.py
│ ├── utils.py
│ ├── errors.py
│ └── main.py
├── RuleBased
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── README.md
│ └── rule_based.py
│ └── README.md
├── Dataframes
│ └── dataframesGeneratedByModels.txt
├── Dataset
│ ├── Hindi
│ │ └── HindiCorpusFromAnotherPaper.txt
│ ├── Bangla
│ │ └── BanglaCorpusFromAnotherPaper.txt
│ └── Telugu
│   └── TeluguCorpusFromAnotherPaper.txt
├── figures
│ └── DPCSpell.jpg
├── Checkpoints
│ ├── checkpoints.txt
│ └── Checkpoints.md
├── CorpusCreation
│ ├── README.md
│ ├── scraper.py
│ └── corpus_stats_valid.py
├── LICENSE
├── process.py
├── focalLoss.py
├── README.md
├── pipeline.py
├── corrector.py
├── utils.py
├── Requirements
│ └── requirements_u.yml
├── metrics.py
└── detector.py
/Baselines/ConvSeq2Seq/Dataframes/dfs.txt:
--------------------------------------------------------------------------------
1 | dfs
--------------------------------------------------------------------------------
/Baselines/DCSpell/Dataframes/dfs.txt:
--------------------------------------------------------------------------------
1 | dfs
--------------------------------------------------------------------------------
/Baselines/DTransformer/Dataframes/dfs.txt:
--------------------------------------------------------------------------------
1 | dfs
--------------------------------------------------------------------------------
/Dataframes/dataframesGeneratedByModels.txt:
--------------------------------------------------------------------------------
1 | https://
--------------------------------------------------------------------------------
/Dataset/Hindi/HindiCorpusFromAnotherPaper.txt:
--------------------------------------------------------------------------------
1 | https://
--------------------------------------------------------------------------------
/Dataset/Bangla/BanglaCorpusFromAnotherPaper.txt:
--------------------------------------------------------------------------------
1 | https://
--------------------------------------------------------------------------------
/Dataset/Telugu/TeluguCorpusFromAnotherPaper.txt:
--------------------------------------------------------------------------------
1 | https://
--------------------------------------------------------------------------------
/Baselines/DCSpell/Checkpoints/checkpoints.txt:
--------------------------------------------------------------------------------
1 | checkpoints
2 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/Checkpoints/checkpoints.txt:
--------------------------------------------------------------------------------
1 | checkpoints
2 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/Checkpoints/checkpoints.txt:
--------------------------------------------------------------------------------
1 | checkpoints
2 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/Corrections/corpora.txt:
--------------------------------------------------------------------------------
1 | corpus.csv
2 | corpus2.csv
--------------------------------------------------------------------------------
/figures/DPCSpell.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/figures/DPCSpell.jpg
--------------------------------------------------------------------------------
/Baselines/DCSpell/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/DCSpell/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Baselines/RuleBased/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/RuleBased/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/ConvSeq2Seq/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/GRUSeq2Seq/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Baselines/DTransformer/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/DTransformer/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Checkpoints/checkpoints.txt:
--------------------------------------------------------------------------------
1 | Download checkpoints from the following link:
2 | https://drive.google.com/drive/folders/1prH28CiedKmhDmh3lOqquByQQTD8DN2d?usp=share_link
3 |
--------------------------------------------------------------------------------
/Checkpoints/Checkpoints.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/Checkpoints/temp.txt:
--------------------------------------------------------------------------------
1 | Top1 Acc: 0.5088253438742403
2 | Top2 Acc: 0.13197459214915688
3 | Top3 Acc: 0.10706370241740164
4 | Accuracy: 0.6364529543481241
5 | 100%|██████████| 175064/175064 [1:04:31<00:00, 45.21it/s]
6 | Modified Top1 Acc: 0.6128444454599461
7 |
8 | Process finished with exit code 0
9 |
--------------------------------------------------------------------------------
/CorpusCreation/README.md:
--------------------------------------------------------------------------------
1 | Corpus Creation
2 |
3 | ### Word Accumulation
4 | ```
5 | python scraper.py
6 | ```
7 |
8 | ### Error Annexation
9 | ```
10 | python errors.py
11 | ```
12 |
13 | ### Error Filtration
14 |
15 |
16 |
17 | ### Corpus Statistics and Error Percentage Validation
18 | ```
19 | python corpus_stats_valid.py --email "username@gmail.com" --password "facebook_password"
20 | ```
21 |
--------------------------------------------------------------------------------
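Note: the CorpusCreation folder in the dump above does not contain an error-annexation script, so the following is purely an illustrative sketch, assuming "Error Annexation" means injecting synthetic character-level edits into the scraped words; every name and output path below is hypothetical except ./dfs/df_all_words.csv, which scraper.py writes.

```
import random
import pandas as pd

# Hypothetical sketch of error annexation (NOT the repository's errors.py):
# derive an erroneous variant of each correct word with one random character edit.
def inject_error(word: str) -> str:
    if len(word) < 2:
        return word
    i = random.randrange(len(word))
    op = random.choice(['delete', 'swap', 'repeat'])
    if op == 'delete':                       # drop one character
        return word[:i] + word[i + 1:]
    if op == 'swap' and i < len(word) - 1:   # transpose two adjacent characters
        return word[:i] + word[i + 1] + word[i] + word[i + 2:]
    return word[:i] + word[i] + word[i:]     # duplicate one character

if __name__ == '__main__':
    df = pd.read_csv('./dfs/df_all_words.csv')        # word list produced by scraper.py
    df['Error'] = df['word'].apply(inject_error)
    df = df.rename(columns={'word': 'Word'})
    df[['Word', 'Error']].to_csv('./dfs/synthetic_errors.csv', index=False)
```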
/Baselines/ConvSeq2Seq/errors.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from utils import word2char
3 |
4 |
5 | def error_df(df, error='Cognitive Error'):
6 | df = df.loc[df['ErrorType'] == error].copy()  # copy the filtered slice to avoid SettingWithCopyWarning below
7 | df['Word'] = df['Word'].apply(word2char)
8 | df['Error'] = df['Error'].apply(word2char)
9 | df = df.sample(frac=1).reset_index(drop=True)
10 | df = df.iloc[:, [1, 0]]
11 | df.to_csv('./Dataset/error.csv', index=False)
12 |
13 |
--------------------------------------------------------------------------------
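A minimal usage sketch for the error_df helper above (not part of the repository): it assumes the corpus CSV has Word, Error, and ErrorType columns, as implied by the README commands, and error_df writes the filtered, character-tokenized pairs to ./Dataset/error.csv.

```
import pandas as pd
from errors import error_df

# Hedged example: filter one error type, split words into characters,
# shuffle, and save the result to ./Dataset/error.csv (all done inside error_df).
corpus = pd.read_csv('./Dataset/corpus.csv')   # path taken from the README; column layout assumed
error_df(corpus, error='Cognitive Error')
```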
/Baselines/RuleBased/README.md:
--------------------------------------------------------------------------------
1 | RuleBased
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 | ## Training and Evaluation of RuleBased
21 | ```
22 | python rule_based.py
23 | ```
24 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/README.md:
--------------------------------------------------------------------------------
1 | ConvSeq2Seq
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 | ## Training and Evaluation of ConvSeq2Seq
21 | ```
22 | python main.py --CORPUS "./Dataset/corpus.csv" --EMB_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_KERNEL_SIZE 3 --DEC_KERNEL_SIZE 3 --ENC_DROPOUT 0.2 --DEC_DROPOUT 0.2 --CLIP 0.1 --BATCH_SIZE 256 --LEARNING_RATE 0.0005 --N_EPOCHS 100
23 | ```
24 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/README.md:
--------------------------------------------------------------------------------
1 | DTransformer
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 | ## Training and Evaluation of DTransformer
21 | ```
22 | python dtransformer.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
23 | ```
24 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/README.md:
--------------------------------------------------------------------------------
1 | GRUSeq2Seq
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 | ## Training and Evaluation of GRUSeq2Seq
21 | ```
22 | python main.py --CORPUS "./Dataset/corpus.csv" --ENC_EMB_DIM 128 --DEC_EMB_DIM 128 --ENC_HIDDEN_DIM 256 --DEC_HIDDEN_DIM 512 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --MAX_LEN 48 --CLIP 1 --BATCH_SIZE 256 --LEARNING_RATE 0.0005 --N_EPOCHS 100
23 | ```
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Mehedi Hasan Bijoy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Baselines/DCSpell/README.md:
--------------------------------------------------------------------------------
1 | DCSpell
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 |
21 | ## Training and Evaluation of DPCSpell
22 |
23 | ### Detector Network
24 |
25 | ```
26 | python detector.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
27 | ```
28 |
29 | ### Corrector Network
30 |
31 | ```
32 | python corrector.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
33 | ```
34 |
--------------------------------------------------------------------------------
/Baselines/README.md:
--------------------------------------------------------------------------------
1 | Baselines
2 |
3 |
36 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tqdm import tqdm
3 |
4 |
5 | def train(model, iterator, optimizer, criterion, clip):
6 | model.train()
7 | epoch_loss = 0
8 | for idx, batch in enumerate(tqdm(iterator)):
9 | src = batch.src
10 | trg = batch.trg
11 | optimizer.zero_grad()
12 | output, _ = model(src, trg[:, :-1])
13 | output_dim = output.shape[-1]
14 | output = output.contiguous().view(-1, output_dim)
15 | trg = trg[:, 1:].contiguous().view(-1)
16 | loss = criterion(output, trg)
17 | loss.backward()
18 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
19 | optimizer.step()
20 | epoch_loss += loss.item()
21 | return epoch_loss / len(iterator)
22 |
23 |
24 | def evaluate(model, iterator, criterion):
25 | model.eval()
26 | epoch_loss = 0
27 | with torch.no_grad():
28 | for idx, batch in enumerate(tqdm(iterator)):
29 | src = batch.src
30 | trg = batch.trg
31 | output, _ = model(src, trg[:, :-1])
32 | output_dim = output.shape[-1]
33 | output = output.contiguous().view(-1, output_dim)
34 | trg = trg[:, 1:].contiguous().view(-1)
35 | loss = criterion(output, trg)
36 | epoch_loss += loss.item()
37 | return epoch_loss / len(iterator)
38 |
39 |
40 | if __name__ == '__main__':
41 | pass
42 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def basic_tokenizer(text):
5 | return text.split()
6 |
7 |
8 | def word2char(word):
9 | w2c = [char for char in word]
10 | return ' '.join(w2c)
11 |
12 |
13 | def count_parameters(model):
14 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
15 |
16 |
17 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=30):
18 | model.eval()
19 | tokens = [src_field.init_token] + sentence + [src_field.eos_token]
20 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
21 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
22 | with torch.no_grad():
23 | encoder_conved, encoder_combined = model.encoder(src_tensor)
24 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
25 | for i in range(max_len):
26 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
27 | with torch.no_grad():
28 | output, attention = model.decoder(trg_tensor, encoder_conved, encoder_combined)
29 | pred_token = output.argmax(2)[:, -1].item()
30 | trg_indexes.append(pred_token)
31 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
32 | break
33 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
34 | return trg_tokens[1:], attention
35 |
36 |
37 | def save_model(model, train_loss, epoch, PATH):
38 | torch.save({
39 | 'epoch': epoch,
40 | 'model_state_dict': model.state_dict(),
41 | # 'optimizer_state_dict': optimizer.state_dict(),
42 | 'loss': train_loss
43 | }, PATH)
44 | print(f"---------\nModel Saved at {PATH}\n---------\n")
45 |
46 |
47 | def load_model(model, PATH):
48 | checkpoint = torch.load(PATH)
49 | model.load_state_dict(checkpoint['model_state_dict'])
50 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
51 | epoch = checkpoint['epoch']
52 | train_loss = checkpoint['loss']
53 | return checkpoint, epoch, train_loss
54 |
55 |
56 | if __name__ == '__main__':
57 | pass
--------------------------------------------------------------------------------
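A short checkpoint round-trip sketch for the save_model / load_model helpers above; the checkpoint path is hypothetical, and `model`, `epoch`, and `train_loss` are assumed to exist in the surrounding training script.

```
from utils import save_model, load_model

# Illustrative only: persist the current epoch and restore it later.
ckpt_path = './Checkpoints/convseq2seq.pth'     # hypothetical path
save_model(model, train_loss, epoch, ckpt_path)

checkpoint, epoch, train_loss = load_model(model, ckpt_path)   # weights are restored in place
```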
/process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from utils import word2char
3 | from tqdm import tqdm
4 |
5 |
6 | def check_from_left(word, error):
7 | left = []
8 | for i in range(len(error)):
9 | if error[i] == word[i]:
10 | left.append(0)
11 | else:
12 | left.append(1)
13 | return left
14 |
15 |
16 | def check_from_right(word, error):
17 | word = word[::-1]  # use reversed copies so the caller's lists are not mutated
18 | error = error[::-1]
19 | right = []
20 | for i in range(len(error)):
21 | if error[i] == word[i]:
22 | right.append(0)
23 | else:
24 | right.append(1)
25 | right.reverse()
26 | return right
27 |
28 |
29 | def check_from_both(word, error):
30 | length = len(error)
31 | if length % 2 == 0:
32 | iterator = length // 2
33 | else:
34 | iterator = (length // 2) + 1
35 |
36 | x = -1
37 |
38 | left = []
39 | right = []
40 |
41 | for i in range(iterator):
42 | if error[i] == word[i]:
43 | left.append(0)
44 | else:
45 | left.append(1)
46 |
47 | if error[x] == word[x]:
48 | right.append(0)
49 | else:
50 | right.append(1)
51 | x -= 1
52 |
53 | right.reverse()
54 | both = [*left, *right]
55 | return both
56 |
57 |
58 | if __name__ == '__main__':
59 | path = './Dataset/sec_dataset_III_v3.csv'
60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv')
61 | df_copy = df.copy()
62 | df['Word'] = df['Word'].apply(word2char)
63 | df['Error'] = df['Error'].apply(word2char)
64 |
65 | for idx in tqdm(range(len(df))):
66 | word = df.iloc[idx, 0].split()
67 | error = df.iloc[idx, 1].split()
68 | word = ['ব', 'া', 'ং', 'ল', 'া']
69 | error = ['ব', 'ং', 'ল', 'া']
70 | print(len(word), len(error))
71 | print(f'{word}\n{error}')
72 | # checking from left
73 | left = check_from_left(word, error)
74 | print(left)
75 | right = check_from_right(word, error)
76 | print(right)
77 | both = check_from_both(word, error)
78 | print(both)
79 | break
80 |
--------------------------------------------------------------------------------
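For reference, assuming the non-mutating check_from_right above, the hard-coded example in __main__ (word = বাংলা, error = বংলা, both split into characters) yields the following mismatch masks; this is a hand-traced illustration, not captured program output.

```
# word  = ['ব', 'া', 'ং', 'ল', 'া']
# error = ['ব', 'ং', 'ল', 'া']          (the 'া' after 'ব' is missing)
# check_from_left(word, error)  -> [0, 1, 1, 1]   # everything after the deletion misaligns
# check_from_right(word, error) -> [1, 0, 0, 0]   # aligning from the right isolates the error
# check_from_both(word, error)  -> [0, 1, 0, 0]   # half scanned from each end
```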
/Baselines/DCSpell/process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from utils import word2char
3 | from tqdm import tqdm
4 |
5 |
6 | def check_from_left(word, error):
7 | left = []
8 | for i in range(len(error)):
9 | if error[i] == word[i]:
10 | left.append(0)
11 | else:
12 | left.append(1)
13 | return left
14 |
15 |
16 | def check_from_right(word, error):
17 | word = word[::-1]  # use reversed copies so the caller's lists are not mutated
18 | error = error[::-1]
19 | right = []
20 | for i in range(len(error)):
21 | if error[i] == word[i]:
22 | right.append(0)
23 | else:
24 | right.append(1)
25 | right.reverse()
26 | return right
27 |
28 |
29 | def check_from_both(word, error):
30 | length = len(error)
31 | if length % 2 == 0:
32 | iterator = length // 2
33 | else:
34 | iterator = (length // 2) + 1
35 |
36 | x = -1
37 |
38 | left = []
39 | right = []
40 |
41 | for i in range(iterator):
42 | if error[i] == word[i]:
43 | left.append(0)
44 | else:
45 | left.append(1)
46 |
47 | if error[x] == word[x]:
48 | right.append(0)
49 | else:
50 | right.append(1)
51 | x -= 1
52 |
53 | right.reverse()
54 | both = [*left, *right]
55 | return both
56 |
57 |
58 | if __name__ == '__main__':
59 | path = './Dataset/sec_dataset_III_v3.csv'
60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv')
61 | df_copy = df.copy()
62 | df['Word'] = df['Word'].apply(word2char)
63 | df['Error'] = df['Error'].apply(word2char)
64 |
65 | for idx in tqdm(range(len(df))):
66 | word = df.iloc[idx, 0].split()
67 | error = df.iloc[idx, 1].split()
68 | word = ['ব', 'া', 'ং', 'ল', 'া']
69 | error = ['ব', 'ং', 'ল', 'া']
70 | print(len(word), len(error))
71 | print(f'{word}\n{error}')
72 | # checking from left
73 | left = check_from_left(word, error)
74 | print(left)
75 | right = check_from_right(word, error)
76 | print(right)
77 | both = check_from_both(word, error)
78 | print(both)
79 | break
80 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from utils import word2char
3 | from tqdm import tqdm
4 |
5 |
6 | def check_from_left(word, error):
7 | left = []
8 | for i in range(len(error)):
9 | if error[i] == word[i]:
10 | left.append(0)
11 | else:
12 | left.append(1)
13 | return left
14 |
15 |
16 | def check_from_right(word, error):
17 | word = word[::-1]  # use reversed copies so the caller's lists are not mutated
18 | error = error[::-1]
19 | right = []
20 | for i in range(len(error)):
21 | if error[i] == word[i]:
22 | right.append(0)
23 | else:
24 | right.append(1)
25 | right.reverse()
26 | return right
27 |
28 |
29 | def check_from_both(word, error):
30 | length = len(error)
31 | if length % 2 == 0:
32 | iterator = length // 2
33 | else:
34 | iterator = (length // 2) + 1
35 |
36 | x = -1
37 |
38 | left = []
39 | right = []
40 |
41 | for i in range(iterator):
42 | if error[i] == word[i]:
43 | left.append(0)
44 | else:
45 | left.append(1)
46 |
47 | if error[x] == word[x]:
48 | right.append(0)
49 | else:
50 | right.append(1)
51 | x -= 1
52 |
53 | right.reverse()
54 | both = [*left, *right]
55 | return both
56 |
57 |
58 | if __name__ == '__main__':
59 | path = './Dataset/sec_dataset_III_v3.csv'
60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv')
61 | df_copy = df.copy()
62 | df['Word'] = df['Word'].apply(word2char)
63 | df['Error'] = df['Error'].apply(word2char)
64 |
65 | for idx in tqdm(range(len(df))):
66 | word = df.iloc[idx, 0].split()
67 | error = df.iloc[idx, 1].split()
68 | word = ['ব', 'া', 'ং', 'ল', 'া']
69 | error = ['ব', 'ং', 'ল', 'া']
70 | print(len(word), len(error))
71 | print(f'{word}\n{error}')
72 | # checking from left
73 | left = check_from_left(word, error)
74 | print(left)
75 | right = check_from_right(word, error)
76 | print(right)
77 | both = check_from_both(word, error)
78 | print(both)
79 | break
80 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/metrics.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.ticker as ticker
14 | import matplotlib.font_manager as fm
15 |
16 | import numpy as np
17 | import math
18 | import time
19 | from sklearn import metrics
20 |
21 | import warnings as wrn
22 | wrn.filterwarnings('ignore')
23 |
24 |
25 | def beam_eval_report(trg_words, topk_prediction_list):
26 | y_true = np.array(trg_words)
27 | y_pred = np.array(topk_prediction_list)[:, 0]
28 |
29 | LABELS = np.array(sorted(set(y_true) | set(y_pred)))  # union of true and predicted labels (not used below)
30 |
31 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
32 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
33 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
34 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
35 | ACC = metrics.accuracy_score(y_true, y_pred)
36 |
37 | print("Evaluation report of beam decoding")
38 | print(f'''
39 | Top-1 (Beam Decoding)
40 | Precision: {PR:.4f}
41 | Recall: {RE:.4f}
42 | F1 Score: {F1:.4f}
43 | F0.5 Score: {F05:.4f}
44 | Accuracy: {ACC * 100:.2f}%
45 | ''')
46 |
47 |
48 | def greedy_eval_report(correct_words, predicted_words):
49 | y_true = np.array(correct_words)
50 | y_pred = np.array(predicted_words)
51 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
52 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
53 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
54 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
55 | ACC = metrics.accuracy_score(y_true, y_pred)
56 | print("Evaluation report of greedy decoding")
57 | print(f'''
58 | Top-1 (Greedy Decoding)
59 | Precision: {PR:.4f}
60 | Recall: {RE:.4f}
61 | F1 Score: {F1:.4f}
62 | F0.5 Score: {F05:.4f}
63 | Accuracy: {ACC * 100:.2f}%
64 | ''')
65 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/metrics.py:
--------------------------------------------------------------------------------
1 | from utils import translate_sentence
2 |
3 | from sklearn import metrics
4 | from tqdm import tqdm
5 | import pandas as pd
6 | import numpy as np
7 |
8 |
9 | def evaluation_report(test_data, SRC, TRG, model, DEVICE):
10 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
11 |
12 | modified_flags = []
13 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
14 | all_words = sorted(all_words.iloc[:, 0].values)
15 |
16 | for data in tqdm(test_data):
17 | src = data.src
18 | trg = data.trg
19 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
20 |
21 | src = ''.join(src)
22 | trg = ''.join(trg)
23 | pred = ''.join(translation[:-1])
24 |
25 | erroneous_words.append(src)
26 | predicted_words.append(pred)
27 | correct_words.append(trg)
28 |
29 | if trg == pred:
30 | flags.append(1)
31 | else:
32 | flags.append(0)
33 |
34 | if pred in all_words:
35 | modified_flags.append(1)
36 | else:
37 | modified_flags.append(0)
38 |
39 | evaluation_df = pd.DataFrame({
40 | 'Error': erroneous_words,
41 | 'Predicton': predicted_words,
42 | 'Target': correct_words,
43 | 'Correction': flags
44 | })
45 |
46 | corrected_instances = evaluation_df['Correction'].values.sum()
47 | total_instances = len(evaluation_df)
48 | accuracy = corrected_instances / total_instances
49 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
50 | # print(f"Accuracy: {accuracy * 100:.2f}%")
51 |
52 | y_true = np.array(correct_words)
53 | y_pred = np.array(predicted_words)
54 |
55 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
56 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
57 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
58 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
59 | ACC = metrics.accuracy_score(y_true, y_pred)
60 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
61 |
62 | print(f'''
63 | Top-1 (Greedy Decoding)
64 | Precision: {PR:.4f}
65 | Recall: {RE:.4f}
66 | F1 Score: {F1:.4f}
67 | F0.5 Score: {F05:.4f}
68 | Accuracy: {ACC * 100:.2f}%
69 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
70 | ''')
71 |
72 | # evaluation_df.to_csv('./Dataset/preds_convs2s.csv', index=False)
73 | return evaluation_df
74 |
75 |
76 | if __name__ == '__main__':
77 | pass
--------------------------------------------------------------------------------
/CorpusCreation/scraper.py:
--------------------------------------------------------------------------------
1 | import requests, bs4
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 |
6 | def word_accumulation():
7 | char_pages = {
8 | 'অ': 71, 'আ': 50, 'ই': 10, 'ঈ': 1, 'উ': 25, 'ঊ': 2, 'ঋ': 1, 'এ': 13, 'ঐ': 2, 'ও': 7, 'ঔ': 3,
9 | 'ক': 82, 'খ': 29, 'গ': 35, 'ঘ': 7, 'ঙ': 1, 'চ': 32, 'ছ': 12, 'জ': 28, 'ঝ': 8, 'ঞ': 1,
10 | 'ট': 16, 'ঠ': 4, 'ড': 12, 'ঢ': 6, 'ণ': 1, 'ত': 44, 'থ': 6, 'দ': 44, 'ধ': 13, 'ন': 52,
11 | 'প': 77, 'ফ': 16, 'ব': 90, 'ভ': 24, 'ম': 58, 'য': 11, 'র': 30, 'ল': 18, 'শ': 25, 'ষ': 3, 'স': 86, 'হ': 27
12 | }
13 |
14 | all_urls = {}
15 |
16 | url = 'https://accessibledictionary.gov.bd/bengali-to-bengali/'
17 |
18 | html_codes = requests.get(url).text
19 | document = bs4.BeautifulSoup(html_codes, 'lxml')
20 | alphabet_links = document.find('ul', class_='alphabet')
21 | items = alphabet_links.find_all('li')
22 |
23 | for item in items:
24 | url = str(item).split('"')[1]
25 | all_urls[url[-1:]] = url
26 |
27 | df_dict = {}
28 |
29 | for url in all_urls.values():
30 | no_of_pages = char_pages[url[-1:]]
31 | for idx in tqdm(range(1, no_of_pages + 1)):
32 | desired_url = url + '&page=' + str(idx)
33 | html_codes = requests.get(desired_url).text
34 | document = bs4.BeautifulSoup(html_codes, 'lxml')
35 | article = document.find('article', class_='dicDisplay')
36 | items = article.find_all('li')
37 |
38 | for item in items:
39 | text = item.get_text()
40 | text = text.split('Bengali Word')[1]
41 | text = text.split('Bengali definition')
42 | ben_word = text[0]
43 | ben_def = text[1]
44 | df_dict[ben_word] = ben_def
45 | # break
46 |
47 | df = pd.DataFrame(
48 | {
49 | 'Word': df_dict.keys(),
50 | 'Defination': df_dict.values()
51 | }
52 | )
53 | return df
54 |
55 |
56 | def get_len(word):
57 | return len(word)
58 |
59 |
60 | def text_preprocessing(df):
61 | all_chars = ['ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ',
62 | 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ',
63 | 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ',
64 | 'ষ', 'স', 'হ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ',
65 | 'ৗ', 'ড়', 'ঢ়', 'য়', ' ']
66 |
67 | words = ''
68 |
69 | df_words = ' '.join(df['Word'].values)
70 | for char in df_words:
71 | if char in all_chars:
72 | words += char
73 |
74 | words += ' '
75 |
76 | df_definations = ' '.join(df['Defination'].values)
77 | for char in df_definations:
78 | if char in all_chars:
79 | words += char
80 |
81 | words = sorted(list(set(words.split(' '))))
82 | df_all_words = pd.DataFrame({'word': words})
83 | df_all_words['len'] = df_all_words['word'].apply(get_len)
84 | df_all_words = df_all_words.loc[df_all_words['len'] > 2]
85 | return df_all_words
86 |
87 |
88 | if __name__ == '__main__':
89 | df = word_accumulation()
90 | df_all_words = text_preprocessing(df)
91 | df_all_words.to_csv('./dfs/df_all_words.csv', index=False)
92 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/check.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from tqdm import tqdm
4 |
5 |
6 | def within_topk(df, k):
7 | correct = df['Correct']
8 | topk = df.iloc[:, 1:k+1].values
9 | preds = 0
10 | # for idx in tqdm(range(len(df))):
11 | for idx in range(len(df)):
12 | if correct[idx] in topk[idx]:
13 | preds += 1
14 | acc_within_topk = preds / len(df)
15 | print(f"Within Top-{k} Acc: {acc_within_topk}")
16 |
17 |
18 | def modified_acc(df_allWords, df, k):
19 | df_allWords = sorted(df_allWords.iloc[:, 0].values)
20 | correct = df['Correct']
21 | topk = df.iloc[:, 1:k + 1].values
22 | preds = 0
23 | for words in tqdm(topk):
24 | for word in words:
25 | if word in df_allWords:
26 | preds += 1
27 | break
28 | modified_acc_within_topk = preds / len(df)
29 | print(f"Within Top-{k} Modified Acc: {modified_acc_within_topk}")
30 |
31 |
32 | def beam_report():
33 | print("""
34 | --------------------------------
35 | Beam Decoding Evaluation Report
36 | --------------------------------
37 | """)
38 | df_allWords = pd.read_csv('./Dataset/allDictWords_df.csv')
39 | df_beam = pd.read_csv('./Corrections/preds_beam_colab.csv')
40 | top1_acc = np.sum(df_beam['Pred-1'] == df_beam['Correct']) / len(df_beam)
41 | top2_acc = np.sum(df_beam['Pred-2'] == df_beam['Correct']) / len(df_beam)
42 | top3_acc = np.sum(df_beam['Pred-3'] == df_beam['Correct']) / len(df_beam)
43 | print(f"Top1 Acc: {top1_acc}")
44 | print(f"Top2 Acc: {top2_acc}")
45 | print(f"Top3 Acc: {top3_acc}\n")
46 | within_topk(df_beam, 1)
47 | within_topk(df_beam, 2)
48 | within_topk(df_beam, 3)
49 | modified_acc(df_allWords, df_beam, 1)
50 | modified_acc(df_allWords, df_beam, 2)
51 | modified_acc(df_allWords, df_beam, 3)
52 |
53 | def test():
54 | df = pd.read_csv('./Dataset/allDictWords_df.csv')
55 | words = sorted(df.iloc[:, 0].values)
56 | print(words)
57 | #
58 | # acc = (df_beam['Pred-1'] == df_beam['Correct'])*1 + \
59 | # (df_beam['Pred-2'] == df_beam['Correct'])*1 + \
60 | # (df_beam['Pred-3'] == df_beam['Correct'])*1
61 | # acc = acc.values
62 | # acc = [1 if x>0 else 0 for x in acc]
63 | # print(f"Accuracy: {np.sum(acc) / len(df_beam)}")
64 | #
65 | # df_dict = pd.read_csv('./Dataset/allDictWords_df.csv')
66 | # df_allWords = pd.read_csv('./Dataset/df_all_words.csv')
67 | # #
68 | # preds1 = []
69 | # for word in tqdm(df_beam['Pred-1'].values):
70 | # # similar_words = df_dict.loc[df_dict['word'].str.startswith(word)].iloc[:, 0].values
71 | # if word in df_allWords.iloc[:, 0].values:
72 | # preds1.append(1)
73 | # else:
74 | # preds1.append(0)
75 | # print(f"Modified Top1 Acc: {np.sum(preds1) / len(preds1)}")
76 | #
77 | # df_greedy = pd.read_csv('./Corrections/preds_greedy_colab.csv')
78 | # # print(df_greedy)
79 | # greedy_acc = np.sum(df_greedy['Predicton'] == df_greedy['Target'])/len(df_greedy)
80 | # print(f'Greedy Accuracy: {greedy_acc}')
81 | # preds = []
82 | # for word in tqdm(df_greedy['Predicton'].values):
83 | # if word in df_allWords.iloc[:, 0].values:
84 | # preds.append(1)
85 | # else:
86 | # preds.append(0)
87 | # print(f"Modified Greedy Accuracy: {np.sum(preds) / len(preds)}")
88 |
89 | if __name__ == '__main__':
90 | beam_report()
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/decoding.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.ticker as ticker
14 | import matplotlib.font_manager as fm
15 |
16 | import numpy as np
17 | import math
18 | import time
19 |
20 | import copy
21 | from heapq import heappush, heappop
22 |
23 | import warnings as wrn
24 | wrn.filterwarnings('ignore')
25 |
26 |
27 | class BeamSearchNode(object):
28 | def __init__(self, h, prev_node, wid, logp, length):
29 | self.h = h
30 | self.prev_node = prev_node
31 | self.wid = wid
32 | self.logp = logp
33 | self.length = length
34 |
35 | def eval(self):
36 | return self.logp / float(self.length - 1 + 1e-6)
37 |
38 |
39 | def beam_search_decoding(model, src, decoder, enc_outs, enc_last_h, beam_width, n_best, \
40 | sos_token, eos_token, max_dec_steps, device):
41 | assert beam_width >= n_best
42 | n_best_list = []
43 | bs = enc_outs.shape[1]
44 |
45 | for batch_id in range(bs):
46 | decoder_hidden = enc_last_h[batch_id]
47 | enc_out = enc_outs[:, batch_id].unsqueeze(1)
48 |
49 | # decoder_input = torch.tensor([sos_token].long().to(DEVICE))
50 | decoder_input = torch.tensor([sos_token]).to(device)
51 | end_nodes = []
52 |
53 | node = BeamSearchNode(h=decoder_hidden, prev_node=None, wid=decoder_input, logp=0, length=1)
54 | nodes = []
55 |
56 | heappush(nodes, (-node.eval(), id(node), node))
57 | n_dec_steps = 0
58 |
59 | while True:
60 | if n_dec_steps > max_dec_steps:
61 | break
62 |
63 | score, _, n = heappop(nodes)
64 | decoder_input = n.wid
65 | decoder_hidden = n.h
66 |
67 | if n.wid.item() == eos_token and n.prev_node is not None:
68 | end_nodes.append((score, id(n), n))
69 | if len(end_nodes) >= n_best:
70 | break
71 | else:
72 | continue
73 |
74 | mask = model.create_mask(src)
75 | decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden.unsqueeze(0), enc_out, mask)
76 |
77 | # restricting length
78 | topk_log_prob, topk_indexes = torch.topk(decoder_output, beam_width)
79 |
80 | for new_k in range(beam_width):
81 | decoded_t = topk_indexes[0][new_k].view(1)
82 | logp = topk_log_prob[0][new_k].item()
83 |
84 | node = BeamSearchNode(
85 | h=decoder_hidden.squeeze(0), prev_node=n, wid=decoded_t, logp=n.logp + logp, length=n.length + 1
86 | )
87 |
88 | heappush(nodes, (-node.eval(), id(node), node))
89 |
90 | n_dec_steps += beam_width
91 |
92 | if len(end_nodes) == 0:
93 | end_nodes = [heappop(nodes) for _ in range(beam_width)]
94 |
95 | n_best_seq_list = []
96 | for score, _id, n in sorted(end_nodes, key=lambda x: x[0]):
97 | sequence = [n.wid.item()]
98 | while n.prev_node is not None:
99 | n = n.prev_node
100 | sequence.append(n.wid.item())
101 | sequence = sequence[::-1]
102 | n_best_seq_list.append(sequence)
103 |
104 | n_best_list.append(n_best_seq_list)
105 |
106 | return n_best_list
107 |
108 |
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
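beam_search_decoding returns, for each batch element, n_best token-id sequences that still contain the sos/eos ids. A small post-processing sketch, assuming TRG is the torchtext Field holding the character vocabulary built elsewhere in this baseline:

```
# Illustrative sketch: convert beam hypotheses (lists of token ids) back into words.
def ids_to_word(ids, TRG):
    tokens = [TRG.vocab.itos[i] for i in ids]
    # drop <sos>/<eos>/<pad> markers and join the remaining characters
    tokens = [t for t in tokens if t not in (TRG.init_token, TRG.eos_token, TRG.pad_token)]
    return ''.join(tokens)

# n_best_list[b][k] is the k-th best hypothesis for batch element b:
# predictions = [[ids_to_word(seq, TRG) for seq in hyps] for hyps in n_best_list]
```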
/focalLoss.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 | # following:
9 | # https://github.com/kornia/kornia/
10 | # which is based on:
11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py
12 |
13 |
14 | def one_hot(
15 | labels: torch.Tensor,
16 | num_classes: int,
17 | device: Optional[torch.device] = None,
18 | dtype: Optional[torch.dtype] = None,
19 | eps: float = 1e-6,
20 | ) -> torch.Tensor:
21 |
22 | if not isinstance(labels, torch.Tensor):
23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}")
24 |
25 | if not labels.dtype == torch.int64:
26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}")
27 |
28 | if num_classes < 1:
29 | raise ValueError("The number of classes must be bigger than one." " Got: {}".format(num_classes))
30 |
31 | shape = labels.shape
32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype)
33 |
34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps
35 |
36 |
37 |
38 | def focal_loss(
39 | input: torch.Tensor,
40 | target: torch.Tensor,
41 | alpha: float,
42 | gamma: float = 2.0,
43 | reduction: str = 'none',
44 | eps: Optional[float] = None,
45 | ) -> torch.Tensor:
46 |
47 | if eps is not None and not torch.jit.is_scripting():
48 | warnings.warn(
49 | "`focal_loss` has been reworked for improved numerical stability "
50 | "and the `eps` argument is no longer necessary",
51 | DeprecationWarning,
52 | stacklevel=2,
53 | )
54 |
55 | if not isinstance(input, torch.Tensor):
56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}")
57 |
58 | if not len(input.shape) >= 2:
59 | raise ValueError(f"Invalid input shape, we expect BxCx*. Got: {input.shape}")
60 |
61 | if input.size(0) != target.size(0):
62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).')
63 |
64 | n = input.size(0)
65 | out_size = (n,) + input.size()[2:]
66 | if target.size()[1:] != input.size()[2:]:
67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}')
68 |
69 | if not input.device == target.device:
70 | raise ValueError(f"input and target must be in the same device. Got: {input.device} and {target.device}")
71 |
72 | # compute softmax over the classes axis
73 | input_soft: torch.Tensor = F.softmax(input, dim=1)
74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1)
75 |
76 | # create the labels one hot tensor
77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype)
78 |
79 | # compute the actual focal loss
80 | weight = torch.pow(-input_soft + 1.0, gamma)
81 |
82 | focal = -alpha * weight * log_input_soft
83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal))
84 |
85 | if reduction == 'none':
86 | loss = loss_tmp
87 | elif reduction == 'mean':
88 | loss = torch.mean(loss_tmp)
89 | elif reduction == 'sum':
90 | loss = torch.sum(loss_tmp)
91 | else:
92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}")
93 | return loss
94 |
95 |
96 | class FocalLoss(nn.Module):
97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None:
98 | super().__init__()
99 | self.alpha: float = alpha
100 | self.gamma: float = gamma
101 | self.reduction: str = reduction
102 | self.eps: Optional[float] = eps
103 |
104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps)
106 |
--------------------------------------------------------------------------------
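A minimal usage sketch for the FocalLoss module above (not from the repository): input must be raw logits of shape [batch, classes, ...] and target must be int64 class indices of shape [batch, ...]; the alpha value and the 66-class vocabulary size below are illustrative (66 is hinted at in pipeline.py's commented-out one-hot line).

```
import torch
from focalLoss import FocalLoss

# Hedged example: 4 sequences, 66 character classes, 10 positions each.
logits = torch.randn(4, 66, 10, requires_grad=True)   # [batch, classes, positions]
targets = torch.randint(0, 66, (4, 10))               # int64 class indices, [batch, positions]

criterion = FocalLoss(alpha=0.25, gamma=2.0, reduction='mean')
loss = criterion(logits, targets)
loss.backward()
print(loss.item())
```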
/Baselines/GRUSeq2Seq/focalLoss.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 | # following:
9 | # https://github.com/kornia/kornia/
10 | # which is based on:
11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py
12 |
13 |
14 | def one_hot(
15 | labels: torch.Tensor,
16 | num_classes: int,
17 | device: Optional[torch.device] = None,
18 | dtype: Optional[torch.dtype] = None,
19 | eps: float = 1e-6,
20 | ) -> torch.Tensor:
21 |
22 | if not isinstance(labels, torch.Tensor):
23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}")
24 |
25 | if not labels.dtype == torch.int64:
26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}")
27 |
28 | if num_classes < 1:
29 | raise ValueError("The number of classes must be bigger than one." " Got: {}".format(num_classes))
30 |
31 | shape = labels.shape
32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype)
33 |
34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps
35 |
36 |
37 |
38 | def focal_loss(
39 | input: torch.Tensor,
40 | target: torch.Tensor,
41 | alpha: float,
42 | gamma: float = 2.0,
43 | reduction: str = 'none',
44 | eps: Optional[float] = None,
45 | ) -> torch.Tensor:
46 |
47 | if eps is not None and not torch.jit.is_scripting():
48 | warnings.warn(
49 | "`focal_loss` has been reworked for improved numerical stability "
50 | "and the `eps` argument is no longer necessary",
51 | DeprecationWarning,
52 | stacklevel=2,
53 | )
54 |
55 | if not isinstance(input, torch.Tensor):
56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}")
57 |
58 | if not len(input.shape) >= 2:
59 | raise ValueError(f"Invalid input shape, we expect BxCx*. Got: {input.shape}")
60 |
61 | if input.size(0) != target.size(0):
62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).')
63 |
64 | n = input.size(0)
65 | out_size = (n,) + input.size()[2:]
66 | if target.size()[1:] != input.size()[2:]:
67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}')
68 |
69 | if not input.device == target.device:
70 | raise ValueError(f"input and target must be in the same device. Got: {input.device} and {target.device}")
71 |
72 | # compute softmax over the classes axis
73 | input_soft: torch.Tensor = F.softmax(input, dim=1)
74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1)
75 |
76 | # create the labels one hot tensor
77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype)
78 |
79 | # compute the actual focal loss
80 | weight = torch.pow(-input_soft + 1.0, gamma)
81 |
82 | focal = -alpha * weight * log_input_soft
83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal))
84 |
85 | if reduction == 'none':
86 | loss = loss_tmp
87 | elif reduction == 'mean':
88 | loss = torch.mean(loss_tmp)
89 | elif reduction == 'sum':
90 | loss = torch.sum(loss_tmp)
91 | else:
92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}")
93 | return loss
94 |
95 |
96 | class FocalLoss(nn.Module):
97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None:
98 | super().__init__()
99 | self.alpha: float = alpha
100 | self.gamma: float = gamma
101 | self.reduction: str = reduction
102 | self.eps: Optional[float] = eps
103 |
104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps)
106 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/focalLoss.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 | # following:
9 | # https://github.com/kornia/kornia/
10 | # which is based on:
11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py
12 |
13 |
14 | def one_hot(
15 | labels: torch.Tensor,
16 | num_classes: int,
17 | device: Optional[torch.device] = None,
18 | dtype: Optional[torch.dtype] = None,
19 | eps: float = 1e-6,
20 | ) -> torch.Tensor:
21 |
22 | if not isinstance(labels, torch.Tensor):
23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}")
24 |
25 | if not labels.dtype == torch.int64:
26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}")
27 |
28 | if num_classes < 1:
29 | raise ValueError("The number of classes must be bigger than one." " Got: {}".format(num_classes))
30 |
31 | shape = labels.shape
32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype)
33 |
34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps
35 |
36 |
37 |
38 | def focal_loss(
39 | input: torch.Tensor,
40 | target: torch.Tensor,
41 | alpha: float,
42 | gamma: float = 2.0,
43 | reduction: str = 'none',
44 | eps: Optional[float] = None,
45 | ) -> torch.Tensor:
46 |
47 | if eps is not None and not torch.jit.is_scripting():
48 | warnings.warn(
49 | "`focal_loss` has been reworked for improved numerical stability "
50 | "and the `eps` argument is no longer necessary",
51 | DeprecationWarning,
52 | stacklevel=2,
53 | )
54 |
55 | if not isinstance(input, torch.Tensor):
56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}")
57 |
58 | if not len(input.shape) >= 2:
59 | raise ValueError(f"Invalid input shape, we expect BxCx*. Got: {input.shape}")
60 |
61 | if input.size(0) != target.size(0):
62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).')
63 |
64 | n = input.size(0)
65 | out_size = (n,) + input.size()[2:]
66 | if target.size()[1:] != input.size()[2:]:
67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}')
68 |
69 | if not input.device == target.device:
70 | raise ValueError(f"input and target must be in the same device. Got: {input.device} and {target.device}")
71 |
72 | # compute softmax over the classes axis
73 | input_soft: torch.Tensor = F.softmax(input, dim=1)
74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1)
75 |
76 | # create the labels one hot tensor
77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype)
78 |
79 | # compute the actual focal loss
80 | weight = torch.pow(-input_soft + 1.0, gamma)
81 |
82 | focal = -alpha * weight * log_input_soft
83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal))
84 |
85 | if reduction == 'none':
86 | loss = loss_tmp
87 | elif reduction == 'mean':
88 | loss = torch.mean(loss_tmp)
89 | elif reduction == 'sum':
90 | loss = torch.sum(loss_tmp)
91 | else:
92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}")
93 | return loss
94 |
95 |
96 | class FocalLoss(nn.Module):
97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None:
98 | super().__init__()
99 | self.alpha: float = alpha
100 | self.gamma: float = gamma
101 | self.reduction: str = reduction
102 | self.eps: Optional[float] = eps
103 |
104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps)
106 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DPCSpell
2 |
3 | A transformer-based spelling error correction framework for Bangla and resource-scarce Indic languages (Computer Speech & Language)
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | ##
13 |
14 | ## How Does DPCSpell Work?
15 |
16 | 
17 |
18 | ## Running Test
19 | | Operating System | Requirement | Remark |
20 | | ------------- | ------------- | ------------- |
21 | | Ubuntu 16.04.7 LTS | requirements_u.yml | :heavy_check_mark: Successful |
22 | | Ubuntu 18.04.6 LTS (Google Colab) | requirements_c.txt | :heavy_check_mark: Successful* |
23 | | Windows 10 | requirements_w.yml | :heavy_check_mark: Successful |
24 |
25 |
26 |
27 | ## Get Started
28 |
29 | ```
30 | git clone https://github.com/mehedihasanbijoy/DPCSpell.git
31 | ```
32 | or manually **download** and **extract** the github repository of DPCSpell.
33 |
34 |
35 |
36 | ## Environment Setup
37 | ### Create A Virtual Environment
38 | ```
39 | conda env create -f requirements_u.yml (for Ubuntu 16.04.7 LTS)
40 | or
41 | conda env create -f requirements_w.yml (for Windows 10)
42 | ```
43 |
44 |
45 | ### Activate the Environment
46 | ```
47 | conda activate DPCSpell
48 | ```
49 |
50 |
51 |
52 | ## Prepare SEC Corpora
53 | ```
54 | gdown https://drive.google.com/drive/folders/1_sWSi-LFsvuYh9c5GBMDd4V6_uM8yYjH?usp=share_link -O ./Dataset --folder
55 | ```
56 |
57 | or manually download the folder from the link above and place the extracted files in ./Dataset/
58 |
59 |
60 |
61 |
62 | ## Training and Evaluation of DPCSpell
63 |
64 | ### Detector Network
65 |
66 | ```
67 | python detector.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
68 | ```
69 |
70 | ### Purificator Network
71 |
72 | ```
73 | python purificator.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
74 | ```
75 |
76 | ### Corrector Network
77 |
78 | ```
79 | python corrector.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
80 | ```
81 |
82 |
83 |
84 | ## Benchmarking Bangla SEC Task
85 |
86 | 
87 |
88 |
89 | ## BibTeX Entry and Citation Info
90 |
91 | ```
92 | @article{bijoy2024transformer,
93 | title={A transformer based spelling error correction framework for Bangla and resource scarce Indic languages},
94 | author={Bijoy, Mehedi Hasan and Hossain, Nahid and Islam, Salekul and Shatabda, Swakkhar},
95 | journal={Computer Speech \& Language},
96 | volume = {89},
97 | pages = {101703},
98 | year = {2025},
99 | issn = {0885-2308},
100 | doi = {https://doi.org/10.1016/j.csl.2024.101703},
101 | url = {https://www.sciencedirect.com/science/article/pii/S088523082400086X},
102 | publisher={Elsevier}
103 | }
104 | ```
105 |
--------------------------------------------------------------------------------
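The detector/purificator/corrector commands in the README above pass every hyperparameter as a CLI flag. The sketch below shows how such flags are typically parsed with argparse; the flag names and default values mirror the commands, but this is an assumption about the scripts' internals, not a copy of detector.py.

```
import argparse

# Illustrative parser: flag names and defaults mirror the README commands above.
parser = argparse.ArgumentParser(description='DPCSpell network (illustrative flag parser)')
parser.add_argument('--CORPUS', type=str, default='./Dataset/corpus.csv')
parser.add_argument('--HID_DIM', type=int, default=128)
parser.add_argument('--ENC_LAYERS', type=int, default=5)
parser.add_argument('--DEC_LAYERS', type=int, default=5)
parser.add_argument('--ENC_HEADS', type=int, default=8)
parser.add_argument('--DEC_HEADS', type=int, default=8)
parser.add_argument('--ENC_PF_DIM', type=int, default=256)
parser.add_argument('--DEC_PF_DIM', type=int, default=256)
parser.add_argument('--ENC_DROPOUT', type=float, default=0.1)
parser.add_argument('--DEC_DROPOUT', type=float, default=0.1)
parser.add_argument('--CLIP', type=float, default=1.0)
parser.add_argument('--LEARNING_RATE', type=float, default=0.0005)
parser.add_argument('--N_EPOCHS', type=int, default=100)
args = parser.parse_args()
```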
/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from tqdm import tqdm
4 | from utils import basic_tokenizer
5 | import matplotlib.pyplot as plt
6 | import matplotlib.ticker as ticker
7 | import matplotlib.font_manager as fm
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def train(model, iterator, optimizer, criterion, clip):
13 | model.train()
14 | epoch_loss = 0
15 | for idx, batch in enumerate(tqdm(iterator)):
16 | src = batch.src
17 | trg = batch.trg
18 |
19 | optimizer.zero_grad()
20 | output, _ = model(src, trg[:, :-1])
21 | # output = [batch size, trg len - 1, output dim]
22 | # trg = [batch size, trg len]
23 |
24 | output_dim = output.shape[-1]
25 | output = output.contiguous().view(-1, output_dim)
26 | trg = trg[:, 1:].contiguous().view(-1)
27 | # output = [batch size * trg len - 1, output dim]
28 | # trg = [batch size * trg len - 1]
29 |
30 | # trg one hot for BCEwLogits
31 | # trg = F.one_hot(trg, num_classes=66)
32 |
33 | loss = criterion(output, trg)
34 | loss.backward()
35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
36 | optimizer.step()
37 | epoch_loss += loss.item()
38 |
39 | return epoch_loss / len(iterator)
40 |
41 |
42 | def evaluate(model, iterator, criterion):
43 | model.eval()
44 | epoch_loss = 0
45 | with torch.no_grad():
46 | for idx, batch in enumerate(tqdm(iterator)):
47 | src = batch.src
48 | trg = batch.trg
49 |
50 | output, _ = model(src, trg[:, :-1])
51 | # output = [batch size, trg len - 1, output dim]
52 | # trg = [batch size, trg len]
53 |
54 | output_dim = output.shape[-1]
55 | output = output.contiguous().view(-1, output_dim)
56 | trg = trg[:, 1:].contiguous().view(-1)
57 | # output = [batch size * trg len - 1, output dim]
58 | # trg = [batch size * trg len - 1]
59 |
60 | loss = criterion(output, trg)
61 | epoch_loss += loss.item()
62 | return epoch_loss / len(iterator)
63 |
64 |
65 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
66 | model.eval()
67 |
68 | if isinstance(sentence, str):
69 | tokens = basic_tokenizer(sentence)
70 | else:
71 | tokens = sentence
72 |
73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token]
74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
76 | src_mask = model.make_src_mask(src_tensor)
77 |
78 | with torch.no_grad():
79 | enc_src = model.encoder(src_tensor, src_mask)
80 |
81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
82 |
83 | for i in range(max_len):
84 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
85 | trg_mask = model.make_trg_mask(trg_tensor)
86 |
87 | with torch.no_grad():
88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
89 |
90 | pred_token = output.argmax(2)[:, -1].item()
91 | trg_indexes.append(pred_token)
92 |
93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
94 | break
95 |
96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
97 | return trg_tokens[1:-1], attention
98 |
99 |
100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2):
101 | assert n_rows * n_cols == n_heads
102 |
103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf')
104 |
105 | fig = plt.figure(figsize=(15, 25))
106 | for i in range(n_heads):
107 | ax = fig.add_subplot(n_rows, n_cols, i + 1)
108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy()
109 | cax = ax.matshow(_attention, cmap='bone')
110 |
111 | ax.tick_params(labelsize=12)
112 | ax.set_xticklabels(
113 | [''] + [''] + [t for t in sentence] + [''],
114 | rotation=45, fontproperties=prop, fontsize=20
115 | )
116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20)
117 |
118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
120 |
121 | plt.show()
122 | plt.close()
123 |
124 |
125 | if __name__ == '__main__':
126 | pass
127 |
--------------------------------------------------------------------------------
/Baselines/DCSpell/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from tqdm import tqdm
4 | from utils import basic_tokenizer
5 | import matplotlib.pyplot as plt
6 | import matplotlib.ticker as ticker
7 | import matplotlib.font_manager as fm
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def train(model, iterator, optimizer, criterion, clip):
13 | model.train()
14 | epoch_loss = 0
15 | for idx, batch in enumerate(tqdm(iterator)):
16 | src = batch.src
17 | trg = batch.trg
18 |
19 | optimizer.zero_grad()
20 | output, _ = model(src, trg[:, :-1])
21 | # output = [batch size, trg len - 1, output dim]
22 | # trg = [batch size, trg len]
23 |
24 | output_dim = output.shape[-1]
25 | output = output.contiguous().view(-1, output_dim)
26 | trg = trg[:, 1:].contiguous().view(-1)
27 | # output = [batch size * trg len - 1, output dim]
28 | # trg = [batch size * trg len - 1]
29 |
30 | # trg one hot for BCEwLogits
31 | # trg = F.one_hot(trg, num_classes=66)
32 |
33 | loss = criterion(output, trg)
34 | loss.backward()
35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
36 | optimizer.step()
37 | epoch_loss += loss.item()
38 |
39 | return epoch_loss / len(iterator)
40 |
41 |
42 | def evaluate(model, iterator, criterion):
43 | model.eval()
44 | epoch_loss = 0
45 | with torch.no_grad():
46 | for idx, batch in enumerate(tqdm(iterator)):
47 | src = batch.src
48 | trg = batch.trg
49 |
50 | output, _ = model(src, trg[:, :-1])
51 | # output = [batch size, trg len - 1, output dim]
52 | # trg = [batch size, trg len]
53 |
54 | output_dim = output.shape[-1]
55 | output = output.contiguous().view(-1, output_dim)
56 | trg = trg[:, 1:].contiguous().view(-1)
57 | # output = [batch size * trg len - 1, output dim]
58 | # trg = [batch size * trg len - 1]
59 |
60 | loss = criterion(output, trg)
61 | epoch_loss += loss.item()
62 | return epoch_loss / len(iterator)
63 |
64 |
65 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
66 | model.eval()
67 |
68 | if isinstance(sentence, str):
69 | tokens = basic_tokenizer(sentence)
70 | else:
71 | tokens = sentence
72 |
73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token]
74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
76 | src_mask = model.make_src_mask(src_tensor)
77 |
78 | with torch.no_grad():
79 | enc_src = model.encoder(src_tensor, src_mask)
80 |
81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
82 |
83 | for i in range(max_len):
84 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
85 | trg_mask = model.make_trg_mask(trg_tensor)
86 |
87 | with torch.no_grad():
88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
89 |
90 | pred_token = output.argmax(2)[:, -1].item()
91 | trg_indexes.append(pred_token)
92 |
93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
94 | break
95 |
96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
97 | return trg_tokens[1:-1], attention
98 |
99 |
100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2):
101 | assert n_rows * n_cols == n_heads
102 |
103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf')
104 |
105 | fig = plt.figure(figsize=(15, 25))
106 | for i in range(n_heads):
107 | ax = fig.add_subplot(n_rows, n_cols, i + 1)
108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy()
109 | cax = ax.matshow(_attention, cmap='bone')
110 |
111 | ax.tick_params(labelsize=12)
112 | ax.set_xticklabels(
113 | [''] + [''] + [t for t in sentence] + [''],
114 | rotation=45, fontproperties=prop, fontsize=20
115 | )
116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20)
117 |
118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
120 |
121 | plt.show()
122 | plt.close()
123 |
124 |
125 | if __name__ == '__main__':
126 | pass
127 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from tqdm import tqdm
4 | from utils import basic_tokenizer
5 | import matplotlib.pyplot as plt
6 | import matplotlib.ticker as ticker
7 | import matplotlib.font_manager as fm
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def train(model, iterator, optimizer, criterion, clip):
13 | model.train()
14 | epoch_loss = 0
15 | for idx, batch in enumerate(tqdm(iterator)):
16 | src = batch.src
17 | trg = batch.trg
18 |
19 | optimizer.zero_grad()
20 | output, _ = model(src, trg[:, :-1])
21 | # output = [batch size, trg len - 1, output dim]
22 | # trg = [batch size, trg len]
23 |
24 | output_dim = output.shape[-1]
25 | output = output.contiguous().view(-1, output_dim)
26 | trg = trg[:, 1:].contiguous().view(-1)
27 | # output = [batch size * trg len - 1, output dim]
28 | # trg = [batch size * trg len - 1]
29 |
30 | # trg one hot for BCEwLogits
31 | # trg = F.one_hot(trg, num_classes=66)
32 |
33 | loss = criterion(output, trg)
34 | loss.backward()
35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
36 | optimizer.step()
37 | epoch_loss += loss.item()
38 |
39 | return epoch_loss / len(iterator)
40 |
41 |
42 | def evaluate(model, iterator, criterion):
43 | model.eval()
44 | epoch_loss = 0
45 | with torch.no_grad():
46 | for idx, batch in enumerate(tqdm(iterator)):
47 | src = batch.src
48 | trg = batch.trg
49 |
50 | output, _ = model(src, trg[:, :-1])
51 | # output = [batch size, trg len - 1, output dim]
52 | # trg = [batch size, trg len]
53 |
54 | output_dim = output.shape[-1]
55 | output = output.contiguous().view(-1, output_dim)
56 | trg = trg[:, 1:].contiguous().view(-1)
57 | # output = [batch size * trg len - 1, output dim]
58 | # trg = [batch size * trg len - 1]
59 |
60 | loss = criterion(output, trg)
61 | epoch_loss += loss.item()
62 | return epoch_loss / len(iterator)
63 |
64 |
65 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
66 | model.eval()
67 |
68 | if isinstance(sentence, str):
69 | tokens = basic_tokenizer(sentence)
70 | else:
71 | tokens = sentence
72 |
73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token]
74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
76 | src_mask = model.make_src_mask(src_tensor)
77 |
78 | with torch.no_grad():
79 | enc_src = model.encoder(src_tensor, src_mask)
80 |
81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
82 |
83 | for i in range(max_len):
84 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
85 | trg_mask = model.make_trg_mask(trg_tensor)
86 |
87 | with torch.no_grad():
88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
89 |
90 | pred_token = output.argmax(2)[:, -1].item()
91 | trg_indexes.append(pred_token)
92 |
93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
94 | break
95 |
96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
97 | return trg_tokens[1:-1], attention
98 |
99 |
100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2):
101 | assert n_rows * n_cols == n_heads
102 |
103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf')
104 |
105 | fig = plt.figure(figsize=(15, 25))
106 | for i in range(n_heads):
107 | ax = fig.add_subplot(n_rows, n_cols, i + 1)
108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy()
109 | cax = ax.matshow(_attention, cmap='bone')
110 |
111 | ax.tick_params(labelsize=12)
112 | ax.set_xticklabels(
113 | [''] + [''] + [t for t in sentence] + [''],
114 | rotation=45, fontproperties=prop, fontsize=20
115 | )
116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20)
117 |
118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
120 |
121 | plt.show()
122 | plt.close()
123 |
124 |
125 | if __name__ == '__main__':
126 | pass
127 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/inference.py:
--------------------------------------------------------------------------------
1 | from decoding import beam_search_decoding
2 | from metrics import beam_eval_report, greedy_eval_report
3 | from utils import print_n_best
4 | from utils import translate_sentence
5 |
6 | import torch, torch.nn as nn, torch.optim as optim
7 | import torch.nn.functional as F
8 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
9 | import random
10 | from tqdm import tqdm
11 | import pandas as pd
12 | from sklearn.model_selection import train_test_split
13 | import math
14 | import time
15 | # from torchtext.data.metrics import bleu_score
16 |
17 | import matplotlib.pyplot as plt
18 | import matplotlib.ticker as ticker
19 | import matplotlib.font_manager as fm
20 |
21 | import numpy as np
22 | import math
23 | import time
24 |
25 | import warnings as wrn
26 | wrn.filterwarnings('ignore')
27 |
28 |
29 | def test_beam(model, train_data, test_data, SRC, TRG, DEVICE):
30 | _, test_iterator = BucketIterator.splits(
31 | (train_data, test_data),
32 | batch_size=1,
33 | sort_within_batch=True,
34 | sort_key=lambda x: len(x.src),
35 | device=DEVICE
36 | )
37 |
38 | TRG_SOS_IDX = TRG.vocab.stoi[TRG.init_token]
39 | TRG_EOS_IDX = TRG.vocab.stoi[TRG.eos_token]
40 |
41 | src_words = []
42 | topk_prediction_list = []
43 | trg_words = []
44 | found_within_topk = []
45 | found_at_top1 = []
46 |
47 | model.eval()
48 | with torch.no_grad():
49 | for batch_id, batch in enumerate(tqdm(test_iterator)):
50 | src, src_len = batch.src
51 | trg = batch.trg
52 |
53 | src_word = "".join(SRC.vocab.itos[idx] for idx in src[:, 0][1:-1])
54 | trg_word = "".join(TRG.vocab.itos[idx] for idx in trg[:, 0][1:-1])
55 | # print(f'\nSRC: {src_word}')
56 | # print(f'\nTRG: {trg_word}')
57 |
58 | enc_outs, h = model.encoder(src, src_len)
59 | # print(enc_outs.shape, h.shape)
60 |
61 | # decoder, enc_outs, enc_last_h, beam_width, n_best, sos_token, eos_token, max_dec_steps, device
62 | decoded_seqs = beam_search_decoding(
63 |                 model=model,
64 |                 src=src,
65 | decoder=model.decoder,
66 | enc_outs=enc_outs,
67 | enc_last_h=h,
68 | beam_width=1,
69 | n_best=1,
70 | sos_token=TRG_SOS_IDX,
71 | eos_token=TRG_EOS_IDX,
72 | max_dec_steps=100,
73 | device=DEVICE
74 | )
75 | topk_preds = print_n_best(decoded_seqs[0], TRG.vocab.itos)
76 | # print(topk_preds)
77 |
78 | src_words.append(src_word)
79 | trg_words.append(trg_word)
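79 |             # repeat/truncate the n-best list so every row always has exactly three predictions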
80 | topk_prediction_list.append((topk_preds * 3)[:3])
81 |             found_within_topk.append(1 if trg_word in topk_preds else 0)
82 |             found_at_top1.append(1 if trg_word == topk_preds[0] else 0)
83 |
84 | # if batch_id == 100:
85 | # break
86 |
87 | topk_pred_df = pd.DataFrame({
88 | 'Error': src_words,
89 | 'Pred-1': np.array(topk_prediction_list)[:, 0],
90 | 'Pred-2': np.array(topk_prediction_list)[:, 1],
91 | 'Pred-3': np.array(topk_prediction_list)[:, 2],
92 | 'Correct': trg_words,
93 | 'Greedy': found_at_top1,
94 | 'Beam': found_within_topk
95 | })
96 | topk_pred_df.to_csv('./Corrections/preds_beam.csv', index=False)
97 |
98 | beam_eval_report(trg_words, topk_prediction_list)
99 |
100 |
101 | def test_greedy(test_data, SRC, TRG, model, DEVICE):
102 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
103 | for idx, data in enumerate(tqdm(test_data)):
104 | src = data.src
105 | trg = data.trg
106 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
107 |
108 | src = ''.join(src)
109 | trg = ''.join(trg)
110 | pred = ''.join(translation[:-1])
111 |
112 | erroneous_words.append(src)
113 | predicted_words.append(pred)
114 | correct_words.append(trg)
115 | if trg == pred:
116 | flags.append(1)
117 | else:
118 | flags.append(0)
119 |
120 | evaluation_df = pd.DataFrame({
121 | 'Error': erroneous_words,
122 |         'Prediction': predicted_words,
123 | 'Target': correct_words,
124 | 'Correction': flags
125 | })
126 | evaluation_df.to_csv('./Corrections/preds_greedy.csv', index=False)
127 |
128 | greedy_eval_report(correct_words, predicted_words)
129 |
130 |
131 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 | from utils import translate_sentence
12 | from sklearn import metrics
13 |
14 | import matplotlib.pyplot as plt
15 | import matplotlib.ticker as ticker
16 | import matplotlib.font_manager as fm
17 |
18 | import numpy as np
19 | import math
20 | import time
21 |
22 | import warnings as wrn
23 | wrn.filterwarnings('ignore')
24 |
25 |
26 | def train(model, iterator, optimizer, criterion, clip=1):
27 | model.train()
28 | epoch_loss = 0
29 | for idx, batch in enumerate(tqdm(iterator)):
30 | src, src_len = batch.src
31 | trg = batch.trg
32 |
33 | optimizer.zero_grad()
34 | output = model(src, src_len, trg)
35 | output_dim = output.shape[-1]
36 |
37 | output = output[1:].view(-1, output_dim)
38 | trg = trg[1:].view(-1)
39 |
40 | # print(f"output: {output.shape}, target: {trg.shape} \n\n{trg}")
41 |
42 | loss = criterion(output, trg)
43 | loss.backward()
44 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
45 | optimizer.step()
46 | epoch_loss += loss.item()
47 |
48 | return epoch_loss / len(iterator)
49 |
50 |
51 | def evaluate(model, iterator, criterion):
52 | model.eval()
53 | epoch_loss = 0
54 | with torch.no_grad():
55 | for idx, batch in enumerate(tqdm(iterator)):
56 | src, src_len = batch.src
57 | trg = batch.trg
58 |
59 | output = model(src, src_len, trg, 0)
60 |
61 | output_dim = output.shape[-1]
62 | output = output[1:].view(-1, output_dim)
63 | trg = trg[1:].view(-1)
64 |
65 | loss = criterion(output, trg)
66 | epoch_loss += loss.item()
67 |
68 | return epoch_loss / len(iterator)
69 |
70 |
71 | def test_accuracy(test_data, SRC, TRG, model, DEVICE):
72 | df = pd.read_csv('./Dataset/allDictWords_df.csv')
73 | # df = pd.read_csv('./Dataset/df_all_words.csv')
74 | all_words = sorted(df.iloc[:, 0].values)
75 |
76 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
77 | modified_flags = []
78 | for idx, data in enumerate(tqdm(test_data)):
79 | src = data.src
80 | trg = data.trg
81 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
82 |
83 | src = ''.join(src)
84 | trg = ''.join(trg)
85 | pred = ''.join(translation[:-1])
86 |
87 | erroneous_words.append(src)
88 | predicted_words.append(pred)
89 | correct_words.append(trg)
90 | if trg == pred:
91 | flags.append(1)
92 | else:
93 | flags.append(0)
94 |
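94 |         # "modified accuracy": a prediction counts as correct if it is any valid dictionary word, not necessarily the exact target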
95 |         if pred in all_words:
96 |             modified_flags.append(1)
97 |         else:
98 |             modified_flags.append(0)
99 |
100 | modified_acc = np.sum(modified_flags) / len(modified_flags)
101 |
102 | evaluation_df = pd.DataFrame({
103 | 'Error': erroneous_words,
104 |         'Prediction': predicted_words,
105 | 'Target': correct_words,
106 | 'Correction': flags
107 | })
108 | # evaluation_df.to_csv('/content/drive/MyDrive/Bangla Spell & Grammar Checker/Codes/GEDC/Seq2Seq/preds_greedy.csv', index=False)
109 |
110 | corrected_instances = evaluation_df['Correction'].values.sum()
111 | total_instances = len(evaluation_df)
112 | accuracy = corrected_instances / total_instances
113 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
114 | # print(f"Accuracy: {accuracy*100:.2f}%")
115 |
116 | y_true = np.array(correct_words)
117 | y_pred = np.array(predicted_words)
118 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
119 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
120 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
121 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
122 | ACC = metrics.accuracy_score(y_true, y_pred)
123 | print(f'''Top-1 (Greedy Decoding)
124 | Precision: {PR:.4f}
125 | Recall: {RE:.4f}
126 | F1 Score: {F1:.4f}
127 | F0.5 Score: {F05:.4f}
128 | Accuracy: {ACC * 100:.2f}%
129 | Modified Accuracy: {modified_acc * 100:.2f}%
130 | ''')
131 |
132 | return evaluation_df
133 |
134 | # evaluation_df.sample(10)
135 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/models.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.ticker as ticker
14 | import matplotlib.font_manager as fm
15 |
16 | import numpy as np
17 | import math
18 | import time
19 |
20 | import warnings as wrn
21 | wrn.filterwarnings('ignore')
22 |
23 |
24 | class Encoder(nn.Module):
25 | def __init__(self, input_dim, embed_dim, enc_hidden_dim, dec_hidden_dim, dropout):
26 | super().__init__()
27 | self.embedding = nn.Embedding(input_dim, embed_dim)
28 | self.rnn = nn.GRU(embed_dim, enc_hidden_dim, bidirectional=True)
29 | self.fc = nn.Linear(enc_hidden_dim * 2, dec_hidden_dim)
30 | self.dropout = nn.Dropout(dropout)
31 |
32 | def forward(self, src, src_len):
33 | embedded = self.dropout(self.embedding(src))
34 | packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.to('cpu'))
35 | packed_outputs, hidden = self.rnn(packed_embedded)
36 | outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)
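36 |         # concatenate the final forward and backward hidden states of the bidirectional GRU and project them to the decoder's hidden size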
37 | hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
38 | return outputs, hidden
39 |
40 |
41 | class Attention(nn.Module):
42 | def __init__(self, enc_hidden_dim, dec_hidden_dim):
43 | super().__init__()
44 | self.attn = nn.Linear((enc_hidden_dim*2) + dec_hidden_dim, dec_hidden_dim)
45 | self.v = nn.Linear(dec_hidden_dim, 1, bias=False)
46 |
47 | def forward(self, hidden, encoder_outputs, mask):
48 | batch_size = encoder_outputs.shape[1]
49 | src_len = encoder_outputs.shape[0]
50 | hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
51 | encoder_outputs = encoder_outputs.permute(1, 0, 2)
52 | energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
53 | attention = self.v(energy).squeeze(2)
54 | attention = attention.masked_fill(mask==0, -1e10)
55 | return F.softmax(attention, dim=1)
56 |
57 |
58 | class Decoder(nn.Module):
59 | def __init__(self, output_dim, embed_dim, enc_hidden_dim, dec_hidden_dim, dropout, attention):
60 | super().__init__()
61 | self.output_dim = output_dim
62 | self.attention = attention
63 | self.embedding = nn.Embedding(output_dim, embed_dim)
64 | self.rnn = nn.GRU((enc_hidden_dim*2) + embed_dim, dec_hidden_dim)
65 | self.fc_out = nn.Linear((enc_hidden_dim*2) + dec_hidden_dim + embed_dim, output_dim)
66 | self.dropout = nn.Dropout(dropout)
67 |
68 | def forward(self, input, hidden, encoder_outputs, mask):
69 | input = input.unsqueeze(0)
70 | embedded = self.dropout(self.embedding(input))
71 | a = self.attention(hidden, encoder_outputs, mask)
72 | a = a.unsqueeze(1)
73 | encoder_outputs = encoder_outputs.permute(1, 0, 2)
74 | weighted = torch.bmm(a, encoder_outputs)
75 | weighted = weighted.permute(1, 0, 2)
76 | rnn_input = torch.cat((embedded, weighted), dim=2)
77 |
78 | output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
79 |
80 | assert (output == hidden).all()
81 |
82 | embedded = embedded.squeeze(0)
83 | output = output.squeeze(0)
84 | weighted = weighted.squeeze(0)
85 |
86 | prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
87 |
88 | return prediction, hidden.squeeze(0), a.squeeze(1)
89 |
90 |
91 | class Seq2Seq(nn.Module):
92 | def __init__(self, encoder, decoder, src_pad_idx, device):
93 | super().__init__()
94 | self.encoder = encoder
95 | self.decoder = decoder
96 | self.src_pad_idx = src_pad_idx
97 | self.device = device
98 |
99 | def create_mask(self, src):
100 | mask = (src != self.src_pad_idx).permute(1, 0)
101 | return mask
102 |
103 | def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
104 | batch_size = src.shape[1]
105 | trg_len = trg.shape[0]
106 | trg_vocab_size = self.decoder.output_dim
107 |
108 | outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
109 |
110 | encoder_outputs, hidden = self.encoder(src, src_len)
111 | input = trg[0, :]
112 | mask = self.create_mask(src)
113 |
114 | for t in range(1, trg_len):
115 | output, hidden, _ = self.decoder(input, hidden, encoder_outputs, mask)
116 | outputs[t] = output
117 |
118 | top1 = output.argmax(1)
119 |
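119 |             # teacher forcing: with probability teacher_forcing_ratio feed the ground-truth token, otherwise feed the model's own prediction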
120 | input = trg[t] if random.random() < teacher_forcing_ratio else top1
121 |
122 | return outputs
123 |
--------------------------------------------------------------------------------
/Baselines/RuleBased/rule_based.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """PreviousEditDistanceBasedSpellChecker.ipynb
3 |
4 | Automatically generated by Colaboratory.
5 |
6 | Original file is located at
7 | https://colab.research.google.com/drive/1Kp3C18yaWmfhKJU_8294UKqfHrmLA1Ow
8 | """
9 |
10 | import pandas as pd
11 | import numpy as np
12 | from sklearn.model_selection import train_test_split
13 | from sklearn import metrics
14 | from tqdm import tqdm
15 | import warnings as wrn
16 |
17 | wrn.filterwarnings('ignore')
18 |
19 | def editDistance(str1, str2, m, n):
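19 |     # naive recursive edit distance (exponential time); kept for reference, the DP version below is what the script actually uses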
20 | if m == 0:
21 | return n
22 |
23 | if n == 0:
24 | return m
25 |
26 | if str1[m-1] == str2[n-1]:
27 | return editDistance(str1, str2, m-1, n-1)
28 |
29 | return 1 + min(editDistance(str1, str2, m, n-1), # Insert
30 | editDistance(str1, str2, m-1, n), # Remove
31 | editDistance(str1, str2, m-1, n-1) # Replace
32 | )
33 |
34 | # Dynamic Programming based
35 | def editDistDP(str1, str2, m, n):
36 | # Create a table to store results of subproblems
37 | dp = [[0 for x in range(n + 1)] for x in range(m + 1)]
38 |
39 | # Fill d[][] in bottom up manner
40 | for i in range(m + 1):
41 | for j in range(n + 1):
42 |
43 | # If first string is empty, only option is to
44 | # insert all characters of second string
45 | if i == 0:
46 | dp[i][j] = j # Min. operations = j
47 |
48 | # If second string is empty, only option is to
49 |             # remove all characters of first string
50 | elif j == 0:
51 | dp[i][j] = i # Min. operations = i
52 |
53 | # If last characters are same, ignore last char
54 | # and recur for remaining string
55 | elif str1[i-1] == str2[j-1]:
56 | dp[i][j] = dp[i-1][j-1]
57 |
58 |             # If last characters are different, consider all
59 | # possibilities and find minimum
60 | else:
61 | dp[i][j] = 1 + min(dp[i][j-1], # Insert
62 | dp[i-1][j], # Remove
63 | dp[i-1][j-1]) # Replace
64 |
65 | return dp[m][n]
66 |
67 |
68 | # Driver code
69 | # str1 = "sunday"
70 | # str2 = "saturday"
71 |
72 | # print(editDistDP(str1, str2, len(str1), len(str2)))
73 | # This code is contributed by Bhavya Jain
74 |
75 | df = pd.read_csv('./Dataset/corpus.csv')
76 | # df
77 |
78 | train_df, test_df = train_test_split(df, test_size=.15)
79 | train_df, valid_df = train_test_split(train_df, test_size=.05)
80 |
81 | # len(train_df), len(valid_df), len(test_df)
82 |
83 | erroneous_words = []
84 | actual_words = []
85 | calculated_words = []
86 |
87 | for i in tqdm(range(10000)):
88 | word = valid_df['Error'].values[i]
89 | # print(word)
90 |
91 | x = len(word)
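91 |     # shrink the prefix until at least one training word starts with it; those candidates are then ranked by edit distance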
92 | while True:
93 | temp_df = train_df['Word'].str.startswith(word[:x], na = False)
94 | temp_df = train_df[temp_df]
95 | if len(temp_df) != 0:
96 | break
97 | x -= 1
98 |
99 | if len(temp_df) > 100:
100 | temp_df = temp_df.sample(100)
101 |
102 | # print(temp_df)
103 |
104 | scores = []
105 | for temp_word in temp_df['Word'].values:
106 | # score = editDistance(word, temp_word, len(word), len(temp_word))
107 | score = editDistDP(word, temp_word, len(word), len(temp_word))
108 | scores.append(score)
109 |
110 | temp_df['Scores'] = scores
111 | temp_df = temp_df.sort_values(by=['Scores'], ascending=True)
112 |
113 | calculated = temp_df.iloc[0, 0]
114 |
115 | act_word = valid_df['Word'].values[i]
116 |
117 | erroneous_words.append(word)
118 | calculated_words.append(calculated)
119 | actual_words.append(act_word)
120 |
121 | if i % 100 == 0 and i > 0:
122 | x = pd.DataFrame({
123 | 'Error': erroneous_words,
124 | 'Actual': actual_words,
125 | 'Calculated': calculated_words
126 | })
127 | x.to_csv('./Dataset/ed_output.csv', index=False)
128 |
129 |
130 | # print(word, calculated)
131 | print(f"\n erroneous: {word}\n actual: {act_word}\n calculated: {calculated}")
132 |
133 | words = []
134 | for i in tqdm(range(len(df))):
135 | if df.iloc[i, 1] not in x['Error'].values:
136 | words.append(df.iloc[i, 0])
137 |
138 | # x = pd.DataFrame({
139 | # 'Error': erroneous_words,
140 | # 'Actual': actual_words,
141 | # 'Calculated': calculated_words
142 | # })
143 |
144 | acc_flags = []
145 | for i in range(len(x)):
146 | if x.iloc[i, 1] == x.iloc[i, -1]:
147 | acc_flags.append(1)
148 | else:
149 | acc_flags.append(0)
150 | x['EM'] = acc_flags
151 |
152 | train_df = df
153 | mod_acc_flags = []
154 | for pred in x['Calculated'].values:
155 | if pred in words:
156 | mod_acc_flags.append(1)
157 | else:
158 | mod_acc_flags.append(0)
159 | x['MA'] = mod_acc_flags
160 |
161 | y_true = np.array(x['Actual'].values)
162 | y_pred = np.array(x['Calculated'].values)
163 |
164 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
165 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
166 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
167 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
168 | ACC = metrics.accuracy_score(y_true, y_pred)
169 |
170 | print(f'Accuracy = {ACC*100:.2f}%')
171 | print(f'Precision = {PR:.4f}')
172 | print(f'Recall = {RE:.4f}')
173 | print(f'F1 Score = {F1:.4f}')
174 | print(f'F0.5 Score = {F05:.4f}')
175 |
176 |
177 |
178 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/models.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class Encoder(nn.Module):
7 | def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device, max_length=50):
8 | super().__init__()
9 | assert kernel_size % 2 == 1, "Kernel size should be odd in encoder"
10 | self.device = device
11 | self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
12 | self.tok_embedding = nn.Embedding(input_dim, emb_dim)
13 | self.pos_embedding = nn.Embedding(max_length, emb_dim)
14 | self.emb2hid = nn.Linear(emb_dim, hid_dim)
15 | self.hid2emb = nn.Linear(hid_dim, emb_dim)
16 | self.convs = nn.ModuleList([
17 | nn.Conv1d(
18 |                 in_channels=hid_dim, out_channels=2 * hid_dim, kernel_size=kernel_size, padding=(kernel_size - 1) // 2
19 | ) for _ in range(n_layers)
20 | ])
21 | self.dropout = nn.Dropout(dropout)
22 |
23 | def forward(self, src):
24 | batch_size = src.shape[0]
25 | src_len = src.shape[1]
26 | pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
27 | tok_embedded = self.tok_embedding(src)
28 | pos_embedded = self.pos_embedding(pos)
29 | embedded = self.dropout(tok_embedded + pos_embedded)
30 | conv_inp = self.emb2hid(embedded)
31 | conv_inp = conv_inp.permute(0, 2, 1)
32 |
33 | for idx, conv in enumerate(self.convs):
34 | conved = conv(self.dropout(conv_inp))
35 | conved = F.glu(conved, dim=1)
36 | conved = (conved + conv_inp) * self.scale
37 | conv_inp = conved
38 |
39 | conved = self.hid2emb(conved.permute(0, 2, 1))
40 | combined = (conved + embedded) * self.scale
41 | return conved, combined
42 |
43 |
44 | class Decoder(nn.Module):
45 | def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, \
46 | trg_pad_idx, device, max_length=50):
47 | super().__init__()
48 | self.kernel_size = kernel_size
49 | self.trg_pad_idx = trg_pad_idx
50 | self.device = device
51 | self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
52 |
53 | self.tok_embedding = nn.Embedding(output_dim, emb_dim)
54 | self.pos_embedding = nn.Embedding(max_length, emb_dim)
55 | self.emb2hid = nn.Linear(emb_dim, hid_dim)
56 | self.hid2emb = nn.Linear(hid_dim, emb_dim)
57 |
58 | self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
59 | self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
60 |
61 | self.fc_out = nn.Linear(emb_dim, output_dim)
62 | self.convs = nn.ModuleList([
63 | nn.Conv1d(
64 | in_channels=hid_dim, out_channels=2 * hid_dim, kernel_size=kernel_size
65 | ) for _ in range(n_layers)
66 | ])
67 | self.dropout = nn.Dropout(dropout)
68 |
69 | def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
70 | conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
71 | combined = (conved_emb + embedded) * self.scale
72 | energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
73 | attention = F.softmax(energy, dim=2)
74 | attended_encoding = torch.matmul(attention, encoder_combined)
75 | attended_encoding = self.attn_emb2hid(attended_encoding)
76 | attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
77 | return attention, attended_combined
78 |
79 | def forward(self, trg, encoder_conved, encoder_combined):
80 | batch_size = trg.shape[0]
81 | trg_len = trg.shape[1]
82 | pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
83 |
84 | tok_embedded = self.tok_embedding(trg)
85 | pos_embedded = self.pos_embedding(pos)
86 | embedded = self.dropout(tok_embedded + pos_embedded)
87 |
88 | conv_inp = self.emb2hid(embedded)
89 | conv_inp = conv_inp.permute(0, 2, 1)
90 |
91 | batch_size = conv_inp.shape[0]
92 | hid_dim = conv_inp.shape[1]
93 | for idx, conv in enumerate(self.convs):
94 | conv_inp = self.dropout(conv_inp)
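94 |             # left-pad with the target pad index so each convolution only sees the current and previous target positions (causal decoding)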
95 | padding = torch.zeros(
96 | batch_size, hid_dim, self.kernel_size - 1
97 | ).fill_(self.trg_pad_idx).to(self.device)
98 | padded_conv_inp = torch.cat((padding, conv_inp), dim=2)
99 | conved = conv(padded_conv_inp)
100 | conved = F.glu(conved, dim=1)
101 |
102 | attention, conved = self.calculate_attention(
103 | embedded, conved, encoder_conved, encoder_combined
104 | )
105 | conved = (conved + conv_inp) * self.scale
106 | conv_inp = conved
107 |
108 | conved = self.hid2emb(conved.permute(0, 2, 1))
109 | output = self.fc_out(self.dropout(conved))
110 | return output, attention
111 |
112 |
113 | class Seq2Seq(nn.Module):
114 | def __init__(self, encoder, decoder):
115 | super().__init__()
116 | self.encoder = encoder
117 | self.decoder = decoder
118 |
119 | def forward(self, src, trg):
120 | encoder_conved, encoder_combined = self.encoder(src)
121 | output, attention = self.decoder(trg, encoder_conved, encoder_combined)
122 | return output, attention
123 |
124 | if __name__ == '__main__':
125 | pass
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/utils.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.ticker as ticker
14 | import matplotlib.font_manager as fm
15 |
16 | import numpy as np
17 | import math
18 | import time
19 |
20 | import warnings as wrn
21 | wrn.filterwarnings('ignore')
22 |
23 |
24 | def word2chars(word):
25 | w2c = [char for char in word]
26 | return ' '.join(w2c)
27 |
28 |
29 | def df2train_test_dfs(df, test_size=0.15):
30 | df['Word'] = df['Word'].apply(word2chars)
31 | df['Error'] = df['Error'].apply(word2chars)
32 | df = df.sample(frac=1).reset_index(drop=True)
33 | df = df.iloc[:, [1, 0]]
34 | train_df, test_df = train_test_split(df, test_size=test_size)
35 | train_df.to_csv('./Dataset/train.csv', index=False)
36 | test_df.to_csv('./Dataset/test.csv', index=False)
37 |
38 |
39 | def df2train_valid_test_dfs(df, test_size=0.15):
40 | df['Word'] = df['Word'].apply(word2chars)
41 | df['Error'] = df['Error'].apply(word2chars)
42 | df = df.sample(frac=1).reset_index(drop=True)
43 | df = df.iloc[:, [1, 0]]
44 | train_df, test_df = train_test_split(df, test_size=test_size)
45 | train_df, valid_df = train_test_split(train_df, test_size=.05)
46 |
47 | train_df.to_csv('./Dataset/train.csv', index=False)
48 | valid_df.to_csv('./Dataset/valid.csv', index=False)
49 | test_df.to_csv('./Dataset/test.csv', index=False)
50 |
51 |
52 | def df2train_error_dfs(df, error='Cognitive Error', test_size=0.20):
53 | df['Word'] = df['Word'].apply(word2chars)
54 | df['Error'] = df['Error'].apply(word2chars)
55 | df = df.sample(frac=1).reset_index(drop=True)
56 | # df = df.iloc[:, [1, 0]]
57 | train_df, error_df = train_test_split(df, test_size=test_size)
58 | error_df = error_df.loc[error_df['ErrorType'] == error]
59 | train_df = train_df.iloc[:, [1, 0]]
60 | error_df = error_df.iloc[:, [1, 0]]
61 |
62 | train_df.to_csv('./Dataset/train.csv', index=False)
63 | error_df.to_csv('./Dataset/error.csv', index=False)
64 |
65 |
66 | def basic_tokenizer(text):
67 | return text.split()
68 |
69 |
70 | def init_weights(m):
71 | for name, param in m.named_parameters():
72 | if 'weight' in name:
73 | nn.init.normal_(param.data, mean=0, std=0.01)
74 | else:
75 | nn.init.constant_(param.data, 0)
76 |
77 |
78 | def count_parameters(model):
79 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
80 |
81 |
82 | def save_model(model, epoch, optimizer, train_loss, PATH):
83 | torch.save({
84 | 'epoch': epoch,
85 | 'model_state_dict': model.state_dict(),
86 | 'optimizer_state_dict': optimizer.state_dict(),
87 | 'loss': train_loss
88 | }, PATH)
89 | print(f"---------\nModel Saved at {PATH}\n---------\n")
90 |
91 |
92 | def load_model(model, optimizer, PATH):
93 | checkpoint = torch.load(PATH)
94 | model.load_state_dict(checkpoint['model_state_dict'])
95 | optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
96 | epoch = checkpoint['epoch']
97 | train_loss = checkpoint['loss']
98 | return checkpoint, epoch, train_loss
99 |
100 |
101 | def print_n_best(decoded_seq, itos):
102 | topk_preds = []
103 | for rank, seq in enumerate(decoded_seq):
104 | pred = "".join([itos[idx] for idx in seq[1:-1]])
105 | topk_preds.append(pred)
106 | # print(f'Out: Rank-{rank+1}: {pred}')
107 | return topk_preds
108 |
109 |
110 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=30):
111 | model.eval()
112 | tokens = [token for token in sentence]
113 |
114 | tokens = [src_field.init_token] + tokens + [src_field.eos_token]
115 |
116 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
117 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
118 | src_len = torch.LongTensor([len(src_indexes)])
119 |
120 | with torch.no_grad():
121 | encoder_outputs, hidden = model.encoder(src_tensor, src_len)
122 |
123 | mask = model.create_mask(src_tensor)
124 |
125 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
126 | attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)
127 |
128 | for i in range(max_len):
129 | trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
130 | with torch.no_grad():
131 | output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs, mask)
132 |
133 | attentions[i] = attention
134 |
135 | pred_token = output.argmax(1).item()
136 | trg_indexes.append(pred_token)
137 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
138 | break
139 |
140 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
141 | return trg_tokens[1:], attentions[:len(trg_tokens) - 1]
142 |
143 |
144 | def display_attention(sentence, translation, attention):
145 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf')
146 |
147 | fig = plt.figure(figsize=(7, 10))
148 | ax = fig.add_subplot(111)
149 |
150 | attention = attention.squeeze(1).cpu().detach().numpy()
151 |
152 | cax = ax.matshow(attention, cmap='bone')
153 |
154 | ax.tick_params(labelsize=15)
155 |
156 | x_ticks = [''] + [''] + [t.lower() for t in sentence] + ['']
157 | y_ticks = [''] + translation
158 |
159 | ax.set_xticklabels(x_ticks, rotation=0, fontproperties=prop, fontsize=20)
160 | ax.set_yticklabels(y_ticks, fontproperties=prop, fontsize=20)
161 |
162 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
163 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
164 |
165 | plt.show()
166 | plt.close()
167 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/errors.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | df2train_test_dfs, basic_tokenizer, init_weights, count_parameters,
3 | translate_sentence, display_attention, df2train_valid_test_dfs,
4 | save_model, load_model, df2train_error_dfs, word2chars
5 | )
6 | from models import Encoder, Decoder, Attention, Seq2Seq
7 | from pipeline import train, test_accuracy
8 | from inference import test_beam, test_greedy
9 | from focalLoss import FocalLoss
10 |
11 | import torch, torch.nn as nn, torch.optim as optim
12 | import torch.nn.functional as F
13 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
14 | import random
15 | from tqdm import tqdm
16 | import pandas as pd
17 | from sklearn.model_selection import train_test_split
18 | import math
19 | import time
20 | # from torchtext.data.metrics import bleu_score
21 |
22 | import matplotlib.pyplot as plt
23 | import matplotlib.ticker as ticker
24 | import matplotlib.font_manager as fm
25 |
26 | import numpy as np
27 | import math
28 | import time
29 | import sys
30 |
31 | import warnings as wrn
32 | wrn.filterwarnings('ignore')
33 |
34 |
35 | def error_df(df, error='Cognitive Error'):
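35 |     # keep only rows of the requested error type, split words into space-separated characters, and write them to error.csv for per-error-type evaluation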
36 | df = df.loc[df['ErrorType'] == error]
37 | df['Word'] = df['Word'].apply(word2chars)
38 | df['Error'] = df['Error'].apply(word2chars)
39 | df = df.sample(frac=1).reset_index(drop=True)
40 | df = df.iloc[:, [1, 0]]
41 | df.to_csv('./Dataset/error.csv', index=False)
42 |
43 |
44 | def check_error():
45 | df = pd.read_csv('./Dataset/sec_dataset_II.csv')
46 | df = df.iloc[:, :]
47 | # df2train_test_dfs(df=df, test_size=0.15)
48 | df2train_valid_test_dfs(df=df, test_size=0.15)
49 |
50 | # ['Cognitive Error', 'Homonym Error', 'Run-on Error',
51 | # 'Split-word Error (Left)', 'Split-word Error (Random)',
52 | # 'Split-word Error (Right)', 'Split-word Error (both)',
53 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition',
54 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition',
55 | # 'Visual Error', 'Visual Error (Combined Character)']
56 | error_name = 'Cognitive Error'
57 | error_df(df, error_name)
58 | # df2train_error_dfs(df, error='Cognitive Error')
59 | # sys.exit()
60 |
61 | SRC = Field(
62 | tokenize=basic_tokenizer, lower=False,
63 | init_token='', eos_token='',
64 | sequential=True, use_vocab=True, include_lengths=True
65 | )
66 | TRG = Field(
67 | tokenize=basic_tokenizer, lower=False,
68 | init_token='', eos_token='',
69 | sequential=True, use_vocab=True
70 | )
71 | fields = {
72 | 'Error': ('src', SRC),
73 | 'Word': ('trg', TRG)
74 | }
75 | train_data, valid_data, test_data = TabularDataset.splits(
76 | path='./Dataset',
77 | train='train.csv',
78 | validation='valid.csv',
79 | test='test.csv',
80 | format='csv',
81 | fields=fields
82 | )
83 | error_data, _ = TabularDataset.splits(
84 | path='./Dataset',
85 | train='error.csv',
86 | test='error.csv',
87 | format='csv',
88 | fields=fields
89 | )
90 |
91 | # print(error_data)
92 | # sys.exit()
93 |
94 | SRC.build_vocab(train_data, max_size=64, min_freq=100)
95 | TRG.build_vocab(train_data, max_size=64, min_freq=75)
96 | # print(len(SRC.vocab), len(TRG.vocab))
97 |
98 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
99 | BATCH_SIZE = 256
100 | INPUT_DIM = len(SRC.vocab)
101 | OUTPUT_DIM = len(TRG.vocab)
102 | ENC_EMB_DIM = 64
103 | DEC_EMB_DIM = 64
104 | ENC_HIDDEN_DIM = 256
105 | DEC_HIDDEN_DIM = 512
106 | ENC_DROPOUT = 0.25
107 | DEC_DROPOUT = 0.25
108 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
109 | MAX_LEN = 32
110 | N_EPOCHS = 10
111 | CLIP = 1
112 | PATH = ''
113 |
114 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
115 | (train_data, valid_data, test_data),
116 | batch_size=BATCH_SIZE,
117 | sort_within_batch=True,
118 | sort_key=lambda x: len(x.src),
119 | device=DEVICE
120 | )
121 | error_iterator, _ = BucketIterator.splits(
122 | (error_data, error_data),
123 | batch_size=BATCH_SIZE,
124 | sort_within_batch=True,
125 | sort_key=lambda x: len(x.src),
126 | device=DEVICE
127 | )
128 |
129 | attention = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM)
130 | encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, ENC_DROPOUT)
131 | decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DEC_DROPOUT, attention)
132 |
133 | model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, DEVICE).to(DEVICE)
134 | model.apply(init_weights)
135 | # print(model)
136 | # print(f'The model has {count_parameters(model):,} trainable parameters')
137 |
138 | optimizer = optim.Adam(model.parameters())
139 | # scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=4)
140 |
141 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
142 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
143 | # criterion = nn.NLLLoss(ignore_index=TRG_PAD_IDX)
144 | # criterion = FocalLoss(alpha=0.5, gamma=2.0, reduction='mean')
145 |
146 | PATH = './Checkpoints/spell_s2s.pth'
147 | # best_loss = 1e10
148 |
149 | checkpoint, epoch, train_loss = load_model(model, optimizer, PATH)
150 | # best_loss = train_loss
151 | error_df_ = pd.read_csv('./Dataset/error.csv')
152 | error_pct = (len(error_df_) / len(df)) * 100
153 |
154 | print(f"\n------------\nError Name: {error_name} - {error_pct:.2f}% of dataset\n------------")
155 | test_accuracy(error_data, SRC, TRG, model, DEVICE)
156 |
157 |
158 | # test_beam(model, train_data, test_data, SRC, TRG, DEVICE)
159 | # test_greedy(test_data, SRC, TRG, model, DEVICE)
160 |
161 | # example_idx = 1
162 | # src = vars(train_data.examples[example_idx])['src']
163 | # trg = vars(train_data.examples[example_idx])['trg']
164 | # print(f'src = {src}')
165 | # print(f'trg = {trg}')
166 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
167 | # print(f'predicted trg = {translation}')
168 | # display_attention(src, translation, attention)
169 |
170 |
171 | if __name__ == '__main__':
172 | check_error()
173 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/main.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | basic_tokenizer, word2char, count_parameters, translate_sentence,
3 | save_model, load_model
4 | )
5 | from errors import error_df
6 | from models import Encoder, Decoder, Seq2Seq
7 | from pipeline import train, evaluate
8 | from metrics import evaluation_report
9 |
10 | import torch
11 | import torch.optim as optim
12 | import torch.nn as nn
13 | import pandas as pd
14 | from sklearn.model_selection import train_test_split
15 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
16 | import os
17 | import argparse
18 |
19 | import warnings as wrn
20 | wrn.filterwarnings('ignore')
21 |
22 |
23 | def main():
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus2.csv",
26 | choices=["./Dataset/corpus.csv", "./Dataset/corpus2.csv"]
27 | )
28 | parser.add_argument("--EMB_DIM", help="Embedding Dimension", type=int, default=128, choices=[64, 128, 256])
29 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=256, choices=[64, 128, 256])
30 |     parser.add_argument("--ENC_LAYERS", help="Encoder Layers", type=int, default=5, choices=[5, 10, 20])
31 |     parser.add_argument("--DEC_LAYERS", help="Decoder Layers", type=int, default=5, choices=[5, 10, 20])
32 | parser.add_argument("--ENC_KERNEL_SIZE", help="Encoder Kernel Size", type=int, default=3, choices=[3, 5, 10])
33 | parser.add_argument("--DEC_KERNEL_SIZE", help="Decoder Kernel Size", type=int, default=3, choices=[3, 5, 10])
34 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=.2, choices=[.1, .2, .5])
35 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=.2, choices=[.1, .2, .5])
36 | parser.add_argument("--CLIP", help="Gradient Clipping", type=float, default=0.1, choices=[0.1, 0.2, 0.5, 1])
37 | parser.add_argument("--BATCH_SIZE", help="Batch Size", type=int, default=256, choices=[256, 512])
38 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
39 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
40 | args = parser.parse_args()
41 |
42 | df = pd.read_csv(args.CORPUS)
43 | df['Word'] = df['Word'].apply(word2char)
44 | df['Error'] = df['Error'].apply(word2char)
45 | df = df.sample(frac=1).reset_index(drop=True)
46 | df = df[['Error', 'Word']]
47 |
48 | train_df, test_df = train_test_split(df, test_size=.15)
49 | train_df, valid_df = train_test_split(train_df, test_size=.05)
50 |
51 | train_df.to_csv('./Dataset/train.csv', index=False)
52 | valid_df.to_csv('./Dataset/valid.csv', index=False)
53 | test_df.to_csv('./Dataset/test.csv', index=False)
54 |
55 | SRC = Field(
56 | tokenize=basic_tokenizer, lower=False,
57 | init_token='', eos_token='', batch_first=True
58 | )
59 | TRG = Field(
60 | tokenize=basic_tokenizer, lower=False,
61 | init_token='', eos_token='', batch_first=True
62 | )
63 | fields = {
64 | 'Error': ('src', SRC),
65 | 'Word': ('trg', TRG)
66 | }
67 |
68 | train_data, valid_data, test_data = TabularDataset.splits(
69 | path='./Dataset',
70 | train='train.csv',
71 | validation='valid.csv',
72 | test='test.csv',
73 | format='csv',
74 | fields=fields
75 | )
76 |
77 | SRC.build_vocab(train_data, min_freq=100)
78 | TRG.build_vocab(train_data, min_freq=50)
79 |
80 | # Hyperparameters
81 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
82 | BATCH_SIZE = args.BATCH_SIZE
83 | #
84 | INPUT_DIM = len(SRC.vocab)
85 | OUTPUT_DIM = len(TRG.vocab)
86 | EMB_DIM = args.EMB_DIM # 64
87 | HID_DIM = args.HID_DIM # 256 # each conv. layer has 2 * hid_dim filters
88 | ENC_LAYERS = args.ENC_LAYERS # 10 # number of conv. blocks in encoder
89 | DEC_LAYERS = args.DEC_LAYERS # 10 # number of conv. blocks in decoder
90 | ENC_KERNEL_SIZE = args.ENC_KERNEL_SIZE # must be odd!
91 | DEC_KERNEL_SIZE = args.DEC_KERNEL_SIZE # can be even or odd
92 | ENC_DROPOUT = args.ENC_DROPOUT
93 | DEC_DROPOUT = args.DEC_DROPOUT
94 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
95 |     CLIP = args.CLIP
95 |     N_EPOCHS = args.N_EPOCHS  # number of training epochs for the loop below
96 | PATH = './Checkpoints/conv_s2s.pth'
97 |
98 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
99 | (train_data, valid_data, test_data),
100 | batch_size=BATCH_SIZE,
101 | sort_within_batch=True,
102 | sort_key=lambda x: len(x.src),
103 | device=DEVICE
104 | )
105 |
106 | enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, DEVICE)
107 | dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, TRG_PAD_IDX, DEVICE)
108 | model = Seq2Seq(enc, dec).to(DEVICE)
109 | # print(f'The model has {count_parameters(model):,} trainable parameters')
110 |
111 | optimizer = optim.Adam(model.parameters())
112 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
113 |
114 | epoch = 1
115 | # load the model
116 | if os.path.exists(PATH):
117 | checkpoint, epoch, train_loss = load_model(model, PATH)
118 | #
119 | best_loss = 1e10
120 |
121 | for epoch in range(epoch, N_EPOCHS):
122 | print(f"Epoch: {epoch} / {N_EPOCHS}")
123 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
124 | print(f"Train Loss: {train_loss:.4f}")
125 | if train_loss < best_loss:
126 | best_loss = train_loss
127 | save_model(model, train_loss, epoch, PATH)
128 |
129 | # example_idx = 10
130 | # src = vars(train_data.examples[example_idx])['src']
131 | # trg = vars(train_data.examples[example_idx])['trg']
132 | # print(f'src = {src}')
133 | # print(f'trg = {trg}')
134 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
135 | # print(f'predicted trg = {translation}')
136 |
137 | evaluation_report(valid_data, SRC, TRG, model, DEVICE)
138 | # evaluation_report(error_data, SRC, TRG, model, DEVICE)
139 |
140 |
141 | # -------------
142 | # error_types = ['Cognitive Error', 'Homonym Error', 'Run-on Error',
143 | # 'Split-word Error (Left)', 'Split-word Error (Random)',
144 | # 'Split-word Error (Right)', 'Split-word Error (both)',
145 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition',
146 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition',
147 | # 'Visual Error', 'Visual Error (Combined Character)']
148 |
149 | # for error_name in error_types:
150 | # print(f'------\nError Type: {error_name}\n------')
151 | # error_df(df_copy, error_name)
152 |
153 | # error_data, _ = TabularDataset.splits(
154 | # path='./Dataset',
155 | # train='error.csv',
156 | # test='error.csv',
157 | # format='csv',
158 | # fields=fields
159 | # )
160 |
161 | # eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE)
162 |
163 | # error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
164 | # eval_df.to_csv(f'./Dataframes/convs2s_{error_name}_2.csv')
165 | # print('\n\n')
166 | # -------------
167 |
168 |
169 | if __name__ == '__main__':
170 | main()
171 |
--------------------------------------------------------------------------------
/corrector.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | word2char, basic_tokenizer, count_parameters, initialize_weights,
3 | save_model, load_model, error_df, train_valid_test_df, mask2str,
4 | error_blank, find_len, error_df_2
5 | )
6 | from transformer import (
7 | Encoder, EncoderLayer, MultiHeadAttentionLayer,
8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer,
9 | Seq2Seq
10 | )
11 | from pipeline import train, evaluate
12 | from metrics import evaluation_report
13 |
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
17 | import torch
18 | import torch.nn as nn
19 | import os
20 | import gc
21 | from tqdm import tqdm
22 | import sys
23 | import argparse
24 |
25 | import warnings as wrn
26 | wrn.filterwarnings('ignore')
27 |
29 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
30 |
31 |
32 |
33 | def main():
34 | parser = argparse.ArgumentParser()
35 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256])
36 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7])
37 |     parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int, default=3, choices=[3, 5, 7])
38 |     parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heads", type=int, default=8, choices=[4, 6, 8])
39 |     parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heads", type=int, default=8, choices=[4, 6, 8])
40 |     parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
41 |     parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
42 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
43 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
44 |     parser.add_argument("--CLIP", help="Gradient Clipping Threshold", type=float, default=1, choices=[.1, 1, 10])
45 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
46 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
47 | args = parser.parse_args()
48 |
49 | SEED = 1234
50 | torch.manual_seed(SEED)
51 | torch.cuda.manual_seed(SEED)
52 |
53 | df = pd.read_csv('./Dataset/purificator_preds.csv')
54 | df_copy = df.copy()
55 | df['Word'] = df['Word'].apply(word2char)
56 | df['Error'] = df['Error'].apply(word2char)
57 | df['ErrorBlanksActual'] = df['ErrorBlanksActual'].apply(word2char)
58 | df['ErrorBlanksPredD1'] = df['ErrorBlanksPredD1'].apply(word2char)
59 | df['ErrorBlanksPredD2'] = df['ErrorBlanksPredD2'].apply(word2char)
60 |
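60 |     # build the corrector's source sequence: the erroneous word concatenated with the upstream ErrorBlanksPredD2 mask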
61 | df['MaskErrorBlank'] = ' ' + df['Error'] + ' ' + df['ErrorBlanksPredD2'] + ' '
62 | df['Length'] = df['MaskErrorBlank'].apply(find_len)
63 |
64 | df = df.loc[df['Length'] <= 48] # 48 works
65 |
66 | # df = df.iloc[:, [1, -2, 8]] # word - maskerrorblank - errortype
67 | df = df[['Word', 'MaskErrorBlank', 'ErrorType']]
68 |
69 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=.15, valid_size=.05)
70 |
71 | train_df.to_csv('./Dataset/train.csv', index=False)
72 | valid_df.to_csv('./Dataset/valid.csv', index=False)
73 | test_df.to_csv('./Dataset/test.csv', index=False)
74 |
75 | SRC = Field(
76 | tokenize=basic_tokenizer, lower=False,
77 | init_token='', eos_token='', batch_first=True
78 | )
79 | TRG = Field(
80 | tokenize=basic_tokenizer, lower=False,
81 | init_token='', eos_token='', batch_first=True
82 | )
83 | fields = {
84 | 'MaskErrorBlank': ('src', SRC),
85 | 'Word': ('trg', TRG)
86 | }
87 |
88 | train_data, valid_data, test_data = TabularDataset.splits(
89 | path='./Dataset',
90 | train='train.csv',
91 | validation='valid.csv',
92 | test='test.csv',
93 | format='csv',
94 | fields=fields
95 | )
96 |
97 | SRC.build_vocab(train_data, min_freq=100)
98 | TRG.build_vocab(train_data, min_freq=50)
99 |
100 |
101 | # ------------------------------
102 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
103 | BATCH_SIZE = 512 # 512
104 | # ------------------------------
105 | INPUT_DIM = len(SRC.vocab)
106 | OUTPUT_DIM = len(TRG.vocab)
107 | # ------------------------------
108 | HID_DIM = int(args.HID_DIM)
109 | ENC_LAYERS = int(args.ENC_LAYERS)
110 | DEC_LAYERS = int(args.DEC_LAYERS)
111 | ENC_HEADS = int(args.ENC_HEADS)
112 | DEC_HEADS = int(args.DEC_HEADS)
113 | ENC_PF_DIM = int(args.ENC_PF_DIM)
114 | DEC_PF_DIM = int(args.DEC_PF_DIM)
115 | ENC_DROPOUT = float(args.ENC_DROPOUT)
116 | DEC_DROPOUT = float(args.DEC_DROPOUT)
117 | CLIP = float(args.CLIP)
118 | N_EPOCHS = int(args.N_EPOCHS)
119 | LEARNING_RATE = float(args.LEARNING_RATE)
120 | # ------------------------------
121 | PATH = './Checkpoints/corrector.pth'
122 | # ------------------------------
123 | gc.collect()
124 | torch.cuda.empty_cache()
125 | # -----------------------------
126 |
127 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
128 | (train_data, valid_data, test_data),
129 | batch_size=BATCH_SIZE,
130 | sort_within_batch=True,
131 | sort_key=lambda x: len(x.src),
132 | device=DEVICE
133 | )
134 |
135 | enc = Encoder(
136 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
137 | ENC_DROPOUT, DEVICE
138 | )
139 | dec = Decoder(
140 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
141 | DEC_DROPOUT, DEVICE
142 | )
143 |
144 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
145 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
146 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)
147 | model.apply(initialize_weights)
148 | # print(f'The model has {count_parameters(model):,} trainable parameters')
149 |
150 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
151 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
152 | # criterion = nn.BCEWithLogitsLoss()
153 |
154 | epoch = 1
155 | best_loss = 1e10
156 | if os.path.exists(PATH):
157 | checkpoint, epoch, train_loss = load_model(model, PATH)
158 | best_loss = train_loss
159 |
160 | # model.resize_token_embeddings(len(TRG.vocab))
161 | for epoch in range(epoch, N_EPOCHS):
162 | print(f"Epoch: {epoch} / {N_EPOCHS}")
163 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
164 | print(f"Train Loss: {train_loss:.4f}")
165 | if train_loss < best_loss:
166 | best_loss = train_loss
167 | save_model(model, train_loss, epoch, PATH)
168 |
169 | # ---------------------
170 | error_types = sorted(list(set(df.iloc[:, -1].values)))
171 |
172 | for error_name in error_types:
173 | print(f'------\nError Type: {error_name}\n------')
174 | error_df_2(df, error_name)
175 |
176 | error_data, _ = TabularDataset.splits(
177 | path='./Dataset',
178 | train='error.csv',
179 | test='error.csv',
180 | format='csv',
181 | fields=fields
182 | )
183 |
184 | eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE)
185 |
186 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
187 | print('\n\n')
188 | # ---------------------
189 |
190 |
191 | if __name__ == '__main__':
192 | main()
193 |
--------------------------------------------------------------------------------
/Baselines/DCSpell/corrector.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | word2char, basic_tokenizer, count_parameters, initialize_weights,
3 | save_model, load_model, error_df, train_valid_test_df, mask2str,
4 | error_df_2, error_df_3, find_len, train_valid_test_df2, merge_dfs
5 | )
6 | from transformer import (
7 | Encoder, EncoderLayer, MultiHeadAttentionLayer,
8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer,
9 | Seq2Seq
10 | )
11 | from pipeline import train, evaluate
12 | from metrics import evaluation_report, evaluation_report2, evaluation_report3
13 |
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
17 | import torch
18 | import torch.nn as nn
19 | import os
20 | import gc
21 | import argparse
22 | import sys
23 |
24 | import warnings as wrn
25 | wrn.filterwarnings('ignore')
26 |
27 |
28 | def main():
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256])
31 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7])
32 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7])
33 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
34 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
35 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
36 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256])
37 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
38 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
39 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10])
40 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
41 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
42 | args = parser.parse_args()
43 |
44 | SEED = 1234
45 | torch.manual_seed(SEED)
46 | torch.cuda.manual_seed(SEED)
47 |
48 | df = pd.read_csv('./Dataset/detector_preds.csv')
49 | df['Error'] = df['Error'].apply(word2char)
50 | df['Word'] = df['Word'].apply(word2char)
51 | df['ErrorBlanksPredD1'] = df['ErrorBlanksPredD1'].apply(word2char)
52 | df['ErrorBlanksActual'] = df['ErrorBlanksActual'].apply(word2char)
53 |
54 | df['MaskErrorBlank'] = ' ' + df['Error'] + ' ' + df['ErrorBlanksPredD1'] + ' '
55 | df['Length'] = df['MaskErrorBlank'].apply(find_len)
56 | df = df.loc[df['Length'] <= 48] # 48 works
57 |
58 | df = df.sample(frac=1).reset_index(drop=True)
59 | df = df[['ErrorBlanksActual', 'MaskErrorBlank', 'ErrorType']]
60 |
61 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05)
62 |
63 | train_df.to_csv('./Dataset/train.csv', index=False)
64 | valid_df.to_csv('./Dataset/valid.csv', index=False)
65 | test_df.to_csv('./Dataset/test.csv', index=False)
66 |
67 | SRC = Field(
68 | tokenize=basic_tokenizer, lower=False,
69 |         init_token='<sos>', eos_token='<eos>', batch_first=True
70 | )
71 | TRG = Field(
72 | tokenize=basic_tokenizer, lower=False,
73 |         init_token='<sos>', eos_token='<eos>', batch_first=True
74 | )
75 | WORD = Field(
76 | tokenize=basic_tokenizer, lower=False,
77 |         init_token='<sos>', eos_token='<eos>', batch_first=True
78 | )
79 | fields = {
80 | 'ErrorBlanksPredD1': ('src', SRC),
81 | 'Word': ('trg', TRG)
82 | }
83 |
84 | train_data, valid_data, test_data = TabularDataset.splits(
85 | path='./Dataset',
86 | train='train.csv',
87 | validation='valid.csv',
88 | test='test.csv',
89 | format='csv',
90 | fields=fields
91 | )
92 |
93 | SRC.build_vocab(train_data, min_freq=100) # 100
94 | TRG.build_vocab(train_data, min_freq=50) # 50
95 |
96 | # ------------------------------
97 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
98 | BATCH_SIZE = 512 # 512
99 | # ------------------------------
100 | INPUT_DIM = len(SRC.vocab)
101 | OUTPUT_DIM = len(TRG.vocab)
102 | # ------------------------------
103 | HID_DIM = int(args.HID_DIM)
104 | ENC_LAYERS = int(args.ENC_LAYERS)
105 | DEC_LAYERS = int(args.DEC_LAYERS)
106 | ENC_HEADS = int(args.ENC_HEADS)
107 | DEC_HEADS = int(args.DEC_HEADS)
108 | ENC_PF_DIM = int(args.ENC_PF_DIM)
109 | DEC_PF_DIM = int(args.DEC_PF_DIM)
110 | ENC_DROPOUT = float(args.ENC_DROPOUT)
111 | DEC_DROPOUT = float(args.DEC_DROPOUT)
112 | CLIP = float(args.CLIP)
113 | N_EPOCHS = int(args.N_EPOCHS)
114 | LEARNING_RATE = float(args.LEARNING_RATE)
115 | # ------------------------------
116 | PATH = './Checkpoints/corrector.pth'
117 | # ------------------------------
118 | gc.collect()
119 | torch.cuda.empty_cache()
120 | # -----------------------------
121 |
122 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
123 | (train_data, valid_data, test_data),
124 | batch_size=BATCH_SIZE,
125 | sort_within_batch=True,
126 | sort_key=lambda x: len(x.src),
127 | device=DEVICE
128 | )
129 |
130 | enc = Encoder(
131 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
132 | ENC_DROPOUT, DEVICE
133 | )
134 | dec = Decoder(
135 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
136 | DEC_DROPOUT, DEVICE
137 | )
138 |
139 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
140 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
141 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)
142 | model.apply(initialize_weights)
143 | # print(f'The model has {count_parameters(model):,} trainable parameters')
144 |
145 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
146 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
147 | # criterion = nn.BCEWithLogitsLoss()
148 |
149 | epoch = 1
150 | best_loss = 1e10
151 | if os.path.exists(PATH):
152 | checkpoint, epoch, train_loss = load_model(model, PATH)
153 | best_loss = train_loss
154 |
155 | # model.resize_token_embeddings(len(TRG.vocab))
156 | for epoch in range(epoch, N_EPOCHS):
157 | print(f"Epoch: {epoch} / {N_EPOCHS}")
158 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
159 | print(f"Train Loss: {train_loss:.4f}")
160 | if train_loss < best_loss:
161 | best_loss = train_loss
162 | save_model(model, train_loss, epoch, PATH)
163 |
164 | # ---------------------
165 | error_types = sorted(list(set(df.iloc[:, -1].values)))
166 |
167 | for error_name in error_types:
168 | print(f'------\nError Type: {error_name}\n------')
169 | error_df_3(df, error_name)
170 |
171 | error_data, _ = TabularDataset.splits(
172 | path='./Dataset',
173 | train='error.csv',
174 | test='error.csv',
175 | format='csv',
176 | fields=fields
177 | )
178 |
179 | eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE)
180 |
181 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
182 | print('\n\n')
183 | # ---------------------
184 |
185 |
186 | if __name__ == '__main__':
187 | main()
188 |
--------------------------------------------------------------------------------
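A toy illustration (Latin placeholders stand in for Bangla characters; the values are hypothetical) of how corrector.py above assembles its source sequence: the character-spaced error word and the detector's blanked prediction are concatenated into MaskErrorBlank, and sequences longer than 48 tokens are dropped before training.

    from utils import word2char, find_len

    error = word2char('ABCD')       # 'A B C D'  (misspelled word, one token per character)
    blanks = 'A B   D'              # detector output with the erroneous position blanked out
    src = error + ' ' + blanks      # the MaskErrorBlank sequence fed to the SRC field
    print(find_len(src) <= 48)      # only sequences of at most 48 tokens are kept
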
/CorpusCreation/corpus_stats_valid.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.keys import Keys
3 | from selenium.webdriver.support import expected_conditions as EC
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | import time
7 | import pandas as pd
8 | import re
9 | import sys
10 | import argparse
11 | from tqdm import tqdm
12 |
13 |
14 | # ########################################################
15 | def login():
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument("--email", help="Enter Your Email")
18 | parser.add_argument("--password", help="Enter Your Facebook Password")
19 | args = parser.parse_args()
20 |
21 | # code to ignore browser notifications
22 | chrome_options = webdriver.ChromeOptions()
23 | prefs = {"profile.default_content_setting_values.notifications": 2}
24 | chrome_options.add_experimental_option("prefs", prefs)
25 | driver = webdriver.Chrome('./chromedriver.exe', chrome_options=chrome_options)
26 | # open the webpage
27 | driver.get("https://wwww.facebook.com/")
28 | # target username
29 | username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='email']")))
30 | password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='pass']")))
31 | # entering email as username
32 | username.clear()
33 | username.send_keys(args.email)
34 | # entering password
35 | password.clear()
36 | password.send_keys(args.password)
37 | # target the login button and click it
38 | time.sleep(5)
39 | button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()
40 | # We are logged in!
41 | print("Logged in")
42 | return driver
43 | # ########################################################
44 |
45 |
46 | # ########################################################
47 | def scrape_post_1():
48 | driver = login()
49 | # https://fb.watch/eN-nBOb45t/
50 | url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid02TjtvmwDs51fyVRaHbvM5XgxL1gBGb6USBYvsxgMdn8c4BcQvjbLv1BFCjw52UsXQl&id=111762869482599&eav=Afba2OolCuRXElnzf97xViXfIosR66LZPdko_Q9oxtd5fhvZMDjeKOC_JD1Nx2LKtEE&__tn__=%2AW&paipv=0"
51 | # url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid0eP3VufmYZQEdDrGybgzg9ganLPXRo9JXQ8q5pUjiaBF7gTQ9FnkJdw44PDfx11JKl&id=313147292549612&eav=AfbiujhhnbU2KOwEYD6oavgC5llyK5uWWqiecav3DYpPCCC4llyMqpaYY9rPUvap1z0&ref=sharing&__tn__=%2AW&paipv=0"
52 | while True:
53 | driver.get(url)
54 | comments = driver.find_element(By.CLASS_NAME, "ef").text
55 | comments = re.sub("[A-Za-z0-9·\\n]", "", comments)
56 | next_page = driver.find_elements(By.TAG_NAME, "a")[-1].get_attribute('href')
57 | if type(next_page) != str:
58 | break
59 | url = next_page
60 | time.sleep(5)
61 |         # sys.exit()  # debug exit; left commented out so the scraped comments below are actually written
62 | with open('./dfs/comments.txt', 'a', encoding='utf-8') as f:
63 | f.write(comments)
64 | f.write(' \n ')
65 | # ########################################################
66 |
67 |
68 | # ########################################################
69 | def scrape_post_2():
70 | driver = login()
71 | # https://fb.watch/eNQHYjDuA6/
72 | url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid0eP3VufmYZQEdDrGybgzg9ganLPXRo9JXQ8q5pUjiaBF7gTQ9FnkJdw44PDfx11JKl&id=313147292549612&eav=AfbiujhhnbU2KOwEYD6oavgC5llyK5uWWqiecav3DYpPCCC4llyMqpaYY9rPUvap1z0&ref=sharing&__tn__=%2AW&paipv=0"
73 | while True:
74 | driver.get(url)
75 | comments = driver.find_elements(By.CLASS_NAME, "eb")
76 | for comment in comments:
77 | comment = comment.text
78 | comment = re.sub("[A-Za-z0-9·.\\n]", "", comment)
79 |             with open('./dfs/comments.txt', 'a', encoding='utf-8') as f:
80 | f.write(comment)
81 | f.write(' ')
82 |
83 | comments = driver.find_elements(By.CLASS_NAME, "ec")
84 | for comment in comments:
85 | comment = comment.text
86 | comment = re.sub("[A-Za-z0-9·.\\n]", "", comment)
87 | with open('./dfs/comments.txt', 'a', encoding='utf-8') as f:
88 | f.write(comment)
89 | f.write(' ')
90 |
91 | next_page = driver.find_elements(By.TAG_NAME, "a")[-1].get_attribute('href')
92 | if type(next_page) != str:
93 | break
94 |
95 | url = next_page
96 | time.sleep(5)
97 | # ########################################################
98 |
99 |
100 | # ########################################################
101 | def clean_text(text):
102 | all_chars = ['ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ',
103 | 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ',
104 | 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ',
105 | 'ষ', 'স', 'হ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ',
106 | 'ৗ', 'ড়', 'ঢ়', 'য়']
107 | cleaned_text = ''
108 | for i in tqdm(range(len(text))):
109 | if text[i] in all_chars:
110 | cleaned_text += text[i]
111 | else:
112 | cleaned_text += ' '
113 | return cleaned_text
114 |
115 | def find_stats():
116 | f = open("./dfs/comments.txt", "r", encoding='utf-8')
117 | text = f.read()
118 | text = clean_text(text)
119 |
120 | words = sorted(text.split())
121 | unique_words = sorted(list(set(words)))
122 |
123 | error_df = pd.read_csv('./dfs/sec_dataset_IV.csv')
124 | balanced_df = pd.DataFrame()
125 | all_error_types = sorted(list(set(error_df.iloc[:, -1].values)))
126 | for error in all_error_types:
127 | x = error_df.loc[error_df['ErrorType'] == error]
128 | if (len(x)) < 100000:
129 | balanced_df = pd.concat([balanced_df, x])
130 | else:
131 | balanced_df = pd.concat([balanced_df, x.sample(100000)])
132 |
133 | erroneous_words = balanced_df.iloc[:, 1].values
134 | erroneous_words_type = balanced_df.iloc[:, 2].values
135 |
136 | found = []
137 | types = []
138 | for i in tqdm(range(len(unique_words))):
139 | word = unique_words[i]
140 | if word in erroneous_words:
141 | found.append(word)
142 |             types.append(erroneous_words_type[list(erroneous_words).index(word)])  # index into erroneous_words, not unique_words
143 | if (i != 0 and i % 1000 == 0):
144 | print(len(found))
145 |
146 | error_words = []
147 | error_types = []
148 | for i in tqdm(range(len(found))):
149 | word = found[i]
150 | etype = error_df.loc[error_df['Error'] == word]['ErrorType'].values[0]
151 | error_words.append(word)
152 | error_types.append(etype)
153 |
154 | temp = pd.DataFrame({
155 | 'Error': error_words,
156 | 'ErrorType': error_types
157 | })
158 |
159 | unique_etypes = sorted(list(set(error_types)))
160 | err_names, instances, pcts = [], [], []
161 | for etype in unique_etypes:
162 | x = temp.loc[temp['ErrorType'] == etype]
163 | print(f"{etype}, {len(x)}/{len(temp)}, {len(x) / len(temp) * 100:.2f}%")
164 | err_names.append(etype)
165 | instances.append(f"{len(x)}/{len(temp)}")
166 | pcts.append(len(x) / len(temp) * 100)
167 |
168 | df = pd.DataFrame({
169 | 'ErrorType': err_names,
170 | 'Instances': instances,
171 | 'Pct': pcts
172 | })
173 | print(df)
174 |
175 | print("Missing error types")
176 | found = sorted(list(set(error_types)))
177 | target = sorted(list(set(error_df.iloc[:, -1].values)))
178 |
179 | for item in target:
180 | if item not in found:
181 | print(item)
182 | # ########################################################
183 |
184 |
185 | # ########################################################
186 | if __name__ == '__main__':
187 | scrape_post_1()
188 | scrape_post_2()
189 | find_stats()
190 |
--------------------------------------------------------------------------------
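Note on clean_text above: it filters the scraped text one character at a time, which is slow for large dumps. A minimal, equivalent sketch using a precompiled regular expression is given below; it assumes the same allowed-character list (all_chars) defined in clean_text and is only an illustration, not part of the repository.

    import re

    def clean_text_fast(text, all_chars):
        # Replace every character outside the allowed Bangla set with a space,
        # mirroring the character-by-character loop in clean_text.
        pattern = re.compile('[^' + re.escape(''.join(all_chars)) + ']')
        return pattern.sub(' ', text)
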
/Baselines/GRUSeq2Seq/main.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | df2train_test_dfs, basic_tokenizer, init_weights, count_parameters,
3 | translate_sentence, display_attention, df2train_valid_test_dfs,
4 | save_model, load_model, df2train_error_dfs
5 | )
6 | from models import Encoder, Decoder, Attention, Seq2Seq
7 | from pipeline import train, test_accuracy
8 | from inference import test_beam, test_greedy
9 | from focalLoss import FocalLoss
10 | from errors import error_df
11 |
12 | import torch, torch.nn as nn, torch.optim as optim
13 | import torch.nn.functional as F
14 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
15 | import random
16 | from tqdm import tqdm
17 | import pandas as pd
18 | from sklearn.model_selection import train_test_split
19 | import math
20 | import time
21 |
22 | import matplotlib.pyplot as plt
23 | import matplotlib.ticker as ticker
24 | import matplotlib.font_manager as fm
25 |
26 | import numpy as np
27 | import math
28 | import time
29 | import sys
30 | import os
31 | import argparse
32 |
33 | import warnings as wrn
34 | wrn.filterwarnings('ignore')
35 |
36 |
37 | def main():
38 | parser = argparse.ArgumentParser()
39 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus2.csv",
40 | choices=["./Dataset/corpus.csv", "./Dataset/corpus2.csv"]
41 | )
42 | parser.add_argument("--ENC_EMB_DIM", help="Encoder Embedding Dimension", type=int, default=128, choices=[64, 128, 256])
43 | parser.add_argument("--DEC_EMB_DIM", help="Decoder Embedding Dimension", type=int, default=128, choices=[64, 128, 256])
44 | parser.add_argument("--ENC_HIDDEN_DIM", help="Encoder Hidden Dimension", type=int,default=256, choices=[128, 256, 512])
45 | parser.add_argument("--DEC_HIDDEN_DIM", help="Decoder Hidden Dimension", type=int, default=512, choices=[256, 512, 1024])
46 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
47 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
48 | parser.add_argument("--MAX_LEN", help="Maximum Length", type=int, default=48, choices=[48, 56, 64])
49 | parser.add_argument("--BATCH_SIZE", help="Batch Size", type=int, default=256, choices=[256, 512])
50 | parser.add_argument("--CLIP", help="Gradient Clipping", type=float, default=1, choices=[0.1, 0.2, 0.5, 1])
51 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
52 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
53 | args = parser.parse_args()
54 |
55 |
56 | df = pd.read_csv(args.CORPUS)
57 | df2train_valid_test_dfs(df=df, test_size=0.15)
58 |
59 | SRC = Field(
60 | tokenize=basic_tokenizer, lower=False,
61 |         init_token='<sos>', eos_token='<eos>',
62 | sequential=True, use_vocab=True, include_lengths=True
63 | )
64 | TRG = Field(
65 | tokenize=basic_tokenizer, lower=False,
66 |         init_token='<sos>', eos_token='<eos>',
67 | sequential=True, use_vocab=True
68 | )
69 | fields = {
70 | 'Error': ('src', SRC),
71 | 'Word': ('trg', TRG)
72 | }
73 | train_data, valid_data, test_data = TabularDataset.splits(
74 | path='./Dataset',
75 | train='train.csv',
76 | validation='valid.csv',
77 | test='test.csv',
78 | format='csv',
79 | fields=fields
80 | )
81 |
82 | SRC.build_vocab(train_data, max_size=64, min_freq=100)
83 | TRG.build_vocab(train_data, max_size=64, min_freq=75)
84 | # -------------------------------------
85 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
86 | BATCH_SIZE = args.BATCH_SIZE
87 | INPUT_DIM = len(SRC.vocab)
88 | OUTPUT_DIM = len(TRG.vocab)
89 | ENC_EMB_DIM = args.ENC_EMB_DIM
90 | DEC_EMB_DIM = args.DEC_EMB_DIM
91 | ENC_HIDDEN_DIM = args.ENC_HIDDEN_DIM
92 | DEC_HIDDEN_DIM = args.DEC_HIDDEN_DIM
93 | ENC_DROPOUT = args.ENC_DROPOUT
94 | DEC_DROPOUT = args.DEC_DROPOUT
95 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
96 | MAX_LEN = args.MAX_LEN
97 | N_EPOCHS = args.N_EPOCHS
98 | CLIP = args.CLIP
99 | # -------------------------------------
100 | PATH = './Checkpoints/GRUSeq2Seq.pth'
101 |
102 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
103 | (train_data, valid_data, test_data),
104 | batch_size=BATCH_SIZE,
105 | sort_within_batch=True,
106 | sort_key=lambda x: len(x.src),
107 | device=DEVICE
108 | )
109 |
110 | attention = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM)
111 | encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, ENC_DROPOUT)
112 | decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DEC_DROPOUT, attention)
113 |
114 | model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, DEVICE).to(DEVICE)
115 | model.apply(init_weights)
116 | # print(f'The model has {count_parameters(model):,} trainable parameters')
117 |
118 |     optimizer = optim.Adam(model.parameters(), lr=args.LEARNING_RATE)  # use the parsed --LEARNING_RATE
119 | # scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=4)
120 |
121 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
122 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
123 | # criterion = nn.NLLLoss(ignore_index=TRG_PAD_IDX)
124 | # criterion = FocalLoss(alpha=0.5, gamma=2.0, reduction='mean')
125 |
126 | best_loss = 1e10
127 | epoch = 1
128 | if os.path.exists(PATH):
129 | checkpoint, epoch, train_loss = load_model(model, optimizer, PATH)
130 | best_loss = train_loss
131 |
132 | for epoch in range(epoch, N_EPOCHS):
133 | print(f'Epoch: {epoch} / {N_EPOCHS}')
134 | train_loss = train(model, train_iterator, optimizer, criterion)
135 | print(f"Train Loss: {train_loss:.2f}")
136 |
137 | if train_loss < best_loss:
138 | best_loss = train_loss
139 | save_model(model, epoch, optimizer, train_loss, PATH)
140 |
141 | # scheduler.step()
142 | # if epoch%10 == 0:
143 | # # test_accuracy(valid_data, SRC, TRG, model, DEVICE)
144 | # test_accuracy(error_data, SRC, TRG, model, DEVICE)
145 |
146 | test_accuracy(valid_data, SRC, TRG, model, DEVICE)
147 |
148 |
149 | # errors = ['Cognitive Error', 'Homonym Error', 'Run-on Error',
150 | # 'Split-word Error (Left)', 'Split-word Error (Random)',
151 | # 'Split-word Error (Right)', 'Split-word Error (both)',
152 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition',
153 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition',
154 | # 'Visual Error', 'Visual Error (Combined Character)']
155 |
156 | # for error in errors:
157 | # print(f"-----\nError Type: {error}\n-----")
158 | # error_df(df, error)
159 | # error_data, _ = TabularDataset.splits(
160 | # path='./Dataset',
161 | # train='error.csv',
162 | # test='error.csv',
163 | # format='csv',
164 | # fields=fields
165 | # )
166 | # eval_df = test_accuracy(error_data, SRC, TRG, model, DEVICE)
167 | # error = error.replace(' ', '').replace('(', '').replace(')', '')
168 | # eval_df.to_csv(f'./Corrections/s2sJL_{error}.csv')
169 | # print('\n\n')
170 |
171 |
172 | # test_beam(model, train_data, test_data, SRC, TRG, DEVICE)
173 | # test_greedy(test_data, SRC, TRG, model, DEVICE)
174 |
175 | # example_idx = 1
176 | # src = vars(train_data.examples[example_idx])['src']
177 | # trg = vars(train_data.examples[example_idx])['trg']
178 | # print(f'src = {src}')
179 | # print(f'trg = {trg}')
180 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
181 | # print(f'predicted trg = {translation}')
182 | # display_attention(src, translation, attention)
183 |
184 |
185 | if __name__ == '__main__':
186 | main()
187 |
--------------------------------------------------------------------------------
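For reference, a minimal single-word inference sketch built on the translate_sentence helper imported in main.py above (its usage follows the commented example near the end of that file). The correct_word wrapper and the '<eos>' trimming are assumptions for illustration, not part of the repository.

    def correct_word(word, SRC, TRG, model, device):
        # one token per character, matching the character-level fields built in main()
        src_tokens = [ch for ch in word]
        translation, attention = translate_sentence(src_tokens, SRC, TRG, model, device)
        # drop a trailing end-of-sequence token if the helper returns one
        if translation and translation[-1] == '<eos>':
            translation = translation[:-1]
        return ''.join(translation)
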
/utils.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.model_selection import train_test_split
6 | import os
7 |
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 | SEED = 1234
12 | torch.manual_seed(SEED)
13 | torch.cuda.manual_seed(SEED)
14 |
15 |
16 | # ---------------------------
17 | def train_valid_test_df(df, test_size, valid_size):
18 | # etypes = list(set(df.iloc[:, -1]))
19 | etypes = list(set(df['ErrorType']))
20 |
21 | train_df = pd.DataFrame()
22 | valid_df = pd.DataFrame()
23 | test_df = pd.DataFrame()
24 |
25 | for etype in etypes:
26 | etype_df = df.loc[df['ErrorType'] == etype]
27 | train, test = train_test_split(etype_df, test_size=test_size)
28 | train, valid = train_test_split(train, test_size=valid_size)
29 |
30 | train_df = pd.concat([train_df, train])
31 | valid_df = pd.concat([valid_df, valid])
32 | test_df = pd.concat([test_df, test])
33 |
34 | train_df = train_df.sample(frac=1).reset_index(drop=True)
35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
36 | test_df = test_df.sample(frac=1).reset_index(drop=True)
37 |
38 | train_df = train_df.iloc[:, [1, 0]]
39 | valid_df = valid_df.iloc[:, [1, 0]]
40 | test_df = test_df.iloc[:, [1, 0]]
41 |
42 | return train_df, valid_df, test_df
43 | # ---------------------------
44 |
45 |
46 | # ---------------------------
47 | def train_valid_test_df2(df, test_size, valid_size):
48 | # etypes = list(set(df.iloc[:, -1]))
49 | etypes = list(set(df['ErrorType']))
50 |
51 | train_df = pd.DataFrame()
52 | valid_df = pd.DataFrame()
53 | test_df = pd.DataFrame()
54 |
55 | for etype in etypes:
56 | etype_df = df.loc[df['ErrorType'] == etype]
57 | train, test = train_test_split(etype_df, test_size=test_size)
58 | train, valid = train_test_split(train, test_size=valid_size)
59 |
60 | train_df = pd.concat([train_df, train])
61 | valid_df = pd.concat([valid_df, valid])
62 | test_df = pd.concat([test_df, test])
63 |
64 | train_df = train_df.sample(frac=1).reset_index(drop=True)
65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
66 | test_df = test_df.sample(frac=1).reset_index(drop=True)
67 |
68 | # train_df = train_df.iloc[:, [1, 0]]
69 | # valid_df = valid_df.iloc[:, [1, 0]]
70 | # test_df = test_df.iloc[:, [1, 0]]
71 |
72 | return train_df, valid_df, test_df
73 | # ---------------------------
74 |
75 |
76 | # ---------------------------
77 | def merge_dfs(network='detector'):
78 | df_names = [
79 | f'{network}_CognitiveError.csv',
80 | f'{network}_HomonymError.csv',
81 | f'{network}_Run-onError.csv',
82 | f'{network}_Split-wordErrorLeft.csv',
83 | f'{network}_Split-wordErrorRandom.csv',
84 | f'{network}_Split-wordErrorRight.csv',
85 | f'{network}_Split-wordErrorboth.csv',
86 | f'{network}_TypoAvroSubstituition.csv',
87 | f'{network}_TypoBijoySubstituition.csv',
88 | f'{network}_TypoDeletion.csv',
89 | f'{network}_TypoInsertion.csv',
90 | f'{network}_TypoTransposition.csv',
91 | f'{network}_VisualError.csv',
92 | f'{network}_VisualErrorCombinedCharacter.csv'
93 | ]
94 |
95 | df = pd.DataFrame()
96 |
97 | for df_name in df_names:
98 | df_path = os.path.join('./Dataframes', df_name)
99 | temp_df = pd.read_csv(df_path)
100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1]
101 | for _ in range(len(temp_df))]
102 | df = pd.concat([df, temp_df])
103 |
104 | df = df.iloc[:, :]
105 |
106 | if network=='detector':
107 | df.rename(
108 | columns = {
109 | 'Predicton':'ErrorBlanksPredD1',
110 | 'Target':'ErrorBlanksActual',
111 | 'Correction':'EBP_Flag_D1',
112 | },
113 | inplace = True
114 | )
115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']]
116 |
117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector)
118 | # (purificator)
119 | # ---------------------------
120 |
121 |
122 | # ---------------------------
123 | def error_df(df, error='Cognitive Error'):
124 | df = df.loc[df['ErrorType'] == error]
125 | df['Word'] = df['Word'].apply(word2char)
126 | df['Error'] = df['Error'].apply(word2char)
127 | df = df.sample(frac=1).reset_index(drop=True)
128 | idx = int(len(df)/1)
129 | df = df.iloc[:idx, [1, 0]]
130 | df.to_csv('./Dataset/error.csv', index=False)
131 | # ---------------------------
132 |
133 |
134 | # ---------------------------
135 | def error_df_2(df, error='Cognitive Error'):
136 | df = df.loc[df['ErrorType'] == error]
137 | # df['Word'] = df['Word'].apply(word2char)
138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
139 | df = df.sample(frac=1).reset_index(drop=True)
140 | idx = int(len(df)/1)
141 | df = df.iloc[:idx, [1, 0]]
142 | #
143 | # if(len(df) >= 10000):
144 | # df = df.iloc[:10000, :]
145 | #
146 | df.to_csv('./Dataset/error.csv', index=False)
147 | # ---------------------------
148 |
149 |
150 | # ---------------------------
151 | def error_df_3(df, error='Cognitive Error'):
152 | df = df.loc[df['ErrorType'] == error]
153 | # df['Word'] = df['Word'].apply(word2char)
154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
155 | df = df.sample(frac=1).reset_index(drop=True)
156 | # idx = int(len(df)/1)
157 | # df = df.iloc[:idx, [1, 0]]
158 | #
159 | # if(len(df) >= 10000):
160 | # df = df.iloc[:10000, :]
161 | #
162 | df.to_csv('./Dataset/error.csv', index=False)
163 | # ---------------------------
164 |
165 |
166 | # ---------------------------
167 | def word2char(word):
168 | w2c = [char for char in word]
169 | return ' '.join(w2c)
170 | # ---------------------------
171 |
172 |
173 | # ---------------------------
174 | def find_len(seq):
175 | return len(seq.split(' '))
176 | # ---------------------------
177 |
178 |
179 | # ---------------------------
180 | def mask2str(mask):
181 | x = ''
182 | for item in mask:
183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]":
184 | x += str(item)
185 | return x
186 | # ---------------------------
187 |
188 |
189 | # ---------------------------
190 | def error_blank(error, mask):
191 | error_list = np.array(error.split())
192 | mask_list = np.array(mask.split())
193 | idx = np.where(mask_list=='1')[0]
194 | error_list[idx] = ' '
195 | error = ' '.join(error_list)
196 | return error
197 | # ---------------------------
198 |
199 |
200 | # ---------------------------
201 | def basic_tokenizer(text):
202 | return text.split()
203 | # ---------------------------
204 |
205 |
206 | # ---------------------------
207 | def count_parameters(model):
208 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
209 | # ---------------------------
210 |
211 |
212 | # ---------------------------
213 | def initialize_weights(m):
214 | if hasattr(m, 'weight') and m.weight.dim() > 1:
215 | nn.init.xavier_uniform_(m.weight.data)
216 | # ---------------------------
217 |
218 |
219 | # ---------------------------
220 | def save_model(model, train_loss, epoch, PATH):
221 | torch.save({
222 | 'epoch': epoch,
223 | 'model_state_dict': model.state_dict(),
224 | # 'optimizer_state_dict': optimizer.state_dict(),
225 | 'loss': train_loss
226 | }, PATH)
227 | print(f"---------\nModel Saved at {PATH}\n---------\n")
228 | # ---------------------------
229 |
230 |
231 | # ---------------------------
232 | def load_model(model, PATH):
233 | checkpoint = torch.load(PATH)
234 | model.load_state_dict(checkpoint['model_state_dict'])
235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
236 | epoch = checkpoint['epoch']
237 | train_loss = checkpoint['loss']
238 | return checkpoint, epoch, train_loss
239 | # ---------------------------
240 |
241 |
242 | if __name__ == '__main__':
243 | pass
244 |
--------------------------------------------------------------------------------
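A short usage sketch of the character-level helpers defined in utils.py above (illustrative values only; the Latin word stands in for a Bangla token):

    from utils import word2char, find_len, mask2str, error_blank

    chars = word2char('ABCD')                # 'A B C D'  (one space between characters)
    print(find_len(chars))                   # 4 tokens in the space-separated sequence
    print(mask2str("['0', '1', '0', '1']"))  # '0101'  (list-like string flattened to a mask)
    # blank out the positions flagged as erroneous (mask value '1'): 'B' and 'D' become spaces
    print(error_blank(chars, '0 1 0 1'))
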
/Baselines/DCSpell/utils.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.model_selection import train_test_split
6 | import os
7 |
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 | SEED = 1234
12 | torch.manual_seed(SEED)
13 | torch.cuda.manual_seed(SEED)
14 |
15 |
16 | # ---------------------------
17 | def train_valid_test_df(df, test_size, valid_size):
18 | # etypes = list(set(df.iloc[:, -1]))
19 | etypes = list(set(df['ErrorType']))
20 |
21 | train_df = pd.DataFrame()
22 | valid_df = pd.DataFrame()
23 | test_df = pd.DataFrame()
24 |
25 | for etype in etypes:
26 | etype_df = df.loc[df['ErrorType'] == etype]
27 | train, test = train_test_split(etype_df, test_size=test_size)
28 | train, valid = train_test_split(train, test_size=valid_size)
29 |
30 | train_df = pd.concat([train_df, train])
31 | valid_df = pd.concat([valid_df, valid])
32 | test_df = pd.concat([test_df, test])
33 |
34 | train_df = train_df.sample(frac=1).reset_index(drop=True)
35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
36 | test_df = test_df.sample(frac=1).reset_index(drop=True)
37 |
38 | train_df = train_df.iloc[:, [1, 0]]
39 | valid_df = valid_df.iloc[:, [1, 0]]
40 | test_df = test_df.iloc[:, [1, 0]]
41 |
42 | return train_df, valid_df, test_df
43 | # ---------------------------
44 |
45 |
46 | # ---------------------------
47 | def train_valid_test_df2(df, test_size, valid_size):
48 | # etypes = list(set(df.iloc[:, -1]))
49 | etypes = list(set(df['ErrorType']))
50 |
51 | train_df = pd.DataFrame()
52 | valid_df = pd.DataFrame()
53 | test_df = pd.DataFrame()
54 |
55 | for etype in etypes:
56 | etype_df = df.loc[df['ErrorType'] == etype]
57 | train, test = train_test_split(etype_df, test_size=test_size)
58 | train, valid = train_test_split(train, test_size=valid_size)
59 |
60 | train_df = pd.concat([train_df, train])
61 | valid_df = pd.concat([valid_df, valid])
62 | test_df = pd.concat([test_df, test])
63 |
64 | train_df = train_df.sample(frac=1).reset_index(drop=True)
65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
66 | test_df = test_df.sample(frac=1).reset_index(drop=True)
67 |
68 | # train_df = train_df.iloc[:, [1, 0]]
69 | # valid_df = valid_df.iloc[:, [1, 0]]
70 | # test_df = test_df.iloc[:, [1, 0]]
71 |
72 | return train_df, valid_df, test_df
73 | # ---------------------------
74 |
75 |
76 | # ---------------------------
77 | def merge_dfs(network='detector'):
78 | df_names = [
79 | f'{network}_CognitiveError.csv',
80 | f'{network}_HomonymError.csv',
81 | f'{network}_Run-onError.csv',
82 | f'{network}_Split-wordErrorLeft.csv',
83 | f'{network}_Split-wordErrorRandom.csv',
84 | f'{network}_Split-wordErrorRight.csv',
85 | f'{network}_Split-wordErrorboth.csv',
86 | f'{network}_TypoAvroSubstituition.csv',
87 | f'{network}_TypoBijoySubstituition.csv',
88 | f'{network}_TypoDeletion.csv',
89 | f'{network}_TypoInsertion.csv',
90 | f'{network}_TypoTransposition.csv',
91 | f'{network}_VisualError.csv',
92 | f'{network}_VisualErrorCombinedCharacter.csv'
93 | ]
94 |
95 | df = pd.DataFrame()
96 |
97 | for df_name in df_names:
98 | df_path = os.path.join('./Dataframes', df_name)
99 | temp_df = pd.read_csv(df_path)
100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1]
101 | for _ in range(len(temp_df))]
102 | df = pd.concat([df, temp_df])
103 |
104 | df = df.iloc[:, :]
105 |
106 | if network=='detector':
107 | df.rename(
108 | columns = {
109 | 'Predicton':'ErrorBlanksPredD1',
110 | 'Target':'ErrorBlanksActual',
111 | 'Correction':'EBP_Flag_D1',
112 | },
113 | inplace = True
114 | )
115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']]
116 |
117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector)
118 | # (purificator)
119 | # ---------------------------
120 |
121 |
122 | # ---------------------------
123 | def error_df(df, error='Cognitive Error'):
124 | df = df.loc[df['ErrorType'] == error]
125 | df['Word'] = df['Word'].apply(word2char)
126 | df['Error'] = df['Error'].apply(word2char)
127 | df = df.sample(frac=1).reset_index(drop=True)
128 | idx = int(len(df)/1)
129 | df = df.iloc[:idx, [1, 0]]
130 | df.to_csv('./Dataset/error.csv', index=False)
131 | # ---------------------------
132 |
133 |
134 | # ---------------------------
135 | def error_df_2(df, error='Cognitive Error'):
136 | df = df.loc[df['ErrorType'] == error]
137 | # df['Word'] = df['Word'].apply(word2char)
138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
139 | df = df.sample(frac=1).reset_index(drop=True)
140 | idx = int(len(df)/1)
141 | df = df.iloc[:idx, [1, 0]]
142 | #
143 | # if(len(df) >= 10000):
144 | # df = df.iloc[:10000, :]
145 | #
146 | df.to_csv('./Dataset/error.csv', index=False)
147 | # ---------------------------
148 |
149 |
150 | # ---------------------------
151 | def error_df_3(df, error='Cognitive Error'):
152 | df = df.loc[df['ErrorType'] == error]
153 | # df['Word'] = df['Word'].apply(word2char)
154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
155 | df = df.sample(frac=1).reset_index(drop=True)
156 | # idx = int(len(df)/1)
157 | # df = df.iloc[:idx, [1, 0]]
158 | #
159 | # if(len(df) >= 10000):
160 | # df = df.iloc[:10000, :]
161 | #
162 | df.to_csv('./Dataset/error.csv', index=False)
163 | # ---------------------------
164 |
165 |
166 | # ---------------------------
167 | def word2char(word):
168 | w2c = [char for char in word]
169 | return ' '.join(w2c)
170 | # ---------------------------
171 |
172 |
173 | # ---------------------------
174 | def find_len(seq):
175 | return len(seq.split(' '))
176 | # ---------------------------
177 |
178 |
179 | # ---------------------------
180 | def mask2str(mask):
181 | x = ''
182 | for item in mask:
183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]":
184 | x += str(item)
185 | return x
186 | # ---------------------------
187 |
188 |
189 | # ---------------------------
190 | def error_blank(error, mask):
191 | error_list = np.array(error.split())
192 | mask_list = np.array(mask.split())
193 | idx = np.where(mask_list=='1')[0]
194 | error_list[idx] = ' '
195 | error = ' '.join(error_list)
196 | return error
197 | # ---------------------------
198 |
199 |
200 | # ---------------------------
201 | def basic_tokenizer(text):
202 | return text.split()
203 | # ---------------------------
204 |
205 |
206 | # ---------------------------
207 | def count_parameters(model):
208 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
209 | # ---------------------------
210 |
211 |
212 | # ---------------------------
213 | def initialize_weights(m):
214 | if hasattr(m, 'weight') and m.weight.dim() > 1:
215 | nn.init.xavier_uniform_(m.weight.data)
216 | # ---------------------------
217 |
218 |
219 | # ---------------------------
220 | def save_model(model, train_loss, epoch, PATH):
221 | torch.save({
222 | 'epoch': epoch,
223 | 'model_state_dict': model.state_dict(),
224 | # 'optimizer_state_dict': optimizer.state_dict(),
225 | 'loss': train_loss
226 | }, PATH)
227 | print(f"---------\nModel Saved at {PATH}\n---------\n")
228 | # ---------------------------
229 |
230 |
231 | # ---------------------------
232 | def load_model(model, PATH):
233 | checkpoint = torch.load(PATH)
234 | model.load_state_dict(checkpoint['model_state_dict'])
235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
236 | epoch = checkpoint['epoch']
237 | train_loss = checkpoint['loss']
238 | return checkpoint, epoch, train_loss
239 | # ---------------------------
240 |
241 |
242 | if __name__ == '__main__':
243 | pass
244 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/utils.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.model_selection import train_test_split
6 | import os
7 |
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 | SEED = 1234
12 | torch.manual_seed(SEED)
13 | torch.cuda.manual_seed(SEED)
14 |
15 |
16 | # ---------------------------
17 | def train_valid_test_df(df, test_size, valid_size):
18 | # etypes = list(set(df.iloc[:, -1]))
19 | etypes = list(set(df['ErrorType']))
20 |
21 | train_df = pd.DataFrame()
22 | valid_df = pd.DataFrame()
23 | test_df = pd.DataFrame()
24 |
25 | for etype in etypes:
26 | etype_df = df.loc[df['ErrorType'] == etype]
27 | train, test = train_test_split(etype_df, test_size=test_size)
28 | train, valid = train_test_split(train, test_size=valid_size)
29 |
30 | train_df = pd.concat([train_df, train])
31 | valid_df = pd.concat([valid_df, valid])
32 | test_df = pd.concat([test_df, test])
33 |
34 | train_df = train_df.sample(frac=1).reset_index(drop=True)
35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
36 | test_df = test_df.sample(frac=1).reset_index(drop=True)
37 |
38 | train_df = train_df.iloc[:, [1, 0]]
39 | valid_df = valid_df.iloc[:, [1, 0]]
40 | test_df = test_df.iloc[:, [1, 0]]
41 |
42 | return train_df, valid_df, test_df
43 | # ---------------------------
44 |
45 |
46 | # ---------------------------
47 | def train_valid_test_df2(df, test_size, valid_size):
48 | # etypes = list(set(df.iloc[:, -1]))
49 | etypes = list(set(df['ErrorType']))
50 |
51 | train_df = pd.DataFrame()
52 | valid_df = pd.DataFrame()
53 | test_df = pd.DataFrame()
54 |
55 | for etype in etypes:
56 | etype_df = df.loc[df['ErrorType'] == etype]
57 | train, test = train_test_split(etype_df, test_size=test_size)
58 | train, valid = train_test_split(train, test_size=valid_size)
59 |
60 | train_df = pd.concat([train_df, train])
61 | valid_df = pd.concat([valid_df, valid])
62 | test_df = pd.concat([test_df, test])
63 |
64 | train_df = train_df.sample(frac=1).reset_index(drop=True)
65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
66 | test_df = test_df.sample(frac=1).reset_index(drop=True)
67 |
68 | # train_df = train_df.iloc[:, [1, 0]]
69 | # valid_df = valid_df.iloc[:, [1, 0]]
70 | # test_df = test_df.iloc[:, [1, 0]]
71 |
72 | return train_df, valid_df, test_df
73 | # ---------------------------
74 |
75 |
76 | # ---------------------------
77 | def merge_dfs(network='detector'):
78 | df_names = [
79 | f'{network}_CognitiveError.csv',
80 | f'{network}_HomonymError.csv',
81 | f'{network}_Run-onError.csv',
82 | f'{network}_Split-wordErrorLeft.csv',
83 | f'{network}_Split-wordErrorRandom.csv',
84 | f'{network}_Split-wordErrorRight.csv',
85 | f'{network}_Split-wordErrorboth.csv',
86 | f'{network}_TypoAvroSubstituition.csv',
87 | f'{network}_TypoBijoySubstituition.csv',
88 | f'{network}_TypoDeletion.csv',
89 | f'{network}_TypoInsertion.csv',
90 | f'{network}_TypoTransposition.csv',
91 | f'{network}_VisualError.csv',
92 | f'{network}_VisualErrorCombinedCharacter.csv'
93 | ]
94 |
95 | df = pd.DataFrame()
96 |
97 | for df_name in df_names:
98 | df_path = os.path.join('./Dataframes', df_name)
99 | temp_df = pd.read_csv(df_path)
100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1]
101 | for _ in range(len(temp_df))]
102 | df = pd.concat([df, temp_df])
103 |
104 | df = df.iloc[:, :]
105 |
106 | if network=='detector':
107 | df.rename(
108 | columns = {
109 | 'Predicton':'ErrorBlanksPredD1',
110 | 'Target':'ErrorBlanksActual',
111 | 'Correction':'EBP_Flag_D1',
112 | },
113 | inplace = True
114 | )
115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']]
116 |
117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector)
118 | # (purificator)
119 | # ---------------------------
120 |
121 |
122 | # ---------------------------
123 | def error_df(df, error='Cognitive Error'):
124 | df = df.loc[df['ErrorType'] == error]
125 | df['Word'] = df['Word'].apply(word2char)
126 | df['Error'] = df['Error'].apply(word2char)
127 | df = df.sample(frac=1).reset_index(drop=True)
128 | idx = int(len(df)/1)
129 | df = df.iloc[:idx, [1, 0]]
130 | df.to_csv('./Dataset/error.csv', index=False)
131 | # ---------------------------
132 |
133 |
134 | # ---------------------------
135 | def error_df_2(df, error='Cognitive Error'):
136 | df = df.loc[df['ErrorType'] == error]
137 | # df['Word'] = df['Word'].apply(word2char)
138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
139 | df = df.sample(frac=1).reset_index(drop=True)
140 | idx = int(len(df)/1)
141 | df = df.iloc[:idx, [1, 0]]
142 | #
143 | # if(len(df) >= 10000):
144 | # df = df.iloc[:10000, :]
145 | #
146 | df.to_csv('./Dataset/error.csv', index=False)
147 | # ---------------------------
148 |
149 |
150 | # ---------------------------
151 | def error_df_3(df, error='Cognitive Error'):
152 | df = df.loc[df['ErrorType'] == error]
153 | # df['Word'] = df['Word'].apply(word2char)
154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
155 | df = df.sample(frac=1).reset_index(drop=True)
156 | # idx = int(len(df)/1)
157 | # df = df.iloc[:idx, [1, 0]]
158 | #
159 | # if(len(df) >= 10000):
160 | # df = df.iloc[:10000, :]
161 | #
162 | df.to_csv('./Dataset/error.csv', index=False)
163 | # ---------------------------
164 |
165 |
166 | # ---------------------------
167 | def word2char(word):
168 | w2c = [char for char in word]
169 | return ' '.join(w2c)
170 | # ---------------------------
171 |
172 |
173 | # ---------------------------
174 | def find_len(seq):
175 | return len(seq.split(' '))
176 | # ---------------------------
177 |
178 |
179 | # ---------------------------
180 | def mask2str(mask):
181 | x = ''
182 | for item in mask:
183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]":
184 | x += str(item)
185 | return x
186 | # ---------------------------
187 |
188 |
189 | # ---------------------------
190 | def error_blank(error, mask):
191 | error_list = np.array(error.split())
192 | mask_list = np.array(mask.split())
193 | idx = np.where(mask_list=='1')[0]
194 | error_list[idx] = ' '
195 | error = ' '.join(error_list)
196 | return error
197 | # ---------------------------
198 |
199 |
200 | # ---------------------------
201 | def basic_tokenizer(text):
202 | return text.split()
203 | # ---------------------------
204 |
205 |
206 | # ---------------------------
207 | def count_parameters(model):
208 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
209 | # ---------------------------
210 |
211 |
212 | # ---------------------------
213 | def initialize_weights(m):
214 | if hasattr(m, 'weight') and m.weight.dim() > 1:
215 | nn.init.xavier_uniform_(m.weight.data)
216 | # ---------------------------
217 |
218 |
219 | # ---------------------------
220 | def save_model(model, train_loss, epoch, PATH):
221 | torch.save({
222 | 'epoch': epoch,
223 | 'model_state_dict': model.state_dict(),
224 | # 'optimizer_state_dict': optimizer.state_dict(),
225 | 'loss': train_loss
226 | }, PATH)
227 | print(f"---------\nModel Saved at {PATH}\n---------\n")
228 | # ---------------------------
229 |
230 |
231 | # ---------------------------
232 | def load_model(model, PATH):
233 | checkpoint = torch.load(PATH)
234 | model.load_state_dict(checkpoint['model_state_dict'])
235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
236 | epoch = checkpoint['epoch']
237 | train_loss = checkpoint['loss']
238 | return checkpoint, epoch, train_loss
239 | # ---------------------------
240 |
241 |
242 | if __name__ == '__main__':
243 | pass
244 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/dtransformer.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | word2char, basic_tokenizer, count_parameters, initialize_weights,
3 | save_model, load_model, error_df, train_valid_test_df, mask2str,
4 | error_df_2, error_df_3, merge_dfs
5 | )
6 | from transformer import (
7 | Encoder, EncoderLayer, MultiHeadAttentionLayer,
8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer,
9 | Seq2Seq
10 | )
11 | from pipeline import train, evaluate
12 | from metrics import evaluation_report, evaluation_report2
13 |
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
17 | import torch
18 | import torch.nn as nn
19 | import os
20 | import gc
21 | import sys
22 | import argparse
23 |
24 | import warnings as wrn
25 | wrn.filterwarnings('ignore')
26 |
27 |
28 | def main():
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus.csv",
31 | choices=[
32 | "./Dataset/corpus.csv", # Bangla SEC parallel corpus
33 | "./Dataset/corpus2.csv", # Bangla SEC parallel corpus for running test
34 | "./Dataset/Hindi/corpus_hindi.csv",
35 | "./Dataset/Telugu/corpus_telugu.csv",
36 | "./Dataset/Hindi/corpus_hindi_enhanced.csv",
37 | "./Dataset/Telugu/corpus_telugu_enhanced.csv"
38 | ]
39 | )
40 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256])
41 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7])
42 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7])
43 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
44 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
45 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
46 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256])
47 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
48 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
49 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10])
50 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
51 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
52 | args = parser.parse_args()
53 |
54 | SEED = 1234
55 | torch.manual_seed(SEED)
56 | torch.cuda.manual_seed(SEED)
57 |
58 | df = pd.read_csv(args.CORPUS)
59 | df['Word'] = df['Word'].apply(word2char)
60 | df['Error'] = df['Error'].apply(word2char)
61 | df = df.sample(frac=1).reset_index(drop=True)
62 |
63 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05)
64 |
65 | train_df.to_csv('./Dataset/train.csv', index=False)
66 | valid_df.to_csv('./Dataset/valid.csv', index=False)
67 | test_df.to_csv('./Dataset/test.csv', index=False)
68 |
69 | SRC = Field(
70 | tokenize=basic_tokenizer, lower=False,
71 |         init_token='<sos>', eos_token='<eos>', batch_first=True
72 | )
73 | TRG = Field(
74 | tokenize=basic_tokenizer, lower=False,
75 |         init_token='<sos>', eos_token='<eos>', batch_first=True
76 | )
77 | fields = {
78 | 'Error': ('src', SRC),
79 | 'Word': ('trg', TRG)
80 | }
81 |
82 | train_data, valid_data, test_data = TabularDataset.splits(
83 | path='./Dataset',
84 | train='train.csv',
85 | validation='valid.csv',
86 | test='test.csv',
87 | format='csv',
88 | fields=fields
89 | )
90 |
91 | SRC.build_vocab(train_data, min_freq=100)
92 | TRG.build_vocab(train_data, min_freq=50)
93 |
94 | # ------------------------------
95 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
96 | BATCH_SIZE = 512
97 | # ------------------------------
98 | INPUT_DIM = len(SRC.vocab)
99 | OUTPUT_DIM = len(TRG.vocab)
100 | # ------------------------------
101 | HID_DIM = int(args.HID_DIM)
102 | ENC_LAYERS = int(args.ENC_LAYERS)
103 | DEC_LAYERS = int(args.DEC_LAYERS)
104 | ENC_HEADS = int(args.ENC_HEADS)
105 | DEC_HEADS = int(args.DEC_HEADS)
106 | ENC_PF_DIM = int(args.ENC_PF_DIM)
107 | DEC_PF_DIM = int(args.DEC_PF_DIM)
108 | ENC_DROPOUT = float(args.ENC_DROPOUT)
109 | DEC_DROPOUT = float(args.DEC_DROPOUT)
110 | CLIP = float(args.CLIP)
111 | N_EPOCHS = int(args.N_EPOCHS)
112 | LEARNING_RATE = float(args.LEARNING_RATE)
113 | # ------------------------------
114 | PATH = './Checkpoints/dtransformer.pth'
115 | # ------------------------------
116 | gc.collect()
117 | torch.cuda.empty_cache()
118 | # -----------------------------
119 |
120 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
121 | (train_data, valid_data, test_data),
122 | batch_size=BATCH_SIZE,
123 | sort_within_batch=True,
124 | sort_key=lambda x: len(x.src),
125 | device=DEVICE
126 | )
127 |
128 | enc = Encoder(
129 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
130 | ENC_DROPOUT, DEVICE
131 | )
132 | dec = Decoder(
133 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
134 | DEC_DROPOUT, DEVICE
135 | )
136 |
137 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
138 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
139 |
140 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)
141 | model.apply(initialize_weights)
142 | # print(f'The model has {count_parameters(model):,} trainable parameters')
143 |
144 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
145 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
146 |
147 | epoch = 1
148 | best_loss = 1e10
149 | if os.path.exists(PATH):
150 | checkpoint, epoch, train_loss = load_model(model, PATH)
151 | best_loss = train_loss
152 |
153 | for epoch in range(epoch, N_EPOCHS):
154 | print(f"Epoch: {epoch} / {N_EPOCHS}")
155 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
156 | print(f"Train Loss: {train_loss:.4f}")
157 | if train_loss < best_loss:
158 | best_loss = train_loss
159 | save_model(model, train_loss, epoch, PATH)
160 |
161 | # ---------------------
162 | # eval_df = evaluation_report(test_data, SRC, TRG, model, DEVICE)
163 | # ---------------------
164 | # error_types = [
165 | # 'Homonym Error', # 123
166 | # 'Typo Deletion', # 115767
167 | # 'Typo (Avro) Substituition', # 119573
168 | # 'Typo (Bijoy) Substituition', # 119864
169 | # 'Cognitive Error', # 108227
170 | # 'Run-on Error', # 124895
171 | # 'Split-word Error (Left)', # 62890
172 | # 'Split-word Error (Random)', # 124895
173 | # 'Split-word Error (Right)', # 13985
174 | # 'Split-word Error (both)', # 12800
175 | # 'Typo Insertion', # 124807
176 | # 'Typo Transposition', # 123245
177 | # 'Visual Error', # 117391
178 | # 'Visual Error (Combined Character)' # 17617
179 | # ]
180 | # ---------------------
181 | valid_df = pd.read_csv('./Dataset/valid.csv')
182 | error_types = list(sorted(list(set(df['ErrorType'].values))))
183 | # ---------------------
184 | for error_name in error_types:
185 | print(f'------\nError Type: {error_name}\n------')
186 | error_df_2(df, error_name)
187 |
188 | error_data, _ = TabularDataset.splits(
189 | path='./Dataset',
190 | train='error.csv',
191 | test='error.csv',
192 | format='csv',
193 | fields=fields
194 | )
195 |
196 |         eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE)
197 |
198 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
199 | eval_df.to_csv(f'./Dataframes/dtransformer_{error_name}.csv', index=False)
200 | print('\n\n')
201 | # ---------------------
202 |
203 |
204 | if __name__ == '__main__':
205 | main()
206 |
--------------------------------------------------------------------------------
/Requirements/requirements_u.yml:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
3 | # platform: linux-64
4 | @EXPLICIT
5 | https://conda.anaconda.org/pytorch/noarch/pytorch-mutex-1.0-cuda.tar.bz2
6 | https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda
7 | https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.conda
8 | https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2022.4.26-h06a4308_0.conda
9 | https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2021.4.0-h06a4308_3561.conda
10 | https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.38-h1181459_1.conda
11 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran4-7.5.0-ha8ba4b0_17.conda
12 | https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-11.2.0-h1234567_1.conda
13 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.5.0-ha8ba4b0_17.conda
14 | https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda
15 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-2021.4.0-h06a4308_640.conda
16 | https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda
17 | https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-11.2.0-h1234567_1.conda
18 | https://repo.anaconda.com/pkgs/main/linux-64/brotli-1.0.9-he6710b0_2.conda
19 | https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.conda
20 | https://repo.anaconda.com/pkgs/main/linux-64/cudatoolkit-10.2.89-hfd86e86_1.conda
21 | https://repo.anaconda.com/pkgs/main/linux-64/expat-2.4.4-h295c915_0.conda
22 | https://repo.anaconda.com/pkgs/main/linux-64/giflib-5.2.1-h7b6447c_0.conda
23 | https://repo.anaconda.com/pkgs/main/linux-64/gmp-6.2.1-h295c915_3.conda
24 | https://repo.anaconda.com/pkgs/main/linux-64/icu-58.2-he6710b0_3.conda
25 | https://repo.anaconda.com/pkgs/main/linux-64/jpeg-9e-h7f8727e_0.conda
26 | https://repo.anaconda.com/pkgs/main/linux-64/lame-3.100-h7b6447c_0.conda
27 | https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.conda
28 | https://repo.anaconda.com/pkgs/main/linux-64/libiconv-1.16-h7f8727e_2.conda
29 | https://repo.anaconda.com/pkgs/main/linux-64/libtasn1-4.16.0-h27cfd23_0.conda
30 | https://repo.anaconda.com/pkgs/main/linux-64/libunistring-0.9.10-h27cfd23_0.conda
31 | https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.0.3-h7f8727e_2.conda
32 | https://repo.anaconda.com/pkgs/main/linux-64/libuv-1.40.0-h7b6447c_0.conda
33 | https://repo.anaconda.com/pkgs/main/linux-64/libwebp-base-1.2.2-h7f8727e_0.conda
34 | https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.15-h7f8727e_0.conda
35 | https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.9.3-h295c915_1.conda
36 | https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.3-h7f8727e_2.conda
37 | https://repo.anaconda.com/pkgs/main/linux-64/ninja-base-1.10.2-hd09550d_5.conda
38 | https://repo.anaconda.com/pkgs/main/linux-64/openh264-2.1.1-h4ff587b_0.conda
39 | https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1o-h7f8727e_0.conda
40 | https://repo.anaconda.com/pkgs/main/linux-64/pcre-8.45-h295c915_0.conda
41 | https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7f8727e_1.conda
42 | https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.12-h7f8727e_2.conda
43 | https://repo.anaconda.com/pkgs/main/linux-64/glib-2.69.1-h4ff587b_1.conda
44 | https://repo.anaconda.com/pkgs/main/linux-64/libidn2-2.3.2-h7f8727e_0.conda
45 | https://repo.anaconda.com/pkgs/main/linux-64/libpng-1.6.37-hbc83047_0.conda
46 | https://repo.anaconda.com/pkgs/main/linux-64/libxml2-2.9.14-h74e7548_0.conda
47 | https://repo.anaconda.com/pkgs/main/linux-64/nettle-3.7.3-hbbd107a_1.conda
48 | https://repo.anaconda.com/pkgs/main/linux-64/readline-8.1.2-h7f8727e_1.conda
49 | https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.12-h1ccaba5_0.conda
50 | https://repo.anaconda.com/pkgs/main/linux-64/zstd-1.5.2-ha4553b6_0.conda
51 | https://repo.anaconda.com/pkgs/main/linux-64/dbus-1.13.18-hb2f20db_0.conda
52 | https://repo.anaconda.com/pkgs/main/linux-64/freetype-2.11.0-h70c0345_0.conda
53 | https://repo.anaconda.com/pkgs/main/linux-64/gnutls-3.6.15-he1e5248_0.conda
54 | https://repo.anaconda.com/pkgs/main/linux-64/gstreamer-1.14.0-h28cd5cc_2.conda
55 | https://repo.anaconda.com/pkgs/main/linux-64/libtiff-4.2.0-h2818925_1.conda
56 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.38.3-hc218d9a_0.conda
57 | https://conda.anaconda.org/pytorch/linux-64/ffmpeg-4.3-hf484d3e_0.tar.bz2
58 | https://repo.anaconda.com/pkgs/main/linux-64/fontconfig-2.13.1-h6c09931_0.conda
59 | https://repo.anaconda.com/pkgs/main/linux-64/gst-plugins-base-1.14.0-h8213a91_2.conda
60 | https://repo.anaconda.com/pkgs/main/linux-64/lcms2-2.12-h3be6417_0.conda
61 | https://repo.anaconda.com/pkgs/main/linux-64/libwebp-1.2.2-h55f646e_0.conda
62 | https://repo.anaconda.com/pkgs/main/linux-64/python-3.8.13-h12debd9_0.conda
63 | https://repo.anaconda.com/pkgs/main/linux-64/certifi-2022.5.18.1-py38h06a4308_0.conda
64 | https://repo.anaconda.com/pkgs/main/noarch/charset-normalizer-2.0.4-pyhd3eb1b0_0.conda
65 | https://repo.anaconda.com/pkgs/main/noarch/cycler-0.11.0-pyhd3eb1b0_0.conda
66 | https://repo.anaconda.com/pkgs/main/noarch/idna-3.3-pyhd3eb1b0_0.conda
67 | https://repo.anaconda.com/pkgs/main/noarch/joblib-1.1.0-pyhd3eb1b0_0.conda
68 | https://repo.anaconda.com/pkgs/main/linux-64/kiwisolver-1.4.2-py38h295c915_0.conda
69 | https://repo.anaconda.com/pkgs/main/noarch/munkres-1.1.4-py_0.conda
70 | https://repo.anaconda.com/pkgs/main/linux-64/ninja-1.10.2-h06a4308_5.conda
71 | https://repo.anaconda.com/pkgs/main/linux-64/pillow-9.0.1-py38h22f2fdc_0.conda
72 | https://repo.anaconda.com/pkgs/main/noarch/pycparser-2.21-pyhd3eb1b0_0.conda
73 | https://repo.anaconda.com/pkgs/main/noarch/pyparsing-3.0.4-pyhd3eb1b0_0.conda
74 | https://repo.anaconda.com/pkgs/main/linux-64/pysocks-1.7.1-py38h06a4308_0.conda
75 | https://repo.anaconda.com/pkgs/main/linux-64/pytz-2022.1-py38h06a4308_0.conda
76 | https://repo.anaconda.com/pkgs/main/linux-64/qt-5.9.7-h5867ecd_1.conda
77 | https://repo.anaconda.com/pkgs/main/linux-64/sip-4.19.13-py38h295c915_0.conda
78 | https://repo.anaconda.com/pkgs/main/noarch/six-1.16.0-pyhd3eb1b0_1.conda
79 | https://repo.anaconda.com/pkgs/main/noarch/threadpoolctl-2.2.0-pyh0d69192_0.conda
80 | https://repo.anaconda.com/pkgs/main/linux-64/tornado-6.1-py38h27cfd23_0.conda
81 | https://repo.anaconda.com/pkgs/main/linux-64/tqdm-4.64.0-py38h06a4308_0.conda
82 | https://repo.anaconda.com/pkgs/main/noarch/typing_extensions-4.1.1-pyh06a4308_0.conda
83 | https://repo.anaconda.com/pkgs/main/noarch/wheel-0.37.1-pyhd3eb1b0_0.conda
84 | https://repo.anaconda.com/pkgs/main/linux-64/cffi-1.15.0-py38hd667e15_1.conda
85 | https://repo.anaconda.com/pkgs/main/noarch/fonttools-4.25.0-pyhd3eb1b0_0.conda
86 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.4.0-py38h7f8727e_0.conda
87 | https://repo.anaconda.com/pkgs/main/noarch/packaging-21.3-pyhd3eb1b0_0.conda
88 | https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.9.2-py38h05f1152_4.conda
89 | https://repo.anaconda.com/pkgs/main/noarch/python-dateutil-2.8.2-pyhd3eb1b0_0.conda
90 | https://conda.anaconda.org/pytorch/linux-64/pytorch-1.9.0-py3.8_cuda10.2_cudnn7.6.5_0.tar.bz2
91 | https://repo.anaconda.com/pkgs/main/linux-64/setuptools-61.2.0-py38h06a4308_0.conda
92 | https://repo.anaconda.com/pkgs/main/linux-64/brotlipy-0.7.0-py38h27cfd23_1003.conda
93 | https://repo.anaconda.com/pkgs/main/linux-64/cryptography-37.0.1-py38h9ce1e76_0.conda
94 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.22.3-py38hf524024_0.conda
95 | https://repo.anaconda.com/pkgs/main/linux-64/pip-21.2.4-py38h06a4308_0.conda
96 | https://repo.anaconda.com/pkgs/main/noarch/pyopenssl-22.0.0-pyhd3eb1b0_0.conda
97 | https://repo.anaconda.com/pkgs/main/linux-64/urllib3-1.26.9-py38h06a4308_0.conda
98 | https://repo.anaconda.com/pkgs/main/noarch/requests-2.27.1-pyhd3eb1b0_0.conda
99 | https://conda.anaconda.org/pytorch/linux-64/torchtext-0.10.0-py38.tar.bz2
100 | https://repo.anaconda.com/pkgs/main/linux-64/bottleneck-1.3.4-py38hce1f21e_0.conda
101 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.5.1-py38h06a4308_1.conda
102 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-base-3.5.1-py38ha18d171_1.conda
103 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.3.1-py38hd3c417c_0.conda
104 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.2.2-py38h51133e4_0.conda
105 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.22.3-py38he7a7128_0.conda
106 | https://repo.anaconda.com/pkgs/main/linux-64/numexpr-2.8.1-py38h6abb31d_0.conda
107 | https://repo.anaconda.com/pkgs/main/linux-64/scipy-1.7.3-py38hc147768_0.conda
108 | https://conda.anaconda.org/pytorch/linux-64/torchaudio-0.9.0-py38.tar.bz2
109 | https://conda.anaconda.org/pytorch/noarch/torchvision-0.2.2-py_3.tar.bz2
110 | https://repo.anaconda.com/pkgs/main/linux-64/pandas-1.4.2-py38h295c915_0.conda
111 | https://repo.anaconda.com/pkgs/main/linux-64/scikit-learn-1.0.2-py38h51133e4_1.conda
112 |
--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from tqdm import tqdm
3 | from pipeline import translate_sentence
4 | import numpy as np
5 | from sklearn import metrics
6 | import torch
7 | import gc
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def evaluation_report(test_data, SRC, TRG, model, DEVICE):
13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
14 |
15 | modified_flags = []
16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
17 | all_words = sorted(all_words.iloc[:, 0].values)
18 |
19 | for idx, data in enumerate(tqdm(test_data)):
20 | # ------------------------------
21 | if idx % 5000 == 0:
22 | gc.collect()
23 | torch.cuda.empty_cache()
24 | # ------------------------------
25 |
26 | src = data.src
27 | trg = data.trg
28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
29 |
30 | src = ''.join(src)
31 | trg = ''.join(trg)
32 | pred = ''.join(translation)
33 |
34 | erroneous_words.append(src)
35 | correct_words.append(trg)
36 | predicted_words.append(pred)
37 |
38 | if trg == pred:
39 | flags.append(1)
40 | else:
41 | flags.append(0)
42 |
43 | if pred in all_words:
44 | modified_flags.append(1)
45 | else:
46 | modified_flags.append(0)
47 |
48 | evaluation_df = pd.DataFrame({
49 | 'Error': erroneous_words,
50 | 'Predicton': predicted_words,
51 | 'Target': correct_words,
52 | 'Correction': flags
53 | })
54 |
55 | corrected_instances = evaluation_df['Correction'].values.sum()
56 | total_instances = len(evaluation_df)
57 | accuracy = corrected_instances / total_instances
58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
59 |
60 | y_true = np.array(correct_words)
61 | y_pred = np.array(predicted_words)
62 |
63 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
64 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
67 | ACC = metrics.accuracy_score(y_true, y_pred)
68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
69 |
70 | print(f'''
71 | Top-1 (Greedy Decoding)
72 | Precision: {PR:.4f}
73 | Recall: {RE:.4f}
74 | F1 Score: {F1:.4f}
75 | F0.5 Score: {F05:.4f}
76 | Accuracy: {ACC * 100:.2f}%
77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
78 | ''')
79 |
80 | return evaluation_df
81 |
82 |
83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE):
84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
85 | words = []
86 |
87 | modified_flags = []
88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
89 | all_words = sorted(all_words.iloc[:, 0].values)
90 |
91 | for idx, data in enumerate(tqdm(test_data)):
92 | # ------------------------------
93 | if idx % 5000 == 0:
94 | gc.collect()
95 | torch.cuda.empty_cache()
96 | # ------------------------------
97 |
98 | src = data.src
99 | trg = data.trg
100 | word = data.word
101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
102 |
103 | src = ''.join(src)
104 | trg = ''.join(trg)
105 | pred = ''.join(translation)
106 | word = ''.join(word)
107 |
108 | erroneous_words.append(src)
109 | correct_words.append(trg)
110 | predicted_words.append(pred)
111 | words.append(word)
112 |
113 | if trg == pred:
114 | flags.append(1)
115 | else:
116 | flags.append(0)
117 |
118 | if pred in all_words:
119 | modified_flags.append(1)
120 | else:
121 | modified_flags.append(0)
122 |
123 | evaluation_df = pd.DataFrame({
124 | 'Error': erroneous_words, # Error
125 | 'Predicton': predicted_words, # ErrorBlanksPredD1
126 | 'Target': correct_words, # ErrorBlanksActual
127 | 'Word': words, # Word
128 | 'Correction': flags # EBP_Flag_D1
129 | })
130 |
131 | corrected_instances = evaluation_df['Correction'].values.sum()
132 | total_instances = len(evaluation_df)
133 | accuracy = corrected_instances / total_instances
134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
135 |
136 | y_true = np.array(correct_words)
137 | y_pred = np.array(predicted_words)
138 |
139 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
140 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
143 | ACC = metrics.accuracy_score(y_true, y_pred)
144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
145 |
146 | print(f'''
147 | Top-1 (Greedy Decoding)
148 | Precision: {PR:.4f}
149 | Recall: {RE:.4f}
150 | F1 Score: {F1:.4f}
151 | F0.5 Score: {F05:.4f}
152 | Accuracy: {ACC * 100:.2f}%
153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
154 | ''')
155 |
156 | return evaluation_df
157 |
158 |
159 |
160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE):
161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
162 | errors = []
163 | words = []
164 | ebpd1s = []
165 | ebpfd1s = []
166 |
167 | modified_flags = []
168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
169 | all_words = sorted(all_words.iloc[:, 0].values)
170 |
171 | for idx, data in enumerate(tqdm(test_data)):
172 | # ------------------------------
173 | if idx % 5000 == 0:
174 | gc.collect()
175 | torch.cuda.empty_cache()
176 | # ------------------------------
177 |
178 | src = data.src
179 | trg = data.trg
180 | error = data.error
181 | word = data.word
182 | ebpd1 = data.ebpd1
183 | ebpfd1 = data.ebpfd1
184 |
185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
186 |
187 | src = ''.join(src)
188 | trg = ''.join(trg)
189 | pred = ''.join(translation)
190 | error = ''.join(error)
191 | word = ''.join(word)
192 | ebpd1 = ''.join(ebpd1)
193 | ebpfd1 = ''.join(ebpfd1)
194 |
195 | erroneous_words.append(src)
196 | correct_words.append(trg)
197 | predicted_words.append(pred)
198 | errors.append(error)
199 | words.append(word)
200 | ebpd1s.append(ebpd1)
201 | ebpfd1s.append(ebpfd1)
202 |
203 | if trg == pred:
204 | flags.append(1)
205 | else:
206 | flags.append(0)
207 |
208 | if pred in all_words:
209 | modified_flags.append(1)
210 | else:
211 | modified_flags.append(0)
212 |
213 | # evaluation_df = pd.DataFrame({
214 | # 'Error': erroneous_words,
215 | # 'Predicton': predicted_words,
216 | # 'Target': correct_words,
217 | # 'Word': words,
218 | # 'Correction': flags
219 | # })
220 |
221 | evaluation_df = pd.DataFrame({
222 | 'Error': errors,
223 | 'Word': words,
224 | 'ErrorBlanksActual': correct_words,
225 | 'MaskErrorBlank': erroneous_words,
226 | 'ErrorBlanksPredD1': ebpd1s,
227 | 'EBP_Flag_D1': ebpfd1s,
228 | 'ErrorBlanksPredD2': predicted_words,
229 | 'EBP_Flag_D2': flags
230 | })
231 |
232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum()
233 | total_instances = len(evaluation_df)
234 | accuracy = corrected_instances / total_instances
235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
236 |
237 | y_true = np.array(correct_words)
238 | y_pred = np.array(predicted_words)
239 |
240 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
241 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
243 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
244 | ACC = metrics.accuracy_score(y_true, y_pred)
245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
246 |
247 | print(f'''
248 | Top-1 (Greedy Decoding)
249 | Precision: {PR:.4f}
250 | Recall: {RE:.4f}
251 | F1 Score: {F1:.4f}
252 | F0.5 Score: {F05:.4f}
253 | Accuracy: {ACC * 100:.2f}%
254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
255 | ''')
256 |
257 | return evaluation_df
258 |
259 |
260 |
261 |
262 | if __name__ == '__main__':
263 | pass
264 |
265 |
266 |
--------------------------------------------------------------------------------
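Note (not part of the repository): in these reports every distinct target word acts as its own class, so sklearn's weighted-average recall is exactly the word-level exact-match accuracy, while weighted precision and the F-scores can differ when a word is predicted for the wrong inputs. A small self-contained check of that behaviour, using toy words rather than repository data:

import numpy as np
from sklearn import metrics

y_true = np.array(['boi', 'kolom', 'kolom', 'khata'])
y_pred = np.array(['boi', 'kolom', 'khata', 'khata'])

# Exact-match accuracy and weighted recall coincide (0.75 here) ...
print(metrics.accuracy_score(y_true, y_pred))
print(metrics.recall_score(y_true, y_pred, average='weighted'))
# ... while weighted precision does not (0.875 here).
print(metrics.precision_score(y_true, y_pred, average='weighted'))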
/Baselines/DCSpell/metrics.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from tqdm import tqdm
3 | from pipeline import translate_sentence
4 | import numpy as np
5 | from sklearn import metrics
6 | import torch
7 | import gc
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def evaluation_report(test_data, SRC, TRG, model, DEVICE):
13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
14 |
15 | modified_flags = []
16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
17 | all_words = sorted(all_words.iloc[:, 0].values)
18 |
19 | for idx, data in enumerate(tqdm(test_data)):
20 | # ------------------------------
21 | if idx % 5000 == 0:
22 | gc.collect()
23 | torch.cuda.empty_cache()
24 | # ------------------------------
25 |
26 | src = data.src
27 | trg = data.trg
28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
29 |
30 | src = ''.join(src)
31 | trg = ''.join(trg)
32 | pred = ''.join(translation)
33 |
34 | erroneous_words.append(src)
35 | correct_words.append(trg)
36 | predicted_words.append(pred)
37 |
38 | if trg == pred:
39 | flags.append(1)
40 | else:
41 | flags.append(0)
42 |
43 | if pred in all_words:
44 | modified_flags.append(1)
45 | else:
46 | modified_flags.append(0)
47 |
48 | evaluation_df = pd.DataFrame({
49 | 'Error': erroneous_words,
50 | 'Predicton': predicted_words,
51 | 'Target': correct_words,
52 | 'Correction': flags
53 | })
54 |
55 | corrected_instances = evaluation_df['Correction'].values.sum()
56 | total_instances = len(evaluation_df)
57 | accuracy = corrected_instances / total_instances
58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
59 |
60 | y_true = np.array(correct_words)
61 | y_pred = np.array(predicted_words)
62 |
63 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
64 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
67 | ACC = metrics.accuracy_score(y_true, y_pred)
68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
69 |
70 | print(f'''
71 | Top-1 (Greedy Decoding)
72 | Precision: {PR:.4f}
73 | Recall: {RE:.4f}
74 | F1 Score: {F1:.4f}
75 | F0.5 Score: {F05:.4f}
76 | Accuracy: {ACC * 100:.2f}%
77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
78 | ''')
79 |
80 | return evaluation_df
81 |
82 |
83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE):
84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
85 | words = []
86 |
87 | modified_flags = []
88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
89 | all_words = sorted(all_words.iloc[:, 0].values)
90 |
91 | for idx, data in enumerate(tqdm(test_data)):
92 | # ------------------------------
93 | if idx % 5000 == 0:
94 | gc.collect()
95 | torch.cuda.empty_cache()
96 | # ------------------------------
97 |
98 | src = data.src
99 | trg = data.trg
100 | word = data.word
101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
102 |
103 | src = ''.join(src)
104 | trg = ''.join(trg)
105 | pred = ''.join(translation)
106 | word = ''.join(word)
107 |
108 | erroneous_words.append(src)
109 | correct_words.append(trg)
110 | predicted_words.append(pred)
111 | words.append(word)
112 |
113 | if trg == pred:
114 | flags.append(1)
115 | else:
116 | flags.append(0)
117 |
118 | if pred in all_words:
119 | modified_flags.append(1)
120 | else:
121 | modified_flags.append(0)
122 |
123 | evaluation_df = pd.DataFrame({
124 | 'Error': erroneous_words, # Error
125 | 'Predicton': predicted_words, # ErrorBlanksPredD1
126 | 'Target': correct_words, # ErrorBlanksActual
127 | 'Word': words, # Word
128 | 'Correction': flags # EBP_Flag_D1
129 | })
130 |
131 | corrected_instances = evaluation_df['Correction'].values.sum()
132 | total_instances = len(evaluation_df)
133 | accuracy = corrected_instances / total_instances
134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
135 |
136 | y_true = np.array(correct_words)
137 | y_pred = np.array(predicted_words)
138 |
139 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
140 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
143 | ACC = metrics.accuracy_score(y_true, y_pred)
144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
145 |
146 | print(f'''
147 | Top-1 (Greedy Decoding)
148 | Precision: {PR:.4f}
149 | Recall: {RE:.4f}
150 | F1 Score: {F1:.4f}
151 | F0.5 Score: {F05:.4f}
152 | Accuracy: {ACC * 100:.2f}%
153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
154 | ''')
155 |
156 | return evaluation_df
157 |
158 |
159 |
160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE):
161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
162 | errors = []
163 | words = []
164 | ebpd1s = []
165 | ebpfd1s = []
166 |
167 | modified_flags = []
168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
169 | all_words = sorted(all_words.iloc[:, 0].values)
170 |
171 | for idx, data in enumerate(tqdm(test_data)):
172 | # ------------------------------
173 | if idx % 5000 == 0:
174 | gc.collect()
175 | torch.cuda.empty_cache()
176 | # ------------------------------
177 |
178 | src = data.src
179 | trg = data.trg
180 | error = data.error
181 | word = data.word
182 | ebpd1 = data.ebpd1
183 | ebpfd1 = data.ebpfd1
184 |
185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
186 |
187 | src = ''.join(src)
188 | trg = ''.join(trg)
189 | pred = ''.join(translation)
190 | error = ''.join(error)
191 | word = ''.join(word)
192 | ebpd1 = ''.join(ebpd1)
193 | ebpfd1 = ''.join(ebpfd1)
194 |
195 | erroneous_words.append(src)
196 | correct_words.append(trg)
197 | predicted_words.append(pred)
198 | errors.append(error)
199 | words.append(word)
200 | ebpd1s.append(ebpd1)
201 | ebpfd1s.append(ebpfd1)
202 |
203 | if trg == pred:
204 | flags.append(1)
205 | else:
206 | flags.append(0)
207 |
208 | if pred in all_words:
209 | modified_flags.append(1)
210 | else:
211 | modified_flags.append(0)
212 |
213 | # evaluation_df = pd.DataFrame({
214 | # 'Error': erroneous_words,
215 | # 'Predicton': predicted_words,
216 | # 'Target': correct_words,
217 | # 'Word': words,
218 | # 'Correction': flags
219 | # })
220 |
221 | evaluation_df = pd.DataFrame({
222 | 'Error': errors,
223 | 'Word': words,
224 | 'ErrorBlanksActual': correct_words,
225 | 'MaskErrorBlank': erroneous_words,
226 | 'ErrorBlanksPredD1': ebpd1s,
227 | 'EBP_Flag_D1': ebpfd1s,
228 | 'ErrorBlanksPredD2': predicted_words,
229 | 'EBP_Flag_D2': flags
230 | })
231 |
232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum()
233 | total_instances = len(evaluation_df)
234 | accuracy = corrected_instances / total_instances
235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
236 |
237 | y_true = np.array(correct_words)
238 | y_pred = np.array(predicted_words)
239 |
240 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
241 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
243 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
244 | ACC = metrics.accuracy_score(y_true, y_pred)
245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
246 |
247 | print(f'''
248 | Top-1 (Greedy Decoding)
249 | Precision: {PR:.4f}
250 | Recall: {RE:.4f}
251 | F1 Score: {F1:.4f}
252 | F0.5 Score: {F05:.4f}
253 | Accuracy: {ACC * 100:.2f}%
254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
255 | ''')
256 |
257 | return evaluation_df
258 |
259 |
260 |
261 |
262 | if __name__ == '__main__':
263 | pass
264 |
265 |
266 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/metrics.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from tqdm import tqdm
3 | from pipeline import translate_sentence
4 | import numpy as np
5 | from sklearn import metrics
6 | import torch
7 | import gc
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def evaluation_report(test_data, SRC, TRG, model, DEVICE):
13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
14 |
15 | modified_flags = []
16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
17 | all_words = sorted(all_words.iloc[:, 0].values)
18 |
19 | for idx, data in enumerate(tqdm(test_data)):
20 | # ------------------------------
21 | if idx % 5000 == 0:
22 | gc.collect()
23 | torch.cuda.empty_cache()
24 | # ------------------------------
25 |
26 | src = data.src
27 | trg = data.trg
28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
29 |
30 | src = ''.join(src)
31 | trg = ''.join(trg)
32 | pred = ''.join(translation)
33 |
34 | erroneous_words.append(src)
35 | correct_words.append(trg)
36 | predicted_words.append(pred)
37 |
38 | if trg == pred:
39 | flags.append(1)
40 | else:
41 | flags.append(0)
42 |
43 | if pred in all_words:
44 | modified_flags.append(1)
45 | else:
46 | modified_flags.append(0)
47 |
48 | evaluation_df = pd.DataFrame({
49 | 'Error': erroneous_words,
50 | 'Predicton': predicted_words,
51 | 'Target': correct_words,
52 | 'Correction': flags
53 | })
54 |
55 | corrected_instances = evaluation_df['Correction'].values.sum()
56 | total_instances = len(evaluation_df)
57 | accuracy = corrected_instances / total_instances
58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
59 |
60 | y_true = np.array(correct_words)
61 | y_pred = np.array(predicted_words)
62 |
63 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
64 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
67 | ACC = metrics.accuracy_score(y_true, y_pred)
68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
69 |
70 | print(f'''
71 | Top-1 (Greedy Decoding)
72 | Precision: {PR:.4f}
73 | Recall: {RE:.4f}
74 | F1 Score: {F1:.4f}
75 | F0.5 Score: {F05:.4f}
76 | Accuracy: {ACC * 100:.2f}%
77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
78 | ''')
79 |
80 | return evaluation_df
81 |
82 |
83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE):
84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
85 | words = []
86 |
87 | modified_flags = []
88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
89 | all_words = sorted(all_words.iloc[:, 0].values)
90 |
91 | for idx, data in enumerate(tqdm(test_data)):
92 | # ------------------------------
93 | if idx % 5000 == 0:
94 | gc.collect()
95 | torch.cuda.empty_cache()
96 | # ------------------------------
97 |
98 | src = data.src
99 | trg = data.trg
100 | word = data.word
101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
102 |
103 | src = ''.join(src)
104 | trg = ''.join(trg)
105 | pred = ''.join(translation)
106 | word = ''.join(word)
107 |
108 | erroneous_words.append(src)
109 | correct_words.append(trg)
110 | predicted_words.append(pred)
111 | words.append(word)
112 |
113 | if trg == pred:
114 | flags.append(1)
115 | else:
116 | flags.append(0)
117 |
118 | if pred in all_words:
119 | modified_flags.append(1)
120 | else:
121 | modified_flags.append(0)
122 |
123 | evaluation_df = pd.DataFrame({
124 | 'Error': erroneous_words, # Error
125 | 'Predicton': predicted_words, # ErrorBlanksPredD1
126 | 'Target': correct_words, # ErrorBlanksActual
127 | 'Word': words, # Word
128 | 'Correction': flags # EBP_Flag_D1
129 | })
130 |
131 | corrected_instances = evaluation_df['Correction'].values.sum()
132 | total_instances = len(evaluation_df)
133 | accuracy = corrected_instances / total_instances
134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
135 |
136 | y_true = np.array(correct_words)
137 | y_pred = np.array(predicted_words)
138 |
139 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
140 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
143 | ACC = metrics.accuracy_score(y_true, y_pred)
144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
145 |
146 | print(f'''
147 | Top-1 (Greedy Decoding)
148 | Precision: {PR:.4f}
149 | Recall: {RE:.4f}
150 | F1 Score: {F1:.4f}
151 | F0.5 Score: {F05:.4f}
152 | Accuracy: {ACC * 100:.2f}%
153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
154 | ''')
155 |
156 | return evaluation_df
157 |
158 |
159 |
160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE):
161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
162 | errors = []
163 | words = []
164 | ebpd1s = []
165 | ebpfd1s = []
166 |
167 | modified_flags = []
168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
169 | all_words = sorted(all_words.iloc[:, 0].values)
170 |
171 | for idx, data in enumerate(tqdm(test_data)):
172 | # ------------------------------
173 | if idx % 5000 == 0:
174 | gc.collect()
175 | torch.cuda.empty_cache()
176 | # ------------------------------
177 |
178 | src = data.src
179 | trg = data.trg
180 | error = data.error
181 | word = data.word
182 | ebpd1 = data.ebpd1
183 | ebpfd1 = data.ebpfd1
184 |
185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
186 |
187 | src = ''.join(src)
188 | trg = ''.join(trg)
189 | pred = ''.join(translation)
190 | error = ''.join(error)
191 | word = ''.join(word)
192 | ebpd1 = ''.join(ebpd1)
193 | ebpfd1 = ''.join(ebpfd1)
194 |
195 | erroneous_words.append(src)
196 | correct_words.append(trg)
197 | predicted_words.append(pred)
198 | errors.append(error)
199 | words.append(word)
200 | ebpd1s.append(ebpd1)
201 | ebpfd1s.append(ebpfd1)
202 |
203 | if trg == pred:
204 | flags.append(1)
205 | else:
206 | flags.append(0)
207 |
208 | if pred in all_words:
209 | modified_flags.append(1)
210 | else:
211 | modified_flags.append(0)
212 |
213 | # evaluation_df = pd.DataFrame({
214 | # 'Error': erroneous_words,
215 | # 'Predicton': predicted_words,
216 | # 'Target': correct_words,
217 | # 'Word': words,
218 | # 'Correction': flags
219 | # })
220 |
221 | evaluation_df = pd.DataFrame({
222 | 'Error': errors,
223 | 'Word': words,
224 | 'ErrorBlanksActual': correct_words,
225 | 'MaskErrorBlank': erroneous_words,
226 | 'ErrorBlanksPredD1': ebpd1s,
227 | 'EBP_Flag_D1': ebpfd1s,
228 | 'ErrorBlanksPredD2': predicted_words,
229 | 'EBP_Flag_D2': flags
230 | })
231 |
232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum()
233 | total_instances = len(evaluation_df)
234 | accuracy = corrected_instances / total_instances
235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
236 |
237 | y_true = np.array(correct_words)
238 | y_pred = np.array(predicted_words)
239 |
240 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
241 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
243 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
244 | ACC = metrics.accuracy_score(y_true, y_pred)
245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
246 |
247 | print(f'''
248 | Top-1 (Greedy Decoding)
249 | Precision: {PR:.4f}
250 | Recall: {RE:.4f}
251 | F1 Score: {F1:.4f}
252 | F0.5 Score: {F05:.4f}
253 | Accuracy: {ACC * 100:.2f}%
254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
255 | ''')
256 |
257 | return evaluation_df
258 |
259 |
260 |
261 |
262 | if __name__ == '__main__':
263 | pass
264 |
265 |
266 |
--------------------------------------------------------------------------------
/detector.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | word2char, basic_tokenizer, count_parameters, initialize_weights,
3 | save_model, load_model, error_df, train_valid_test_df, mask2str,
4 | error_df_2, error_df_3, merge_dfs
5 | )
6 | from transformer import (
7 | Encoder, EncoderLayer, MultiHeadAttentionLayer,
8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer,
9 | Seq2Seq
10 | )
11 | from pipeline import train, evaluate
12 | from metrics import evaluation_report, evaluation_report2
13 |
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
17 | import torch
18 | import torch.nn as nn
19 | import os
20 | import gc
21 | import sys
22 | import argparse
23 |
24 | import warnings as wrn
25 | wrn.filterwarnings('ignore')
26 |
27 |
28 | def main():
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus.csv",
31 | choices=[
32 | "./Dataset/corpus.csv", # Bangla SEC parallel corpus
33 | "./Dataset/corpus2.csv", # Bangla SEC parallel corpus for running test
34 | "./Dataset/Hindi/corpus_hindi.csv",
35 | "./Dataset/Telugu/corpus_telugu.csv",
36 | "./Dataset/Hindi/corpus_hindi_enhanced.csv",
37 | "./Dataset/Telugu/corpus_telugu_enhanced.csv"
38 | ]
39 | )
40 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256])
41 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7])
42 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7])
43 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
44 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
45 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
46 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256])
47 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
48 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
49 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10])
50 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
51 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
52 | args = parser.parse_args()
53 |
54 | SEED = 1234
55 | torch.manual_seed(SEED)
56 | torch.cuda.manual_seed(SEED)
57 |
58 | # df = pd.read_csv('./Dataset/sec_dataset_III_v3_new_masked_b.csv')
59 | # df = pd.read_csv('./Dataset/corpus.csv')
60 | df = pd.read_csv(args.CORPUS)
61 | df['Word'] = df['Word'].apply(word2char)
62 | df['Error'] = df['Error'].apply(word2char)
63 | df['Mask'] = df['Mask'].apply(mask2str)
64 | df['Mask'] = df['Mask'].apply(word2char)
65 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(mask2str)
66 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(word2char)
67 | df = df.sample(frac=1).reset_index(drop=True)
68 | # df = df.iloc[:, [4, 1, 2]]
69 | df = df[['ErrorBlanks', 'Error', 'ErrorType']]
70 |
71 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05)
72 |
73 | train_df.to_csv('./Dataset/train.csv', index=False)
74 | valid_df.to_csv('./Dataset/valid.csv', index=False)
75 | test_df.to_csv('./Dataset/test.csv', index=False)
76 |
77 | SRC = Field(
78 | tokenize=basic_tokenizer, lower=False,
79 | init_token='<sos>', eos_token='<eos>', batch_first=True
80 | )
81 | TRG = Field(
82 | tokenize=basic_tokenizer, lower=False,
83 | init_token='<sos>', eos_token='<eos>', batch_first=True
84 | )
85 | WORD = Field(
86 | tokenize=basic_tokenizer, lower=False,
87 | init_token='<sos>', eos_token='<eos>', batch_first=True
88 | )
89 | fields = {
90 | 'Error': ('src', SRC),
91 | 'ErrorBlanks': ('trg', TRG)
92 | }
93 |
94 | train_data, valid_data, test_data = TabularDataset.splits(
95 | path='./Dataset',
96 | train='train.csv',
97 | validation='valid.csv',
98 | test='test.csv',
99 | format='csv',
100 | fields=fields
101 | )
102 |
103 | SRC.build_vocab(train_data, min_freq=100)
104 | TRG.build_vocab(train_data, min_freq=50)
105 | WORD.build_vocab(train_data, min_freq=100)
106 |
107 | # ------------------------------
108 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
109 | BATCH_SIZE = 512
110 | # ------------------------------
111 | INPUT_DIM = len(SRC.vocab)
112 | OUTPUT_DIM = len(TRG.vocab)
113 | # ------------------------------
114 | HID_DIM = int(args.HID_DIM)
115 | ENC_LAYERS = int(args.ENC_LAYERS)
116 | DEC_LAYERS = int(args.DEC_LAYERS)
117 | ENC_HEADS = int(args.ENC_HEADS)
118 | DEC_HEADS = int(args.DEC_HEADS)
119 | ENC_PF_DIM = int(args.ENC_PF_DIM)
120 | DEC_PF_DIM = int(args.DEC_PF_DIM)
121 | ENC_DROPOUT = float(args.ENC_DROPOUT)
122 | DEC_DROPOUT = float(args.DEC_DROPOUT)
123 | CLIP = float(args.CLIP)
124 | N_EPOCHS = int(args.N_EPOCHS)
125 | LEARNING_RATE = float(args.LEARNING_RATE)
126 | # ------------------------------
127 | PATH = './Checkpoints/detector.pth'
128 | # ------------------------------
129 | gc.collect()
130 | torch.cuda.empty_cache()
131 | # -----------------------------
132 |
133 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
134 | (train_data, valid_data, test_data),
135 | batch_size=BATCH_SIZE,
136 | sort_within_batch=True,
137 | sort_key=lambda x: len(x.src),
138 | device=DEVICE
139 | )
140 |
141 | enc = Encoder(
142 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
143 | ENC_DROPOUT, DEVICE
144 | )
145 | dec = Decoder(
146 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
147 | DEC_DROPOUT, DEVICE
148 | )
149 |
150 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
151 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
152 |
153 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)
154 | model.apply(initialize_weights)
155 | # print(f'The model has {count_parameters(model):,} trainable parameters')
156 |
157 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
158 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
159 |
160 | epoch = 1
161 | best_loss = 1e10
162 | if os.path.exists(PATH):
163 | checkpoint, epoch, train_loss = load_model(model, PATH)
164 | best_loss = train_loss
165 |
166 | for epoch in range(epoch, N_EPOCHS):
167 | print(f"Epoch: {epoch} / {N_EPOCHS}")
168 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
169 | print(f"Train Loss: {train_loss:.4f}")
170 | if train_loss < best_loss:
171 | best_loss = train_loss
172 | save_model(model, train_loss, epoch, PATH)
173 |
174 | # ---------------------
175 | # eval_df = evaluation_report(test_data, SRC, TRG, model, DEVICE)
176 | # ---------------------
177 | error_types = [
178 | 'Homonym Error', # 123
179 | 'Typo Deletion', # 115767
180 | 'Typo (Avro) Substituition', # 119573
181 | 'Typo (Bijoy) Substituition', # 119864
182 | 'Cognitive Error', # 108227
183 | 'Run-on Error', # 124895
184 | 'Split-word Error (Left)', # 62890
185 | 'Split-word Error (Random)', # 124895
186 | 'Split-word Error (Right)', # 13985
187 | 'Split-word Error (both)', # 12800
188 | 'Typo Insertion', # 124807
189 | 'Typo Transposition', # 123245
190 | 'Visual Error', # 117391
191 | 'Visual Error (Combined Character)' # 17617
192 | ]
193 | # ---------------------
194 | # df = pd.read_csv('./Dataset/sec_dataset_III_v3_new_masked_b.csv')
195 | df = pd.read_csv('./Dataset/corpus.csv')
196 | df['Word'] = df['Word'].apply(word2char)
197 | df['Error'] = df['Error'].apply(word2char)
198 | df['Mask'] = df['Mask'].apply(mask2str)
199 | df['Mask'] = df['Mask'].apply(word2char)
200 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(mask2str)
201 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(word2char)
202 | df = df.sample(frac=1).reset_index(drop=True)
203 | # df = df.iloc[:, [0, 1, -2, 2]]
204 | df = df[['Word', 'Error', 'ErrorBlanks', 'ErrorType']]
205 |
206 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=1./1e10, valid_size=1./1e10)
207 |
208 | train_df.to_csv('./Dataset/train.csv', index=False)
209 | valid_df.to_csv('./Dataset/valid.csv', index=False)
210 | test_df.to_csv('./Dataset/test.csv', index=False)
211 | # ---------------------
212 | for error_name in error_types:
213 | print(f'------\nError Type: {error_name}\n------')
214 | error_df_3(df, error_name)
215 |
216 | fields = {
217 | 'Error': ('src', SRC),
218 | 'ErrorBlanks': ('trg', TRG),
219 | 'Word': ('word', WORD)
220 | }
221 |
222 | error_data, _ = TabularDataset.splits(
223 | path='./Dataset',
224 | train='error.csv',
225 | test='error.csv',
226 | format='csv',
227 | fields=fields
228 | )
229 |
230 | eval_df = evaluation_report2(error_data, SRC, TRG, WORD, model, DEVICE)
231 | eval_df['ErrorType'] = [error_name for _ in range(len(eval_df))]
232 |
233 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
234 | eval_df.to_csv(f'./Dataframes/detector_{error_name}.csv', index=False)
235 | print('\n\n')
236 | # ---------------------
237 | merge_dfs(network='detector')
238 | # ---------------------
239 |
240 |
241 | if __name__ == '__main__':
242 | main()
243 |
--------------------------------------------------------------------------------
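Note (not part of the repository): detector.py, like the other entry points, resumes from ./Checkpoints/detector.pth through the save_model/load_model helpers imported from utils.py. A minimal sketch of the checkpoint contract those calls assume (the exact keys used in utils.py may differ):

import torch

def save_model(model, train_loss, epoch, path):
    # Persist the weights together with the bookkeeping values needed to resume.
    torch.save({
        'model_state_dict': model.state_dict(),
        'epoch': epoch,
        'train_loss': train_loss
    }, path)

def load_model(model, path):
    # Restore the weights in place and hand back (checkpoint, epoch, train_loss),
    # matching the unpacking pattern used in main().
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    return checkpoint, checkpoint['epoch'], checkpoint['train_loss']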