├── Baselines ├── ConvSeq2Seq │ ├── Dataframes │ │ └── dfs.txt │ ├── Checkpoints │ │ └── checkpoints.txt │ ├── Dataset │ │ └── kalpurush.ttf │ ├── errors.py │ ├── README.md │ ├── pipeline.py │ ├── utils.py │ ├── metrics.py │ ├── models.py │ └── main.py ├── DCSpell │ ├── Dataframes │ │ └── dfs.txt │ ├── Checkpoints │ │ └── checkpoints.txt │ ├── Dataset │ │ └── kalpurush.ttf │ ├── README.md │ ├── process.py │ ├── pipeline.py │ ├── corrector.py │ ├── utils.py │ └── metrics.py ├── DTransformer │ ├── Dataframes │ │ └── dfs.txt │ ├── Checkpoints │ │ └── checkpoints.txt │ ├── Dataset │ │ └── kalpurush.ttf │ ├── README.md │ ├── process.py │ ├── focalLoss.py │ ├── pipeline.py │ ├── utils.py │ ├── dtransformer.py │ └── metrics.py ├── GRUSeq2Seq │ ├── Corrections │ │ └── corpora.txt │ ├── Dataset │ │ └── kalpurush.ttf │ ├── Checkpoints │ │ └── temp.txt │ ├── README.md │ ├── metrics.py │ ├── check.py │ ├── decoding.py │ ├── focalLoss.py │ ├── inference.py │ ├── pipeline.py │ ├── models.py │ ├── utils.py │ ├── errors.py │ └── main.py ├── RuleBased │ ├── Dataset │ │ └── kalpurush.ttf │ ├── README.md │ └── rule_based.py └── README.md ├── Dataframes └── dataframesGeneratedByModels.txt ├── Dataset ├── Hindi │ └── HindiCorpusFromAnotherPaper.txt ├── Bangla │ └── BanglaCorpusFromAnotherPaper.txt └── Telugu │ └── TeluguCorpusFromAnotherPaper.txt ├── figures └── DPCSpell.jpg ├── Checkpoints ├── checkpoints.txt └── Checkpoints.md ├── CorpusCreation ├── README.md ├── scraper.py └── corpus_stats_valid.py ├── LICENSE ├── process.py ├── focalLoss.py ├── README.md ├── pipeline.py ├── corrector.py ├── utils.py ├── Requirements └── requirements_u.yml ├── metrics.py └── detector.py /Baselines/ConvSeq2Seq/Dataframes/dfs.txt: -------------------------------------------------------------------------------- 1 | dfs -------------------------------------------------------------------------------- /Baselines/DCSpell/Dataframes/dfs.txt: -------------------------------------------------------------------------------- 1 | dfs -------------------------------------------------------------------------------- /Baselines/DTransformer/Dataframes/dfs.txt: -------------------------------------------------------------------------------- 1 | dfs -------------------------------------------------------------------------------- /Dataframes/dataframesGeneratedByModels.txt: -------------------------------------------------------------------------------- 1 | https:// -------------------------------------------------------------------------------- /Dataset/Hindi/HindiCorpusFromAnotherPaper.txt: -------------------------------------------------------------------------------- 1 | https:// -------------------------------------------------------------------------------- /Dataset/Bangla/BanglaCorpusFromAnotherPaper.txt: -------------------------------------------------------------------------------- 1 | https:// -------------------------------------------------------------------------------- /Dataset/Telugu/TeluguCorpusFromAnotherPaper.txt: -------------------------------------------------------------------------------- 1 | https:// -------------------------------------------------------------------------------- /Baselines/DCSpell/Checkpoints/checkpoints.txt: -------------------------------------------------------------------------------- 1 | checkpoints 2 | -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/Checkpoints/checkpoints.txt: 
-------------------------------------------------------------------------------- 1 | checkpoints 2 | -------------------------------------------------------------------------------- /Baselines/DTransformer/Checkpoints/checkpoints.txt: -------------------------------------------------------------------------------- 1 | checkpoints 2 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/Corrections/corpora.txt: -------------------------------------------------------------------------------- 1 | corpus.csv 2 | corpus2.csv -------------------------------------------------------------------------------- /figures/DPCSpell.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/figures/DPCSpell.jpg -------------------------------------------------------------------------------- /Baselines/DCSpell/Dataset/kalpurush.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/DCSpell/Dataset/kalpurush.ttf -------------------------------------------------------------------------------- /Baselines/RuleBased/Dataset/kalpurush.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/RuleBased/Dataset/kalpurush.ttf -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/Dataset/kalpurush.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/ConvSeq2Seq/Dataset/kalpurush.ttf -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/Dataset/kalpurush.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/GRUSeq2Seq/Dataset/kalpurush.ttf -------------------------------------------------------------------------------- /Baselines/DTransformer/Dataset/kalpurush.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/DTransformer/Dataset/kalpurush.ttf -------------------------------------------------------------------------------- /Checkpoints/checkpoints.txt: -------------------------------------------------------------------------------- 1 | Download checkpoints from the following link: 2 | https://drive.google.com/drive/folders/1prH28CiedKmhDmh3lOqquByQQTD8DN2d?usp=share_link 3 | -------------------------------------------------------------------------------- /Checkpoints/Checkpoints.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |

Download Checkpoints

8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/Checkpoints/temp.txt: -------------------------------------------------------------------------------- 1 | Top1 Acc: 0.5088253438742403 2 | Top2 Acc: 0.13197459214915688 3 | Top3 Acc: 0.10706370241740164 4 | Accuracy: 0.6364529543481241 5 | 100%|██████████| 175064/175064 [1:04:31<00:00, 45.21it/s] 6 | Modified Top1 Acc: 0.6128444454599461 7 | 8 | Process finished with exit code 0 9 | -------------------------------------------------------------------------------- /CorpusCreation/README.md: -------------------------------------------------------------------------------- 1 |

Corpus Creation

2 | 3 | ### Word Accumulation 4 | ``` 5 | python scraper.py 6 | ``` 7 | 8 | ### Error Annexation 9 | ``` 10 | python errors.py 11 | ``` 12 | 13 | ### Error Filtration 14 | 15 | 16 | 17 | ### Corpus Statistic and Error Percentage Validation 18 | ``` 19 | python corpus_stats_valid.py --email "username@gmail.com" --password "facebook_password" 20 | ``` 21 | -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/errors.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from utils import word2char 3 | 4 | 5 | def error_df(df, error='Cognitive Error'): 6 | df = df.loc[df['ErrorType'] == error] 7 | df['Word'] = df['Word'].apply(word2char) 8 | df['Error'] = df['Error'].apply(word2char) 9 | df = df.sample(frac=1).reset_index(drop=True) 10 | df = df.iloc[:, [1, 0]] 11 | df.to_csv('./Dataset/error.csv', index=False) 12 | 13 | -------------------------------------------------------------------------------- /Baselines/RuleBased/README.md: -------------------------------------------------------------------------------- 1 |

RuleBased

2 | 3 | ## Activate the Environment 4 | ``` 5 | conda activate DPCSpell 6 | ``` 7 | 8 |
9 | 10 | ## Prepare SEC Corpora 11 | ``` 12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder 13 | ``` 14 |

15 | or manually download the folder from here and place the extracted files in ./Dataset/ 16 |

17 | 18 |
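The rule-based corrector itself (rule_based.py) is not reproduced in this dump, so the sketch below only illustrates the dictionary-lookup idea behind such a baseline. It assumes a word list like ./Dataset/allDictWords_df.csv, the file the neural baselines read for their modified-accuracy metric; the actual script may work quite differently.

```
# Illustrative dictionary-lookup correction, NOT the repository's rule_based.py.
# Assumes ./Dataset/allDictWords_df.csv with one valid word per row (first column).
import difflib
import pandas as pd

dictionary = sorted(pd.read_csv('./Dataset/allDictWords_df.csv').iloc[:, 0].values)

def suggest(word, n=3, cutoff=0.6):
    # Return up to n dictionary entries closest to the (possibly misspelled) word.
    return difflib.get_close_matches(word, dictionary, n=n, cutoff=cutoff)

print(suggest('বাংলা'))
```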
19 | 20 | ## Training and Evaluation of RuleBased 21 | ``` 22 | python rule_based.py 23 | ``` 24 | -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/README.md: -------------------------------------------------------------------------------- 1 |

ConvSeq2Seq

2 | 3 | ## Activate the Environment 4 | ``` 5 | conda activate DPCSpell 6 | ``` 7 | 8 |
9 | 10 | ## Prepare SEC Corpora 11 | ``` 12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder 13 | ``` 14 |

15 | or manually download the folder from here and place the extracted files in ./Dataset/ 16 |

17 | 18 |
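main.py itself is not included in this section, only the flags it accepts in the command below. A minimal argparse sketch consistent with those flags is shown here purely for orientation; the real entry point may define them differently.

```
# Illustrative argument parsing matching the flags of the training command below
# (an assumption for orientation only; the actual main.py is not shown here).
import argparse

parser = argparse.ArgumentParser(description='ConvSeq2Seq spelling error correction')
parser.add_argument('--CORPUS', type=str, default='./Dataset/corpus.csv')
parser.add_argument('--EMB_DIM', type=int, default=128)
parser.add_argument('--ENC_LAYERS', type=int, default=5)
parser.add_argument('--DEC_LAYERS', type=int, default=5)
parser.add_argument('--ENC_KERNEL_SIZE', type=int, default=3)
parser.add_argument('--DEC_KERNEL_SIZE', type=int, default=3)
parser.add_argument('--ENC_DROPOUT', type=float, default=0.2)
parser.add_argument('--DEC_DROPOUT', type=float, default=0.2)
parser.add_argument('--CLIP', type=float, default=0.1)
parser.add_argument('--BATCH_SIZE', type=int, default=256)
parser.add_argument('--LEARNING_RATE', type=float, default=0.0005)
parser.add_argument('--N_EPOCHS', type=int, default=100)
print(parser.parse_args())
```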
19 | 20 | ## Training and Evaluation of ConvSeq2Seq 21 | ``` 22 | python main.py --CORPUS "./Dataset/corpus.csv" --EMB_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_KERNEL_SIZE 3 --DEC_KERNEL_SIZE 3 --ENC_DROPOUT 0.2 --DEC_DROPOUT 0.2 --CLIP 0.1 --BATCH_SIZE 256 --LEARNING_RATE 0.0005 --N_EPOCHS 100 23 | ``` 24 | -------------------------------------------------------------------------------- /Baselines/DTransformer/README.md: -------------------------------------------------------------------------------- 1 |

DTransformer

2 | 3 | ## Activate the Environment 4 | ``` 5 | conda activate DPCSpell 6 | ``` 7 | 8 |
9 | 10 | ## Prepare SEC Corpora 11 | ``` 12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder 13 | ``` 14 |

15 | or manually download the folder from here and place the extracted files in ./Dataset/ 16 |

17 | 18 |
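One constraint worth knowing before editing the command below: in a standard multi-head attention layer the hidden size must be divisible by the number of heads, so HID_DIM 128 with 8 encoder/decoder heads gives 16-dimensional heads. A tiny sanity check (illustrative only, not part of the repository):

```
# Sanity-check the transformer hyperparameters before launching training
# (standard multi-head attention requires HID_DIM % HEADS == 0).
HID_DIM, ENC_HEADS, DEC_HEADS = 128, 8, 8
for heads in (ENC_HEADS, DEC_HEADS):
    assert HID_DIM % heads == 0, 'hidden size must be divisible by the head count'
print('per-head dimension:', HID_DIM // ENC_HEADS)  # 16
```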
19 | 20 | ## Training and Evaluation of DTransformer 21 | ``` 22 | python dtransformer.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100 23 | ``` 24 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/README.md: -------------------------------------------------------------------------------- 1 |

GRUSeq2Seq

2 | 3 | ## Activate the Environment 4 | ``` 5 | conda activate DPCSpell 6 | ``` 7 | 8 |
9 | 10 | ## Prepare SEC Corpora 11 | ``` 12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder 13 | ``` 14 |

15 | or manually download the folder from here and place the extracted files in ./Dataset/ 16 |

17 | 18 |
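This baseline also ships a focal-loss implementation (focalLoss.py, reproduced later in this dump). Whether main.py trains with it or with plain cross-entropy is not visible here, but a minimal usage sketch of that module, assuming it is run from inside Baselines/GRUSeq2Seq/, looks like this:

```
# Minimal usage sketch of FocalLoss from the local focalLoss.py module.
import torch
from focalLoss import FocalLoss

criterion = FocalLoss(alpha=0.25, gamma=2.0, reduction='mean')
logits = torch.randn(8, 66)           # [batch, number of character classes]
targets = torch.randint(0, 66, (8,))  # int64 class indices, as the loss expects
print(criterion(logits, targets).item())
```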
19 | 20 | ## Training and Evaluation of GRUSeq2Seq 21 | ``` 22 | python main.py --CORPUS "./Dataset/corpus.csv" --ENC_EMB_DIM 128 --DEC_EMB_DIM 128 --ENC_HIDDEN_DIM 256 --DEC_HIDDEN_DIM 512 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --MAX_LEN 48 --CLIP 1 --BATCH_SIZE 256 --LEARNING_RATE 0.0005 --N_EPOCHS 100 23 | ``` 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Mehedi Hasan Bijoy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Baselines/DCSpell/README.md: -------------------------------------------------------------------------------- 1 |

DCSpell

2 | 3 | ## Activate the Environment 4 | ``` 5 | conda activate DPCSpell 6 | ``` 7 | 8 |
9 | 10 | ## Prepare SEC Corpora 11 | ``` 12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder 13 | ``` 14 |

15 | or manually download the folder from here and place the extracted files in ./Dataset/ 16 |

17 | 18 |
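DCSpell splits correction into two stages: a detector that tags which character positions of a word look wrong, and a corrector that rewrites the word given that evidence, as the two commands in the next section reflect. The character-level masks come from routines such as check_from_left and check_from_right in process.py (reproduced later in this dump); here is a worked example on the word/error pair used in that file's __main__ block, with logic equivalent to those functions:

```
# Error-mask construction, logically equivalent to check_from_left / check_from_right
# in process.py; the outputs below were verified by hand for this example pair.
word  = ['ব', 'া', 'ং', 'ল', 'া']   # correct word (5 characters)
error = ['ব', 'ং', 'ল', 'া']        # misspelling with one character dropped

left = [0 if error[i] == word[i] else 1 for i in range(len(error))]
print(left)    # [0, 1, 1, 1]: scanned left to right, every position after the drop mismatches

w_rev, e_rev = word[::-1], error[::-1]
right = [0 if e_rev[i] == w_rev[i] else 1 for i in range(len(error))]
right.reverse()
print(right)   # [1, 0, 0, 0]: scanned right to left, the mismatch is localised near the start
```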
19 | 20 | 21 | ## Training and Evaluation of DCSpell 22 | 23 | ### Detector Network 24 | 25 | ``` 26 | python detector.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100 27 | ``` 28 | 29 | ### Corrector Network 30 | 31 | ``` 32 | python corrector.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100 33 | ``` 34 | -------------------------------------------------------------------------------- /Baselines/README.md: -------------------------------------------------------------------------------- 1 |

Baselines

| Method | Source Code |
| ------------- | ------------- |
| RuleBased | DPCSpell/Baselines/RuleBased |
| GRUSeq2Seq | DPCSpell/Baselines/GRUSeq2Seq |
| ConvSeq2Seq | DPCSpell/Baselines/ConvSeq2Seq |
| DTransformer | DPCSpell/Baselines/DTransformer |
| DCSpell | DPCSpell/Baselines/DCSpell |
36 | -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/pipeline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | 4 | 5 | def train(model, iterator, optimizer, criterion, clip): 6 | model.train() 7 | epoch_loss = 0 8 | for idx, batch in enumerate(tqdm(iterator)): 9 | src = batch.src 10 | trg = batch.trg 11 | optimizer.zero_grad() 12 | output, _ = model(src, trg[:, :-1]) 13 | output_dim = output.shape[-1] 14 | output = output.contiguous().view(-1, output_dim) 15 | trg = trg[:, 1:].contiguous().view(-1) 16 | loss = criterion(output, trg) 17 | loss.backward() 18 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip) 19 | optimizer.step() 20 | epoch_loss += loss.item() 21 | return epoch_loss / len(iterator) 22 | 23 | 24 | def evaluate(model, iterator, criterion): 25 | model.eval() 26 | epoch_loss = 0 27 | with torch.no_grad(): 28 | for idx, batch in enumerate(tqdm(iterator)): 29 | src = batch.src 30 | trg = batch.trg 31 | output, _ = model(src, trg[:, :-1]) 32 | output_dim = output.shape[-1] 33 | output = output.contiguous().view(-1, output_dim) 34 | trg = trg[:, 1:].contiguous().view(-1) 35 | loss = criterion(output, trg) 36 | epoch_loss += loss.item() 37 | return epoch_loss / len(iterator) 38 | 39 | 40 | if __name__ == '__main__': 41 | pass 42 | -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def basic_tokenizer(text): 5 | return text.split() 6 | 7 | 8 | def word2char(word): 9 | w2c = [char for char in word] 10 | return ' '.join(w2c) 11 | 12 | 13 | def count_parameters(model): 14 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 15 | 16 | 17 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=30): 18 | model.eval() 19 | tokens = [src_field.init_token] + sentence + [src_field.eos_token] 20 | src_indexes = [src_field.vocab.stoi[token] for token in tokens] 21 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device) 22 | with torch.no_grad(): 23 | encoder_conved, encoder_combined = model.encoder(src_tensor) 24 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]] 25 | for i in range(max_len): 26 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device) 27 | with torch.no_grad(): 28 | output, attention = model.decoder(trg_tensor, encoder_conved, encoder_combined) 29 | pred_token = output.argmax(2)[:, -1].item() 30 | trg_indexes.append(pred_token) 31 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]: 32 | break 33 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes] 34 | return trg_tokens[1:], attention 35 | 36 | 37 | def save_model(model, train_loss, epoch, PATH): 38 | torch.save({ 39 | 'epoch': epoch, 40 | 'model_state_dict': model.state_dict(), 41 | # 'optimizer_state_dict': optimizer.state_dict(), 42 | 'loss': train_loss 43 | }, PATH) 44 | print(f"---------\nModel Saved at {PATH}\n---------\n") 45 | 46 | 47 | def load_model(model, PATH): 48 | checkpoint = torch.load(PATH) 49 | model.load_state_dict(checkpoint['model_state_dict']) 50 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 51 | epoch = checkpoint['epoch'] 52 | train_loss = checkpoint['loss'] 53 | return checkpoint, epoch, train_loss 54 | 55 | 56 | if __name__ == '__main__': 57 | pass 
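word2char above turns a word into a space-separated character string, and basic_tokenizer splits such a string back into character tokens; a two-line round trip makes the data representation concrete:

```
# Character-level round trip, mirroring word2char and basic_tokenizer above.
word = 'বাংলা'
as_chars = ' '.join(list(word))   # word2char:       'ব া ং ল া'
tokens = as_chars.split()         # basic_tokenizer: ['ব', 'া', 'ং', 'ল', 'া']
print(as_chars)
print(tokens)
```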
-------------------------------------------------------------------------------- /process.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from utils import word2char 3 | from tqdm import tqdm 4 | 5 | 6 | def check_from_left(word, error): 7 | left = [] 8 | for i in range(len(error)): 9 | if error[i] == word[i]: 10 | left.append(0) 11 | else: 12 | left.append(1) 13 | return left 14 | 15 | 16 | def check_from_right(word, error): 17 | word.reverse() 18 | error.reverse() 19 | right = [] 20 | for i in range(len(error)): 21 | if error[i] == word[i]: 22 | right.append(0) 23 | else: 24 | right.append(1) 25 | right.reverse() 26 | return right 27 | 28 | 29 | def check_from_both(word, error): 30 | length = len(error) 31 | if length % 2 == 0: 32 | iterator = length // 2 33 | else: 34 | iterator = (length // 2) + 1 35 | 36 | x = -1 37 | 38 | left = [] 39 | right = [] 40 | 41 | for i in range(iterator): 42 | if error[i] == word[i]: 43 | left.append(0) 44 | else: 45 | left.append(1) 46 | 47 | if error[x] == word[x]: 48 | right.append(0) 49 | else: 50 | right.append(1) 51 | x -= 1 52 | 53 | right.reverse() 54 | both = [*left, *right] 55 | return both 56 | 57 | 58 | if __name__ == '__main__': 59 | path = './Dataset/sec_dataset_III_v3.csv' 60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv') 61 | df_copy = df.copy() 62 | df['Word'] = df['Word'].apply(word2char) 63 | df['Error'] = df['Error'].apply(word2char) 64 | 65 | for idx in tqdm(range(len(df))): 66 | word = df.iloc[idx, 0].split() 67 | error = df.iloc[idx, 1].split() 68 | word = ['ব', 'া', 'ং', 'ল', 'া'] 69 | error = ['ব', 'ং', 'ল', 'া'] 70 | print(len(word), len(error)) 71 | print(f'{word}\n{error}') 72 | # checking from left 73 | left = check_from_left(word, error) 74 | print(left) 75 | right = check_from_right(word, error) 76 | print(right) 77 | both = check_from_both(word, error) 78 | print(both) 79 | break 80 | -------------------------------------------------------------------------------- /Baselines/DCSpell/process.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from utils import word2char 3 | from tqdm import tqdm 4 | 5 | 6 | def check_from_left(word, error): 7 | left = [] 8 | for i in range(len(error)): 9 | if error[i] == word[i]: 10 | left.append(0) 11 | else: 12 | left.append(1) 13 | return left 14 | 15 | 16 | def check_from_right(word, error): 17 | word.reverse() 18 | error.reverse() 19 | right = [] 20 | for i in range(len(error)): 21 | if error[i] == word[i]: 22 | right.append(0) 23 | else: 24 | right.append(1) 25 | right.reverse() 26 | return right 27 | 28 | 29 | def check_from_both(word, error): 30 | length = len(error) 31 | if length % 2 == 0: 32 | iterator = length // 2 33 | else: 34 | iterator = (length // 2) + 1 35 | 36 | x = -1 37 | 38 | left = [] 39 | right = [] 40 | 41 | for i in range(iterator): 42 | if error[i] == word[i]: 43 | left.append(0) 44 | else: 45 | left.append(1) 46 | 47 | if error[x] == word[x]: 48 | right.append(0) 49 | else: 50 | right.append(1) 51 | x -= 1 52 | 53 | right.reverse() 54 | both = [*left, *right] 55 | return both 56 | 57 | 58 | if __name__ == '__main__': 59 | path = './Dataset/sec_dataset_III_v3.csv' 60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv') 61 | df_copy = df.copy() 62 | df['Word'] = df['Word'].apply(word2char) 63 | df['Error'] = df['Error'].apply(word2char) 64 | 65 | for idx in tqdm(range(len(df))): 66 | word = df.iloc[idx, 0].split() 67 | error = 
df.iloc[idx, 1].split() 68 | word = ['ব', 'া', 'ং', 'ল', 'া'] 69 | error = ['ব', 'ং', 'ল', 'া'] 70 | print(len(word), len(error)) 71 | print(f'{word}\n{error}') 72 | # checking from left 73 | left = check_from_left(word, error) 74 | print(left) 75 | right = check_from_right(word, error) 76 | print(right) 77 | both = check_from_both(word, error) 78 | print(both) 79 | break 80 | -------------------------------------------------------------------------------- /Baselines/DTransformer/process.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from utils import word2char 3 | from tqdm import tqdm 4 | 5 | 6 | def check_from_left(word, error): 7 | left = [] 8 | for i in range(len(error)): 9 | if error[i] == word[i]: 10 | left.append(0) 11 | else: 12 | left.append(1) 13 | return left 14 | 15 | 16 | def check_from_right(word, error): 17 | word.reverse() 18 | error.reverse() 19 | right = [] 20 | for i in range(len(error)): 21 | if error[i] == word[i]: 22 | right.append(0) 23 | else: 24 | right.append(1) 25 | right.reverse() 26 | return right 27 | 28 | 29 | def check_from_both(word, error): 30 | length = len(error) 31 | if length % 2 == 0: 32 | iterator = length // 2 33 | else: 34 | iterator = (length // 2) + 1 35 | 36 | x = -1 37 | 38 | left = [] 39 | right = [] 40 | 41 | for i in range(iterator): 42 | if error[i] == word[i]: 43 | left.append(0) 44 | else: 45 | left.append(1) 46 | 47 | if error[x] == word[x]: 48 | right.append(0) 49 | else: 50 | right.append(1) 51 | x -= 1 52 | 53 | right.reverse() 54 | both = [*left, *right] 55 | return both 56 | 57 | 58 | if __name__ == '__main__': 59 | path = './Dataset/sec_dataset_III_v3.csv' 60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv') 61 | df_copy = df.copy() 62 | df['Word'] = df['Word'].apply(word2char) 63 | df['Error'] = df['Error'].apply(word2char) 64 | 65 | for idx in tqdm(range(len(df))): 66 | word = df.iloc[idx, 0].split() 67 | error = df.iloc[idx, 1].split() 68 | word = ['ব', 'া', 'ং', 'ল', 'া'] 69 | error = ['ব', 'ং', 'ল', 'া'] 70 | print(len(word), len(error)) 71 | print(f'{word}\n{error}') 72 | # checking from left 73 | left = check_from_left(word, error) 74 | print(left) 75 | right = check_from_right(word, error) 76 | print(right) 77 | both = check_from_both(word, error) 78 | print(both) 79 | break 80 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/metrics.py: -------------------------------------------------------------------------------- 1 | import torch, torch.nn as nn, torch.optim as optim 2 | import torch.nn.functional as F 3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset 4 | import random 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | import math 9 | import time 10 | # from torchtext.data.metrics import bleu_score 11 | 12 | import matplotlib.pyplot as plt 13 | import matplotlib.ticker as ticker 14 | import matplotlib.font_manager as fm 15 | 16 | import numpy as np 17 | import math 18 | import time 19 | from sklearn import metrics 20 | 21 | import warnings as wrn 22 | wrn.filterwarnings('ignore') 23 | 24 | 25 | def beam_eval_report(trg_words, topk_prediction_list): 26 | y_true = np.array(trg_words) 27 | y_pred = np.array(topk_prediction_list)[:, 0] 28 | 29 | LABELS = np.array(set(list(set(y_true)) + list(set(y_pred)))) 30 | 31 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 32 | RE = 
metrics.recall_score(y_true, y_pred, average='weighted') 33 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 34 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 35 | ACC = metrics.accuracy_score(y_true, y_pred) 36 | 37 | print("Evaluation report of beam decoding") 38 | print(f''' 39 | Top-1 (Beam Decoding) 40 | Precision: {PR:.4f} 41 | Recall: {RE:.4f} 42 | F1 Score: {F1:.4f} 43 | F0.5 Score: {F05:.4f} 44 | Accuracy: {RE * 100:.2f}% 45 | ''') 46 | 47 | 48 | def greedy_eval_report(correct_words, predicted_words): 49 | y_true = np.array(correct_words) 50 | y_pred = np.array(predicted_words) 51 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 52 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 53 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 54 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 55 | ACC = metrics.accuracy_score(y_true, y_pred) 56 | print("Evaluation report of greedy decoding") 57 | print(f''' 58 | Top-1 (Greedy Decoding) 59 | Precision: {PR:.4f} 60 | Recall: {RE:.4f} 61 | F1 Score: {F1:.4f} 62 | F0.5 Score: {F05:.4f} 63 | Accuracy: {RE * 100:.2f}% 64 | ''') 65 | -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/metrics.py: -------------------------------------------------------------------------------- 1 | from utils import translate_sentence 2 | 3 | from sklearn import metrics 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import numpy as np 7 | 8 | 9 | def evaluation_report(test_data, SRC, TRG, model, DEVICE): 10 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 11 | 12 | modified_flags = [] 13 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 14 | all_words = sorted(all_words.iloc[:, 0].values) 15 | 16 | for data in tqdm(test_data): 17 | src = data.src 18 | trg = data.trg 19 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 20 | 21 | src = ''.join(src) 22 | trg = ''.join(trg) 23 | pred = ''.join(translation[:-1]) 24 | 25 | erroneous_words.append(src) 26 | predicted_words.append(pred) 27 | correct_words.append(trg) 28 | 29 | if trg == pred: 30 | flags.append(1) 31 | else: 32 | flags.append(0) 33 | 34 | if pred in all_words: 35 | modified_flags.append(1) 36 | else: 37 | modified_flags.append(0) 38 | 39 | evaluation_df = pd.DataFrame({ 40 | 'Error': erroneous_words, 41 | 'Predicton': predicted_words, 42 | 'Target': correct_words, 43 | 'Correction': flags 44 | }) 45 | 46 | corrected_instances = evaluation_df['Correction'].values.sum() 47 | total_instances = len(evaluation_df) 48 | accuracy = corrected_instances / total_instances 49 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 50 | # print(f"Accuracy: {accuracy * 100:.2f}%") 51 | 52 | y_true = np.array(correct_words) 53 | y_pred = np.array(predicted_words) 54 | 55 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 56 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 57 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 58 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 59 | ACC = metrics.accuracy_score(y_true, y_pred) 60 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 61 | 62 | print(f''' 63 | Top-1 (Greedy Decoding) 64 | Precision: {PR:.4f} 65 | Recall: {RE:.4f} 66 | F1 Score: {F1:.4f} 67 | F0.5 Score: {F05:.4f} 68 | Accuracy: {RE * 100:.2f}% 69 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 70 | 
''') 71 | 72 | # evaluation_df.to_csv('./Dataset/preds_convs2s.csv', index=False) 73 | return evaluation_df 74 | 75 | 76 | if __name__ == '__main__': 77 | pass -------------------------------------------------------------------------------- /CorpusCreation/scraper.py: -------------------------------------------------------------------------------- 1 | import requests, bs4 2 | import pandas as pd 3 | from tqdm import tqdm 4 | 5 | 6 | def word_accumulation(): 7 | char_pages = { 8 | 'অ': 71, 'আ': 50, 'ই': 10, 'ঈ': 1, 'উ': 25, 'ঊ': 2, 'ঋ': 1, 'এ': 13, 'ঐ': 2, 'ও': 7, 'ঔ': 3, 9 | 'ক': 82, 'খ': 29, 'গ': 35, 'ঘ': 7, 'ঙ': 1, 'চ': 32, 'ছ': 12, 'জ': 28, 'ঝ': 8, 'ঞ': 1, 10 | 'ট': 16, 'ঠ': 4, 'ড': 12, 'ঢ': 6, 'ণ': 1, 'ত': 44, 'থ': 6, 'দ': 44, 'ধ': 13, 'ন': 52, 11 | 'প': 77, 'ফ': 16, 'ব': 90, 'ভ': 24, 'ম': 58, 'য': 11, 'র': 30, 'ল': 18, 'শ': 25, 'ষ': 3, 'স': 86, 'হ': 27 12 | } 13 | 14 | all_urls = {} 15 | 16 | url = 'https://accessibledictionary.gov.bd/bengali-to-bengali/' 17 | 18 | html_codes = requests.get(url).text 19 | document = bs4.BeautifulSoup(html_codes, 'lxml') 20 | alphabet_links = document.find('ul', class_='alphabet') 21 | items = alphabet_links.find_all('li') 22 | 23 | for item in items: 24 | url = str(item).split('"')[1] 25 | all_urls[url[-1:]] = url 26 | 27 | df_dict = {} 28 | 29 | for url in all_urls.values(): 30 | no_of_pages = char_pages[url[-1:]] 31 | for idx in tqdm(range(1, no_of_pages + 1)): 32 | desired_url = url + '&page=' + str(idx) 33 | html_codes = requests.get(desired_url).text 34 | document = bs4.BeautifulSoup(html_codes, 'lxml') 35 | article = document.find('article', class_='dicDisplay') 36 | items = article.find_all('li') 37 | 38 | for item in items: 39 | text = item.get_text() 40 | text = text.split('Bengali Word')[1] 41 | text = text.split('Bengali definition') 42 | ben_word = text[0] 43 | ben_def = text[1] 44 | df_dict[ben_word] = ben_def 45 | # break 46 | 47 | df = pd.DataFrame( 48 | { 49 | 'Word': df_dict.keys(), 50 | 'Defination': df_dict.values() 51 | } 52 | ) 53 | return df 54 | 55 | 56 | def get_len(word): 57 | return len(word) 58 | 59 | 60 | def text_preprocessing(df): 61 | all_chars = ['ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 62 | 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 63 | 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 64 | 'ষ', 'স', 'হ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ', 65 | 'ৗ', 'ড়', 'ঢ়', 'য়', ' '] 66 | 67 | words = '' 68 | 69 | df_words = ' '.join(df['Word'].values) 70 | for char in df_words: 71 | if char in all_chars: 72 | words += char 73 | 74 | words += ' ' 75 | 76 | df_definations = ' '.join(df['Defination'].values) 77 | for char in df_definations: 78 | if char in all_chars: 79 | words += char 80 | 81 | words = sorted(list(set(words.split(' ')))) 82 | df_all_words = pd.DataFrame({'word': words}) 83 | df_all_words['len'] = df_all_words['word'].apply(get_len) 84 | df_all_words = df_all_words.loc[df_all_words['len'] > 2] 85 | return df_all_words 86 | 87 | 88 | if __name__ == '__main__': 89 | df = word_accumulation() 90 | df_all_words = text_preprocessing(df) 91 | df_all_words.to_csv('./dfs/df_all_words.csv', index=False) 92 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/check.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from tqdm import tqdm 4 | 5 | 6 | def within_topk(df, k): 7 | correct = 
df['Correct'] 8 | topk = df.iloc[:, 1:k+1].values 9 | preds = 0 10 | # for idx in tqdm(range(len(df))): 11 | for idx in range(len(df)): 12 | if correct[idx] in topk[idx]: 13 | preds += 1 14 | acc_within_topk = preds / len(df) 15 | print(f"Within Top-{k} Acc: {acc_within_topk}") 16 | 17 | 18 | def modified_acc(df_allWords, df, k): 19 | df_allWords = sorted(df_allWords.iloc[:, 0].values) 20 | correct = df['Correct'] 21 | topk = df.iloc[:, 1:k + 1].values 22 | preds = 0 23 | for words in tqdm(topk): 24 | for word in words: 25 | if word in df_allWords: 26 | preds += 1 27 | break; 28 | modified_acc_within_topk = preds / len(df) 29 | print(f"Within Top-{k} Modified Acc: {modified_acc_within_topk}") 30 | 31 | 32 | def beam_report(): 33 | print(""" 34 | -------------------------------- 35 | Beam Decoding Evaluation Report 36 | -------------------------------- 37 | """) 38 | df_allWords = pd.read_csv('./Dataset/allDictWords_df.csv') 39 | df_beam = pd.read_csv('./Corrections/preds_beam_colab.csv') 40 | top1_acc = np.sum(df_beam['Pred-1'] == df_beam['Correct']) / len(df_beam) 41 | top2_acc = np.sum(df_beam['Pred-2'] == df_beam['Correct']) / len(df_beam) 42 | top3_acc = np.sum(df_beam['Pred-3'] == df_beam['Correct']) / len(df_beam) 43 | print(f"Top1 Acc: {top1_acc}") 44 | print(f"Top2 Acc: {top2_acc}") 45 | print(f"Top3 Acc: {top3_acc}\n") 46 | within_topk(df_beam, 1) 47 | within_topk(df_beam, 2) 48 | within_topk(df_beam, 3) 49 | modified_acc(df_allWords, df_beam, 1) 50 | modified_acc(df_allWords, df_beam, 2) 51 | modified_acc(df_allWords, df_beam, 3) 52 | 53 | def test(): 54 | df = pd.read_csv('./Dataset/allDictWords_df.csv') 55 | words = sorted(df.iloc[:, 0].values) 56 | print(words) 57 | # 58 | # acc = (df_beam['Pred-1'] == df_beam['Correct'])*1 + \ 59 | # (df_beam['Pred-2'] == df_beam['Correct'])*1 + \ 60 | # (df_beam['Pred-3'] == df_beam['Correct'])*1 61 | # acc = acc.values 62 | # acc = [1 if x>0 else 0 for x in acc] 63 | # print(f"Accuracy: {np.sum(acc) / len(df_beam)}") 64 | # 65 | # df_dict = pd.read_csv('./Dataset/allDictWords_df.csv') 66 | # df_allWords = pd.read_csv('./Dataset/df_all_words.csv') 67 | # # 68 | # preds1 = [] 69 | # for word in tqdm(df_beam['Pred-1'].values): 70 | # # similar_words = df_dict.loc[df_dict['word'].str.startswith(word)].iloc[:, 0].values 71 | # if word in df_allWords.iloc[:, 0].values: 72 | # preds1.append(1) 73 | # else: 74 | # preds1.append(0) 75 | # print(f"Modified Top1 Acc: {np.sum(preds1) / len(preds1)}") 76 | # 77 | # df_greedy = pd.read_csv('./Corrections/preds_greedy_colab.csv') 78 | # # print(df_greedy) 79 | # greedy_acc = np.sum(df_greedy['Predicton'] == df_greedy['Target'])/len(df_greedy) 80 | # print(f'Greedy Accuracy: {greedy_acc}') 81 | # preds = [] 82 | # for word in tqdm(df_greedy['Predicton'].values): 83 | # if word in df_allWords.iloc[:, 0].values: 84 | # preds.append(1) 85 | # else: 86 | # preds.append(0) 87 | # print(f"Modified Greedy Accuracy: {np.sum(preds) / len(preds)}") 88 | 89 | if __name__ == '__main__': 90 | beam_report() -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/decoding.py: -------------------------------------------------------------------------------- 1 | import torch, torch.nn as nn, torch.optim as optim 2 | import torch.nn.functional as F 3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset 4 | import random 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | import math 9 | import time 
10 | # from torchtext.data.metrics import bleu_score 11 | 12 | import matplotlib.pyplot as plt 13 | import matplotlib.ticker as ticker 14 | import matplotlib.font_manager as fm 15 | 16 | import numpy as np 17 | import math 18 | import time 19 | 20 | import copy 21 | from heapq import heappush, heappop 22 | 23 | import warnings as wrn 24 | wrn.filterwarnings('ignore') 25 | 26 | 27 | class BeamSearchNode(object): 28 | def __init__(self, h, prev_node, wid, logp, length): 29 | self.h = h 30 | self.prev_node = prev_node 31 | self.wid = wid 32 | self.logp = logp 33 | self.length = length 34 | 35 | def eval(self): 36 | return self.logp / float(self.length - 1 + 1e-6) 37 | 38 | 39 | def beam_search_decoding(model, src, decoder, enc_outs, enc_last_h, beam_width, n_best, \ 40 | sos_token, eos_token, max_dec_steps, device): 41 | assert beam_width >= n_best 42 | n_best_list = [] 43 | bs = enc_outs.shape[1] 44 | 45 | for batch_id in range(bs): 46 | decoder_hidden = enc_last_h[batch_id] 47 | enc_out = enc_outs[:, batch_id].unsqueeze(1) 48 | 49 | # decoder_input = torch.tensor([sos_token].long().to(DEVICE)) 50 | decoder_input = torch.tensor([sos_token]).to(device) 51 | end_nodes = [] 52 | 53 | node = BeamSearchNode(h=decoder_hidden, prev_node=None, wid=decoder_input, logp=0, length=1) 54 | nodes = [] 55 | 56 | heappush(nodes, (-node.eval(), id(node), node)) 57 | n_dec_steps = 0 58 | 59 | while True: 60 | if n_dec_steps > max_dec_steps: 61 | break 62 | 63 | score, _, n = heappop(nodes) 64 | decoder_input = n.wid 65 | decoder_hidden = n.h 66 | 67 | if n.wid.item() == eos_token and n.prev_node is not None: 68 | end_nodes.append((score, id(n), n)) 69 | if len(end_nodes) >= n_best: 70 | break 71 | else: 72 | continue 73 | 74 | mask = model.create_mask(src) 75 | decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden.unsqueeze(0), enc_out, mask) 76 | 77 | # restricting length 78 | topk_log_prob, topk_indexes = torch.topk(decoder_output, beam_width) 79 | 80 | for new_k in range(beam_width): 81 | decoded_t = topk_indexes[0][new_k].view(1) 82 | logp = topk_log_prob[0][new_k].item() 83 | 84 | node = BeamSearchNode( 85 | h=decoder_hidden.squeeze(0), prev_node=n, wid=decoded_t, logp=n.logp + logp, length=n.length + 1 86 | ) 87 | 88 | heappush(nodes, (-node.eval(), id(node), node)) 89 | 90 | n_dec_steps += beam_width 91 | 92 | if len(end_nodes) == 0: 93 | end_nodes = [heappop(nodes) for _ in range(beam_width)] 94 | 95 | n_best_seq_list = [] 96 | for score, _id, n in sorted(end_nodes, key=lambda x: x[0]): 97 | sequence = [n.wid.item()] 98 | while n.prev_node is not None: 99 | n = n.prev_node 100 | sequence.append(n.wid.item()) 101 | sequence = sequence[::-1] 102 | n_best_seq_list.append(sequence) 103 | 104 | n_best_list.append(n_best_seq_list) 105 | 106 | return n_best_list 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /focalLoss.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Optional 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | # following: 9 | # https://github.com/kornia/kornia/ 10 | # which is based on: 11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py 12 | 13 | 14 | def one_hot( 15 | labels: torch.Tensor, 16 | num_classes: int, 17 | device: Optional[torch.device] = None, 18 | dtype: Optional[torch.dtype] = None, 19 | eps: float = 1e-6, 20 | ) -> torch.Tensor: 21 | 22 | if not 
isinstance(labels, torch.Tensor): 23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}") 24 | 25 | if not labels.dtype == torch.int64: 26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}") 27 | 28 | if num_classes < 1: 29 | raise ValueError("The number of classes must be bigger than one." " Got: {}".format(num_classes)) 30 | 31 | shape = labels.shape 32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype) 33 | 34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps 35 | 36 | 37 | 38 | def focal_loss( 39 | input: torch.Tensor, 40 | target: torch.Tensor, 41 | alpha: float, 42 | gamma: float = 2.0, 43 | reduction: str = 'none', 44 | eps: Optional[float] = None, 45 | ) -> torch.Tensor: 46 | 47 | if eps is not None and not torch.jit.is_scripting(): 48 | warnings.warn( 49 | "`focal_loss` has been reworked for improved numerical stability " 50 | "and the `eps` argument is no longer necessary", 51 | DeprecationWarning, 52 | stacklevel=2, 53 | ) 54 | 55 | if not isinstance(input, torch.Tensor): 56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}") 57 | 58 | if not len(input.shape) >= 2: 59 | raise ValueError(f"Invalid input shape, we expect BxCx*. Got: {input.shape}") 60 | 61 | if input.size(0) != target.size(0): 62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).') 63 | 64 | n = input.size(0) 65 | out_size = (n,) + input.size()[2:] 66 | if target.size()[1:] != input.size()[2:]: 67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}') 68 | 69 | if not input.device == target.device: 70 | raise ValueError(f"input and target must be in the same device. 
Got: {input.device} and {target.device}") 71 | 72 | # compute softmax over the classes axis 73 | input_soft: torch.Tensor = F.softmax(input, dim=1) 74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1) 75 | 76 | # create the labels one hot tensor 77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype) 78 | 79 | # compute the actual focal loss 80 | weight = torch.pow(-input_soft + 1.0, gamma) 81 | 82 | focal = -alpha * weight * log_input_soft 83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal)) 84 | 85 | if reduction == 'none': 86 | loss = loss_tmp 87 | elif reduction == 'mean': 88 | loss = torch.mean(loss_tmp) 89 | elif reduction == 'sum': 90 | loss = torch.sum(loss_tmp) 91 | else: 92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}") 93 | return loss 94 | 95 | 96 | class FocalLoss(nn.Module): 97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None: 98 | super().__init__() 99 | self.alpha: float = alpha 100 | self.gamma: float = gamma 101 | self.reduction: str = reduction 102 | self.eps: Optional[float] = eps 103 | 104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps) 106 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/focalLoss.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Optional 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | # following: 9 | # https://github.com/kornia/kornia/ 10 | # which is based on: 11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py 12 | 13 | 14 | def one_hot( 15 | labels: torch.Tensor, 16 | num_classes: int, 17 | device: Optional[torch.device] = None, 18 | dtype: Optional[torch.dtype] = None, 19 | eps: float = 1e-6, 20 | ) -> torch.Tensor: 21 | 22 | if not isinstance(labels, torch.Tensor): 23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}") 24 | 25 | if not labels.dtype == torch.int64: 26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}") 27 | 28 | if num_classes < 1: 29 | raise ValueError("The number of classes must be bigger than one." " Got: {}".format(num_classes)) 30 | 31 | shape = labels.shape 32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype) 33 | 34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps 35 | 36 | 37 | 38 | def focal_loss( 39 | input: torch.Tensor, 40 | target: torch.Tensor, 41 | alpha: float, 42 | gamma: float = 2.0, 43 | reduction: str = 'none', 44 | eps: Optional[float] = None, 45 | ) -> torch.Tensor: 46 | 47 | if eps is not None and not torch.jit.is_scripting(): 48 | warnings.warn( 49 | "`focal_loss` has been reworked for improved numerical stability " 50 | "and the `eps` argument is no longer necessary", 51 | DeprecationWarning, 52 | stacklevel=2, 53 | ) 54 | 55 | if not isinstance(input, torch.Tensor): 56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}") 57 | 58 | if not len(input.shape) >= 2: 59 | raise ValueError(f"Invalid input shape, we expect BxCx*. 
Got: {input.shape}") 60 | 61 | if input.size(0) != target.size(0): 62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).') 63 | 64 | n = input.size(0) 65 | out_size = (n,) + input.size()[2:] 66 | if target.size()[1:] != input.size()[2:]: 67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}') 68 | 69 | if not input.device == target.device: 70 | raise ValueError(f"input and target must be in the same device. Got: {input.device} and {target.device}") 71 | 72 | # compute softmax over the classes axis 73 | input_soft: torch.Tensor = F.softmax(input, dim=1) 74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1) 75 | 76 | # create the labels one hot tensor 77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype) 78 | 79 | # compute the actual focal loss 80 | weight = torch.pow(-input_soft + 1.0, gamma) 81 | 82 | focal = -alpha * weight * log_input_soft 83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal)) 84 | 85 | if reduction == 'none': 86 | loss = loss_tmp 87 | elif reduction == 'mean': 88 | loss = torch.mean(loss_tmp) 89 | elif reduction == 'sum': 90 | loss = torch.sum(loss_tmp) 91 | else: 92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}") 93 | return loss 94 | 95 | 96 | class FocalLoss(nn.Module): 97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None: 98 | super().__init__() 99 | self.alpha: float = alpha 100 | self.gamma: float = gamma 101 | self.reduction: str = reduction 102 | self.eps: Optional[float] = eps 103 | 104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps) 106 | -------------------------------------------------------------------------------- /Baselines/DTransformer/focalLoss.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from typing import Optional 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | # following: 9 | # https://github.com/kornia/kornia/ 10 | # which is based on: 11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py 12 | 13 | 14 | def one_hot( 15 | labels: torch.Tensor, 16 | num_classes: int, 17 | device: Optional[torch.device] = None, 18 | dtype: Optional[torch.dtype] = None, 19 | eps: float = 1e-6, 20 | ) -> torch.Tensor: 21 | 22 | if not isinstance(labels, torch.Tensor): 23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}") 24 | 25 | if not labels.dtype == torch.int64: 26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}") 27 | 28 | if num_classes < 1: 29 | raise ValueError("The number of classes must be bigger than one." 
" Got: {}".format(num_classes)) 30 | 31 | shape = labels.shape 32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype) 33 | 34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps 35 | 36 | 37 | 38 | def focal_loss( 39 | input: torch.Tensor, 40 | target: torch.Tensor, 41 | alpha: float, 42 | gamma: float = 2.0, 43 | reduction: str = 'none', 44 | eps: Optional[float] = None, 45 | ) -> torch.Tensor: 46 | 47 | if eps is not None and not torch.jit.is_scripting(): 48 | warnings.warn( 49 | "`focal_loss` has been reworked for improved numerical stability " 50 | "and the `eps` argument is no longer necessary", 51 | DeprecationWarning, 52 | stacklevel=2, 53 | ) 54 | 55 | if not isinstance(input, torch.Tensor): 56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}") 57 | 58 | if not len(input.shape) >= 2: 59 | raise ValueError(f"Invalid input shape, we expect BxCx*. Got: {input.shape}") 60 | 61 | if input.size(0) != target.size(0): 62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).') 63 | 64 | n = input.size(0) 65 | out_size = (n,) + input.size()[2:] 66 | if target.size()[1:] != input.size()[2:]: 67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}') 68 | 69 | if not input.device == target.device: 70 | raise ValueError(f"input and target must be in the same device. Got: {input.device} and {target.device}") 71 | 72 | # compute softmax over the classes axis 73 | input_soft: torch.Tensor = F.softmax(input, dim=1) 74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1) 75 | 76 | # create the labels one hot tensor 77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype) 78 | 79 | # compute the actual focal loss 80 | weight = torch.pow(-input_soft + 1.0, gamma) 81 | 82 | focal = -alpha * weight * log_input_soft 83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal)) 84 | 85 | if reduction == 'none': 86 | loss = loss_tmp 87 | elif reduction == 'mean': 88 | loss = torch.mean(loss_tmp) 89 | elif reduction == 'sum': 90 | loss = torch.sum(loss_tmp) 91 | else: 92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}") 93 | return loss 94 | 95 | 96 | class FocalLoss(nn.Module): 97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None: 98 | super().__init__() 99 | self.alpha: float = alpha 100 | self.gamma: float = gamma 101 | self.reduction: str = reduction 102 | self.eps: Optional[float] = eps 103 | 104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor: 105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps) 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

DPCSpell

2 |

3 | A transformer-based spelling error correction framework for Bangla and resource scarce Indic languages
Link — Computer Speech & Language 4 |

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | ## 13 | 14 | ## How does DPCSpell work? 15 | 16 | ![dpcspell](https://user-images.githubusercontent.com/58245357/202089360-6fb3a70d-09cc-47ba-b5f5-3b100001c124.gif) 17 | 18 | ## Running Test 19 | | Operating System | Requirement | Remark | 20 | | ------------- | ------------- | ------------- | 21 | | Ubuntu 16.04.7 LTS | requirements_u.yml | :heavy_check_mark: Successful | 22 | | Ubuntu 18.04.6 LTS (Google Colab) | requirements_c.txt | :heavy_check_mark: Successful* | 23 | | Windows 10 | requirements_w.yml | :heavy_check_mark: Successful | 24 | 25 |
26 | 27 | ## Get Started 28 | 29 | ``` 30 | git clone https://github.com/mehedihasanbijoy/DPCSpell.git 31 | ``` 32 | or manually **download** and **extract** the GitHub repository of DPCSpell. 33 | 34 |
35 | 36 | ## Environment Setup 37 | ### Create A Virtual Environment 38 | ``` 39 | conda env create -f requirements_u.yml (for Ubuntu 16.04.7 LTS) 40 | or 41 | conda env create -f requirements_w.yml (for Windows 10) 42 | ``` 43 | 44 | 45 | ### Activate the Environment 46 | ``` 47 | conda activate DPCSpell 48 | ``` 49 | 50 |
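The training scripts in this repository import torchtext.legacy (see the model code later in this dump), an API that only exists in older torchtext releases, so the environment created above has to pin a matching torch/torchtext pair. A quick check after activation, offered as an illustrative snippet rather than part of the repository:

```
# Environment sanity check: the code relies on the torchtext.legacy API.
import torch
import torchtext

print(torch.__version__, torchtext.__version__)
import torchtext.legacy  # fails with ImportError on torchtext releases that dropped the legacy API
print('torchtext.legacy is available')
```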
51 | 52 | ## Prepare SEC Corpora 53 | ``` 54 | gdown https://drive.google.com/drive/folders/1_sWSi-LFsvuYh9c5GBMDd4V6_uM8yYjH?usp=share_link -O ./Dataset --folder 55 | ``` 56 |

57 | or manually download the folder from here and place the extracted files in ./Dataset/ 58 |

59 | 60 |
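All training commands below point at ./Dataset/corpus.csv. Judging from the processing code later in this dump (errors.py and process.py index Word, Error, and ErrorType columns), the corpus is a word-level CSV; a short, illustrative peek before training helps confirm the download landed in the right place:

```
# Illustrative check of the downloaded corpus; the column names mentioned above
# are inferred from errors.py / process.py, not guaranteed here.
import pandas as pd

df = pd.read_csv('./Dataset/corpus.csv')
print(df.shape)
print(df.columns.tolist())
print(df.head())
```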
61 | 62 | ## Training and Evaluation of DPCSpell 63 | 64 | ### Detector Network 65 | 66 | ``` 67 | python detector.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100 68 | ``` 69 | 70 | ### Purificator Network 71 | 72 | ``` 73 | python purificator.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100 74 | ``` 75 | 76 | ### Corrector Network 77 | 78 | ``` 79 | python corrector.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100 80 | ``` 81 | 82 |
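The three networks are trained one after another with the commands above: broadly, the detector flags erroneous character positions, the purificator refines that evidence, and the corrector rewrites the word. A small driver that simply replays the three commands with the shared hyperparameters might look like the following hypothetical helper (it is not part of the repository):

```
# Hypothetical convenience driver that replays the three training commands above.
import subprocess

COMMON = [
    '--HID_DIM', '128', '--ENC_LAYERS', '5', '--DEC_LAYERS', '5',
    '--ENC_HEADS', '8', '--DEC_HEADS', '8', '--ENC_PF_DIM', '256', '--DEC_PF_DIM', '256',
    '--ENC_DROPOUT', '0.1', '--DEC_DROPOUT', '0.1', '--CLIP', '1',
    '--LEARNING_RATE', '0.0005', '--N_EPOCHS', '100',
]

subprocess.run(['python', 'detector.py', '--CORPUS', './Dataset/corpus.csv', *COMMON], check=True)
subprocess.run(['python', 'purificator.py', *COMMON], check=True)
subprocess.run(['python', 'corrector.py', *COMMON], check=True)
```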
83 | 84 | ## Benchmarking Bangla SEC Task 85 | 86 | ![benchmark](https://user-images.githubusercontent.com/58245357/195144459-0150f456-f06b-4aff-93f5-36b1fb76ea42.png) 87 | 88 | 89 | ## BibTeX Entry and Citation Info 90 | 91 | ``` 92 | @article{bijoy2024transformer, 93 | title={A transformer based spelling error correction framework for Bangla and resource scarce Indic languages}, 94 | author={Bijoy, Mehedi Hasan and Hossain, Nahid and Islam, Salekul and Shatabda, Swakkhar}, 95 | journal={Computer Speech \& Language}, 96 | volume = {89}, 97 | pages = {101703}, 98 | year = {2025}, 99 | issn = {0885-2308}, 100 | doi = {https://doi.org/10.1016/j.csl.2024.101703}, 101 | url = {https://www.sciencedirect.com/science/article/pii/S088523082400086X}, 102 | publisher={Elsevier} 103 | } 104 | ``` 105 | -------------------------------------------------------------------------------- /pipeline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from tqdm import tqdm 4 | from utils import basic_tokenizer 5 | import matplotlib.pyplot as plt 6 | import matplotlib.ticker as ticker 7 | import matplotlib.font_manager as fm 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | 12 | def train(model, iterator, optimizer, criterion, clip): 13 | model.train() 14 | epoch_loss = 0 15 | for idx, batch in enumerate(tqdm(iterator)): 16 | src = batch.src 17 | trg = batch.trg 18 | 19 | optimizer.zero_grad() 20 | output, _ = model(src, trg[:, :-1]) 21 | # output = [batch size, trg len - 1, output dim] 22 | # trg = [batch size, trg len] 23 | 24 | output_dim = output.shape[-1] 25 | output = output.contiguous().view(-1, output_dim) 26 | trg = trg[:, 1:].contiguous().view(-1) 27 | # output = [batch size * trg len - 1, output dim] 28 | # trg = [batch size * trg len - 1] 29 | 30 | # trg one hot for BCEwLogits 31 | # trg = F.one_hot(trg, num_classes=66) 32 | 33 | loss = criterion(output, trg) 34 | loss.backward() 35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip) 36 | optimizer.step() 37 | epoch_loss += loss.item() 38 | 39 | return epoch_loss / len(iterator) 40 | 41 | 42 | def evaluate(model, iterator, criterion): 43 | model.eval() 44 | epoch_loss = 0 45 | with torch.no_grad(): 46 | for idx, batch in enumerate(tqdm(iterator)): 47 | src = batch.src 48 | trg = batch.trg 49 | 50 | output, _ = model(src, trg[:, :-1]) 51 | # output = [batch size, trg len - 1, output dim] 52 | # trg = [batch size, trg len] 53 | 54 | output_dim = output.shape[-1] 55 | output = output.contiguous().view(-1, output_dim) 56 | trg = trg[:, 1:].contiguous().view(-1) 57 | # output = [batch size * trg len - 1, output dim] 58 | # trg = [batch size * trg len - 1] 59 | 60 | loss = criterion(output, trg) 61 | epoch_loss += loss.item() 62 | return epoch_loss / len(iterator) 63 | 64 | 65 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50): 66 | model.eval() 67 | 68 | if isinstance(sentence, str): 69 | tokens = basic_tokenizer(sentence) 70 | else: 71 | tokens = sentence 72 | 73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token] 74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens] 75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device) 76 | src_mask = model.make_src_mask(src_tensor) 77 | 78 | with torch.no_grad(): 79 | enc_src = model.encoder(src_tensor, src_mask) 80 | 81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]] 82 | 83 | for i in range(max_len): 84 | 
trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device) 85 | trg_mask = model.make_trg_mask(trg_tensor) 86 | 87 | with torch.no_grad(): 88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask) 89 | 90 | pred_token = output.argmax(2)[:, -1].item() 91 | trg_indexes.append(pred_token) 92 | 93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]: 94 | break 95 | 96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes] 97 | return trg_tokens[1:-1], attention 98 | 99 | 100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2): 101 | assert n_rows * n_cols == n_heads 102 | 103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf') 104 | 105 | fig = plt.figure(figsize=(15, 25)) 106 | for i in range(n_heads): 107 | ax = fig.add_subplot(n_rows, n_cols, i + 1) 108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy() 109 | cax = ax.matshow(_attention, cmap='bone') 110 | 111 | ax.tick_params(labelsize=12) 112 | ax.set_xticklabels( 113 | [''] + [''] + [t for t in sentence] + [''], 114 | rotation=45, fontproperties=prop, fontsize=20 115 | ) 116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20) 117 | 118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 120 | 121 | plt.show() 122 | plt.close() 123 | 124 | 125 | if __name__ == '__main__': 126 | pass 127 | -------------------------------------------------------------------------------- /Baselines/DCSpell/pipeline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from tqdm import tqdm 4 | from utils import basic_tokenizer 5 | import matplotlib.pyplot as plt 6 | import matplotlib.ticker as ticker 7 | import matplotlib.font_manager as fm 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | 12 | def train(model, iterator, optimizer, criterion, clip): 13 | model.train() 14 | epoch_loss = 0 15 | for idx, batch in enumerate(tqdm(iterator)): 16 | src = batch.src 17 | trg = batch.trg 18 | 19 | optimizer.zero_grad() 20 | output, _ = model(src, trg[:, :-1]) 21 | # output = [batch size, trg len - 1, output dim] 22 | # trg = [batch size, trg len] 23 | 24 | output_dim = output.shape[-1] 25 | output = output.contiguous().view(-1, output_dim) 26 | trg = trg[:, 1:].contiguous().view(-1) 27 | # output = [batch size * trg len - 1, output dim] 28 | # trg = [batch size * trg len - 1] 29 | 30 | # trg one hot for BCEwLogits 31 | # trg = F.one_hot(trg, num_classes=66) 32 | 33 | loss = criterion(output, trg) 34 | loss.backward() 35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip) 36 | optimizer.step() 37 | epoch_loss += loss.item() 38 | 39 | return epoch_loss / len(iterator) 40 | 41 | 42 | def evaluate(model, iterator, criterion): 43 | model.eval() 44 | epoch_loss = 0 45 | with torch.no_grad(): 46 | for idx, batch in enumerate(tqdm(iterator)): 47 | src = batch.src 48 | trg = batch.trg 49 | 50 | output, _ = model(src, trg[:, :-1]) 51 | # output = [batch size, trg len - 1, output dim] 52 | # trg = [batch size, trg len] 53 | 54 | output_dim = output.shape[-1] 55 | output = output.contiguous().view(-1, output_dim) 56 | trg = trg[:, 1:].contiguous().view(-1) 57 | # output = [batch size * trg len - 1, output dim] 58 | # trg = [batch size * trg len - 1] 59 | 60 | loss = criterion(output, trg) 61 | epoch_loss += loss.item() 62 | return epoch_loss / len(iterator) 63 | 64 | 65 | def 
translate_sentence(sentence, src_field, trg_field, model, device, max_len=50): 66 | model.eval() 67 | 68 | if isinstance(sentence, str): 69 | tokens = basic_tokenizer(sentence) 70 | else: 71 | tokens = sentence 72 | 73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token] 74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens] 75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device) 76 | src_mask = model.make_src_mask(src_tensor) 77 | 78 | with torch.no_grad(): 79 | enc_src = model.encoder(src_tensor, src_mask) 80 | 81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]] 82 | 83 | for i in range(max_len): 84 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device) 85 | trg_mask = model.make_trg_mask(trg_tensor) 86 | 87 | with torch.no_grad(): 88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask) 89 | 90 | pred_token = output.argmax(2)[:, -1].item() 91 | trg_indexes.append(pred_token) 92 | 93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]: 94 | break 95 | 96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes] 97 | return trg_tokens[1:-1], attention 98 | 99 | 100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2): 101 | assert n_rows * n_cols == n_heads 102 | 103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf') 104 | 105 | fig = plt.figure(figsize=(15, 25)) 106 | for i in range(n_heads): 107 | ax = fig.add_subplot(n_rows, n_cols, i + 1) 108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy() 109 | cax = ax.matshow(_attention, cmap='bone') 110 | 111 | ax.tick_params(labelsize=12) 112 | ax.set_xticklabels( 113 | [''] + [''] + [t for t in sentence] + [''], 114 | rotation=45, fontproperties=prop, fontsize=20 115 | ) 116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20) 117 | 118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 120 | 121 | plt.show() 122 | plt.close() 123 | 124 | 125 | if __name__ == '__main__': 126 | pass 127 | -------------------------------------------------------------------------------- /Baselines/DTransformer/pipeline.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from tqdm import tqdm 4 | from utils import basic_tokenizer 5 | import matplotlib.pyplot as plt 6 | import matplotlib.ticker as ticker 7 | import matplotlib.font_manager as fm 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | 12 | def train(model, iterator, optimizer, criterion, clip): 13 | model.train() 14 | epoch_loss = 0 15 | for idx, batch in enumerate(tqdm(iterator)): 16 | src = batch.src 17 | trg = batch.trg 18 | 19 | optimizer.zero_grad() 20 | output, _ = model(src, trg[:, :-1]) 21 | # output = [batch size, trg len - 1, output dim] 22 | # trg = [batch size, trg len] 23 | 24 | output_dim = output.shape[-1] 25 | output = output.contiguous().view(-1, output_dim) 26 | trg = trg[:, 1:].contiguous().view(-1) 27 | # output = [batch size * trg len - 1, output dim] 28 | # trg = [batch size * trg len - 1] 29 | 30 | # trg one hot for BCEwLogits 31 | # trg = F.one_hot(trg, num_classes=66) 32 | 33 | loss = criterion(output, trg) 34 | loss.backward() 35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip) 36 | optimizer.step() 37 | epoch_loss += loss.item() 38 | 39 | return epoch_loss / len(iterator) 40 | 41 | 42 | def evaluate(model, iterator, 
criterion): 43 | model.eval() 44 | epoch_loss = 0 45 | with torch.no_grad(): 46 | for idx, batch in enumerate(tqdm(iterator)): 47 | src = batch.src 48 | trg = batch.trg 49 | 50 | output, _ = model(src, trg[:, :-1]) 51 | # output = [batch size, trg len - 1, output dim] 52 | # trg = [batch size, trg len] 53 | 54 | output_dim = output.shape[-1] 55 | output = output.contiguous().view(-1, output_dim) 56 | trg = trg[:, 1:].contiguous().view(-1) 57 | # output = [batch size * trg len - 1, output dim] 58 | # trg = [batch size * trg len - 1] 59 | 60 | loss = criterion(output, trg) 61 | epoch_loss += loss.item() 62 | return epoch_loss / len(iterator) 63 | 64 | 65 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50): 66 | model.eval() 67 | 68 | if isinstance(sentence, str): 69 | tokens = basic_tokenizer(sentence) 70 | else: 71 | tokens = sentence 72 | 73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token] 74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens] 75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device) 76 | src_mask = model.make_src_mask(src_tensor) 77 | 78 | with torch.no_grad(): 79 | enc_src = model.encoder(src_tensor, src_mask) 80 | 81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]] 82 | 83 | for i in range(max_len): 84 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device) 85 | trg_mask = model.make_trg_mask(trg_tensor) 86 | 87 | with torch.no_grad(): 88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask) 89 | 90 | pred_token = output.argmax(2)[:, -1].item() 91 | trg_indexes.append(pred_token) 92 | 93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]: 94 | break 95 | 96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes] 97 | return trg_tokens[1:-1], attention 98 | 99 | 100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2): 101 | assert n_rows * n_cols == n_heads 102 | 103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf') 104 | 105 | fig = plt.figure(figsize=(15, 25)) 106 | for i in range(n_heads): 107 | ax = fig.add_subplot(n_rows, n_cols, i + 1) 108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy() 109 | cax = ax.matshow(_attention, cmap='bone') 110 | 111 | ax.tick_params(labelsize=12) 112 | ax.set_xticklabels( 113 | [''] + [''] + [t for t in sentence] + [''], 114 | rotation=45, fontproperties=prop, fontsize=20 115 | ) 116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20) 117 | 118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 120 | 121 | plt.show() 122 | plt.close() 123 | 124 | 125 | if __name__ == '__main__': 126 | pass 127 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/inference.py: -------------------------------------------------------------------------------- 1 | from decoding import beam_search_decoding 2 | from metrics import beam_eval_report, greedy_eval_report 3 | from utils import print_n_best 4 | from utils import translate_sentence 5 | 6 | import torch, torch.nn as nn, torch.optim as optim 7 | import torch.nn.functional as F 8 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset 9 | import random 10 | from tqdm import tqdm 11 | import pandas as pd 12 | from sklearn.model_selection import train_test_split 13 | import math 14 | import time 15 | # from torchtext.data.metrics import bleu_score 16 | 
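# Inference-time evaluation for the GRU-based seq2seq speller: test_beam() decodes each
# test word with beam_search_decoding() and checks whether the target appears among the
# top-k hypotheses, while test_greedy() decodes greedily via translate_sentence(); both
# write their predictions to CSV files under ./Corrections/.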
17 | import matplotlib.pyplot as plt 18 | import matplotlib.ticker as ticker 19 | import matplotlib.font_manager as fm 20 | 21 | import numpy as np 22 | import math 23 | import time 24 | 25 | import warnings as wrn 26 | wrn.filterwarnings('ignore') 27 | 28 | 29 | def test_beam(model, train_data, test_data, SRC, TRG, DEVICE): 30 | _, test_iterator = BucketIterator.splits( 31 | (train_data, test_data), 32 | batch_size=1, 33 | sort_within_batch=True, 34 | sort_key=lambda x: len(x.src), 35 | device=DEVICE 36 | ) 37 | 38 | TRG_SOS_IDX = TRG.vocab.stoi[TRG.init_token] 39 | TRG_EOS_IDX = TRG.vocab.stoi[TRG.eos_token] 40 | 41 | src_words = [] 42 | topk_prediction_list = [] 43 | trg_words = [] 44 | found_within_topk = [] 45 | found_at_top1 = [] 46 | 47 | model.eval() 48 | with torch.no_grad(): 49 | for batch_id, batch in enumerate(tqdm(test_iterator)): 50 | src, src_len = batch.src 51 | trg = batch.trg 52 | 53 | src_word = "".join(SRC.vocab.itos[idx] for idx in src[:, 0][1:-1]) 54 | trg_word = "".join(TRG.vocab.itos[idx] for idx in trg[:, 0][1:-1]) 55 | # print(f'\nSRC: {src_word}') 56 | # print(f'\nTRG: {trg_word}') 57 | 58 | enc_outs, h = model.encoder(src, src_len) 59 | # print(enc_outs.shape, h.shape) 60 | 61 | # decoder, enc_outs, enc_last_h, beam_width, n_best, sos_token, eos_token, max_dec_steps, device 62 | decoded_seqs = beam_search_decoding( 63 | model = model, 64 | src = src, 65 | decoder=model.decoder, 66 | enc_outs=enc_outs, 67 | enc_last_h=h, 68 | beam_width=1, 69 | n_best=1, 70 | sos_token=TRG_SOS_IDX, 71 | eos_token=TRG_EOS_IDX, 72 | max_dec_steps=100, 73 | device=DEVICE 74 | ) 75 | topk_preds = print_n_best(decoded_seqs[0], TRG.vocab.itos) 76 | # print(topk_preds) 77 | 78 | src_words.append(src_word) 79 | trg_words.append(trg_word) 80 | topk_prediction_list.append((topk_preds * 3)[:3]) 81 | found_within_topk.append(1) if trg_word in topk_preds else found_within_topk.append(0) 82 | found_at_top1.append(1) if trg_word == topk_preds[0] else found_at_top1.append(0) 83 | 84 | # if batch_id == 100: 85 | # break 86 | 87 | topk_pred_df = pd.DataFrame({ 88 | 'Error': src_words, 89 | 'Pred-1': np.array(topk_prediction_list)[:, 0], 90 | 'Pred-2': np.array(topk_prediction_list)[:, 1], 91 | 'Pred-3': np.array(topk_prediction_list)[:, 2], 92 | 'Correct': trg_words, 93 | 'Greedy': found_at_top1, 94 | 'Beam': found_within_topk 95 | }) 96 | topk_pred_df.to_csv('./Corrections/preds_beam.csv', index=False) 97 | 98 | beam_eval_report(trg_words, topk_prediction_list) 99 | 100 | 101 | def test_greedy(test_data, SRC, TRG, model, DEVICE): 102 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 103 | for idx, data in enumerate(tqdm(test_data)): 104 | src = data.src 105 | trg = data.trg 106 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 107 | 108 | src = ''.join(src) 109 | trg = ''.join(trg) 110 | pred = ''.join(translation[:-1]) 111 | 112 | erroneous_words.append(src) 113 | predicted_words.append(pred) 114 | correct_words.append(trg) 115 | if trg == pred: 116 | flags.append(1) 117 | else: 118 | flags.append(0) 119 | 120 | evaluation_df = pd.DataFrame({ 121 | 'Error': erroneous_words, 122 | 'Predicton': predicted_words, 123 | 'Target': correct_words, 124 | 'Correction': flags 125 | }) 126 | evaluation_df.to_csv('./Corrections/preds_greedy.csv', index=False) 127 | 128 | greedy_eval_report(correct_words, predicted_words) 129 | 130 | 131 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/pipeline.py: 
-------------------------------------------------------------------------------- 1 | import torch, torch.nn as nn, torch.optim as optim 2 | import torch.nn.functional as F 3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset 4 | import random 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | import math 9 | import time 10 | # from torchtext.data.metrics import bleu_score 11 | from utils import translate_sentence 12 | from sklearn import metrics 13 | 14 | import matplotlib.pyplot as plt 15 | import matplotlib.ticker as ticker 16 | import matplotlib.font_manager as fm 17 | 18 | import numpy as np 19 | import math 20 | import time 21 | 22 | import warnings as wrn 23 | wrn.filterwarnings('ignore') 24 | 25 | 26 | def train(model, iterator, optimizer, criterion, clip=1): 27 | model.train() 28 | epoch_loss = 0 29 | for idx, batch in enumerate(tqdm(iterator)): 30 | src, src_len = batch.src 31 | trg = batch.trg 32 | 33 | optimizer.zero_grad() 34 | output = model(src, src_len, trg) 35 | output_dim = output.shape[-1] 36 | 37 | output = output[1:].view(-1, output_dim) 38 | trg = trg[1:].view(-1) 39 | 40 | # print(f"output: {output.shape}, target: {trg.shape} \n\n{trg}") 41 | 42 | loss = criterion(output, trg) 43 | loss.backward() 44 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip) 45 | optimizer.step() 46 | epoch_loss += loss.item() 47 | 48 | return epoch_loss / len(iterator) 49 | 50 | 51 | def evaluate(model, iterator, criterion): 52 | model.eval() 53 | epoch_loss = 0 54 | with torch.no_grad(): 55 | for idx, batch in enumerate(tqdm(iterator)): 56 | src, src_len = batch.src 57 | trg = batch.trg 58 | 59 | output = model(src, src_len, trg, 0) 60 | 61 | output_dim = output.shape[-1] 62 | output = output[1:].view(-1, output_dim) 63 | trg = trg[1:].view(-1) 64 | 65 | loss = criterion(output, trg) 66 | epoch_loss += loss.item() 67 | 68 | return epoch_loss / len(iterator) 69 | 70 | 71 | def test_accuracy(test_data, SRC, TRG, model, DEVICE): 72 | df = pd.read_csv('./Dataset/allDictWords_df.csv') 73 | # df = pd.read_csv('./Dataset/df_all_words.csv') 74 | all_words = sorted(df.iloc[:, 0].values) 75 | 76 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 77 | modified_flags = [] 78 | for idx, data in enumerate(tqdm(test_data)): 79 | src = data.src 80 | trg = data.trg 81 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 82 | 83 | src = ''.join(src) 84 | trg = ''.join(trg) 85 | pred = ''.join(translation[:-1]) 86 | 87 | erroneous_words.append(src) 88 | predicted_words.append(pred) 89 | correct_words.append(trg) 90 | if trg == pred: 91 | flags.append(1) 92 | else: 93 | flags.append(0) 94 | 95 | if pred in all_words: 96 | modified_flags.append(1) 97 | else: 98 | modified_flags.append(0) 99 | 100 | modified_acc = np.sum(modified_flags) / len(modified_flags) 101 | 102 | evaluation_df = pd.DataFrame({ 103 | 'Error': erroneous_words, 104 | 'Predicton': predicted_words, 105 | 'Target': correct_words, 106 | 'Correction': flags 107 | }) 108 | # evaluation_df.to_csv('/content/drive/MyDrive/Bangla Spell & Grammar Checker/Codes/GEDC/Seq2Seq/preds_greedy.csv', index=False) 109 | 110 | corrected_instances = evaluation_df['Correction'].values.sum() 111 | total_instances = len(evaluation_df) 112 | accuracy = corrected_instances / total_instances 113 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 114 | # print(f"Accuracy: {accuracy*100:.2f}%") 115 | 116 | y_true = 
np.array(correct_words) 117 | y_pred = np.array(predicted_words) 118 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 119 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 120 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 121 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 122 | ACC = metrics.accuracy_score(y_true, y_pred) 123 | print(f'''Top-1 (Greedy Decoding) 124 | Precision: {PR:.4f} 125 | Recall: {RE:.4f} 126 | F1 Score: {F1:.4f} 127 | F0.5 Score: {F05:.4f} 128 | Accuracy: {ACC * 100:.2f}% 129 | Modified Accuracy: {modified_acc * 100:.2f}% 130 | ''') 131 | 132 | return evaluation_df 133 | 134 | # evaluation_df.sample(10) 135 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/models.py: -------------------------------------------------------------------------------- 1 | import torch, torch.nn as nn, torch.optim as optim 2 | import torch.nn.functional as F 3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset 4 | import random 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | import math 9 | import time 10 | # from torchtext.data.metrics import bleu_score 11 | 12 | import matplotlib.pyplot as plt 13 | import matplotlib.ticker as ticker 14 | import matplotlib.font_manager as fm 15 | 16 | import numpy as np 17 | import math 18 | import time 19 | 20 | import warnings as wrn 21 | wrn.filterwarnings('ignore') 22 | 23 | 24 | class Encoder(nn.Module): 25 | def __init__(self, input_dim, embed_dim, enc_hidden_dim, dec_hidden_dim, dropout): 26 | super().__init__() 27 | self.embedding = nn.Embedding(input_dim, embed_dim) 28 | self.rnn = nn.GRU(embed_dim, enc_hidden_dim, bidirectional=True) 29 | self.fc = nn.Linear(enc_hidden_dim * 2, dec_hidden_dim) 30 | self.dropout = nn.Dropout(dropout) 31 | 32 | def forward(self, src, src_len): 33 | embedded = self.dropout(self.embedding(src)) 34 | packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.to('cpu')) 35 | packed_outputs, hidden = self.rnn(packed_embedded) 36 | outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs) 37 | hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))) 38 | return outputs, hidden 39 | 40 | 41 | class Attention(nn.Module): 42 | def __init__(self, enc_hidden_dim, dec_hidden_dim): 43 | super().__init__() 44 | self.attn = nn.Linear((enc_hidden_dim*2) + dec_hidden_dim, dec_hidden_dim) 45 | self.v = nn.Linear(dec_hidden_dim, 1, bias=False) 46 | 47 | def forward(self, hidden, encoder_outputs, mask): 48 | batch_size = encoder_outputs.shape[1] 49 | src_len = encoder_outputs.shape[0] 50 | hidden = hidden.unsqueeze(1).repeat(1, src_len, 1) 51 | encoder_outputs = encoder_outputs.permute(1, 0, 2) 52 | energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2))) 53 | attention = self.v(energy).squeeze(2) 54 | attention = attention.masked_fill(mask==0, -1e10) 55 | return F.softmax(attention, dim=1) 56 | 57 | 58 | class Decoder(nn.Module): 59 | def __init__(self, output_dim, embed_dim, enc_hidden_dim, dec_hidden_dim, dropout, attention): 60 | super().__init__() 61 | self.output_dim = output_dim 62 | self.attention = attention 63 | self.embedding = nn.Embedding(output_dim, embed_dim) 64 | self.rnn = nn.GRU((enc_hidden_dim*2) + embed_dim, dec_hidden_dim) 65 | self.fc_out = nn.Linear((enc_hidden_dim*2) + dec_hidden_dim + embed_dim, output_dim) 66 | self.dropout = 
nn.Dropout(dropout) 67 | 68 | def forward(self, input, hidden, encoder_outputs, mask): 69 | input = input.unsqueeze(0) 70 | embedded = self.dropout(self.embedding(input)) 71 | a = self.attention(hidden, encoder_outputs, mask) 72 | a = a.unsqueeze(1) 73 | encoder_outputs = encoder_outputs.permute(1, 0, 2) 74 | weighted = torch.bmm(a, encoder_outputs) 75 | weighted = weighted.permute(1, 0, 2) 76 | rnn_input = torch.cat((embedded, weighted), dim=2) 77 | 78 | output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0)) 79 | 80 | assert (output == hidden).all() 81 | 82 | embedded = embedded.squeeze(0) 83 | output = output.squeeze(0) 84 | weighted = weighted.squeeze(0) 85 | 86 | prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1)) 87 | 88 | return prediction, hidden.squeeze(0), a.squeeze(1) 89 | 90 | 91 | class Seq2Seq(nn.Module): 92 | def __init__(self, encoder, decoder, src_pad_idx, device): 93 | super().__init__() 94 | self.encoder = encoder 95 | self.decoder = decoder 96 | self.src_pad_idx = src_pad_idx 97 | self.device = device 98 | 99 | def create_mask(self, src): 100 | mask = (src != self.src_pad_idx).permute(1, 0) 101 | return mask 102 | 103 | def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5): 104 | batch_size = src.shape[1] 105 | trg_len = trg.shape[0] 106 | trg_vocab_size = self.decoder.output_dim 107 | 108 | outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device) 109 | 110 | encoder_outputs, hidden = self.encoder(src, src_len) 111 | input = trg[0, :] 112 | mask = self.create_mask(src) 113 | 114 | for t in range(1, trg_len): 115 | output, hidden, _ = self.decoder(input, hidden, encoder_outputs, mask) 116 | outputs[t] = output 117 | 118 | top1 = output.argmax(1) 119 | 120 | input = trg[t] if random.random() < teacher_forcing_ratio else top1 121 | 122 | return outputs 123 | -------------------------------------------------------------------------------- /Baselines/RuleBased/rule_based.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """PreviousEditDistanceBasedSpellChecker.ipynb 3 | 4 | Automatically generated by Colaboratory. 5 | 6 | Original file is located at 7 | https://colab.research.google.com/drive/1Kp3C18yaWmfhKJU_8294UKqfHrmLA1Ow 8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | from sklearn.model_selection import train_test_split 13 | from sklearn import metrics 14 | from tqdm import tqdm 15 | import warnings as wrn 16 | 17 | wrn.filterwarnings('ignore') 18 | 19 | def editDistance(str1, str2, m, n): 20 | if m == 0: 21 | return n 22 | 23 | if n == 0: 24 | return m 25 | 26 | if str1[m-1] == str2[n-1]: 27 | return editDistance(str1, str2, m-1, n-1) 28 | 29 | return 1 + min(editDistance(str1, str2, m, n-1), # Insert 30 | editDistance(str1, str2, m-1, n), # Remove 31 | editDistance(str1, str2, m-1, n-1) # Replace 32 | ) 33 | 34 | # Dynamic Programming based 35 | def editDistDP(str1, str2, m, n): 36 | # Create a table to store results of subproblems 37 | dp = [[0 for x in range(n + 1)] for x in range(m + 1)] 38 | 39 | # Fill d[][] in bottom up manner 40 | for i in range(m + 1): 41 | for j in range(n + 1): 42 | 43 | # If first string is empty, only option is to 44 | # insert all characters of second string 45 | if i == 0: 46 | dp[i][j] = j # Min. operations = j 47 | 48 | # If second string is empty, only option is to 49 | # remove all characters of second string 50 | elif j == 0: 51 | dp[i][j] = i # Min. 
operations = i 52 | 53 | # If last characters are same, ignore last char 54 | # and recur for remaining string 55 | elif str1[i-1] == str2[j-1]: 56 | dp[i][j] = dp[i-1][j-1] 57 | 58 | # If last character are different, consider all 59 | # possibilities and find minimum 60 | else: 61 | dp[i][j] = 1 + min(dp[i][j-1], # Insert 62 | dp[i-1][j], # Remove 63 | dp[i-1][j-1]) # Replace 64 | 65 | return dp[m][n] 66 | 67 | 68 | # Driver code 69 | # str1 = "sunday" 70 | # str2 = "saturday" 71 | 72 | # print(editDistDP(str1, str2, len(str1), len(str2))) 73 | # This code is contributed by Bhavya Jain 74 | 75 | df = pd.read_csv('./Dataset/corpus.csv') 76 | # df 77 | 78 | train_df, test_df = train_test_split(df, test_size=.15) 79 | train_df, valid_df = train_test_split(train_df, test_size=.05) 80 | 81 | # len(train_df), len(valid_df), len(test_df) 82 | 83 | erroneous_words = [] 84 | actual_words = [] 85 | calculated_words = [] 86 | 87 | for i in tqdm(range(10000)): 88 | word = valid_df['Error'].values[i] 89 | # print(word) 90 | 91 | x = len(word) 92 | while True: 93 | temp_df = train_df['Word'].str.startswith(word[:x], na = False) 94 | temp_df = train_df[temp_df] 95 | if len(temp_df) != 0: 96 | break 97 | x -= 1 98 | 99 | if len(temp_df) > 100: 100 | temp_df = temp_df.sample(100) 101 | 102 | # print(temp_df) 103 | 104 | scores = [] 105 | for temp_word in temp_df['Word'].values: 106 | # score = editDistance(word, temp_word, len(word), len(temp_word)) 107 | score = editDistDP(word, temp_word, len(word), len(temp_word)) 108 | scores.append(score) 109 | 110 | temp_df['Scores'] = scores 111 | temp_df = temp_df.sort_values(by=['Scores'], ascending=True) 112 | 113 | calculated = temp_df.iloc[0, 0] 114 | 115 | act_word = valid_df['Word'].values[i] 116 | 117 | erroneous_words.append(word) 118 | calculated_words.append(calculated) 119 | actual_words.append(act_word) 120 | 121 | if i % 100 == 0 and i > 0: 122 | x = pd.DataFrame({ 123 | 'Error': erroneous_words, 124 | 'Actual': actual_words, 125 | 'Calculated': calculated_words 126 | }) 127 | x.to_csv('./Dataset/ed_output.csv', index=False) 128 | 129 | 130 | # print(word, calculated) 131 | print(f"\n erroneous: {word}\n actual: {act_word}\n calculated: {calculated}") 132 | 133 | words = [] 134 | for i in tqdm(range(len(df))): 135 | if df.iloc[i, 1] not in x['Error'].values: 136 | words.append(df.iloc[i, 0]) 137 | 138 | # x = pd.DataFrame({ 139 | # 'Error': erroneous_words, 140 | # 'Actual': actual_words, 141 | # 'Calculated': calculated_words 142 | # }) 143 | 144 | acc_flags = [] 145 | for i in range(len(x)): 146 | if x.iloc[i, 1] == x.iloc[i, -1]: 147 | acc_flags.append(1) 148 | else: 149 | acc_flags.append(0) 150 | x['EM'] = acc_flags 151 | 152 | train_df = df 153 | mod_acc_flags = [] 154 | for pred in x['Calculated'].values: 155 | if pred in words: 156 | mod_acc_flags.append(1) 157 | else: 158 | mod_acc_flags.append(0) 159 | x['MA'] = mod_acc_flags 160 | 161 | y_true = np.array(x['Actual'].values) 162 | y_pred = np.array(x['Calculated'].values) 163 | 164 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 165 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 166 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 167 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 168 | ACC = metrics.accuracy_score(y_true, y_pred) 169 | 170 | print(f'Accuracy = {ACC*100:.2f}%') 171 | print(f'Precision = {PR:.4f}') 172 | print(f'Recall = {RE:.4f}') 173 | print(f'F1 Score = {F1:.4f}') 174 | print(f'F0.5 Score = 
{F05:.4f}') 175 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Encoder(nn.Module): 7 | def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device, max_length=50): 8 | super().__init__() 9 | assert kernel_size % 2 == 1, "Kernel size should be odd in encoder" 10 | self.device = device 11 | self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device) 12 | self.tok_embedding = nn.Embedding(input_dim, emb_dim) 13 | self.pos_embedding = nn.Embedding(max_length, emb_dim) 14 | self.emb2hid = nn.Linear(emb_dim, hid_dim) 15 | self.hid2emb = nn.Linear(hid_dim, emb_dim) 16 | self.convs = nn.ModuleList([ 17 | nn.Conv1d( 18 | in_channels=hid_dim, out_channels= 2 *hid_dim, kernel_size=kernel_size, padding=(kernel_size-1 )//2 19 | ) for _ in range(n_layers) 20 | ]) 21 | self.dropout = nn.Dropout(dropout) 22 | 23 | def forward(self, src): 24 | batch_size = src.shape[0] 25 | src_len = src.shape[1] 26 | pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device) 27 | tok_embedded = self.tok_embedding(src) 28 | pos_embedded = self.pos_embedding(pos) 29 | embedded = self.dropout(tok_embedded + pos_embedded) 30 | conv_inp = self.emb2hid(embedded) 31 | conv_inp = conv_inp.permute(0, 2, 1) 32 | 33 | for idx, conv in enumerate(self.convs): 34 | conved = conv(self.dropout(conv_inp)) 35 | conved = F.glu(conved, dim=1) 36 | conved = (conved + conv_inp) * self.scale 37 | conv_inp = conved 38 | 39 | conved = self.hid2emb(conved.permute(0, 2, 1)) 40 | combined = (conved + embedded) * self.scale 41 | return conved, combined 42 | 43 | 44 | class Decoder(nn.Module): 45 | def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, \ 46 | trg_pad_idx, device, max_length=50): 47 | super().__init__() 48 | self.kernel_size = kernel_size 49 | self.trg_pad_idx = trg_pad_idx 50 | self.device = device 51 | self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device) 52 | 53 | self.tok_embedding = nn.Embedding(output_dim, emb_dim) 54 | self.pos_embedding = nn.Embedding(max_length, emb_dim) 55 | self.emb2hid = nn.Linear(emb_dim, hid_dim) 56 | self.hid2emb = nn.Linear(hid_dim, emb_dim) 57 | 58 | self.attn_hid2emb = nn.Linear(hid_dim, emb_dim) 59 | self.attn_emb2hid = nn.Linear(emb_dim, hid_dim) 60 | 61 | self.fc_out = nn.Linear(emb_dim, output_dim) 62 | self.convs = nn.ModuleList([ 63 | nn.Conv1d( 64 | in_channels=hid_dim, out_channels=2 * hid_dim, kernel_size=kernel_size 65 | ) for _ in range(n_layers) 66 | ]) 67 | self.dropout = nn.Dropout(dropout) 68 | 69 | def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined): 70 | conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1)) 71 | combined = (conved_emb + embedded) * self.scale 72 | energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1)) 73 | attention = F.softmax(energy, dim=2) 74 | attended_encoding = torch.matmul(attention, encoder_combined) 75 | attended_encoding = self.attn_emb2hid(attended_encoding) 76 | attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale 77 | return attention, attended_combined 78 | 79 | def forward(self, trg, encoder_conved, encoder_combined): 80 | batch_size = trg.shape[0] 81 | trg_len = trg.shape[1] 82 | pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 
1).to(self.device) 83 | 84 | tok_embedded = self.tok_embedding(trg) 85 | pos_embedded = self.pos_embedding(pos) 86 | embedded = self.dropout(tok_embedded + pos_embedded) 87 | 88 | conv_inp = self.emb2hid(embedded) 89 | conv_inp = conv_inp.permute(0, 2, 1) 90 | 91 | batch_size = conv_inp.shape[0] 92 | hid_dim = conv_inp.shape[1] 93 | for idx, conv in enumerate(self.convs): 94 | conv_inp = self.dropout(conv_inp) 95 | padding = torch.zeros( 96 | batch_size, hid_dim, self.kernel_size - 1 97 | ).fill_(self.trg_pad_idx).to(self.device) 98 | padded_conv_inp = torch.cat((padding, conv_inp), dim=2) 99 | conved = conv(padded_conv_inp) 100 | conved = F.glu(conved, dim=1) 101 | 102 | attention, conved = self.calculate_attention( 103 | embedded, conved, encoder_conved, encoder_combined 104 | ) 105 | conved = (conved + conv_inp) * self.scale 106 | conv_inp = conved 107 | 108 | conved = self.hid2emb(conved.permute(0, 2, 1)) 109 | output = self.fc_out(self.dropout(conved)) 110 | return output, attention 111 | 112 | 113 | class Seq2Seq(nn.Module): 114 | def __init__(self, encoder, decoder): 115 | super().__init__() 116 | self.encoder = encoder 117 | self.decoder = decoder 118 | 119 | def forward(self, src, trg): 120 | encoder_conved, encoder_combined = self.encoder(src) 121 | output, attention = self.decoder(trg, encoder_conved, encoder_combined) 122 | return output, attention 123 | 124 | if __name__ == '__main__': 125 | pass -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/utils.py: -------------------------------------------------------------------------------- 1 | import torch, torch.nn as nn, torch.optim as optim 2 | import torch.nn.functional as F 3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset 4 | import random 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from sklearn.model_selection import train_test_split 8 | import math 9 | import time 10 | # from torchtext.data.metrics import bleu_score 11 | 12 | import matplotlib.pyplot as plt 13 | import matplotlib.ticker as ticker 14 | import matplotlib.font_manager as fm 15 | 16 | import numpy as np 17 | import math 18 | import time 19 | 20 | import warnings as wrn 21 | wrn.filterwarnings('ignore') 22 | 23 | 24 | def word2chars(word): 25 | w2c = [char for char in word] 26 | return ' '.join(w2c) 27 | 28 | 29 | def df2train_test_dfs(df, test_size=0.15): 30 | df['Word'] = df['Word'].apply(word2chars) 31 | df['Error'] = df['Error'].apply(word2chars) 32 | df = df.sample(frac=1).reset_index(drop=True) 33 | df = df.iloc[:, [1, 0]] 34 | train_df, test_df = train_test_split(df, test_size=test_size) 35 | train_df.to_csv('./Dataset/train.csv', index=False) 36 | test_df.to_csv('./Dataset/test.csv', index=False) 37 | 38 | 39 | def df2train_valid_test_dfs(df, test_size=0.15): 40 | df['Word'] = df['Word'].apply(word2chars) 41 | df['Error'] = df['Error'].apply(word2chars) 42 | df = df.sample(frac=1).reset_index(drop=True) 43 | df = df.iloc[:, [1, 0]] 44 | train_df, test_df = train_test_split(df, test_size=test_size) 45 | train_df, valid_df = train_test_split(train_df, test_size=.05) 46 | 47 | train_df.to_csv('./Dataset/train.csv', index=False) 48 | valid_df.to_csv('./Dataset/valid.csv', index=False) 49 | test_df.to_csv('./Dataset/test.csv', index=False) 50 | 51 | 52 | def df2train_error_dfs(df, error='Cognitive Error', test_size=0.20): 53 | df['Word'] = df['Word'].apply(word2chars) 54 | df['Error'] = df['Error'].apply(word2chars) 55 | df = df.sample(frac=1).reset_index(drop=True) 56 | 
# df = df.iloc[:, [1, 0]] 57 | train_df, error_df = train_test_split(df, test_size=test_size) 58 | error_df = error_df.loc[error_df['ErrorType'] == error] 59 | train_df = train_df.iloc[:, [1, 0]] 60 | error_df = error_df.iloc[:, [1, 0]] 61 | 62 | train_df.to_csv('./Dataset/train.csv', index=False) 63 | error_df.to_csv('./Dataset/error.csv', index=False) 64 | 65 | 66 | def basic_tokenizer(text): 67 | return text.split() 68 | 69 | 70 | def init_weights(m): 71 | for name, param in m.named_parameters(): 72 | if 'weight' in name: 73 | nn.init.normal_(param.data, mean=0, std=0.01) 74 | else: 75 | nn.init.constant_(param.data, 0) 76 | 77 | 78 | def count_parameters(model): 79 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 80 | 81 | 82 | def save_model(model, epoch, optimizer, train_loss, PATH): 83 | torch.save({ 84 | 'epoch': epoch, 85 | 'model_state_dict': model.state_dict(), 86 | 'optimizer_state_dict': optimizer.state_dict(), 87 | 'loss': train_loss 88 | }, PATH) 89 | print(f"---------\nModel Saved at {PATH}\n---------\n") 90 | 91 | 92 | def load_model(model, optimizer, PATH): 93 | checkpoint = torch.load(PATH) 94 | model.load_state_dict(checkpoint['model_state_dict']) 95 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 96 | epoch = checkpoint['epoch'] 97 | train_loss = checkpoint['loss'] 98 | return checkpoint, epoch, train_loss 99 | 100 | 101 | def print_n_best(decoded_seq, itos): 102 | topk_preds = [] 103 | for rank, seq in enumerate(decoded_seq): 104 | pred = "".join([itos[idx] for idx in seq[1:-1]]) 105 | topk_preds.append(pred) 106 | # print(f'Out: Rank-{rank+1}: {pred}') 107 | return topk_preds 108 | 109 | 110 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=30): 111 | model.eval() 112 | tokens = [token for token in sentence] 113 | 114 | tokens = [src_field.init_token] + tokens + [src_field.eos_token] 115 | 116 | src_indexes = [src_field.vocab.stoi[token] for token in tokens] 117 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device) 118 | src_len = torch.LongTensor([len(src_indexes)]) 119 | 120 | with torch.no_grad(): 121 | encoder_outputs, hidden = model.encoder(src_tensor, src_len) 122 | 123 | mask = model.create_mask(src_tensor) 124 | 125 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]] 126 | attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device) 127 | 128 | for i in range(max_len): 129 | trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device) 130 | with torch.no_grad(): 131 | output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs, mask) 132 | 133 | attentions[i] = attention 134 | 135 | pred_token = output.argmax(1).item() 136 | trg_indexes.append(pred_token) 137 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]: 138 | break 139 | 140 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes] 141 | return trg_tokens[1:], attentions[:len(trg_tokens) - 1] 142 | 143 | 144 | def display_attention(sentence, translation, attention): 145 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf') 146 | 147 | fig = plt.figure(figsize=(7, 10)) 148 | ax = fig.add_subplot(111) 149 | 150 | attention = attention.squeeze(1).cpu().detach().numpy() 151 | 152 | cax = ax.matshow(attention, cmap='bone') 153 | 154 | ax.tick_params(labelsize=15) 155 | 156 | x_ticks = [''] + [''] + [t.lower() for t in sentence] + [''] 157 | y_ticks = [''] + translation 158 | 159 | ax.set_xticklabels(x_ticks, rotation=0, fontproperties=prop, fontsize=20) 160 | 
ax.set_yticklabels(y_ticks, fontproperties=prop, fontsize=20) 161 | 162 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 163 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 164 | 165 | plt.show() 166 | plt.close() 167 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/errors.py: -------------------------------------------------------------------------------- 1 | from utils import ( 2 | df2train_test_dfs, basic_tokenizer, init_weights, count_parameters, 3 | translate_sentence, display_attention, df2train_valid_test_dfs, 4 | save_model, load_model, df2train_error_dfs, word2chars 5 | ) 6 | from models import Encoder, Decoder, Attention, Seq2Seq 7 | from pipeline import train, test_accuracy 8 | from inference import test_beam, test_greedy 9 | from focalLoss import FocalLoss 10 | 11 | import torch, torch.nn as nn, torch.optim as optim 12 | import torch.nn.functional as F 13 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset 14 | import random 15 | from tqdm import tqdm 16 | import pandas as pd 17 | from sklearn.model_selection import train_test_split 18 | import math 19 | import time 20 | # from torchtext.data.metrics import bleu_score 21 | 22 | import matplotlib.pyplot as plt 23 | import matplotlib.ticker as ticker 24 | import matplotlib.font_manager as fm 25 | 26 | import numpy as np 27 | import math 28 | import time 29 | import sys 30 | 31 | import warnings as wrn 32 | wrn.filterwarnings('ignore') 33 | 34 | 35 | def error_df(df, error='Cognitive Error'): 36 | df = df.loc[df['ErrorType'] == error] 37 | df['Word'] = df['Word'].apply(word2chars) 38 | df['Error'] = df['Error'].apply(word2chars) 39 | df = df.sample(frac=1).reset_index(drop=True) 40 | df = df.iloc[:, [1, 0]] 41 | df.to_csv('./Dataset/error.csv', index=False) 42 | 43 | 44 | def check_error(): 45 | df = pd.read_csv('./Dataset/sec_dataset_II.csv') 46 | df = df.iloc[:, :] 47 | # df2train_test_dfs(df=df, test_size=0.15) 48 | df2train_valid_test_dfs(df=df, test_size=0.15) 49 | 50 | # ['Cognitive Error', 'Homonym Error', 'Run-on Error', 51 | # 'Split-word Error (Left)', 'Split-word Error (Random)', 52 | # 'Split-word Error (Right)', 'Split-word Error (both)', 53 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition', 54 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition', 55 | # 'Visual Error', 'Visual Error (Combined Character)'] 56 | error_name = 'Cognitive Error' 57 | error_df(df, error_name) 58 | # df2train_error_dfs(df, error='Cognitive Error') 59 | # sys.exit() 60 | 61 | SRC = Field( 62 | tokenize=basic_tokenizer, lower=False, 63 | init_token='', eos_token='', 64 | sequential=True, use_vocab=True, include_lengths=True 65 | ) 66 | TRG = Field( 67 | tokenize=basic_tokenizer, lower=False, 68 | init_token='', eos_token='', 69 | sequential=True, use_vocab=True 70 | ) 71 | fields = { 72 | 'Error': ('src', SRC), 73 | 'Word': ('trg', TRG) 74 | } 75 | train_data, valid_data, test_data = TabularDataset.splits( 76 | path='./Dataset', 77 | train='train.csv', 78 | validation='valid.csv', 79 | test='test.csv', 80 | format='csv', 81 | fields=fields 82 | ) 83 | error_data, _ = TabularDataset.splits( 84 | path='./Dataset', 85 | train='error.csv', 86 | test='error.csv', 87 | format='csv', 88 | fields=fields 89 | ) 90 | 91 | # print(error_data) 92 | # sys.exit() 93 | 94 | SRC.build_vocab(train_data, max_size=64, min_freq=100) 95 | TRG.build_vocab(train_data, max_size=64, min_freq=75) 96 | # print(len(SRC.vocab), len(TRG.vocab)) 97 | 98 | DEVICE 
= torch.device('cuda' if torch.cuda.is_available() else 'cpu') 99 | BATCH_SIZE = 256 100 | INPUT_DIM = len(SRC.vocab) 101 | OUTPUT_DIM = len(TRG.vocab) 102 | ENC_EMB_DIM = 64 103 | DEC_EMB_DIM = 64 104 | ENC_HIDDEN_DIM = 256 105 | DEC_HIDDEN_DIM = 512 106 | ENC_DROPOUT = 0.25 107 | DEC_DROPOUT = 0.25 108 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token] 109 | MAX_LEN = 32 110 | N_EPOCHS = 10 111 | CLIP = 1 112 | PATH = '' 113 | 114 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits( 115 | (train_data, valid_data, test_data), 116 | batch_size=BATCH_SIZE, 117 | sort_within_batch=True, 118 | sort_key=lambda x: len(x.src), 119 | device=DEVICE 120 | ) 121 | error_iterator, _ = BucketIterator.splits( 122 | (error_data, error_data), 123 | batch_size=BATCH_SIZE, 124 | sort_within_batch=True, 125 | sort_key=lambda x: len(x.src), 126 | device=DEVICE 127 | ) 128 | 129 | attention = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM) 130 | encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, ENC_DROPOUT) 131 | decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DEC_DROPOUT, attention) 132 | 133 | model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, DEVICE).to(DEVICE) 134 | model.apply(init_weights) 135 | # print(model) 136 | # print(f'The model has {count_parameters(model):,} trainable parameters') 137 | 138 | optimizer = optim.Adam(model.parameters()) 139 | # scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=4) 140 | 141 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token] 142 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX) 143 | # criterion = nn.NLLLoss(ignore_index=TRG_PAD_IDX) 144 | # criterion = FocalLoss(alpha=0.5, gamma=2.0, reduction='mean') 145 | 146 | PATH = './Checkpoints/spell_s2s.pth' 147 | # best_loss = 1e10 148 | 149 | checkpoint, epoch, train_loss = load_model(model, optimizer, PATH) 150 | # best_loss = train_loss 151 | error_df_ = pd.read_csv('./Dataset/error.csv') 152 | error_pct = (len(error_df_) / len(df)) * 100 153 | 154 | print(f"\n------------\nError Name: {error_name} - {error_pct:.2f}% of dataset\n------------") 155 | test_accuracy(error_data, SRC, TRG, model, DEVICE) 156 | 157 | 158 | # test_beam(model, train_data, test_data, SRC, TRG, DEVICE) 159 | # test_greedy(test_data, SRC, TRG, model, DEVICE) 160 | 161 | # example_idx = 1 162 | # src = vars(train_data.examples[example_idx])['src'] 163 | # trg = vars(train_data.examples[example_idx])['trg'] 164 | # print(f'src = {src}') 165 | # print(f'trg = {trg}') 166 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 167 | # print(f'predicted trg = {translation}') 168 | # display_attention(src, translation, attention) 169 | 170 | 171 | if __name__ == '__main__': 172 | check_error() 173 | -------------------------------------------------------------------------------- /Baselines/ConvSeq2Seq/main.py: -------------------------------------------------------------------------------- 1 | from utils import ( 2 | basic_tokenizer, word2char, count_parameters, translate_sentence, 3 | save_model, load_model 4 | ) 5 | from errors import error_df 6 | from models import Encoder, Decoder, Seq2Seq 7 | from pipeline import train, evaluate 8 | from metrics import evaluation_report 9 | 10 | import torch 11 | import torch.optim as optim 12 | import torch.nn as nn 13 | import pandas as pd 14 | from sklearn.model_selection import train_test_split 15 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator 16 | import os 17 | import 
argparse 18 | 19 | import warnings as wrn 20 | wrn.filterwarnings('ignore') 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus2.csv", 26 | choices=["./Dataset/corpus.csv", "./Dataset/corpus2.csv"] 27 | ) 28 | parser.add_argument("--EMB_DIM", help="Embedding Dimension", type=int, default=128, choices=[64, 128, 256]) 29 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=256, choices=[64, 128, 256]) 30 | parser.add_argument("--ENC_LAYERS", help="Encoder Layers", type=int,default=5, choices=[5, 10, 20]) 31 | parser.add_argument("--DEC_LAYERS", help="Decoder Layers", type=int,default=5, choices=[5, 10, 20]) 32 | parser.add_argument("--ENC_KERNEL_SIZE", help="Encoder Kernel Size", type=int, default=3, choices=[3, 5, 10]) 33 | parser.add_argument("--DEC_KERNEL_SIZE", help="Decoder Kernel Size", type=int, default=3, choices=[3, 5, 10]) 34 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=.2, choices=[.1, .2, .5]) 35 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=.2, choices=[.1, .2, .5]) 36 | parser.add_argument("--CLIP", help="Gradient Clipping", type=float, default=0.1, choices=[0.1, 0.2, 0.5, 1]) 37 | parser.add_argument("--BATCH_SIZE", help="Batch Size", type=int, default=256, choices=[256, 512]) 38 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100) 39 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005]) 40 | args = parser.parse_args() 41 | 42 | df = pd.read_csv(args.CORPUS) 43 | df['Word'] = df['Word'].apply(word2char) 44 | df['Error'] = df['Error'].apply(word2char) 45 | df = df.sample(frac=1).reset_index(drop=True) 46 | df = df[['Error', 'Word']] 47 | 48 | train_df, test_df = train_test_split(df, test_size=.15) 49 | train_df, valid_df = train_test_split(train_df, test_size=.05) 50 | 51 | train_df.to_csv('./Dataset/train.csv', index=False) 52 | valid_df.to_csv('./Dataset/valid.csv', index=False) 53 | test_df.to_csv('./Dataset/test.csv', index=False) 54 | 55 | SRC = Field( 56 | tokenize=basic_tokenizer, lower=False, 57 | init_token='', eos_token='', batch_first=True 58 | ) 59 | TRG = Field( 60 | tokenize=basic_tokenizer, lower=False, 61 | init_token='', eos_token='', batch_first=True 62 | ) 63 | fields = { 64 | 'Error': ('src', SRC), 65 | 'Word': ('trg', TRG) 66 | } 67 | 68 | train_data, valid_data, test_data = TabularDataset.splits( 69 | path='./Dataset', 70 | train='train.csv', 71 | validation='valid.csv', 72 | test='test.csv', 73 | format='csv', 74 | fields=fields 75 | ) 76 | 77 | SRC.build_vocab(train_data, min_freq=100) 78 | TRG.build_vocab(train_data, min_freq=50) 79 | 80 | # Hyperparameters 81 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 82 | BATCH_SIZE = args.BATCH_SIZE 83 | # 84 | INPUT_DIM = len(SRC.vocab) 85 | OUTPUT_DIM = len(TRG.vocab) 86 | EMB_DIM = args.EMB_DIM # 64 87 | HID_DIM = args.HID_DIM # 256 # each conv. layer has 2 * hid_dim filters 88 | ENC_LAYERS = args.ENC_LAYERS # 10 # number of conv. blocks in encoder 89 | DEC_LAYERS = args.DEC_LAYERS # 10 # number of conv. blocks in decoder 90 | ENC_KERNEL_SIZE = args.ENC_KERNEL_SIZE # must be odd! 
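    # Note: the training loop below reads N_EPOCHS, but this script never copies it out of
    # the parsed arguments, which would raise a NameError at `for epoch in range(epoch, N_EPOCHS)`.
    # Assuming the intent matches the surrounding hyperparameter assignments, the minimal fix is:
    N_EPOCHS = args.N_EPOCHS  # assumed fix: mirrors the other `X = args.X` assignments in this block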
91 | DEC_KERNEL_SIZE = args.DEC_KERNEL_SIZE # can be even or odd 92 | ENC_DROPOUT = args.ENC_DROPOUT 93 | DEC_DROPOUT = args.DEC_DROPOUT 94 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token] 95 | CLIP = args.CLIP 96 | PATH = './Checkpoints/conv_s2s.pth' 97 | 98 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits( 99 | (train_data, valid_data, test_data), 100 | batch_size=BATCH_SIZE, 101 | sort_within_batch=True, 102 | sort_key=lambda x: len(x.src), 103 | device=DEVICE 104 | ) 105 | 106 | enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, DEVICE) 107 | dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, TRG_PAD_IDX, DEVICE) 108 | model = Seq2Seq(enc, dec).to(DEVICE) 109 | # print(f'The model has {count_parameters(model):,} trainable parameters') 110 | 111 | optimizer = optim.Adam(model.parameters()) 112 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX) 113 | 114 | epoch = 1 115 | # load the model 116 | if os.path.exists(PATH): 117 | checkpoint, epoch, train_loss = load_model(model, PATH) 118 | # 119 | best_loss = 1e10 120 | 121 | for epoch in range(epoch, N_EPOCHS): 122 | print(f"Epoch: {epoch} / {N_EPOCHS}") 123 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP) 124 | print(f"Train Loss: {train_loss:.4f}") 125 | if train_loss < best_loss: 126 | best_loss = train_loss 127 | save_model(model, train_loss, epoch, PATH) 128 | 129 | # example_idx = 10 130 | # src = vars(train_data.examples[example_idx])['src'] 131 | # trg = vars(train_data.examples[example_idx])['trg'] 132 | # print(f'src = {src}') 133 | # print(f'trg = {trg}') 134 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 135 | # print(f'predicted trg = {translation}') 136 | 137 | evaluation_report(valid_data, SRC, TRG, model, DEVICE) 138 | # evaluation_report(error_data, SRC, TRG, model, DEVICE) 139 | 140 | 141 | # ------------- 142 | # error_types = ['Cognitive Error', 'Homonym Error', 'Run-on Error', 143 | # 'Split-word Error (Left)', 'Split-word Error (Random)', 144 | # 'Split-word Error (Right)', 'Split-word Error (both)', 145 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition', 146 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition', 147 | # 'Visual Error', 'Visual Error (Combined Character)'] 148 | 149 | # for error_name in error_types: 150 | # print(f'------\nError Type: {error_name}\n------') 151 | # error_df(df_copy, error_name) 152 | 153 | # error_data, _ = TabularDataset.splits( 154 | # path='./Dataset', 155 | # train='error.csv', 156 | # test='error.csv', 157 | # format='csv', 158 | # fields=fields 159 | # ) 160 | 161 | # eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE) 162 | 163 | # error_name = error_name.replace(' ', '').replace('(', '').replace(')', '') 164 | # eval_df.to_csv(f'./Dataframes/convs2s_{error_name}_2.csv') 165 | # print('\n\n') 166 | # ------------- 167 | 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /corrector.py: -------------------------------------------------------------------------------- 1 | from utils import ( 2 | word2char, basic_tokenizer, count_parameters, initialize_weights, 3 | save_model, load_model, error_df, train_valid_test_df, mask2str, 4 | error_blank, find_len, error_df_2 5 | ) 6 | from transformer import ( 7 | Encoder, EncoderLayer, MultiHeadAttentionLayer, 8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer, 9 | 
Seq2Seq 10 | ) 11 | from pipeline import train, evaluate 12 | from metrics import evaluation_report 13 | 14 | import pandas as pd 15 | from sklearn.model_selection import train_test_split 16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator 17 | import torch 18 | import torch.nn as nn 19 | import os 20 | import gc 21 | from tqdm import tqdm 22 | import sys 23 | import argparse 24 | 25 | import warnings as wrn 26 | wrn.filterwarnings('ignore') 27 | 28 | import os 29 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1" 30 | 31 | 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256]) 36 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7]) 37 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7]) 38 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8]) 39 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8]) 40 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256]) 41 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256]) 42 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 43 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 44 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10]) 45 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100) 46 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005]) 47 | args = parser.parse_args() 48 | 49 | SEED = 1234 50 | torch.manual_seed(SEED) 51 | torch.cuda.manual_seed(SEED) 52 | 53 | df = pd.read_csv('./Dataset/purificator_preds.csv') 54 | df_copy = df.copy() 55 | df['Word'] = df['Word'].apply(word2char) 56 | df['Error'] = df['Error'].apply(word2char) 57 | df['ErrorBlanksActual'] = df['ErrorBlanksActual'].apply(word2char) 58 | df['ErrorBlanksPredD1'] = df['ErrorBlanksPredD1'].apply(word2char) 59 | df['ErrorBlanksPredD2'] = df['ErrorBlanksPredD2'].apply(word2char) 60 | 61 | df['MaskErrorBlank'] = ' ' + df['Error'] + ' ' + df['ErrorBlanksPredD2'] + ' ' 62 | df['Length'] = df['MaskErrorBlank'].apply(find_len) 63 | 64 | df = df.loc[df['Length'] <= 48] # 48 works 65 | 66 | # df = df.iloc[:, [1, -2, 8]] # word - maskerrorblank - errortype 67 | df = df[['Word', 'MaskErrorBlank', 'ErrorType']] 68 | 69 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=.15, valid_size=.05) 70 | 71 | train_df.to_csv('./Dataset/train.csv', index=False) 72 | valid_df.to_csv('./Dataset/valid.csv', index=False) 73 | test_df.to_csv('./Dataset/test.csv', index=False) 74 | 75 | SRC = Field( 76 | tokenize=basic_tokenizer, lower=False, 77 | init_token='', eos_token='', batch_first=True 78 | ) 79 | TRG = Field( 80 | tokenize=basic_tokenizer, lower=False, 81 | init_token='', eos_token='', batch_first=True 82 | ) 83 | fields = { 84 | 'MaskErrorBlank': ('src', SRC), 85 | 'Word': ('trg', TRG) 86 | } 87 | 88 | train_data, valid_data, test_data = TabularDataset.splits( 89 | path='./Dataset', 90 | train='train.csv', 
91 | validation='valid.csv', 92 | test='test.csv', 93 | format='csv', 94 | fields=fields 95 | ) 96 | 97 | SRC.build_vocab(train_data, min_freq=100) 98 | TRG.build_vocab(train_data, min_freq=50) 99 | 100 | 101 | # ------------------------------ 102 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 103 | BATCH_SIZE = 512 # 512 104 | # ------------------------------ 105 | INPUT_DIM = len(SRC.vocab) 106 | OUTPUT_DIM = len(TRG.vocab) 107 | # ------------------------------ 108 | HID_DIM = int(args.HID_DIM) 109 | ENC_LAYERS = int(args.ENC_LAYERS) 110 | DEC_LAYERS = int(args.DEC_LAYERS) 111 | ENC_HEADS = int(args.ENC_HEADS) 112 | DEC_HEADS = int(args.DEC_HEADS) 113 | ENC_PF_DIM = int(args.ENC_PF_DIM) 114 | DEC_PF_DIM = int(args.DEC_PF_DIM) 115 | ENC_DROPOUT = float(args.ENC_DROPOUT) 116 | DEC_DROPOUT = float(args.DEC_DROPOUT) 117 | CLIP = float(args.CLIP) 118 | N_EPOCHS = int(args.N_EPOCHS) 119 | LEARNING_RATE = float(args.LEARNING_RATE) 120 | # ------------------------------ 121 | PATH = './Checkpoints/corrector.pth' 122 | # ------------------------------ 123 | gc.collect() 124 | torch.cuda.empty_cache() 125 | # ----------------------------- 126 | 127 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits( 128 | (train_data, valid_data, test_data), 129 | batch_size=BATCH_SIZE, 130 | sort_within_batch=True, 131 | sort_key=lambda x: len(x.src), 132 | device=DEVICE 133 | ) 134 | 135 | enc = Encoder( 136 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, 137 | ENC_DROPOUT, DEVICE 138 | ) 139 | dec = Decoder( 140 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, 141 | DEC_DROPOUT, DEVICE 142 | ) 143 | 144 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token] 145 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token] 146 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE) 147 | model.apply(initialize_weights) 148 | # print(f'The model has {count_parameters(model):,} trainable parameters') 149 | 150 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) 151 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX) 152 | # criterion = nn.BCEWithLogitsLoss() 153 | 154 | epoch = 1 155 | best_loss = 1e10 156 | if os.path.exists(PATH): 157 | checkpoint, epoch, train_loss = load_model(model, PATH) 158 | best_loss = train_loss 159 | 160 | # model.resize_token_embeddings(len(TRG.vocab)) 161 | for epoch in range(epoch, N_EPOCHS): 162 | print(f"Epoch: {epoch} / {N_EPOCHS}") 163 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP) 164 | print(f"Train Loss: {train_loss:.4f}") 165 | if train_loss < best_loss: 166 | best_loss = train_loss 167 | save_model(model, train_loss, epoch, PATH) 168 | 169 | # --------------------- 170 | error_types = sorted(list(set(df.iloc[:, -1].values))) 171 | 172 | for error_name in error_types: 173 | print(f'------\nError Type: {error_name}\n------') 174 | error_df_2(df, error_name) 175 | 176 | error_data, _ = TabularDataset.splits( 177 | path='./Dataset', 178 | train='error.csv', 179 | test='error.csv', 180 | format='csv', 181 | fields=fields 182 | ) 183 | 184 | eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE) 185 | 186 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '') 187 | print('\n\n') 188 | # --------------------- 189 | 190 | 191 | if __name__ == '__main__': 192 | main() 193 | -------------------------------------------------------------------------------- /Baselines/DCSpell/corrector.py: 
-------------------------------------------------------------------------------- 1 | from utils import ( 2 | word2char, basic_tokenizer, count_parameters, initialize_weights, 3 | save_model, load_model, error_df, train_valid_test_df, mask2str, 4 | error_df_2, error_df_3, find_len, train_valid_test_df2, merge_dfs 5 | ) 6 | from transformer import ( 7 | Encoder, EncoderLayer, MultiHeadAttentionLayer, 8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer, 9 | Seq2Seq 10 | ) 11 | from pipeline import train, evaluate 12 | from metrics import evaluation_report, evaluation_report2, evaluation_report3 13 | 14 | import pandas as pd 15 | from sklearn.model_selection import train_test_split 16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator 17 | import torch 18 | import torch.nn as nn 19 | import os 20 | import gc 21 | import argparse 22 | import sys 23 | 24 | import warnings as wrn 25 | wrn.filterwarnings('ignore') 26 | 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256]) 31 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7]) 32 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7]) 33 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8]) 34 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8]) 35 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256]) 36 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256]) 37 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 38 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 39 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10]) 40 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100) 41 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005]) 42 | args = parser.parse_args() 43 | 44 | SEED = 1234 45 | torch.manual_seed(SEED) 46 | torch.cuda.manual_seed(SEED) 47 | 48 | df = pd.read_csv('./Dataset/detector_preds.csv') 49 | df['Error'] = df['Error'].apply(word2char) 50 | df['Word'] = df['Word'].apply(word2char) 51 | df['ErrorBlanksPredD1'] = df['ErrorBlanksPredD1'].apply(word2char) 52 | df['ErrorBlanksActual'] = df['ErrorBlanksActual'].apply(word2char) 53 | 54 | df['MaskErrorBlank'] = ' ' + df['Error'] + ' ' + df['ErrorBlanksPredD1'] + ' ' 55 | df['Length'] = df['MaskErrorBlank'].apply(find_len) 56 | df = df.loc[df['Length'] <= 48] # 48 works 57 | 58 | df = df.sample(frac=1).reset_index(drop=True) 59 | df = df[['ErrorBlanksActual', 'MaskErrorBlank', 'ErrorType']] 60 | 61 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05) 62 | 63 | train_df.to_csv('./Dataset/train.csv', index=False) 64 | valid_df.to_csv('./Dataset/valid.csv', index=False) 65 | test_df.to_csv('./Dataset/test.csv', index=False) 66 | 67 | SRC = Field( 68 | tokenize=basic_tokenizer, lower=False, 69 | init_token='', eos_token='', batch_first=True 70 | ) 71 | TRG = Field( 72 | 
tokenize=basic_tokenizer, lower=False, 73 | init_token='', eos_token='', batch_first=True 74 | ) 75 | WORD = Field( 76 | tokenize=basic_tokenizer, lower=False, 77 | init_token='', eos_token='', batch_first=True 78 | ) 79 | fields = { 80 | 'ErrorBlanksPredD1': ('src', SRC), 81 | 'Word': ('trg', TRG) 82 | } 83 | 84 | train_data, valid_data, test_data = TabularDataset.splits( 85 | path='./Dataset', 86 | train='train.csv', 87 | validation='valid.csv', 88 | test='test.csv', 89 | format='csv', 90 | fields=fields 91 | ) 92 | 93 | SRC.build_vocab(train_data, min_freq=100) # 100 94 | TRG.build_vocab(train_data, min_freq=50) # 50 95 | 96 | # ------------------------------ 97 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 98 | BATCH_SIZE = 512 # 512 99 | # ------------------------------ 100 | INPUT_DIM = len(SRC.vocab) 101 | OUTPUT_DIM = len(TRG.vocab) 102 | # ------------------------------ 103 | HID_DIM = int(args.HID_DIM) 104 | ENC_LAYERS = int(args.ENC_LAYERS) 105 | DEC_LAYERS = int(args.DEC_LAYERS) 106 | ENC_HEADS = int(args.ENC_HEADS) 107 | DEC_HEADS = int(args.DEC_HEADS) 108 | ENC_PF_DIM = int(args.ENC_PF_DIM) 109 | DEC_PF_DIM = int(args.DEC_PF_DIM) 110 | ENC_DROPOUT = float(args.ENC_DROPOUT) 111 | DEC_DROPOUT = float(args.DEC_DROPOUT) 112 | CLIP = float(args.CLIP) 113 | N_EPOCHS = int(args.N_EPOCHS) 114 | LEARNING_RATE = float(args.LEARNING_RATE) 115 | # ------------------------------ 116 | PATH = './Checkpoints/corrector.pth' 117 | # ------------------------------ 118 | gc.collect() 119 | torch.cuda.empty_cache() 120 | # ----------------------------- 121 | 122 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits( 123 | (train_data, valid_data, test_data), 124 | batch_size=BATCH_SIZE, 125 | sort_within_batch=True, 126 | sort_key=lambda x: len(x.src), 127 | device=DEVICE 128 | ) 129 | 130 | enc = Encoder( 131 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, 132 | ENC_DROPOUT, DEVICE 133 | ) 134 | dec = Decoder( 135 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, 136 | DEC_DROPOUT, DEVICE 137 | ) 138 | 139 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token] 140 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token] 141 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE) 142 | model.apply(initialize_weights) 143 | # print(f'The model has {count_parameters(model):,} trainable parameters') 144 | 145 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) 146 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX) 147 | # criterion = nn.BCEWithLogitsLoss() 148 | 149 | epoch = 1 150 | best_loss = 1e10 151 | if os.path.exists(PATH): 152 | checkpoint, epoch, train_loss = load_model(model, PATH) 153 | best_loss = train_loss 154 | 155 | # model.resize_token_embeddings(len(TRG.vocab)) 156 | for epoch in range(epoch, N_EPOCHS): 157 | print(f"Epoch: {epoch} / {N_EPOCHS}") 158 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP) 159 | print(f"Train Loss: {train_loss:.4f}") 160 | if train_loss < best_loss: 161 | best_loss = train_loss 162 | save_model(model, train_loss, epoch, PATH) 163 | 164 | # --------------------- 165 | error_types = sorted(list(set(df.iloc[:, -1].values))) 166 | 167 | for error_name in error_types: 168 | print(f'------\nError Type: {error_name}\n------') 169 | error_df_3(df, error_name) 170 | 171 | error_data, _ = TabularDataset.splits( 172 | path='./Dataset', 173 | train='error.csv', 174 | test='error.csv', 175 | format='csv', 176 | fields=fields 177 | ) 178 | 179 | eval_df = 
evaluation_report(error_data, SRC, TRG, model, DEVICE) 180 | 181 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '') 182 | print('\n\n') 183 | # --------------------- 184 | 185 | 186 | if __name__ == '__main__': 187 | main() 188 | -------------------------------------------------------------------------------- /CorpusCreation/corpus_stats_valid.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from selenium.webdriver.support import expected_conditions as EC 4 | from selenium.webdriver.common.by import By 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | import time 7 | import pandas as pd 8 | import re 9 | import sys 10 | import argparse 11 | from tqdm import tqdm 12 | 13 | 14 | # ######################################################## 15 | def login(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument("--email", help="Enter Your Email") 18 | parser.add_argument("--password", help="Enter Your Facebook Password") 19 | args = parser.parse_args() 20 | 21 | # code to ignore browser notifications 22 | chrome_options = webdriver.ChromeOptions() 23 | prefs = {"profile.default_content_setting_values.notifications": 2} 24 | chrome_options.add_experimental_option("prefs", prefs) 25 | driver = webdriver.Chrome('./chromedriver.exe', chrome_options=chrome_options) 26 | # open the webpage 27 | driver.get("https://www.facebook.com/") 28 | # target username 29 | username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='email']"))) 30 | password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='pass']"))) 31 | # entering email as username 32 | username.clear() 33 | username.send_keys(args.email) 34 | # entering password 35 | password.clear() 36 | password.send_keys(args.password) 37 | # target the login button and click it 38 | time.sleep(5) 39 | button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click() 40 | # We are logged in! 
41 | print("Logged in") 42 | return driver 43 | # ######################################################## 44 | 45 | 46 | # ######################################################## 47 | def scrape_post_1(): 48 | driver = login() 49 | # https://fb.watch/eN-nBOb45t/ 50 | url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid02TjtvmwDs51fyVRaHbvM5XgxL1gBGb6USBYvsxgMdn8c4BcQvjbLv1BFCjw52UsXQl&id=111762869482599&eav=Afba2OolCuRXElnzf97xViXfIosR66LZPdko_Q9oxtd5fhvZMDjeKOC_JD1Nx2LKtEE&__tn__=%2AW&paipv=0" 51 | # url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid0eP3VufmYZQEdDrGybgzg9ganLPXRo9JXQ8q5pUjiaBF7gTQ9FnkJdw44PDfx11JKl&id=313147292549612&eav=AfbiujhhnbU2KOwEYD6oavgC5llyK5uWWqiecav3DYpPCCC4llyMqpaYY9rPUvap1z0&ref=sharing&__tn__=%2AW&paipv=0" 52 | while True: 53 | driver.get(url) 54 | comments = driver.find_element(By.CLASS_NAME, "ef").text 55 | comments = re.sub("[A-Za-z0-9·\\n]", "", comments) 56 | next_page = driver.find_elements(By.TAG_NAME, "a")[-1].get_attribute('href') 57 | if type(next_page) != str: 58 | break 59 | url = next_page 60 | time.sleep(5) 61 | sys.exit() 62 | with open('./dfs/comments.txt', 'a', encoding='utf-8') as f: 63 | f.write(comments) 64 | f.write(' \n ') 65 | # ######################################################## 66 | 67 | 68 | # ######################################################## 69 | def scrape_post_2(): 70 | driver = login() 71 | # https://fb.watch/eNQHYjDuA6/ 72 | url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid0eP3VufmYZQEdDrGybgzg9ganLPXRo9JXQ8q5pUjiaBF7gTQ9FnkJdw44PDfx11JKl&id=313147292549612&eav=AfbiujhhnbU2KOwEYD6oavgC5llyK5uWWqiecav3DYpPCCC4llyMqpaYY9rPUvap1z0&ref=sharing&__tn__=%2AW&paipv=0" 73 | while True: 74 | driver.get(url) 75 | comments = driver.find_elements(By.CLASS_NAME, "eb") 76 | for comment in comments: 77 | comment = comment.text 78 | comment = re.sub("[A-Za-z0-9·.\\n]", "", comment) 79 | with open('comments.txt', 'a', encoding='utf-8') as f: 80 | f.write(comment) 81 | f.write(' ') 82 | 83 | comments = driver.find_elements(By.CLASS_NAME, "ec") 84 | for comment in comments: 85 | comment = comment.text 86 | comment = re.sub("[A-Za-z0-9·.\\n]", "", comment) 87 | with open('./dfs/comments.txt', 'a', encoding='utf-8') as f: 88 | f.write(comment) 89 | f.write(' ') 90 | 91 | next_page = driver.find_elements(By.TAG_NAME, "a")[-1].get_attribute('href') 92 | if type(next_page) != str: 93 | break 94 | 95 | url = next_page 96 | time.sleep(5) 97 | # ######################################################## 98 | 99 | 100 | # ######################################################## 101 | def clean_text(text): 102 | all_chars = ['ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 103 | 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 104 | 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 105 | 'ষ', 'স', 'হ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ', 106 | 'ৗ', 'ড়', 'ঢ়', 'য়'] 107 | cleaned_text = '' 108 | for i in tqdm(range(len(text))): 109 | if text[i] in all_chars: 110 | cleaned_text += text[i] 111 | else: 112 | cleaned_text += ' ' 113 | return cleaned_text 114 | 115 | def find_stats(): 116 | f = open("./dfs/comments.txt", "r", encoding='utf-8') 117 | text = f.read() 118 | text = clean_text(text) 119 | 120 | words = sorted(text.split()) 121 | unique_words = sorted(list(set(words))) 122 | 123 | error_df = pd.read_csv('./dfs/sec_dataset_IV.csv') 124 | balanced_df = pd.DataFrame() 125 | all_error_types = 
sorted(list(set(error_df.iloc[:, -1].values))) 126 | for error in all_error_types: 127 | x = error_df.loc[error_df['ErrorType'] == error] 128 | if (len(x)) < 100000: 129 | balanced_df = pd.concat([balanced_df, x]) 130 | else: 131 | balanced_df = pd.concat([balanced_df, x.sample(100000)]) 132 | 133 | erroneous_words = balanced_df.iloc[:, 1].values 134 | erroneous_words_type = balanced_df.iloc[:, 2].values 135 | 136 | found = [] 137 | types = [] 138 | for i in tqdm(range(len(unique_words))): 139 | word = unique_words[i] 140 | if word in erroneous_words: 141 | found.append(word) 142 | types.append(erroneous_words_type[i]) 143 | if (i != 0 and i % 1000 == 0): 144 | print(len(found)) 145 | 146 | error_words = [] 147 | error_types = [] 148 | for i in tqdm(range(len(found))): 149 | word = found[i] 150 | etype = error_df.loc[error_df['Error'] == word]['ErrorType'].values[0] 151 | error_words.append(word) 152 | error_types.append(etype) 153 | 154 | temp = pd.DataFrame({ 155 | 'Error': error_words, 156 | 'ErrorType': error_types 157 | }) 158 | 159 | unique_etypes = sorted(list(set(error_types))) 160 | err_names, instances, pcts = [], [], [] 161 | for etype in unique_etypes: 162 | x = temp.loc[temp['ErrorType'] == etype] 163 | print(f"{etype}, {len(x)}/{len(temp)}, {len(x) / len(temp) * 100:.2f}%") 164 | err_names.append(etype) 165 | instances.append(f"{len(x)}/{len(temp)}") 166 | pcts.append(len(x) / len(temp) * 100) 167 | 168 | df = pd.DataFrame({ 169 | 'ErrorType': err_names, 170 | 'Instances': instances, 171 | 'Pct': pcts 172 | }) 173 | print(df) 174 | 175 | print("Missing error types") 176 | found = sorted(list(set(error_types))) 177 | target = sorted(list(set(error_df.iloc[:, -1].values))) 178 | 179 | for item in target: 180 | if item not in found: 181 | print(item) 182 | # ######################################################## 183 | 184 | 185 | # ######################################################## 186 | if __name__ == '__main__': 187 | scrape_post_1() 188 | scrape_post_2() 189 | find_stats() 190 | -------------------------------------------------------------------------------- /Baselines/GRUSeq2Seq/main.py: -------------------------------------------------------------------------------- 1 | from utils import ( 2 | df2train_test_dfs, basic_tokenizer, init_weights, count_parameters, 3 | translate_sentence, display_attention, df2train_valid_test_dfs, 4 | save_model, load_model, df2train_error_dfs 5 | ) 6 | from models import Encoder, Decoder, Attention, Seq2Seq 7 | from pipeline import train, test_accuracy 8 | from inference import test_beam, test_greedy 9 | from focalLoss import FocalLoss 10 | from errors import error_df 11 | 12 | import torch, torch.nn as nn, torch.optim as optim 13 | import torch.nn.functional as F 14 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset 15 | import random 16 | from tqdm import tqdm 17 | import pandas as pd 18 | from sklearn.model_selection import train_test_split 19 | import math 20 | import time 21 | 22 | import matplotlib.pyplot as plt 23 | import matplotlib.ticker as ticker 24 | import matplotlib.font_manager as fm 25 | 26 | import numpy as np 27 | import math 28 | import time 29 | import sys 30 | import os 31 | import argparse 32 | 33 | import warnings as wrn 34 | wrn.filterwarnings('ignore') 35 | 36 | 37 | def main(): 38 | parser = argparse.ArgumentParser() 39 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus2.csv", 40 | choices=["./Dataset/corpus.csv", "./Dataset/corpus2.csv"] 41 
| ) 42 | parser.add_argument("--ENC_EMB_DIM", help="Encoder Embedding Dimension", type=int, default=128, choices=[64, 128, 256]) 43 | parser.add_argument("--DEC_EMB_DIM", help="Decoder Embedding Dimension", type=int, default=128, choices=[64, 128, 256]) 44 | parser.add_argument("--ENC_HIDDEN_DIM", help="Encoder Hidden Dimension", type=int,default=256, choices=[128, 256, 512]) 45 | parser.add_argument("--DEC_HIDDEN_DIM", help="Decoder Hidden Dimension", type=int, default=512, choices=[256, 512, 1024]) 46 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 47 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 48 | parser.add_argument("--MAX_LEN", help="Maximum Length", type=int, default=48, choices=[48, 56, 64]) 49 | parser.add_argument("--BATCH_SIZE", help="Batch Size", type=int, default=256, choices=[256, 512]) 50 | parser.add_argument("--CLIP", help="Gradient Clipping", type=float, default=1, choices=[0.1, 0.2, 0.5, 1]) 51 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100) 52 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005]) 53 | args = parser.parse_args() 54 | 55 | 56 | df = pd.read_csv(args.CORPUS) 57 | df2train_valid_test_dfs(df=df, test_size=0.15) 58 | 59 | SRC = Field( 60 | tokenize=basic_tokenizer, lower=False, 61 | init_token='', eos_token='', 62 | sequential=True, use_vocab=True, include_lengths=True 63 | ) 64 | TRG = Field( 65 | tokenize=basic_tokenizer, lower=False, 66 | init_token='', eos_token='', 67 | sequential=True, use_vocab=True 68 | ) 69 | fields = { 70 | 'Error': ('src', SRC), 71 | 'Word': ('trg', TRG) 72 | } 73 | train_data, valid_data, test_data = TabularDataset.splits( 74 | path='./Dataset', 75 | train='train.csv', 76 | validation='valid.csv', 77 | test='test.csv', 78 | format='csv', 79 | fields=fields 80 | ) 81 | 82 | SRC.build_vocab(train_data, max_size=64, min_freq=100) 83 | TRG.build_vocab(train_data, max_size=64, min_freq=75) 84 | # ------------------------------------- 85 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 86 | BATCH_SIZE = args.BATCH_SIZE 87 | INPUT_DIM = len(SRC.vocab) 88 | OUTPUT_DIM = len(TRG.vocab) 89 | ENC_EMB_DIM = args.ENC_EMB_DIM 90 | DEC_EMB_DIM = args.DEC_EMB_DIM 91 | ENC_HIDDEN_DIM = args.ENC_HIDDEN_DIM 92 | DEC_HIDDEN_DIM = args.DEC_HIDDEN_DIM 93 | ENC_DROPOUT = args.ENC_DROPOUT 94 | DEC_DROPOUT = args.DEC_DROPOUT 95 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token] 96 | MAX_LEN = args.MAX_LEN 97 | N_EPOCHS = args.N_EPOCHS 98 | CLIP = args.CLIP 99 | # ------------------------------------- 100 | PATH = './Checkpoints/GRUSeq2Seq.pth' 101 | 102 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits( 103 | (train_data, valid_data, test_data), 104 | batch_size=BATCH_SIZE, 105 | sort_within_batch=True, 106 | sort_key=lambda x: len(x.src), 107 | device=DEVICE 108 | ) 109 | 110 | attention = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM) 111 | encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, ENC_DROPOUT) 112 | decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DEC_DROPOUT, attention) 113 | 114 | model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, DEVICE).to(DEVICE) 115 | model.apply(init_weights) 116 | # print(f'The model has {count_parameters(model):,} trainable parameters') 117 | 118 | optimizer = 
optim.Adam(model.parameters()) 119 | # scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=4) 120 | 121 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token] 122 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX) 123 | # criterion = nn.NLLLoss(ignore_index=TRG_PAD_IDX) 124 | # criterion = FocalLoss(alpha=0.5, gamma=2.0, reduction='mean') 125 | 126 | best_loss = 1e10 127 | epoch = 1 128 | if os.path.exists(PATH): 129 | checkpoint, epoch, train_loss = load_model(model, optimizer, PATH) 130 | best_loss = train_loss 131 | 132 | for epoch in range(epoch, N_EPOCHS): 133 | print(f'Epoch: {epoch} / {N_EPOCHS}') 134 | train_loss = train(model, train_iterator, optimizer, criterion) 135 | print(f"Train Loss: {train_loss:.2f}") 136 | 137 | if train_loss < best_loss: 138 | best_loss = train_loss 139 | save_model(model, epoch, optimizer, train_loss, PATH) 140 | 141 | # scheduler.step() 142 | # if epoch%10 == 0: 143 | # # test_accuracy(valid_data, SRC, TRG, model, DEVICE) 144 | # test_accuracy(error_data, SRC, TRG, model, DEVICE) 145 | 146 | test_accuracy(valid_data, SRC, TRG, model, DEVICE) 147 | 148 | 149 | # errors = ['Cognitive Error', 'Homonym Error', 'Run-on Error', 150 | # 'Split-word Error (Left)', 'Split-word Error (Random)', 151 | # 'Split-word Error (Right)', 'Split-word Error (both)', 152 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition', 153 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition', 154 | # 'Visual Error', 'Visual Error (Combined Character)'] 155 | 156 | # for error in errors: 157 | # print(f"-----\nError Type: {error}\n-----") 158 | # error_df(df, error) 159 | # error_data, _ = TabularDataset.splits( 160 | # path='./Dataset', 161 | # train='error.csv', 162 | # test='error.csv', 163 | # format='csv', 164 | # fields=fields 165 | # ) 166 | # eval_df = test_accuracy(error_data, SRC, TRG, model, DEVICE) 167 | # error = error.replace(' ', '').replace('(', '').replace(')', '') 168 | # eval_df.to_csv(f'./Corrections/s2sJL_{error}.csv') 169 | # print('\n\n') 170 | 171 | 172 | # test_beam(model, train_data, test_data, SRC, TRG, DEVICE) 173 | # test_greedy(test_data, SRC, TRG, model, DEVICE) 174 | 175 | # example_idx = 1 176 | # src = vars(train_data.examples[example_idx])['src'] 177 | # trg = vars(train_data.examples[example_idx])['trg'] 178 | # print(f'src = {src}') 179 | # print(f'trg = {trg}') 180 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 181 | # print(f'predicted trg = {translation}') 182 | # display_attention(src, translation, attention) 183 | 184 | 185 | if __name__ == '__main__': 186 | main() 187 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.model_selection import train_test_split 6 | import os 7 | 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | SEED = 1234 12 | torch.manual_seed(SEED) 13 | torch.cuda.manual_seed(SEED) 14 | 15 | 16 | # --------------------------- 17 | def train_valid_test_df(df, test_size, valid_size): 18 | # etypes = list(set(df.iloc[:, -1])) 19 | etypes = list(set(df['ErrorType'])) 20 | 21 | train_df = pd.DataFrame() 22 | valid_df = pd.DataFrame() 23 | test_df = pd.DataFrame() 24 | 25 | for etype in etypes: 26 | etype_df = df.loc[df['ErrorType'] == etype] 27 | train, test = train_test_split(etype_df, test_size=test_size) 28 | 
train, valid = train_test_split(train, test_size=valid_size) 29 | 30 | train_df = pd.concat([train_df, train]) 31 | valid_df = pd.concat([valid_df, valid]) 32 | test_df = pd.concat([test_df, test]) 33 | 34 | train_df = train_df.sample(frac=1).reset_index(drop=True) 35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True) 36 | test_df = test_df.sample(frac=1).reset_index(drop=True) 37 | 38 | train_df = train_df.iloc[:, [1, 0]] 39 | valid_df = valid_df.iloc[:, [1, 0]] 40 | test_df = test_df.iloc[:, [1, 0]] 41 | 42 | return train_df, valid_df, test_df 43 | # --------------------------- 44 | 45 | 46 | # --------------------------- 47 | def train_valid_test_df2(df, test_size, valid_size): 48 | # etypes = list(set(df.iloc[:, -1])) 49 | etypes = list(set(df['ErrorType'])) 50 | 51 | train_df = pd.DataFrame() 52 | valid_df = pd.DataFrame() 53 | test_df = pd.DataFrame() 54 | 55 | for etype in etypes: 56 | etype_df = df.loc[df['ErrorType'] == etype] 57 | train, test = train_test_split(etype_df, test_size=test_size) 58 | train, valid = train_test_split(train, test_size=valid_size) 59 | 60 | train_df = pd.concat([train_df, train]) 61 | valid_df = pd.concat([valid_df, valid]) 62 | test_df = pd.concat([test_df, test]) 63 | 64 | train_df = train_df.sample(frac=1).reset_index(drop=True) 65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True) 66 | test_df = test_df.sample(frac=1).reset_index(drop=True) 67 | 68 | # train_df = train_df.iloc[:, [1, 0]] 69 | # valid_df = valid_df.iloc[:, [1, 0]] 70 | # test_df = test_df.iloc[:, [1, 0]] 71 | 72 | return train_df, valid_df, test_df 73 | # --------------------------- 74 | 75 | 76 | # --------------------------- 77 | def merge_dfs(network='detector'): 78 | df_names = [ 79 | f'{network}_CognitiveError.csv', 80 | f'{network}_HomonymError.csv', 81 | f'{network}_Run-onError.csv', 82 | f'{network}_Split-wordErrorLeft.csv', 83 | f'{network}_Split-wordErrorRandom.csv', 84 | f'{network}_Split-wordErrorRight.csv', 85 | f'{network}_Split-wordErrorboth.csv', 86 | f'{network}_TypoAvroSubstituition.csv', 87 | f'{network}_TypoBijoySubstituition.csv', 88 | f'{network}_TypoDeletion.csv', 89 | f'{network}_TypoInsertion.csv', 90 | f'{network}_TypoTransposition.csv', 91 | f'{network}_VisualError.csv', 92 | f'{network}_VisualErrorCombinedCharacter.csv' 93 | ] 94 | 95 | df = pd.DataFrame() 96 | 97 | for df_name in df_names: 98 | df_path = os.path.join('./Dataframes', df_name) 99 | temp_df = pd.read_csv(df_path) 100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1] 101 | for _ in range(len(temp_df))] 102 | df = pd.concat([df, temp_df]) 103 | 104 | df = df.iloc[:, :] 105 | 106 | if network=='detector': 107 | df.rename( 108 | columns = { 109 | 'Predicton':'ErrorBlanksPredD1', 110 | 'Target':'ErrorBlanksActual', 111 | 'Correction':'EBP_Flag_D1', 112 | }, 113 | inplace = True 114 | ) 115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']] 116 | 117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector) 118 | # (purificator) 119 | # --------------------------- 120 | 121 | 122 | # --------------------------- 123 | def error_df(df, error='Cognitive Error'): 124 | df = df.loc[df['ErrorType'] == error] 125 | df['Word'] = df['Word'].apply(word2char) 126 | df['Error'] = df['Error'].apply(word2char) 127 | df = df.sample(frac=1).reset_index(drop=True) 128 | idx = int(len(df)/1) 129 | df = df.iloc[:idx, [1, 0]] 130 | df.to_csv('./Dataset/error.csv', index=False) 131 | # 
--------------------------- 132 | 133 | 134 | # --------------------------- 135 | def error_df_2(df, error='Cognitive Error'): 136 | df = df.loc[df['ErrorType'] == error] 137 | # df['Word'] = df['Word'].apply(word2char) 138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char) 139 | df = df.sample(frac=1).reset_index(drop=True) 140 | idx = int(len(df)/1) 141 | df = df.iloc[:idx, [1, 0]] 142 | # 143 | # if(len(df) >= 10000): 144 | # df = df.iloc[:10000, :] 145 | # 146 | df.to_csv('./Dataset/error.csv', index=False) 147 | # --------------------------- 148 | 149 | 150 | # --------------------------- 151 | def error_df_3(df, error='Cognitive Error'): 152 | df = df.loc[df['ErrorType'] == error] 153 | # df['Word'] = df['Word'].apply(word2char) 154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char) 155 | df = df.sample(frac=1).reset_index(drop=True) 156 | # idx = int(len(df)/1) 157 | # df = df.iloc[:idx, [1, 0]] 158 | # 159 | # if(len(df) >= 10000): 160 | # df = df.iloc[:10000, :] 161 | # 162 | df.to_csv('./Dataset/error.csv', index=False) 163 | # --------------------------- 164 | 165 | 166 | # --------------------------- 167 | def word2char(word): 168 | w2c = [char for char in word] 169 | return ' '.join(w2c) 170 | # --------------------------- 171 | 172 | 173 | # --------------------------- 174 | def find_len(seq): 175 | return len(seq.split(' ')) 176 | # --------------------------- 177 | 178 | 179 | # --------------------------- 180 | def mask2str(mask): 181 | x = '' 182 | for item in mask: 183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]": 184 | x += str(item) 185 | return x 186 | # --------------------------- 187 | 188 | 189 | # --------------------------- 190 | def error_blank(error, mask): 191 | error_list = np.array(error.split()) 192 | mask_list = np.array(mask.split()) 193 | idx = np.where(mask_list=='1')[0] 194 | error_list[idx] = ' ' 195 | error = ' '.join(error_list) 196 | return error 197 | # --------------------------- 198 | 199 | 200 | # --------------------------- 201 | def basic_tokenizer(text): 202 | return text.split() 203 | # --------------------------- 204 | 205 | 206 | # --------------------------- 207 | def count_parameters(model): 208 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 209 | # --------------------------- 210 | 211 | 212 | # --------------------------- 213 | def initialize_weights(m): 214 | if hasattr(m, 'weight') and m.weight.dim() > 1: 215 | nn.init.xavier_uniform_(m.weight.data) 216 | # --------------------------- 217 | 218 | 219 | # --------------------------- 220 | def save_model(model, train_loss, epoch, PATH): 221 | torch.save({ 222 | 'epoch': epoch, 223 | 'model_state_dict': model.state_dict(), 224 | # 'optimizer_state_dict': optimizer.state_dict(), 225 | 'loss': train_loss 226 | }, PATH) 227 | print(f"---------\nModel Saved at {PATH}\n---------\n") 228 | # --------------------------- 229 | 230 | 231 | # --------------------------- 232 | def load_model(model, PATH): 233 | checkpoint = torch.load(PATH) 234 | model.load_state_dict(checkpoint['model_state_dict']) 235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 236 | epoch = checkpoint['epoch'] 237 | train_loss = checkpoint['loss'] 238 | return checkpoint, epoch, train_loss 239 | # --------------------------- 240 | 241 | 242 | if __name__ == '__main__': 243 | pass 244 | -------------------------------------------------------------------------------- /Baselines/DCSpell/utils.py: 
-------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.model_selection import train_test_split 6 | import os 7 | 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | SEED = 1234 12 | torch.manual_seed(SEED) 13 | torch.cuda.manual_seed(SEED) 14 | 15 | 16 | # --------------------------- 17 | def train_valid_test_df(df, test_size, valid_size): 18 | # etypes = list(set(df.iloc[:, -1])) 19 | etypes = list(set(df['ErrorType'])) 20 | 21 | train_df = pd.DataFrame() 22 | valid_df = pd.DataFrame() 23 | test_df = pd.DataFrame() 24 | 25 | for etype in etypes: 26 | etype_df = df.loc[df['ErrorType'] == etype] 27 | train, test = train_test_split(etype_df, test_size=test_size) 28 | train, valid = train_test_split(train, test_size=valid_size) 29 | 30 | train_df = pd.concat([train_df, train]) 31 | valid_df = pd.concat([valid_df, valid]) 32 | test_df = pd.concat([test_df, test]) 33 | 34 | train_df = train_df.sample(frac=1).reset_index(drop=True) 35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True) 36 | test_df = test_df.sample(frac=1).reset_index(drop=True) 37 | 38 | train_df = train_df.iloc[:, [1, 0]] 39 | valid_df = valid_df.iloc[:, [1, 0]] 40 | test_df = test_df.iloc[:, [1, 0]] 41 | 42 | return train_df, valid_df, test_df 43 | # --------------------------- 44 | 45 | 46 | # --------------------------- 47 | def train_valid_test_df2(df, test_size, valid_size): 48 | # etypes = list(set(df.iloc[:, -1])) 49 | etypes = list(set(df['ErrorType'])) 50 | 51 | train_df = pd.DataFrame() 52 | valid_df = pd.DataFrame() 53 | test_df = pd.DataFrame() 54 | 55 | for etype in etypes: 56 | etype_df = df.loc[df['ErrorType'] == etype] 57 | train, test = train_test_split(etype_df, test_size=test_size) 58 | train, valid = train_test_split(train, test_size=valid_size) 59 | 60 | train_df = pd.concat([train_df, train]) 61 | valid_df = pd.concat([valid_df, valid]) 62 | test_df = pd.concat([test_df, test]) 63 | 64 | train_df = train_df.sample(frac=1).reset_index(drop=True) 65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True) 66 | test_df = test_df.sample(frac=1).reset_index(drop=True) 67 | 68 | # train_df = train_df.iloc[:, [1, 0]] 69 | # valid_df = valid_df.iloc[:, [1, 0]] 70 | # test_df = test_df.iloc[:, [1, 0]] 71 | 72 | return train_df, valid_df, test_df 73 | # --------------------------- 74 | 75 | 76 | # --------------------------- 77 | def merge_dfs(network='detector'): 78 | df_names = [ 79 | f'{network}_CognitiveError.csv', 80 | f'{network}_HomonymError.csv', 81 | f'{network}_Run-onError.csv', 82 | f'{network}_Split-wordErrorLeft.csv', 83 | f'{network}_Split-wordErrorRandom.csv', 84 | f'{network}_Split-wordErrorRight.csv', 85 | f'{network}_Split-wordErrorboth.csv', 86 | f'{network}_TypoAvroSubstituition.csv', 87 | f'{network}_TypoBijoySubstituition.csv', 88 | f'{network}_TypoDeletion.csv', 89 | f'{network}_TypoInsertion.csv', 90 | f'{network}_TypoTransposition.csv', 91 | f'{network}_VisualError.csv', 92 | f'{network}_VisualErrorCombinedCharacter.csv' 93 | ] 94 | 95 | df = pd.DataFrame() 96 | 97 | for df_name in df_names: 98 | df_path = os.path.join('./Dataframes', df_name) 99 | temp_df = pd.read_csv(df_path) 100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1] 101 | for _ in range(len(temp_df))] 102 | df = pd.concat([df, temp_df]) 103 | 104 | df = df.iloc[:, :] 105 | 106 | if network=='detector': 107 | df.rename( 108 | columns = { 109 | 
'Predicton':'ErrorBlanksPredD1', 110 | 'Target':'ErrorBlanksActual', 111 | 'Correction':'EBP_Flag_D1', 112 | }, 113 | inplace = True 114 | ) 115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']] 116 | 117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector) 118 | # (purificator) 119 | # --------------------------- 120 | 121 | 122 | # --------------------------- 123 | def error_df(df, error='Cognitive Error'): 124 | df = df.loc[df['ErrorType'] == error] 125 | df['Word'] = df['Word'].apply(word2char) 126 | df['Error'] = df['Error'].apply(word2char) 127 | df = df.sample(frac=1).reset_index(drop=True) 128 | idx = int(len(df)/1) 129 | df = df.iloc[:idx, [1, 0]] 130 | df.to_csv('./Dataset/error.csv', index=False) 131 | # --------------------------- 132 | 133 | 134 | # --------------------------- 135 | def error_df_2(df, error='Cognitive Error'): 136 | df = df.loc[df['ErrorType'] == error] 137 | # df['Word'] = df['Word'].apply(word2char) 138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char) 139 | df = df.sample(frac=1).reset_index(drop=True) 140 | idx = int(len(df)/1) 141 | df = df.iloc[:idx, [1, 0]] 142 | # 143 | # if(len(df) >= 10000): 144 | # df = df.iloc[:10000, :] 145 | # 146 | df.to_csv('./Dataset/error.csv', index=False) 147 | # --------------------------- 148 | 149 | 150 | # --------------------------- 151 | def error_df_3(df, error='Cognitive Error'): 152 | df = df.loc[df['ErrorType'] == error] 153 | # df['Word'] = df['Word'].apply(word2char) 154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char) 155 | df = df.sample(frac=1).reset_index(drop=True) 156 | # idx = int(len(df)/1) 157 | # df = df.iloc[:idx, [1, 0]] 158 | # 159 | # if(len(df) >= 10000): 160 | # df = df.iloc[:10000, :] 161 | # 162 | df.to_csv('./Dataset/error.csv', index=False) 163 | # --------------------------- 164 | 165 | 166 | # --------------------------- 167 | def word2char(word): 168 | w2c = [char for char in word] 169 | return ' '.join(w2c) 170 | # --------------------------- 171 | 172 | 173 | # --------------------------- 174 | def find_len(seq): 175 | return len(seq.split(' ')) 176 | # --------------------------- 177 | 178 | 179 | # --------------------------- 180 | def mask2str(mask): 181 | x = '' 182 | for item in mask: 183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]": 184 | x += str(item) 185 | return x 186 | # --------------------------- 187 | 188 | 189 | # --------------------------- 190 | def error_blank(error, mask): 191 | error_list = np.array(error.split()) 192 | mask_list = np.array(mask.split()) 193 | idx = np.where(mask_list=='1')[0] 194 | error_list[idx] = ' ' 195 | error = ' '.join(error_list) 196 | return error 197 | # --------------------------- 198 | 199 | 200 | # --------------------------- 201 | def basic_tokenizer(text): 202 | return text.split() 203 | # --------------------------- 204 | 205 | 206 | # --------------------------- 207 | def count_parameters(model): 208 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 209 | # --------------------------- 210 | 211 | 212 | # --------------------------- 213 | def initialize_weights(m): 214 | if hasattr(m, 'weight') and m.weight.dim() > 1: 215 | nn.init.xavier_uniform_(m.weight.data) 216 | # --------------------------- 217 | 218 | 219 | # --------------------------- 220 | def save_model(model, train_loss, epoch, PATH): 221 | torch.save({ 222 | 'epoch': epoch, 223 | 
'model_state_dict': model.state_dict(), 224 | # 'optimizer_state_dict': optimizer.state_dict(), 225 | 'loss': train_loss 226 | }, PATH) 227 | print(f"---------\nModel Saved at {PATH}\n---------\n") 228 | # --------------------------- 229 | 230 | 231 | # --------------------------- 232 | def load_model(model, PATH): 233 | checkpoint = torch.load(PATH) 234 | model.load_state_dict(checkpoint['model_state_dict']) 235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 236 | epoch = checkpoint['epoch'] 237 | train_loss = checkpoint['loss'] 238 | return checkpoint, epoch, train_loss 239 | # --------------------------- 240 | 241 | 242 | if __name__ == '__main__': 243 | pass 244 | -------------------------------------------------------------------------------- /Baselines/DTransformer/utils.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.model_selection import train_test_split 6 | import os 7 | 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | SEED = 1234 12 | torch.manual_seed(SEED) 13 | torch.cuda.manual_seed(SEED) 14 | 15 | 16 | # --------------------------- 17 | def train_valid_test_df(df, test_size, valid_size): 18 | # etypes = list(set(df.iloc[:, -1])) 19 | etypes = list(set(df['ErrorType'])) 20 | 21 | train_df = pd.DataFrame() 22 | valid_df = pd.DataFrame() 23 | test_df = pd.DataFrame() 24 | 25 | for etype in etypes: 26 | etype_df = df.loc[df['ErrorType'] == etype] 27 | train, test = train_test_split(etype_df, test_size=test_size) 28 | train, valid = train_test_split(train, test_size=valid_size) 29 | 30 | train_df = pd.concat([train_df, train]) 31 | valid_df = pd.concat([valid_df, valid]) 32 | test_df = pd.concat([test_df, test]) 33 | 34 | train_df = train_df.sample(frac=1).reset_index(drop=True) 35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True) 36 | test_df = test_df.sample(frac=1).reset_index(drop=True) 37 | 38 | train_df = train_df.iloc[:, [1, 0]] 39 | valid_df = valid_df.iloc[:, [1, 0]] 40 | test_df = test_df.iloc[:, [1, 0]] 41 | 42 | return train_df, valid_df, test_df 43 | # --------------------------- 44 | 45 | 46 | # --------------------------- 47 | def train_valid_test_df2(df, test_size, valid_size): 48 | # etypes = list(set(df.iloc[:, -1])) 49 | etypes = list(set(df['ErrorType'])) 50 | 51 | train_df = pd.DataFrame() 52 | valid_df = pd.DataFrame() 53 | test_df = pd.DataFrame() 54 | 55 | for etype in etypes: 56 | etype_df = df.loc[df['ErrorType'] == etype] 57 | train, test = train_test_split(etype_df, test_size=test_size) 58 | train, valid = train_test_split(train, test_size=valid_size) 59 | 60 | train_df = pd.concat([train_df, train]) 61 | valid_df = pd.concat([valid_df, valid]) 62 | test_df = pd.concat([test_df, test]) 63 | 64 | train_df = train_df.sample(frac=1).reset_index(drop=True) 65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True) 66 | test_df = test_df.sample(frac=1).reset_index(drop=True) 67 | 68 | # train_df = train_df.iloc[:, [1, 0]] 69 | # valid_df = valid_df.iloc[:, [1, 0]] 70 | # test_df = test_df.iloc[:, [1, 0]] 71 | 72 | return train_df, valid_df, test_df 73 | # --------------------------- 74 | 75 | 76 | # --------------------------- 77 | def merge_dfs(network='detector'): 78 | df_names = [ 79 | f'{network}_CognitiveError.csv', 80 | f'{network}_HomonymError.csv', 81 | f'{network}_Run-onError.csv', 82 | f'{network}_Split-wordErrorLeft.csv', 83 | 
f'{network}_Split-wordErrorRandom.csv', 84 | f'{network}_Split-wordErrorRight.csv', 85 | f'{network}_Split-wordErrorboth.csv', 86 | f'{network}_TypoAvroSubstituition.csv', 87 | f'{network}_TypoBijoySubstituition.csv', 88 | f'{network}_TypoDeletion.csv', 89 | f'{network}_TypoInsertion.csv', 90 | f'{network}_TypoTransposition.csv', 91 | f'{network}_VisualError.csv', 92 | f'{network}_VisualErrorCombinedCharacter.csv' 93 | ] 94 | 95 | df = pd.DataFrame() 96 | 97 | for df_name in df_names: 98 | df_path = os.path.join('./Dataframes', df_name) 99 | temp_df = pd.read_csv(df_path) 100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1] 101 | for _ in range(len(temp_df))] 102 | df = pd.concat([df, temp_df]) 103 | 104 | df = df.iloc[:, :] 105 | 106 | if network=='detector': 107 | df.rename( 108 | columns = { 109 | 'Predicton':'ErrorBlanksPredD1', 110 | 'Target':'ErrorBlanksActual', 111 | 'Correction':'EBP_Flag_D1', 112 | }, 113 | inplace = True 114 | ) 115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']] 116 | 117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector) 118 | # (purificator) 119 | # --------------------------- 120 | 121 | 122 | # --------------------------- 123 | def error_df(df, error='Cognitive Error'): 124 | df = df.loc[df['ErrorType'] == error] 125 | df['Word'] = df['Word'].apply(word2char) 126 | df['Error'] = df['Error'].apply(word2char) 127 | df = df.sample(frac=1).reset_index(drop=True) 128 | idx = int(len(df)/1) 129 | df = df.iloc[:idx, [1, 0]] 130 | df.to_csv('./Dataset/error.csv', index=False) 131 | # --------------------------- 132 | 133 | 134 | # --------------------------- 135 | def error_df_2(df, error='Cognitive Error'): 136 | df = df.loc[df['ErrorType'] == error] 137 | # df['Word'] = df['Word'].apply(word2char) 138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char) 139 | df = df.sample(frac=1).reset_index(drop=True) 140 | idx = int(len(df)/1) 141 | df = df.iloc[:idx, [1, 0]] 142 | # 143 | # if(len(df) >= 10000): 144 | # df = df.iloc[:10000, :] 145 | # 146 | df.to_csv('./Dataset/error.csv', index=False) 147 | # --------------------------- 148 | 149 | 150 | # --------------------------- 151 | def error_df_3(df, error='Cognitive Error'): 152 | df = df.loc[df['ErrorType'] == error] 153 | # df['Word'] = df['Word'].apply(word2char) 154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char) 155 | df = df.sample(frac=1).reset_index(drop=True) 156 | # idx = int(len(df)/1) 157 | # df = df.iloc[:idx, [1, 0]] 158 | # 159 | # if(len(df) >= 10000): 160 | # df = df.iloc[:10000, :] 161 | # 162 | df.to_csv('./Dataset/error.csv', index=False) 163 | # --------------------------- 164 | 165 | 166 | # --------------------------- 167 | def word2char(word): 168 | w2c = [char for char in word] 169 | return ' '.join(w2c) 170 | # --------------------------- 171 | 172 | 173 | # --------------------------- 174 | def find_len(seq): 175 | return len(seq.split(' ')) 176 | # --------------------------- 177 | 178 | 179 | # --------------------------- 180 | def mask2str(mask): 181 | x = '' 182 | for item in mask: 183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]": 184 | x += str(item) 185 | return x 186 | # --------------------------- 187 | 188 | 189 | # --------------------------- 190 | def error_blank(error, mask): 191 | error_list = np.array(error.split()) 192 | mask_list = np.array(mask.split()) 193 | idx = 
np.where(mask_list=='1')[0] 194 | error_list[idx] = ' ' 195 | error = ' '.join(error_list) 196 | return error 197 | # --------------------------- 198 | 199 | 200 | # --------------------------- 201 | def basic_tokenizer(text): 202 | return text.split() 203 | # --------------------------- 204 | 205 | 206 | # --------------------------- 207 | def count_parameters(model): 208 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 209 | # --------------------------- 210 | 211 | 212 | # --------------------------- 213 | def initialize_weights(m): 214 | if hasattr(m, 'weight') and m.weight.dim() > 1: 215 | nn.init.xavier_uniform_(m.weight.data) 216 | # --------------------------- 217 | 218 | 219 | # --------------------------- 220 | def save_model(model, train_loss, epoch, PATH): 221 | torch.save({ 222 | 'epoch': epoch, 223 | 'model_state_dict': model.state_dict(), 224 | # 'optimizer_state_dict': optimizer.state_dict(), 225 | 'loss': train_loss 226 | }, PATH) 227 | print(f"---------\nModel Saved at {PATH}\n---------\n") 228 | # --------------------------- 229 | 230 | 231 | # --------------------------- 232 | def load_model(model, PATH): 233 | checkpoint = torch.load(PATH) 234 | model.load_state_dict(checkpoint['model_state_dict']) 235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 236 | epoch = checkpoint['epoch'] 237 | train_loss = checkpoint['loss'] 238 | return checkpoint, epoch, train_loss 239 | # --------------------------- 240 | 241 | 242 | if __name__ == '__main__': 243 | pass 244 | -------------------------------------------------------------------------------- /Baselines/DTransformer/dtransformer.py: -------------------------------------------------------------------------------- 1 | from utils import ( 2 | word2char, basic_tokenizer, count_parameters, initialize_weights, 3 | save_model, load_model, error_df, train_valid_test_df, mask2str, 4 | error_df_2, error_df_3, merge_dfs 5 | ) 6 | from transformer import ( 7 | Encoder, EncoderLayer, MultiHeadAttentionLayer, 8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer, 9 | Seq2Seq 10 | ) 11 | from pipeline import train, evaluate 12 | from metrics import evaluation_report, evaluation_report2 13 | 14 | import pandas as pd 15 | from sklearn.model_selection import train_test_split 16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator 17 | import torch 18 | import torch.nn as nn 19 | import os 20 | import gc 21 | import sys 22 | import argparse 23 | 24 | import warnings as wrn 25 | wrn.filterwarnings('ignore') 26 | 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus.csv", 31 | choices=[ 32 | "./Dataset/corpus.csv", # Bangla SEC parallel corpus 33 | "./Dataset/corpus2.csv", # Bangla SEC parallel corpus for running test 34 | "./Dataset/Hindi/corpus_hindi.csv", 35 | "./Dataset/Telugu/corpus_telugu.csv", 36 | "./Dataset/Hindi/corpus_hindi_enhanced.csv", 37 | "./Dataset/Telugu/corpus_telugu_enhanced.csv" 38 | ] 39 | ) 40 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256]) 41 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7]) 42 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7]) 43 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8]) 44 | 
parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8]) 45 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256]) 46 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256]) 47 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 48 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 49 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10]) 50 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100) 51 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005]) 52 | args = parser.parse_args() 53 | 54 | SEED = 1234 55 | torch.manual_seed(SEED) 56 | torch.cuda.manual_seed(SEED) 57 | 58 | df = pd.read_csv(args.CORPUS) 59 | df['Word'] = df['Word'].apply(word2char) 60 | df['Error'] = df['Error'].apply(word2char) 61 | df = df.sample(frac=1).reset_index(drop=True) 62 | 63 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05) 64 | 65 | train_df.to_csv('./Dataset/train.csv', index=False) 66 | valid_df.to_csv('./Dataset/valid.csv', index=False) 67 | test_df.to_csv('./Dataset/test.csv', index=False) 68 | 69 | SRC = Field( 70 | tokenize=basic_tokenizer, lower=False, 71 | init_token='', eos_token='', batch_first=True 72 | ) 73 | TRG = Field( 74 | tokenize=basic_tokenizer, lower=False, 75 | init_token='', eos_token='', batch_first=True 76 | ) 77 | fields = { 78 | 'Error': ('src', SRC), 79 | 'Word': ('trg', TRG) 80 | } 81 | 82 | train_data, valid_data, test_data = TabularDataset.splits( 83 | path='./Dataset', 84 | train='train.csv', 85 | validation='valid.csv', 86 | test='test.csv', 87 | format='csv', 88 | fields=fields 89 | ) 90 | 91 | SRC.build_vocab(train_data, min_freq=100) 92 | TRG.build_vocab(train_data, min_freq=50) 93 | 94 | # ------------------------------ 95 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 96 | BATCH_SIZE = 512 97 | # ------------------------------ 98 | INPUT_DIM = len(SRC.vocab) 99 | OUTPUT_DIM = len(TRG.vocab) 100 | # ------------------------------ 101 | HID_DIM = int(args.HID_DIM) 102 | ENC_LAYERS = int(args.ENC_LAYERS) 103 | DEC_LAYERS = int(args.DEC_LAYERS) 104 | ENC_HEADS = int(args.ENC_HEADS) 105 | DEC_HEADS = int(args.DEC_HEADS) 106 | ENC_PF_DIM = int(args.ENC_PF_DIM) 107 | DEC_PF_DIM = int(args.DEC_PF_DIM) 108 | ENC_DROPOUT = float(args.ENC_DROPOUT) 109 | DEC_DROPOUT = float(args.DEC_DROPOUT) 110 | CLIP = float(args.CLIP) 111 | N_EPOCHS = int(args.N_EPOCHS) 112 | LEARNING_RATE = float(args.LEARNING_RATE) 113 | # ------------------------------ 114 | PATH = './Checkpoints/dtransformer.pth' 115 | # ------------------------------ 116 | gc.collect() 117 | torch.cuda.empty_cache() 118 | # ----------------------------- 119 | 120 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits( 121 | (train_data, valid_data, test_data), 122 | batch_size=BATCH_SIZE, 123 | sort_within_batch=True, 124 | sort_key=lambda x: len(x.src), 125 | device=DEVICE 126 | ) 127 | 128 | enc = Encoder( 129 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, 130 | ENC_DROPOUT, DEVICE 131 | ) 132 | dec = Decoder( 133 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, 134 | 
DEC_DROPOUT, DEVICE 135 | ) 136 | 137 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token] 138 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token] 139 | 140 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE) 141 | model.apply(initialize_weights) 142 | # print(f'The model has {count_parameters(model):,} trainable parameters') 143 | 144 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) 145 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX) 146 | 147 | epoch = 1 148 | best_loss = 1e10 149 | if os.path.exists(PATH): 150 | checkpoint, epoch, train_loss = load_model(model, PATH) 151 | best_loss = train_loss 152 | 153 | for epoch in range(epoch, N_EPOCHS): 154 | print(f"Epoch: {epoch} / {N_EPOCHS}") 155 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP) 156 | print(f"Train Loss: {train_loss:.4f}") 157 | if train_loss < best_loss: 158 | best_loss = train_loss 159 | save_model(model, train_loss, epoch, PATH) 160 | 161 | # --------------------- 162 | # eval_df = evaluation_report(test_data, SRC, TRG, model, DEVICE) 163 | # --------------------- 164 | # error_types = [ 165 | # 'Homonym Error', # 123 166 | # 'Typo Deletion', # 115767 167 | # 'Typo (Avro) Substituition', # 119573 168 | # 'Typo (Bijoy) Substituition', # 119864 169 | # 'Cognitive Error', # 108227 170 | # 'Run-on Error', # 124895 171 | # 'Split-word Error (Left)', # 62890 172 | # 'Split-word Error (Random)', # 124895 173 | # 'Split-word Error (Right)', # 13985 174 | # 'Split-word Error (both)', # 12800 175 | # 'Typo Insertion', # 124807 176 | # 'Typo Transposition', # 123245 177 | # 'Visual Error', # 117391 178 | # 'Visual Error (Combined Character)' # 17617 179 | # ] 180 | # --------------------- 181 | valid_df = pd.read_csv('./Dataset/valid.csv') 182 | error_types = list(sorted(list(set(df['ErrorType'].values)))) 183 | # --------------------- 184 | for error_name in error_types: 185 | print(f'------\nError Type: {error_name}\n------') 186 | error_df_2(df, error_name) 187 | 188 | error_data, _ = TabularDataset.splits( 189 | path='./Dataset', 190 | train='error.csv', 191 | test='error.csv', 192 | format='csv', 193 | fields=fields 194 | ) 195 | 196 | eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE) 197 | 198 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '') 199 | eval_df.to_csv(f'./Dataframes/dtransformer_{error_name}.csv', index=False) 200 | print('\n\n') 201 | # --------------------- 202 | 203 | 204 | if __name__ == '__main__': 205 | main() 206 | -------------------------------------------------------------------------------- /Requirements/requirements_u.yml: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | @EXPLICIT 5 | https://conda.anaconda.org/pytorch/noarch/pytorch-mutex-1.0-cuda.tar.bz2 6 | https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda 7 | https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.conda 8 | https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2022.4.26-h06a4308_0.conda 9 | https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2021.4.0-h06a4308_3561.conda 10 | https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.38-h1181459_1.conda 11 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran4-7.5.0-ha8ba4b0_17.conda 12 | https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-11.2.0-h1234567_1.conda 13 | 
https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.5.0-ha8ba4b0_17.conda 14 | https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda 15 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-2021.4.0-h06a4308_640.conda 16 | https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda 17 | https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-11.2.0-h1234567_1.conda 18 | https://repo.anaconda.com/pkgs/main/linux-64/brotli-1.0.9-he6710b0_2.conda 19 | https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.conda 20 | https://repo.anaconda.com/pkgs/main/linux-64/cudatoolkit-10.2.89-hfd86e86_1.conda 21 | https://repo.anaconda.com/pkgs/main/linux-64/expat-2.4.4-h295c915_0.conda 22 | https://repo.anaconda.com/pkgs/main/linux-64/giflib-5.2.1-h7b6447c_0.conda 23 | https://repo.anaconda.com/pkgs/main/linux-64/gmp-6.2.1-h295c915_3.conda 24 | https://repo.anaconda.com/pkgs/main/linux-64/icu-58.2-he6710b0_3.conda 25 | https://repo.anaconda.com/pkgs/main/linux-64/jpeg-9e-h7f8727e_0.conda 26 | https://repo.anaconda.com/pkgs/main/linux-64/lame-3.100-h7b6447c_0.conda 27 | https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.conda 28 | https://repo.anaconda.com/pkgs/main/linux-64/libiconv-1.16-h7f8727e_2.conda 29 | https://repo.anaconda.com/pkgs/main/linux-64/libtasn1-4.16.0-h27cfd23_0.conda 30 | https://repo.anaconda.com/pkgs/main/linux-64/libunistring-0.9.10-h27cfd23_0.conda 31 | https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.0.3-h7f8727e_2.conda 32 | https://repo.anaconda.com/pkgs/main/linux-64/libuv-1.40.0-h7b6447c_0.conda 33 | https://repo.anaconda.com/pkgs/main/linux-64/libwebp-base-1.2.2-h7f8727e_0.conda 34 | https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.15-h7f8727e_0.conda 35 | https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.9.3-h295c915_1.conda 36 | https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.3-h7f8727e_2.conda 37 | https://repo.anaconda.com/pkgs/main/linux-64/ninja-base-1.10.2-hd09550d_5.conda 38 | https://repo.anaconda.com/pkgs/main/linux-64/openh264-2.1.1-h4ff587b_0.conda 39 | https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1o-h7f8727e_0.conda 40 | https://repo.anaconda.com/pkgs/main/linux-64/pcre-8.45-h295c915_0.conda 41 | https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7f8727e_1.conda 42 | https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.12-h7f8727e_2.conda 43 | https://repo.anaconda.com/pkgs/main/linux-64/glib-2.69.1-h4ff587b_1.conda 44 | https://repo.anaconda.com/pkgs/main/linux-64/libidn2-2.3.2-h7f8727e_0.conda 45 | https://repo.anaconda.com/pkgs/main/linux-64/libpng-1.6.37-hbc83047_0.conda 46 | https://repo.anaconda.com/pkgs/main/linux-64/libxml2-2.9.14-h74e7548_0.conda 47 | https://repo.anaconda.com/pkgs/main/linux-64/nettle-3.7.3-hbbd107a_1.conda 48 | https://repo.anaconda.com/pkgs/main/linux-64/readline-8.1.2-h7f8727e_1.conda 49 | https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.12-h1ccaba5_0.conda 50 | https://repo.anaconda.com/pkgs/main/linux-64/zstd-1.5.2-ha4553b6_0.conda 51 | https://repo.anaconda.com/pkgs/main/linux-64/dbus-1.13.18-hb2f20db_0.conda 52 | https://repo.anaconda.com/pkgs/main/linux-64/freetype-2.11.0-h70c0345_0.conda 53 | https://repo.anaconda.com/pkgs/main/linux-64/gnutls-3.6.15-he1e5248_0.conda 54 | https://repo.anaconda.com/pkgs/main/linux-64/gstreamer-1.14.0-h28cd5cc_2.conda 55 | https://repo.anaconda.com/pkgs/main/linux-64/libtiff-4.2.0-h2818925_1.conda 56 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.38.3-hc218d9a_0.conda 57 | 
https://conda.anaconda.org/pytorch/linux-64/ffmpeg-4.3-hf484d3e_0.tar.bz2 58 | https://repo.anaconda.com/pkgs/main/linux-64/fontconfig-2.13.1-h6c09931_0.conda 59 | https://repo.anaconda.com/pkgs/main/linux-64/gst-plugins-base-1.14.0-h8213a91_2.conda 60 | https://repo.anaconda.com/pkgs/main/linux-64/lcms2-2.12-h3be6417_0.conda 61 | https://repo.anaconda.com/pkgs/main/linux-64/libwebp-1.2.2-h55f646e_0.conda 62 | https://repo.anaconda.com/pkgs/main/linux-64/python-3.8.13-h12debd9_0.conda 63 | https://repo.anaconda.com/pkgs/main/linux-64/certifi-2022.5.18.1-py38h06a4308_0.conda 64 | https://repo.anaconda.com/pkgs/main/noarch/charset-normalizer-2.0.4-pyhd3eb1b0_0.conda 65 | https://repo.anaconda.com/pkgs/main/noarch/cycler-0.11.0-pyhd3eb1b0_0.conda 66 | https://repo.anaconda.com/pkgs/main/noarch/idna-3.3-pyhd3eb1b0_0.conda 67 | https://repo.anaconda.com/pkgs/main/noarch/joblib-1.1.0-pyhd3eb1b0_0.conda 68 | https://repo.anaconda.com/pkgs/main/linux-64/kiwisolver-1.4.2-py38h295c915_0.conda 69 | https://repo.anaconda.com/pkgs/main/noarch/munkres-1.1.4-py_0.conda 70 | https://repo.anaconda.com/pkgs/main/linux-64/ninja-1.10.2-h06a4308_5.conda 71 | https://repo.anaconda.com/pkgs/main/linux-64/pillow-9.0.1-py38h22f2fdc_0.conda 72 | https://repo.anaconda.com/pkgs/main/noarch/pycparser-2.21-pyhd3eb1b0_0.conda 73 | https://repo.anaconda.com/pkgs/main/noarch/pyparsing-3.0.4-pyhd3eb1b0_0.conda 74 | https://repo.anaconda.com/pkgs/main/linux-64/pysocks-1.7.1-py38h06a4308_0.conda 75 | https://repo.anaconda.com/pkgs/main/linux-64/pytz-2022.1-py38h06a4308_0.conda 76 | https://repo.anaconda.com/pkgs/main/linux-64/qt-5.9.7-h5867ecd_1.conda 77 | https://repo.anaconda.com/pkgs/main/linux-64/sip-4.19.13-py38h295c915_0.conda 78 | https://repo.anaconda.com/pkgs/main/noarch/six-1.16.0-pyhd3eb1b0_1.conda 79 | https://repo.anaconda.com/pkgs/main/noarch/threadpoolctl-2.2.0-pyh0d69192_0.conda 80 | https://repo.anaconda.com/pkgs/main/linux-64/tornado-6.1-py38h27cfd23_0.conda 81 | https://repo.anaconda.com/pkgs/main/linux-64/tqdm-4.64.0-py38h06a4308_0.conda 82 | https://repo.anaconda.com/pkgs/main/noarch/typing_extensions-4.1.1-pyh06a4308_0.conda 83 | https://repo.anaconda.com/pkgs/main/noarch/wheel-0.37.1-pyhd3eb1b0_0.conda 84 | https://repo.anaconda.com/pkgs/main/linux-64/cffi-1.15.0-py38hd667e15_1.conda 85 | https://repo.anaconda.com/pkgs/main/noarch/fonttools-4.25.0-pyhd3eb1b0_0.conda 86 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.4.0-py38h7f8727e_0.conda 87 | https://repo.anaconda.com/pkgs/main/noarch/packaging-21.3-pyhd3eb1b0_0.conda 88 | https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.9.2-py38h05f1152_4.conda 89 | https://repo.anaconda.com/pkgs/main/noarch/python-dateutil-2.8.2-pyhd3eb1b0_0.conda 90 | https://conda.anaconda.org/pytorch/linux-64/pytorch-1.9.0-py3.8_cuda10.2_cudnn7.6.5_0.tar.bz2 91 | https://repo.anaconda.com/pkgs/main/linux-64/setuptools-61.2.0-py38h06a4308_0.conda 92 | https://repo.anaconda.com/pkgs/main/linux-64/brotlipy-0.7.0-py38h27cfd23_1003.conda 93 | https://repo.anaconda.com/pkgs/main/linux-64/cryptography-37.0.1-py38h9ce1e76_0.conda 94 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.22.3-py38hf524024_0.conda 95 | https://repo.anaconda.com/pkgs/main/linux-64/pip-21.2.4-py38h06a4308_0.conda 96 | https://repo.anaconda.com/pkgs/main/noarch/pyopenssl-22.0.0-pyhd3eb1b0_0.conda 97 | https://repo.anaconda.com/pkgs/main/linux-64/urllib3-1.26.9-py38h06a4308_0.conda 98 | https://repo.anaconda.com/pkgs/main/noarch/requests-2.27.1-pyhd3eb1b0_0.conda 99 | 
https://conda.anaconda.org/pytorch/linux-64/torchtext-0.10.0-py38.tar.bz2 100 | https://repo.anaconda.com/pkgs/main/linux-64/bottleneck-1.3.4-py38hce1f21e_0.conda 101 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.5.1-py38h06a4308_1.conda 102 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-base-3.5.1-py38ha18d171_1.conda 103 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.3.1-py38hd3c417c_0.conda 104 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.2.2-py38h51133e4_0.conda 105 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.22.3-py38he7a7128_0.conda 106 | https://repo.anaconda.com/pkgs/main/linux-64/numexpr-2.8.1-py38h6abb31d_0.conda 107 | https://repo.anaconda.com/pkgs/main/linux-64/scipy-1.7.3-py38hc147768_0.conda 108 | https://conda.anaconda.org/pytorch/linux-64/torchaudio-0.9.0-py38.tar.bz2 109 | https://conda.anaconda.org/pytorch/noarch/torchvision-0.2.2-py_3.tar.bz2 110 | https://repo.anaconda.com/pkgs/main/linux-64/pandas-1.4.2-py38h295c915_0.conda 111 | https://repo.anaconda.com/pkgs/main/linux-64/scikit-learn-1.0.2-py38h51133e4_1.conda 112 | -------------------------------------------------------------------------------- /metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | from pipeline import translate_sentence 4 | import numpy as np 5 | from sklearn import metrics 6 | import torch 7 | import gc 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | 12 | def evaluation_report(test_data, SRC, TRG, model, DEVICE): 13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 14 | 15 | modified_flags = [] 16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 17 | all_words = sorted(all_words.iloc[:, 0].values) 18 | 19 | for idx, data in enumerate(tqdm(test_data)): 20 | # ------------------------------ 21 | if idx % 5000 == 0: 22 | gc.collect() 23 | torch.cuda.empty_cache() 24 | # ------------------------------ 25 | 26 | src = data.src 27 | trg = data.trg 28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 29 | 30 | src = ''.join(src) 31 | trg = ''.join(trg) 32 | pred = ''.join(translation) 33 | 34 | erroneous_words.append(src) 35 | correct_words.append(trg) 36 | predicted_words.append(pred) 37 | 38 | if trg == pred: 39 | flags.append(1) 40 | else: 41 | flags.append(0) 42 | 43 | if pred in all_words: 44 | modified_flags.append(1) 45 | else: 46 | modified_flags.append(0) 47 | 48 | evaluation_df = pd.DataFrame({ 49 | 'Error': erroneous_words, 50 | 'Predicton': predicted_words, 51 | 'Target': correct_words, 52 | 'Correction': flags 53 | }) 54 | 55 | corrected_instances = evaluation_df['Correction'].values.sum() 56 | total_instances = len(evaluation_df) 57 | accuracy = corrected_instances / total_instances 58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 59 | 60 | y_true = np.array(correct_words) 61 | y_pred = np.array(predicted_words) 62 | 63 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 64 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 67 | ACC = metrics.accuracy_score(y_true, y_pred) 68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 69 | 70 | print(f''' 71 | Top-1 (Greedy Decoding) 72 | Precision: {PR:.4f} 73 | Recall: {RE:.4f} 74 | F1 Score: {F1:.4f} 75 | F0.5 
Score: {F05:.4f} 76 | Accuracy: {RE * 100:.2f}% 77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 78 | ''') 79 | 80 | return evaluation_df 81 | 82 | 83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE): 84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 85 | words = [] 86 | 87 | modified_flags = [] 88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 89 | all_words = sorted(all_words.iloc[:, 0].values) 90 | 91 | for idx, data in enumerate(tqdm(test_data)): 92 | # ------------------------------ 93 | if idx % 5000 == 0: 94 | gc.collect() 95 | torch.cuda.empty_cache() 96 | # ------------------------------ 97 | 98 | src = data.src 99 | trg = data.trg 100 | word = data.word 101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 102 | 103 | src = ''.join(src) 104 | trg = ''.join(trg) 105 | pred = ''.join(translation) 106 | word = ''.join(word) 107 | 108 | erroneous_words.append(src) 109 | correct_words.append(trg) 110 | predicted_words.append(pred) 111 | words.append(word) 112 | 113 | if trg == pred: 114 | flags.append(1) 115 | else: 116 | flags.append(0) 117 | 118 | if pred in all_words: 119 | modified_flags.append(1) 120 | else: 121 | modified_flags.append(0) 122 | 123 | evaluation_df = pd.DataFrame({ 124 | 'Error': erroneous_words, # Error 125 | 'Predicton': predicted_words, # ErrorBlanksPredD1 126 | 'Target': correct_words, # ErrorBlanksActual 127 | 'Word': words, # Word 128 | 'Correction': flags # EBP_Flag_D1 129 | }) 130 | 131 | corrected_instances = evaluation_df['Correction'].values.sum() 132 | total_instances = len(evaluation_df) 133 | accuracy = corrected_instances / total_instances 134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 135 | 136 | y_true = np.array(correct_words) 137 | y_pred = np.array(predicted_words) 138 | 139 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 140 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 143 | ACC = metrics.accuracy_score(y_true, y_pred) 144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 145 | 146 | print(f''' 147 | Top-1 (Greedy Decoding) 148 | Precision: {PR:.4f} 149 | Recall: {RE:.4f} 150 | F1 Score: {F1:.4f} 151 | F0.5 Score: {F05:.4f} 152 | Accuracy: {RE * 100:.2f}% 153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 154 | ''') 155 | 156 | return evaluation_df 157 | 158 | 159 | 160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE): 161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 162 | errors = [] 163 | words = [] 164 | ebpd1s = [] 165 | ebpfd1s = [] 166 | 167 | modified_flags = [] 168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 169 | all_words = sorted(all_words.iloc[:, 0].values) 170 | 171 | for idx, data in enumerate(tqdm(test_data)): 172 | # ------------------------------ 173 | if idx % 5000 == 0: 174 | gc.collect() 175 | torch.cuda.empty_cache() 176 | # ------------------------------ 177 | 178 | src = data.src 179 | trg = data.trg 180 | error = data.error 181 | word = data.word 182 | ebpd1 = data.ebpd1 183 | ebpfd1 = data.ebpfd1 184 | 185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 186 | 187 | src = ''.join(src) 188 | trg = ''.join(trg) 189 | pred = ''.join(translation) 190 | error = ''.join(error) 191 | word = ''.join(word) 192 | ebpd1 
= ''.join(ebpd1) 193 | ebpfd1 = ''.join(ebpfd1) 194 | 195 | erroneous_words.append(src) 196 | correct_words.append(trg) 197 | predicted_words.append(pred) 198 | errors.append(error) 199 | words.append(word) 200 | ebpd1s.append(ebpd1) 201 | ebpfd1s.append(ebpfd1) 202 | 203 | if trg == pred: 204 | flags.append(1) 205 | else: 206 | flags.append(0) 207 | 208 | if pred in all_words: 209 | modified_flags.append(1) 210 | else: 211 | modified_flags.append(0) 212 | 213 | # evaluation_df = pd.DataFrame({ 214 | # 'Error': erroneous_words, 215 | # 'Predicton': predicted_words, 216 | # 'Target': correct_words, 217 | # 'Word': words, 218 | # 'Correction': flags 219 | # }) 220 | 221 | evaluation_df = pd.DataFrame({ 222 | 'Error': errors, 223 | 'Word': words, 224 | 'ErrorBlanksActual': correct_words, 225 | 'MaskErrorBlank': erroneous_words, 226 | 'ErrorBlanksPredD1': ebpd1s, 227 | 'EBP_Flag_D1': ebpfd1s, 228 | 'ErrorBlanksPredD2': predicted_words, 229 | 'EBP_Flag_D2': flags 230 | }) 231 | 232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum() 233 | total_instances = len(evaluation_df) 234 | accuracy = corrected_instances / total_instances 235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 236 | 237 | y_true = np.array(correct_words) 238 | y_pred = np.array(predicted_words) 239 | 240 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 241 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 243 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 244 | ACC = metrics.accuracy_score(y_true, y_pred) 245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 246 | 247 | print(f''' 248 | Top-1 (Greedy Decoding) 249 | Precision: {PR:.4f} 250 | Recall: {RE:.4f} 251 | F1 Score: {F1:.4f} 252 | F0.5 Score: {F05:.4f} 253 | Accuracy: {RE * 100:.2f}% 254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 255 | ''') 256 | 257 | return evaluation_df 258 | 259 | 260 | 261 | 262 | if __name__ == '__main__': 263 | pass 264 | 265 | 266 | -------------------------------------------------------------------------------- /Baselines/DCSpell/metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | from pipeline import translate_sentence 4 | import numpy as np 5 | from sklearn import metrics 6 | import torch 7 | import gc 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | 12 | def evaluation_report(test_data, SRC, TRG, model, DEVICE): 13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 14 | 15 | modified_flags = [] 16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 17 | all_words = sorted(all_words.iloc[:, 0].values) 18 | 19 | for idx, data in enumerate(tqdm(test_data)): 20 | # ------------------------------ 21 | if idx % 5000 == 0: 22 | gc.collect() 23 | torch.cuda.empty_cache() 24 | # ------------------------------ 25 | 26 | src = data.src 27 | trg = data.trg 28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 29 | 30 | src = ''.join(src) 31 | trg = ''.join(trg) 32 | pred = ''.join(translation) 33 | 34 | erroneous_words.append(src) 35 | correct_words.append(trg) 36 | predicted_words.append(pred) 37 | 38 | if trg == pred: 39 | flags.append(1) 40 | else: 41 | flags.append(0) 42 | 43 | if pred in all_words: 44 | modified_flags.append(1) 45 | else: 46 | modified_flags.append(0) 47 | 48 | evaluation_df = 
pd.DataFrame({ 49 | 'Error': erroneous_words, 50 | 'Predicton': predicted_words, 51 | 'Target': correct_words, 52 | 'Correction': flags 53 | }) 54 | 55 | corrected_instances = evaluation_df['Correction'].values.sum() 56 | total_instances = len(evaluation_df) 57 | accuracy = corrected_instances / total_instances 58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 59 | 60 | y_true = np.array(correct_words) 61 | y_pred = np.array(predicted_words) 62 | 63 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 64 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 67 | ACC = metrics.accuracy_score(y_true, y_pred) 68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 69 | 70 | print(f''' 71 | Top-1 (Greedy Decoding) 72 | Precision: {PR:.4f} 73 | Recall: {RE:.4f} 74 | F1 Score: {F1:.4f} 75 | F0.5 Score: {F05:.4f} 76 | Accuracy: {RE * 100:.2f}% 77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 78 | ''') 79 | 80 | return evaluation_df 81 | 82 | 83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE): 84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 85 | words = [] 86 | 87 | modified_flags = [] 88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 89 | all_words = sorted(all_words.iloc[:, 0].values) 90 | 91 | for idx, data in enumerate(tqdm(test_data)): 92 | # ------------------------------ 93 | if idx % 5000 == 0: 94 | gc.collect() 95 | torch.cuda.empty_cache() 96 | # ------------------------------ 97 | 98 | src = data.src 99 | trg = data.trg 100 | word = data.word 101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 102 | 103 | src = ''.join(src) 104 | trg = ''.join(trg) 105 | pred = ''.join(translation) 106 | word = ''.join(word) 107 | 108 | erroneous_words.append(src) 109 | correct_words.append(trg) 110 | predicted_words.append(pred) 111 | words.append(word) 112 | 113 | if trg == pred: 114 | flags.append(1) 115 | else: 116 | flags.append(0) 117 | 118 | if pred in all_words: 119 | modified_flags.append(1) 120 | else: 121 | modified_flags.append(0) 122 | 123 | evaluation_df = pd.DataFrame({ 124 | 'Error': erroneous_words, # Error 125 | 'Predicton': predicted_words, # ErrorBlanksPredD1 126 | 'Target': correct_words, # ErrorBlanksActual 127 | 'Word': words, # Word 128 | 'Correction': flags # EBP_Flag_D1 129 | }) 130 | 131 | corrected_instances = evaluation_df['Correction'].values.sum() 132 | total_instances = len(evaluation_df) 133 | accuracy = corrected_instances / total_instances 134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 135 | 136 | y_true = np.array(correct_words) 137 | y_pred = np.array(predicted_words) 138 | 139 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 140 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 143 | ACC = metrics.accuracy_score(y_true, y_pred) 144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 145 | 146 | print(f''' 147 | Top-1 (Greedy Decoding) 148 | Precision: {PR:.4f} 149 | Recall: {RE:.4f} 150 | F1 Score: {F1:.4f} 151 | F0.5 Score: {F05:.4f} 152 | Accuracy: {RE * 100:.2f}% 153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 154 | ''') 155 | 156 | return evaluation_df 157 | 158 | 
159 | 160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE): 161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 162 | errors = [] 163 | words = [] 164 | ebpd1s = [] 165 | ebpfd1s = [] 166 | 167 | modified_flags = [] 168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 169 | all_words = sorted(all_words.iloc[:, 0].values) 170 | 171 | for idx, data in enumerate(tqdm(test_data)): 172 | # ------------------------------ 173 | if idx % 5000 == 0: 174 | gc.collect() 175 | torch.cuda.empty_cache() 176 | # ------------------------------ 177 | 178 | src = data.src 179 | trg = data.trg 180 | error = data.error 181 | word = data.word 182 | ebpd1 = data.ebpd1 183 | ebpfd1 = data.ebpfd1 184 | 185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 186 | 187 | src = ''.join(src) 188 | trg = ''.join(trg) 189 | pred = ''.join(translation) 190 | error = ''.join(error) 191 | word = ''.join(word) 192 | ebpd1 = ''.join(ebpd1) 193 | ebpfd1 = ''.join(ebpfd1) 194 | 195 | erroneous_words.append(src) 196 | correct_words.append(trg) 197 | predicted_words.append(pred) 198 | errors.append(error) 199 | words.append(word) 200 | ebpd1s.append(ebpd1) 201 | ebpfd1s.append(ebpfd1) 202 | 203 | if trg == pred: 204 | flags.append(1) 205 | else: 206 | flags.append(0) 207 | 208 | if pred in all_words: 209 | modified_flags.append(1) 210 | else: 211 | modified_flags.append(0) 212 | 213 | # evaluation_df = pd.DataFrame({ 214 | # 'Error': erroneous_words, 215 | # 'Predicton': predicted_words, 216 | # 'Target': correct_words, 217 | # 'Word': words, 218 | # 'Correction': flags 219 | # }) 220 | 221 | evaluation_df = pd.DataFrame({ 222 | 'Error': errors, 223 | 'Word': words, 224 | 'ErrorBlanksActual': correct_words, 225 | 'MaskErrorBlank': erroneous_words, 226 | 'ErrorBlanksPredD1': ebpd1s, 227 | 'EBP_Flag_D1': ebpfd1s, 228 | 'ErrorBlanksPredD2': predicted_words, 229 | 'EBP_Flag_D2': flags 230 | }) 231 | 232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum() 233 | total_instances = len(evaluation_df) 234 | accuracy = corrected_instances / total_instances 235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 236 | 237 | y_true = np.array(correct_words) 238 | y_pred = np.array(predicted_words) 239 | 240 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 241 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 243 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 244 | ACC = metrics.accuracy_score(y_true, y_pred) 245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 246 | 247 | print(f''' 248 | Top-1 (Greedy Decoding) 249 | Precision: {PR:.4f} 250 | Recall: {RE:.4f} 251 | F1 Score: {F1:.4f} 252 | F0.5 Score: {F05:.4f} 253 | Accuracy: {RE * 100:.2f}% 254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 255 | ''') 256 | 257 | return evaluation_df 258 | 259 | 260 | 261 | 262 | if __name__ == '__main__': 263 | pass 264 | 265 | 266 | -------------------------------------------------------------------------------- /Baselines/DTransformer/metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from tqdm import tqdm 3 | from pipeline import translate_sentence 4 | import numpy as np 5 | from sklearn import metrics 6 | import torch 7 | import gc 8 | import warnings as wrn 9 | wrn.filterwarnings('ignore') 10 | 11 | 12 | def 
evaluation_report(test_data, SRC, TRG, model, DEVICE): 13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 14 | 15 | modified_flags = [] 16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 17 | all_words = sorted(all_words.iloc[:, 0].values) 18 | 19 | for idx, data in enumerate(tqdm(test_data)): 20 | # ------------------------------ 21 | if idx % 5000 == 0: 22 | gc.collect() 23 | torch.cuda.empty_cache() 24 | # ------------------------------ 25 | 26 | src = data.src 27 | trg = data.trg 28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 29 | 30 | src = ''.join(src) 31 | trg = ''.join(trg) 32 | pred = ''.join(translation) 33 | 34 | erroneous_words.append(src) 35 | correct_words.append(trg) 36 | predicted_words.append(pred) 37 | 38 | if trg == pred: 39 | flags.append(1) 40 | else: 41 | flags.append(0) 42 | 43 | if pred in all_words: 44 | modified_flags.append(1) 45 | else: 46 | modified_flags.append(0) 47 | 48 | evaluation_df = pd.DataFrame({ 49 | 'Error': erroneous_words, 50 | 'Predicton': predicted_words, 51 | 'Target': correct_words, 52 | 'Correction': flags 53 | }) 54 | 55 | corrected_instances = evaluation_df['Correction'].values.sum() 56 | total_instances = len(evaluation_df) 57 | accuracy = corrected_instances / total_instances 58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 59 | 60 | y_true = np.array(correct_words) 61 | y_pred = np.array(predicted_words) 62 | 63 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 64 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 67 | ACC = metrics.accuracy_score(y_true, y_pred) 68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 69 | 70 | print(f''' 71 | Top-1 (Greedy Decoding) 72 | Precision: {PR:.4f} 73 | Recall: {RE:.4f} 74 | F1 Score: {F1:.4f} 75 | F0.5 Score: {F05:.4f} 76 | Accuracy: {RE * 100:.2f}% 77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 78 | ''') 79 | 80 | return evaluation_df 81 | 82 | 83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE): 84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 85 | words = [] 86 | 87 | modified_flags = [] 88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 89 | all_words = sorted(all_words.iloc[:, 0].values) 90 | 91 | for idx, data in enumerate(tqdm(test_data)): 92 | # ------------------------------ 93 | if idx % 5000 == 0: 94 | gc.collect() 95 | torch.cuda.empty_cache() 96 | # ------------------------------ 97 | 98 | src = data.src 99 | trg = data.trg 100 | word = data.word 101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 102 | 103 | src = ''.join(src) 104 | trg = ''.join(trg) 105 | pred = ''.join(translation) 106 | word = ''.join(word) 107 | 108 | erroneous_words.append(src) 109 | correct_words.append(trg) 110 | predicted_words.append(pred) 111 | words.append(word) 112 | 113 | if trg == pred: 114 | flags.append(1) 115 | else: 116 | flags.append(0) 117 | 118 | if pred in all_words: 119 | modified_flags.append(1) 120 | else: 121 | modified_flags.append(0) 122 | 123 | evaluation_df = pd.DataFrame({ 124 | 'Error': erroneous_words, # Error 125 | 'Predicton': predicted_words, # ErrorBlanksPredD1 126 | 'Target': correct_words, # ErrorBlanksActual 127 | 'Word': words, # Word 128 | 'Correction': flags # EBP_Flag_D1 129 | }) 130 | 131 | corrected_instances = 
evaluation_df['Correction'].values.sum() 132 | total_instances = len(evaluation_df) 133 | accuracy = corrected_instances / total_instances 134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 135 | 136 | y_true = np.array(correct_words) 137 | y_pred = np.array(predicted_words) 138 | 139 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 140 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 143 | ACC = metrics.accuracy_score(y_true, y_pred) 144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 145 | 146 | print(f''' 147 | Top-1 (Greedy Decoding) 148 | Precision: {PR:.4f} 149 | Recall: {RE:.4f} 150 | F1 Score: {F1:.4f} 151 | F0.5 Score: {F05:.4f} 152 | Accuracy: {RE * 100:.2f}% 153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 154 | ''') 155 | 156 | return evaluation_df 157 | 158 | 159 | 160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE): 161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], [] 162 | errors = [] 163 | words = [] 164 | ebpd1s = [] 165 | ebpfd1s = [] 166 | 167 | modified_flags = [] 168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv') 169 | all_words = sorted(all_words.iloc[:, 0].values) 170 | 171 | for idx, data in enumerate(tqdm(test_data)): 172 | # ------------------------------ 173 | if idx % 5000 == 0: 174 | gc.collect() 175 | torch.cuda.empty_cache() 176 | # ------------------------------ 177 | 178 | src = data.src 179 | trg = data.trg 180 | error = data.error 181 | word = data.word 182 | ebpd1 = data.ebpd1 183 | ebpfd1 = data.ebpfd1 184 | 185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE) 186 | 187 | src = ''.join(src) 188 | trg = ''.join(trg) 189 | pred = ''.join(translation) 190 | error = ''.join(error) 191 | word = ''.join(word) 192 | ebpd1 = ''.join(ebpd1) 193 | ebpfd1 = ''.join(ebpfd1) 194 | 195 | erroneous_words.append(src) 196 | correct_words.append(trg) 197 | predicted_words.append(pred) 198 | errors.append(error) 199 | words.append(word) 200 | ebpd1s.append(ebpd1) 201 | ebpfd1s.append(ebpfd1) 202 | 203 | if trg == pred: 204 | flags.append(1) 205 | else: 206 | flags.append(0) 207 | 208 | if pred in all_words: 209 | modified_flags.append(1) 210 | else: 211 | modified_flags.append(0) 212 | 213 | # evaluation_df = pd.DataFrame({ 214 | # 'Error': erroneous_words, 215 | # 'Predicton': predicted_words, 216 | # 'Target': correct_words, 217 | # 'Word': words, 218 | # 'Correction': flags 219 | # }) 220 | 221 | evaluation_df = pd.DataFrame({ 222 | 'Error': errors, 223 | 'Word': words, 224 | 'ErrorBlanksActual': correct_words, 225 | 'MaskErrorBlank': erroneous_words, 226 | 'ErrorBlanksPredD1': ebpd1s, 227 | 'EBP_Flag_D1': ebpfd1s, 228 | 'ErrorBlanksPredD2': predicted_words, 229 | 'EBP_Flag_D2': flags 230 | }) 231 | 232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum() 233 | total_instances = len(evaluation_df) 234 | accuracy = corrected_instances / total_instances 235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}") 236 | 237 | y_true = np.array(correct_words) 238 | y_pred = np.array(predicted_words) 239 | 240 | PR = metrics.precision_score(y_true, y_pred, average='weighted') 241 | RE = metrics.recall_score(y_true, y_pred, average='weighted') 242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted') 243 | F05 = 
metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5) 244 | ACC = metrics.accuracy_score(y_true, y_pred) 245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags) 246 | 247 | print(f''' 248 | Top-1 (Greedy Decoding) 249 | Precision: {PR:.4f} 250 | Recall: {RE:.4f} 251 | F1 Score: {F1:.4f} 252 | F0.5 Score: {F05:.4f} 253 | Accuracy: {RE * 100:.2f}% 254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}% 255 | ''') 256 | 257 | return evaluation_df 258 | 259 | 260 | 261 | 262 | if __name__ == '__main__': 263 | pass 264 | 265 | 266 | -------------------------------------------------------------------------------- /detector.py: -------------------------------------------------------------------------------- 1 | from utils import ( 2 | word2char, basic_tokenizer, count_parameters, initialize_weights, 3 | save_model, load_model, error_df, train_valid_test_df, mask2str, 4 | error_df_2, error_df_3, merge_dfs 5 | ) 6 | from transformer import ( 7 | Encoder, EncoderLayer, MultiHeadAttentionLayer, 8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer, 9 | Seq2Seq 10 | ) 11 | from pipeline import train, evaluate 12 | from metrics import evaluation_report, evaluation_report2 13 | 14 | import pandas as pd 15 | from sklearn.model_selection import train_test_split 16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator 17 | import torch 18 | import torch.nn as nn 19 | import os 20 | import gc 21 | import sys 22 | import argparse 23 | 24 | import warnings as wrn 25 | wrn.filterwarnings('ignore') 26 | 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus.csv", 31 | choices=[ 32 | "./Dataset/corpus.csv", # Bangla SEC parallel corpus 33 | "./Dataset/corpus2.csv", # Bangla SEC parallel corpus for running test 34 | "./Dataset/Hindi/corpus_hindi.csv", 35 | "./Dataset/Telugu/corpus_telugu.csv", 36 | "./Dataset/Hindi/corpus_hindi_enhanced.csv", 37 | "./Dataset/Telugu/corpus_telugu_enhanced.csv" 38 | ] 39 | ) 40 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256]) 41 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7]) 42 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int, default=3, choices=[3, 5, 7]) 43 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heads", type=int, default=8, choices=[4, 6, 8]) 44 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heads", type=int, default=8, choices=[4, 6, 8]) 45 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256]) 46 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimension", type=int, default=256, choices=[64, 128, 256]) 47 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 48 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5]) 49 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10]) 50 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100) 51 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005]) 52 | args = parser.parse_args() 53 | 54 | SEED = 1234 55 | torch.manual_seed(SEED) 56 |
torch.cuda.manual_seed(SEED) 57 | 58 | # df = pd.read_csv('./Dataset/sec_dataset_III_v3_new_masked_b.csv') 59 | # df = pd.read_csv('./Dataset/corpus.csv') 60 | df = pd.read_csv(args.CORPUS) 61 | df['Word'] = df['Word'].apply(word2char) 62 | df['Error'] = df['Error'].apply(word2char) 63 | df['Mask'] = df['Mask'].apply(mask2str) 64 | df['Mask'] = df['Mask'].apply(word2char) 65 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(mask2str) 66 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(word2char) 67 | df = df.sample(frac=1).reset_index(drop=True) 68 | # df = df.iloc[:, [4, 1, 2]] 69 | df = df[['ErrorBlanks', 'Error', 'ErrorType']] 70 | 71 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05) 72 | 73 | train_df.to_csv('./Dataset/train.csv', index=False) 74 | valid_df.to_csv('./Dataset/valid.csv', index=False) 75 | test_df.to_csv('./Dataset/test.csv', index=False) 76 | 77 | SRC = Field( 78 | tokenize=basic_tokenizer, lower=False, 79 | init_token='<sos>', eos_token='<eos>', batch_first=True 80 | ) 81 | TRG = Field( 82 | tokenize=basic_tokenizer, lower=False, 83 | init_token='<sos>', eos_token='<eos>', batch_first=True 84 | ) 85 | WORD = Field( 86 | tokenize=basic_tokenizer, lower=False, 87 | init_token='<sos>', eos_token='<eos>', batch_first=True 88 | ) 89 | fields = { 90 | 'Error': ('src', SRC), 91 | 'ErrorBlanks': ('trg', TRG) 92 | } 93 | 94 | train_data, valid_data, test_data = TabularDataset.splits( 95 | path='./Dataset', 96 | train='train.csv', 97 | validation='valid.csv', 98 | test='test.csv', 99 | format='csv', 100 | fields=fields 101 | ) 102 | 103 | SRC.build_vocab(train_data, min_freq=100) 104 | TRG.build_vocab(train_data, min_freq=50) 105 | WORD.build_vocab(train_data, min_freq=100) 106 | 107 | # ------------------------------ 108 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 109 | BATCH_SIZE = 512 110 | # ------------------------------ 111 | INPUT_DIM = len(SRC.vocab) 112 | OUTPUT_DIM = len(TRG.vocab) 113 | # ------------------------------ 114 | HID_DIM = int(args.HID_DIM) 115 | ENC_LAYERS = int(args.ENC_LAYERS) 116 | DEC_LAYERS = int(args.DEC_LAYERS) 117 | ENC_HEADS = int(args.ENC_HEADS) 118 | DEC_HEADS = int(args.DEC_HEADS) 119 | ENC_PF_DIM = int(args.ENC_PF_DIM) 120 | DEC_PF_DIM = int(args.DEC_PF_DIM) 121 | ENC_DROPOUT = float(args.ENC_DROPOUT) 122 | DEC_DROPOUT = float(args.DEC_DROPOUT) 123 | CLIP = float(args.CLIP) 124 | N_EPOCHS = int(args.N_EPOCHS) 125 | LEARNING_RATE = float(args.LEARNING_RATE) 126 | # ------------------------------ 127 | PATH = './Checkpoints/detector.pth' 128 | # ------------------------------ 129 | gc.collect() 130 | torch.cuda.empty_cache() 131 | # ----------------------------- 132 | 133 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits( 134 | (train_data, valid_data, test_data), 135 | batch_size=BATCH_SIZE, 136 | sort_within_batch=True, 137 | sort_key=lambda x: len(x.src), 138 | device=DEVICE 139 | ) 140 | 141 | enc = Encoder( 142 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, 143 | ENC_DROPOUT, DEVICE 144 | ) 145 | dec = Decoder( 146 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, 147 | DEC_DROPOUT, DEVICE 148 | ) 149 | 150 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token] 151 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token] 152 | 153 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE) 154 | model.apply(initialize_weights) 155 | # print(f'The model has {count_parameters(model):,} trainable parameters') 156 | 157 | optimizer = torch.optim.Adam(model.parameters(),
lr=LEARNING_RATE) 158 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX) 159 | 160 | epoch = 1 161 | best_loss = 1e10 162 | if os.path.exists(PATH): 163 | checkpoint, epoch, train_loss = load_model(model, PATH) 164 | best_loss = train_loss 165 | 166 | for epoch in range(epoch, N_EPOCHS): 167 | print(f"Epoch: {epoch} / {N_EPOCHS}") 168 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP) 169 | print(f"Train Loss: {train_loss:.4f}") 170 | if train_loss < best_loss: 171 | best_loss = train_loss 172 | save_model(model, train_loss, epoch, PATH) 173 | 174 | # --------------------- 175 | # eval_df = evaluation_report(test_data, SRC, TRG, model, DEVICE) 176 | # --------------------- 177 | error_types = [ 178 | 'Homonym Error', # 123 179 | 'Typo Deletion', # 115767 180 | 'Typo (Avro) Substituition', # 119573 181 | 'Typo (Bijoy) Substituition', # 119864 182 | 'Cognitive Error', # 108227 183 | 'Run-on Error', # 124895 184 | 'Split-word Error (Left)', # 62890 185 | 'Split-word Error (Random)', # 124895 186 | 'Split-word Error (Right)', # 13985 187 | 'Split-word Error (both)', # 12800 188 | 'Typo Insertion', # 124807 189 | 'Typo Transposition', # 123245 190 | 'Visual Error', # 117391 191 | 'Visual Error (Combined Character)' # 17617 192 | ] 193 | # --------------------- 194 | # df = pd.read_csv('./Dataset/sec_dataset_III_v3_new_masked_b.csv') 195 | df = pd.read_csv('./Dataset/corpus.csv') 196 | df['Word'] = df['Word'].apply(word2char) 197 | df['Error'] = df['Error'].apply(word2char) 198 | df['Mask'] = df['Mask'].apply(mask2str) 199 | df['Mask'] = df['Mask'].apply(word2char) 200 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(mask2str) 201 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(word2char) 202 | df = df.sample(frac=1).reset_index(drop=True) 203 | # df = df.iloc[:, [0, 1, -2, 2]] 204 | df = df[['Word', 'Error', 'ErrorBlanks', 'ErrorType']] 205 | 206 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=1./1e10, valid_size=1./1e10) 207 | 208 | train_df.to_csv('./Dataset/train.csv', index=False) 209 | valid_df.to_csv('./Dataset/valid.csv', index=False) 210 | test_df.to_csv('./Dataset/test.csv', index=False) 211 | # --------------------- 212 | for error_name in error_types: 213 | print(f'------\nError Type: {error_name}\n------') 214 | error_df_3(df, error_name) 215 | 216 | fields = { 217 | 'Error': ('src', SRC), 218 | 'ErrorBlanks': ('trg', TRG), 219 | 'Word': ('word', WORD) 220 | } 221 | 222 | error_data, _ = TabularDataset.splits( 223 | path='./Dataset', 224 | train='error.csv', 225 | test='error.csv', 226 | format='csv', 227 | fields=fields 228 | ) 229 | 230 | eval_df = evaluation_report2(error_data, SRC, TRG, WORD, model, DEVICE) 231 | eval_df['ErrorType'] = [error_name for _ in range(len(eval_df))] 232 | 233 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '') 234 | eval_df.to_csv(f'./Dataframes/detector_{error_name}.csv', index=False) 235 | print('\n\n') 236 | # --------------------- 237 | merge_dfs(network='detector') 238 | # --------------------- 239 | 240 | 241 | if __name__ == '__main__': 242 | main() 243 | --------------------------------------------------------------------------------
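A note on the evaluation logic shared by the metrics.py variants above: every report reduces spelling correction to exact word-level string matching and then scores the predicted words against the targets with scikit-learn's weighted precision, recall, F1, F0.5, and accuracy. The minimal, self-contained sketch below mirrors those exact calls so the reported figures can be sanity-checked without loading a corpus, model, or checkpoint; the toy word lists are invented placeholders and are not part of the repository.

import warnings
import numpy as np
from sklearn import metrics

warnings.filterwarnings('ignore')  # the repository's metrics modules silence sklearn warnings the same way

# Toy stand-ins for the Target and Predicton columns of evaluation_df (illustrative only).
y_true = np.array(['correct', 'word', 'spelling', 'error'])
y_pred = np.array(['correct', 'word', 'speling', 'error'])

PR = metrics.precision_score(y_true, y_pred, average='weighted')
RE = metrics.recall_score(y_true, y_pred, average='weighted')
F1 = metrics.f1_score(y_true, y_pred, average='weighted')
F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
ACC = metrics.accuracy_score(y_true, y_pred)

print(f'Precision: {PR:.4f}  Recall: {RE:.4f}  F1: {F1:.4f}  F0.5: {F05:.4f}  Accuracy: {ACC:.4f}')

In this single-label, one-word-per-row setting, weighted recall is mathematically equal to plain accuracy, so the Accuracy line that the reports print from RE coincides with accuracy_score; the Modified Accuracy figure additionally checks whether each predicted word appears in the dictionary list loaded from allDictWords_df.csv.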