├── Baselines
├── ConvSeq2Seq
│ ├── Dataframes
│ │ └── dfs.txt
│ ├── Checkpoints
│ │ └── checkpoints.txt
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── errors.py
│ ├── README.md
│ ├── pipeline.py
│ ├── utils.py
│ ├── metrics.py
│ ├── models.py
│ └── main.py
├── DCSpell
│ ├── Dataframes
│ │ └── dfs.txt
│ ├── Checkpoints
│ │ └── checkpoints.txt
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── README.md
│ ├── process.py
│ ├── pipeline.py
│ ├── corrector.py
│ ├── utils.py
│ └── metrics.py
├── DTransformer
│ ├── Dataframes
│ │ └── dfs.txt
│ ├── Checkpoints
│ │ └── checkpoints.txt
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── README.md
│ ├── process.py
│ ├── focalLoss.py
│ ├── pipeline.py
│ ├── utils.py
│ ├── dtransformer.py
│ └── metrics.py
├── GRUSeq2Seq
│ ├── Corrections
│ │ └── corpora.txt
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── Checkpoints
│ │ └── temp.txt
│ ├── README.md
│ ├── metrics.py
│ ├── check.py
│ ├── decoding.py
│ ├── focalLoss.py
│ ├── inference.py
│ ├── pipeline.py
│ ├── models.py
│ ├── utils.py
│ ├── errors.py
│ └── main.py
├── RuleBased
│ ├── Dataset
│ │ └── kalpurush.ttf
│ ├── README.md
│ └── rule_based.py
│ └── README.md
├── Dataframes
│ └── dataframesGeneratedByModels.txt
├── Dataset
│ ├── Hindi
│ │ └── HindiCorpusFromAnotherPaper.txt
│ ├── Bangla
│ │ └── BanglaCorpusFromAnotherPaper.txt
│ └── Telugu
│   └── TeluguCorpusFromAnotherPaper.txt
├── figures
│ └── DPCSpell.jpg
├── Checkpoints
│ ├── checkpoints.txt
│ └── Checkpoints.md
├── CorpusCreation
│ ├── README.md
│ ├── scraper.py
│ └── corpus_stats_valid.py
├── LICENSE
├── process.py
├── focalLoss.py
├── README.md
├── pipeline.py
├── corrector.py
├── utils.py
├── Requirements
│ └── requirements_u.yml
├── metrics.py
└── detector.py
/Baselines/ConvSeq2Seq/Dataframes/dfs.txt:
--------------------------------------------------------------------------------
1 | dfs
--------------------------------------------------------------------------------
/Baselines/DCSpell/Dataframes/dfs.txt:
--------------------------------------------------------------------------------
1 | dfs
--------------------------------------------------------------------------------
/Baselines/DTransformer/Dataframes/dfs.txt:
--------------------------------------------------------------------------------
1 | dfs
--------------------------------------------------------------------------------
/Dataframes/dataframesGeneratedByModels.txt:
--------------------------------------------------------------------------------
1 | https://
--------------------------------------------------------------------------------
/Dataset/Hindi/HindiCorpusFromAnotherPaper.txt:
--------------------------------------------------------------------------------
1 | https://
--------------------------------------------------------------------------------
/Dataset/Bangla/BanglaCorpusFromAnotherPaper.txt:
--------------------------------------------------------------------------------
1 | https://
--------------------------------------------------------------------------------
/Dataset/Telugu/TeluguCorpusFromAnotherPaper.txt:
--------------------------------------------------------------------------------
1 | https://
--------------------------------------------------------------------------------
/Baselines/DCSpell/Checkpoints/checkpoints.txt:
--------------------------------------------------------------------------------
1 | checkpoints
2 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/Checkpoints/checkpoints.txt:
--------------------------------------------------------------------------------
1 | checkpoints
2 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/Checkpoints/checkpoints.txt:
--------------------------------------------------------------------------------
1 | checkpoints
2 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/Corrections/corpora.txt:
--------------------------------------------------------------------------------
1 | corpus.csv
2 | corpus2.csv
--------------------------------------------------------------------------------
/figures/DPCSpell.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/figures/DPCSpell.jpg
--------------------------------------------------------------------------------
/Baselines/DCSpell/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/DCSpell/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Baselines/RuleBased/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/RuleBased/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/ConvSeq2Seq/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/GRUSeq2Seq/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Baselines/DTransformer/Dataset/kalpurush.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mehedihasanbijoy/DPCSpell/HEAD/Baselines/DTransformer/Dataset/kalpurush.ttf
--------------------------------------------------------------------------------
/Checkpoints/checkpoints.txt:
--------------------------------------------------------------------------------
1 | Download checkpoints from the following link:
2 | https://drive.google.com/drive/folders/1prH28CiedKmhDmh3lOqquByQQTD8DN2d?usp=share_link
3 |
--------------------------------------------------------------------------------
/Checkpoints/Checkpoints.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/Checkpoints/temp.txt:
--------------------------------------------------------------------------------
1 | Top1 Acc: 0.5088253438742403
2 | Top2 Acc: 0.13197459214915688
3 | Top3 Acc: 0.10706370241740164
4 | Accuracy: 0.6364529543481241
5 | 100%|██████████| 175064/175064 [1:04:31<00:00, 45.21it/s]
6 | Modified Top1 Acc: 0.6128444454599461
7 |
8 | Process finished with exit code 0
9 |
--------------------------------------------------------------------------------
/CorpusCreation/README.md:
--------------------------------------------------------------------------------
1 | Corpus Creation
2 |
3 | ### Word Accumulation
4 | ```
5 | python scraper.py
6 | ```
7 |
8 | ### Error Annexation
9 | ```
10 | python errors.py
11 | ```
12 |
13 | ### Error Filtration
14 |
15 |
16 |
17 | ### Corpus Statistics and Error Percentage Validation
18 | ```
19 | python corpus_stats_valid.py --email "username@gmail.com" --password "facebook_password"
20 | ```
21 |
--------------------------------------------------------------------------------
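Note: the CorpusCreation folder in the dump above does not contain an error-annexation script, so the following is purely an illustrative sketch, assuming "Error Annexation" means injecting synthetic character-level edits into the scraped words; every name and output path below is hypothetical except ./dfs/df_all_words.csv, which scraper.py writes.

```
import random
import pandas as pd

# Hypothetical sketch of error annexation (NOT the repository's errors.py):
# derive an erroneous variant of each correct word with one random character edit.
def inject_error(word: str) -> str:
    if len(word) < 2:
        return word
    i = random.randrange(len(word))
    op = random.choice(['delete', 'swap', 'repeat'])
    if op == 'delete':                       # drop one character
        return word[:i] + word[i + 1:]
    if op == 'swap' and i < len(word) - 1:   # transpose two adjacent characters
        return word[:i] + word[i + 1] + word[i] + word[i + 2:]
    return word[:i] + word[i] + word[i:]     # duplicate one character

if __name__ == '__main__':
    df = pd.read_csv('./dfs/df_all_words.csv')        # word list produced by scraper.py
    df['Error'] = df['word'].apply(inject_error)
    df = df.rename(columns={'word': 'Word'})
    df[['Word', 'Error']].to_csv('./dfs/synthetic_errors.csv', index=False)
```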
/Baselines/ConvSeq2Seq/errors.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from utils import word2char
3 |
4 |
5 | def error_df(df, error='Cognitive Error'):
6 | df = df.loc[df['ErrorType'] == error].copy()  # copy the filtered slice to avoid SettingWithCopyWarning below
7 | df['Word'] = df['Word'].apply(word2char)
8 | df['Error'] = df['Error'].apply(word2char)
9 | df = df.sample(frac=1).reset_index(drop=True)
10 | df = df.iloc[:, [1, 0]]
11 | df.to_csv('./Dataset/error.csv', index=False)
12 |
13 |
--------------------------------------------------------------------------------
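A minimal usage sketch for the error_df helper above (not part of the repository): it assumes the corpus CSV has Word, Error, and ErrorType columns, as implied by the README commands, and error_df writes the filtered, character-tokenized pairs to ./Dataset/error.csv.

```
import pandas as pd
from errors import error_df

# Hedged example: filter one error type, split words into characters,
# shuffle, and save the result to ./Dataset/error.csv (all done inside error_df).
corpus = pd.read_csv('./Dataset/corpus.csv')   # path taken from the README; column layout assumed
error_df(corpus, error='Cognitive Error')
```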
/Baselines/RuleBased/README.md:
--------------------------------------------------------------------------------
1 | RuleBased
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 | ## Training and Evaluation of RuleBased
21 | ```
22 | python rule_based.py
23 | ```
24 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/README.md:
--------------------------------------------------------------------------------
1 | ConvSeq2Seq
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 | ## Training and Evaluation of ConvSeq2Seq
21 | ```
22 | python main.py --CORPUS "./Dataset/corpus.csv" --EMB_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_KERNEL_SIZE 3 --DEC_KERNEL_SIZE 3 --ENC_DROPOUT 0.2 --DEC_DROPOUT 0.2 --CLIP 0.1 --BATCH_SIZE 256 --LEARNING_RATE 0.0005 --N_EPOCHS 100
23 | ```
24 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/README.md:
--------------------------------------------------------------------------------
1 | DTransformer
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 | ## Training and Evaluation of DTransformer
21 | ```
22 | python dtransformer.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
23 | ```
24 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/README.md:
--------------------------------------------------------------------------------
1 | GRUSeq2Seq
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 | ## Training and Evaluation of GRUSeq2Seq
21 | ```
22 | python main.py --CORPUS "./Dataset/corpus.csv" --ENC_EMB_DIM 128 --DEC_EMB_DIM 128 --ENC_HIDDEN_DIM 256 --DEC_HIDDEN_DIM 512 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --MAX_LEN 48 --CLIP 1 --BATCH_SIZE 256 --LEARNING_RATE 0.0005 --N_EPOCHS 100
23 | ```
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Mehedi Hasan Bijoy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Baselines/DCSpell/README.md:
--------------------------------------------------------------------------------
1 | DCSpell
2 |
3 | ## Activate the Environment
4 | ```
5 | conda activate DPCSpell
6 | ```
7 |
8 |
9 |
10 | ## Prepare SEC Corpora
11 | ```
12 | gdown https://drive.google.com/drive/folders/1vfCAqqXy0ZTL8cPKR-K5q5coBnNv2Zxf?usp=sharing -O ./Dataset --folder
13 | ```
14 |
15 | or manually download the folder from the link above and place the extracted files in ./Dataset/
16 |
17 |
18 |
19 |
20 |
21 | ## Training and Evaluation of DPCSpell
22 |
23 | ### Detector Network
24 |
25 | ```
26 | python detector.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
27 | ```
28 |
29 | ### Corrector Network
30 |
31 | ```
32 | python corrector.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
33 | ```
34 |
--------------------------------------------------------------------------------
/Baselines/README.md:
--------------------------------------------------------------------------------
1 | Baselines
2 |
3 |
36 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tqdm import tqdm
3 |
4 |
5 | def train(model, iterator, optimizer, criterion, clip):
6 | model.train()
7 | epoch_loss = 0
8 | for idx, batch in enumerate(tqdm(iterator)):
9 | src = batch.src
10 | trg = batch.trg
11 | optimizer.zero_grad()
12 | output, _ = model(src, trg[:, :-1])
13 | output_dim = output.shape[-1]
14 | output = output.contiguous().view(-1, output_dim)
15 | trg = trg[:, 1:].contiguous().view(-1)
16 | loss = criterion(output, trg)
17 | loss.backward()
18 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
19 | optimizer.step()
20 | epoch_loss += loss.item()
21 | return epoch_loss / len(iterator)
22 |
23 |
24 | def evaluate(model, iterator, criterion):
25 | model.eval()
26 | epoch_loss = 0
27 | with torch.no_grad():
28 | for idx, batch in enumerate(tqdm(iterator)):
29 | src = batch.src
30 | trg = batch.trg
31 | output, _ = model(src, trg[:, :-1])
32 | output_dim = output.shape[-1]
33 | output = output.contiguous().view(-1, output_dim)
34 | trg = trg[:, 1:].contiguous().view(-1)
35 | loss = criterion(output, trg)
36 | epoch_loss += loss.item()
37 | return epoch_loss / len(iterator)
38 |
39 |
40 | if __name__ == '__main__':
41 | pass
42 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def basic_tokenizer(text):
5 | return text.split()
6 |
7 |
8 | def word2char(word):
9 | w2c = [char for char in word]
10 | return ' '.join(w2c)
11 |
12 |
13 | def count_parameters(model):
14 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
15 |
16 |
17 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=30):
18 | model.eval()
19 | tokens = [src_field.init_token] + sentence + [src_field.eos_token]
20 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
21 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
22 | with torch.no_grad():
23 | encoder_conved, encoder_combined = model.encoder(src_tensor)
24 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
25 | for i in range(max_len):
26 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
27 | with torch.no_grad():
28 | output, attention = model.decoder(trg_tensor, encoder_conved, encoder_combined)
29 | pred_token = output.argmax(2)[:, -1].item()
30 | trg_indexes.append(pred_token)
31 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
32 | break
33 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
34 | return trg_tokens[1:], attention
35 |
36 |
37 | def save_model(model, train_loss, epoch, PATH):
38 | torch.save({
39 | 'epoch': epoch,
40 | 'model_state_dict': model.state_dict(),
41 | # 'optimizer_state_dict': optimizer.state_dict(),
42 | 'loss': train_loss
43 | }, PATH)
44 | print(f"---------\nModel Saved at {PATH}\n---------\n")
45 |
46 |
47 | def load_model(model, PATH):
48 | checkpoint = torch.load(PATH)
49 | model.load_state_dict(checkpoint['model_state_dict'])
50 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
51 | epoch = checkpoint['epoch']
52 | train_loss = checkpoint['loss']
53 | return checkpoint, epoch, train_loss
54 |
55 |
56 | if __name__ == '__main__':
57 | pass
--------------------------------------------------------------------------------
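A short checkpoint round-trip sketch for the save_model / load_model helpers above; the checkpoint path is hypothetical, and `model`, `epoch`, and `train_loss` are assumed to exist in the surrounding training script.

```
from utils import save_model, load_model

# Illustrative only: persist the current epoch and restore it later.
ckpt_path = './Checkpoints/convseq2seq.pth'     # hypothetical path
save_model(model, train_loss, epoch, ckpt_path)

checkpoint, epoch, train_loss = load_model(model, ckpt_path)   # weights are restored in place
```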
/process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from utils import word2char
3 | from tqdm import tqdm
4 |
5 |
6 | def check_from_left(word, error):
7 | left = []
8 | for i in range(len(error)):
9 | if error[i] == word[i]:
10 | left.append(0)
11 | else:
12 | left.append(1)
13 | return left
14 |
15 |
16 | def check_from_right(word, error):
17 | word = word[::-1]  # use reversed copies so the caller's lists are not mutated
18 | error = error[::-1]
19 | right = []
20 | for i in range(len(error)):
21 | if error[i] == word[i]:
22 | right.append(0)
23 | else:
24 | right.append(1)
25 | right.reverse()
26 | return right
27 |
28 |
29 | def check_from_both(word, error):
30 | length = len(error)
31 | if length % 2 == 0:
32 | iterator = length // 2
33 | else:
34 | iterator = (length // 2) + 1
35 |
36 | x = -1
37 |
38 | left = []
39 | right = []
40 |
41 | for i in range(iterator):
42 | if error[i] == word[i]:
43 | left.append(0)
44 | else:
45 | left.append(1)
46 |
47 | if error[x] == word[x]:
48 | right.append(0)
49 | else:
50 | right.append(1)
51 | x -= 1
52 |
53 | right.reverse()
54 | both = [*left, *right]
55 | return both
56 |
57 |
58 | if __name__ == '__main__':
59 | path = './Dataset/sec_dataset_III_v3.csv'
60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv')
61 | df_copy = df.copy()
62 | df['Word'] = df['Word'].apply(word2char)
63 | df['Error'] = df['Error'].apply(word2char)
64 |
65 | for idx in tqdm(range(len(df))):
66 | word = df.iloc[idx, 0].split()
67 | error = df.iloc[idx, 1].split()
68 | word = ['ব', 'া', 'ং', 'ল', 'া']
69 | error = ['ব', 'ং', 'ল', 'া']
70 | print(len(word), len(error))
71 | print(f'{word}\n{error}')
72 | # checking from left
73 | left = check_from_left(word, error)
74 | print(left)
75 | right = check_from_right(word, error)
76 | print(right)
77 | both = check_from_both(word, error)
78 | print(both)
79 | break
80 |
--------------------------------------------------------------------------------
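For reference, assuming the non-mutating check_from_right above, the hard-coded example in __main__ (word = বাংলা, error = বংলা, both split into characters) yields the following mismatch masks; this is a hand-traced illustration, not captured program output.

```
# word  = ['ব', 'া', 'ং', 'ল', 'া']
# error = ['ব', 'ং', 'ল', 'া']          (the 'া' after 'ব' is missing)
# check_from_left(word, error)  -> [0, 1, 1, 1]   # everything after the deletion misaligns
# check_from_right(word, error) -> [1, 0, 0, 0]   # aligning from the right isolates the error
# check_from_both(word, error)  -> [0, 1, 0, 0]   # half scanned from each end
```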
/Baselines/DCSpell/process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from utils import word2char
3 | from tqdm import tqdm
4 |
5 |
6 | def check_from_left(word, error):
7 | left = []
8 | for i in range(len(error)):
9 | if error[i] == word[i]:
10 | left.append(0)
11 | else:
12 | left.append(1)
13 | return left
14 |
15 |
16 | def check_from_right(word, error):
17 | word = word[::-1]  # use reversed copies so the caller's lists are not mutated
18 | error = error[::-1]
19 | right = []
20 | for i in range(len(error)):
21 | if error[i] == word[i]:
22 | right.append(0)
23 | else:
24 | right.append(1)
25 | right.reverse()
26 | return right
27 |
28 |
29 | def check_from_both(word, error):
30 | length = len(error)
31 | if length % 2 == 0:
32 | iterator = length // 2
33 | else:
34 | iterator = (length // 2) + 1
35 |
36 | x = -1
37 |
38 | left = []
39 | right = []
40 |
41 | for i in range(iterator):
42 | if error[i] == word[i]:
43 | left.append(0)
44 | else:
45 | left.append(1)
46 |
47 | if error[x] == word[x]:
48 | right.append(0)
49 | else:
50 | right.append(1)
51 | x -= 1
52 |
53 | right.reverse()
54 | both = [*left, *right]
55 | return both
56 |
57 |
58 | if __name__ == '__main__':
59 | path = './Dataset/sec_dataset_III_v3.csv'
60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv')
61 | df_copy = df.copy()
62 | df['Word'] = df['Word'].apply(word2char)
63 | df['Error'] = df['Error'].apply(word2char)
64 |
65 | for idx in tqdm(range(len(df))):
66 | word = df.iloc[idx, 0].split()
67 | error = df.iloc[idx, 1].split()
68 | word = ['ব', 'া', 'ং', 'ল', 'া']
69 | error = ['ব', 'ং', 'ল', 'া']
70 | print(len(word), len(error))
71 | print(f'{word}\n{error}')
72 | # checking from left
73 | left = check_from_left(word, error)
74 | print(left)
75 | right = check_from_right(word, error)
76 | print(right)
77 | both = check_from_both(word, error)
78 | print(both)
79 | break
80 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/process.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from utils import word2char
3 | from tqdm import tqdm
4 |
5 |
6 | def check_from_left(word, error):
7 | left = []
8 | for i in range(len(error)):
9 | if error[i] == word[i]:
10 | left.append(0)
11 | else:
12 | left.append(1)
13 | return left
14 |
15 |
16 | def check_from_right(word, error):
17 | word = word[::-1]  # use reversed copies so the caller's lists are not mutated
18 | error = error[::-1]
19 | right = []
20 | for i in range(len(error)):
21 | if error[i] == word[i]:
22 | right.append(0)
23 | else:
24 | right.append(1)
25 | right.reverse()
26 | return right
27 |
28 |
29 | def check_from_both(word, error):
30 | length = len(error)
31 | if length % 2 == 0:
32 | iterator = length // 2
33 | else:
34 | iterator = (length // 2) + 1
35 |
36 | x = -1
37 |
38 | left = []
39 | right = []
40 |
41 | for i in range(iterator):
42 | if error[i] == word[i]:
43 | left.append(0)
44 | else:
45 | left.append(1)
46 |
47 | if error[x] == word[x]:
48 | right.append(0)
49 | else:
50 | right.append(1)
51 | x -= 1
52 |
53 | right.reverse()
54 | both = [*left, *right]
55 | return both
56 |
57 |
58 | if __name__ == '__main__':
59 | path = './Dataset/sec_dataset_III_v3.csv'
60 | df = pd.read_csv('./Dataset/sec_dataset_III_v3.csv')
61 | df_copy = df.copy()
62 | df['Word'] = df['Word'].apply(word2char)
63 | df['Error'] = df['Error'].apply(word2char)
64 |
65 | for idx in tqdm(range(len(df))):
66 | word = df.iloc[idx, 0].split()
67 | error = df.iloc[idx, 1].split()
68 | word = ['ব', 'া', 'ং', 'ল', 'া']
69 | error = ['ব', 'ং', 'ল', 'া']
70 | print(len(word), len(error))
71 | print(f'{word}\n{error}')
72 | # checking from left
73 | left = check_from_left(word, error)
74 | print(left)
75 | right = check_from_right(word, error)
76 | print(right)
77 | both = check_from_both(word, error)
78 | print(both)
79 | break
80 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/metrics.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.ticker as ticker
14 | import matplotlib.font_manager as fm
15 |
16 | import numpy as np
17 | import math
18 | import time
19 | from sklearn import metrics
20 |
21 | import warnings as wrn
22 | wrn.filterwarnings('ignore')
23 |
24 |
25 | def beam_eval_report(trg_words, topk_prediction_list):
26 | y_true = np.array(trg_words)
27 | y_pred = np.array(topk_prediction_list)[:, 0]
28 |
29 | LABELS = np.array(sorted(set(y_true) | set(y_pred)))  # union of true and predicted labels (not used below)
30 |
31 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
32 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
33 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
34 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
35 | ACC = metrics.accuracy_score(y_true, y_pred)
36 |
37 | print("Evaluation report of beam decoding")
38 | print(f'''
39 | Top-1 (Beam Decoding)
40 | Precision: {PR:.4f}
41 | Recall: {RE:.4f}
42 | F1 Score: {F1:.4f}
43 | F0.5 Score: {F05:.4f}
44 | Accuracy: {ACC * 100:.2f}%
45 | ''')
46 |
47 |
48 | def greedy_eval_report(correct_words, predicted_words):
49 | y_true = np.array(correct_words)
50 | y_pred = np.array(predicted_words)
51 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
52 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
53 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
54 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
55 | ACC = metrics.accuracy_score(y_true, y_pred)
56 | print("Evaluation report of greedy decoding")
57 | print(f'''
58 | Top-1 (Greedy Decoding)
59 | Precision: {PR:.4f}
60 | Recall: {RE:.4f}
61 | F1 Score: {F1:.4f}
62 | F0.5 Score: {F05:.4f}
63 | Accuracy: {ACC * 100:.2f}%
64 | ''')
65 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/metrics.py:
--------------------------------------------------------------------------------
1 | from utils import translate_sentence
2 |
3 | from sklearn import metrics
4 | from tqdm import tqdm
5 | import pandas as pd
6 | import numpy as np
7 |
8 |
9 | def evaluation_report(test_data, SRC, TRG, model, DEVICE):
10 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
11 |
12 | modified_flags = []
13 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
14 | all_words = sorted(all_words.iloc[:, 0].values)
15 |
16 | for data in tqdm(test_data):
17 | src = data.src
18 | trg = data.trg
19 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
20 |
21 | src = ''.join(src)
22 | trg = ''.join(trg)
23 | pred = ''.join(translation[:-1])
24 |
25 | erroneous_words.append(src)
26 | predicted_words.append(pred)
27 | correct_words.append(trg)
28 |
29 | if trg == pred:
30 | flags.append(1)
31 | else:
32 | flags.append(0)
33 |
34 | if pred in all_words:
35 | modified_flags.append(1)
36 | else:
37 | modified_flags.append(0)
38 |
39 | evaluation_df = pd.DataFrame({
40 | 'Error': erroneous_words,
41 | 'Predicton': predicted_words,
42 | 'Target': correct_words,
43 | 'Correction': flags
44 | })
45 |
46 | corrected_instances = evaluation_df['Correction'].values.sum()
47 | total_instances = len(evaluation_df)
48 | accuracy = corrected_instances / total_instances
49 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
50 | # print(f"Accuracy: {accuracy * 100:.2f}%")
51 |
52 | y_true = np.array(correct_words)
53 | y_pred = np.array(predicted_words)
54 |
55 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
56 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
57 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
58 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
59 | ACC = metrics.accuracy_score(y_true, y_pred)
60 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
61 |
62 | print(f'''
63 | Top-1 (Greedy Decoding)
64 | Precision: {PR:.4f}
65 | Recall: {RE:.4f}
66 | F1 Score: {F1:.4f}
67 | F0.5 Score: {F05:.4f}
68 | Accuracy: {ACC * 100:.2f}%
69 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
70 | ''')
71 |
72 | # evaluation_df.to_csv('./Dataset/preds_convs2s.csv', index=False)
73 | return evaluation_df
74 |
75 |
76 | if __name__ == '__main__':
77 | pass
--------------------------------------------------------------------------------
/CorpusCreation/scraper.py:
--------------------------------------------------------------------------------
1 | import requests, bs4
2 | import pandas as pd
3 | from tqdm import tqdm
4 |
5 |
6 | def word_accumulation():
7 | char_pages = {
8 | 'অ': 71, 'আ': 50, 'ই': 10, 'ঈ': 1, 'উ': 25, 'ঊ': 2, 'ঋ': 1, 'এ': 13, 'ঐ': 2, 'ও': 7, 'ঔ': 3,
9 | 'ক': 82, 'খ': 29, 'গ': 35, 'ঘ': 7, 'ঙ': 1, 'চ': 32, 'ছ': 12, 'জ': 28, 'ঝ': 8, 'ঞ': 1,
10 | 'ট': 16, 'ঠ': 4, 'ড': 12, 'ঢ': 6, 'ণ': 1, 'ত': 44, 'থ': 6, 'দ': 44, 'ধ': 13, 'ন': 52,
11 | 'প': 77, 'ফ': 16, 'ব': 90, 'ভ': 24, 'ম': 58, 'য': 11, 'র': 30, 'ল': 18, 'শ': 25, 'ষ': 3, 'স': 86, 'হ': 27
12 | }
13 |
14 | all_urls = {}
15 |
16 | url = 'https://accessibledictionary.gov.bd/bengali-to-bengali/'
17 |
18 | html_codes = requests.get(url).text
19 | document = bs4.BeautifulSoup(html_codes, 'lxml')
20 | alphabet_links = document.find('ul', class_='alphabet')
21 | items = alphabet_links.find_all('li')
22 |
23 | for item in items:
24 | url = str(item).split('"')[1]
25 | all_urls[url[-1:]] = url
26 |
27 | df_dict = {}
28 |
29 | for url in all_urls.values():
30 | no_of_pages = char_pages[url[-1:]]
31 | for idx in tqdm(range(1, no_of_pages + 1)):
32 | desired_url = url + '&page=' + str(idx)
33 | html_codes = requests.get(desired_url).text
34 | document = bs4.BeautifulSoup(html_codes, 'lxml')
35 | article = document.find('article', class_='dicDisplay')
36 | items = article.find_all('li')
37 |
38 | for item in items:
39 | text = item.get_text()
40 | text = text.split('Bengali Word')[1]
41 | text = text.split('Bengali definition')
42 | ben_word = text[0]
43 | ben_def = text[1]
44 | df_dict[ben_word] = ben_def
45 | # break
46 |
47 | df = pd.DataFrame(
48 | {
49 | 'Word': df_dict.keys(),
50 | 'Defination': df_dict.values()
51 | }
52 | )
53 | return df
54 |
55 |
56 | def get_len(word):
57 | return len(word)
58 |
59 |
60 | def text_preprocessing(df):
61 | all_chars = ['ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ',
62 | 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ',
63 | 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ',
64 | 'ষ', 'স', 'হ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ',
65 | 'ৗ', 'ড়', 'ঢ়', 'য়', ' ']
66 |
67 | words = ''
68 |
69 | df_words = ' '.join(df['Word'].values)
70 | for char in df_words:
71 | if char in all_chars:
72 | words += char
73 |
74 | words += ' '
75 |
76 | df_definations = ' '.join(df['Defination'].values)
77 | for char in df_definations:
78 | if char in all_chars:
79 | words += char
80 |
81 | words = sorted(list(set(words.split(' '))))
82 | df_all_words = pd.DataFrame({'word': words})
83 | df_all_words['len'] = df_all_words['word'].apply(get_len)
84 | df_all_words = df_all_words.loc[df_all_words['len'] > 2]
85 | return df_all_words
86 |
87 |
88 | if __name__ == '__main__':
89 | df = word_accumulation()
90 | df_all_words = text_preprocessing(df)
91 | df_all_words.to_csv('./dfs/df_all_words.csv', index=False)
92 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/check.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from tqdm import tqdm
4 |
5 |
6 | def within_topk(df, k):
7 | correct = df['Correct']
8 | topk = df.iloc[:, 1:k+1].values
9 | preds = 0
10 | # for idx in tqdm(range(len(df))):
11 | for idx in range(len(df)):
12 | if correct[idx] in topk[idx]:
13 | preds += 1
14 | acc_within_topk = preds / len(df)
15 | print(f"Within Top-{k} Acc: {acc_within_topk}")
16 |
17 |
18 | def modified_acc(df_allWords, df, k):
19 | df_allWords = sorted(df_allWords.iloc[:, 0].values)
20 | correct = df['Correct']
21 | topk = df.iloc[:, 1:k + 1].values
22 | preds = 0
23 | for words in tqdm(topk):
24 | for word in words:
25 | if word in df_allWords:
26 | preds += 1
27 | break
28 | modified_acc_within_topk = preds / len(df)
29 | print(f"Within Top-{k} Modified Acc: {modified_acc_within_topk}")
30 |
31 |
32 | def beam_report():
33 | print("""
34 | --------------------------------
35 | Beam Decoding Evaluation Report
36 | --------------------------------
37 | """)
38 | df_allWords = pd.read_csv('./Dataset/allDictWords_df.csv')
39 | df_beam = pd.read_csv('./Corrections/preds_beam_colab.csv')
40 | top1_acc = np.sum(df_beam['Pred-1'] == df_beam['Correct']) / len(df_beam)
41 | top2_acc = np.sum(df_beam['Pred-2'] == df_beam['Correct']) / len(df_beam)
42 | top3_acc = np.sum(df_beam['Pred-3'] == df_beam['Correct']) / len(df_beam)
43 | print(f"Top1 Acc: {top1_acc}")
44 | print(f"Top2 Acc: {top2_acc}")
45 | print(f"Top3 Acc: {top3_acc}\n")
46 | within_topk(df_beam, 1)
47 | within_topk(df_beam, 2)
48 | within_topk(df_beam, 3)
49 | modified_acc(df_allWords, df_beam, 1)
50 | modified_acc(df_allWords, df_beam, 2)
51 | modified_acc(df_allWords, df_beam, 3)
52 |
53 | def test():
54 | df = pd.read_csv('./Dataset/allDictWords_df.csv')
55 | words = sorted(df.iloc[:, 0].values)
56 | print(words)
57 | #
58 | # acc = (df_beam['Pred-1'] == df_beam['Correct'])*1 + \
59 | # (df_beam['Pred-2'] == df_beam['Correct'])*1 + \
60 | # (df_beam['Pred-3'] == df_beam['Correct'])*1
61 | # acc = acc.values
62 | # acc = [1 if x>0 else 0 for x in acc]
63 | # print(f"Accuracy: {np.sum(acc) / len(df_beam)}")
64 | #
65 | # df_dict = pd.read_csv('./Dataset/allDictWords_df.csv')
66 | # df_allWords = pd.read_csv('./Dataset/df_all_words.csv')
67 | # #
68 | # preds1 = []
69 | # for word in tqdm(df_beam['Pred-1'].values):
70 | # # similar_words = df_dict.loc[df_dict['word'].str.startswith(word)].iloc[:, 0].values
71 | # if word in df_allWords.iloc[:, 0].values:
72 | # preds1.append(1)
73 | # else:
74 | # preds1.append(0)
75 | # print(f"Modified Top1 Acc: {np.sum(preds1) / len(preds1)}")
76 | #
77 | # df_greedy = pd.read_csv('./Corrections/preds_greedy_colab.csv')
78 | # # print(df_greedy)
79 | # greedy_acc = np.sum(df_greedy['Predicton'] == df_greedy['Target'])/len(df_greedy)
80 | # print(f'Greedy Accuracy: {greedy_acc}')
81 | # preds = []
82 | # for word in tqdm(df_greedy['Predicton'].values):
83 | # if word in df_allWords.iloc[:, 0].values:
84 | # preds.append(1)
85 | # else:
86 | # preds.append(0)
87 | # print(f"Modified Greedy Accuracy: {np.sum(preds) / len(preds)}")
88 |
89 | if __name__ == '__main__':
90 | beam_report()
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/decoding.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.ticker as ticker
14 | import matplotlib.font_manager as fm
15 |
16 | import numpy as np
17 | import math
18 | import time
19 |
20 | import copy
21 | from heapq import heappush, heappop
22 |
23 | import warnings as wrn
24 | wrn.filterwarnings('ignore')
25 |
26 |
27 | class BeamSearchNode(object):
28 | def __init__(self, h, prev_node, wid, logp, length):
29 | self.h = h
30 | self.prev_node = prev_node
31 | self.wid = wid
32 | self.logp = logp
33 | self.length = length
34 |
35 | def eval(self):
36 | return self.logp / float(self.length - 1 + 1e-6)
37 |
38 |
39 | def beam_search_decoding(model, src, decoder, enc_outs, enc_last_h, beam_width, n_best, \
40 | sos_token, eos_token, max_dec_steps, device):
41 | assert beam_width >= n_best
42 | n_best_list = []
43 | bs = enc_outs.shape[1]
44 |
45 | for batch_id in range(bs):
46 | decoder_hidden = enc_last_h[batch_id]
47 | enc_out = enc_outs[:, batch_id].unsqueeze(1)
48 |
49 | # decoder_input = torch.tensor([sos_token].long().to(DEVICE))
50 | decoder_input = torch.tensor([sos_token]).to(device)
51 | end_nodes = []
52 |
53 | node = BeamSearchNode(h=decoder_hidden, prev_node=None, wid=decoder_input, logp=0, length=1)
54 | nodes = []
55 |
56 | heappush(nodes, (-node.eval(), id(node), node))
57 | n_dec_steps = 0
58 |
59 | while True:
60 | if n_dec_steps > max_dec_steps:
61 | break
62 |
63 | score, _, n = heappop(nodes)
64 | decoder_input = n.wid
65 | decoder_hidden = n.h
66 |
67 | if n.wid.item() == eos_token and n.prev_node is not None:
68 | end_nodes.append((score, id(n), n))
69 | if len(end_nodes) >= n_best:
70 | break
71 | else:
72 | continue
73 |
74 | mask = model.create_mask(src)
75 | decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden.unsqueeze(0), enc_out, mask)
76 |
77 | # restricting length
78 | topk_log_prob, topk_indexes = torch.topk(decoder_output, beam_width)
79 |
80 | for new_k in range(beam_width):
81 | decoded_t = topk_indexes[0][new_k].view(1)
82 | logp = topk_log_prob[0][new_k].item()
83 |
84 | node = BeamSearchNode(
85 | h=decoder_hidden.squeeze(0), prev_node=n, wid=decoded_t, logp=n.logp + logp, length=n.length + 1
86 | )
87 |
88 | heappush(nodes, (-node.eval(), id(node), node))
89 |
90 | n_dec_steps += beam_width
91 |
92 | if len(end_nodes) == 0:
93 | end_nodes = [heappop(nodes) for _ in range(beam_width)]
94 |
95 | n_best_seq_list = []
96 | for score, _id, n in sorted(end_nodes, key=lambda x: x[0]):
97 | sequence = [n.wid.item()]
98 | while n.prev_node is not None:
99 | n = n.prev_node
100 | sequence.append(n.wid.item())
101 | sequence = sequence[::-1]
102 | n_best_seq_list.append(sequence)
103 |
104 | n_best_list.append(n_best_seq_list)
105 |
106 | return n_best_list
107 |
108 |
109 |
110 |
111 |
112 |
113 |
--------------------------------------------------------------------------------
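beam_search_decoding returns, for each batch element, n_best token-id sequences that still contain the sos/eos ids. A small post-processing sketch, assuming TRG is the torchtext Field holding the character vocabulary built elsewhere in this baseline:

```
# Illustrative sketch: convert beam hypotheses (lists of token ids) back into words.
def ids_to_word(ids, TRG):
    tokens = [TRG.vocab.itos[i] for i in ids]
    # drop <sos>/<eos>/<pad> markers and join the remaining characters
    tokens = [t for t in tokens if t not in (TRG.init_token, TRG.eos_token, TRG.pad_token)]
    return ''.join(tokens)

# n_best_list[b][k] is the k-th best hypothesis for batch element b:
# predictions = [[ids_to_word(seq, TRG) for seq in hyps] for hyps in n_best_list]
```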
/focalLoss.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 | # following:
9 | # https://github.com/kornia/kornia/
10 | # which is based on:
11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py
12 |
13 |
14 | def one_hot(
15 | labels: torch.Tensor,
16 | num_classes: int,
17 | device: Optional[torch.device] = None,
18 | dtype: Optional[torch.dtype] = None,
19 | eps: float = 1e-6,
20 | ) -> torch.Tensor:
21 |
22 | if not isinstance(labels, torch.Tensor):
23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}")
24 |
25 | if not labels.dtype == torch.int64:
26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}")
27 |
28 | if num_classes < 1:
29 | raise ValueError("The number of classes must be bigger than one." " Got: {}".format(num_classes))
30 |
31 | shape = labels.shape
32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype)
33 |
34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps
35 |
36 |
37 |
38 | def focal_loss(
39 | input: torch.Tensor,
40 | target: torch.Tensor,
41 | alpha: float,
42 | gamma: float = 2.0,
43 | reduction: str = 'none',
44 | eps: Optional[float] = None,
45 | ) -> torch.Tensor:
46 |
47 | if eps is not None and not torch.jit.is_scripting():
48 | warnings.warn(
49 | "`focal_loss` has been reworked for improved numerical stability "
50 | "and the `eps` argument is no longer necessary",
51 | DeprecationWarning,
52 | stacklevel=2,
53 | )
54 |
55 | if not isinstance(input, torch.Tensor):
56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}")
57 |
58 | if not len(input.shape) >= 2:
59 | raise ValueError(f"Invalid input shape, we expect BxCx*. Got: {input.shape}")
60 |
61 | if input.size(0) != target.size(0):
62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).')
63 |
64 | n = input.size(0)
65 | out_size = (n,) + input.size()[2:]
66 | if target.size()[1:] != input.size()[2:]:
67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}')
68 |
69 | if not input.device == target.device:
70 | raise ValueError(f"input and target must be in the same device. Got: {input.device} and {target.device}")
71 |
72 | # compute softmax over the classes axis
73 | input_soft: torch.Tensor = F.softmax(input, dim=1)
74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1)
75 |
76 | # create the labels one hot tensor
77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype)
78 |
79 | # compute the actual focal loss
80 | weight = torch.pow(-input_soft + 1.0, gamma)
81 |
82 | focal = -alpha * weight * log_input_soft
83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal))
84 |
85 | if reduction == 'none':
86 | loss = loss_tmp
87 | elif reduction == 'mean':
88 | loss = torch.mean(loss_tmp)
89 | elif reduction == 'sum':
90 | loss = torch.sum(loss_tmp)
91 | else:
92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}")
93 | return loss
94 |
95 |
96 | class FocalLoss(nn.Module):
97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None:
98 | super().__init__()
99 | self.alpha: float = alpha
100 | self.gamma: float = gamma
101 | self.reduction: str = reduction
102 | self.eps: Optional[float] = eps
103 |
104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps)
106 |
--------------------------------------------------------------------------------
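A minimal usage sketch for the FocalLoss module above (not from the repository): input must be raw logits of shape [batch, classes, ...] and target must be int64 class indices of shape [batch, ...]; the alpha value and the 66-class vocabulary size below are illustrative (66 is hinted at in pipeline.py's commented-out one-hot line).

```
import torch
from focalLoss import FocalLoss

# Hedged example: 4 sequences, 66 character classes, 10 positions each.
logits = torch.randn(4, 66, 10, requires_grad=True)   # [batch, classes, positions]
targets = torch.randint(0, 66, (4, 10))               # int64 class indices, [batch, positions]

criterion = FocalLoss(alpha=0.25, gamma=2.0, reduction='mean')
loss = criterion(logits, targets)
loss.backward()
print(loss.item())
```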
/Baselines/GRUSeq2Seq/focalLoss.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 | # following:
9 | # https://github.com/kornia/kornia/
10 | # which is based on:
11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py
12 |
13 |
14 | def one_hot(
15 | labels: torch.Tensor,
16 | num_classes: int,
17 | device: Optional[torch.device] = None,
18 | dtype: Optional[torch.dtype] = None,
19 | eps: float = 1e-6,
20 | ) -> torch.Tensor:
21 |
22 | if not isinstance(labels, torch.Tensor):
23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}")
24 |
25 | if not labels.dtype == torch.int64:
26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}")
27 |
28 | if num_classes < 1:
29 | raise ValueError("The number of classes must be bigger than one." " Got: {}".format(num_classes))
30 |
31 | shape = labels.shape
32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype)
33 |
34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps
35 |
36 |
37 |
38 | def focal_loss(
39 | input: torch.Tensor,
40 | target: torch.Tensor,
41 | alpha: float,
42 | gamma: float = 2.0,
43 | reduction: str = 'none',
44 | eps: Optional[float] = None,
45 | ) -> torch.Tensor:
46 |
47 | if eps is not None and not torch.jit.is_scripting():
48 | warnings.warn(
49 | "`focal_loss` has been reworked for improved numerical stability "
50 | "and the `eps` argument is no longer necessary",
51 | DeprecationWarning,
52 | stacklevel=2,
53 | )
54 |
55 | if not isinstance(input, torch.Tensor):
56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}")
57 |
58 | if not len(input.shape) >= 2:
59 | raise ValueError(f"Invalid input shape, we expect BxCx*. Got: {input.shape}")
60 |
61 | if input.size(0) != target.size(0):
62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).')
63 |
64 | n = input.size(0)
65 | out_size = (n,) + input.size()[2:]
66 | if target.size()[1:] != input.size()[2:]:
67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}')
68 |
69 | if not input.device == target.device:
70 | raise ValueError(f"input and target must be in the same device. Got: {input.device} and {target.device}")
71 |
72 | # compute softmax over the classes axis
73 | input_soft: torch.Tensor = F.softmax(input, dim=1)
74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1)
75 |
76 | # create the labels one hot tensor
77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype)
78 |
79 | # compute the actual focal loss
80 | weight = torch.pow(-input_soft + 1.0, gamma)
81 |
82 | focal = -alpha * weight * log_input_soft
83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal))
84 |
85 | if reduction == 'none':
86 | loss = loss_tmp
87 | elif reduction == 'mean':
88 | loss = torch.mean(loss_tmp)
89 | elif reduction == 'sum':
90 | loss = torch.sum(loss_tmp)
91 | else:
92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}")
93 | return loss
94 |
95 |
96 | class FocalLoss(nn.Module):
97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None:
98 | super().__init__()
99 | self.alpha: float = alpha
100 | self.gamma: float = gamma
101 | self.reduction: str = reduction
102 | self.eps: Optional[float] = eps
103 |
104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps)
106 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/focalLoss.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from typing import Optional
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 | # following:
9 | # https://github.com/kornia/kornia/
10 | # which is based on:
11 | # https://github.com/zhezh/focalloss/blob/master/focalloss.py
12 |
13 |
14 | def one_hot(
15 | labels: torch.Tensor,
16 | num_classes: int,
17 | device: Optional[torch.device] = None,
18 | dtype: Optional[torch.dtype] = None,
19 | eps: float = 1e-6,
20 | ) -> torch.Tensor:
21 |
22 | if not isinstance(labels, torch.Tensor):
23 | raise TypeError(f"Input labels type is not a torch.Tensor. Got {type(labels)}")
24 |
25 | if not labels.dtype == torch.int64:
26 | raise ValueError(f"labels must be of the same dtype torch.int64. Got: {labels.dtype}")
27 |
28 | if num_classes < 1:
29 | raise ValueError("The number of classes must be bigger than one." " Got: {}".format(num_classes))
30 |
31 | shape = labels.shape
32 | one_hot = torch.zeros((shape[0], num_classes) + shape[1:], device=device, dtype=dtype)
33 |
34 | return one_hot.scatter_(1, labels.unsqueeze(1), 1.0) + eps
35 |
36 |
37 |
38 | def focal_loss(
39 | input: torch.Tensor,
40 | target: torch.Tensor,
41 | alpha: float,
42 | gamma: float = 2.0,
43 | reduction: str = 'none',
44 | eps: Optional[float] = None,
45 | ) -> torch.Tensor:
46 |
47 | if eps is not None and not torch.jit.is_scripting():
48 | warnings.warn(
49 | "`focal_loss` has been reworked for improved numerical stability "
50 | "and the `eps` argument is no longer necessary",
51 | DeprecationWarning,
52 | stacklevel=2,
53 | )
54 |
55 | if not isinstance(input, torch.Tensor):
56 | raise TypeError(f"Input type is not a torch.Tensor. Got {type(input)}")
57 |
58 | if not len(input.shape) >= 2:
59 | raise ValueError(f"Invalid input shape, we expect BxCx*. Got: {input.shape}")
60 |
61 | if input.size(0) != target.size(0):
62 | raise ValueError(f'Expected input batch_size ({input.size(0)}) to match target batch_size ({target.size(0)}).')
63 |
64 | n = input.size(0)
65 | out_size = (n,) + input.size()[2:]
66 | if target.size()[1:] != input.size()[2:]:
67 | raise ValueError(f'Expected target size {out_size}, got {target.size()}')
68 |
69 | if not input.device == target.device:
70 | raise ValueError(f"input and target must be in the same device. Got: {input.device} and {target.device}")
71 |
72 | # compute softmax over the classes axis
73 | input_soft: torch.Tensor = F.softmax(input, dim=1)
74 | log_input_soft: torch.Tensor = F.log_softmax(input, dim=1)
75 |
76 | # create the labels one hot tensor
77 | target_one_hot: torch.Tensor = one_hot(target, num_classes=input.shape[1], device=input.device, dtype=input.dtype)
78 |
79 | # compute the actual focal loss
80 | weight = torch.pow(-input_soft + 1.0, gamma)
81 |
82 | focal = -alpha * weight * log_input_soft
83 | loss_tmp = torch.einsum('bc...,bc...->b...', (target_one_hot, focal))
84 |
85 | if reduction == 'none':
86 | loss = loss_tmp
87 | elif reduction == 'mean':
88 | loss = torch.mean(loss_tmp)
89 | elif reduction == 'sum':
90 | loss = torch.sum(loss_tmp)
91 | else:
92 | raise NotImplementedError(f"Invalid reduction mode: {reduction}")
93 | return loss
94 |
95 |
96 | class FocalLoss(nn.Module):
97 | def __init__(self, alpha: float, gamma: float = 2.0, reduction: str = 'none', eps: Optional[float] = None) -> None:
98 | super().__init__()
99 | self.alpha: float = alpha
100 | self.gamma: float = gamma
101 | self.reduction: str = reduction
102 | self.eps: Optional[float] = eps
103 |
104 | def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
105 | return focal_loss(input, target, self.alpha, self.gamma, self.reduction, self.eps)
106 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DPCSpell
2 |
3 | A transformer-based spelling error correction framework for Bangla and resource-scarce Indic languages (Computer Speech & Language)
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | ##
13 |
14 | ## How Does DPCSpell Work?
15 |
16 | 
17 |
18 | ## Running Test
19 | | Operating System | Requirement | Remark |
20 | | ------------- | ------------- | ------------- |
21 | | Ubuntu 16.04.7 LTS | requirements_u.yml | :heavy_check_mark: Successful |
22 | | Ubuntu 18.04.6 LTS (Google Colab) | requirements_c.txt | :heavy_check_mark: Successful* |
23 | | Windows 10 | requirements_w.yml | :heavy_check_mark: Successful |
24 |
25 |
26 |
27 | ## Get Started
28 |
29 | ```
30 | git clone https://github.com/mehedihasanbijoy/DPCSpell.git
31 | ```
32 | or manually **download** and **extract** the github repository of DPCSpell.
33 |
34 |
35 |
36 | ## Environment Setup
37 | ### Create A Virtual Environment
38 | ```
39 | conda env create -f requirements_u.yml (for Ubuntu 16.04.7 LTS)
40 | or
41 | conda env create -f requirements_w.yml (for Windows 10)
42 | ```
43 |
44 |
45 | ### Activate the Environment
46 | ```
47 | conda activate DPCSpell
48 | ```
49 |
50 |
51 |
52 | ## Prepare SEC Corpora
53 | ```
54 | gdown https://drive.google.com/drive/folders/1_sWSi-LFsvuYh9c5GBMDd4V6_uM8yYjH?usp=share_link -O ./Dataset --folder
55 | ```
56 |
57 | or manually download the folder from the link above and place the extracted files in ./Dataset/
58 |
59 |
60 |
61 |
62 | ## Training and Evaluation of DPCSpell
63 |
64 | ### Detector Network
65 |
66 | ```
67 | python detector.py --CORPUS "./Dataset/corpus.csv" --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
68 | ```
69 |
70 | ### Purificator Network
71 |
72 | ```
73 | python purificator.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
74 | ```
75 |
76 | ### Corrector Network
77 |
78 | ```
79 | python corrector.py --HID_DIM 128 --ENC_LAYERS 5 --DEC_LAYERS 5 --ENC_HEADS 8 --DEC_HEADS 8 --ENC_PF_DIM 256 --DEC_PF_DIM 256 --ENC_DROPOUT 0.1 --DEC_DROPOUT 0.1 --CLIP 1 --LEARNING_RATE 0.0005 --N_EPOCHS 100
80 | ```
81 |
82 |
83 |
84 | ## Benchmarking Bangla SEC Task
85 |
86 | 
87 |
88 |
89 | ## BibTeX Entry and Citation Info
90 |
91 | ```
92 | @article{bijoy2024transformer,
93 | title={A transformer based spelling error correction framework for Bangla and resource scarce Indic languages},
94 | author={Bijoy, Mehedi Hasan and Hossain, Nahid and Islam, Salekul and Shatabda, Swakkhar},
95 | journal={Computer Speech \& Language},
96 | volume = {89},
97 | pages = {101703},
98 | year = {2025},
99 | issn = {0885-2308},
100 | doi = {https://doi.org/10.1016/j.csl.2024.101703},
101 | url = {https://www.sciencedirect.com/science/article/pii/S088523082400086X},
102 | publisher={Elsevier}
103 | }
104 | ```
105 |
--------------------------------------------------------------------------------
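The detector/purificator/corrector commands in the README above pass every hyperparameter as a CLI flag. The sketch below shows how such flags are typically parsed with argparse; the flag names and default values mirror the commands, but this is an assumption about the scripts' internals, not a copy of detector.py.

```
import argparse

# Illustrative parser: flag names and defaults mirror the README commands above.
parser = argparse.ArgumentParser(description='DPCSpell network (illustrative flag parser)')
parser.add_argument('--CORPUS', type=str, default='./Dataset/corpus.csv')
parser.add_argument('--HID_DIM', type=int, default=128)
parser.add_argument('--ENC_LAYERS', type=int, default=5)
parser.add_argument('--DEC_LAYERS', type=int, default=5)
parser.add_argument('--ENC_HEADS', type=int, default=8)
parser.add_argument('--DEC_HEADS', type=int, default=8)
parser.add_argument('--ENC_PF_DIM', type=int, default=256)
parser.add_argument('--DEC_PF_DIM', type=int, default=256)
parser.add_argument('--ENC_DROPOUT', type=float, default=0.1)
parser.add_argument('--DEC_DROPOUT', type=float, default=0.1)
parser.add_argument('--CLIP', type=float, default=1.0)
parser.add_argument('--LEARNING_RATE', type=float, default=0.0005)
parser.add_argument('--N_EPOCHS', type=int, default=100)
args = parser.parse_args()
```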
/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from tqdm import tqdm
4 | from utils import basic_tokenizer
5 | import matplotlib.pyplot as plt
6 | import matplotlib.ticker as ticker
7 | import matplotlib.font_manager as fm
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def train(model, iterator, optimizer, criterion, clip):
13 | model.train()
14 | epoch_loss = 0
15 | for idx, batch in enumerate(tqdm(iterator)):
16 | src = batch.src
17 | trg = batch.trg
18 |
19 | optimizer.zero_grad()
20 | output, _ = model(src, trg[:, :-1])
21 | # output = [batch size, trg len - 1, output dim]
22 | # trg = [batch size, trg len]
23 |
24 | output_dim = output.shape[-1]
25 | output = output.contiguous().view(-1, output_dim)
26 | trg = trg[:, 1:].contiguous().view(-1)
27 | # output = [batch size * trg len - 1, output dim]
28 | # trg = [batch size * trg len - 1]
29 |
30 | # trg one hot for BCEwLogits
31 | # trg = F.one_hot(trg, num_classes=66)
32 |
33 | loss = criterion(output, trg)
34 | loss.backward()
35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
36 | optimizer.step()
37 | epoch_loss += loss.item()
38 |
39 | return epoch_loss / len(iterator)
40 |
41 |
42 | def evaluate(model, iterator, criterion):
43 | model.eval()
44 | epoch_loss = 0
45 | with torch.no_grad():
46 | for idx, batch in enumerate(tqdm(iterator)):
47 | src = batch.src
48 | trg = batch.trg
49 |
50 | output, _ = model(src, trg[:, :-1])
51 | # output = [batch size, trg len - 1, output dim]
52 | # trg = [batch size, trg len]
53 |
54 | output_dim = output.shape[-1]
55 | output = output.contiguous().view(-1, output_dim)
56 | trg = trg[:, 1:].contiguous().view(-1)
57 | # output = [batch size * trg len - 1, output dim]
58 | # trg = [batch size * trg len - 1]
59 |
60 | loss = criterion(output, trg)
61 | epoch_loss += loss.item()
62 | return epoch_loss / len(iterator)
63 |
64 |
65 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
66 | model.eval()
67 |
68 | if isinstance(sentence, str):
69 | tokens = basic_tokenizer(sentence)
70 | else:
71 | tokens = sentence
72 |
73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token]
74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
76 | src_mask = model.make_src_mask(src_tensor)
77 |
78 | with torch.no_grad():
79 | enc_src = model.encoder(src_tensor, src_mask)
80 |
81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
82 |
83 | for i in range(max_len):
84 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
85 | trg_mask = model.make_trg_mask(trg_tensor)
86 |
87 | with torch.no_grad():
88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
89 |
90 | pred_token = output.argmax(2)[:, -1].item()
91 | trg_indexes.append(pred_token)
92 |
93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
94 | break
95 |
96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
97 | return trg_tokens[1:-1], attention
98 |
99 |
100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2):
101 | assert n_rows * n_cols == n_heads
102 |
103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf')
104 |
105 | fig = plt.figure(figsize=(15, 25))
106 | for i in range(n_heads):
107 | ax = fig.add_subplot(n_rows, n_cols, i + 1)
108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy()
109 | cax = ax.matshow(_attention, cmap='bone')
110 |
111 | ax.tick_params(labelsize=12)
112 | ax.set_xticklabels(
113 | [''] + [''] + [t for t in sentence] + [''],
114 | rotation=45, fontproperties=prop, fontsize=20
115 | )
116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20)
117 |
118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
120 |
121 | plt.show()
122 | plt.close()
123 |
124 |
125 | if __name__ == '__main__':
126 | pass
127 |
--------------------------------------------------------------------------------
/Baselines/DCSpell/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from tqdm import tqdm
4 | from utils import basic_tokenizer
5 | import matplotlib.pyplot as plt
6 | import matplotlib.ticker as ticker
7 | import matplotlib.font_manager as fm
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def train(model, iterator, optimizer, criterion, clip):
13 | model.train()
14 | epoch_loss = 0
15 | for idx, batch in enumerate(tqdm(iterator)):
16 | src = batch.src
17 | trg = batch.trg
18 |
19 | optimizer.zero_grad()
20 | output, _ = model(src, trg[:, :-1])
21 | # output = [batch size, trg len - 1, output dim]
22 | # trg = [batch size, trg len]
23 |
24 | output_dim = output.shape[-1]
25 | output = output.contiguous().view(-1, output_dim)
26 | trg = trg[:, 1:].contiguous().view(-1)
27 | # output = [batch size * trg len - 1, output dim]
28 | # trg = [batch size * trg len - 1]
29 |
30 | # trg one hot for BCEwLogits
31 | # trg = F.one_hot(trg, num_classes=66)
32 |
33 | loss = criterion(output, trg)
34 | loss.backward()
35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
36 | optimizer.step()
37 | epoch_loss += loss.item()
38 |
39 | return epoch_loss / len(iterator)
40 |
41 |
42 | def evaluate(model, iterator, criterion):
43 | model.eval()
44 | epoch_loss = 0
45 | with torch.no_grad():
46 | for idx, batch in enumerate(tqdm(iterator)):
47 | src = batch.src
48 | trg = batch.trg
49 |
50 | output, _ = model(src, trg[:, :-1])
51 | # output = [batch size, trg len - 1, output dim]
52 | # trg = [batch size, trg len]
53 |
54 | output_dim = output.shape[-1]
55 | output = output.contiguous().view(-1, output_dim)
56 | trg = trg[:, 1:].contiguous().view(-1)
57 | # output = [batch size * trg len - 1, output dim]
58 | # trg = [batch size * trg len - 1]
59 |
60 | loss = criterion(output, trg)
61 | epoch_loss += loss.item()
62 | return epoch_loss / len(iterator)
63 |
64 |
65 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
66 | model.eval()
67 |
68 | if isinstance(sentence, str):
69 | tokens = basic_tokenizer(sentence)
70 | else:
71 | tokens = sentence
72 |
73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token]
74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
76 | src_mask = model.make_src_mask(src_tensor)
77 |
78 | with torch.no_grad():
79 | enc_src = model.encoder(src_tensor, src_mask)
80 |
81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
82 |
83 | for i in range(max_len):
84 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
85 | trg_mask = model.make_trg_mask(trg_tensor)
86 |
87 | with torch.no_grad():
88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
89 |
90 | pred_token = output.argmax(2)[:, -1].item()
91 | trg_indexes.append(pred_token)
92 |
93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
94 | break
95 |
96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
97 | return trg_tokens[1:-1], attention
98 |
99 |
100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2):
101 | assert n_rows * n_cols == n_heads
102 |
103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf')
104 |
105 | fig = plt.figure(figsize=(15, 25))
106 | for i in range(n_heads):
107 | ax = fig.add_subplot(n_rows, n_cols, i + 1)
108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy()
109 | cax = ax.matshow(_attention, cmap='bone')
110 |
111 | ax.tick_params(labelsize=12)
112 | ax.set_xticklabels(
113 | [''] + [''] + [t for t in sentence] + [''],
114 | rotation=45, fontproperties=prop, fontsize=20
115 | )
116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20)
117 |
118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
120 |
121 | plt.show()
122 | plt.close()
123 |
124 |
125 | if __name__ == '__main__':
126 | pass
127 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | from tqdm import tqdm
4 | from utils import basic_tokenizer
5 | import matplotlib.pyplot as plt
6 | import matplotlib.ticker as ticker
7 | import matplotlib.font_manager as fm
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def train(model, iterator, optimizer, criterion, clip):
13 | model.train()
14 | epoch_loss = 0
15 | for idx, batch in enumerate(tqdm(iterator)):
16 | src = batch.src
17 | trg = batch.trg
18 |
19 | optimizer.zero_grad()
20 | output, _ = model(src, trg[:, :-1])
21 | # output = [batch size, trg len - 1, output dim]
22 | # trg = [batch size, trg len]
23 |
24 | output_dim = output.shape[-1]
25 | output = output.contiguous().view(-1, output_dim)
26 | trg = trg[:, 1:].contiguous().view(-1)
27 | # output = [batch size * trg len - 1, output dim]
28 | # trg = [batch size * trg len - 1]
29 |
30 | # trg one hot for BCEwLogits
31 | # trg = F.one_hot(trg, num_classes=66)
32 |
33 | loss = criterion(output, trg)
34 | loss.backward()
35 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
36 | optimizer.step()
37 | epoch_loss += loss.item()
38 |
39 | return epoch_loss / len(iterator)
40 |
41 |
42 | def evaluate(model, iterator, criterion):
43 | model.eval()
44 | epoch_loss = 0
45 | with torch.no_grad():
46 | for idx, batch in enumerate(tqdm(iterator)):
47 | src = batch.src
48 | trg = batch.trg
49 |
50 | output, _ = model(src, trg[:, :-1])
51 | # output = [batch size, trg len - 1, output dim]
52 | # trg = [batch size, trg len]
53 |
54 | output_dim = output.shape[-1]
55 | output = output.contiguous().view(-1, output_dim)
56 | trg = trg[:, 1:].contiguous().view(-1)
57 | # output = [batch size * trg len - 1, output dim]
58 | # trg = [batch size * trg len - 1]
59 |
60 | loss = criterion(output, trg)
61 | epoch_loss += loss.item()
62 | return epoch_loss / len(iterator)
63 |
64 |
65 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50):
66 | model.eval()
67 |
68 | if isinstance(sentence, str):
69 | tokens = basic_tokenizer(sentence)
70 | else:
71 | tokens = sentence
72 |
73 | tokens = [src_field.init_token] + tokens + [src_field.eos_token]
74 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
75 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
76 | src_mask = model.make_src_mask(src_tensor)
77 |
78 | with torch.no_grad():
79 | enc_src = model.encoder(src_tensor, src_mask)
80 |
81 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
82 |
83 | for i in range(max_len):
84 | trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
85 | trg_mask = model.make_trg_mask(trg_tensor)
86 |
87 | with torch.no_grad():
88 | output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
89 |
90 | pred_token = output.argmax(2)[:, -1].item()
91 | trg_indexes.append(pred_token)
92 |
93 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
94 | break
95 |
96 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
97 | return trg_tokens[1:-1], attention
98 |
99 |
100 | def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2):
101 | assert n_rows * n_cols == n_heads
102 |
103 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf')
104 |
105 | fig = plt.figure(figsize=(15, 25))
106 | for i in range(n_heads):
107 | ax = fig.add_subplot(n_rows, n_cols, i + 1)
108 | _attention = attention.squeeze(0)[i].cpu().detach().numpy()
109 | cax = ax.matshow(_attention, cmap='bone')
110 |
111 | ax.tick_params(labelsize=12)
112 | ax.set_xticklabels(
113 | [''] + [''] + [t for t in sentence] + [''],
114 | rotation=45, fontproperties=prop, fontsize=20
115 | )
116 | ax.set_yticklabels([''] + translation, fontproperties=prop, fontsize=20)
117 |
118 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
119 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
120 |
121 | plt.show()
122 | plt.close()
123 |
124 |
125 | if __name__ == '__main__':
126 | pass
127 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/inference.py:
--------------------------------------------------------------------------------
1 | from decoding import beam_search_decoding
2 | from metrics import beam_eval_report, greedy_eval_report
3 | from utils import print_n_best
4 | from utils import translate_sentence
5 |
6 | import torch, torch.nn as nn, torch.optim as optim
7 | import torch.nn.functional as F
8 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
9 | import random
10 | from tqdm import tqdm
11 | import pandas as pd
12 | from sklearn.model_selection import train_test_split
13 | import math
14 | import time
15 | # from torchtext.data.metrics import bleu_score
16 |
17 | import matplotlib.pyplot as plt
18 | import matplotlib.ticker as ticker
19 | import matplotlib.font_manager as fm
20 |
21 | import numpy as np
22 | import math
23 | import time
24 |
25 | import warnings as wrn
26 | wrn.filterwarnings('ignore')
27 |
28 |
29 | def test_beam(model, train_data, test_data, SRC, TRG, DEVICE):
30 | _, test_iterator = BucketIterator.splits(
31 | (train_data, test_data),
32 | batch_size=1,
33 | sort_within_batch=True,
34 | sort_key=lambda x: len(x.src),
35 | device=DEVICE
36 | )
37 |
38 | TRG_SOS_IDX = TRG.vocab.stoi[TRG.init_token]
39 | TRG_EOS_IDX = TRG.vocab.stoi[TRG.eos_token]
40 |
41 | src_words = []
42 | topk_prediction_list = []
43 | trg_words = []
44 | found_within_topk = []
45 | found_at_top1 = []
46 |
47 | model.eval()
48 | with torch.no_grad():
49 | for batch_id, batch in enumerate(tqdm(test_iterator)):
50 | src, src_len = batch.src
51 | trg = batch.trg
52 |
53 | src_word = "".join(SRC.vocab.itos[idx] for idx in src[:, 0][1:-1])
54 | trg_word = "".join(TRG.vocab.itos[idx] for idx in trg[:, 0][1:-1])
55 | # print(f'\nSRC: {src_word}')
56 | # print(f'\nTRG: {trg_word}')
57 |
58 | enc_outs, h = model.encoder(src, src_len)
59 | # print(enc_outs.shape, h.shape)
60 |
61 | # decoder, enc_outs, enc_last_h, beam_width, n_best, sos_token, eos_token, max_dec_steps, device
62 | decoded_seqs = beam_search_decoding(
63 |                 model=model,
64 |                 src=src,
65 | decoder=model.decoder,
66 | enc_outs=enc_outs,
67 | enc_last_h=h,
68 | beam_width=1,
69 | n_best=1,
70 | sos_token=TRG_SOS_IDX,
71 | eos_token=TRG_EOS_IDX,
72 | max_dec_steps=100,
73 | device=DEVICE
74 | )
75 | topk_preds = print_n_best(decoded_seqs[0], TRG.vocab.itos)
76 | # print(topk_preds)
77 |
78 | src_words.append(src_word)
79 | trg_words.append(trg_word)
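79 |             # repeat/truncate the n-best list so every row always has exactly three predictions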
80 | topk_prediction_list.append((topk_preds * 3)[:3])
81 |             found_within_topk.append(1 if trg_word in topk_preds else 0)
82 |             found_at_top1.append(1 if trg_word == topk_preds[0] else 0)
83 |
84 | # if batch_id == 100:
85 | # break
86 |
87 | topk_pred_df = pd.DataFrame({
88 | 'Error': src_words,
89 | 'Pred-1': np.array(topk_prediction_list)[:, 0],
90 | 'Pred-2': np.array(topk_prediction_list)[:, 1],
91 | 'Pred-3': np.array(topk_prediction_list)[:, 2],
92 | 'Correct': trg_words,
93 | 'Greedy': found_at_top1,
94 | 'Beam': found_within_topk
95 | })
96 | topk_pred_df.to_csv('./Corrections/preds_beam.csv', index=False)
97 |
98 | beam_eval_report(trg_words, topk_prediction_list)
99 |
100 |
101 | def test_greedy(test_data, SRC, TRG, model, DEVICE):
102 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
103 | for idx, data in enumerate(tqdm(test_data)):
104 | src = data.src
105 | trg = data.trg
106 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
107 |
108 | src = ''.join(src)
109 | trg = ''.join(trg)
110 | pred = ''.join(translation[:-1])
111 |
112 | erroneous_words.append(src)
113 | predicted_words.append(pred)
114 | correct_words.append(trg)
115 | if trg == pred:
116 | flags.append(1)
117 | else:
118 | flags.append(0)
119 |
120 | evaluation_df = pd.DataFrame({
121 | 'Error': erroneous_words,
122 |         'Prediction': predicted_words,
123 | 'Target': correct_words,
124 | 'Correction': flags
125 | })
126 | evaluation_df.to_csv('./Corrections/preds_greedy.csv', index=False)
127 |
128 | greedy_eval_report(correct_words, predicted_words)
129 |
130 |
131 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/pipeline.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 | from utils import translate_sentence
12 | from sklearn import metrics
13 |
14 | import matplotlib.pyplot as plt
15 | import matplotlib.ticker as ticker
16 | import matplotlib.font_manager as fm
17 |
18 | import numpy as np
19 | import math
20 | import time
21 |
22 | import warnings as wrn
23 | wrn.filterwarnings('ignore')
24 |
25 |
26 | def train(model, iterator, optimizer, criterion, clip=1):
27 | model.train()
28 | epoch_loss = 0
29 | for idx, batch in enumerate(tqdm(iterator)):
30 | src, src_len = batch.src
31 | trg = batch.trg
32 |
33 | optimizer.zero_grad()
34 | output = model(src, src_len, trg)
35 | output_dim = output.shape[-1]
36 |
37 | output = output[1:].view(-1, output_dim)
38 | trg = trg[1:].view(-1)
39 |
40 | # print(f"output: {output.shape}, target: {trg.shape} \n\n{trg}")
41 |
42 | loss = criterion(output, trg)
43 | loss.backward()
44 | torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
45 | optimizer.step()
46 | epoch_loss += loss.item()
47 |
48 | return epoch_loss / len(iterator)
49 |
50 |
51 | def evaluate(model, iterator, criterion):
52 | model.eval()
53 | epoch_loss = 0
54 | with torch.no_grad():
55 | for idx, batch in enumerate(tqdm(iterator)):
56 | src, src_len = batch.src
57 | trg = batch.trg
58 |
59 | output = model(src, src_len, trg, 0)
60 |
61 | output_dim = output.shape[-1]
62 | output = output[1:].view(-1, output_dim)
63 | trg = trg[1:].view(-1)
64 |
65 | loss = criterion(output, trg)
66 | epoch_loss += loss.item()
67 |
68 | return epoch_loss / len(iterator)
69 |
70 |
71 | def test_accuracy(test_data, SRC, TRG, model, DEVICE):
72 | df = pd.read_csv('./Dataset/allDictWords_df.csv')
73 | # df = pd.read_csv('./Dataset/df_all_words.csv')
74 | all_words = sorted(df.iloc[:, 0].values)
75 |
76 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
77 | modified_flags = []
78 | for idx, data in enumerate(tqdm(test_data)):
79 | src = data.src
80 | trg = data.trg
81 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
82 |
83 | src = ''.join(src)
84 | trg = ''.join(trg)
85 | pred = ''.join(translation[:-1])
86 |
87 | erroneous_words.append(src)
88 | predicted_words.append(pred)
89 | correct_words.append(trg)
90 | if trg == pred:
91 | flags.append(1)
92 | else:
93 | flags.append(0)
94 |
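94 |         # "modified accuracy": a prediction counts as correct if it is any valid dictionary word, not necessarily the exact target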
95 |         if pred in all_words:
96 |             modified_flags.append(1)
97 |         else:
98 |             modified_flags.append(0)
99 |
100 | modified_acc = np.sum(modified_flags) / len(modified_flags)
101 |
102 | evaluation_df = pd.DataFrame({
103 | 'Error': erroneous_words,
104 |         'Prediction': predicted_words,
105 | 'Target': correct_words,
106 | 'Correction': flags
107 | })
108 | # evaluation_df.to_csv('/content/drive/MyDrive/Bangla Spell & Grammar Checker/Codes/GEDC/Seq2Seq/preds_greedy.csv', index=False)
109 |
110 | corrected_instances = evaluation_df['Correction'].values.sum()
111 | total_instances = len(evaluation_df)
112 | accuracy = corrected_instances / total_instances
113 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
114 | # print(f"Accuracy: {accuracy*100:.2f}%")
115 |
116 | y_true = np.array(correct_words)
117 | y_pred = np.array(predicted_words)
118 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
119 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
120 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
121 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
122 | ACC = metrics.accuracy_score(y_true, y_pred)
123 | print(f'''Top-1 (Greedy Decoding)
124 | Precision: {PR:.4f}
125 | Recall: {RE:.4f}
126 | F1 Score: {F1:.4f}
127 | F0.5 Score: {F05:.4f}
128 | Accuracy: {ACC * 100:.2f}%
129 | Modified Accuracy: {modified_acc * 100:.2f}%
130 | ''')
131 |
132 | return evaluation_df
133 |
134 | # evaluation_df.sample(10)
135 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/models.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.ticker as ticker
14 | import matplotlib.font_manager as fm
15 |
16 | import numpy as np
17 | import math
18 | import time
19 |
20 | import warnings as wrn
21 | wrn.filterwarnings('ignore')
22 |
23 |
24 | class Encoder(nn.Module):
25 | def __init__(self, input_dim, embed_dim, enc_hidden_dim, dec_hidden_dim, dropout):
26 | super().__init__()
27 | self.embedding = nn.Embedding(input_dim, embed_dim)
28 | self.rnn = nn.GRU(embed_dim, enc_hidden_dim, bidirectional=True)
29 | self.fc = nn.Linear(enc_hidden_dim * 2, dec_hidden_dim)
30 | self.dropout = nn.Dropout(dropout)
31 |
32 | def forward(self, src, src_len):
33 | embedded = self.dropout(self.embedding(src))
34 | packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.to('cpu'))
35 | packed_outputs, hidden = self.rnn(packed_embedded)
36 | outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)
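36 |         # concatenate the final forward and backward hidden states of the bidirectional GRU and project them to the decoder's hidden size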
37 | hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
38 | return outputs, hidden
39 |
40 |
41 | class Attention(nn.Module):
42 | def __init__(self, enc_hidden_dim, dec_hidden_dim):
43 | super().__init__()
44 | self.attn = nn.Linear((enc_hidden_dim*2) + dec_hidden_dim, dec_hidden_dim)
45 | self.v = nn.Linear(dec_hidden_dim, 1, bias=False)
46 |
47 | def forward(self, hidden, encoder_outputs, mask):
48 | batch_size = encoder_outputs.shape[1]
49 | src_len = encoder_outputs.shape[0]
50 | hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
51 | encoder_outputs = encoder_outputs.permute(1, 0, 2)
52 | energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
53 | attention = self.v(energy).squeeze(2)
54 | attention = attention.masked_fill(mask==0, -1e10)
55 | return F.softmax(attention, dim=1)
56 |
57 |
58 | class Decoder(nn.Module):
59 | def __init__(self, output_dim, embed_dim, enc_hidden_dim, dec_hidden_dim, dropout, attention):
60 | super().__init__()
61 | self.output_dim = output_dim
62 | self.attention = attention
63 | self.embedding = nn.Embedding(output_dim, embed_dim)
64 | self.rnn = nn.GRU((enc_hidden_dim*2) + embed_dim, dec_hidden_dim)
65 | self.fc_out = nn.Linear((enc_hidden_dim*2) + dec_hidden_dim + embed_dim, output_dim)
66 | self.dropout = nn.Dropout(dropout)
67 |
68 | def forward(self, input, hidden, encoder_outputs, mask):
69 | input = input.unsqueeze(0)
70 | embedded = self.dropout(self.embedding(input))
71 | a = self.attention(hidden, encoder_outputs, mask)
72 | a = a.unsqueeze(1)
73 | encoder_outputs = encoder_outputs.permute(1, 0, 2)
74 | weighted = torch.bmm(a, encoder_outputs)
75 | weighted = weighted.permute(1, 0, 2)
76 | rnn_input = torch.cat((embedded, weighted), dim=2)
77 |
78 | output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
79 |
80 | assert (output == hidden).all()
81 |
82 | embedded = embedded.squeeze(0)
83 | output = output.squeeze(0)
84 | weighted = weighted.squeeze(0)
85 |
86 | prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
87 |
88 | return prediction, hidden.squeeze(0), a.squeeze(1)
89 |
90 |
91 | class Seq2Seq(nn.Module):
92 | def __init__(self, encoder, decoder, src_pad_idx, device):
93 | super().__init__()
94 | self.encoder = encoder
95 | self.decoder = decoder
96 | self.src_pad_idx = src_pad_idx
97 | self.device = device
98 |
99 | def create_mask(self, src):
100 | mask = (src != self.src_pad_idx).permute(1, 0)
101 | return mask
102 |
103 | def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
104 | batch_size = src.shape[1]
105 | trg_len = trg.shape[0]
106 | trg_vocab_size = self.decoder.output_dim
107 |
108 | outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
109 |
110 | encoder_outputs, hidden = self.encoder(src, src_len)
111 | input = trg[0, :]
112 | mask = self.create_mask(src)
113 |
114 | for t in range(1, trg_len):
115 | output, hidden, _ = self.decoder(input, hidden, encoder_outputs, mask)
116 | outputs[t] = output
117 |
118 | top1 = output.argmax(1)
119 |
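119 |             # teacher forcing: with probability teacher_forcing_ratio feed the ground-truth token, otherwise feed the model's own prediction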
120 | input = trg[t] if random.random() < teacher_forcing_ratio else top1
121 |
122 | return outputs
123 |
--------------------------------------------------------------------------------
/Baselines/RuleBased/rule_based.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """PreviousEditDistanceBasedSpellChecker.ipynb
3 |
4 | Automatically generated by Colaboratory.
5 |
6 | Original file is located at
7 | https://colab.research.google.com/drive/1Kp3C18yaWmfhKJU_8294UKqfHrmLA1Ow
8 | """
9 |
10 | import pandas as pd
11 | import numpy as np
12 | from sklearn.model_selection import train_test_split
13 | from sklearn import metrics
14 | from tqdm import tqdm
15 | import warnings as wrn
16 |
17 | wrn.filterwarnings('ignore')
18 |
19 | def editDistance(str1, str2, m, n):
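19 |     # naive recursive edit distance (exponential time); kept for reference, the DP version below is what the script actually uses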
20 | if m == 0:
21 | return n
22 |
23 | if n == 0:
24 | return m
25 |
26 | if str1[m-1] == str2[n-1]:
27 | return editDistance(str1, str2, m-1, n-1)
28 |
29 | return 1 + min(editDistance(str1, str2, m, n-1), # Insert
30 | editDistance(str1, str2, m-1, n), # Remove
31 | editDistance(str1, str2, m-1, n-1) # Replace
32 | )
33 |
34 | # Dynamic Programming based
35 | def editDistDP(str1, str2, m, n):
36 | # Create a table to store results of subproblems
37 | dp = [[0 for x in range(n + 1)] for x in range(m + 1)]
38 |
39 | # Fill d[][] in bottom up manner
40 | for i in range(m + 1):
41 | for j in range(n + 1):
42 |
43 | # If first string is empty, only option is to
44 | # insert all characters of second string
45 | if i == 0:
46 | dp[i][j] = j # Min. operations = j
47 |
48 | # If second string is empty, only option is to
49 |             # remove all characters of first string
50 | elif j == 0:
51 | dp[i][j] = i # Min. operations = i
52 |
53 | # If last characters are same, ignore last char
54 | # and recur for remaining string
55 | elif str1[i-1] == str2[j-1]:
56 | dp[i][j] = dp[i-1][j-1]
57 |
58 |             # If last characters are different, consider all
59 | # possibilities and find minimum
60 | else:
61 | dp[i][j] = 1 + min(dp[i][j-1], # Insert
62 | dp[i-1][j], # Remove
63 | dp[i-1][j-1]) # Replace
64 |
65 | return dp[m][n]
66 |
67 |
68 | # Driver code
69 | # str1 = "sunday"
70 | # str2 = "saturday"
71 |
72 | # print(editDistDP(str1, str2, len(str1), len(str2)))
73 | # This code is contributed by Bhavya Jain
74 |
75 | df = pd.read_csv('./Dataset/corpus.csv')
76 | # df
77 |
78 | train_df, test_df = train_test_split(df, test_size=.15)
79 | train_df, valid_df = train_test_split(train_df, test_size=.05)
80 |
81 | # len(train_df), len(valid_df), len(test_df)
82 |
83 | erroneous_words = []
84 | actual_words = []
85 | calculated_words = []
86 |
87 | for i in tqdm(range(10000)):
88 | word = valid_df['Error'].values[i]
89 | # print(word)
90 |
91 | x = len(word)
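91 |     # shrink the prefix until at least one training word starts with it; those candidates are then ranked by edit distance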
92 | while True:
93 | temp_df = train_df['Word'].str.startswith(word[:x], na = False)
94 | temp_df = train_df[temp_df]
95 | if len(temp_df) != 0:
96 | break
97 | x -= 1
98 |
99 | if len(temp_df) > 100:
100 | temp_df = temp_df.sample(100)
101 |
102 | # print(temp_df)
103 |
104 | scores = []
105 | for temp_word in temp_df['Word'].values:
106 | # score = editDistance(word, temp_word, len(word), len(temp_word))
107 | score = editDistDP(word, temp_word, len(word), len(temp_word))
108 | scores.append(score)
109 |
110 | temp_df['Scores'] = scores
111 | temp_df = temp_df.sort_values(by=['Scores'], ascending=True)
112 |
113 | calculated = temp_df.iloc[0, 0]
114 |
115 | act_word = valid_df['Word'].values[i]
116 |
117 | erroneous_words.append(word)
118 | calculated_words.append(calculated)
119 | actual_words.append(act_word)
120 |
121 | if i % 100 == 0 and i > 0:
122 | x = pd.DataFrame({
123 | 'Error': erroneous_words,
124 | 'Actual': actual_words,
125 | 'Calculated': calculated_words
126 | })
127 | x.to_csv('./Dataset/ed_output.csv', index=False)
128 |
129 |
130 | # print(word, calculated)
131 | print(f"\n erroneous: {word}\n actual: {act_word}\n calculated: {calculated}")
132 |
133 | words = []
134 | for i in tqdm(range(len(df))):
135 | if df.iloc[i, 1] not in x['Error'].values:
136 | words.append(df.iloc[i, 0])
137 |
138 | # x = pd.DataFrame({
139 | # 'Error': erroneous_words,
140 | # 'Actual': actual_words,
141 | # 'Calculated': calculated_words
142 | # })
143 |
144 | acc_flags = []
145 | for i in range(len(x)):
146 | if x.iloc[i, 1] == x.iloc[i, -1]:
147 | acc_flags.append(1)
148 | else:
149 | acc_flags.append(0)
150 | x['EM'] = acc_flags
151 |
152 | train_df = df
153 | mod_acc_flags = []
154 | for pred in x['Calculated'].values:
155 | if pred in words:
156 | mod_acc_flags.append(1)
157 | else:
158 | mod_acc_flags.append(0)
159 | x['MA'] = mod_acc_flags
160 |
161 | y_true = np.array(x['Actual'].values)
162 | y_pred = np.array(x['Calculated'].values)
163 |
164 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
165 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
166 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
167 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
168 | ACC = metrics.accuracy_score(y_true, y_pred)
169 |
170 | print(f'Accuracy = {ACC*100:.2f}%')
171 | print(f'Precision = {PR:.4f}')
172 | print(f'Recall = {RE:.4f}')
173 | print(f'F1 Score = {F1:.4f}')
174 | print(f'F0.5 Score = {F05:.4f}')
175 |
176 |
177 |
178 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/models.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class Encoder(nn.Module):
7 | def __init__(self, input_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, device, max_length=50):
8 | super().__init__()
9 | assert kernel_size % 2 == 1, "Kernel size should be odd in encoder"
10 | self.device = device
11 | self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
12 | self.tok_embedding = nn.Embedding(input_dim, emb_dim)
13 | self.pos_embedding = nn.Embedding(max_length, emb_dim)
14 | self.emb2hid = nn.Linear(emb_dim, hid_dim)
15 | self.hid2emb = nn.Linear(hid_dim, emb_dim)
16 | self.convs = nn.ModuleList([
17 | nn.Conv1d(
18 |                 in_channels=hid_dim, out_channels=2 * hid_dim, kernel_size=kernel_size, padding=(kernel_size - 1) // 2
19 | ) for _ in range(n_layers)
20 | ])
21 | self.dropout = nn.Dropout(dropout)
22 |
23 | def forward(self, src):
24 | batch_size = src.shape[0]
25 | src_len = src.shape[1]
26 | pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
27 | tok_embedded = self.tok_embedding(src)
28 | pos_embedded = self.pos_embedding(pos)
29 | embedded = self.dropout(tok_embedded + pos_embedded)
30 | conv_inp = self.emb2hid(embedded)
31 | conv_inp = conv_inp.permute(0, 2, 1)
32 |
33 | for idx, conv in enumerate(self.convs):
34 | conved = conv(self.dropout(conv_inp))
35 | conved = F.glu(conved, dim=1)
36 | conved = (conved + conv_inp) * self.scale
37 | conv_inp = conved
38 |
39 | conved = self.hid2emb(conved.permute(0, 2, 1))
40 | combined = (conved + embedded) * self.scale
41 | return conved, combined
42 |
43 |
44 | class Decoder(nn.Module):
45 | def __init__(self, output_dim, emb_dim, hid_dim, n_layers, kernel_size, dropout, \
46 | trg_pad_idx, device, max_length=50):
47 | super().__init__()
48 | self.kernel_size = kernel_size
49 | self.trg_pad_idx = trg_pad_idx
50 | self.device = device
51 | self.scale = torch.sqrt(torch.FloatTensor([0.5])).to(device)
52 |
53 | self.tok_embedding = nn.Embedding(output_dim, emb_dim)
54 | self.pos_embedding = nn.Embedding(max_length, emb_dim)
55 | self.emb2hid = nn.Linear(emb_dim, hid_dim)
56 | self.hid2emb = nn.Linear(hid_dim, emb_dim)
57 |
58 | self.attn_hid2emb = nn.Linear(hid_dim, emb_dim)
59 | self.attn_emb2hid = nn.Linear(emb_dim, hid_dim)
60 |
61 | self.fc_out = nn.Linear(emb_dim, output_dim)
62 | self.convs = nn.ModuleList([
63 | nn.Conv1d(
64 | in_channels=hid_dim, out_channels=2 * hid_dim, kernel_size=kernel_size
65 | ) for _ in range(n_layers)
66 | ])
67 | self.dropout = nn.Dropout(dropout)
68 |
69 | def calculate_attention(self, embedded, conved, encoder_conved, encoder_combined):
70 | conved_emb = self.attn_hid2emb(conved.permute(0, 2, 1))
71 | combined = (conved_emb + embedded) * self.scale
72 | energy = torch.matmul(combined, encoder_conved.permute(0, 2, 1))
73 | attention = F.softmax(energy, dim=2)
74 | attended_encoding = torch.matmul(attention, encoder_combined)
75 | attended_encoding = self.attn_emb2hid(attended_encoding)
76 | attended_combined = (conved + attended_encoding.permute(0, 2, 1)) * self.scale
77 | return attention, attended_combined
78 |
79 | def forward(self, trg, encoder_conved, encoder_combined):
80 | batch_size = trg.shape[0]
81 | trg_len = trg.shape[1]
82 | pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)
83 |
84 | tok_embedded = self.tok_embedding(trg)
85 | pos_embedded = self.pos_embedding(pos)
86 | embedded = self.dropout(tok_embedded + pos_embedded)
87 |
88 | conv_inp = self.emb2hid(embedded)
89 | conv_inp = conv_inp.permute(0, 2, 1)
90 |
91 | batch_size = conv_inp.shape[0]
92 | hid_dim = conv_inp.shape[1]
93 | for idx, conv in enumerate(self.convs):
94 | conv_inp = self.dropout(conv_inp)
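94 |             # left-pad with the target pad index so each convolution only sees the current and previous target positions (causal decoding)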
95 | padding = torch.zeros(
96 | batch_size, hid_dim, self.kernel_size - 1
97 | ).fill_(self.trg_pad_idx).to(self.device)
98 | padded_conv_inp = torch.cat((padding, conv_inp), dim=2)
99 | conved = conv(padded_conv_inp)
100 | conved = F.glu(conved, dim=1)
101 |
102 | attention, conved = self.calculate_attention(
103 | embedded, conved, encoder_conved, encoder_combined
104 | )
105 | conved = (conved + conv_inp) * self.scale
106 | conv_inp = conved
107 |
108 | conved = self.hid2emb(conved.permute(0, 2, 1))
109 | output = self.fc_out(self.dropout(conved))
110 | return output, attention
111 |
112 |
113 | class Seq2Seq(nn.Module):
114 | def __init__(self, encoder, decoder):
115 | super().__init__()
116 | self.encoder = encoder
117 | self.decoder = decoder
118 |
119 | def forward(self, src, trg):
120 | encoder_conved, encoder_combined = self.encoder(src)
121 | output, attention = self.decoder(trg, encoder_conved, encoder_combined)
122 | return output, attention
123 |
124 | if __name__ == '__main__':
125 | pass
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/utils.py:
--------------------------------------------------------------------------------
1 | import torch, torch.nn as nn, torch.optim as optim
2 | import torch.nn.functional as F
3 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
4 | import random
5 | from tqdm import tqdm
6 | import pandas as pd
7 | from sklearn.model_selection import train_test_split
8 | import math
9 | import time
10 | # from torchtext.data.metrics import bleu_score
11 |
12 | import matplotlib.pyplot as plt
13 | import matplotlib.ticker as ticker
14 | import matplotlib.font_manager as fm
15 |
16 | import numpy as np
17 | import math
18 | import time
19 |
20 | import warnings as wrn
21 | wrn.filterwarnings('ignore')
22 |
23 |
24 | def word2chars(word):
25 | w2c = [char for char in word]
26 | return ' '.join(w2c)
27 |
28 |
29 | def df2train_test_dfs(df, test_size=0.15):
30 | df['Word'] = df['Word'].apply(word2chars)
31 | df['Error'] = df['Error'].apply(word2chars)
32 | df = df.sample(frac=1).reset_index(drop=True)
33 | df = df.iloc[:, [1, 0]]
34 | train_df, test_df = train_test_split(df, test_size=test_size)
35 | train_df.to_csv('./Dataset/train.csv', index=False)
36 | test_df.to_csv('./Dataset/test.csv', index=False)
37 |
38 |
39 | def df2train_valid_test_dfs(df, test_size=0.15):
40 | df['Word'] = df['Word'].apply(word2chars)
41 | df['Error'] = df['Error'].apply(word2chars)
42 | df = df.sample(frac=1).reset_index(drop=True)
43 | df = df.iloc[:, [1, 0]]
44 | train_df, test_df = train_test_split(df, test_size=test_size)
45 | train_df, valid_df = train_test_split(train_df, test_size=.05)
46 |
47 | train_df.to_csv('./Dataset/train.csv', index=False)
48 | valid_df.to_csv('./Dataset/valid.csv', index=False)
49 | test_df.to_csv('./Dataset/test.csv', index=False)
50 |
51 |
52 | def df2train_error_dfs(df, error='Cognitive Error', test_size=0.20):
53 | df['Word'] = df['Word'].apply(word2chars)
54 | df['Error'] = df['Error'].apply(word2chars)
55 | df = df.sample(frac=1).reset_index(drop=True)
56 | # df = df.iloc[:, [1, 0]]
57 | train_df, error_df = train_test_split(df, test_size=test_size)
58 | error_df = error_df.loc[error_df['ErrorType'] == error]
59 | train_df = train_df.iloc[:, [1, 0]]
60 | error_df = error_df.iloc[:, [1, 0]]
61 |
62 | train_df.to_csv('./Dataset/train.csv', index=False)
63 | error_df.to_csv('./Dataset/error.csv', index=False)
64 |
65 |
66 | def basic_tokenizer(text):
67 | return text.split()
68 |
69 |
70 | def init_weights(m):
71 | for name, param in m.named_parameters():
72 | if 'weight' in name:
73 | nn.init.normal_(param.data, mean=0, std=0.01)
74 | else:
75 | nn.init.constant_(param.data, 0)
76 |
77 |
78 | def count_parameters(model):
79 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
80 |
81 |
82 | def save_model(model, epoch, optimizer, train_loss, PATH):
83 | torch.save({
84 | 'epoch': epoch,
85 | 'model_state_dict': model.state_dict(),
86 | 'optimizer_state_dict': optimizer.state_dict(),
87 | 'loss': train_loss
88 | }, PATH)
89 | print(f"---------\nModel Saved at {PATH}\n---------\n")
90 |
91 |
92 | def load_model(model, optimizer, PATH):
93 | checkpoint = torch.load(PATH)
94 | model.load_state_dict(checkpoint['model_state_dict'])
95 | optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
96 | epoch = checkpoint['epoch']
97 | train_loss = checkpoint['loss']
98 | return checkpoint, epoch, train_loss
99 |
100 |
101 | def print_n_best(decoded_seq, itos):
102 | topk_preds = []
103 | for rank, seq in enumerate(decoded_seq):
104 | pred = "".join([itos[idx] for idx in seq[1:-1]])
105 | topk_preds.append(pred)
106 | # print(f'Out: Rank-{rank+1}: {pred}')
107 | return topk_preds
108 |
109 |
110 | def translate_sentence(sentence, src_field, trg_field, model, device, max_len=30):
111 | model.eval()
112 | tokens = [token for token in sentence]
113 |
114 | tokens = [src_field.init_token] + tokens + [src_field.eos_token]
115 |
116 | src_indexes = [src_field.vocab.stoi[token] for token in tokens]
117 | src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)
118 | src_len = torch.LongTensor([len(src_indexes)])
119 |
120 | with torch.no_grad():
121 | encoder_outputs, hidden = model.encoder(src_tensor, src_len)
122 |
123 | mask = model.create_mask(src_tensor)
124 |
125 | trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
126 | attentions = torch.zeros(max_len, 1, len(src_indexes)).to(device)
127 |
128 | for i in range(max_len):
129 | trg_tensor = torch.LongTensor([trg_indexes[-1]]).to(device)
130 | with torch.no_grad():
131 | output, hidden, attention = model.decoder(trg_tensor, hidden, encoder_outputs, mask)
132 |
133 | attentions[i] = attention
134 |
135 | pred_token = output.argmax(1).item()
136 | trg_indexes.append(pred_token)
137 | if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
138 | break
139 |
140 | trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]
141 | return trg_tokens[1:], attentions[:len(trg_tokens) - 1]
142 |
143 |
144 | def display_attention(sentence, translation, attention):
145 | prop = fm.FontProperties(fname='./Dataset/kalpurush.ttf')
146 |
147 | fig = plt.figure(figsize=(7, 10))
148 | ax = fig.add_subplot(111)
149 |
150 | attention = attention.squeeze(1).cpu().detach().numpy()
151 |
152 | cax = ax.matshow(attention, cmap='bone')
153 |
154 | ax.tick_params(labelsize=15)
155 |
156 | x_ticks = [''] + [''] + [t.lower() for t in sentence] + ['']
157 | y_ticks = [''] + translation
158 |
159 | ax.set_xticklabels(x_ticks, rotation=0, fontproperties=prop, fontsize=20)
160 | ax.set_yticklabels(y_ticks, fontproperties=prop, fontsize=20)
161 |
162 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
163 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
164 |
165 | plt.show()
166 | plt.close()
167 |
--------------------------------------------------------------------------------
/Baselines/GRUSeq2Seq/errors.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | df2train_test_dfs, basic_tokenizer, init_weights, count_parameters,
3 | translate_sentence, display_attention, df2train_valid_test_dfs,
4 | save_model, load_model, df2train_error_dfs, word2chars
5 | )
6 | from models import Encoder, Decoder, Attention, Seq2Seq
7 | from pipeline import train, test_accuracy
8 | from inference import test_beam, test_greedy
9 | from focalLoss import FocalLoss
10 |
11 | import torch, torch.nn as nn, torch.optim as optim
12 | import torch.nn.functional as F
13 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
14 | import random
15 | from tqdm import tqdm
16 | import pandas as pd
17 | from sklearn.model_selection import train_test_split
18 | import math
19 | import time
20 | # from torchtext.data.metrics import bleu_score
21 |
22 | import matplotlib.pyplot as plt
23 | import matplotlib.ticker as ticker
24 | import matplotlib.font_manager as fm
25 |
26 | import numpy as np
27 | import math
28 | import time
29 | import sys
30 |
31 | import warnings as wrn
32 | wrn.filterwarnings('ignore')
33 |
34 |
35 | def error_df(df, error='Cognitive Error'):
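35 |     # keep only rows of the requested error type, split words into space-separated characters, and write them to error.csv for per-error-type evaluation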
36 | df = df.loc[df['ErrorType'] == error]
37 | df['Word'] = df['Word'].apply(word2chars)
38 | df['Error'] = df['Error'].apply(word2chars)
39 | df = df.sample(frac=1).reset_index(drop=True)
40 | df = df.iloc[:, [1, 0]]
41 | df.to_csv('./Dataset/error.csv', index=False)
42 |
43 |
44 | def check_error():
45 | df = pd.read_csv('./Dataset/sec_dataset_II.csv')
46 | df = df.iloc[:, :]
47 | # df2train_test_dfs(df=df, test_size=0.15)
48 | df2train_valid_test_dfs(df=df, test_size=0.15)
49 |
50 | # ['Cognitive Error', 'Homonym Error', 'Run-on Error',
51 | # 'Split-word Error (Left)', 'Split-word Error (Random)',
52 | # 'Split-word Error (Right)', 'Split-word Error (both)',
53 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition',
54 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition',
55 | # 'Visual Error', 'Visual Error (Combined Character)']
56 | error_name = 'Cognitive Error'
57 | error_df(df, error_name)
58 | # df2train_error_dfs(df, error='Cognitive Error')
59 | # sys.exit()
60 |
61 | SRC = Field(
62 | tokenize=basic_tokenizer, lower=False,
63 | init_token='', eos_token='',
64 | sequential=True, use_vocab=True, include_lengths=True
65 | )
66 | TRG = Field(
67 | tokenize=basic_tokenizer, lower=False,
68 | init_token='', eos_token='',
69 | sequential=True, use_vocab=True
70 | )
71 | fields = {
72 | 'Error': ('src', SRC),
73 | 'Word': ('trg', TRG)
74 | }
75 | train_data, valid_data, test_data = TabularDataset.splits(
76 | path='./Dataset',
77 | train='train.csv',
78 | validation='valid.csv',
79 | test='test.csv',
80 | format='csv',
81 | fields=fields
82 | )
83 | error_data, _ = TabularDataset.splits(
84 | path='./Dataset',
85 | train='error.csv',
86 | test='error.csv',
87 | format='csv',
88 | fields=fields
89 | )
90 |
91 | # print(error_data)
92 | # sys.exit()
93 |
94 | SRC.build_vocab(train_data, max_size=64, min_freq=100)
95 | TRG.build_vocab(train_data, max_size=64, min_freq=75)
96 | # print(len(SRC.vocab), len(TRG.vocab))
97 |
98 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
99 | BATCH_SIZE = 256
100 | INPUT_DIM = len(SRC.vocab)
101 | OUTPUT_DIM = len(TRG.vocab)
102 | ENC_EMB_DIM = 64
103 | DEC_EMB_DIM = 64
104 | ENC_HIDDEN_DIM = 256
105 | DEC_HIDDEN_DIM = 512
106 | ENC_DROPOUT = 0.25
107 | DEC_DROPOUT = 0.25
108 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
109 | MAX_LEN = 32
110 | N_EPOCHS = 10
111 | CLIP = 1
112 | PATH = ''
113 |
114 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
115 | (train_data, valid_data, test_data),
116 | batch_size=BATCH_SIZE,
117 | sort_within_batch=True,
118 | sort_key=lambda x: len(x.src),
119 | device=DEVICE
120 | )
121 | error_iterator, _ = BucketIterator.splits(
122 | (error_data, error_data),
123 | batch_size=BATCH_SIZE,
124 | sort_within_batch=True,
125 | sort_key=lambda x: len(x.src),
126 | device=DEVICE
127 | )
128 |
129 | attention = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM)
130 | encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, ENC_DROPOUT)
131 | decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DEC_DROPOUT, attention)
132 |
133 | model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, DEVICE).to(DEVICE)
134 | model.apply(init_weights)
135 | # print(model)
136 | # print(f'The model has {count_parameters(model):,} trainable parameters')
137 |
138 | optimizer = optim.Adam(model.parameters())
139 | # scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=4)
140 |
141 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
142 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
143 | # criterion = nn.NLLLoss(ignore_index=TRG_PAD_IDX)
144 | # criterion = FocalLoss(alpha=0.5, gamma=2.0, reduction='mean')
145 |
146 | PATH = './Checkpoints/spell_s2s.pth'
147 | # best_loss = 1e10
148 |
149 | checkpoint, epoch, train_loss = load_model(model, optimizer, PATH)
150 | # best_loss = train_loss
151 | error_df_ = pd.read_csv('./Dataset/error.csv')
152 | error_pct = (len(error_df_) / len(df)) * 100
153 |
154 | print(f"\n------------\nError Name: {error_name} - {error_pct:.2f}% of dataset\n------------")
155 | test_accuracy(error_data, SRC, TRG, model, DEVICE)
156 |
157 |
158 | # test_beam(model, train_data, test_data, SRC, TRG, DEVICE)
159 | # test_greedy(test_data, SRC, TRG, model, DEVICE)
160 |
161 | # example_idx = 1
162 | # src = vars(train_data.examples[example_idx])['src']
163 | # trg = vars(train_data.examples[example_idx])['trg']
164 | # print(f'src = {src}')
165 | # print(f'trg = {trg}')
166 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
167 | # print(f'predicted trg = {translation}')
168 | # display_attention(src, translation, attention)
169 |
170 |
171 | if __name__ == '__main__':
172 | check_error()
173 |
--------------------------------------------------------------------------------
/Baselines/ConvSeq2Seq/main.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | basic_tokenizer, word2char, count_parameters, translate_sentence,
3 | save_model, load_model
4 | )
5 | from errors import error_df
6 | from models import Encoder, Decoder, Seq2Seq
7 | from pipeline import train, evaluate
8 | from metrics import evaluation_report
9 |
10 | import torch
11 | import torch.optim as optim
12 | import torch.nn as nn
13 | import pandas as pd
14 | from sklearn.model_selection import train_test_split
15 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
16 | import os
17 | import argparse
18 |
19 | import warnings as wrn
20 | wrn.filterwarnings('ignore')
21 |
22 |
23 | def main():
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus2.csv",
26 | choices=["./Dataset/corpus.csv", "./Dataset/corpus2.csv"]
27 | )
28 | parser.add_argument("--EMB_DIM", help="Embedding Dimension", type=int, default=128, choices=[64, 128, 256])
29 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=256, choices=[64, 128, 256])
30 |     parser.add_argument("--ENC_LAYERS", help="Encoder Layers", type=int, default=5, choices=[5, 10, 20])
31 |     parser.add_argument("--DEC_LAYERS", help="Decoder Layers", type=int, default=5, choices=[5, 10, 20])
32 | parser.add_argument("--ENC_KERNEL_SIZE", help="Encoder Kernel Size", type=int, default=3, choices=[3, 5, 10])
33 | parser.add_argument("--DEC_KERNEL_SIZE", help="Decoder Kernel Size", type=int, default=3, choices=[3, 5, 10])
34 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=.2, choices=[.1, .2, .5])
35 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=.2, choices=[.1, .2, .5])
36 | parser.add_argument("--CLIP", help="Gradient Clipping", type=float, default=0.1, choices=[0.1, 0.2, 0.5, 1])
37 | parser.add_argument("--BATCH_SIZE", help="Batch Size", type=int, default=256, choices=[256, 512])
38 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
39 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
40 | args = parser.parse_args()
41 |
42 | df = pd.read_csv(args.CORPUS)
43 | df['Word'] = df['Word'].apply(word2char)
44 | df['Error'] = df['Error'].apply(word2char)
45 | df = df.sample(frac=1).reset_index(drop=True)
46 | df = df[['Error', 'Word']]
47 |
48 | train_df, test_df = train_test_split(df, test_size=.15)
49 | train_df, valid_df = train_test_split(train_df, test_size=.05)
50 |
51 | train_df.to_csv('./Dataset/train.csv', index=False)
52 | valid_df.to_csv('./Dataset/valid.csv', index=False)
53 | test_df.to_csv('./Dataset/test.csv', index=False)
54 |
55 | SRC = Field(
56 | tokenize=basic_tokenizer, lower=False,
57 | init_token='', eos_token='', batch_first=True
58 | )
59 | TRG = Field(
60 | tokenize=basic_tokenizer, lower=False,
61 | init_token='', eos_token='', batch_first=True
62 | )
63 | fields = {
64 | 'Error': ('src', SRC),
65 | 'Word': ('trg', TRG)
66 | }
67 |
68 | train_data, valid_data, test_data = TabularDataset.splits(
69 | path='./Dataset',
70 | train='train.csv',
71 | validation='valid.csv',
72 | test='test.csv',
73 | format='csv',
74 | fields=fields
75 | )
76 |
77 | SRC.build_vocab(train_data, min_freq=100)
78 | TRG.build_vocab(train_data, min_freq=50)
79 |
80 | # Hyperparameters
81 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
82 | BATCH_SIZE = args.BATCH_SIZE
83 | #
84 | INPUT_DIM = len(SRC.vocab)
85 | OUTPUT_DIM = len(TRG.vocab)
86 | EMB_DIM = args.EMB_DIM # 64
87 | HID_DIM = args.HID_DIM # 256 # each conv. layer has 2 * hid_dim filters
88 | ENC_LAYERS = args.ENC_LAYERS # 10 # number of conv. blocks in encoder
89 | DEC_LAYERS = args.DEC_LAYERS # 10 # number of conv. blocks in decoder
90 | ENC_KERNEL_SIZE = args.ENC_KERNEL_SIZE # must be odd!
91 | DEC_KERNEL_SIZE = args.DEC_KERNEL_SIZE # can be even or odd
92 | ENC_DROPOUT = args.ENC_DROPOUT
93 | DEC_DROPOUT = args.DEC_DROPOUT
94 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
95 |     CLIP = args.CLIP
95 |     N_EPOCHS = args.N_EPOCHS  # number of training epochs for the loop below
96 | PATH = './Checkpoints/conv_s2s.pth'
97 |
98 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
99 | (train_data, valid_data, test_data),
100 | batch_size=BATCH_SIZE,
101 | sort_within_batch=True,
102 | sort_key=lambda x: len(x.src),
103 | device=DEVICE
104 | )
105 |
106 | enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, DEVICE)
107 | dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, TRG_PAD_IDX, DEVICE)
108 | model = Seq2Seq(enc, dec).to(DEVICE)
109 | # print(f'The model has {count_parameters(model):,} trainable parameters')
110 |
111 | optimizer = optim.Adam(model.parameters())
112 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
113 |
114 | epoch = 1
115 | # load the model
116 | if os.path.exists(PATH):
117 | checkpoint, epoch, train_loss = load_model(model, PATH)
118 | #
119 | best_loss = 1e10
120 |
121 | for epoch in range(epoch, N_EPOCHS):
122 | print(f"Epoch: {epoch} / {N_EPOCHS}")
123 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
124 | print(f"Train Loss: {train_loss:.4f}")
125 | if train_loss < best_loss:
126 | best_loss = train_loss
127 | save_model(model, train_loss, epoch, PATH)
128 |
129 | # example_idx = 10
130 | # src = vars(train_data.examples[example_idx])['src']
131 | # trg = vars(train_data.examples[example_idx])['trg']
132 | # print(f'src = {src}')
133 | # print(f'trg = {trg}')
134 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
135 | # print(f'predicted trg = {translation}')
136 |
137 | evaluation_report(valid_data, SRC, TRG, model, DEVICE)
138 | # evaluation_report(error_data, SRC, TRG, model, DEVICE)
139 |
140 |
141 | # -------------
142 | # error_types = ['Cognitive Error', 'Homonym Error', 'Run-on Error',
143 | # 'Split-word Error (Left)', 'Split-word Error (Random)',
144 | # 'Split-word Error (Right)', 'Split-word Error (both)',
145 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition',
146 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition',
147 | # 'Visual Error', 'Visual Error (Combined Character)']
148 |
149 | # for error_name in error_types:
150 | # print(f'------\nError Type: {error_name}\n------')
151 | # error_df(df_copy, error_name)
152 |
153 | # error_data, _ = TabularDataset.splits(
154 | # path='./Dataset',
155 | # train='error.csv',
156 | # test='error.csv',
157 | # format='csv',
158 | # fields=fields
159 | # )
160 |
161 | # eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE)
162 |
163 | # error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
164 | # eval_df.to_csv(f'./Dataframes/convs2s_{error_name}_2.csv')
165 | # print('\n\n')
166 | # -------------
167 |
168 |
169 | if __name__ == '__main__':
170 | main()
171 |
--------------------------------------------------------------------------------
/corrector.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | word2char, basic_tokenizer, count_parameters, initialize_weights,
3 | save_model, load_model, error_df, train_valid_test_df, mask2str,
4 | error_blank, find_len, error_df_2
5 | )
6 | from transformer import (
7 | Encoder, EncoderLayer, MultiHeadAttentionLayer,
8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer,
9 | Seq2Seq
10 | )
11 | from pipeline import train, evaluate
12 | from metrics import evaluation_report
13 |
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
17 | import torch
18 | import torch.nn as nn
19 | import os
20 | import gc
21 | from tqdm import tqdm
22 | import sys
23 | import argparse
24 |
25 | import warnings as wrn
26 | wrn.filterwarnings('ignore')
27 |
29 | os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
30 |
31 |
32 |
33 | def main():
34 | parser = argparse.ArgumentParser()
35 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256])
36 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7])
37 |     parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int, default=3, choices=[3, 5, 7])
38 |     parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heads", type=int, default=8, choices=[4, 6, 8])
39 |     parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heads", type=int, default=8, choices=[4, 6, 8])
40 |     parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
41 |     parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
42 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
43 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
44 |     parser.add_argument("--CLIP", help="Gradient Clipping Threshold", type=float, default=1, choices=[.1, 1, 10])
45 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
46 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
47 | args = parser.parse_args()
48 |
49 | SEED = 1234
50 | torch.manual_seed(SEED)
51 | torch.cuda.manual_seed(SEED)
52 |
53 | df = pd.read_csv('./Dataset/purificator_preds.csv')
54 | df_copy = df.copy()
55 | df['Word'] = df['Word'].apply(word2char)
56 | df['Error'] = df['Error'].apply(word2char)
57 | df['ErrorBlanksActual'] = df['ErrorBlanksActual'].apply(word2char)
58 | df['ErrorBlanksPredD1'] = df['ErrorBlanksPredD1'].apply(word2char)
59 | df['ErrorBlanksPredD2'] = df['ErrorBlanksPredD2'].apply(word2char)
60 |
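60 |     # build the corrector's source sequence: the erroneous word concatenated with the upstream ErrorBlanksPredD2 mask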
61 | df['MaskErrorBlank'] = ' ' + df['Error'] + ' ' + df['ErrorBlanksPredD2'] + ' '
62 | df['Length'] = df['MaskErrorBlank'].apply(find_len)
63 |
64 | df = df.loc[df['Length'] <= 48] # 48 works
65 |
66 | # df = df.iloc[:, [1, -2, 8]] # word - maskerrorblank - errortype
67 | df = df[['Word', 'MaskErrorBlank', 'ErrorType']]
68 |
69 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=.15, valid_size=.05)
70 |
71 | train_df.to_csv('./Dataset/train.csv', index=False)
72 | valid_df.to_csv('./Dataset/valid.csv', index=False)
73 | test_df.to_csv('./Dataset/test.csv', index=False)
74 |
75 | SRC = Field(
76 | tokenize=basic_tokenizer, lower=False,
77 | init_token='', eos_token='', batch_first=True
78 | )
79 | TRG = Field(
80 | tokenize=basic_tokenizer, lower=False,
81 | init_token='', eos_token='', batch_first=True
82 | )
83 | fields = {
84 | 'MaskErrorBlank': ('src', SRC),
85 | 'Word': ('trg', TRG)
86 | }
87 |
88 | train_data, valid_data, test_data = TabularDataset.splits(
89 | path='./Dataset',
90 | train='train.csv',
91 | validation='valid.csv',
92 | test='test.csv',
93 | format='csv',
94 | fields=fields
95 | )
96 |
97 | SRC.build_vocab(train_data, min_freq=100)
98 | TRG.build_vocab(train_data, min_freq=50)
99 |
100 |
101 | # ------------------------------
102 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
103 | BATCH_SIZE = 512 # 512
104 | # ------------------------------
105 | INPUT_DIM = len(SRC.vocab)
106 | OUTPUT_DIM = len(TRG.vocab)
107 | # ------------------------------
108 | HID_DIM = int(args.HID_DIM)
109 | ENC_LAYERS = int(args.ENC_LAYERS)
110 | DEC_LAYERS = int(args.DEC_LAYERS)
111 | ENC_HEADS = int(args.ENC_HEADS)
112 | DEC_HEADS = int(args.DEC_HEADS)
113 | ENC_PF_DIM = int(args.ENC_PF_DIM)
114 | DEC_PF_DIM = int(args.DEC_PF_DIM)
115 | ENC_DROPOUT = float(args.ENC_DROPOUT)
116 | DEC_DROPOUT = float(args.DEC_DROPOUT)
117 | CLIP = float(args.CLIP)
118 | N_EPOCHS = int(args.N_EPOCHS)
119 | LEARNING_RATE = float(args.LEARNING_RATE)
120 | # ------------------------------
121 | PATH = './Checkpoints/corrector.pth'
122 | # ------------------------------
123 | gc.collect()
124 | torch.cuda.empty_cache()
125 | # -----------------------------
126 |
127 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
128 | (train_data, valid_data, test_data),
129 | batch_size=BATCH_SIZE,
130 | sort_within_batch=True,
131 | sort_key=lambda x: len(x.src),
132 | device=DEVICE
133 | )
134 |
135 | enc = Encoder(
136 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
137 | ENC_DROPOUT, DEVICE
138 | )
139 | dec = Decoder(
140 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
141 | DEC_DROPOUT, DEVICE
142 | )
143 |
144 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
145 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
146 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)
147 | model.apply(initialize_weights)
148 | # print(f'The model has {count_parameters(model):,} trainable parameters')
149 |
150 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
151 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
152 | # criterion = nn.BCEWithLogitsLoss()
153 |
154 | epoch = 1
155 | best_loss = 1e10
156 | if os.path.exists(PATH):
157 | checkpoint, epoch, train_loss = load_model(model, PATH)
158 | best_loss = train_loss
159 |
160 | # model.resize_token_embeddings(len(TRG.vocab))
161 | for epoch in range(epoch, N_EPOCHS):
162 | print(f"Epoch: {epoch} / {N_EPOCHS}")
163 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
164 | print(f"Train Loss: {train_loss:.4f}")
165 | if train_loss < best_loss:
166 | best_loss = train_loss
167 | save_model(model, train_loss, epoch, PATH)
168 |
169 | # ---------------------
170 | error_types = sorted(list(set(df.iloc[:, -1].values)))
171 |
172 | for error_name in error_types:
173 | print(f'------\nError Type: {error_name}\n------')
174 | error_df_2(df, error_name)
175 |
176 | error_data, _ = TabularDataset.splits(
177 | path='./Dataset',
178 | train='error.csv',
179 | test='error.csv',
180 | format='csv',
181 | fields=fields
182 | )
183 |
184 | eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE)
185 |
186 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
187 | print('\n\n')
188 | # ---------------------
189 |
190 |
191 | if __name__ == '__main__':
192 | main()
193 |
--------------------------------------------------------------------------------
/Baselines/DCSpell/corrector.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | word2char, basic_tokenizer, count_parameters, initialize_weights,
3 | save_model, load_model, error_df, train_valid_test_df, mask2str,
4 | error_df_2, error_df_3, find_len, train_valid_test_df2, merge_dfs
5 | )
6 | from transformer import (
7 | Encoder, EncoderLayer, MultiHeadAttentionLayer,
8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer,
9 | Seq2Seq
10 | )
11 | from pipeline import train, evaluate
12 | from metrics import evaluation_report, evaluation_report2, evaluation_report3
13 |
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
17 | import torch
18 | import torch.nn as nn
19 | import os
20 | import gc
21 | import argparse
22 | import sys
23 |
24 | import warnings as wrn
25 | wrn.filterwarnings('ignore')
26 |
27 |
28 | def main():
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256])
31 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7])
32 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7])
33 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
34 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
35 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
36 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256])
37 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
38 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
39 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10])
40 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
41 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
42 | args = parser.parse_args()
43 |
44 | SEED = 1234
45 | torch.manual_seed(SEED)
46 | torch.cuda.manual_seed(SEED)
47 |
48 | df = pd.read_csv('./Dataset/detector_preds.csv')
49 | df['Error'] = df['Error'].apply(word2char)
50 | df['Word'] = df['Word'].apply(word2char)
51 | df['ErrorBlanksPredD1'] = df['ErrorBlanksPredD1'].apply(word2char)
52 | df['ErrorBlanksActual'] = df['ErrorBlanksActual'].apply(word2char)
53 |
54 | df['MaskErrorBlank'] = ' ' + df['Error'] + ' ' + df['ErrorBlanksPredD1'] + ' '
55 | df['Length'] = df['MaskErrorBlank'].apply(find_len)
56 | df = df.loc[df['Length'] <= 48] # 48 works
57 |
58 | df = df.sample(frac=1).reset_index(drop=True)
59 | df = df[['ErrorBlanksActual', 'MaskErrorBlank', 'ErrorType']]
60 |
61 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05)
62 |
63 | train_df.to_csv('./Dataset/train.csv', index=False)
64 | valid_df.to_csv('./Dataset/valid.csv', index=False)
65 | test_df.to_csv('./Dataset/test.csv', index=False)
66 |
67 | SRC = Field(
68 | tokenize=basic_tokenizer, lower=False,
69 |         init_token='<sos>', eos_token='<eos>', batch_first=True
70 | )
71 | TRG = Field(
72 | tokenize=basic_tokenizer, lower=False,
73 |         init_token='<sos>', eos_token='<eos>', batch_first=True
74 | )
75 | WORD = Field(
76 | tokenize=basic_tokenizer, lower=False,
77 |         init_token='<sos>', eos_token='<eos>', batch_first=True
78 | )
79 | fields = {
80 | 'ErrorBlanksPredD1': ('src', SRC),
81 | 'Word': ('trg', TRG)
82 | }
83 |
84 | train_data, valid_data, test_data = TabularDataset.splits(
85 | path='./Dataset',
86 | train='train.csv',
87 | validation='valid.csv',
88 | test='test.csv',
89 | format='csv',
90 | fields=fields
91 | )
92 |
93 | SRC.build_vocab(train_data, min_freq=100) # 100
94 | TRG.build_vocab(train_data, min_freq=50) # 50
95 |
96 | # ------------------------------
97 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
98 | BATCH_SIZE = 512 # 512
99 | # ------------------------------
100 | INPUT_DIM = len(SRC.vocab)
101 | OUTPUT_DIM = len(TRG.vocab)
102 | # ------------------------------
103 | HID_DIM = int(args.HID_DIM)
104 | ENC_LAYERS = int(args.ENC_LAYERS)
105 | DEC_LAYERS = int(args.DEC_LAYERS)
106 | ENC_HEADS = int(args.ENC_HEADS)
107 | DEC_HEADS = int(args.DEC_HEADS)
108 | ENC_PF_DIM = int(args.ENC_PF_DIM)
109 | DEC_PF_DIM = int(args.DEC_PF_DIM)
110 | ENC_DROPOUT = float(args.ENC_DROPOUT)
111 | DEC_DROPOUT = float(args.DEC_DROPOUT)
112 | CLIP = float(args.CLIP)
113 | N_EPOCHS = int(args.N_EPOCHS)
114 | LEARNING_RATE = float(args.LEARNING_RATE)
115 | # ------------------------------
116 | PATH = './Checkpoints/corrector.pth'
117 | # ------------------------------
118 | gc.collect()
119 | torch.cuda.empty_cache()
120 | # -----------------------------
121 |
122 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
123 | (train_data, valid_data, test_data),
124 | batch_size=BATCH_SIZE,
125 | sort_within_batch=True,
126 | sort_key=lambda x: len(x.src),
127 | device=DEVICE
128 | )
129 |
130 | enc = Encoder(
131 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
132 | ENC_DROPOUT, DEVICE
133 | )
134 | dec = Decoder(
135 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
136 | DEC_DROPOUT, DEVICE
137 | )
138 |
139 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
140 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
141 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)
142 | model.apply(initialize_weights)
143 | # print(f'The model has {count_parameters(model):,} trainable parameters')
144 |
145 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
146 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
147 | # criterion = nn.BCEWithLogitsLoss()
148 |
149 | epoch = 1
150 | best_loss = 1e10
151 | if os.path.exists(PATH):
152 | checkpoint, epoch, train_loss = load_model(model, PATH)
153 | best_loss = train_loss
154 |
155 | # model.resize_token_embeddings(len(TRG.vocab))
156 | for epoch in range(epoch, N_EPOCHS):
157 | print(f"Epoch: {epoch} / {N_EPOCHS}")
158 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
159 | print(f"Train Loss: {train_loss:.4f}")
160 | if train_loss < best_loss:
161 | best_loss = train_loss
162 | save_model(model, train_loss, epoch, PATH)
163 |
164 | # ---------------------
165 | error_types = sorted(list(set(df.iloc[:, -1].values)))
166 |
167 | for error_name in error_types:
168 | print(f'------\nError Type: {error_name}\n------')
169 | error_df_3(df, error_name)
170 |
171 | error_data, _ = TabularDataset.splits(
172 | path='./Dataset',
173 | train='error.csv',
174 | test='error.csv',
175 | format='csv',
176 | fields=fields
177 | )
178 |
179 | eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE)
180 |
181 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
182 | print('\n\n')
183 | # ---------------------
184 |
185 |
186 | if __name__ == '__main__':
187 | main()
188 |
--------------------------------------------------------------------------------
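A toy illustration (Latin placeholders stand in for Bangla characters; the values are hypothetical) of how corrector.py above assembles its source sequence: the character-spaced error word and the detector's blanked prediction are concatenated into MaskErrorBlank, and sequences longer than 48 tokens are dropped before training.

    from utils import word2char, find_len

    error = word2char('ABCD')       # 'A B C D'  (misspelled word, one token per character)
    blanks = 'A B   D'              # detector output with the erroneous position blanked out
    src = error + ' ' + blanks      # the MaskErrorBlank sequence fed to the SRC field
    print(find_len(src) <= 48)      # only sequences of at most 48 tokens are kept
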
/CorpusCreation/corpus_stats_valid.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.keys import Keys
3 | from selenium.webdriver.support import expected_conditions as EC
4 | from selenium.webdriver.common.by import By
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | import time
7 | import pandas as pd
8 | import re
9 | import sys
10 | import argparse
11 | from tqdm import tqdm
12 |
13 |
14 | # ########################################################
15 | def login():
16 | parser = argparse.ArgumentParser()
17 | parser.add_argument("--email", help="Enter Your Email")
18 | parser.add_argument("--password", help="Enter Your Facebook Password")
19 | args = parser.parse_args()
20 |
21 | # code to ignore browser notifications
22 | chrome_options = webdriver.ChromeOptions()
23 | prefs = {"profile.default_content_setting_values.notifications": 2}
24 | chrome_options.add_experimental_option("prefs", prefs)
25 | driver = webdriver.Chrome('./chromedriver.exe', chrome_options=chrome_options)
26 | # open the webpage
27 | driver.get("https://wwww.facebook.com/")
28 | # target username
29 | username = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='email']")))
30 | password = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='pass']")))
31 | # entering email as username
32 | username.clear()
33 | username.send_keys(args.email)
34 | # entering password
35 | password.clear()
36 | password.send_keys(args.password)
37 | # target the login button and click it
38 | time.sleep(5)
39 | button = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))).click()
40 | # We are logged in!
41 | print("Logged in")
42 | return driver
43 | # ########################################################
44 |
45 |
46 | # ########################################################
47 | def scrape_post_1():
48 | driver = login()
49 | # https://fb.watch/eN-nBOb45t/
50 | url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid02TjtvmwDs51fyVRaHbvM5XgxL1gBGb6USBYvsxgMdn8c4BcQvjbLv1BFCjw52UsXQl&id=111762869482599&eav=Afba2OolCuRXElnzf97xViXfIosR66LZPdko_Q9oxtd5fhvZMDjeKOC_JD1Nx2LKtEE&__tn__=%2AW&paipv=0"
51 | # url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid0eP3VufmYZQEdDrGybgzg9ganLPXRo9JXQ8q5pUjiaBF7gTQ9FnkJdw44PDfx11JKl&id=313147292549612&eav=AfbiujhhnbU2KOwEYD6oavgC5llyK5uWWqiecav3DYpPCCC4llyMqpaYY9rPUvap1z0&ref=sharing&__tn__=%2AW&paipv=0"
52 | while True:
53 | driver.get(url)
54 | comments = driver.find_element(By.CLASS_NAME, "ef").text
55 | comments = re.sub("[A-Za-z0-9·\\n]", "", comments)
56 | next_page = driver.find_elements(By.TAG_NAME, "a")[-1].get_attribute('href')
57 | if type(next_page) != str:
58 | break
59 | url = next_page
60 | time.sleep(5)
61 |         # sys.exit()  # debug exit; left commented out so the scraped comments below are actually written
62 | with open('./dfs/comments.txt', 'a', encoding='utf-8') as f:
63 | f.write(comments)
64 | f.write(' \n ')
65 | # ########################################################
66 |
67 |
68 | # ########################################################
69 | def scrape_post_2():
70 | driver = login()
71 | # https://fb.watch/eNQHYjDuA6/
72 | url = "https://mbasic.facebook.com/story.php?story_fbid=pfbid0eP3VufmYZQEdDrGybgzg9ganLPXRo9JXQ8q5pUjiaBF7gTQ9FnkJdw44PDfx11JKl&id=313147292549612&eav=AfbiujhhnbU2KOwEYD6oavgC5llyK5uWWqiecav3DYpPCCC4llyMqpaYY9rPUvap1z0&ref=sharing&__tn__=%2AW&paipv=0"
73 | while True:
74 | driver.get(url)
75 | comments = driver.find_elements(By.CLASS_NAME, "eb")
76 | for comment in comments:
77 | comment = comment.text
78 | comment = re.sub("[A-Za-z0-9·.\\n]", "", comment)
79 |             with open('./dfs/comments.txt', 'a', encoding='utf-8') as f:
80 | f.write(comment)
81 | f.write(' ')
82 |
83 | comments = driver.find_elements(By.CLASS_NAME, "ec")
84 | for comment in comments:
85 | comment = comment.text
86 | comment = re.sub("[A-Za-z0-9·.\\n]", "", comment)
87 | with open('./dfs/comments.txt', 'a', encoding='utf-8') as f:
88 | f.write(comment)
89 | f.write(' ')
90 |
91 | next_page = driver.find_elements(By.TAG_NAME, "a")[-1].get_attribute('href')
92 | if type(next_page) != str:
93 | break
94 |
95 | url = next_page
96 | time.sleep(5)
97 | # ########################################################
98 |
99 |
100 | # ########################################################
101 | def clean_text(text):
102 | all_chars = ['ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ',
103 | 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ',
104 | 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ',
105 | 'ষ', 'স', 'হ', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ',
106 | 'ৗ', 'ড়', 'ঢ়', 'য়']
107 | cleaned_text = ''
108 | for i in tqdm(range(len(text))):
109 | if text[i] in all_chars:
110 | cleaned_text += text[i]
111 | else:
112 | cleaned_text += ' '
113 | return cleaned_text
114 |
115 | def find_stats():
116 | f = open("./dfs/comments.txt", "r", encoding='utf-8')
117 | text = f.read()
118 | text = clean_text(text)
119 |
120 | words = sorted(text.split())
121 | unique_words = sorted(list(set(words)))
122 |
123 | error_df = pd.read_csv('./dfs/sec_dataset_IV.csv')
124 | balanced_df = pd.DataFrame()
125 | all_error_types = sorted(list(set(error_df.iloc[:, -1].values)))
126 | for error in all_error_types:
127 | x = error_df.loc[error_df['ErrorType'] == error]
128 | if (len(x)) < 100000:
129 | balanced_df = pd.concat([balanced_df, x])
130 | else:
131 | balanced_df = pd.concat([balanced_df, x.sample(100000)])
132 |
133 | erroneous_words = balanced_df.iloc[:, 1].values
134 | erroneous_words_type = balanced_df.iloc[:, 2].values
135 |
136 | found = []
137 | types = []
138 | for i in tqdm(range(len(unique_words))):
139 | word = unique_words[i]
140 | if word in erroneous_words:
141 | found.append(word)
142 |             types.append(erroneous_words_type[list(erroneous_words).index(word)])  # index into erroneous_words, not unique_words
143 | if (i != 0 and i % 1000 == 0):
144 | print(len(found))
145 |
146 | error_words = []
147 | error_types = []
148 | for i in tqdm(range(len(found))):
149 | word = found[i]
150 | etype = error_df.loc[error_df['Error'] == word]['ErrorType'].values[0]
151 | error_words.append(word)
152 | error_types.append(etype)
153 |
154 | temp = pd.DataFrame({
155 | 'Error': error_words,
156 | 'ErrorType': error_types
157 | })
158 |
159 | unique_etypes = sorted(list(set(error_types)))
160 | err_names, instances, pcts = [], [], []
161 | for etype in unique_etypes:
162 | x = temp.loc[temp['ErrorType'] == etype]
163 | print(f"{etype}, {len(x)}/{len(temp)}, {len(x) / len(temp) * 100:.2f}%")
164 | err_names.append(etype)
165 | instances.append(f"{len(x)}/{len(temp)}")
166 | pcts.append(len(x) / len(temp) * 100)
167 |
168 | df = pd.DataFrame({
169 | 'ErrorType': err_names,
170 | 'Instances': instances,
171 | 'Pct': pcts
172 | })
173 | print(df)
174 |
175 | print("Missing error types")
176 | found = sorted(list(set(error_types)))
177 | target = sorted(list(set(error_df.iloc[:, -1].values)))
178 |
179 | for item in target:
180 | if item not in found:
181 | print(item)
182 | # ########################################################
183 |
184 |
185 | # ########################################################
186 | if __name__ == '__main__':
187 | scrape_post_1()
188 | scrape_post_2()
189 | find_stats()
190 |
--------------------------------------------------------------------------------
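Note on clean_text above: it filters the scraped text one character at a time, which is slow for large dumps. A minimal, equivalent sketch using a precompiled regular expression is given below; it assumes the same allowed-character list (all_chars) defined in clean_text and is only an illustration, not part of the repository.

    import re

    def clean_text_fast(text, all_chars):
        # Replace every character outside the allowed Bangla set with a space,
        # mirroring the character-by-character loop in clean_text.
        pattern = re.compile('[^' + re.escape(''.join(all_chars)) + ']')
        return pattern.sub(' ', text)
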
/Baselines/GRUSeq2Seq/main.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | df2train_test_dfs, basic_tokenizer, init_weights, count_parameters,
3 | translate_sentence, display_attention, df2train_valid_test_dfs,
4 | save_model, load_model, df2train_error_dfs
5 | )
6 | from models import Encoder, Decoder, Attention, Seq2Seq
7 | from pipeline import train, test_accuracy
8 | from inference import test_beam, test_greedy
9 | from focalLoss import FocalLoss
10 | from errors import error_df
11 |
12 | import torch, torch.nn as nn, torch.optim as optim
13 | import torch.nn.functional as F
14 | from torchtext.legacy.data import Field, BucketIterator, TabularDataset
15 | import random
16 | from tqdm import tqdm
17 | import pandas as pd
18 | from sklearn.model_selection import train_test_split
19 | import math
20 | import time
21 |
22 | import matplotlib.pyplot as plt
23 | import matplotlib.ticker as ticker
24 | import matplotlib.font_manager as fm
25 |
26 | import numpy as np
27 | import math
28 | import time
29 | import sys
30 | import os
31 | import argparse
32 |
33 | import warnings as wrn
34 | wrn.filterwarnings('ignore')
35 |
36 |
37 | def main():
38 | parser = argparse.ArgumentParser()
39 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus2.csv",
40 | choices=["./Dataset/corpus.csv", "./Dataset/corpus2.csv"]
41 | )
42 | parser.add_argument("--ENC_EMB_DIM", help="Encoder Embedding Dimension", type=int, default=128, choices=[64, 128, 256])
43 | parser.add_argument("--DEC_EMB_DIM", help="Decoder Embedding Dimension", type=int, default=128, choices=[64, 128, 256])
44 | parser.add_argument("--ENC_HIDDEN_DIM", help="Encoder Hidden Dimension", type=int,default=256, choices=[128, 256, 512])
45 | parser.add_argument("--DEC_HIDDEN_DIM", help="Decoder Hidden Dimension", type=int, default=512, choices=[256, 512, 1024])
46 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
47 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
48 | parser.add_argument("--MAX_LEN", help="Maximum Length", type=int, default=48, choices=[48, 56, 64])
49 | parser.add_argument("--BATCH_SIZE", help="Batch Size", type=int, default=256, choices=[256, 512])
50 | parser.add_argument("--CLIP", help="Gradient Clipping", type=float, default=1, choices=[0.1, 0.2, 0.5, 1])
51 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
52 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
53 | args = parser.parse_args()
54 |
55 |
56 | df = pd.read_csv(args.CORPUS)
57 | df2train_valid_test_dfs(df=df, test_size=0.15)
58 |
59 | SRC = Field(
60 | tokenize=basic_tokenizer, lower=False,
61 |         init_token='<sos>', eos_token='<eos>',
62 | sequential=True, use_vocab=True, include_lengths=True
63 | )
64 | TRG = Field(
65 | tokenize=basic_tokenizer, lower=False,
66 |         init_token='<sos>', eos_token='<eos>',
67 | sequential=True, use_vocab=True
68 | )
69 | fields = {
70 | 'Error': ('src', SRC),
71 | 'Word': ('trg', TRG)
72 | }
73 | train_data, valid_data, test_data = TabularDataset.splits(
74 | path='./Dataset',
75 | train='train.csv',
76 | validation='valid.csv',
77 | test='test.csv',
78 | format='csv',
79 | fields=fields
80 | )
81 |
82 | SRC.build_vocab(train_data, max_size=64, min_freq=100)
83 | TRG.build_vocab(train_data, max_size=64, min_freq=75)
84 | # -------------------------------------
85 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
86 | BATCH_SIZE = args.BATCH_SIZE
87 | INPUT_DIM = len(SRC.vocab)
88 | OUTPUT_DIM = len(TRG.vocab)
89 | ENC_EMB_DIM = args.ENC_EMB_DIM
90 | DEC_EMB_DIM = args.DEC_EMB_DIM
91 | ENC_HIDDEN_DIM = args.ENC_HIDDEN_DIM
92 | DEC_HIDDEN_DIM = args.DEC_HIDDEN_DIM
93 | ENC_DROPOUT = args.ENC_DROPOUT
94 | DEC_DROPOUT = args.DEC_DROPOUT
95 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
96 | MAX_LEN = args.MAX_LEN
97 | N_EPOCHS = args.N_EPOCHS
98 | CLIP = args.CLIP
99 | # -------------------------------------
100 | PATH = './Checkpoints/GRUSeq2Seq.pth'
101 |
102 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
103 | (train_data, valid_data, test_data),
104 | batch_size=BATCH_SIZE,
105 | sort_within_batch=True,
106 | sort_key=lambda x: len(x.src),
107 | device=DEVICE
108 | )
109 |
110 | attention = Attention(ENC_HIDDEN_DIM, DEC_HIDDEN_DIM)
111 | encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, ENC_DROPOUT)
112 | decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HIDDEN_DIM, DEC_HIDDEN_DIM, DEC_DROPOUT, attention)
113 |
114 | model = Seq2Seq(encoder, decoder, SRC_PAD_IDX, DEVICE).to(DEVICE)
115 | model.apply(init_weights)
116 | # print(f'The model has {count_parameters(model):,} trainable parameters')
117 |
118 |     optimizer = optim.Adam(model.parameters(), lr=args.LEARNING_RATE)  # use the parsed --LEARNING_RATE
119 | # scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=0.5, total_iters=4)
120 |
121 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
122 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
123 | # criterion = nn.NLLLoss(ignore_index=TRG_PAD_IDX)
124 | # criterion = FocalLoss(alpha=0.5, gamma=2.0, reduction='mean')
125 |
126 | best_loss = 1e10
127 | epoch = 1
128 | if os.path.exists(PATH):
129 | checkpoint, epoch, train_loss = load_model(model, optimizer, PATH)
130 | best_loss = train_loss
131 |
132 | for epoch in range(epoch, N_EPOCHS):
133 | print(f'Epoch: {epoch} / {N_EPOCHS}')
134 | train_loss = train(model, train_iterator, optimizer, criterion)
135 | print(f"Train Loss: {train_loss:.2f}")
136 |
137 | if train_loss < best_loss:
138 | best_loss = train_loss
139 | save_model(model, epoch, optimizer, train_loss, PATH)
140 |
141 | # scheduler.step()
142 | # if epoch%10 == 0:
143 | # # test_accuracy(valid_data, SRC, TRG, model, DEVICE)
144 | # test_accuracy(error_data, SRC, TRG, model, DEVICE)
145 |
146 | test_accuracy(valid_data, SRC, TRG, model, DEVICE)
147 |
148 |
149 | # errors = ['Cognitive Error', 'Homonym Error', 'Run-on Error',
150 | # 'Split-word Error (Left)', 'Split-word Error (Random)',
151 | # 'Split-word Error (Right)', 'Split-word Error (both)',
152 | # 'Typo (Avro) Substituition', 'Typo (Bijoy) Substituition',
153 | # 'Typo Deletion', 'Typo Insertion', 'Typo Transposition',
154 | # 'Visual Error', 'Visual Error (Combined Character)']
155 |
156 | # for error in errors:
157 | # print(f"-----\nError Type: {error}\n-----")
158 | # error_df(df, error)
159 | # error_data, _ = TabularDataset.splits(
160 | # path='./Dataset',
161 | # train='error.csv',
162 | # test='error.csv',
163 | # format='csv',
164 | # fields=fields
165 | # )
166 | # eval_df = test_accuracy(error_data, SRC, TRG, model, DEVICE)
167 | # error = error.replace(' ', '').replace('(', '').replace(')', '')
168 | # eval_df.to_csv(f'./Corrections/s2sJL_{error}.csv')
169 | # print('\n\n')
170 |
171 |
172 | # test_beam(model, train_data, test_data, SRC, TRG, DEVICE)
173 | # test_greedy(test_data, SRC, TRG, model, DEVICE)
174 |
175 | # example_idx = 1
176 | # src = vars(train_data.examples[example_idx])['src']
177 | # trg = vars(train_data.examples[example_idx])['trg']
178 | # print(f'src = {src}')
179 | # print(f'trg = {trg}')
180 | # translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
181 | # print(f'predicted trg = {translation}')
182 | # display_attention(src, translation, attention)
183 |
184 |
185 | if __name__ == '__main__':
186 | main()
187 |
--------------------------------------------------------------------------------
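For reference, a minimal single-word inference sketch built on the translate_sentence helper imported in main.py above (its usage follows the commented example near the end of that file). The correct_word wrapper and the '<eos>' trimming are assumptions for illustration, not part of the repository.

    def correct_word(word, SRC, TRG, model, device):
        # one token per character, matching the character-level fields built in main()
        src_tokens = [ch for ch in word]
        translation, attention = translate_sentence(src_tokens, SRC, TRG, model, device)
        # drop a trailing end-of-sequence token if the helper returns one
        if translation and translation[-1] == '<eos>':
            translation = translation[:-1]
        return ''.join(translation)
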
/utils.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.model_selection import train_test_split
6 | import os
7 |
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 | SEED = 1234
12 | torch.manual_seed(SEED)
13 | torch.cuda.manual_seed(SEED)
14 |
15 |
16 | # ---------------------------
17 | def train_valid_test_df(df, test_size, valid_size):
18 | # etypes = list(set(df.iloc[:, -1]))
19 | etypes = list(set(df['ErrorType']))
20 |
21 | train_df = pd.DataFrame()
22 | valid_df = pd.DataFrame()
23 | test_df = pd.DataFrame()
24 |
25 | for etype in etypes:
26 | etype_df = df.loc[df['ErrorType'] == etype]
27 | train, test = train_test_split(etype_df, test_size=test_size)
28 | train, valid = train_test_split(train, test_size=valid_size)
29 |
30 | train_df = pd.concat([train_df, train])
31 | valid_df = pd.concat([valid_df, valid])
32 | test_df = pd.concat([test_df, test])
33 |
34 | train_df = train_df.sample(frac=1).reset_index(drop=True)
35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
36 | test_df = test_df.sample(frac=1).reset_index(drop=True)
37 |
38 | train_df = train_df.iloc[:, [1, 0]]
39 | valid_df = valid_df.iloc[:, [1, 0]]
40 | test_df = test_df.iloc[:, [1, 0]]
41 |
42 | return train_df, valid_df, test_df
43 | # ---------------------------
44 |
45 |
46 | # ---------------------------
47 | def train_valid_test_df2(df, test_size, valid_size):
48 | # etypes = list(set(df.iloc[:, -1]))
49 | etypes = list(set(df['ErrorType']))
50 |
51 | train_df = pd.DataFrame()
52 | valid_df = pd.DataFrame()
53 | test_df = pd.DataFrame()
54 |
55 | for etype in etypes:
56 | etype_df = df.loc[df['ErrorType'] == etype]
57 | train, test = train_test_split(etype_df, test_size=test_size)
58 | train, valid = train_test_split(train, test_size=valid_size)
59 |
60 | train_df = pd.concat([train_df, train])
61 | valid_df = pd.concat([valid_df, valid])
62 | test_df = pd.concat([test_df, test])
63 |
64 | train_df = train_df.sample(frac=1).reset_index(drop=True)
65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
66 | test_df = test_df.sample(frac=1).reset_index(drop=True)
67 |
68 | # train_df = train_df.iloc[:, [1, 0]]
69 | # valid_df = valid_df.iloc[:, [1, 0]]
70 | # test_df = test_df.iloc[:, [1, 0]]
71 |
72 | return train_df, valid_df, test_df
73 | # ---------------------------
74 |
75 |
76 | # ---------------------------
77 | def merge_dfs(network='detector'):
78 | df_names = [
79 | f'{network}_CognitiveError.csv',
80 | f'{network}_HomonymError.csv',
81 | f'{network}_Run-onError.csv',
82 | f'{network}_Split-wordErrorLeft.csv',
83 | f'{network}_Split-wordErrorRandom.csv',
84 | f'{network}_Split-wordErrorRight.csv',
85 | f'{network}_Split-wordErrorboth.csv',
86 | f'{network}_TypoAvroSubstituition.csv',
87 | f'{network}_TypoBijoySubstituition.csv',
88 | f'{network}_TypoDeletion.csv',
89 | f'{network}_TypoInsertion.csv',
90 | f'{network}_TypoTransposition.csv',
91 | f'{network}_VisualError.csv',
92 | f'{network}_VisualErrorCombinedCharacter.csv'
93 | ]
94 |
95 | df = pd.DataFrame()
96 |
97 | for df_name in df_names:
98 | df_path = os.path.join('./Dataframes', df_name)
99 | temp_df = pd.read_csv(df_path)
100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1]
101 | for _ in range(len(temp_df))]
102 | df = pd.concat([df, temp_df])
103 |
104 | df = df.iloc[:, :]
105 |
106 | if network=='detector':
107 | df.rename(
108 | columns = {
109 | 'Predicton':'ErrorBlanksPredD1',
110 | 'Target':'ErrorBlanksActual',
111 | 'Correction':'EBP_Flag_D1',
112 | },
113 | inplace = True
114 | )
115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']]
116 |
117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector)
118 | # (purificator)
119 | # ---------------------------
120 |
121 |
122 | # ---------------------------
123 | def error_df(df, error='Cognitive Error'):
124 | df = df.loc[df['ErrorType'] == error]
125 | df['Word'] = df['Word'].apply(word2char)
126 | df['Error'] = df['Error'].apply(word2char)
127 | df = df.sample(frac=1).reset_index(drop=True)
128 | idx = int(len(df)/1)
129 | df = df.iloc[:idx, [1, 0]]
130 | df.to_csv('./Dataset/error.csv', index=False)
131 | # ---------------------------
132 |
133 |
134 | # ---------------------------
135 | def error_df_2(df, error='Cognitive Error'):
136 | df = df.loc[df['ErrorType'] == error]
137 | # df['Word'] = df['Word'].apply(word2char)
138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
139 | df = df.sample(frac=1).reset_index(drop=True)
140 | idx = int(len(df)/1)
141 | df = df.iloc[:idx, [1, 0]]
142 | #
143 | # if(len(df) >= 10000):
144 | # df = df.iloc[:10000, :]
145 | #
146 | df.to_csv('./Dataset/error.csv', index=False)
147 | # ---------------------------
148 |
149 |
150 | # ---------------------------
151 | def error_df_3(df, error='Cognitive Error'):
152 | df = df.loc[df['ErrorType'] == error]
153 | # df['Word'] = df['Word'].apply(word2char)
154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
155 | df = df.sample(frac=1).reset_index(drop=True)
156 | # idx = int(len(df)/1)
157 | # df = df.iloc[:idx, [1, 0]]
158 | #
159 | # if(len(df) >= 10000):
160 | # df = df.iloc[:10000, :]
161 | #
162 | df.to_csv('./Dataset/error.csv', index=False)
163 | # ---------------------------
164 |
165 |
166 | # ---------------------------
167 | def word2char(word):
168 | w2c = [char for char in word]
169 | return ' '.join(w2c)
170 | # ---------------------------
171 |
172 |
173 | # ---------------------------
174 | def find_len(seq):
175 | return len(seq.split(' '))
176 | # ---------------------------
177 |
178 |
179 | # ---------------------------
180 | def mask2str(mask):
181 | x = ''
182 | for item in mask:
183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]":
184 | x += str(item)
185 | return x
186 | # ---------------------------
187 |
188 |
189 | # ---------------------------
190 | def error_blank(error, mask):
191 | error_list = np.array(error.split())
192 | mask_list = np.array(mask.split())
193 | idx = np.where(mask_list=='1')[0]
194 | error_list[idx] = ' '
195 | error = ' '.join(error_list)
196 | return error
197 | # ---------------------------
198 |
199 |
200 | # ---------------------------
201 | def basic_tokenizer(text):
202 | return text.split()
203 | # ---------------------------
204 |
205 |
206 | # ---------------------------
207 | def count_parameters(model):
208 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
209 | # ---------------------------
210 |
211 |
212 | # ---------------------------
213 | def initialize_weights(m):
214 | if hasattr(m, 'weight') and m.weight.dim() > 1:
215 | nn.init.xavier_uniform_(m.weight.data)
216 | # ---------------------------
217 |
218 |
219 | # ---------------------------
220 | def save_model(model, train_loss, epoch, PATH):
221 | torch.save({
222 | 'epoch': epoch,
223 | 'model_state_dict': model.state_dict(),
224 | # 'optimizer_state_dict': optimizer.state_dict(),
225 | 'loss': train_loss
226 | }, PATH)
227 | print(f"---------\nModel Saved at {PATH}\n---------\n")
228 | # ---------------------------
229 |
230 |
231 | # ---------------------------
232 | def load_model(model, PATH):
233 | checkpoint = torch.load(PATH)
234 | model.load_state_dict(checkpoint['model_state_dict'])
235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
236 | epoch = checkpoint['epoch']
237 | train_loss = checkpoint['loss']
238 | return checkpoint, epoch, train_loss
239 | # ---------------------------
240 |
241 |
242 | if __name__ == '__main__':
243 | pass
244 |
--------------------------------------------------------------------------------
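A short usage sketch of the character-level helpers defined in utils.py above (illustrative values only; the Latin word stands in for a Bangla token):

    from utils import word2char, find_len, mask2str, error_blank

    chars = word2char('ABCD')                # 'A B C D'  (one space between characters)
    print(find_len(chars))                   # 4 tokens in the space-separated sequence
    print(mask2str("['0', '1', '0', '1']"))  # '0101'  (list-like string flattened to a mask)
    # blank out the positions flagged as erroneous (mask value '1'): 'B' and 'D' become spaces
    print(error_blank(chars, '0 1 0 1'))
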
/Baselines/DCSpell/utils.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.model_selection import train_test_split
6 | import os
7 |
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 | SEED = 1234
12 | torch.manual_seed(SEED)
13 | torch.cuda.manual_seed(SEED)
14 |
15 |
16 | # ---------------------------
17 | def train_valid_test_df(df, test_size, valid_size):
18 | # etypes = list(set(df.iloc[:, -1]))
19 | etypes = list(set(df['ErrorType']))
20 |
21 | train_df = pd.DataFrame()
22 | valid_df = pd.DataFrame()
23 | test_df = pd.DataFrame()
24 |
25 | for etype in etypes:
26 | etype_df = df.loc[df['ErrorType'] == etype]
27 | train, test = train_test_split(etype_df, test_size=test_size)
28 | train, valid = train_test_split(train, test_size=valid_size)
29 |
30 | train_df = pd.concat([train_df, train])
31 | valid_df = pd.concat([valid_df, valid])
32 | test_df = pd.concat([test_df, test])
33 |
34 | train_df = train_df.sample(frac=1).reset_index(drop=True)
35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
36 | test_df = test_df.sample(frac=1).reset_index(drop=True)
37 |
38 | train_df = train_df.iloc[:, [1, 0]]
39 | valid_df = valid_df.iloc[:, [1, 0]]
40 | test_df = test_df.iloc[:, [1, 0]]
41 |
42 | return train_df, valid_df, test_df
43 | # ---------------------------
44 |
45 |
46 | # ---------------------------
47 | def train_valid_test_df2(df, test_size, valid_size):
48 | # etypes = list(set(df.iloc[:, -1]))
49 | etypes = list(set(df['ErrorType']))
50 |
51 | train_df = pd.DataFrame()
52 | valid_df = pd.DataFrame()
53 | test_df = pd.DataFrame()
54 |
55 | for etype in etypes:
56 | etype_df = df.loc[df['ErrorType'] == etype]
57 | train, test = train_test_split(etype_df, test_size=test_size)
58 | train, valid = train_test_split(train, test_size=valid_size)
59 |
60 | train_df = pd.concat([train_df, train])
61 | valid_df = pd.concat([valid_df, valid])
62 | test_df = pd.concat([test_df, test])
63 |
64 | train_df = train_df.sample(frac=1).reset_index(drop=True)
65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
66 | test_df = test_df.sample(frac=1).reset_index(drop=True)
67 |
68 | # train_df = train_df.iloc[:, [1, 0]]
69 | # valid_df = valid_df.iloc[:, [1, 0]]
70 | # test_df = test_df.iloc[:, [1, 0]]
71 |
72 | return train_df, valid_df, test_df
73 | # ---------------------------
74 |
75 |
76 | # ---------------------------
77 | def merge_dfs(network='detector'):
78 | df_names = [
79 | f'{network}_CognitiveError.csv',
80 | f'{network}_HomonymError.csv',
81 | f'{network}_Run-onError.csv',
82 | f'{network}_Split-wordErrorLeft.csv',
83 | f'{network}_Split-wordErrorRandom.csv',
84 | f'{network}_Split-wordErrorRight.csv',
85 | f'{network}_Split-wordErrorboth.csv',
86 | f'{network}_TypoAvroSubstituition.csv',
87 | f'{network}_TypoBijoySubstituition.csv',
88 | f'{network}_TypoDeletion.csv',
89 | f'{network}_TypoInsertion.csv',
90 | f'{network}_TypoTransposition.csv',
91 | f'{network}_VisualError.csv',
92 | f'{network}_VisualErrorCombinedCharacter.csv'
93 | ]
94 |
95 | df = pd.DataFrame()
96 |
97 | for df_name in df_names:
98 | df_path = os.path.join('./Dataframes', df_name)
99 | temp_df = pd.read_csv(df_path)
100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1]
101 | for _ in range(len(temp_df))]
102 | df = pd.concat([df, temp_df])
103 |
104 | df = df.iloc[:, :]
105 |
106 | if network=='detector':
107 | df.rename(
108 | columns = {
109 | 'Predicton':'ErrorBlanksPredD1',
110 | 'Target':'ErrorBlanksActual',
111 | 'Correction':'EBP_Flag_D1',
112 | },
113 | inplace = True
114 | )
115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']]
116 |
117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector)
118 | # (purificator)
119 | # ---------------------------
120 |
121 |
122 | # ---------------------------
123 | def error_df(df, error='Cognitive Error'):
124 | df = df.loc[df['ErrorType'] == error]
125 | df['Word'] = df['Word'].apply(word2char)
126 | df['Error'] = df['Error'].apply(word2char)
127 | df = df.sample(frac=1).reset_index(drop=True)
128 | idx = int(len(df)/1)
129 | df = df.iloc[:idx, [1, 0]]
130 | df.to_csv('./Dataset/error.csv', index=False)
131 | # ---------------------------
132 |
133 |
134 | # ---------------------------
135 | def error_df_2(df, error='Cognitive Error'):
136 | df = df.loc[df['ErrorType'] == error]
137 | # df['Word'] = df['Word'].apply(word2char)
138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
139 | df = df.sample(frac=1).reset_index(drop=True)
140 | idx = int(len(df)/1)
141 | df = df.iloc[:idx, [1, 0]]
142 | #
143 | # if(len(df) >= 10000):
144 | # df = df.iloc[:10000, :]
145 | #
146 | df.to_csv('./Dataset/error.csv', index=False)
147 | # ---------------------------
148 |
149 |
150 | # ---------------------------
151 | def error_df_3(df, error='Cognitive Error'):
152 | df = df.loc[df['ErrorType'] == error]
153 | # df['Word'] = df['Word'].apply(word2char)
154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
155 | df = df.sample(frac=1).reset_index(drop=True)
156 | # idx = int(len(df)/1)
157 | # df = df.iloc[:idx, [1, 0]]
158 | #
159 | # if(len(df) >= 10000):
160 | # df = df.iloc[:10000, :]
161 | #
162 | df.to_csv('./Dataset/error.csv', index=False)
163 | # ---------------------------
164 |
165 |
166 | # ---------------------------
167 | def word2char(word):
168 | w2c = [char for char in word]
169 | return ' '.join(w2c)
170 | # ---------------------------
171 |
172 |
173 | # ---------------------------
174 | def find_len(seq):
175 | return len(seq.split(' '))
176 | # ---------------------------
177 |
178 |
179 | # ---------------------------
180 | def mask2str(mask):
181 | x = ''
182 | for item in mask:
183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]":
184 | x += str(item)
185 | return x
186 | # ---------------------------
187 |
188 |
189 | # ---------------------------
190 | def error_blank(error, mask):
191 | error_list = np.array(error.split())
192 | mask_list = np.array(mask.split())
193 | idx = np.where(mask_list=='1')[0]
194 | error_list[idx] = ' '
195 | error = ' '.join(error_list)
196 | return error
197 | # ---------------------------
198 |
199 |
200 | # ---------------------------
201 | def basic_tokenizer(text):
202 | return text.split()
203 | # ---------------------------
204 |
205 |
206 | # ---------------------------
207 | def count_parameters(model):
208 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
209 | # ---------------------------
210 |
211 |
212 | # ---------------------------
213 | def initialize_weights(m):
214 | if hasattr(m, 'weight') and m.weight.dim() > 1:
215 | nn.init.xavier_uniform_(m.weight.data)
216 | # ---------------------------
217 |
218 |
219 | # ---------------------------
220 | def save_model(model, train_loss, epoch, PATH):
221 | torch.save({
222 | 'epoch': epoch,
223 | 'model_state_dict': model.state_dict(),
224 | # 'optimizer_state_dict': optimizer.state_dict(),
225 | 'loss': train_loss
226 | }, PATH)
227 | print(f"---------\nModel Saved at {PATH}\n---------\n")
228 | # ---------------------------
229 |
230 |
231 | # ---------------------------
232 | def load_model(model, PATH):
233 | checkpoint = torch.load(PATH)
234 | model.load_state_dict(checkpoint['model_state_dict'])
235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
236 | epoch = checkpoint['epoch']
237 | train_loss = checkpoint['loss']
238 | return checkpoint, epoch, train_loss
239 | # ---------------------------
240 |
241 |
242 | if __name__ == '__main__':
243 | pass
244 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/utils.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | import pandas as pd
4 | import numpy as np
5 | from sklearn.model_selection import train_test_split
6 | import os
7 |
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 | SEED = 1234
12 | torch.manual_seed(SEED)
13 | torch.cuda.manual_seed(SEED)
14 |
15 |
16 | # ---------------------------
17 | def train_valid_test_df(df, test_size, valid_size):
18 | # etypes = list(set(df.iloc[:, -1]))
19 | etypes = list(set(df['ErrorType']))
20 |
21 | train_df = pd.DataFrame()
22 | valid_df = pd.DataFrame()
23 | test_df = pd.DataFrame()
24 |
25 | for etype in etypes:
26 | etype_df = df.loc[df['ErrorType'] == etype]
27 | train, test = train_test_split(etype_df, test_size=test_size)
28 | train, valid = train_test_split(train, test_size=valid_size)
29 |
30 | train_df = pd.concat([train_df, train])
31 | valid_df = pd.concat([valid_df, valid])
32 | test_df = pd.concat([test_df, test])
33 |
34 | train_df = train_df.sample(frac=1).reset_index(drop=True)
35 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
36 | test_df = test_df.sample(frac=1).reset_index(drop=True)
37 |
38 | train_df = train_df.iloc[:, [1, 0]]
39 | valid_df = valid_df.iloc[:, [1, 0]]
40 | test_df = test_df.iloc[:, [1, 0]]
41 |
42 | return train_df, valid_df, test_df
43 | # ---------------------------
44 |
45 |
46 | # ---------------------------
47 | def train_valid_test_df2(df, test_size, valid_size):
48 | # etypes = list(set(df.iloc[:, -1]))
49 | etypes = list(set(df['ErrorType']))
50 |
51 | train_df = pd.DataFrame()
52 | valid_df = pd.DataFrame()
53 | test_df = pd.DataFrame()
54 |
55 | for etype in etypes:
56 | etype_df = df.loc[df['ErrorType'] == etype]
57 | train, test = train_test_split(etype_df, test_size=test_size)
58 | train, valid = train_test_split(train, test_size=valid_size)
59 |
60 | train_df = pd.concat([train_df, train])
61 | valid_df = pd.concat([valid_df, valid])
62 | test_df = pd.concat([test_df, test])
63 |
64 | train_df = train_df.sample(frac=1).reset_index(drop=True)
65 | valid_df = valid_df.sample(frac=1).reset_index(drop=True)
66 | test_df = test_df.sample(frac=1).reset_index(drop=True)
67 |
68 | # train_df = train_df.iloc[:, [1, 0]]
69 | # valid_df = valid_df.iloc[:, [1, 0]]
70 | # test_df = test_df.iloc[:, [1, 0]]
71 |
72 | return train_df, valid_df, test_df
73 | # ---------------------------
74 |
75 |
76 | # ---------------------------
77 | def merge_dfs(network='detector'):
78 | df_names = [
79 | f'{network}_CognitiveError.csv',
80 | f'{network}_HomonymError.csv',
81 | f'{network}_Run-onError.csv',
82 | f'{network}_Split-wordErrorLeft.csv',
83 | f'{network}_Split-wordErrorRandom.csv',
84 | f'{network}_Split-wordErrorRight.csv',
85 | f'{network}_Split-wordErrorboth.csv',
86 | f'{network}_TypoAvroSubstituition.csv',
87 | f'{network}_TypoBijoySubstituition.csv',
88 | f'{network}_TypoDeletion.csv',
89 | f'{network}_TypoInsertion.csv',
90 | f'{network}_TypoTransposition.csv',
91 | f'{network}_VisualError.csv',
92 | f'{network}_VisualErrorCombinedCharacter.csv'
93 | ]
94 |
95 | df = pd.DataFrame()
96 |
97 | for df_name in df_names:
98 | df_path = os.path.join('./Dataframes', df_name)
99 | temp_df = pd.read_csv(df_path)
100 | temp_df['ErrorType'] = [df_name.split('.')[0].split('_')[-1]
101 | for _ in range(len(temp_df))]
102 | df = pd.concat([df, temp_df])
103 |
104 | df = df.iloc[:, :]
105 |
106 | if network=='detector':
107 | df.rename(
108 | columns = {
109 | 'Predicton':'ErrorBlanksPredD1',
110 | 'Target':'ErrorBlanksActual',
111 | 'Correction':'EBP_Flag_D1',
112 | },
113 | inplace = True
114 | )
115 | df = df[['Error', 'Word', 'ErrorBlanksPredD1', 'ErrorBlanksActual', 'EBP_Flag_D1', 'ErrorType']]
116 |
117 | df.to_csv(f'./Dataset/{network}_preds.csv', index=False) # sec_dataset_III_v3_masked_d1_gen.csv (detector)
118 | # (purificator)
119 | # ---------------------------
120 |
121 |
122 | # ---------------------------
123 | def error_df(df, error='Cognitive Error'):
124 | df = df.loc[df['ErrorType'] == error]
125 | df['Word'] = df['Word'].apply(word2char)
126 | df['Error'] = df['Error'].apply(word2char)
127 | df = df.sample(frac=1).reset_index(drop=True)
128 | idx = int(len(df)/1)
129 | df = df.iloc[:idx, [1, 0]]
130 | df.to_csv('./Dataset/error.csv', index=False)
131 | # ---------------------------
132 |
133 |
134 | # ---------------------------
135 | def error_df_2(df, error='Cognitive Error'):
136 | df = df.loc[df['ErrorType'] == error]
137 | # df['Word'] = df['Word'].apply(word2char)
138 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
139 | df = df.sample(frac=1).reset_index(drop=True)
140 | idx = int(len(df)/1)
141 | df = df.iloc[:idx, [1, 0]]
142 | #
143 | # if(len(df) >= 10000):
144 | # df = df.iloc[:10000, :]
145 | #
146 | df.to_csv('./Dataset/error.csv', index=False)
147 | # ---------------------------
148 |
149 |
150 | # ---------------------------
151 | def error_df_3(df, error='Cognitive Error'):
152 | df = df.loc[df['ErrorType'] == error]
153 | # df['Word'] = df['Word'].apply(word2char)
154 | # df['MaskErrorBlank'] = df['MaskErrorBlank'].apply(word2char)
155 | df = df.sample(frac=1).reset_index(drop=True)
156 | # idx = int(len(df)/1)
157 | # df = df.iloc[:idx, [1, 0]]
158 | #
159 | # if(len(df) >= 10000):
160 | # df = df.iloc[:10000, :]
161 | #
162 | df.to_csv('./Dataset/error.csv', index=False)
163 | # ---------------------------
164 |
165 |
166 | # ---------------------------
167 | def word2char(word):
168 | w2c = [char for char in word]
169 | return ' '.join(w2c)
170 | # ---------------------------
171 |
172 |
173 | # ---------------------------
174 | def find_len(seq):
175 | return len(seq.split(' '))
176 | # ---------------------------
177 |
178 |
179 | # ---------------------------
180 | def mask2str(mask):
181 | x = ''
182 | for item in mask:
183 | if item != "[" and item != "'" and item != "," and item != " " and item != "]":
184 | x += str(item)
185 | return x
186 | # ---------------------------
187 |
188 |
189 | # ---------------------------
190 | def error_blank(error, mask):
191 | error_list = np.array(error.split())
192 | mask_list = np.array(mask.split())
193 | idx = np.where(mask_list=='1')[0]
194 | error_list[idx] = ' '
195 | error = ' '.join(error_list)
196 | return error
197 | # ---------------------------
198 |
199 |
200 | # ---------------------------
201 | def basic_tokenizer(text):
202 | return text.split()
203 | # ---------------------------
204 |
205 |
206 | # ---------------------------
207 | def count_parameters(model):
208 | return sum(p.numel() for p in model.parameters() if p.requires_grad)
209 | # ---------------------------
210 |
211 |
212 | # ---------------------------
213 | def initialize_weights(m):
214 | if hasattr(m, 'weight') and m.weight.dim() > 1:
215 | nn.init.xavier_uniform_(m.weight.data)
216 | # ---------------------------
217 |
218 |
219 | # ---------------------------
220 | def save_model(model, train_loss, epoch, PATH):
221 | torch.save({
222 | 'epoch': epoch,
223 | 'model_state_dict': model.state_dict(),
224 | # 'optimizer_state_dict': optimizer.state_dict(),
225 | 'loss': train_loss
226 | }, PATH)
227 | print(f"---------\nModel Saved at {PATH}\n---------\n")
228 | # ---------------------------
229 |
230 |
231 | # ---------------------------
232 | def load_model(model, PATH):
233 | checkpoint = torch.load(PATH)
234 | model.load_state_dict(checkpoint['model_state_dict'])
235 | # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
236 | epoch = checkpoint['epoch']
237 | train_loss = checkpoint['loss']
238 | return checkpoint, epoch, train_loss
239 | # ---------------------------
240 |
241 |
242 | if __name__ == '__main__':
243 | pass
244 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/dtransformer.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | word2char, basic_tokenizer, count_parameters, initialize_weights,
3 | save_model, load_model, error_df, train_valid_test_df, mask2str,
4 | error_df_2, error_df_3, merge_dfs
5 | )
6 | from transformer import (
7 | Encoder, EncoderLayer, MultiHeadAttentionLayer,
8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer,
9 | Seq2Seq
10 | )
11 | from pipeline import train, evaluate
12 | from metrics import evaluation_report, evaluation_report2
13 |
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
17 | import torch
18 | import torch.nn as nn
19 | import os
20 | import gc
21 | import sys
22 | import argparse
23 |
24 | import warnings as wrn
25 | wrn.filterwarnings('ignore')
26 |
27 |
28 | def main():
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus.csv",
31 | choices=[
32 | "./Dataset/corpus.csv", # Bangla SEC parallel corpus
33 | "./Dataset/corpus2.csv", # Bangla SEC parallel corpus for running test
34 | "./Dataset/Hindi/corpus_hindi.csv",
35 | "./Dataset/Telugu/corpus_telugu.csv",
36 | "./Dataset/Hindi/corpus_hindi_enhanced.csv",
37 | "./Dataset/Telugu/corpus_telugu_enhanced.csv"
38 | ]
39 | )
40 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256])
41 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7])
42 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7])
43 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
44 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
45 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
46 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256])
47 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
48 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
49 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10])
50 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
51 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
52 | args = parser.parse_args()
53 |
54 | SEED = 1234
55 | torch.manual_seed(SEED)
56 | torch.cuda.manual_seed(SEED)
57 |
58 | df = pd.read_csv(args.CORPUS)
59 | df['Word'] = df['Word'].apply(word2char)
60 | df['Error'] = df['Error'].apply(word2char)
61 | df = df.sample(frac=1).reset_index(drop=True)
62 |
63 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05)
64 |
65 | train_df.to_csv('./Dataset/train.csv', index=False)
66 | valid_df.to_csv('./Dataset/valid.csv', index=False)
67 | test_df.to_csv('./Dataset/test.csv', index=False)
68 |
69 | SRC = Field(
70 | tokenize=basic_tokenizer, lower=False,
71 |         init_token='<sos>', eos_token='<eos>', batch_first=True
72 | )
73 | TRG = Field(
74 | tokenize=basic_tokenizer, lower=False,
75 |         init_token='<sos>', eos_token='<eos>', batch_first=True
76 | )
77 | fields = {
78 | 'Error': ('src', SRC),
79 | 'Word': ('trg', TRG)
80 | }
81 |
82 | train_data, valid_data, test_data = TabularDataset.splits(
83 | path='./Dataset',
84 | train='train.csv',
85 | validation='valid.csv',
86 | test='test.csv',
87 | format='csv',
88 | fields=fields
89 | )
90 |
91 | SRC.build_vocab(train_data, min_freq=100)
92 | TRG.build_vocab(train_data, min_freq=50)
93 |
94 | # ------------------------------
95 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
96 | BATCH_SIZE = 512
97 | # ------------------------------
98 | INPUT_DIM = len(SRC.vocab)
99 | OUTPUT_DIM = len(TRG.vocab)
100 | # ------------------------------
101 | HID_DIM = int(args.HID_DIM)
102 | ENC_LAYERS = int(args.ENC_LAYERS)
103 | DEC_LAYERS = int(args.DEC_LAYERS)
104 | ENC_HEADS = int(args.ENC_HEADS)
105 | DEC_HEADS = int(args.DEC_HEADS)
106 | ENC_PF_DIM = int(args.ENC_PF_DIM)
107 | DEC_PF_DIM = int(args.DEC_PF_DIM)
108 | ENC_DROPOUT = float(args.ENC_DROPOUT)
109 | DEC_DROPOUT = float(args.DEC_DROPOUT)
110 | CLIP = float(args.CLIP)
111 | N_EPOCHS = int(args.N_EPOCHS)
112 | LEARNING_RATE = float(args.LEARNING_RATE)
113 | # ------------------------------
114 | PATH = './Checkpoints/dtransformer.pth'
115 | # ------------------------------
116 | gc.collect()
117 | torch.cuda.empty_cache()
118 | # -----------------------------
119 |
120 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
121 | (train_data, valid_data, test_data),
122 | batch_size=BATCH_SIZE,
123 | sort_within_batch=True,
124 | sort_key=lambda x: len(x.src),
125 | device=DEVICE
126 | )
127 |
128 | enc = Encoder(
129 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
130 | ENC_DROPOUT, DEVICE
131 | )
132 | dec = Decoder(
133 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
134 | DEC_DROPOUT, DEVICE
135 | )
136 |
137 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
138 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
139 |
140 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)
141 | model.apply(initialize_weights)
142 | # print(f'The model has {count_parameters(model):,} trainable parameters')
143 |
144 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
145 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
146 |
147 | epoch = 1
148 | best_loss = 1e10
149 | if os.path.exists(PATH):
150 | checkpoint, epoch, train_loss = load_model(model, PATH)
151 | best_loss = train_loss
152 |
153 | for epoch in range(epoch, N_EPOCHS):
154 | print(f"Epoch: {epoch} / {N_EPOCHS}")
155 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
156 | print(f"Train Loss: {train_loss:.4f}")
157 | if train_loss < best_loss:
158 | best_loss = train_loss
159 | save_model(model, train_loss, epoch, PATH)
160 |
161 | # ---------------------
162 | # eval_df = evaluation_report(test_data, SRC, TRG, model, DEVICE)
163 | # ---------------------
164 | # error_types = [
165 | # 'Homonym Error', # 123
166 | # 'Typo Deletion', # 115767
167 | # 'Typo (Avro) Substituition', # 119573
168 | # 'Typo (Bijoy) Substituition', # 119864
169 | # 'Cognitive Error', # 108227
170 | # 'Run-on Error', # 124895
171 | # 'Split-word Error (Left)', # 62890
172 | # 'Split-word Error (Random)', # 124895
173 | # 'Split-word Error (Right)', # 13985
174 | # 'Split-word Error (both)', # 12800
175 | # 'Typo Insertion', # 124807
176 | # 'Typo Transposition', # 123245
177 | # 'Visual Error', # 117391
178 | # 'Visual Error (Combined Character)' # 17617
179 | # ]
180 | # ---------------------
181 | valid_df = pd.read_csv('./Dataset/valid.csv')
182 | error_types = list(sorted(list(set(df['ErrorType'].values))))
183 | # ---------------------
184 | for error_name in error_types:
185 | print(f'------\nError Type: {error_name}\n------')
186 | error_df_2(df, error_name)
187 |
188 | error_data, _ = TabularDataset.splits(
189 | path='./Dataset',
190 | train='error.csv',
191 | test='error.csv',
192 | format='csv',
193 | fields=fields
194 | )
195 |
196 |         eval_df = evaluation_report(error_data, SRC, TRG, model, DEVICE)
197 |
198 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
199 | eval_df.to_csv(f'./Dataframes/dtransformer_{error_name}.csv', index=False)
200 | print('\n\n')
201 | # ---------------------
202 |
203 |
204 | if __name__ == '__main__':
205 | main()
206 |
--------------------------------------------------------------------------------
/Requirements/requirements_u.yml:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
3 | # platform: linux-64
4 | @EXPLICIT
5 | https://conda.anaconda.org/pytorch/noarch/pytorch-mutex-1.0-cuda.tar.bz2
6 | https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.conda
7 | https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.conda
8 | https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2022.4.26-h06a4308_0.conda
9 | https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2021.4.0-h06a4308_3561.conda
10 | https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.38-h1181459_1.conda
11 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran4-7.5.0-ha8ba4b0_17.conda
12 | https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-11.2.0-h1234567_1.conda
13 | https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.5.0-ha8ba4b0_17.conda
14 | https://repo.anaconda.com/pkgs/main/linux-64/libgomp-11.2.0-h1234567_1.conda
15 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-2021.4.0-h06a4308_640.conda
16 | https://repo.anaconda.com/pkgs/main/linux-64/_openmp_mutex-5.1-1_gnu.conda
17 | https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-11.2.0-h1234567_1.conda
18 | https://repo.anaconda.com/pkgs/main/linux-64/brotli-1.0.9-he6710b0_2.conda
19 | https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.conda
20 | https://repo.anaconda.com/pkgs/main/linux-64/cudatoolkit-10.2.89-hfd86e86_1.conda
21 | https://repo.anaconda.com/pkgs/main/linux-64/expat-2.4.4-h295c915_0.conda
22 | https://repo.anaconda.com/pkgs/main/linux-64/giflib-5.2.1-h7b6447c_0.conda
23 | https://repo.anaconda.com/pkgs/main/linux-64/gmp-6.2.1-h295c915_3.conda
24 | https://repo.anaconda.com/pkgs/main/linux-64/icu-58.2-he6710b0_3.conda
25 | https://repo.anaconda.com/pkgs/main/linux-64/jpeg-9e-h7f8727e_0.conda
26 | https://repo.anaconda.com/pkgs/main/linux-64/lame-3.100-h7b6447c_0.conda
27 | https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.conda
28 | https://repo.anaconda.com/pkgs/main/linux-64/libiconv-1.16-h7f8727e_2.conda
29 | https://repo.anaconda.com/pkgs/main/linux-64/libtasn1-4.16.0-h27cfd23_0.conda
30 | https://repo.anaconda.com/pkgs/main/linux-64/libunistring-0.9.10-h27cfd23_0.conda
31 | https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.0.3-h7f8727e_2.conda
32 | https://repo.anaconda.com/pkgs/main/linux-64/libuv-1.40.0-h7b6447c_0.conda
33 | https://repo.anaconda.com/pkgs/main/linux-64/libwebp-base-1.2.2-h7f8727e_0.conda
34 | https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.15-h7f8727e_0.conda
35 | https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.9.3-h295c915_1.conda
36 | https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.3-h7f8727e_2.conda
37 | https://repo.anaconda.com/pkgs/main/linux-64/ninja-base-1.10.2-hd09550d_5.conda
38 | https://repo.anaconda.com/pkgs/main/linux-64/openh264-2.1.1-h4ff587b_0.conda
39 | https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1o-h7f8727e_0.conda
40 | https://repo.anaconda.com/pkgs/main/linux-64/pcre-8.45-h295c915_0.conda
41 | https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7f8727e_1.conda
42 | https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.12-h7f8727e_2.conda
43 | https://repo.anaconda.com/pkgs/main/linux-64/glib-2.69.1-h4ff587b_1.conda
44 | https://repo.anaconda.com/pkgs/main/linux-64/libidn2-2.3.2-h7f8727e_0.conda
45 | https://repo.anaconda.com/pkgs/main/linux-64/libpng-1.6.37-hbc83047_0.conda
46 | https://repo.anaconda.com/pkgs/main/linux-64/libxml2-2.9.14-h74e7548_0.conda
47 | https://repo.anaconda.com/pkgs/main/linux-64/nettle-3.7.3-hbbd107a_1.conda
48 | https://repo.anaconda.com/pkgs/main/linux-64/readline-8.1.2-h7f8727e_1.conda
49 | https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.12-h1ccaba5_0.conda
50 | https://repo.anaconda.com/pkgs/main/linux-64/zstd-1.5.2-ha4553b6_0.conda
51 | https://repo.anaconda.com/pkgs/main/linux-64/dbus-1.13.18-hb2f20db_0.conda
52 | https://repo.anaconda.com/pkgs/main/linux-64/freetype-2.11.0-h70c0345_0.conda
53 | https://repo.anaconda.com/pkgs/main/linux-64/gnutls-3.6.15-he1e5248_0.conda
54 | https://repo.anaconda.com/pkgs/main/linux-64/gstreamer-1.14.0-h28cd5cc_2.conda
55 | https://repo.anaconda.com/pkgs/main/linux-64/libtiff-4.2.0-h2818925_1.conda
56 | https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.38.3-hc218d9a_0.conda
57 | https://conda.anaconda.org/pytorch/linux-64/ffmpeg-4.3-hf484d3e_0.tar.bz2
58 | https://repo.anaconda.com/pkgs/main/linux-64/fontconfig-2.13.1-h6c09931_0.conda
59 | https://repo.anaconda.com/pkgs/main/linux-64/gst-plugins-base-1.14.0-h8213a91_2.conda
60 | https://repo.anaconda.com/pkgs/main/linux-64/lcms2-2.12-h3be6417_0.conda
61 | https://repo.anaconda.com/pkgs/main/linux-64/libwebp-1.2.2-h55f646e_0.conda
62 | https://repo.anaconda.com/pkgs/main/linux-64/python-3.8.13-h12debd9_0.conda
63 | https://repo.anaconda.com/pkgs/main/linux-64/certifi-2022.5.18.1-py38h06a4308_0.conda
64 | https://repo.anaconda.com/pkgs/main/noarch/charset-normalizer-2.0.4-pyhd3eb1b0_0.conda
65 | https://repo.anaconda.com/pkgs/main/noarch/cycler-0.11.0-pyhd3eb1b0_0.conda
66 | https://repo.anaconda.com/pkgs/main/noarch/idna-3.3-pyhd3eb1b0_0.conda
67 | https://repo.anaconda.com/pkgs/main/noarch/joblib-1.1.0-pyhd3eb1b0_0.conda
68 | https://repo.anaconda.com/pkgs/main/linux-64/kiwisolver-1.4.2-py38h295c915_0.conda
69 | https://repo.anaconda.com/pkgs/main/noarch/munkres-1.1.4-py_0.conda
70 | https://repo.anaconda.com/pkgs/main/linux-64/ninja-1.10.2-h06a4308_5.conda
71 | https://repo.anaconda.com/pkgs/main/linux-64/pillow-9.0.1-py38h22f2fdc_0.conda
72 | https://repo.anaconda.com/pkgs/main/noarch/pycparser-2.21-pyhd3eb1b0_0.conda
73 | https://repo.anaconda.com/pkgs/main/noarch/pyparsing-3.0.4-pyhd3eb1b0_0.conda
74 | https://repo.anaconda.com/pkgs/main/linux-64/pysocks-1.7.1-py38h06a4308_0.conda
75 | https://repo.anaconda.com/pkgs/main/linux-64/pytz-2022.1-py38h06a4308_0.conda
76 | https://repo.anaconda.com/pkgs/main/linux-64/qt-5.9.7-h5867ecd_1.conda
77 | https://repo.anaconda.com/pkgs/main/linux-64/sip-4.19.13-py38h295c915_0.conda
78 | https://repo.anaconda.com/pkgs/main/noarch/six-1.16.0-pyhd3eb1b0_1.conda
79 | https://repo.anaconda.com/pkgs/main/noarch/threadpoolctl-2.2.0-pyh0d69192_0.conda
80 | https://repo.anaconda.com/pkgs/main/linux-64/tornado-6.1-py38h27cfd23_0.conda
81 | https://repo.anaconda.com/pkgs/main/linux-64/tqdm-4.64.0-py38h06a4308_0.conda
82 | https://repo.anaconda.com/pkgs/main/noarch/typing_extensions-4.1.1-pyh06a4308_0.conda
83 | https://repo.anaconda.com/pkgs/main/noarch/wheel-0.37.1-pyhd3eb1b0_0.conda
84 | https://repo.anaconda.com/pkgs/main/linux-64/cffi-1.15.0-py38hd667e15_1.conda
85 | https://repo.anaconda.com/pkgs/main/noarch/fonttools-4.25.0-pyhd3eb1b0_0.conda
86 | https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.4.0-py38h7f8727e_0.conda
87 | https://repo.anaconda.com/pkgs/main/noarch/packaging-21.3-pyhd3eb1b0_0.conda
88 | https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.9.2-py38h05f1152_4.conda
89 | https://repo.anaconda.com/pkgs/main/noarch/python-dateutil-2.8.2-pyhd3eb1b0_0.conda
90 | https://conda.anaconda.org/pytorch/linux-64/pytorch-1.9.0-py3.8_cuda10.2_cudnn7.6.5_0.tar.bz2
91 | https://repo.anaconda.com/pkgs/main/linux-64/setuptools-61.2.0-py38h06a4308_0.conda
92 | https://repo.anaconda.com/pkgs/main/linux-64/brotlipy-0.7.0-py38h27cfd23_1003.conda
93 | https://repo.anaconda.com/pkgs/main/linux-64/cryptography-37.0.1-py38h9ce1e76_0.conda
94 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.22.3-py38hf524024_0.conda
95 | https://repo.anaconda.com/pkgs/main/linux-64/pip-21.2.4-py38h06a4308_0.conda
96 | https://repo.anaconda.com/pkgs/main/noarch/pyopenssl-22.0.0-pyhd3eb1b0_0.conda
97 | https://repo.anaconda.com/pkgs/main/linux-64/urllib3-1.26.9-py38h06a4308_0.conda
98 | https://repo.anaconda.com/pkgs/main/noarch/requests-2.27.1-pyhd3eb1b0_0.conda
99 | https://conda.anaconda.org/pytorch/linux-64/torchtext-0.10.0-py38.tar.bz2
100 | https://repo.anaconda.com/pkgs/main/linux-64/bottleneck-1.3.4-py38hce1f21e_0.conda
101 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.5.1-py38h06a4308_1.conda
102 | https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-base-3.5.1-py38ha18d171_1.conda
103 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.3.1-py38hd3c417c_0.conda
104 | https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.2.2-py38h51133e4_0.conda
105 | https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.22.3-py38he7a7128_0.conda
106 | https://repo.anaconda.com/pkgs/main/linux-64/numexpr-2.8.1-py38h6abb31d_0.conda
107 | https://repo.anaconda.com/pkgs/main/linux-64/scipy-1.7.3-py38hc147768_0.conda
108 | https://conda.anaconda.org/pytorch/linux-64/torchaudio-0.9.0-py38.tar.bz2
109 | https://conda.anaconda.org/pytorch/noarch/torchvision-0.2.2-py_3.tar.bz2
110 | https://repo.anaconda.com/pkgs/main/linux-64/pandas-1.4.2-py38h295c915_0.conda
111 | https://repo.anaconda.com/pkgs/main/linux-64/scikit-learn-1.0.2-py38h51133e4_1.conda
112 |
--------------------------------------------------------------------------------
/metrics.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from tqdm import tqdm
3 | from pipeline import translate_sentence
4 | import numpy as np
5 | from sklearn import metrics
6 | import torch
7 | import gc
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def evaluation_report(test_data, SRC, TRG, model, DEVICE):
13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
14 |
15 | modified_flags = []
16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
17 | all_words = sorted(all_words.iloc[:, 0].values)
18 |
19 | for idx, data in enumerate(tqdm(test_data)):
20 | # ------------------------------
21 | if idx % 5000 == 0:
22 | gc.collect()
23 | torch.cuda.empty_cache()
24 | # ------------------------------
25 |
26 | src = data.src
27 | trg = data.trg
28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
29 |
30 | src = ''.join(src)
31 | trg = ''.join(trg)
32 | pred = ''.join(translation)
33 |
34 | erroneous_words.append(src)
35 | correct_words.append(trg)
36 | predicted_words.append(pred)
37 |
38 | if trg == pred:
39 | flags.append(1)
40 | else:
41 | flags.append(0)
42 |
43 | if pred in all_words:
44 | modified_flags.append(1)
45 | else:
46 | modified_flags.append(0)
47 |
48 | evaluation_df = pd.DataFrame({
49 | 'Error': erroneous_words,
50 | 'Predicton': predicted_words,
51 | 'Target': correct_words,
52 | 'Correction': flags
53 | })
54 |
55 | corrected_instances = evaluation_df['Correction'].values.sum()
56 | total_instances = len(evaluation_df)
57 | accuracy = corrected_instances / total_instances
58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
59 |
60 | y_true = np.array(correct_words)
61 | y_pred = np.array(predicted_words)
62 |
63 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
64 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
67 | ACC = metrics.accuracy_score(y_true, y_pred)
68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
69 |
70 | print(f'''
71 | Top-1 (Greedy Decoding)
72 | Precision: {PR:.4f}
73 | Recall: {RE:.4f}
74 | F1 Score: {F1:.4f}
75 | F0.5 Score: {F05:.4f}
76 | Accuracy: {ACC * 100:.2f}%
77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
78 | ''')
79 |
80 | return evaluation_df
81 |
82 |
83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE):
84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
85 | words = []
86 |
87 | modified_flags = []
88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
89 | all_words = sorted(all_words.iloc[:, 0].values)
90 |
91 | for idx, data in enumerate(tqdm(test_data)):
92 | # ------------------------------
93 | if idx % 5000 == 0:
94 | gc.collect()
95 | torch.cuda.empty_cache()
96 | # ------------------------------
97 |
98 | src = data.src
99 | trg = data.trg
100 | word = data.word
101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
102 |
103 | src = ''.join(src)
104 | trg = ''.join(trg)
105 | pred = ''.join(translation)
106 | word = ''.join(word)
107 |
108 | erroneous_words.append(src)
109 | correct_words.append(trg)
110 | predicted_words.append(pred)
111 | words.append(word)
112 |
113 | if trg == pred:
114 | flags.append(1)
115 | else:
116 | flags.append(0)
117 |
118 | if pred in all_words:
119 | modified_flags.append(1)
120 | else:
121 | modified_flags.append(0)
122 |
123 | evaluation_df = pd.DataFrame({
124 | 'Error': erroneous_words, # Error
125 | 'Predicton': predicted_words, # ErrorBlanksPredD1
126 | 'Target': correct_words, # ErrorBlanksActual
127 | 'Word': words, # Word
128 | 'Correction': flags # EBP_Flag_D1
129 | })
130 |
131 | corrected_instances = evaluation_df['Correction'].values.sum()
132 | total_instances = len(evaluation_df)
133 | accuracy = corrected_instances / total_instances
134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
135 |
136 | y_true = np.array(correct_words)
137 | y_pred = np.array(predicted_words)
138 |
139 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
140 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
143 | ACC = metrics.accuracy_score(y_true, y_pred)
144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
145 |
146 | print(f'''
147 | Top-1 (Greedy Decoding)
148 | Precision: {PR:.4f}
149 | Recall: {RE:.4f}
150 | F1 Score: {F1:.4f}
151 | F0.5 Score: {F05:.4f}
152 | Accuracy: {ACC * 100:.2f}%
153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
154 | ''')
155 |
156 | return evaluation_df
157 |
158 |
159 |
160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE):
161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
162 | errors = []
163 | words = []
164 | ebpd1s = []
165 | ebpfd1s = []
166 |
167 | modified_flags = []
168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
169 | all_words = sorted(all_words.iloc[:, 0].values)
170 |
171 | for idx, data in enumerate(tqdm(test_data)):
172 | # ------------------------------
173 | if idx % 5000 == 0:
174 | gc.collect()
175 | torch.cuda.empty_cache()
176 | # ------------------------------
177 |
178 | src = data.src
179 | trg = data.trg
180 | error = data.error
181 | word = data.word
182 | ebpd1 = data.ebpd1
183 | ebpfd1 = data.ebpfd1
184 |
185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
186 |
187 | src = ''.join(src)
188 | trg = ''.join(trg)
189 | pred = ''.join(translation)
190 | error = ''.join(error)
191 | word = ''.join(word)
192 | ebpd1 = ''.join(ebpd1)
193 | ebpfd1 = ''.join(ebpfd1)
194 |
195 | erroneous_words.append(src)
196 | correct_words.append(trg)
197 | predicted_words.append(pred)
198 | errors.append(error)
199 | words.append(word)
200 | ebpd1s.append(ebpd1)
201 | ebpfd1s.append(ebpfd1)
202 |
203 | if trg == pred:
204 | flags.append(1)
205 | else:
206 | flags.append(0)
207 |
208 | if pred in all_words:
209 | modified_flags.append(1)
210 | else:
211 | modified_flags.append(0)
212 |
213 | # evaluation_df = pd.DataFrame({
214 | # 'Error': erroneous_words,
215 | # 'Predicton': predicted_words,
216 | # 'Target': correct_words,
217 | # 'Word': words,
218 | # 'Correction': flags
219 | # })
220 |
221 | evaluation_df = pd.DataFrame({
222 | 'Error': errors,
223 | 'Word': words,
224 | 'ErrorBlanksActual': correct_words,
225 | 'MaskErrorBlank': erroneous_words,
226 | 'ErrorBlanksPredD1': ebpd1s,
227 | 'EBP_Flag_D1': ebpfd1s,
228 | 'ErrorBlanksPredD2': predicted_words,
229 | 'EBP_Flag_D2': flags
230 | })
231 |
232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum()
233 | total_instances = len(evaluation_df)
234 | accuracy = corrected_instances / total_instances
235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
236 |
237 | y_true = np.array(correct_words)
238 | y_pred = np.array(predicted_words)
239 |
240 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
241 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
243 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
244 | ACC = metrics.accuracy_score(y_true, y_pred)
245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
246 |
247 | print(f'''
248 | Top-1 (Greedy Decoding)
249 | Precision: {PR:.4f}
250 | Recall: {RE:.4f}
251 | F1 Score: {F1:.4f}
252 | F0.5 Score: {F05:.4f}
253 | Accuracy: {ACC * 100:.2f}%
254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
255 | ''')
256 |
257 | return evaluation_df
258 |
259 |
260 |
261 |
262 | if __name__ == '__main__':
263 | pass
264 |
265 |
266 |
--------------------------------------------------------------------------------
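Note (not part of the repository): in these reports every distinct target word acts as its own class, so sklearn's weighted-average recall is exactly the word-level exact-match accuracy, while weighted precision and the F-scores can differ when a word is predicted for the wrong inputs. A small self-contained check of that behaviour, using toy words rather than repository data:

import numpy as np
from sklearn import metrics

y_true = np.array(['boi', 'kolom', 'kolom', 'khata'])
y_pred = np.array(['boi', 'kolom', 'khata', 'khata'])

# Exact-match accuracy and weighted recall coincide (0.75 here) ...
print(metrics.accuracy_score(y_true, y_pred))
print(metrics.recall_score(y_true, y_pred, average='weighted'))
# ... while weighted precision does not (0.875 here).
print(metrics.precision_score(y_true, y_pred, average='weighted'))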
/Baselines/DCSpell/metrics.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from tqdm import tqdm
3 | from pipeline import translate_sentence
4 | import numpy as np
5 | from sklearn import metrics
6 | import torch
7 | import gc
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def evaluation_report(test_data, SRC, TRG, model, DEVICE):
13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
14 |
15 | modified_flags = []
16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
17 | all_words = sorted(all_words.iloc[:, 0].values)
18 |
19 | for idx, data in enumerate(tqdm(test_data)):
20 | # ------------------------------
21 | if idx % 5000 == 0:
22 | gc.collect()
23 | torch.cuda.empty_cache()
24 | # ------------------------------
25 |
26 | src = data.src
27 | trg = data.trg
28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
29 |
30 | src = ''.join(src)
31 | trg = ''.join(trg)
32 | pred = ''.join(translation)
33 |
34 | erroneous_words.append(src)
35 | correct_words.append(trg)
36 | predicted_words.append(pred)
37 |
38 | if trg == pred:
39 | flags.append(1)
40 | else:
41 | flags.append(0)
42 |
43 | if pred in all_words:
44 | modified_flags.append(1)
45 | else:
46 | modified_flags.append(0)
47 |
48 | evaluation_df = pd.DataFrame({
49 | 'Error': erroneous_words,
50 | 'Predicton': predicted_words,
51 | 'Target': correct_words,
52 | 'Correction': flags
53 | })
54 |
55 | corrected_instances = evaluation_df['Correction'].values.sum()
56 | total_instances = len(evaluation_df)
57 | accuracy = corrected_instances / total_instances
58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
59 |
60 | y_true = np.array(correct_words)
61 | y_pred = np.array(predicted_words)
62 |
63 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
64 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
67 | ACC = metrics.accuracy_score(y_true, y_pred)
68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
69 |
70 | print(f'''
71 | Top-1 (Greedy Decoding)
72 | Precision: {PR:.4f}
73 | Recall: {RE:.4f}
74 | F1 Score: {F1:.4f}
75 | F0.5 Score: {F05:.4f}
76 | Accuracy: {ACC * 100:.2f}%
77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
78 | ''')
79 |
80 | return evaluation_df
81 |
82 |
83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE):
84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
85 | words = []
86 |
87 | modified_flags = []
88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
89 | all_words = sorted(all_words.iloc[:, 0].values)
90 |
91 | for idx, data in enumerate(tqdm(test_data)):
92 | # ------------------------------
93 | if idx % 5000 == 0:
94 | gc.collect()
95 | torch.cuda.empty_cache()
96 | # ------------------------------
97 |
98 | src = data.src
99 | trg = data.trg
100 | word = data.word
101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
102 |
103 | src = ''.join(src)
104 | trg = ''.join(trg)
105 | pred = ''.join(translation)
106 | word = ''.join(word)
107 |
108 | erroneous_words.append(src)
109 | correct_words.append(trg)
110 | predicted_words.append(pred)
111 | words.append(word)
112 |
113 | if trg == pred:
114 | flags.append(1)
115 | else:
116 | flags.append(0)
117 |
118 | if pred in all_words:
119 | modified_flags.append(1)
120 | else:
121 | modified_flags.append(0)
122 |
123 | evaluation_df = pd.DataFrame({
124 | 'Error': erroneous_words, # Error
125 | 'Predicton': predicted_words, # ErrorBlanksPredD1
126 | 'Target': correct_words, # ErrorBlanksActual
127 | 'Word': words, # Word
128 | 'Correction': flags # EBP_Flag_D1
129 | })
130 |
131 | corrected_instances = evaluation_df['Correction'].values.sum()
132 | total_instances = len(evaluation_df)
133 | accuracy = corrected_instances / total_instances
134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
135 |
136 | y_true = np.array(correct_words)
137 | y_pred = np.array(predicted_words)
138 |
139 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
140 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
143 | ACC = metrics.accuracy_score(y_true, y_pred)
144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
145 |
146 | print(f'''
147 | Top-1 (Greedy Decoding)
148 | Precision: {PR:.4f}
149 | Recall: {RE:.4f}
150 | F1 Score: {F1:.4f}
151 | F0.5 Score: {F05:.4f}
152 | Accuracy: {ACC * 100:.2f}%
153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
154 | ''')
155 |
156 | return evaluation_df
157 |
158 |
159 |
160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE):
161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
162 | errors = []
163 | words = []
164 | ebpd1s = []
165 | ebpfd1s = []
166 |
167 | modified_flags = []
168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
169 | all_words = sorted(all_words.iloc[:, 0].values)
170 |
171 | for idx, data in enumerate(tqdm(test_data)):
172 | # ------------------------------
173 | if idx % 5000 == 0:
174 | gc.collect()
175 | torch.cuda.empty_cache()
176 | # ------------------------------
177 |
178 | src = data.src
179 | trg = data.trg
180 | error = data.error
181 | word = data.word
182 | ebpd1 = data.ebpd1
183 | ebpfd1 = data.ebpfd1
184 |
185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
186 |
187 | src = ''.join(src)
188 | trg = ''.join(trg)
189 | pred = ''.join(translation)
190 | error = ''.join(error)
191 | word = ''.join(word)
192 | ebpd1 = ''.join(ebpd1)
193 | ebpfd1 = ''.join(ebpfd1)
194 |
195 | erroneous_words.append(src)
196 | correct_words.append(trg)
197 | predicted_words.append(pred)
198 | errors.append(error)
199 | words.append(word)
200 | ebpd1s.append(ebpd1)
201 | ebpfd1s.append(ebpfd1)
202 |
203 | if trg == pred:
204 | flags.append(1)
205 | else:
206 | flags.append(0)
207 |
208 | if pred in all_words:
209 | modified_flags.append(1)
210 | else:
211 | modified_flags.append(0)
212 |
213 | # evaluation_df = pd.DataFrame({
214 | # 'Error': erroneous_words,
215 | # 'Predicton': predicted_words,
216 | # 'Target': correct_words,
217 | # 'Word': words,
218 | # 'Correction': flags
219 | # })
220 |
221 | evaluation_df = pd.DataFrame({
222 | 'Error': errors,
223 | 'Word': words,
224 | 'ErrorBlanksActual': correct_words,
225 | 'MaskErrorBlank': erroneous_words,
226 | 'ErrorBlanksPredD1': ebpd1s,
227 | 'EBP_Flag_D1': ebpfd1s,
228 | 'ErrorBlanksPredD2': predicted_words,
229 | 'EBP_Flag_D2': flags
230 | })
231 |
232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum()
233 | total_instances = len(evaluation_df)
234 | accuracy = corrected_instances / total_instances
235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
236 |
237 | y_true = np.array(correct_words)
238 | y_pred = np.array(predicted_words)
239 |
240 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
241 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
243 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
244 | ACC = metrics.accuracy_score(y_true, y_pred)
245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
246 |
247 | print(f'''
248 | Top-1 (Greedy Decoding)
249 | Precision: {PR:.4f}
250 | Recall: {RE:.4f}
251 | F1 Score: {F1:.4f}
252 | F0.5 Score: {F05:.4f}
253 | Accuracy: {ACC * 100:.2f}%
254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
255 | ''')
256 |
257 | return evaluation_df
258 |
259 |
260 |
261 |
262 | if __name__ == '__main__':
263 | pass
264 |
265 |
266 |
--------------------------------------------------------------------------------
/Baselines/DTransformer/metrics.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from tqdm import tqdm
3 | from pipeline import translate_sentence
4 | import numpy as np
5 | from sklearn import metrics
6 | import torch
7 | import gc
8 | import warnings as wrn
9 | wrn.filterwarnings('ignore')
10 |
11 |
12 | def evaluation_report(test_data, SRC, TRG, model, DEVICE):
13 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
14 |
15 | modified_flags = []
16 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
17 | all_words = sorted(all_words.iloc[:, 0].values)
18 |
19 | for idx, data in enumerate(tqdm(test_data)):
20 | # ------------------------------
21 | if idx % 5000 == 0:
22 | gc.collect()
23 | torch.cuda.empty_cache()
24 | # ------------------------------
25 |
26 | src = data.src
27 | trg = data.trg
28 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
29 |
30 | src = ''.join(src)
31 | trg = ''.join(trg)
32 | pred = ''.join(translation)
33 |
34 | erroneous_words.append(src)
35 | correct_words.append(trg)
36 | predicted_words.append(pred)
37 |
38 | if trg == pred:
39 | flags.append(1)
40 | else:
41 | flags.append(0)
42 |
43 | if pred in all_words:
44 | modified_flags.append(1)
45 | else:
46 | modified_flags.append(0)
47 |
48 | evaluation_df = pd.DataFrame({
49 | 'Error': erroneous_words,
50 | 'Predicton': predicted_words,
51 | 'Target': correct_words,
52 | 'Correction': flags
53 | })
54 |
55 | corrected_instances = evaluation_df['Correction'].values.sum()
56 | total_instances = len(evaluation_df)
57 | accuracy = corrected_instances / total_instances
58 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
59 |
60 | y_true = np.array(correct_words)
61 | y_pred = np.array(predicted_words)
62 |
63 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
64 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
65 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
66 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
67 | ACC = metrics.accuracy_score(y_true, y_pred)
68 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
69 |
70 | print(f'''
71 | Top-1 (Greedy Decoding)
72 | Precision: {PR:.4f}
73 | Recall: {RE:.4f}
74 | F1 Score: {F1:.4f}
75 | F0.5 Score: {F05:.4f}
76 | Accuracy: {ACC * 100:.2f}%
77 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
78 | ''')
79 |
80 | return evaluation_df
81 |
82 |
83 | def evaluation_report2(test_data, SRC, TRG, WORD, model, DEVICE):
84 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
85 | words = []
86 |
87 | modified_flags = []
88 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
89 | all_words = sorted(all_words.iloc[:, 0].values)
90 |
91 | for idx, data in enumerate(tqdm(test_data)):
92 | # ------------------------------
93 | if idx % 5000 == 0:
94 | gc.collect()
95 | torch.cuda.empty_cache()
96 | # ------------------------------
97 |
98 | src = data.src
99 | trg = data.trg
100 | word = data.word
101 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
102 |
103 | src = ''.join(src)
104 | trg = ''.join(trg)
105 | pred = ''.join(translation)
106 | word = ''.join(word)
107 |
108 | erroneous_words.append(src)
109 | correct_words.append(trg)
110 | predicted_words.append(pred)
111 | words.append(word)
112 |
113 | if trg == pred:
114 | flags.append(1)
115 | else:
116 | flags.append(0)
117 |
118 | if pred in all_words:
119 | modified_flags.append(1)
120 | else:
121 | modified_flags.append(0)
122 |
123 | evaluation_df = pd.DataFrame({
124 | 'Error': erroneous_words, # Error
125 | 'Predicton': predicted_words, # ErrorBlanksPredD1
126 | 'Target': correct_words, # ErrorBlanksActual
127 | 'Word': words, # Word
128 | 'Correction': flags # EBP_Flag_D1
129 | })
130 |
131 | corrected_instances = evaluation_df['Correction'].values.sum()
132 | total_instances = len(evaluation_df)
133 | accuracy = corrected_instances / total_instances
134 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
135 |
136 | y_true = np.array(correct_words)
137 | y_pred = np.array(predicted_words)
138 |
139 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
140 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
141 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
142 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
143 | ACC = metrics.accuracy_score(y_true, y_pred)
144 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
145 |
146 | print(f'''
147 | Top-1 (Greedy Decoding)
148 | Precision: {PR:.4f}
149 | Recall: {RE:.4f}
150 | F1 Score: {F1:.4f}
151 | F0.5 Score: {F05:.4f}
152 | Accuracy: {ACC * 100:.2f}%
153 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
154 | ''')
155 |
156 | return evaluation_df
157 |
158 |
159 |
160 | def evaluation_report3(test_data, SRC, TRG, ERROR, WORD, EBPD1, EBPFD1, model, DEVICE):
161 | erroneous_words, predicted_words, correct_words, flags = [], [], [], []
162 | errors = []
163 | words = []
164 | ebpd1s = []
165 | ebpfd1s = []
166 |
167 | modified_flags = []
168 | all_words = pd.read_csv('./Dataset/allDictWords_df.csv')
169 | all_words = sorted(all_words.iloc[:, 0].values)
170 |
171 | for idx, data in enumerate(tqdm(test_data)):
172 | # ------------------------------
173 | if idx % 5000 == 0:
174 | gc.collect()
175 | torch.cuda.empty_cache()
176 | # ------------------------------
177 |
178 | src = data.src
179 | trg = data.trg
180 | error = data.error
181 | word = data.word
182 | ebpd1 = data.ebpd1
183 | ebpfd1 = data.ebpfd1
184 |
185 | translation, attention = translate_sentence(src, SRC, TRG, model, DEVICE)
186 |
187 | src = ''.join(src)
188 | trg = ''.join(trg)
189 | pred = ''.join(translation)
190 | error = ''.join(error)
191 | word = ''.join(word)
192 | ebpd1 = ''.join(ebpd1)
193 | ebpfd1 = ''.join(ebpfd1)
194 |
195 | erroneous_words.append(src)
196 | correct_words.append(trg)
197 | predicted_words.append(pred)
198 | errors.append(error)
199 | words.append(word)
200 | ebpd1s.append(ebpd1)
201 | ebpfd1s.append(ebpfd1)
202 |
203 | if trg == pred:
204 | flags.append(1)
205 | else:
206 | flags.append(0)
207 |
208 | if pred in all_words:
209 | modified_flags.append(1)
210 | else:
211 | modified_flags.append(0)
212 |
213 | # evaluation_df = pd.DataFrame({
214 | # 'Error': erroneous_words,
215 | # 'Predicton': predicted_words,
216 | # 'Target': correct_words,
217 | # 'Word': words,
218 | # 'Correction': flags
219 | # })
220 |
221 | evaluation_df = pd.DataFrame({
222 | 'Error': errors,
223 | 'Word': words,
224 | 'ErrorBlanksActual': correct_words,
225 | 'MaskErrorBlank': erroneous_words,
226 | 'ErrorBlanksPredD1': ebpd1s,
227 | 'EBP_Flag_D1': ebpfd1s,
228 | 'ErrorBlanksPredD2': predicted_words,
229 | 'EBP_Flag_D2': flags
230 | })
231 |
232 | corrected_instances = evaluation_df['EBP_Flag_D2'].values.sum()
233 | total_instances = len(evaluation_df)
234 | accuracy = corrected_instances / total_instances
235 | print(f"\nCorrection/Total: {corrected_instances} / {total_instances}")
236 |
237 | y_true = np.array(correct_words)
238 | y_pred = np.array(predicted_words)
239 |
240 | PR = metrics.precision_score(y_true, y_pred, average='weighted')
241 | RE = metrics.recall_score(y_true, y_pred, average='weighted')
242 | F1 = metrics.f1_score(y_true, y_pred, average='weighted')
243 | F05 = metrics.fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
244 | ACC = metrics.accuracy_score(y_true, y_pred)
245 | MODIFIED_ACC = np.sum(modified_flags) / len(modified_flags)
246 |
247 | print(f'''
248 | Top-1 (Greedy Decoding)
249 | Precision: {PR:.4f}
250 | Recall: {RE:.4f}
251 | F1 Score: {F1:.4f}
252 | F0.5 Score: {F05:.4f}
253 | Accuracy: {ACC * 100:.2f}%
254 | Modified Accuracy: {MODIFIED_ACC * 100:.2f}%
255 | ''')
256 |
257 | return evaluation_df
258 |
259 |
260 |
261 |
262 | if __name__ == '__main__':
263 | pass
264 |
265 |
266 |
--------------------------------------------------------------------------------
/detector.py:
--------------------------------------------------------------------------------
1 | from utils import (
2 | word2char, basic_tokenizer, count_parameters, initialize_weights,
3 | save_model, load_model, error_df, train_valid_test_df, mask2str,
4 | error_df_2, error_df_3, merge_dfs
5 | )
6 | from transformer import (
7 | Encoder, EncoderLayer, MultiHeadAttentionLayer,
8 | PositionwiseFeedforwardLayer, Decoder, DecoderLayer,
9 | Seq2Seq
10 | )
11 | from pipeline import train, evaluate
12 | from metrics import evaluation_report, evaluation_report2
13 |
14 | import pandas as pd
15 | from sklearn.model_selection import train_test_split
16 | from torchtext.legacy.data import Field, TabularDataset, BucketIterator
17 | import torch
18 | import torch.nn as nn
19 | import os
20 | import gc
21 | import sys
22 | import argparse
23 |
24 | import warnings as wrn
25 | wrn.filterwarnings('ignore')
26 |
27 |
28 | def main():
29 | parser = argparse.ArgumentParser()
30 | parser.add_argument("--CORPUS", help="Path of the Corpus", type=str, default="./Dataset/corpus.csv",
31 | choices=[
32 | "./Dataset/corpus.csv", # Bangla SEC parallel corpus
33 | "./Dataset/corpus2.csv", # Bangla SEC parallel corpus for running test
34 | "./Dataset/Hindi/corpus_hindi.csv",
35 | "./Dataset/Telugu/corpus_telugu.csv",
36 | "./Dataset/Hindi/corpus_hindi_enhanced.csv",
37 | "./Dataset/Telugu/corpus_telugu_enhanced.csv"
38 | ]
39 | )
40 | parser.add_argument("--HID_DIM", help="Hidden Dimension", type=int, default=128, choices=[64, 128, 256])
41 | parser.add_argument("--ENC_LAYERS", help="Number of Encoder Layers", type=int, default=3, choices=[3, 5, 7])
42 | parser.add_argument("--DEC_LAYERS", help="Number of Decoder Layers", type=int,default=3, choices=[3, 5, 7])
43 | parser.add_argument("--ENC_HEADS", help="Number of Encoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
44 | parser.add_argument("--DEC_HEADS", help="Number of Decoder Attention Heades", type=int, default=8, choices=[4, 6, 8])
45 | parser.add_argument("--ENC_PF_DIM", help="Encoder PF Dimension", type=int, default=256, choices=[64, 128, 256])
46 | parser.add_argument("--DEC_PF_DIM", help="Decoder PF Dimesnion", type=int, default=256, choices=[64, 128, 256])
47 | parser.add_argument("--ENC_DROPOUT", help="Encoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
48 | parser.add_argument("--DEC_DROPOUT", help="Decoder Dropout Ratio", type=float, default=0.1, choices=[0.1, 0.2, 0.5])
49 | parser.add_argument("--CLIP", help="Gradient Clipping at", type=float, default=1, choices=[.1, 1, 10])
50 | parser.add_argument("--N_EPOCHS", help="Number of Epochs", type=int, default=100)
51 | parser.add_argument("--LEARNING_RATE", help="Learning Rate", type=float, default=0.0005, choices=[0.0005, 0.00005, 0.000005])
52 | args = parser.parse_args()
53 |
54 | SEED = 1234
55 | torch.manual_seed(SEED)
56 | torch.cuda.manual_seed(SEED)
57 |
58 | # df = pd.read_csv('./Dataset/sec_dataset_III_v3_new_masked_b.csv')
59 | # df = pd.read_csv('./Dataset/corpus.csv')
60 | df = pd.read_csv(args.CORPUS)
61 | df['Word'] = df['Word'].apply(word2char)
62 | df['Error'] = df['Error'].apply(word2char)
63 | df['Mask'] = df['Mask'].apply(mask2str)
64 | df['Mask'] = df['Mask'].apply(word2char)
65 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(mask2str)
66 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(word2char)
67 | df = df.sample(frac=1).reset_index(drop=True)
68 | # df = df.iloc[:, [4, 1, 2]]
69 | df = df[['ErrorBlanks', 'Error', 'ErrorType']]
70 |
71 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=0.15, valid_size=0.05)
72 |
73 | train_df.to_csv('./Dataset/train.csv', index=False)
74 | valid_df.to_csv('./Dataset/valid.csv', index=False)
75 | test_df.to_csv('./Dataset/test.csv', index=False)
76 |
77 | SRC = Field(
78 | tokenize=basic_tokenizer, lower=False,
79 | init_token='<sos>', eos_token='<eos>', batch_first=True
80 | )
81 | TRG = Field(
82 | tokenize=basic_tokenizer, lower=False,
83 | init_token='<sos>', eos_token='<eos>', batch_first=True
84 | )
85 | WORD = Field(
86 | tokenize=basic_tokenizer, lower=False,
87 | init_token='<sos>', eos_token='<eos>', batch_first=True
88 | )
89 | fields = {
90 | 'Error': ('src', SRC),
91 | 'ErrorBlanks': ('trg', TRG)
92 | }
93 |
94 | train_data, valid_data, test_data = TabularDataset.splits(
95 | path='./Dataset',
96 | train='train.csv',
97 | validation='valid.csv',
98 | test='test.csv',
99 | format='csv',
100 | fields=fields
101 | )
102 |
103 | SRC.build_vocab(train_data, min_freq=100)
104 | TRG.build_vocab(train_data, min_freq=50)
105 | WORD.build_vocab(train_data, min_freq=100)
106 |
107 | # ------------------------------
108 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
109 | BATCH_SIZE = 512
110 | # ------------------------------
111 | INPUT_DIM = len(SRC.vocab)
112 | OUTPUT_DIM = len(TRG.vocab)
113 | # ------------------------------
114 | HID_DIM = int(args.HID_DIM)
115 | ENC_LAYERS = int(args.ENC_LAYERS)
116 | DEC_LAYERS = int(args.DEC_LAYERS)
117 | ENC_HEADS = int(args.ENC_HEADS)
118 | DEC_HEADS = int(args.DEC_HEADS)
119 | ENC_PF_DIM = int(args.ENC_PF_DIM)
120 | DEC_PF_DIM = int(args.DEC_PF_DIM)
121 | ENC_DROPOUT = float(args.ENC_DROPOUT)
122 | DEC_DROPOUT = float(args.DEC_DROPOUT)
123 | CLIP = float(args.CLIP)
124 | N_EPOCHS = int(args.N_EPOCHS)
125 | LEARNING_RATE = float(args.LEARNING_RATE)
126 | # ------------------------------
127 | PATH = './Checkpoints/detector.pth'
128 | # ------------------------------
129 | gc.collect()
130 | torch.cuda.empty_cache()
131 | # -----------------------------
132 |
133 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
134 | (train_data, valid_data, test_data),
135 | batch_size=BATCH_SIZE,
136 | sort_within_batch=True,
137 | sort_key=lambda x: len(x.src),
138 | device=DEVICE
139 | )
140 |
141 | enc = Encoder(
142 | INPUT_DIM, HID_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM,
143 | ENC_DROPOUT, DEVICE
144 | )
145 | dec = Decoder(
146 | OUTPUT_DIM, HID_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM,
147 | DEC_DROPOUT, DEVICE
148 | )
149 |
150 | SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
151 | TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
152 |
153 | model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, DEVICE).to(DEVICE)
154 | model.apply(initialize_weights)
155 | # print(f'The model has {count_parameters(model):,} trainable parameters')
156 |
157 | optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
158 | criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
159 |
160 | epoch = 1
161 | best_loss = 1e10
162 | if os.path.exists(PATH):
163 | checkpoint, epoch, train_loss = load_model(model, PATH)
164 | best_loss = train_loss
165 |
166 | for epoch in range(epoch, N_EPOCHS):
167 | print(f"Epoch: {epoch} / {N_EPOCHS}")
168 | train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
169 | print(f"Train Loss: {train_loss:.4f}")
170 | if train_loss < best_loss:
171 | best_loss = train_loss
172 | save_model(model, train_loss, epoch, PATH)
173 |
174 | # ---------------------
175 | # eval_df = evaluation_report(test_data, SRC, TRG, model, DEVICE)
176 | # ---------------------
177 | error_types = [
178 | 'Homonym Error', # 123
179 | 'Typo Deletion', # 115767
180 | 'Typo (Avro) Substituition', # 119573
181 | 'Typo (Bijoy) Substituition', # 119864
182 | 'Cognitive Error', # 108227
183 | 'Run-on Error', # 124895
184 | 'Split-word Error (Left)', # 62890
185 | 'Split-word Error (Random)', # 124895
186 | 'Split-word Error (Right)', # 13985
187 | 'Split-word Error (both)', # 12800
188 | 'Typo Insertion', # 124807
189 | 'Typo Transposition', # 123245
190 | 'Visual Error', # 117391
191 | 'Visual Error (Combined Character)' # 17617
192 | ]
193 | # ---------------------
194 | # df = pd.read_csv('./Dataset/sec_dataset_III_v3_new_masked_b.csv')
195 | df = pd.read_csv('./Dataset/corpus.csv')
196 | df['Word'] = df['Word'].apply(word2char)
197 | df['Error'] = df['Error'].apply(word2char)
198 | df['Mask'] = df['Mask'].apply(mask2str)
199 | df['Mask'] = df['Mask'].apply(word2char)
200 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(mask2str)
201 | df['ErrorBlanks'] = df['ErrorBlanks'].apply(word2char)
202 | df = df.sample(frac=1).reset_index(drop=True)
203 | # df = df.iloc[:, [0, 1, -2, 2]]
204 | df = df[['Word', 'Error', 'ErrorBlanks', 'ErrorType']]
205 |
206 | train_df, valid_df, test_df = train_valid_test_df(df, test_size=1./1e10, valid_size=1./1e10)
207 |
208 | train_df.to_csv('./Dataset/train.csv', index=False)
209 | valid_df.to_csv('./Dataset/valid.csv', index=False)
210 | test_df.to_csv('./Dataset/test.csv', index=False)
211 | # ---------------------
212 | for error_name in error_types:
213 | print(f'------\nError Type: {error_name}\n------')
214 | error_df_3(df, error_name)
215 |
216 | fields = {
217 | 'Error': ('src', SRC),
218 | 'ErrorBlanks': ('trg', TRG),
219 | 'Word': ('word', WORD)
220 | }
221 |
222 | error_data, _ = TabularDataset.splits(
223 | path='./Dataset',
224 | train='error.csv',
225 | test='error.csv',
226 | format='csv',
227 | fields=fields
228 | )
229 |
230 | eval_df = evaluation_report2(error_data, SRC, TRG, WORD, model, DEVICE)
231 | eval_df['ErrorType'] = [error_name for _ in range(len(eval_df))]
232 |
233 | error_name = error_name.replace(' ', '').replace('(', '').replace(')', '')
234 | eval_df.to_csv(f'./Dataframes/detector_{error_name}.csv', index=False)
235 | print('\n\n')
236 | # ---------------------
237 | merge_dfs(network='detector')
238 | # ---------------------
239 |
240 |
241 | if __name__ == '__main__':
242 | main()
243 |
--------------------------------------------------------------------------------
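Note (not part of the repository): detector.py, like the other entry points, resumes from ./Checkpoints/detector.pth through the save_model/load_model helpers imported from utils.py. A minimal sketch of the checkpoint contract those calls assume (the exact keys used in utils.py may differ):

import torch

def save_model(model, train_loss, epoch, path):
    # Persist the weights together with the bookkeeping values needed to resume.
    torch.save({
        'model_state_dict': model.state_dict(),
        'epoch': epoch,
        'train_loss': train_loss
    }, path)

def load_model(model, path):
    # Restore the weights in place and hand back (checkpoint, epoch, train_loss),
    # matching the unpacking pattern used in main().
    checkpoint = torch.load(path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    return checkpoint, checkpoint['epoch'], checkpoint['train_loss']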