├── code ├── baselines │ ├── classic_models │ │ └── Readme.txt │ └── neural_models │ │ ├── BiGRU.py │ │ ├── CNN.py │ │ └── char-cnn.py ├── cleaning │ └── clean_dataset.py └── embeddings │ └── word2vec_training.py ├── stopwords ├── Kirundi │ ├── Kirundi stopwords set.txt │ └── listed.txt └── Kinyarwanda │ ├── Kinyarwanda stopwords set.txt │ └── listed.txt ├── LICENSE └── README.md /code/baselines/classic_models/Readme.txt: -------------------------------------------------------------------------------- 1 | All of the classic baseline models (MNB, LR, and SVM) are implemented with the help of the scikit-learn framework and use its default hyperparameters and Term Frequency Inverse Document Frequency (TFIDF) to get the values of unigram input features. For other hyperparameter settings, please, refer to the original paper. -------------------------------------------------------------------------------- /stopwords/Kirundi/Kirundi stopwords set.txt: -------------------------------------------------------------------------------- 1 | stopset_kir = {'aba','abo','aho','ari','ata','ati','ayo','ba','bari','bo','bose','bw','bwa','bwo','ca','cane','co','de', 2 | 'ico','iryo','ivyo','iyo','izo','ko','ku','kuri','kuva','kw','maze','mu','muri','mw','na','naho','nayo', 3 | 'ngo','ni','nk','no','rero','rw','ry','rya','ubu','uko','uri','uwo','uyu','vy','vya','vyo','wa','wo', 4 | 'ya','yari','yo','yose','za','zo'} -------------------------------------------------------------------------------- /stopwords/Kirundi/listed.txt: -------------------------------------------------------------------------------- 1 | aba 2 | abo 3 | aho 4 | ari 5 | ata 6 | ati 7 | ayo 8 | ba 9 | bari 10 | bo 11 | bose 12 | bw 13 | bwa 14 | bwo 15 | ca 16 | cane 17 | co 18 | de 19 | ico 20 | iryo 21 | ivyo 22 | iyo 23 | izo 24 | ko 25 | ku 26 | kuri 27 | kuva 28 | kw 29 | maze 30 | mu 31 | muri 32 | mw 33 | na 34 | naho 35 | nayo 36 | ngo 37 | ni 38 | nk 39 | no 40 | rero 41 | rw 42 | ry 43 | rya 44 | ubu 45 | uko 46 | uri 47 | uwo 48 | uyu 49 | vy 50 | vya 51 | vyo 52 | wa 53 | wo 54 | ya 55 | yari 56 | yo 57 | yose 58 | za 59 | zo 60 | -------------------------------------------------------------------------------- /stopwords/Kinyarwanda/Kinyarwanda stopwords set.txt: -------------------------------------------------------------------------------- 1 | stopset_kin = {'aba', 'abo', 'aha', 'aho', 'ari', 'ati', 'aya', 'ayo', 'ba', 'baba', 'babo', 'bari', 'be', 'bo', 'bose', 2 | 'bw', 'bwa', 'bwo', 'by', 'bya', 'byo', 'cy', 'cya', 'cyo', 'hafi', 'ibi', 'ibyo', 'icyo', 'iki', 3 | 'imwe', 'iri', 'iyi', 'iyo', 'izi', 'izo', 'ka', 'ko', 'ku', 'kuri', 'kuva', 'kwa', 'maze', 'mu', 'muri', 4 | 'na', 'naho','nawe', 'ngo', 'ni', 'niba', 'nk', 'nka', 'no', 'nta', 'nuko', 'rero', 'rw', 'rwa', 'rwo', 'ry', 5 | 'rya','ubu', 'ubwo', 'uko', 'undi', 'uri', 'uwo', 'uyu', 'wa', 'wari', 'we', 'wo', 'ya', 'yabo', 'yari', 'ye', 6 | 'yo', 'yose', 'za', 'zo'} -------------------------------------------------------------------------------- /stopwords/Kinyarwanda/listed.txt: -------------------------------------------------------------------------------- 1 | aba 2 | abo 3 | aha 4 | aho 5 | ari 6 | ati 7 | aya 8 | ayo 9 | ba 10 | baba 11 | babo 12 | bari 13 | be 14 | bo 15 | bose 16 | bw 17 | bwa 18 | bwo 19 | by 20 | bya 21 | byo 22 | cy 23 | cya 24 | cyo 25 | hafi 26 | ibi 27 | ibyo 28 | icyo 29 | iki 30 | imwe 31 | iri 32 | iyi 33 | iyo 34 | izi 35 | izo 36 | ka 37 | ko 38 | ku 39 | kuri 40 | kuva 41 | kwa 42 | maze 43 | mu 44 | muri 45 | na 46 | naho 47 | nawe 
48 | ngo 49 | ni 50 | niba 51 | nk 52 | nka 53 | no 54 | nta 55 | nuko 56 | rero 57 | rw 58 | rwa 59 | rwo 60 | ry 61 | rya 62 | ubu 63 | ubwo 64 | uko 65 | undi 66 | uri 67 | uwo 68 | uyu 69 | wa 70 | wari 71 | we 72 | wo 73 | ya 74 | yabo 75 | yari 76 | ye 77 | yo 78 | yose 79 | za 80 | zo 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Andrew 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code/cleaning/clean_dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from kkltk.kin_kir_stopwords import stopwords # check https://github.com/Andrews2017/kkltk for more detailed information about how to use kkltk package 3 | 4 | stopset_kin = stopwords.words('kinyarwanda') 5 | 6 | # loading the data 7 | data = pd.read_csv('../data/KINNEWS/raw/train.csv') 8 | 9 | # Cleaning the data (preprocessing) 10 | # Removing the special characters and urls 11 | data.title = data.title.str.replace('[^A-Za-z\s\’\-]+', '') 12 | data.content = data.content.str.replace('[^A-Za-z\s\’\-]+', '') 13 | data.title = data.title.str.replace('[\n]+', '') 14 | data.content = data.content.str.replace('[\n]+', '') 15 | data.title = data.title.str.replace('^https?:\/\/.*[\r\n]*', '') 16 | data.content = data.content.str.replace('^https?:\/\/.*[\r\n]*', '') 17 | 18 | # Removing the stopwords 19 | data['title'] = data['title'].apply(lambda x: ' '.join([item.lower() for item in str(x).split() if item not in stopset_kin])) 20 | data['content'] = data['content'].apply(lambda x: ' '.join([item.lower() for item in str(x).split() if item not in stopset_kin])) 21 | 22 | # Save the cleaned dataset 23 | data.to_csv("../data/KINNEWS/cleaned/train.csv", index=False) 24 | -------------------------------------------------------------------------------- /code/embeddings/word2vec_training.py: -------------------------------------------------------------------------------- 1 | from gensim.models import Word2Vec 2 | import pandas as pd 3 | 4 | # load the data 5 | data_train = pd.read_csv('../data/KINNEWS/cleaned/train.csv') 6 | data_test = pd.read_csv('../data/KINNEWS/cleaned/test.csv') 7 | data = pd.concat([data_train, data_test]) 8 | data['whole_doc'] = data['title'] + ' ' + 
data['content'].astype(str)
9 | 
10 | # clean the data (preprocessing)
11 | data.whole_doc = data.whole_doc.str.replace('[^A-Za-z\s\’\-]+', '')
12 | data.whole_doc = data.whole_doc.str.replace('[\n]+', '')
13 | data.whole_doc = data.whole_doc.str.replace('^https?:\/\/.*[\r\n]*', '')
14 | 
15 | # Build the list-of-lists corpus format that gensim expects
16 | sent = [row.split(' ') for row in data['whole_doc'] if len(row)]
17 | sent = [[tok.lower() for tok in sub_sent if len(tok) != 0] for sub_sent in sent]
18 | 
19 | # Train a skip-gram Word2Vec model (sg=1) with 50-dimensional vectors
20 | w2v_model = Word2Vec(sent, window=5, min_count=5, sg=1, hs=1, vector_size=50)
21 | 
22 | # Build one "token v1 v2 ... v50" line per vocabulary word so the embeddings can be reloaded later (e.g. with torchtext's vocab.Vectors)
23 | w2v_vectors = []
24 | for token in w2v_model.wv.key_to_index:
25 |     str_vec = token
26 |     for value in w2v_model.wv[token]:
27 |         str_vec += ' ' + str(value)
28 |     w2v_vectors.append(str_vec)
29 | 
30 | # Save the embeddings to a plain-text file (one token and its vector per line)
31 | with open("../pre-trained_embeddings/Kinyarwanda/W2V-Kin-50.txt", 'w') as output:
32 |     for row in w2v_vectors:
33 |         output.write(str(row) + '\n')
34 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # KINNEWS-and-KIRNEWS
2 | Data, embeddings, stopword lists, code, and baselines for the [COLING 2020](https://coling2020.org/) paper ["KINNEWS and KIRNEWS: Benchmarking Cross-Lingual Text Classification for Kinyarwanda and Kirundi"](https://www.aclweb.org/anthology/2020.coling-main.480/) by [Rubungo Andre Niyongabo](https://scholar.google.com/citations?user=5qnTWQEAAAAJ&hl=en), [Hong Qu](https://scholar.google.com/citations?user=Aiq9mFMAAAAJ&hl=en), [Julia Kreutzer](https://scholar.google.co.uk/citations?user=j4cOSzAAAAAJ&hl=en), and Li Huang.
3 | 
4 | This paper introduces Kinyarwanda and Kirundi news classification datasets (KINNEWS and KIRNEWS, respectively), both collected from Rwandan and Burundian news websites and newspapers, for low-resource monolingual and cross-lingual multiclass classification tasks. Along with the datasets, we provide statistics, preprocessing guidelines, pretrained word embeddings, and monolingual and cross-lingual baseline models.
5 | 
6 | **Note:** When using any of the resources provided here, please remember to [cite](https://www.aclweb.org/anthology/2020.coling-main.480.bib) our paper.
7 | 
8 | ## Data
9 | ### Download the datasets
10 | - The raw and cleaned versions of KINNEWS can be downloaded from [here](https://drive.google.com/drive/folders/1zxn0hgrOLlUsK5V0c7l71eAj1t2jiyox?usp=sharing) (21,268 articles, 14 classes, 45.07MB (raw) and 38.85MB (cleaned))
11 | - The raw and cleaned versions of KIRNEWS can be downloaded from [here](https://drive.google.com/uc?export=download&id=1-53VQFOHqBeoX2JiN01X1Sxgfh78ckru) (4,612 articles, 12 classes, 9.31MB (raw) and 7.77MB (cleaned))
12 | 
13 | ### Datasets description
14 | Each dataset is in comma-separated values (CSV) format, with the columns described below (note that the cleaned versions retain only the 'label', 'title', and 'content' columns):
15 | | Field | Description |
16 | | ----- | ----------- |
17 | | label | Numerical labels that range from 1 to 14 |
18 | | en_label | English labels |
19 | | kin_label | Kinyarwanda labels |
20 | | kir_label | Kirundi labels |
21 | | url | The link to the news source |
22 | | title | The title of the news article |
23 | | content | The full content of the news article |
24 | 
25 | ## Word embeddings
26 | ### Download pre-trained word embeddings
27 | - The Kinyarwanda embeddings can be downloaded from [here](https://drive.google.com/uc?export=download&id=1-DJuVhD-8YxxOP9CBxzAJeoMTA5An2Xw) (59.88MB for 100d
28 | and 29.94MB for 50d)
29 | - The Kirundi embeddings can be downloaded from [here](https://drive.google.com/uc?export=download&id=1-BlECMnNPBIVspfBF-qccKQ1LeiEQ7JO) (17.98MB for 100d and 8.96MB for 50d)
30 | 
31 | ### Training your own embeddings
32 | To train your own word vectors, check out the [code/embeddings/word2vec_training.py](https://github.com/Andrews2017/KINNEWS-and-KIRNEWS/tree/main/code/embeddings) script or refer to the [gensim](https://radimrehurek.com/gensim/models/word2vec.html) documentation.
33 | 
34 | ## Stopwords
35 | To use our stopwords, you can either copy the whole [stopset_kin](https://github.com/Andrews2017/KINNEWS-and-KIRNEWS/blob/main/stopwords/Kinyarwanda/Kinyarwanda%20stopwords%20set.txt) set for Kinyarwanda or [stopset_kir](https://github.com/Andrews2017/KINNEWS-and-KIRNEWS/blob/main/stopwords/Kirundi/Kirundi%20stopwords%20set.txt) set for Kirundi into your code, or (recommended) import them directly from the [KKLTK](https://github.com/Andrews2017/kkltk) package, as in the sketch below.
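For example, here is a minimal sketch of stripping Kinyarwanda stopwords from a raw text with KKLTK, mirroring the filtering applied to the 'title' and 'content' columns in `code/cleaning/clean_dataset.py` (the example string is a placeholder):

```python
from kkltk.kin_kir_stopwords import stopwords  # same helper used in code/cleaning/clean_dataset.py

stopset_kin = stopwords.words('kinyarwanda')

# Whitespace-tokenize, drop stopwords, and lowercase the remaining tokens,
# exactly as clean_dataset.py does for each title/content cell.
text = "..."  # placeholder: a raw Kinyarwanda title or article
cleaned = ' '.join(tok.lower() for tok in text.split() if tok not in stopset_kin)
```

If you prefer not to install the package, the `stopset_kin`/`stopset_kir` set literals from the files above can be pasted into your code and used in exactly the same way.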
36 | 37 | ## Leaderboard (baselines) 38 | ### Monolingual 39 | #### KINNEWS 40 | | Model | Accuracy(%)| 41 | | ----- | ----------- | 42 | | BiGRU(W2V-Kin-50*) | 88.65 | 43 | | SVM(TF-IDF) | 88.53 | 44 | | BiGRU(W2V-Kin-100) | 88.29 | 45 | | CNN(W2V-Kin-50) | 87.55 | 46 | | CNN(W2V-Kin-100) | 87.54 | 47 | | LR(TF-IDF) | 87.14 | 48 | | MNB(TF-IDF) | 82.70 | 49 | | Char-CNN | 71.70 | 50 | #### KIRNEWS 51 | | Model | Accuracy(%)| 52 | | ----- | ----------- | 53 | | SVM(TF-IDF) | 90.14 | 54 | | CNN(W2V-Kin-100) | 88.01 | 55 | | BiGRU(W2V-Kin-100) | 86.61 | 56 | | LR(TF-IDF) | 86.13| 57 | | BiGRU(W2V-Kin-50) | 85.86 | 58 | | CNN(W2V-Kin-50) | 85.75 | 59 | | MNB(TF-IDF) | 82.67 | 60 | | Char-CNN | 69.23 | 61 | 62 | ### Cross-lingual 63 | | Model | Train set| Test set | Accuracy(%) | 64 | | ----- | ----------- | ------- | ---------| 65 | | MNB(TF-IDF) | KINNEWS | KIRNEWS | 73.46 | 66 | | SVM(TF-IDF) | KINNEWS | KIRNEWS | 72.70 | 67 | | LR(TF-IDF) | KINNEWS | KIRNEWS | 68.26 | 68 | | BiGRU(W2V-Kin-50) | KINNEWS | KIRNEWS | 67.54 | 69 | | BiGRU(W2V-Kin-100*) | KINNEWS | KIRNEWS | 65.06 | 70 | | CNN(W2V-Kin-100) | KINNEWS | KIRNEWS | 61.72 | 71 | | CNN(W2V-Kin-50) | KINNEWS | KIRNEWS | 60.64 | 72 | | Char-CNN | KINNEWS | KIRNEWS | 49.60 | 73 | 74 | | Model | Train set| Test set | Accuracy(%) | 75 | | ----- | ----------- | ------- | ---------| 76 | | CNN(W2V-Kin-100) | KIRNEWS | KIRNEWS | 88.01 | 77 | | BiGRU(W2V-Kin-100) | KIRNEWS | KIRNEWS | 86.61 | 78 | | CNN(W2V-Kin-50) | KIRNEWS | KIRNEWS | 85.75 | 79 | | BiGRU(W2V-Kin-50) | KIRNEWS | KIRNEWS | 83.38 | 80 | -------------------------------------------------------------------------------- /code/baselines/neural_models/BiGRU.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchtext import data 3 | import random 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torchtext.vocab as vocab 7 | from kkltk.kin_kir_stopwords import stopwords # check https://github.com/Andrews2017/kkltk for more detailed information about kkltk package 8 | 9 | stopset_kin = stopwords.words('kinyarwanda') 10 | 11 | ## Build the model 12 | # Generate the custom embeddings 13 | custom_embeddings = vocab.Vectors(name='../pre-trained_embeddings/kinyarwanda/W2V-Kin-50.txt', 14 | cache='../pre-trained_embeddings/kinyarwanda', 15 | unk_init=torch.Tensor.normal_) 16 | 17 | ### Prepare the data 18 | SEED = 1234 19 | torch.manual_seed(SEED) 20 | torch.cuda.manual_seed(SEED) 21 | torch.backends.cudnn.deterministic = True 22 | 23 | ## Load the dataset 24 | # Define fields to hold the data 25 | LABEL = data.LabelField(dtype=torch.float) 26 | TITLE = data.Field(tokenize='spacy', stop_words=stopset_kin) 27 | TEXT = data.Field(tokenize='spacy', stop_words=stopset_kin) 28 | 29 | fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)] 30 | 31 | # Upload CSV format dataset and do train/test splits 32 | train_data, test_data = data.TabularDataset.splits( 33 | path='../data/KINNEWS/cleaned', 34 | train='train.csv', 35 | test='test.csv', 36 | format='csv', 37 | fields=fields, 38 | skip_header=True # dataset has a header(title) 39 | ) 40 | 41 | # Train/validation set split 42 | train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED)) 43 | 44 | # Build the vocabulary 45 | TEXT.build_vocab(train_data.title, train_data.content, max_size=10000, vectors=custom_embeddings) 46 | TITLE.vocab = TEXT.vocab 47 | LABEL.build_vocab(train_data) 48 | 49 | # Create the iterator and place the 
tensor it returned on GPU(if it is available) 50 | BATCH_SIZE = 32 51 | 52 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 53 | 54 | train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits( 55 | (train_data, valid_data, test_data), 56 | batch_size=BATCH_SIZE, 57 | sort_key=lambda x: len(x.content), 58 | device=device) 59 | 60 | class RNN(nn.Module): 61 | def __init__(self, vocab_size, embedding_dim, hidden_dim, 62 | output_dim, n_layers, bidirectional, dropout): 63 | super().__init__() 64 | 65 | self.embedding = nn.Embedding(vocab_size, embedding_dim) 66 | self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, 67 | bidirectional=bidirectional, dropout=dropout) 68 | self.fc = nn.Linear(hidden_dim * 2, output_dim) 69 | self.dropout = nn.Dropout(dropout) 70 | 71 | def forward(self, text): 72 | embedded = self.dropout(self.embedding(text)) 73 | output, hidden = self.rnn(embedded) 74 | hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)) 75 | return self.fc(hidden.squeeze(0)) 76 | 77 | # Create the instance of the model 78 | INPUT_DIM = len(TEXT.vocab) 79 | EMBEDDING_DIM = 50 80 | HIDDEN_DIM = 256 81 | OUTPUT_DIM = len(LABEL.vocab) 82 | N_LAYERS = 2 83 | BIDIRECTIONAL = True 84 | DROPOUT = 0.5 85 | 86 | model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT) 87 | 88 | pretrained_embeddings = TEXT.vocab.vectors 89 | 90 | model.embedding.weight.data.copy_(pretrained_embeddings) 91 | 92 | optimizer = optim.Adam(model.parameters()) 93 | 94 | criterion = nn.CrossEntropyLoss() 95 | 96 | model = model.to(device) 97 | criterion = criterion.to(device) 98 | 99 | def multiclass_accuracy(preds, y): 100 | """ 101 | Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8 102 | """ 103 | # Round predictions to the closest integer 104 | rounded_preds = torch.max(preds, -1)[1] 105 | correct = (rounded_preds == y).float() # convert into float for division 106 | acc = correct.sum() / len(correct) 107 | return acc 108 | 109 | def train(model, iterator, optimizer, criterion): 110 | """ Training the model""" 111 | epoch_loss = 0 112 | epoch_acc = 0 113 | model.train() 114 | 115 | for batch in iterator: 116 | optimizer.zero_grad() 117 | predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1) 118 | loss = criterion(predictions, batch.label.type(torch.cuda.LongTensor)) 119 | acc = multiclass_accuracy(predictions, batch.label.type(torch.cuda.LongTensor)) 120 | loss.backward() 121 | optimizer.step() 122 | epoch_loss += loss.item() 123 | epoch_acc += acc.item() 124 | 125 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 126 | 127 | def evaluate(model, iterator, criterion): 128 | """ Evaluating the model""" 129 | epoch_loss = 0 130 | epoch_acc = 0 131 | model.eval() 132 | 133 | with torch.no_grad(): 134 | for batch in iterator: 135 | predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1) 136 | loss = criterion(predictions, batch.label.type(torch.cuda.LongTensor)) 137 | acc = multiclass_accuracy(predictions, batch.label.type(torch.cuda.LongTensor)) 138 | epoch_loss += loss.item() 139 | epoch_acc += acc.item() 140 | 141 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 142 | 143 | def count_parameters(model): 144 | """ Counting the model parameters""" 145 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 146 | 147 | if __name__ == "__main__": 148 | N_EPOCHS = 10 149 | 150 | for epoch in 
range(N_EPOCHS): 151 | train_loss, train_acc = train(model, train_iterator, optimizer, criterion) 152 | valid_loss, valid_acc = evaluate(model, valid_iterator, criterion) 153 | print(f'\n| Epoch: {epoch + 1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}% |' 154 | f' Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}% |') 155 | 156 | test_loss, test_acc = evaluate(model, test_iterator, criterion) 157 | print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}% |') 158 | 159 | print(f'The model has {count_parameters(model):,} trainable parameters') 160 | -------------------------------------------------------------------------------- /code/baselines/neural_models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchtext import data 3 | import random 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torchtext.vocab as vocab 7 | import torch.nn.functional as F 8 | from kkltk.kin_kir_stopwords import stopwords # check https://github.com/Andrews2017/kkltk for more detailed information about kkltk package 9 | 10 | stopset_kin = stopwords.words('kinyarwanda') 11 | 12 | # Generate the custom embeddings 13 | custom_embeddings = vocab.Vectors(name='../pre-trained_embeddings/kinyarwanda/W2V-Kin-50.txt', 14 | cache='../pre-trained_embeddings/kinyarwanda', 15 | unk_init=torch.Tensor.normal_) 16 | 17 | ### Prepare the data 18 | SEED = 1234 19 | torch.manual_seed(SEED) 20 | torch.cuda.manual_seed(SEED) 21 | torch.backends.cudnn.deterministic = True 22 | 23 | ## Load the dataset 24 | # Define fields to hold the data 25 | LABEL = data.LabelField(dtype=torch.float) 26 | TITLE = data.Field(tokenize='spacy') 27 | TEXT = data.Field(tokenize='spacy') 28 | 29 | fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)] 30 | 31 | # Upload CSV format dataset and do train/test splits 32 | train_data, test_data = data.TabularDataset.splits( 33 | path='../data/KINNEWS/cleaned', 34 | train='train.csv', 35 | test='test.csv', 36 | format='csv', 37 | fields=fields, 38 | skip_header=True # dataset has a header(title) 39 | ) 40 | 41 | # train/validation set split 42 | train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED), ) 43 | 44 | # build the vocabulary 45 | TEXT.build_vocab(train_data.title, train_data.content, max_size=15000, vectors=custom_embeddings) 46 | TITLE.vocab = TEXT.vocab 47 | LABEL.build_vocab(train_data) 48 | 49 | # create the iterator and place the tensor it returned on GPU(if it is available) 50 | BATCH_SIZE = 32 51 | 52 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 53 | # print(TEXT.vocab.vectors) 54 | 55 | train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits( 56 | (train_data, valid_data, test_data), 57 | batch_size=BATCH_SIZE, 58 | sort_key=lambda x: len(x.content), 59 | device=device) 60 | 61 | 62 | # Build the model 63 | class CNN(nn.Module): 64 | def __init__(self, vocab_size, embedding_dim, n_filters, 65 | filter_sizes, output_dim, dropout, pad_idx): 66 | super().__init__() 67 | 68 | self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx) 69 | self.convs = nn.ModuleList([ 70 | nn.Conv2d(in_channels=1, 71 | out_channels=n_filters, 72 | kernel_size=(fs, embedding_dim)) 73 | for fs in filter_sizes 74 | ]) 75 | self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim) 76 | self.dropout = nn.Dropout(dropout) 77 | 78 | def forward(self, text): 79 | text = 
text.permute(1, 0) 80 | embedded = self.embedding(text) 81 | embedded = embedded.unsqueeze(1) 82 | conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] 83 | pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] 84 | cat = self.dropout(torch.cat(pooled, dim=1)) 85 | return self.fc(cat) 86 | 87 | # create the instance of the model 88 | INPUT_DIM = len(TEXT.vocab) 89 | EMBEDDING_DIM = 50 90 | N_FILTERS = 150 91 | FILTER_SIZES = [3, 4, 5] 92 | OUTPUT_DIM = len(LABEL.vocab) 93 | DROPOUT = 0.5 94 | PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] 95 | 96 | model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX) 97 | 98 | pretrained_embeddings = TEXT.vocab.vectors 99 | 100 | model.embedding.weight.data.copy_(pretrained_embeddings) 101 | 102 | UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] 103 | 104 | model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM) 105 | model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM) 106 | 107 | optimizer = optim.Adam(model.parameters()) 108 | 109 | criterion = nn.CrossEntropyLoss() 110 | 111 | model = model.to(device) 112 | criterion = criterion.to(device) 113 | 114 | def multiclass_accuracy(preds, y): 115 | """ 116 | Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8 117 | """ 118 | # round predictions to the closest integer 119 | rounded_preds = torch.max(preds, -1)[1] 120 | correct = (rounded_preds == y).float() # convert into float for division 121 | acc = correct.sum() / len(correct) 122 | return acc 123 | 124 | 125 | # train the model 126 | def train(model, iterator, optimizer, criterion): 127 | """ Training the model""" 128 | epoch_loss = 0 129 | epoch_acc = 0 130 | model.train() 131 | 132 | for batch in iterator: 133 | optimizer.zero_grad() 134 | 135 | predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1) 136 | loss = criterion(predictions, batch.label.type(torch.cuda.LongTensor)) 137 | acc = multiclass_accuracy(predictions, batch.label.type(torch.cuda.LongTensor)) 138 | loss.backward() 139 | optimizer.step() 140 | epoch_loss += loss.item() 141 | epoch_acc += acc.item() 142 | 143 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 144 | 145 | # evaluate the model 146 | def evaluate(model, iterator, criterion): 147 | """ Evaluating the model""" 148 | epoch_loss = 0 149 | epoch_acc = 0 150 | model.eval() 151 | 152 | with torch.no_grad(): 153 | for batch in iterator: 154 | predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1) 155 | loss = criterion(predictions, batch.label.type(torch.cuda.LongTensor)) 156 | acc = multiclass_accuracy(predictions, batch.label.type(torch.cuda.LongTensor)) 157 | epoch_loss += loss.item() 158 | epoch_acc += acc.item() 159 | 160 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 161 | 162 | def count_parameters(model): 163 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 164 | 165 | if __name__ == "__main__": 166 | N_EPOCHS = 8 167 | 168 | for epoch in range(N_EPOCHS): 169 | train_loss, train_acc = train(model, train_iterator, optimizer, criterion) 170 | valid_loss, valid_acc = evaluate(model, valid_iterator, criterion) 171 | print(f'\n| Epoch: {epoch + 1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}% |' 172 | f' Val. Loss: {valid_loss:.3f} | Val. 
Acc: {valid_acc * 100:.2f}% |') 173 | 174 | test_loss, test_acc = evaluate(model, test_iterator, criterion) 175 | print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}% |') 176 | 177 | print(f'The model has {count_parameters(model):,} trainable parameters') 178 | -------------------------------------------------------------------------------- /code/baselines/neural_models/char-cnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import torch 5 | import shutil 6 | import torch.nn as nn 7 | import numpy as np 8 | from sklearn import metrics 9 | from torch.utils.data import Dataset 10 | from torch.utils.data import DataLoader 11 | from tensorboardX import SummaryWriter 12 | csv.field_size_limit(sys.maxsize) 13 | 14 | class MyDataset(Dataset): 15 | def __init__(self, data_path, max_length=1500): 16 | self.data_path = data_path 17 | self.vocabulary = list("""abcdefghijklmnoprstuvwyz0123456789-,;.!?:’’’/\|_@#$%ˆ&*˜‘+-=<>()[]{}""") # removed 'q' and 'x' as they do not exist in Kinyarwanda and Kirundi alphabets 18 | self.identity_mat = np.identity(len(self.vocabulary)) 19 | texts, labels = [], [] 20 | with open(data_path) as csv_file: 21 | reader = csv.reader(csv_file, quotechar='"') 22 | for idx, line in enumerate(reader): 23 | if idx != 0: 24 | text = "" 25 | for tx in line[1:]: 26 | text += tx 27 | text += " " 28 | label = line[0] 29 | texts.append(text) 30 | labels.append(label) 31 | self.texts = texts 32 | self.labels = labels 33 | self.max_length = max_length 34 | self.length = len(self.labels) 35 | self.num_classes = len(set(self.labels)) 36 | 37 | def __len__(self): 38 | return self.length 39 | 40 | def __getitem__(self, index): 41 | raw_text = self.texts[index] 42 | data = np.array([self.identity_mat[self.vocabulary.index(i)] for i in list(raw_text) if i in self.vocabulary], 43 | dtype=np.float32) 44 | if len(data) > self.max_length: 45 | data = data[:self.max_length] 46 | elif 0 < len(data) < self.max_length: 47 | data = np.concatenate( 48 | (data, np.zeros((self.max_length - len(data), len(self.vocabulary)), dtype=np.float32))) 49 | elif len(data) == 0: 50 | data = np.zeros((self.max_length, len(self.vocabulary)), dtype=np.float32) 51 | label = self.labels[index] 52 | return data, label 53 | 54 | def get_evaluation(y_true, y_prob, list_metrics): 55 | y_pred = np.argmax(y_prob, -1) 56 | output = {} 57 | if 'accuracy' in list_metrics: 58 | output['accuracy'] = metrics.accuracy_score(y_true, y_pred) 59 | if 'loss' in list_metrics: 60 | try: 61 | output['loss'] = metrics.log_loss(y_true, y_prob) 62 | except ValueError: 63 | output['loss'] = -1 64 | if 'confusion_matrix' in list_metrics: 65 | output['confusion_matrix'] = str(metrics.confusion_matrix(y_true, y_pred)) 66 | return output 67 | 68 | class CharacterLevelCNN(nn.Module): 69 | def __init__(self, n_classes=14, input_length=1500, input_dim=68, 70 | n_conv_filters=256, 71 | n_fc_neurons=1024): 72 | super(CharacterLevelCNN, self).__init__() 73 | self.conv1 = nn.Sequential(nn.Conv1d(input_dim, n_conv_filters, kernel_size=7, padding=0), nn.ReLU(), 74 | nn.MaxPool1d(3)) 75 | self.conv2 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=7, padding=0), nn.ReLU(), 76 | nn.MaxPool1d(3)) 77 | self.conv3 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU()) 78 | self.conv4 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU()) 79 | self.conv5 = 
nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU()) 80 | self.conv6 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU(), 81 | nn.MaxPool1d(3)) 82 | # compute the output shape after forwarding an input to the conv layers 83 | input_shape = (128, 84 | input_length, 85 | input_dim) 86 | self.output_dimension = self._get_conv_output(input_shape) 87 | 88 | self.fc1 = nn.Sequential(nn.Linear(self.output_dimension, n_fc_neurons), nn.Dropout(0.5)) 89 | self.fc2 = nn.Sequential(nn.Linear(n_fc_neurons, n_fc_neurons), nn.Dropout(0.5)) 90 | self.fc3 = nn.Linear(n_fc_neurons, n_classes) 91 | 92 | if n_conv_filters == 256 and n_fc_neurons == 1024: 93 | self._create_weights(mean=0.0, std=0.05) 94 | elif n_conv_filters == 1024 and n_fc_neurons == 2048: 95 | self._create_weights(mean=0.0, std=0.02) 96 | 97 | def _create_weights(self, mean=0.0, std=0.05): 98 | for module in self.modules(): 99 | if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear): 100 | module.weight.data.normal_(mean, std) 101 | 102 | def _get_conv_output(self, shape): 103 | x = torch.rand(shape) 104 | x = x.transpose(1, 2) 105 | x = self.conv1(x) 106 | x = self.conv2(x) 107 | x = self.conv3(x) 108 | x = self.conv4(x) 109 | x = self.conv5(x) 110 | x = self.conv6(x) 111 | x = x.view(x.size(0), -1) 112 | output_dimension = x.size(1) 113 | return output_dimension 114 | 115 | def forward(self, input): 116 | input = input.transpose(1, 2) 117 | output = self.conv1(input) 118 | output = self.conv2(output) 119 | output = self.conv3(output) 120 | output = self.conv4(output) 121 | output = self.conv5(output) 122 | output = self.conv6(output) 123 | 124 | output = output.view(output.size(0), -1) 125 | output = self.fc1(output) 126 | output = self.fc2(output) 127 | output = self.fc3(output) 128 | 129 | return output 130 | 131 | def train(feature,log_path, optimizer): 132 | if torch.cuda.is_available(): 133 | torch.cuda.manual_seed(123) 134 | else: 135 | torch.manual_seed(123) 136 | 137 | if not os.path.exists(output): 138 | os.makedirs(output) 139 | output_file = open(output + os.sep + "logs.txt", "w") 140 | output_file.write("Model's parameters: {}".format()) 141 | 142 | training_params = {"batch_size": batch_size, 143 | "shuffle": True, 144 | "num_workers": 0} 145 | test_params = {"batch_size": batch_size, 146 | "shuffle": True, 147 | "num_workers": 0} 148 | training_set = MyDataset(input + "/train.csv", max_length) 149 | test_set = MyDataset(input + "/test.csv", max_length) 150 | training_generator = DataLoader(training_set, **training_params) 151 | test_generator = DataLoader(test_set, **test_params) 152 | 153 | if feature == "small": 154 | model = CharacterLevelCNN(input_length=max_length, n_classes=training_set.num_classes, 155 | input_dim=len(alphabet), 156 | n_conv_filters=256, n_fc_neurons=1024) 157 | 158 | elif feature == "large": 159 | model = CharacterLevelCNN(input_length=max_length, n_classes=training_set.num_classes, 160 | input_dim=len(alphabet), 161 | n_conv_filters=1024, n_fc_neurons=2048) 162 | else: 163 | sys.exit("Invalid feature mode!") 164 | 165 | log_path = "{}_{}_{}".format(log_path, feature, dataset) 166 | if os.path.isdir(log_path): 167 | shutil.rmtree(log_path) 168 | os.makedirs(log_path) 169 | writer = SummaryWriter(log_path) 170 | 171 | if torch.cuda.is_available(): 172 | model.cuda() 173 | 174 | criterion = nn.CrossEntropyLoss() 175 | if optimizer == "adam": 176 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 177 | 
elif optimizer == "sgd": 178 | optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9) 179 | best_loss = 1e5 180 | best_epoch = 0 181 | model.train() 182 | num_iter_per_epoch = len(training_generator) 183 | 184 | for epoch in range(num_epochs): 185 | for iter, batch in enumerate(training_generator): 186 | features, label = batch 187 | label = np.array(label, int) 188 | label = torch.Tensor(label) 189 | if torch.cuda.is_available(): 190 | features = features.cuda() 191 | label = label.cuda() 192 | optimizer.zero_grad() 193 | predictions = model(features) 194 | loss = criterion(predictions, label.type(torch.cuda.LongTensor)) 195 | loss.backward() 196 | optimizer.step() 197 | 198 | training_metrics = get_evaluation(label.cpu().numpy(), predictions.cpu().detach().numpy(), 199 | list_metrics=["accuracy"]) 200 | 201 | writer.add_scalar('Train/Loss', loss, epoch * num_iter_per_epoch + iter) 202 | writer.add_scalar('Train/Accuracy', training_metrics["accuracy"], epoch * num_iter_per_epoch + iter) 203 | print("Epoch: {}/{}, Lr: {}, Train Loss: {}, Train Accuracy: {}".format( 204 | epoch + 1, 205 | num_epochs, 206 | optimizer.param_groups[0]['lr'], 207 | loss, training_metrics["accuracy"])) 208 | 209 | model.eval() 210 | loss_ls = [] 211 | te_label_ls = [] 212 | te_pred_ls = [] 213 | for batch in test_generator: 214 | te_feature, te_label = batch 215 | num_sample = len(te_label) 216 | te_label = np.array(te_label, int) 217 | te_label = torch.Tensor(te_label) 218 | if torch.cuda.is_available(): 219 | te_feature = te_feature.cuda() 220 | te_label = te_label.cuda() 221 | with torch.no_grad(): 222 | te_predictions = model(te_feature) 223 | te_loss = criterion(te_predictions, te_label.type(torch.cuda.LongTensor)) 224 | loss_ls.append(te_loss * num_sample) 225 | te_label_ls.extend(te_label.clone().cpu()) 226 | te_pred_ls.append(te_predictions.clone().cpu()) 227 | 228 | te_loss = sum(loss_ls) / test_set.__len__() 229 | te_pred = torch.cat(te_pred_ls, 0) 230 | te_label = np.array(te_label_ls) 231 | test_metrics = get_evaluation(te_label, te_pred.numpy(), list_metrics=["accuracy", "confusion_matrix"]) 232 | output_file.write( 233 | "Epoch: {}/{} \nTest loss: {} Test accuracy: {} \nTest confusion matrix: \n{}\n\n".format( 234 | epoch + 1, num_epochs, 235 | te_loss, 236 | test_metrics["accuracy"], 237 | test_metrics["confusion_matrix"])) 238 | print("Epoch: {}/{}, Lr: {}, Test Loss: {}, Test Accuracy: {}".format( 239 | epoch + 1, 240 | num_epochs, 241 | optimizer.param_groups[0]['lr'], 242 | te_loss, test_metrics["accuracy"])) 243 | writer.add_scalar('Test/Loss', te_loss, epoch) 244 | writer.add_scalar('Test/Accuracy', test_metrics["accuracy"], epoch) 245 | model.train() 246 | if te_loss + es_min_delta < best_loss: 247 | best_loss = te_loss 248 | best_epoch = epoch 249 | torch.save(model, "{}/char-cnn_{}_{}".format(output, dataset, feature)) 250 | # Early stopping 251 | if epoch - best_epoch > es_patience > 0: 252 | print("Stop training at epoch {}. 
The lowest loss achieved is {} at epoch {}".format(epoch, best_loss, best_epoch))
253 |                 break
254 |             # Halve the learning rate every 3 epochs when SGD is used; `optimizer` now holds the torch optimizer object, so check its type rather than the original "sgd" string
255 |             if isinstance(optimizer, torch.optim.SGD) and epoch % 3 == 0 and epoch > 0:
256 |                 current_lr = optimizer.state_dict()['param_groups'][0]['lr']
257 |                 current_lr /= 2
258 |                 for param_group in optimizer.param_groups:
259 |                     param_group['lr'] = current_lr
260 | 
261 | if __name__ == "__main__":
262 |     torch.backends.cudnn.deterministic = True
263 | 
264 |     alphabet = "abcdefghijklmnoprstuvwyz0123456789-,;.!?:’’’/\|_@#$%ˆ&*˜‘+-=<>()[]{}"
265 |     max_length = 1500
266 |     optimizer = "sgd"
267 |     batch_size = 128
268 |     num_epochs = 20
269 |     lr = 0.001
270 |     es_min_delta = 0.0
271 |     es_patience = 3
272 |     input = "../data/KINNEWS/cleaned"
273 |     output = "../output"
274 |     log_path = "../tensorboard/char-cnn"
275 |     dataset = "KINNEWS"  # used only to name the TensorBoard log dir and the saved checkpoint; set to "KIRNEWS" when training on KIRNEWS
276 |     train("small", log_path, optimizer)
277 | 
--------------------------------------------------------------------------------
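For reference, the checkpoint written by `train()` above stores the whole model object via `torch.save(model, ...)`. Below is a minimal, hedged inference sketch under that assumption: the path reflects the defaults in the `__main__` block (`output="../output"`, `dataset="KINNEWS"`, `feature="small"`), `CharacterLevelCNN` must be importable when the checkpoint is unpickled, and recent PyTorch versions may additionally need `weights_only=False` in `torch.load`.

```python
import numpy as np
import torch

# Same character vocabulary and maximum length as char-cnn.py (q and x excluded).
VOCABULARY = list("""abcdefghijklmnoprstuvwyz0123456789-,;.!?:’’’/\|_@#$%ˆ&*˜‘+-=<>()[]{}""")
IDENTITY = np.identity(len(VOCABULARY))
MAX_LENGTH = 1500

def encode(text):
    # One-hot encode characters exactly as MyDataset.__getitem__ does:
    # skip characters outside the vocabulary, then pad/truncate to MAX_LENGTH.
    data = np.array([IDENTITY[VOCABULARY.index(ch)] for ch in text if ch in VOCABULARY],
                    dtype=np.float32)
    if len(data) > MAX_LENGTH:
        data = data[:MAX_LENGTH]
    else:
        pad = np.zeros((MAX_LENGTH - len(data), len(VOCABULARY)), dtype=np.float32)
        data = np.concatenate((data, pad)) if len(data) else pad
    return torch.from_numpy(data).unsqueeze(0)  # add a batch dimension

# train() saved the full model object, so torch.load restores it directly.
model = torch.load("../output/char-cnn_KINNEWS_small", map_location="cpu")
model.eval()

with torch.no_grad():
    logits = model(encode("placeholder article text"))
    print("predicted class index:", logits.argmax(-1).item())
```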