├── code ├── baselines │ ├── classic_models │ │ └── Readme.txt │ └── neural_models │ │ ├── BiGRU.py │ │ ├── CNN.py │ │ └── char-cnn.py ├── cleaning │ └── clean_dataset.py └── embeddings │ └── word2vec_training.py ├── stopwords ├── Kirundi │ ├── Kirundi stopwords set.txt │ └── listed.txt └── Kinyarwanda │ ├── Kinyarwanda stopwords set.txt │ └── listed.txt ├── LICENSE └── README.md /code/baselines/classic_models/Readme.txt: -------------------------------------------------------------------------------- 1 | All of the classic baseline models (MNB, LR, and SVM) are implemented with the help of the scikit-learn framework and use its default hyperparameters and Term Frequency Inverse Document Frequency (TFIDF) to get the values of unigram input features. For other hyperparameter settings, please, refer to the original paper. -------------------------------------------------------------------------------- /stopwords/Kirundi/Kirundi stopwords set.txt: -------------------------------------------------------------------------------- 1 | stopset_kir = {'aba','abo','aho','ari','ata','ati','ayo','ba','bari','bo','bose','bw','bwa','bwo','ca','cane','co','de', 2 | 'ico','iryo','ivyo','iyo','izo','ko','ku','kuri','kuva','kw','maze','mu','muri','mw','na','naho','nayo', 3 | 'ngo','ni','nk','no','rero','rw','ry','rya','ubu','uko','uri','uwo','uyu','vy','vya','vyo','wa','wo', 4 | 'ya','yari','yo','yose','za','zo'} -------------------------------------------------------------------------------- /stopwords/Kirundi/listed.txt: -------------------------------------------------------------------------------- 1 | aba 2 | abo 3 | aho 4 | ari 5 | ata 6 | ati 7 | ayo 8 | ba 9 | bari 10 | bo 11 | bose 12 | bw 13 | bwa 14 | bwo 15 | ca 16 | cane 17 | co 18 | de 19 | ico 20 | iryo 21 | ivyo 22 | iyo 23 | izo 24 | ko 25 | ku 26 | kuri 27 | kuva 28 | kw 29 | maze 30 | mu 31 | muri 32 | mw 33 | na 34 | naho 35 | nayo 36 | ngo 37 | ni 38 | nk 39 | no 40 | rero 41 | rw 42 | ry 43 | rya 44 | ubu 45 | uko 46 | uri 47 | uwo 48 | uyu 49 | vy 50 | vya 51 | vyo 52 | wa 53 | wo 54 | ya 55 | yari 56 | yo 57 | yose 58 | za 59 | zo 60 | -------------------------------------------------------------------------------- /stopwords/Kinyarwanda/Kinyarwanda stopwords set.txt: -------------------------------------------------------------------------------- 1 | stopset_kin = {'aba', 'abo', 'aha', 'aho', 'ari', 'ati', 'aya', 'ayo', 'ba', 'baba', 'babo', 'bari', 'be', 'bo', 'bose', 2 | 'bw', 'bwa', 'bwo', 'by', 'bya', 'byo', 'cy', 'cya', 'cyo', 'hafi', 'ibi', 'ibyo', 'icyo', 'iki', 3 | 'imwe', 'iri', 'iyi', 'iyo', 'izi', 'izo', 'ka', 'ko', 'ku', 'kuri', 'kuva', 'kwa', 'maze', 'mu', 'muri', 4 | 'na', 'naho','nawe', 'ngo', 'ni', 'niba', 'nk', 'nka', 'no', 'nta', 'nuko', 'rero', 'rw', 'rwa', 'rwo', 'ry', 5 | 'rya','ubu', 'ubwo', 'uko', 'undi', 'uri', 'uwo', 'uyu', 'wa', 'wari', 'we', 'wo', 'ya', 'yabo', 'yari', 'ye', 6 | 'yo', 'yose', 'za', 'zo'} -------------------------------------------------------------------------------- /stopwords/Kinyarwanda/listed.txt: -------------------------------------------------------------------------------- 1 | aba 2 | abo 3 | aha 4 | aho 5 | ari 6 | ati 7 | aya 8 | ayo 9 | ba 10 | baba 11 | babo 12 | bari 13 | be 14 | bo 15 | bose 16 | bw 17 | bwa 18 | bwo 19 | by 20 | bya 21 | byo 22 | cy 23 | cya 24 | cyo 25 | hafi 26 | ibi 27 | ibyo 28 | icyo 29 | iki 30 | imwe 31 | iri 32 | iyi 33 | iyo 34 | izi 35 | izo 36 | ka 37 | ko 38 | ku 39 | kuri 40 | kuva 41 | kwa 42 | maze 43 | mu 44 | muri 45 | na 46 | naho 47 | nawe 
48 | ngo 49 | ni 50 | niba 51 | nk 52 | nka 53 | no 54 | nta 55 | nuko 56 | rero 57 | rw 58 | rwa 59 | rwo 60 | ry 61 | rya 62 | ubu 63 | ubwo 64 | uko 65 | undi 66 | uri 67 | uwo 68 | uyu 69 | wa 70 | wari 71 | we 72 | wo 73 | ya 74 | yabo 75 | yari 76 | ye 77 | yo 78 | yose 79 | za 80 | zo 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Andrew 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code/cleaning/clean_dataset.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from kkltk.kin_kir_stopwords import stopwords # check https://github.com/Andrews2017/kkltk for more detailed information about how to use kkltk package 3 | 4 | stopset_kin = stopwords.words('kinyarwanda') 5 | 6 | # loading the data 7 | data = pd.read_csv('../data/KINNEWS/raw/train.csv') 8 | 9 | # Cleaning the data (preprocessing) 10 | # Removing the special characters and urls 11 | data.title = data.title.str.replace('[^A-Za-z\s\’\-]+', '') 12 | data.content = data.content.str.replace('[^A-Za-z\s\’\-]+', '') 13 | data.title = data.title.str.replace('[\n]+', '') 14 | data.content = data.content.str.replace('[\n]+', '') 15 | data.title = data.title.str.replace('^https?:\/\/.*[\r\n]*', '') 16 | data.content = data.content.str.replace('^https?:\/\/.*[\r\n]*', '') 17 | 18 | # Removing the stopwords 19 | data['title'] = data['title'].apply(lambda x: ' '.join([item.lower() for item in str(x).split() if item not in stopset_kin])) 20 | data['content'] = data['content'].apply(lambda x: ' '.join([item.lower() for item in str(x).split() if item not in stopset_kin])) 21 | 22 | # Save the cleaned dataset 23 | data.to_csv("../data/KINNEWS/cleaned/train.csv", index=False) 24 | -------------------------------------------------------------------------------- /code/embeddings/word2vec_training.py: -------------------------------------------------------------------------------- 1 | from gensim.models import Word2Vec 2 | import pandas as pd 3 | 4 | # load the data 5 | data_train = pd.read_csv('../data/KINNEWS/cleaned/train.csv') 6 | data_test = pd.read_csv('../data/KINNEWS/cleaned/test.csv') 7 | data = pd.concat([data_train, data_test]) 8 | data['whole_doc'] = data['title'] + ' ' + 
data['content'].astype(str)
9 | 
10 | # clean the data (preprocessing)
11 | data.whole_doc = data.whole_doc.str.replace('[^A-Za-z\s\’\-]+', '')
12 | data.whole_doc = data.whole_doc.str.replace('[\n]+', '')
13 | data.whole_doc = data.whole_doc.str.replace('^https?:\/\/.*[\r\n]*', '')
14 | 
15 | # Build the list-of-lists corpus format that gensim expects
16 | sent = [row.split(' ') for row in data['whole_doc'] if len(row)]
17 | sent = [[tok.lower() for tok in sub_sent if len(tok) != 0] for sub_sent in sent]
18 | 
19 | # Train a skip-gram Word2Vec model (sg=1) with 50-dimensional vectors
20 | w2v_model = Word2Vec(sent, window=5, min_count=5, sg=1, hs=1, vector_size=50)
21 | 
22 | # Build one "token v1 v2 ... v50" line per vocabulary word so the embeddings can be reloaded later (e.g. with torchtext's vocab.Vectors)
23 | w2v_vectors = []
24 | for token in w2v_model.wv.key_to_index:
25 |     str_vec = token
26 |     for value in w2v_model.wv[token]:
27 |         str_vec += ' ' + str(value)
28 |     w2v_vectors.append(str_vec)
29 | 
30 | # Save the embeddings to a plain-text file (one token and its vector per line)
31 | with open("../pre-trained_embeddings/Kinyarwanda/W2V-Kin-50.txt", 'w') as output:
32 |     for row in w2v_vectors:
33 |         output.write(str(row) + '\n')
34 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # KINNEWS-and-KIRNEWS
2 | Data, embeddings, stopword lists, code, and baselines for the [COLING 2020](https://coling2020.org/) paper ["KINNEWS and KIRNEWS: Benchmarking Cross-Lingual Text Classification for Kinyarwanda and Kirundi"](https://www.aclweb.org/anthology/2020.coling-main.480/) by [Rubungo Andre Niyongabo](https://scholar.google.com/citations?user=5qnTWQEAAAAJ&hl=en), [Hong Qu](https://scholar.google.com/citations?user=Aiq9mFMAAAAJ&hl=en), [Julia Kreutzer](https://scholar.google.co.uk/citations?user=j4cOSzAAAAAJ&hl=en), and Li Huang.
3 | 
4 | This paper introduces Kinyarwanda and Kirundi news classification datasets (KINNEWS and KIRNEWS, respectively), both collected from Rwandan and Burundian news websites and newspapers, for low-resource monolingual and cross-lingual multiclass classification tasks. Along with the datasets, we provide statistics, preprocessing guidelines, pretrained word embeddings, and monolingual and cross-lingual baseline models.
5 | 
6 | **Note:** When using any of the resources provided here, please remember to [cite](https://www.aclweb.org/anthology/2020.coling-main.480.bib) our paper.
7 | 
8 | ## Data
9 | ### Download the datasets
10 | - The raw and cleaned versions of KINNEWS can be downloaded from [here](https://drive.google.com/drive/folders/1zxn0hgrOLlUsK5V0c7l71eAj1t2jiyox?usp=sharing) (21,268 articles, 14 classes, 45.07MB (raw) and 38.85MB (cleaned))
11 | - The raw and cleaned versions of KIRNEWS can be downloaded from [here](https://drive.google.com/uc?export=download&id=1-53VQFOHqBeoX2JiN01X1Sxgfh78ckru) (4,612 articles, 12 classes, 9.31MB (raw) and 7.77MB (cleaned))
12 | 
13 | ### Datasets description
14 | Each dataset is in comma-separated values (CSV) format, with the columns described below (note that the cleaned versions retain only the 'label', 'title', and 'content' columns):
15 | | Field | Description |
16 | | ----- | ----------- |
17 | | label | Numerical labels that range from 1 to 14 |
18 | | en_label | English labels |
19 | | kin_label | Kinyarwanda labels |
20 | | kir_label | Kirundi labels |
21 | | url | The link to the news source |
22 | | title | The title of the news article |
23 | | content | The full content of the news article |
24 | 
25 | ## Word embeddings
26 | ### Download pre-trained word embeddings
27 | - The Kinyarwanda embeddings can be downloaded from [here](https://drive.google.com/uc?export=download&id=1-DJuVhD-8YxxOP9CBxzAJeoMTA5An2Xw) (59.88MB for 100d
28 | and 29.94MB for 50d)
29 | - The Kirundi embeddings can be downloaded from [here](https://drive.google.com/uc?export=download&id=1-BlECMnNPBIVspfBF-qccKQ1LeiEQ7JO) (17.98MB for 100d and 8.96MB for 50d)
30 | 
31 | ### Training your own embeddings
32 | To train your own word vectors, check out the [code/embeddings/word2vec_training.py](https://github.com/Andrews2017/KINNEWS-and-KIRNEWS/tree/main/code/embeddings) script or refer to the [gensim](https://radimrehurek.com/gensim/models/word2vec.html) documentation.
33 | 
34 | ## Stopwords
35 | To use our stopwords, you can either copy the whole [stopset_kin](https://github.com/Andrews2017/KINNEWS-and-KIRNEWS/blob/main/stopwords/Kinyarwanda/Kinyarwanda%20stopwords%20set.txt) set for Kinyarwanda or [stopset_kir](https://github.com/Andrews2017/KINNEWS-and-KIRNEWS/blob/main/stopwords/Kirundi/Kirundi%20stopwords%20set.txt) set for Kirundi into your code, or (recommended) import them directly from the [KKLTK](https://github.com/Andrews2017/kkltk) package, as in the sketch below.
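For example, here is a minimal sketch of stripping Kinyarwanda stopwords from a raw text with KKLTK, mirroring the filtering applied to the 'title' and 'content' columns in `code/cleaning/clean_dataset.py` (the example string is a placeholder):

```python
from kkltk.kin_kir_stopwords import stopwords  # same helper used in code/cleaning/clean_dataset.py

stopset_kin = stopwords.words('kinyarwanda')

# Whitespace-tokenize, drop stopwords, and lowercase the remaining tokens,
# exactly as clean_dataset.py does for each title/content cell.
text = "..."  # placeholder: a raw Kinyarwanda title or article
cleaned = ' '.join(tok.lower() for tok in text.split() if tok not in stopset_kin)
```

If you prefer not to install the package, the `stopset_kin`/`stopset_kir` set literals from the files above can be pasted into your code and used in exactly the same way.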
36 | 37 | ## Leaderboard (baselines) 38 | ### Monolingual 39 | #### KINNEWS 40 | | Model | Accuracy(%)| 41 | | ----- | ----------- | 42 | | BiGRU(W2V-Kin-50*) | 88.65 | 43 | | SVM(TF-IDF) | 88.53 | 44 | | BiGRU(W2V-Kin-100) | 88.29 | 45 | | CNN(W2V-Kin-50) | 87.55 | 46 | | CNN(W2V-Kin-100) | 87.54 | 47 | | LR(TF-IDF) | 87.14 | 48 | | MNB(TF-IDF) | 82.70 | 49 | | Char-CNN | 71.70 | 50 | #### KIRNEWS 51 | | Model | Accuracy(%)| 52 | | ----- | ----------- | 53 | | SVM(TF-IDF) | 90.14 | 54 | | CNN(W2V-Kin-100) | 88.01 | 55 | | BiGRU(W2V-Kin-100) | 86.61 | 56 | | LR(TF-IDF) | 86.13| 57 | | BiGRU(W2V-Kin-50) | 85.86 | 58 | | CNN(W2V-Kin-50) | 85.75 | 59 | | MNB(TF-IDF) | 82.67 | 60 | | Char-CNN | 69.23 | 61 | 62 | ### Cross-lingual 63 | | Model | Train set| Test set | Accuracy(%) | 64 | | ----- | ----------- | ------- | ---------| 65 | | MNB(TF-IDF) | KINNEWS | KIRNEWS | 73.46 | 66 | | SVM(TF-IDF) | KINNEWS | KIRNEWS | 72.70 | 67 | | LR(TF-IDF) | KINNEWS | KIRNEWS | 68.26 | 68 | | BiGRU(W2V-Kin-50) | KINNEWS | KIRNEWS | 67.54 | 69 | | BiGRU(W2V-Kin-100*) | KINNEWS | KIRNEWS | 65.06 | 70 | | CNN(W2V-Kin-100) | KINNEWS | KIRNEWS | 61.72 | 71 | | CNN(W2V-Kin-50) | KINNEWS | KIRNEWS | 60.64 | 72 | | Char-CNN | KINNEWS | KIRNEWS | 49.60 | 73 | 74 | | Model | Train set| Test set | Accuracy(%) | 75 | | ----- | ----------- | ------- | ---------| 76 | | CNN(W2V-Kin-100) | KIRNEWS | KIRNEWS | 88.01 | 77 | | BiGRU(W2V-Kin-100) | KIRNEWS | KIRNEWS | 86.61 | 78 | | CNN(W2V-Kin-50) | KIRNEWS | KIRNEWS | 85.75 | 79 | | BiGRU(W2V-Kin-50) | KIRNEWS | KIRNEWS | 83.38 | 80 | -------------------------------------------------------------------------------- /code/baselines/neural_models/BiGRU.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchtext import data 3 | import random 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torchtext.vocab as vocab 7 | from kkltk.kin_kir_stopwords import stopwords # check https://github.com/Andrews2017/kkltk for more detailed information about kkltk package 8 | 9 | stopset_kin = stopwords.words('kinyarwanda') 10 | 11 | ## Build the model 12 | # Generate the custom embeddings 13 | custom_embeddings = vocab.Vectors(name='../pre-trained_embeddings/kinyarwanda/W2V-Kin-50.txt', 14 | cache='../pre-trained_embeddings/kinyarwanda', 15 | unk_init=torch.Tensor.normal_) 16 | 17 | ### Prepare the data 18 | SEED = 1234 19 | torch.manual_seed(SEED) 20 | torch.cuda.manual_seed(SEED) 21 | torch.backends.cudnn.deterministic = True 22 | 23 | ## Load the dataset 24 | # Define fields to hold the data 25 | LABEL = data.LabelField(dtype=torch.float) 26 | TITLE = data.Field(tokenize='spacy', stop_words=stopset_kin) 27 | TEXT = data.Field(tokenize='spacy', stop_words=stopset_kin) 28 | 29 | fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)] 30 | 31 | # Upload CSV format dataset and do train/test splits 32 | train_data, test_data = data.TabularDataset.splits( 33 | path='../data/KINNEWS/cleaned', 34 | train='train.csv', 35 | test='test.csv', 36 | format='csv', 37 | fields=fields, 38 | skip_header=True # dataset has a header(title) 39 | ) 40 | 41 | # Train/validation set split 42 | train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED)) 43 | 44 | # Build the vocabulary 45 | TEXT.build_vocab(train_data.title, train_data.content, max_size=10000, vectors=custom_embeddings) 46 | TITLE.vocab = TEXT.vocab 47 | LABEL.build_vocab(train_data) 48 | 49 | # Create the iterator and place the 
tensor it returned on GPU(if it is available) 50 | BATCH_SIZE = 32 51 | 52 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 53 | 54 | train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits( 55 | (train_data, valid_data, test_data), 56 | batch_size=BATCH_SIZE, 57 | sort_key=lambda x: len(x.content), 58 | device=device) 59 | 60 | class RNN(nn.Module): 61 | def __init__(self, vocab_size, embedding_dim, hidden_dim, 62 | output_dim, n_layers, bidirectional, dropout): 63 | super().__init__() 64 | 65 | self.embedding = nn.Embedding(vocab_size, embedding_dim) 66 | self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=n_layers, 67 | bidirectional=bidirectional, dropout=dropout) 68 | self.fc = nn.Linear(hidden_dim * 2, output_dim) 69 | self.dropout = nn.Dropout(dropout) 70 | 71 | def forward(self, text): 72 | embedded = self.dropout(self.embedding(text)) 73 | output, hidden = self.rnn(embedded) 74 | hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)) 75 | return self.fc(hidden.squeeze(0)) 76 | 77 | # Create the instance of the model 78 | INPUT_DIM = len(TEXT.vocab) 79 | EMBEDDING_DIM = 50 80 | HIDDEN_DIM = 256 81 | OUTPUT_DIM = len(LABEL.vocab) 82 | N_LAYERS = 2 83 | BIDIRECTIONAL = True 84 | DROPOUT = 0.5 85 | 86 | model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT) 87 | 88 | pretrained_embeddings = TEXT.vocab.vectors 89 | 90 | model.embedding.weight.data.copy_(pretrained_embeddings) 91 | 92 | optimizer = optim.Adam(model.parameters()) 93 | 94 | criterion = nn.CrossEntropyLoss() 95 | 96 | model = model.to(device) 97 | criterion = criterion.to(device) 98 | 99 | def multiclass_accuracy(preds, y): 100 | """ 101 | Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8 102 | """ 103 | # Round predictions to the closest integer 104 | rounded_preds = torch.max(preds, -1)[1] 105 | correct = (rounded_preds == y).float() # convert into float for division 106 | acc = correct.sum() / len(correct) 107 | return acc 108 | 109 | def train(model, iterator, optimizer, criterion): 110 | """ Training the model""" 111 | epoch_loss = 0 112 | epoch_acc = 0 113 | model.train() 114 | 115 | for batch in iterator: 116 | optimizer.zero_grad() 117 | predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1) 118 | loss = criterion(predictions, batch.label.type(torch.cuda.LongTensor)) 119 | acc = multiclass_accuracy(predictions, batch.label.type(torch.cuda.LongTensor)) 120 | loss.backward() 121 | optimizer.step() 122 | epoch_loss += loss.item() 123 | epoch_acc += acc.item() 124 | 125 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 126 | 127 | def evaluate(model, iterator, criterion): 128 | """ Evaluating the model""" 129 | epoch_loss = 0 130 | epoch_acc = 0 131 | model.eval() 132 | 133 | with torch.no_grad(): 134 | for batch in iterator: 135 | predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1) 136 | loss = criterion(predictions, batch.label.type(torch.cuda.LongTensor)) 137 | acc = multiclass_accuracy(predictions, batch.label.type(torch.cuda.LongTensor)) 138 | epoch_loss += loss.item() 139 | epoch_acc += acc.item() 140 | 141 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 142 | 143 | def count_parameters(model): 144 | """ Counting the model parameters""" 145 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 146 | 147 | if __name__ == "__main__": 148 | N_EPOCHS = 10 149 | 150 | for epoch in 
range(N_EPOCHS): 151 | train_loss, train_acc = train(model, train_iterator, optimizer, criterion) 152 | valid_loss, valid_acc = evaluate(model, valid_iterator, criterion) 153 | print(f'\n| Epoch: {epoch + 1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}% |' 154 | f' Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}% |') 155 | 156 | test_loss, test_acc = evaluate(model, test_iterator, criterion) 157 | print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}% |') 158 | 159 | print(f'The model has {count_parameters(model):,} trainable parameters') 160 | -------------------------------------------------------------------------------- /code/baselines/neural_models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchtext import data 3 | import random 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torchtext.vocab as vocab 7 | import torch.nn.functional as F 8 | from kkltk.kin_kir_stopwords import stopwords # check https://github.com/Andrews2017/kkltk for more detailed information about kkltk package 9 | 10 | stopset_kin = stopwords.words('kinyarwanda') 11 | 12 | # Generate the custom embeddings 13 | custom_embeddings = vocab.Vectors(name='../pre-trained_embeddings/kinyarwanda/W2V-Kin-50.txt', 14 | cache='../pre-trained_embeddings/kinyarwanda', 15 | unk_init=torch.Tensor.normal_) 16 | 17 | ### Prepare the data 18 | SEED = 1234 19 | torch.manual_seed(SEED) 20 | torch.cuda.manual_seed(SEED) 21 | torch.backends.cudnn.deterministic = True 22 | 23 | ## Load the dataset 24 | # Define fields to hold the data 25 | LABEL = data.LabelField(dtype=torch.float) 26 | TITLE = data.Field(tokenize='spacy') 27 | TEXT = data.Field(tokenize='spacy') 28 | 29 | fields = [('label', LABEL), ('title', TITLE), ('content', TEXT)] 30 | 31 | # Upload CSV format dataset and do train/test splits 32 | train_data, test_data = data.TabularDataset.splits( 33 | path='../data/KINNEWS/cleaned', 34 | train='train.csv', 35 | test='test.csv', 36 | format='csv', 37 | fields=fields, 38 | skip_header=True # dataset has a header(title) 39 | ) 40 | 41 | # train/validation set split 42 | train_data, valid_data = train_data.split(split_ratio=0.9, random_state=random.seed(SEED), ) 43 | 44 | # build the vocabulary 45 | TEXT.build_vocab(train_data.title, train_data.content, max_size=15000, vectors=custom_embeddings) 46 | TITLE.vocab = TEXT.vocab 47 | LABEL.build_vocab(train_data) 48 | 49 | # create the iterator and place the tensor it returned on GPU(if it is available) 50 | BATCH_SIZE = 32 51 | 52 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 53 | # print(TEXT.vocab.vectors) 54 | 55 | train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits( 56 | (train_data, valid_data, test_data), 57 | batch_size=BATCH_SIZE, 58 | sort_key=lambda x: len(x.content), 59 | device=device) 60 | 61 | 62 | # Build the model 63 | class CNN(nn.Module): 64 | def __init__(self, vocab_size, embedding_dim, n_filters, 65 | filter_sizes, output_dim, dropout, pad_idx): 66 | super().__init__() 67 | 68 | self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx) 69 | self.convs = nn.ModuleList([ 70 | nn.Conv2d(in_channels=1, 71 | out_channels=n_filters, 72 | kernel_size=(fs, embedding_dim)) 73 | for fs in filter_sizes 74 | ]) 75 | self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim) 76 | self.dropout = nn.Dropout(dropout) 77 | 78 | def forward(self, text): 79 | text = 
text.permute(1, 0) 80 | embedded = self.embedding(text) 81 | embedded = embedded.unsqueeze(1) 82 | conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs] 83 | pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved] 84 | cat = self.dropout(torch.cat(pooled, dim=1)) 85 | return self.fc(cat) 86 | 87 | # create the instance of the model 88 | INPUT_DIM = len(TEXT.vocab) 89 | EMBEDDING_DIM = 50 90 | N_FILTERS = 150 91 | FILTER_SIZES = [3, 4, 5] 92 | OUTPUT_DIM = len(LABEL.vocab) 93 | DROPOUT = 0.5 94 | PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] 95 | 96 | model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX) 97 | 98 | pretrained_embeddings = TEXT.vocab.vectors 99 | 100 | model.embedding.weight.data.copy_(pretrained_embeddings) 101 | 102 | UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token] 103 | 104 | model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM) 105 | model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM) 106 | 107 | optimizer = optim.Adam(model.parameters()) 108 | 109 | criterion = nn.CrossEntropyLoss() 110 | 111 | model = model.to(device) 112 | criterion = criterion.to(device) 113 | 114 | def multiclass_accuracy(preds, y): 115 | """ 116 | Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8 117 | """ 118 | # round predictions to the closest integer 119 | rounded_preds = torch.max(preds, -1)[1] 120 | correct = (rounded_preds == y).float() # convert into float for division 121 | acc = correct.sum() / len(correct) 122 | return acc 123 | 124 | 125 | # train the model 126 | def train(model, iterator, optimizer, criterion): 127 | """ Training the model""" 128 | epoch_loss = 0 129 | epoch_acc = 0 130 | model.train() 131 | 132 | for batch in iterator: 133 | optimizer.zero_grad() 134 | 135 | predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1) 136 | loss = criterion(predictions, batch.label.type(torch.cuda.LongTensor)) 137 | acc = multiclass_accuracy(predictions, batch.label.type(torch.cuda.LongTensor)) 138 | loss.backward() 139 | optimizer.step() 140 | epoch_loss += loss.item() 141 | epoch_acc += acc.item() 142 | 143 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 144 | 145 | # evaluate the model 146 | def evaluate(model, iterator, criterion): 147 | """ Evaluating the model""" 148 | epoch_loss = 0 149 | epoch_acc = 0 150 | model.eval() 151 | 152 | with torch.no_grad(): 153 | for batch in iterator: 154 | predictions = model(torch.cat((batch.title, batch.content), 0)).squeeze(1) 155 | loss = criterion(predictions, batch.label.type(torch.cuda.LongTensor)) 156 | acc = multiclass_accuracy(predictions, batch.label.type(torch.cuda.LongTensor)) 157 | epoch_loss += loss.item() 158 | epoch_acc += acc.item() 159 | 160 | return epoch_loss / len(iterator), epoch_acc / len(iterator) 161 | 162 | def count_parameters(model): 163 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 164 | 165 | if __name__ == "__main__": 166 | N_EPOCHS = 8 167 | 168 | for epoch in range(N_EPOCHS): 169 | train_loss, train_acc = train(model, train_iterator, optimizer, criterion) 170 | valid_loss, valid_acc = evaluate(model, valid_iterator, criterion) 171 | print(f'\n| Epoch: {epoch + 1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}% |' 172 | f' Val. Loss: {valid_loss:.3f} | Val. 
Acc: {valid_acc * 100:.2f}% |') 173 | 174 | test_loss, test_acc = evaluate(model, test_iterator, criterion) 175 | print(f'| Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}% |') 176 | 177 | print(f'The model has {count_parameters(model):,} trainable parameters') 178 | -------------------------------------------------------------------------------- /code/baselines/neural_models/char-cnn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import torch 5 | import shutil 6 | import torch.nn as nn 7 | import numpy as np 8 | from sklearn import metrics 9 | from torch.utils.data import Dataset 10 | from torch.utils.data import DataLoader 11 | from tensorboardX import SummaryWriter 12 | csv.field_size_limit(sys.maxsize) 13 | 14 | class MyDataset(Dataset): 15 | def __init__(self, data_path, max_length=1500): 16 | self.data_path = data_path 17 | self.vocabulary = list("""abcdefghijklmnoprstuvwyz0123456789-,;.!?:’’’/\|_@#$%ˆ&*˜‘+-=<>()[]{}""") # removed 'q' and 'x' as they do not exist in Kinyarwanda and Kirundi alphabets 18 | self.identity_mat = np.identity(len(self.vocabulary)) 19 | texts, labels = [], [] 20 | with open(data_path) as csv_file: 21 | reader = csv.reader(csv_file, quotechar='"') 22 | for idx, line in enumerate(reader): 23 | if idx != 0: 24 | text = "" 25 | for tx in line[1:]: 26 | text += tx 27 | text += " " 28 | label = line[0] 29 | texts.append(text) 30 | labels.append(label) 31 | self.texts = texts 32 | self.labels = labels 33 | self.max_length = max_length 34 | self.length = len(self.labels) 35 | self.num_classes = len(set(self.labels)) 36 | 37 | def __len__(self): 38 | return self.length 39 | 40 | def __getitem__(self, index): 41 | raw_text = self.texts[index] 42 | data = np.array([self.identity_mat[self.vocabulary.index(i)] for i in list(raw_text) if i in self.vocabulary], 43 | dtype=np.float32) 44 | if len(data) > self.max_length: 45 | data = data[:self.max_length] 46 | elif 0 < len(data) < self.max_length: 47 | data = np.concatenate( 48 | (data, np.zeros((self.max_length - len(data), len(self.vocabulary)), dtype=np.float32))) 49 | elif len(data) == 0: 50 | data = np.zeros((self.max_length, len(self.vocabulary)), dtype=np.float32) 51 | label = self.labels[index] 52 | return data, label 53 | 54 | def get_evaluation(y_true, y_prob, list_metrics): 55 | y_pred = np.argmax(y_prob, -1) 56 | output = {} 57 | if 'accuracy' in list_metrics: 58 | output['accuracy'] = metrics.accuracy_score(y_true, y_pred) 59 | if 'loss' in list_metrics: 60 | try: 61 | output['loss'] = metrics.log_loss(y_true, y_prob) 62 | except ValueError: 63 | output['loss'] = -1 64 | if 'confusion_matrix' in list_metrics: 65 | output['confusion_matrix'] = str(metrics.confusion_matrix(y_true, y_pred)) 66 | return output 67 | 68 | class CharacterLevelCNN(nn.Module): 69 | def __init__(self, n_classes=14, input_length=1500, input_dim=68, 70 | n_conv_filters=256, 71 | n_fc_neurons=1024): 72 | super(CharacterLevelCNN, self).__init__() 73 | self.conv1 = nn.Sequential(nn.Conv1d(input_dim, n_conv_filters, kernel_size=7, padding=0), nn.ReLU(), 74 | nn.MaxPool1d(3)) 75 | self.conv2 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=7, padding=0), nn.ReLU(), 76 | nn.MaxPool1d(3)) 77 | self.conv3 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU()) 78 | self.conv4 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU()) 79 | self.conv5 = 
nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU()) 80 | self.conv6 = nn.Sequential(nn.Conv1d(n_conv_filters, n_conv_filters, kernel_size=3, padding=0), nn.ReLU(), 81 | nn.MaxPool1d(3)) 82 | # compute the output shape after forwarding an input to the conv layers 83 | input_shape = (128, 84 | input_length, 85 | input_dim) 86 | self.output_dimension = self._get_conv_output(input_shape) 87 | 88 | self.fc1 = nn.Sequential(nn.Linear(self.output_dimension, n_fc_neurons), nn.Dropout(0.5)) 89 | self.fc2 = nn.Sequential(nn.Linear(n_fc_neurons, n_fc_neurons), nn.Dropout(0.5)) 90 | self.fc3 = nn.Linear(n_fc_neurons, n_classes) 91 | 92 | if n_conv_filters == 256 and n_fc_neurons == 1024: 93 | self._create_weights(mean=0.0, std=0.05) 94 | elif n_conv_filters == 1024 and n_fc_neurons == 2048: 95 | self._create_weights(mean=0.0, std=0.02) 96 | 97 | def _create_weights(self, mean=0.0, std=0.05): 98 | for module in self.modules(): 99 | if isinstance(module, nn.Conv1d) or isinstance(module, nn.Linear): 100 | module.weight.data.normal_(mean, std) 101 | 102 | def _get_conv_output(self, shape): 103 | x = torch.rand(shape) 104 | x = x.transpose(1, 2) 105 | x = self.conv1(x) 106 | x = self.conv2(x) 107 | x = self.conv3(x) 108 | x = self.conv4(x) 109 | x = self.conv5(x) 110 | x = self.conv6(x) 111 | x = x.view(x.size(0), -1) 112 | output_dimension = x.size(1) 113 | return output_dimension 114 | 115 | def forward(self, input): 116 | input = input.transpose(1, 2) 117 | output = self.conv1(input) 118 | output = self.conv2(output) 119 | output = self.conv3(output) 120 | output = self.conv4(output) 121 | output = self.conv5(output) 122 | output = self.conv6(output) 123 | 124 | output = output.view(output.size(0), -1) 125 | output = self.fc1(output) 126 | output = self.fc2(output) 127 | output = self.fc3(output) 128 | 129 | return output 130 | 131 | def train(feature,log_path, optimizer): 132 | if torch.cuda.is_available(): 133 | torch.cuda.manual_seed(123) 134 | else: 135 | torch.manual_seed(123) 136 | 137 | if not os.path.exists(output): 138 | os.makedirs(output) 139 | output_file = open(output + os.sep + "logs.txt", "w") 140 | output_file.write("Model's parameters: {}".format()) 141 | 142 | training_params = {"batch_size": batch_size, 143 | "shuffle": True, 144 | "num_workers": 0} 145 | test_params = {"batch_size": batch_size, 146 | "shuffle": True, 147 | "num_workers": 0} 148 | training_set = MyDataset(input + "/train.csv", max_length) 149 | test_set = MyDataset(input + "/test.csv", max_length) 150 | training_generator = DataLoader(training_set, **training_params) 151 | test_generator = DataLoader(test_set, **test_params) 152 | 153 | if feature == "small": 154 | model = CharacterLevelCNN(input_length=max_length, n_classes=training_set.num_classes, 155 | input_dim=len(alphabet), 156 | n_conv_filters=256, n_fc_neurons=1024) 157 | 158 | elif feature == "large": 159 | model = CharacterLevelCNN(input_length=max_length, n_classes=training_set.num_classes, 160 | input_dim=len(alphabet), 161 | n_conv_filters=1024, n_fc_neurons=2048) 162 | else: 163 | sys.exit("Invalid feature mode!") 164 | 165 | log_path = "{}_{}_{}".format(log_path, feature, dataset) 166 | if os.path.isdir(log_path): 167 | shutil.rmtree(log_path) 168 | os.makedirs(log_path) 169 | writer = SummaryWriter(log_path) 170 | 171 | if torch.cuda.is_available(): 172 | model.cuda() 173 | 174 | criterion = nn.CrossEntropyLoss() 175 | if optimizer == "adam": 176 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 177 | 
elif optimizer == "sgd": 178 | optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9) 179 | best_loss = 1e5 180 | best_epoch = 0 181 | model.train() 182 | num_iter_per_epoch = len(training_generator) 183 | 184 | for epoch in range(num_epochs): 185 | for iter, batch in enumerate(training_generator): 186 | features, label = batch 187 | label = np.array(label, int) 188 | label = torch.Tensor(label) 189 | if torch.cuda.is_available(): 190 | features = features.cuda() 191 | label = label.cuda() 192 | optimizer.zero_grad() 193 | predictions = model(features) 194 | loss = criterion(predictions, label.type(torch.cuda.LongTensor)) 195 | loss.backward() 196 | optimizer.step() 197 | 198 | training_metrics = get_evaluation(label.cpu().numpy(), predictions.cpu().detach().numpy(), 199 | list_metrics=["accuracy"]) 200 | 201 | writer.add_scalar('Train/Loss', loss, epoch * num_iter_per_epoch + iter) 202 | writer.add_scalar('Train/Accuracy', training_metrics["accuracy"], epoch * num_iter_per_epoch + iter) 203 | print("Epoch: {}/{}, Lr: {}, Train Loss: {}, Train Accuracy: {}".format( 204 | epoch + 1, 205 | num_epochs, 206 | optimizer.param_groups[0]['lr'], 207 | loss, training_metrics["accuracy"])) 208 | 209 | model.eval() 210 | loss_ls = [] 211 | te_label_ls = [] 212 | te_pred_ls = [] 213 | for batch in test_generator: 214 | te_feature, te_label = batch 215 | num_sample = len(te_label) 216 | te_label = np.array(te_label, int) 217 | te_label = torch.Tensor(te_label) 218 | if torch.cuda.is_available(): 219 | te_feature = te_feature.cuda() 220 | te_label = te_label.cuda() 221 | with torch.no_grad(): 222 | te_predictions = model(te_feature) 223 | te_loss = criterion(te_predictions, te_label.type(torch.cuda.LongTensor)) 224 | loss_ls.append(te_loss * num_sample) 225 | te_label_ls.extend(te_label.clone().cpu()) 226 | te_pred_ls.append(te_predictions.clone().cpu()) 227 | 228 | te_loss = sum(loss_ls) / test_set.__len__() 229 | te_pred = torch.cat(te_pred_ls, 0) 230 | te_label = np.array(te_label_ls) 231 | test_metrics = get_evaluation(te_label, te_pred.numpy(), list_metrics=["accuracy", "confusion_matrix"]) 232 | output_file.write( 233 | "Epoch: {}/{} \nTest loss: {} Test accuracy: {} \nTest confusion matrix: \n{}\n\n".format( 234 | epoch + 1, num_epochs, 235 | te_loss, 236 | test_metrics["accuracy"], 237 | test_metrics["confusion_matrix"])) 238 | print("Epoch: {}/{}, Lr: {}, Test Loss: {}, Test Accuracy: {}".format( 239 | epoch + 1, 240 | num_epochs, 241 | optimizer.param_groups[0]['lr'], 242 | te_loss, test_metrics["accuracy"])) 243 | writer.add_scalar('Test/Loss', te_loss, epoch) 244 | writer.add_scalar('Test/Accuracy', test_metrics["accuracy"], epoch) 245 | model.train() 246 | if te_loss + es_min_delta < best_loss: 247 | best_loss = te_loss 248 | best_epoch = epoch 249 | torch.save(model, "{}/char-cnn_{}_{}".format(output, dataset, feature)) 250 | # Early stopping 251 | if epoch - best_epoch > es_patience > 0: 252 | print("Stop training at epoch {}. 
The lowest loss achieved is {} at epoch {}".format(epoch, best_loss, best_epoch))
253 |                 break
254 |             # Halve the learning rate every 3 epochs when SGD is used; `optimizer` now holds the torch optimizer object, so check its type rather than the original "sgd" string
255 |             if isinstance(optimizer, torch.optim.SGD) and epoch % 3 == 0 and epoch > 0:
256 |                 current_lr = optimizer.state_dict()['param_groups'][0]['lr']
257 |                 current_lr /= 2
258 |                 for param_group in optimizer.param_groups:
259 |                     param_group['lr'] = current_lr
260 | 
261 | if __name__ == "__main__":
262 |     torch.backends.cudnn.deterministic = True
263 | 
264 |     alphabet = "abcdefghijklmnoprstuvwyz0123456789-,;.!?:’’’/\|_@#$%ˆ&*˜‘+-=<>()[]{}"
265 |     max_length = 1500
266 |     optimizer = "sgd"
267 |     batch_size = 128
268 |     num_epochs = 20
269 |     lr = 0.001
270 |     es_min_delta = 0.0
271 |     es_patience = 3
272 |     input = "../data/KINNEWS/cleaned"
273 |     output = "../output"
274 |     log_path = "../tensorboard/char-cnn"
275 |     dataset = "KINNEWS"  # used only to name the TensorBoard log dir and the saved checkpoint; set to "KIRNEWS" when training on KIRNEWS
276 |     train("small", log_path, optimizer)
277 | 
--------------------------------------------------------------------------------
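For reference, the checkpoint written by `train()` above stores the whole model object via `torch.save(model, ...)`. Below is a minimal, hedged inference sketch under that assumption: the path reflects the defaults in the `__main__` block (`output="../output"`, `dataset="KINNEWS"`, `feature="small"`), `CharacterLevelCNN` must be importable when the checkpoint is unpickled, and recent PyTorch versions may additionally need `weights_only=False` in `torch.load`.

```python
import numpy as np
import torch

# Same character vocabulary and maximum length as char-cnn.py (q and x excluded).
VOCABULARY = list("""abcdefghijklmnoprstuvwyz0123456789-,;.!?:’’’/\|_@#$%ˆ&*˜‘+-=<>()[]{}""")
IDENTITY = np.identity(len(VOCABULARY))
MAX_LENGTH = 1500

def encode(text):
    # One-hot encode characters exactly as MyDataset.__getitem__ does:
    # skip characters outside the vocabulary, then pad/truncate to MAX_LENGTH.
    data = np.array([IDENTITY[VOCABULARY.index(ch)] for ch in text if ch in VOCABULARY],
                    dtype=np.float32)
    if len(data) > MAX_LENGTH:
        data = data[:MAX_LENGTH]
    else:
        pad = np.zeros((MAX_LENGTH - len(data), len(VOCABULARY)), dtype=np.float32)
        data = np.concatenate((data, pad)) if len(data) else pad
    return torch.from_numpy(data).unsqueeze(0)  # add a batch dimension

# train() saved the full model object, so torch.load restores it directly.
model = torch.load("../output/char-cnn_KINNEWS_small", map_location="cpu")
model.eval()

with torch.no_grad():
    logits = model(encode("placeholder article text"))
    print("predicted class index:", logits.argmax(-1).item())
```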