├── README.md
└── Brainiac_2nd_place_solution_Swahili_News_Classification_Challenge.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # AI4D Swahili News Classification Challenge
2 |
3 | Kiswahili is a lingua franca spoken by 100-150 million people across East Africa. It is an official language in the DRC, Kenya, Tanzania, and Uganda; in Tanzania it is a first language for most people and the official language of instruction in all schools, while in other countries it is a common second language. News in Kiswahili is an important part of the media sphere in East Africa.
4 |
5 | News contributes to the education, technology, and economic growth of a country, and news in local languages plays an important cultural role in many African countries. In the modern age, African languages in news and other spheres are at risk of being lost as English becomes the dominant language in online spaces.
6 |
7 | The objective of this hackathon was to develop a multi-class classification model to classify news content according to six specific categories. The model is to be used by Swahili online platforms to automatically group news by category and help readers find the specific news they want to read.
8 |
9 | The evaluation metric for this challenge is **Log Loss**.
10 | ---
11 |
--------------------------------------------------------------------------------
/Brainiac_2nd_place_solution_Swahili_News_Classification_Challenge.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Brainiac_2nd_place_solution_Swahili_News_Classification_Challenge.ipynb",
7 | "provenance": [],
8 | "collapsed_sections": []
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "accelerator": "GPU",
15 | "widgets": {
16 | "application/vnd.jupyter.widget-state+json": {
17 | "fa3a8735e68c41c7accffc74ce9447ae": {
18 | "model_module": "@jupyter-widgets/controls",
19 | "model_name": "HBoxModel",
20 | "state": {
21 | "_view_name": "HBoxView",
22 | "_dom_classes": [],
23 | "_model_name": "HBoxModel",
24 | "_view_module": "@jupyter-widgets/controls",
25 | "_model_module_version": "1.5.0",
26 | "_view_count": null,
27 | "_view_module_version": "1.5.0",
28 | "box_style": "",
29 | "layout": "IPY_MODEL_ec598ba5ac874ce7853c68e8ebe93645",
30 | "_model_module": "@jupyter-widgets/controls",
31 | "children": [
32 | "IPY_MODEL_8b748858dc2f43dbbe61e0e7d60364a0",
33 | "IPY_MODEL_4d3c49b123ea443a875493bb77e162cb"
34 | ]
35 | }
36 | },
37 | "ec598ba5ac874ce7853c68e8ebe93645": {
38 | "model_module": "@jupyter-widgets/base",
39 | "model_name": "LayoutModel",
40 | "state": {
41 | "_view_name": "LayoutView",
42 | "grid_template_rows": null,
43 | "right": null,
44 | "justify_content": null,
45 | "_view_module": "@jupyter-widgets/base",
46 | "overflow": null,
47 | "_model_module_version": "1.2.0",
48 | "_view_count": null,
49 | "flex_flow": null,
50 | "width": null,
51 | "min_width": null,
52 | "border": null,
53 | "align_items": null,
54 | "bottom": null,
55 | "_model_module": "@jupyter-widgets/base",
56 | "top": null,
57 | "grid_column": null,
58 | "overflow_y": null,
59 | "overflow_x": null,
60 | "grid_auto_flow": null,
61 | "grid_area": null,
62 | "grid_template_columns": null,
63 | "flex": null,
64 | "_model_name": "LayoutModel",
65 | "justify_items": null,
66 | "grid_row": null,
67 | "max_height": null,
68 | "align_content": null,
69 | "visibility": null,
70 | "align_self": null,
71 | "height": null,
72 | "min_height": null,
73 | "padding": null,
74 | "grid_auto_rows": null,
75 | "grid_gap": null,
76 | "max_width": null,
77 | "order": null,
78 | "_view_module_version": "1.2.0",
79 | "grid_template_areas": null,
80 | "object_position": null,
81 | "object_fit": null,
82 | "grid_auto_columns": null,
83 | "margin": null,
84 | "display": null,
85 | "left": null
86 | }
87 | },
88 | "8b748858dc2f43dbbe61e0e7d60364a0": {
89 | "model_module": "@jupyter-widgets/controls",
90 | "model_name": "FloatProgressModel",
91 | "state": {
92 | "_view_name": "ProgressView",
93 | "style": "IPY_MODEL_958db85b2ea744dd81c994a50933d1b5",
94 | "_dom_classes": [],
95 | "description": "Downloading: 100%",
96 | "_model_name": "FloatProgressModel",
97 | "bar_style": "success",
98 | "max": 512,
99 | "_view_module": "@jupyter-widgets/controls",
100 | "_model_module_version": "1.5.0",
101 | "value": 512,
102 | "_view_count": null,
103 | "_view_module_version": "1.5.0",
104 | "orientation": "horizontal",
105 | "min": 0,
106 | "description_tooltip": null,
107 | "_model_module": "@jupyter-widgets/controls",
108 | "layout": "IPY_MODEL_f78f05533cca49f995448596c4231a98"
109 | }
110 | },
111 | "4d3c49b123ea443a875493bb77e162cb": {
112 | "model_module": "@jupyter-widgets/controls",
113 | "model_name": "HTMLModel",
114 | "state": {
115 | "_view_name": "HTMLView",
116 | "style": "IPY_MODEL_9619e3c1eb3a409899c702486a17e792",
117 | "_dom_classes": [],
118 | "description": "",
119 | "_model_name": "HTMLModel",
120 | "placeholder": "",
121 | "_view_module": "@jupyter-widgets/controls",
122 | "_model_module_version": "1.5.0",
123 | "value": " 512/512 [00:00<00:00, 6.18kB/s]",
124 | "_view_count": null,
125 | "_view_module_version": "1.5.0",
126 | "description_tooltip": null,
127 | "_model_module": "@jupyter-widgets/controls",
128 | "layout": "IPY_MODEL_28422b1b07ea447db4261b9ea662659c"
129 | }
130 | },
131 | "958db85b2ea744dd81c994a50933d1b5": {
132 | "model_module": "@jupyter-widgets/controls",
133 | "model_name": "ProgressStyleModel",
134 | "state": {
135 | "_view_name": "StyleView",
136 | "_model_name": "ProgressStyleModel",
137 | "description_width": "initial",
138 | "_view_module": "@jupyter-widgets/base",
139 | "_model_module_version": "1.5.0",
140 | "_view_count": null,
141 | "_view_module_version": "1.2.0",
142 | "bar_color": null,
143 | "_model_module": "@jupyter-widgets/controls"
144 | }
145 | },
146 | "f78f05533cca49f995448596c4231a98": {
147 | "model_module": "@jupyter-widgets/base",
148 | "model_name": "LayoutModel",
149 | "state": {
150 | "_view_name": "LayoutView",
151 | "grid_template_rows": null,
152 | "right": null,
153 | "justify_content": null,
154 | "_view_module": "@jupyter-widgets/base",
155 | "overflow": null,
156 | "_model_module_version": "1.2.0",
157 | "_view_count": null,
158 | "flex_flow": null,
159 | "width": null,
160 | "min_width": null,
161 | "border": null,
162 | "align_items": null,
163 | "bottom": null,
164 | "_model_module": "@jupyter-widgets/base",
165 | "top": null,
166 | "grid_column": null,
167 | "overflow_y": null,
168 | "overflow_x": null,
169 | "grid_auto_flow": null,
170 | "grid_area": null,
171 | "grid_template_columns": null,
172 | "flex": null,
173 | "_model_name": "LayoutModel",
174 | "justify_items": null,
175 | "grid_row": null,
176 | "max_height": null,
177 | "align_content": null,
178 | "visibility": null,
179 | "align_self": null,
180 | "height": null,
181 | "min_height": null,
182 | "padding": null,
183 | "grid_auto_rows": null,
184 | "grid_gap": null,
185 | "max_width": null,
186 | "order": null,
187 | "_view_module_version": "1.2.0",
188 | "grid_template_areas": null,
189 | "object_position": null,
190 | "object_fit": null,
191 | "grid_auto_columns": null,
192 | "margin": null,
193 | "display": null,
194 | "left": null
195 | }
196 | },
197 | "9619e3c1eb3a409899c702486a17e792": {
198 | "model_module": "@jupyter-widgets/controls",
199 | "model_name": "DescriptionStyleModel",
200 | "state": {
201 | "_view_name": "StyleView",
202 | "_model_name": "DescriptionStyleModel",
203 | "description_width": "",
204 | "_view_module": "@jupyter-widgets/base",
205 | "_model_module_version": "1.5.0",
206 | "_view_count": null,
207 | "_view_module_version": "1.2.0",
208 | "_model_module": "@jupyter-widgets/controls"
209 | }
210 | },
211 | "28422b1b07ea447db4261b9ea662659c": {
212 | "model_module": "@jupyter-widgets/base",
213 | "model_name": "LayoutModel",
214 | "state": {
215 | "_view_name": "LayoutView",
216 | "grid_template_rows": null,
217 | "right": null,
218 | "justify_content": null,
219 | "_view_module": "@jupyter-widgets/base",
220 | "overflow": null,
221 | "_model_module_version": "1.2.0",
222 | "_view_count": null,
223 | "flex_flow": null,
224 | "width": null,
225 | "min_width": null,
226 | "border": null,
227 | "align_items": null,
228 | "bottom": null,
229 | "_model_module": "@jupyter-widgets/base",
230 | "top": null,
231 | "grid_column": null,
232 | "overflow_y": null,
233 | "overflow_x": null,
234 | "grid_auto_flow": null,
235 | "grid_area": null,
236 | "grid_template_columns": null,
237 | "flex": null,
238 | "_model_name": "LayoutModel",
239 | "justify_items": null,
240 | "grid_row": null,
241 | "max_height": null,
242 | "align_content": null,
243 | "visibility": null,
244 | "align_self": null,
245 | "height": null,
246 | "min_height": null,
247 | "padding": null,
248 | "grid_auto_rows": null,
249 | "grid_gap": null,
250 | "max_width": null,
251 | "order": null,
252 | "_view_module_version": "1.2.0",
253 | "grid_template_areas": null,
254 | "object_position": null,
255 | "object_fit": null,
256 | "grid_auto_columns": null,
257 | "margin": null,
258 | "display": null,
259 | "left": null
260 | }
261 | },
262 | "560d3fa635694e2a9a9410a011737075": {
263 | "model_module": "@jupyter-widgets/controls",
264 | "model_name": "HBoxModel",
265 | "state": {
266 | "_view_name": "HBoxView",
267 | "_dom_classes": [],
268 | "_model_name": "HBoxModel",
269 | "_view_module": "@jupyter-widgets/controls",
270 | "_model_module_version": "1.5.0",
271 | "_view_count": null,
272 | "_view_module_version": "1.5.0",
273 | "box_style": "",
274 | "layout": "IPY_MODEL_2c5283f30c8f428a8dd509702b90729b",
275 | "_model_module": "@jupyter-widgets/controls",
276 | "children": [
277 | "IPY_MODEL_86f143f04251403e8ed50fe52e72df2a",
278 | "IPY_MODEL_144eea02d8b2441196d91c646be08d45"
279 | ]
280 | }
281 | },
282 | "2c5283f30c8f428a8dd509702b90729b": {
283 | "model_module": "@jupyter-widgets/base",
284 | "model_name": "LayoutModel",
285 | "state": {
286 | "_view_name": "LayoutView",
287 | "grid_template_rows": null,
288 | "right": null,
289 | "justify_content": null,
290 | "_view_module": "@jupyter-widgets/base",
291 | "overflow": null,
292 | "_model_module_version": "1.2.0",
293 | "_view_count": null,
294 | "flex_flow": null,
295 | "width": null,
296 | "min_width": null,
297 | "border": null,
298 | "align_items": null,
299 | "bottom": null,
300 | "_model_module": "@jupyter-widgets/base",
301 | "top": null,
302 | "grid_column": null,
303 | "overflow_y": null,
304 | "overflow_x": null,
305 | "grid_auto_flow": null,
306 | "grid_area": null,
307 | "grid_template_columns": null,
308 | "flex": null,
309 | "_model_name": "LayoutModel",
310 | "justify_items": null,
311 | "grid_row": null,
312 | "max_height": null,
313 | "align_content": null,
314 | "visibility": null,
315 | "align_self": null,
316 | "height": null,
317 | "min_height": null,
318 | "padding": null,
319 | "grid_auto_rows": null,
320 | "grid_gap": null,
321 | "max_width": null,
322 | "order": null,
323 | "_view_module_version": "1.2.0",
324 | "grid_template_areas": null,
325 | "object_position": null,
326 | "object_fit": null,
327 | "grid_auto_columns": null,
328 | "margin": null,
329 | "display": null,
330 | "left": null
331 | }
332 | },
333 | "86f143f04251403e8ed50fe52e72df2a": {
334 | "model_module": "@jupyter-widgets/controls",
335 | "model_name": "FloatProgressModel",
336 | "state": {
337 | "_view_name": "ProgressView",
338 | "style": "IPY_MODEL_fdb5ff5ec5ab4fa486f8714cd185d799",
339 | "_dom_classes": [],
340 | "description": "Downloading: 100%",
341 | "_model_name": "FloatProgressModel",
342 | "bar_style": "success",
343 | "max": 512,
344 | "_view_module": "@jupyter-widgets/controls",
345 | "_model_module_version": "1.5.0",
346 | "value": 512,
347 | "_view_count": null,
348 | "_view_module_version": "1.5.0",
349 | "orientation": "horizontal",
350 | "min": 0,
351 | "description_tooltip": null,
352 | "_model_module": "@jupyter-widgets/controls",
353 | "layout": "IPY_MODEL_901791b19c524307b8105bc71fa3d27b"
354 | }
355 | },
356 | "144eea02d8b2441196d91c646be08d45": {
357 | "model_module": "@jupyter-widgets/controls",
358 | "model_name": "HTMLModel",
359 | "state": {
360 | "_view_name": "HTMLView",
361 | "style": "IPY_MODEL_b06a0c9a224641fabc62ebcc8241bca6",
362 | "_dom_classes": [],
363 | "description": "",
364 | "_model_name": "HTMLModel",
365 | "placeholder": "",
366 | "_view_module": "@jupyter-widgets/controls",
367 | "_model_module_version": "1.5.0",
368 | "value": " 512/512 [00:03<00:00, 160B/s]",
369 | "_view_count": null,
370 | "_view_module_version": "1.5.0",
371 | "description_tooltip": null,
372 | "_model_module": "@jupyter-widgets/controls",
373 | "layout": "IPY_MODEL_c014fdd5fa564f50be83089471cc21e4"
374 | }
375 | },
376 | "fdb5ff5ec5ab4fa486f8714cd185d799": {
377 | "model_module": "@jupyter-widgets/controls",
378 | "model_name": "ProgressStyleModel",
379 | "state": {
380 | "_view_name": "StyleView",
381 | "_model_name": "ProgressStyleModel",
382 | "description_width": "initial",
383 | "_view_module": "@jupyter-widgets/base",
384 | "_model_module_version": "1.5.0",
385 | "_view_count": null,
386 | "_view_module_version": "1.2.0",
387 | "bar_color": null,
388 | "_model_module": "@jupyter-widgets/controls"
389 | }
390 | },
391 | "901791b19c524307b8105bc71fa3d27b": {
392 | "model_module": "@jupyter-widgets/base",
393 | "model_name": "LayoutModel",
394 | "state": {
395 | "_view_name": "LayoutView",
396 | "grid_template_rows": null,
397 | "right": null,
398 | "justify_content": null,
399 | "_view_module": "@jupyter-widgets/base",
400 | "overflow": null,
401 | "_model_module_version": "1.2.0",
402 | "_view_count": null,
403 | "flex_flow": null,
404 | "width": null,
405 | "min_width": null,
406 | "border": null,
407 | "align_items": null,
408 | "bottom": null,
409 | "_model_module": "@jupyter-widgets/base",
410 | "top": null,
411 | "grid_column": null,
412 | "overflow_y": null,
413 | "overflow_x": null,
414 | "grid_auto_flow": null,
415 | "grid_area": null,
416 | "grid_template_columns": null,
417 | "flex": null,
418 | "_model_name": "LayoutModel",
419 | "justify_items": null,
420 | "grid_row": null,
421 | "max_height": null,
422 | "align_content": null,
423 | "visibility": null,
424 | "align_self": null,
425 | "height": null,
426 | "min_height": null,
427 | "padding": null,
428 | "grid_auto_rows": null,
429 | "grid_gap": null,
430 | "max_width": null,
431 | "order": null,
432 | "_view_module_version": "1.2.0",
433 | "grid_template_areas": null,
434 | "object_position": null,
435 | "object_fit": null,
436 | "grid_auto_columns": null,
437 | "margin": null,
438 | "display": null,
439 | "left": null
440 | }
441 | },
442 | "b06a0c9a224641fabc62ebcc8241bca6": {
443 | "model_module": "@jupyter-widgets/controls",
444 | "model_name": "DescriptionStyleModel",
445 | "state": {
446 | "_view_name": "StyleView",
447 | "_model_name": "DescriptionStyleModel",
448 | "description_width": "",
449 | "_view_module": "@jupyter-widgets/base",
450 | "_model_module_version": "1.5.0",
451 | "_view_count": null,
452 | "_view_module_version": "1.2.0",
453 | "_model_module": "@jupyter-widgets/controls"
454 | }
455 | },
456 | "c014fdd5fa564f50be83089471cc21e4": {
457 | "model_module": "@jupyter-widgets/base",
458 | "model_name": "LayoutModel",
459 | "state": {
460 | "_view_name": "LayoutView",
461 | "grid_template_rows": null,
462 | "right": null,
463 | "justify_content": null,
464 | "_view_module": "@jupyter-widgets/base",
465 | "overflow": null,
466 | "_model_module_version": "1.2.0",
467 | "_view_count": null,
468 | "flex_flow": null,
469 | "width": null,
470 | "min_width": null,
471 | "border": null,
472 | "align_items": null,
473 | "bottom": null,
474 | "_model_module": "@jupyter-widgets/base",
475 | "top": null,
476 | "grid_column": null,
477 | "overflow_y": null,
478 | "overflow_x": null,
479 | "grid_auto_flow": null,
480 | "grid_area": null,
481 | "grid_template_columns": null,
482 | "flex": null,
483 | "_model_name": "LayoutModel",
484 | "justify_items": null,
485 | "grid_row": null,
486 | "max_height": null,
487 | "align_content": null,
488 | "visibility": null,
489 | "align_self": null,
490 | "height": null,
491 | "min_height": null,
492 | "padding": null,
493 | "grid_auto_rows": null,
494 | "grid_gap": null,
495 | "max_width": null,
496 | "order": null,
497 | "_view_module_version": "1.2.0",
498 | "grid_template_areas": null,
499 | "object_position": null,
500 | "object_fit": null,
501 | "grid_auto_columns": null,
502 | "margin": null,
503 | "display": null,
504 | "left": null
505 | }
506 | },
507 | "a2f4de745dd648bf97d0916abc6d0df8": {
508 | "model_module": "@jupyter-widgets/controls",
509 | "model_name": "HBoxModel",
510 | "state": {
511 | "_view_name": "HBoxView",
512 | "_dom_classes": [],
513 | "_model_name": "HBoxModel",
514 | "_view_module": "@jupyter-widgets/controls",
515 | "_model_module_version": "1.5.0",
516 | "_view_count": null,
517 | "_view_module_version": "1.5.0",
518 | "box_style": "",
519 | "layout": "IPY_MODEL_ac11c6a44879425e85b78443e4036dcd",
520 | "_model_module": "@jupyter-widgets/controls",
521 | "children": [
522 | "IPY_MODEL_e2c4ee3ce4d54982a367d76a3eb665a9",
523 | "IPY_MODEL_999d3d3939c545dbaf6cfb7db9caf652"
524 | ]
525 | }
526 | },
527 | "ac11c6a44879425e85b78443e4036dcd": {
528 | "model_module": "@jupyter-widgets/base",
529 | "model_name": "LayoutModel",
530 | "state": {
531 | "_view_name": "LayoutView",
532 | "grid_template_rows": null,
533 | "right": null,
534 | "justify_content": null,
535 | "_view_module": "@jupyter-widgets/base",
536 | "overflow": null,
537 | "_model_module_version": "1.2.0",
538 | "_view_count": null,
539 | "flex_flow": null,
540 | "width": null,
541 | "min_width": null,
542 | "border": null,
543 | "align_items": null,
544 | "bottom": null,
545 | "_model_module": "@jupyter-widgets/base",
546 | "top": null,
547 | "grid_column": null,
548 | "overflow_y": null,
549 | "overflow_x": null,
550 | "grid_auto_flow": null,
551 | "grid_area": null,
552 | "grid_template_columns": null,
553 | "flex": null,
554 | "_model_name": "LayoutModel",
555 | "justify_items": null,
556 | "grid_row": null,
557 | "max_height": null,
558 | "align_content": null,
559 | "visibility": null,
560 | "align_self": null,
561 | "height": null,
562 | "min_height": null,
563 | "padding": null,
564 | "grid_auto_rows": null,
565 | "grid_gap": null,
566 | "max_width": null,
567 | "order": null,
568 | "_view_module_version": "1.2.0",
569 | "grid_template_areas": null,
570 | "object_position": null,
571 | "object_fit": null,
572 | "grid_auto_columns": null,
573 | "margin": null,
574 | "display": null,
575 | "left": null
576 | }
577 | },
578 | "e2c4ee3ce4d54982a367d76a3eb665a9": {
579 | "model_module": "@jupyter-widgets/controls",
580 | "model_name": "FloatProgressModel",
581 | "state": {
582 | "_view_name": "ProgressView",
583 | "style": "IPY_MODEL_90fbdf46e39b4a929d838664c182f190",
584 | "_dom_classes": [],
585 | "description": "Downloading: 100%",
586 | "_model_name": "FloatProgressModel",
587 | "bar_style": "success",
588 | "max": 5069051,
589 | "_view_module": "@jupyter-widgets/controls",
590 | "_model_module_version": "1.5.0",
591 | "value": 5069051,
592 | "_view_count": null,
593 | "_view_module_version": "1.5.0",
594 | "orientation": "horizontal",
595 | "min": 0,
596 | "description_tooltip": null,
597 | "_model_module": "@jupyter-widgets/controls",
598 | "layout": "IPY_MODEL_8def438e71a849b485daab55c4d2dfbd"
599 | }
600 | },
601 | "999d3d3939c545dbaf6cfb7db9caf652": {
602 | "model_module": "@jupyter-widgets/controls",
603 | "model_name": "HTMLModel",
604 | "state": {
605 | "_view_name": "HTMLView",
606 | "style": "IPY_MODEL_24f5449bbb924cb086c8376ba0a0217a",
607 | "_dom_classes": [],
608 | "description": "",
609 | "_model_name": "HTMLModel",
610 | "placeholder": "",
611 | "_view_module": "@jupyter-widgets/controls",
612 | "_model_module_version": "1.5.0",
613 | "value": " 5.07M/5.07M [00:02<00:00, 2.43MB/s]",
614 | "_view_count": null,
615 | "_view_module_version": "1.5.0",
616 | "description_tooltip": null,
617 | "_model_module": "@jupyter-widgets/controls",
618 | "layout": "IPY_MODEL_53e0c97d7438446480b6b203e61e2763"
619 | }
620 | },
621 | "90fbdf46e39b4a929d838664c182f190": {
622 | "model_module": "@jupyter-widgets/controls",
623 | "model_name": "ProgressStyleModel",
624 | "state": {
625 | "_view_name": "StyleView",
626 | "_model_name": "ProgressStyleModel",
627 | "description_width": "initial",
628 | "_view_module": "@jupyter-widgets/base",
629 | "_model_module_version": "1.5.0",
630 | "_view_count": null,
631 | "_view_module_version": "1.2.0",
632 | "bar_color": null,
633 | "_model_module": "@jupyter-widgets/controls"
634 | }
635 | },
636 | "8def438e71a849b485daab55c4d2dfbd": {
637 | "model_module": "@jupyter-widgets/base",
638 | "model_name": "LayoutModel",
639 | "state": {
640 | "_view_name": "LayoutView",
641 | "grid_template_rows": null,
642 | "right": null,
643 | "justify_content": null,
644 | "_view_module": "@jupyter-widgets/base",
645 | "overflow": null,
646 | "_model_module_version": "1.2.0",
647 | "_view_count": null,
648 | "flex_flow": null,
649 | "width": null,
650 | "min_width": null,
651 | "border": null,
652 | "align_items": null,
653 | "bottom": null,
654 | "_model_module": "@jupyter-widgets/base",
655 | "top": null,
656 | "grid_column": null,
657 | "overflow_y": null,
658 | "overflow_x": null,
659 | "grid_auto_flow": null,
660 | "grid_area": null,
661 | "grid_template_columns": null,
662 | "flex": null,
663 | "_model_name": "LayoutModel",
664 | "justify_items": null,
665 | "grid_row": null,
666 | "max_height": null,
667 | "align_content": null,
668 | "visibility": null,
669 | "align_self": null,
670 | "height": null,
671 | "min_height": null,
672 | "padding": null,
673 | "grid_auto_rows": null,
674 | "grid_gap": null,
675 | "max_width": null,
676 | "order": null,
677 | "_view_module_version": "1.2.0",
678 | "grid_template_areas": null,
679 | "object_position": null,
680 | "object_fit": null,
681 | "grid_auto_columns": null,
682 | "margin": null,
683 | "display": null,
684 | "left": null
685 | }
686 | },
687 | "24f5449bbb924cb086c8376ba0a0217a": {
688 | "model_module": "@jupyter-widgets/controls",
689 | "model_name": "DescriptionStyleModel",
690 | "state": {
691 | "_view_name": "StyleView",
692 | "_model_name": "DescriptionStyleModel",
693 | "description_width": "",
694 | "_view_module": "@jupyter-widgets/base",
695 | "_model_module_version": "1.5.0",
696 | "_view_count": null,
697 | "_view_module_version": "1.2.0",
698 | "_model_module": "@jupyter-widgets/controls"
699 | }
700 | },
701 | "53e0c97d7438446480b6b203e61e2763": {
702 | "model_module": "@jupyter-widgets/base",
703 | "model_name": "LayoutModel",
704 | "state": {
705 | "_view_name": "LayoutView",
706 | "grid_template_rows": null,
707 | "right": null,
708 | "justify_content": null,
709 | "_view_module": "@jupyter-widgets/base",
710 | "overflow": null,
711 | "_model_module_version": "1.2.0",
712 | "_view_count": null,
713 | "flex_flow": null,
714 | "width": null,
715 | "min_width": null,
716 | "border": null,
717 | "align_items": null,
718 | "bottom": null,
719 | "_model_module": "@jupyter-widgets/base",
720 | "top": null,
721 | "grid_column": null,
722 | "overflow_y": null,
723 | "overflow_x": null,
724 | "grid_auto_flow": null,
725 | "grid_area": null,
726 | "grid_template_columns": null,
727 | "flex": null,
728 | "_model_name": "LayoutModel",
729 | "justify_items": null,
730 | "grid_row": null,
731 | "max_height": null,
732 | "align_content": null,
733 | "visibility": null,
734 | "align_self": null,
735 | "height": null,
736 | "min_height": null,
737 | "padding": null,
738 | "grid_auto_rows": null,
739 | "grid_gap": null,
740 | "max_width": null,
741 | "order": null,
742 | "_view_module_version": "1.2.0",
743 | "grid_template_areas": null,
744 | "object_position": null,
745 | "object_fit": null,
746 | "grid_auto_columns": null,
747 | "margin": null,
748 | "display": null,
749 | "left": null
750 | }
751 | },
752 | "987cc41166a04adba2b8b78039aee22a": {
753 | "model_module": "@jupyter-widgets/controls",
754 | "model_name": "HBoxModel",
755 | "state": {
756 | "_view_name": "HBoxView",
757 | "_dom_classes": [],
758 | "_model_name": "HBoxModel",
759 | "_view_module": "@jupyter-widgets/controls",
760 | "_model_module_version": "1.5.0",
761 | "_view_count": null,
762 | "_view_module_version": "1.5.0",
763 | "box_style": "",
764 | "layout": "IPY_MODEL_bf3031a3cec04b6683aef27dbf2aa3b0",
765 | "_model_module": "@jupyter-widgets/controls",
766 | "children": [
767 | "IPY_MODEL_d2bff619756249ebb8b5fa428faf02a5",
768 | "IPY_MODEL_c1fafc3f913342cd851b9ba34db8d5d9"
769 | ]
770 | }
771 | },
772 | "bf3031a3cec04b6683aef27dbf2aa3b0": {
773 | "model_module": "@jupyter-widgets/base",
774 | "model_name": "LayoutModel",
775 | "state": {
776 | "_view_name": "LayoutView",
777 | "grid_template_rows": null,
778 | "right": null,
779 | "justify_content": null,
780 | "_view_module": "@jupyter-widgets/base",
781 | "overflow": null,
782 | "_model_module_version": "1.2.0",
783 | "_view_count": null,
784 | "flex_flow": null,
785 | "width": null,
786 | "min_width": null,
787 | "border": null,
788 | "align_items": null,
789 | "bottom": null,
790 | "_model_module": "@jupyter-widgets/base",
791 | "top": null,
792 | "grid_column": null,
793 | "overflow_y": null,
794 | "overflow_x": null,
795 | "grid_auto_flow": null,
796 | "grid_area": null,
797 | "grid_template_columns": null,
798 | "flex": null,
799 | "_model_name": "LayoutModel",
800 | "justify_items": null,
801 | "grid_row": null,
802 | "max_height": null,
803 | "align_content": null,
804 | "visibility": null,
805 | "align_self": null,
806 | "height": null,
807 | "min_height": null,
808 | "padding": null,
809 | "grid_auto_rows": null,
810 | "grid_gap": null,
811 | "max_width": null,
812 | "order": null,
813 | "_view_module_version": "1.2.0",
814 | "grid_template_areas": null,
815 | "object_position": null,
816 | "object_fit": null,
817 | "grid_auto_columns": null,
818 | "margin": null,
819 | "display": null,
820 | "left": null
821 | }
822 | },
823 | "d2bff619756249ebb8b5fa428faf02a5": {
824 | "model_module": "@jupyter-widgets/controls",
825 | "model_name": "FloatProgressModel",
826 | "state": {
827 | "_view_name": "ProgressView",
828 | "style": "IPY_MODEL_13d8e85845ae4ba69bd95904e14a542e",
829 | "_dom_classes": [],
830 | "description": "Downloading: 100%",
831 | "_model_name": "FloatProgressModel",
832 | "bar_style": "success",
833 | "max": 1885418496,
834 | "_view_module": "@jupyter-widgets/controls",
835 | "_model_module_version": "1.5.0",
836 | "value": 1885418496,
837 | "_view_count": null,
838 | "_view_module_version": "1.5.0",
839 | "orientation": "horizontal",
840 | "min": 0,
841 | "description_tooltip": null,
842 | "_model_module": "@jupyter-widgets/controls",
843 | "layout": "IPY_MODEL_06441c2d907e4bcf9494d7ba9abe759f"
844 | }
845 | },
846 | "c1fafc3f913342cd851b9ba34db8d5d9": {
847 | "model_module": "@jupyter-widgets/controls",
848 | "model_name": "HTMLModel",
849 | "state": {
850 | "_view_name": "HTMLView",
851 | "style": "IPY_MODEL_0d7e6bc240e64734a98c710e0ab92112",
852 | "_dom_classes": [],
853 | "description": "",
854 | "_model_name": "HTMLModel",
855 | "placeholder": "",
856 | "_view_module": "@jupyter-widgets/controls",
857 | "_model_module_version": "1.5.0",
858 | "value": " 1.89G/1.89G [00:37<00:00, 50.8MB/s]",
859 | "_view_count": null,
860 | "_view_module_version": "1.5.0",
861 | "description_tooltip": null,
862 | "_model_module": "@jupyter-widgets/controls",
863 | "layout": "IPY_MODEL_3d12ea39d6764e0f90f02c265cb7d295"
864 | }
865 | },
866 | "13d8e85845ae4ba69bd95904e14a542e": {
867 | "model_module": "@jupyter-widgets/controls",
868 | "model_name": "ProgressStyleModel",
869 | "state": {
870 | "_view_name": "StyleView",
871 | "_model_name": "ProgressStyleModel",
872 | "description_width": "initial",
873 | "_view_module": "@jupyter-widgets/base",
874 | "_model_module_version": "1.5.0",
875 | "_view_count": null,
876 | "_view_module_version": "1.2.0",
877 | "bar_color": null,
878 | "_model_module": "@jupyter-widgets/controls"
879 | }
880 | },
881 | "06441c2d907e4bcf9494d7ba9abe759f": {
882 | "model_module": "@jupyter-widgets/base",
883 | "model_name": "LayoutModel",
884 | "state": {
885 | "_view_name": "LayoutView",
886 | "grid_template_rows": null,
887 | "right": null,
888 | "justify_content": null,
889 | "_view_module": "@jupyter-widgets/base",
890 | "overflow": null,
891 | "_model_module_version": "1.2.0",
892 | "_view_count": null,
893 | "flex_flow": null,
894 | "width": null,
895 | "min_width": null,
896 | "border": null,
897 | "align_items": null,
898 | "bottom": null,
899 | "_model_module": "@jupyter-widgets/base",
900 | "top": null,
901 | "grid_column": null,
902 | "overflow_y": null,
903 | "overflow_x": null,
904 | "grid_auto_flow": null,
905 | "grid_area": null,
906 | "grid_template_columns": null,
907 | "flex": null,
908 | "_model_name": "LayoutModel",
909 | "justify_items": null,
910 | "grid_row": null,
911 | "max_height": null,
912 | "align_content": null,
913 | "visibility": null,
914 | "align_self": null,
915 | "height": null,
916 | "min_height": null,
917 | "padding": null,
918 | "grid_auto_rows": null,
919 | "grid_gap": null,
920 | "max_width": null,
921 | "order": null,
922 | "_view_module_version": "1.2.0",
923 | "grid_template_areas": null,
924 | "object_position": null,
925 | "object_fit": null,
926 | "grid_auto_columns": null,
927 | "margin": null,
928 | "display": null,
929 | "left": null
930 | }
931 | },
932 | "0d7e6bc240e64734a98c710e0ab92112": {
933 | "model_module": "@jupyter-widgets/controls",
934 | "model_name": "DescriptionStyleModel",
935 | "state": {
936 | "_view_name": "StyleView",
937 | "_model_name": "DescriptionStyleModel",
938 | "description_width": "",
939 | "_view_module": "@jupyter-widgets/base",
940 | "_model_module_version": "1.5.0",
941 | "_view_count": null,
942 | "_view_module_version": "1.2.0",
943 | "_model_module": "@jupyter-widgets/controls"
944 | }
945 | },
946 | "3d12ea39d6764e0f90f02c265cb7d295": {
947 | "model_module": "@jupyter-widgets/base",
948 | "model_name": "LayoutModel",
949 | "state": {
950 | "_view_name": "LayoutView",
951 | "grid_template_rows": null,
952 | "right": null,
953 | "justify_content": null,
954 | "_view_module": "@jupyter-widgets/base",
955 | "overflow": null,
956 | "_model_module_version": "1.2.0",
957 | "_view_count": null,
958 | "flex_flow": null,
959 | "width": null,
960 | "min_width": null,
961 | "border": null,
962 | "align_items": null,
963 | "bottom": null,
964 | "_model_module": "@jupyter-widgets/base",
965 | "top": null,
966 | "grid_column": null,
967 | "overflow_y": null,
968 | "overflow_x": null,
969 | "grid_auto_flow": null,
970 | "grid_area": null,
971 | "grid_template_columns": null,
972 | "flex": null,
973 | "_model_name": "LayoutModel",
974 | "justify_items": null,
975 | "grid_row": null,
976 | "max_height": null,
977 | "align_content": null,
978 | "visibility": null,
979 | "align_self": null,
980 | "height": null,
981 | "min_height": null,
982 | "padding": null,
983 | "grid_auto_rows": null,
984 | "grid_gap": null,
985 | "max_width": null,
986 | "order": null,
987 | "_view_module_version": "1.2.0",
988 | "grid_template_areas": null,
989 | "object_position": null,
990 | "object_fit": null,
991 | "grid_auto_columns": null,
992 | "margin": null,
993 | "display": null,
994 | "left": null
995 | }
996 | }
997 | }
998 | }
999 | },
1000 | "cells": [
1001 | {
1002 | "cell_type": "code",
1003 | "metadata": {
1004 | "colab": {
1005 | "base_uri": "https://localhost:8080/"
1006 | },
1007 | "id": "ojSAlH9W_gfv",
1008 | "outputId": "b315bf07-ec72-45c4-91d5-c7a99f775514"
1009 | },
1010 | "source": [
1011 | "# Check GPU type\r\n",
1012 | "!nvidia-smi"
1013 | ],
1014 | "execution_count": 1,
1015 | "outputs": [
1016 | {
1017 | "output_type": "stream",
1018 | "text": [
1019 | "Sun Feb 28 17:29:50 2021 \n",
1020 | "+-----------------------------------------------------------------------------+\n",
1021 | "| NVIDIA-SMI 460.39 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
1022 | "|-------------------------------+----------------------+----------------------+\n",
1023 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
1024 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
1025 | "| | | MIG M. |\n",
1026 | "|===============================+======================+======================|\n",
1027 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
1028 | "| N/A 46C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |\n",
1029 | "| | | N/A |\n",
1030 | "+-------------------------------+----------------------+----------------------+\n",
1031 | " \n",
1032 | "+-----------------------------------------------------------------------------+\n",
1033 | "| Processes: |\n",
1034 | "| GPU GI CI PID Type Process name GPU Memory |\n",
1035 | "| ID ID Usage |\n",
1036 | "|=============================================================================|\n",
1037 | "| No running processes found |\n",
1038 | "+-----------------------------------------------------------------------------+\n"
1039 | ],
1040 | "name": "stdout"
1041 | }
1042 | ]
1043 | },
1044 | {
1045 | "cell_type": "code",
1046 | "metadata": {
1047 | "colab": {
1048 | "base_uri": "https://localhost:8080/"
1049 | },
1050 | "id": "5016OlJnp2kC",
1051 | "outputId": "021ee1d6-8a7a-42cc-e0f1-813f0473181c"
1052 | },
1053 | "source": [
1054 | "# Upgrade pip and install ktrain\r\n",
1055 | "!pip -qq install -U pip\r\n",
1056 | "!pip -qq install ktrain"
1057 | ],
1058 | "execution_count": 2,
1059 | "outputs": [
1060 | {
1061 | "output_type": "stream",
1062 | "text": [
1063 | "\u001b[K |████████████████████████████████| 1.5MB 7.6MB/s \n",
1064 | "\u001b[K |████████████████████████████████| 25.3 MB 73 kB/s \n",
1065 | "\u001b[K |████████████████████████████████| 6.8 MB 62.4 MB/s \n",
1066 | "\u001b[K |████████████████████████████████| 981 kB 60.1 MB/s \n",
1067 | "\u001b[K |████████████████████████████████| 263 kB 53.8 MB/s \n",
1068 | "\u001b[K |████████████████████████████████| 1.3 MB 31.5 MB/s \n",
1069 | "\u001b[K |████████████████████████████████| 1.2 MB 56.9 MB/s \n",
1070 | "\u001b[K |████████████████████████████████| 468 kB 53.4 MB/s \n",
1071 | "\u001b[K |████████████████████████████████| 1.1 MB 60.7 MB/s \n",
1072 | "\u001b[K |████████████████████████████████| 883 kB 61.2 MB/s \n",
1073 | "\u001b[K |████████████████████████████████| 2.9 MB 65.4 MB/s \n",
1074 | "\u001b[?25h Building wheel for ktrain (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1075 | " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1076 | " Building wheel for keras-bert (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1077 | " Building wheel for keras-transformer (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1078 | " Building wheel for keras-embed-sim (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1079 | " Building wheel for keras-layer-normalization (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1080 | " Building wheel for keras-multi-head (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1081 | " Building wheel for keras-self-attention (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1082 | " Building wheel for keras-pos-embd (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1083 | " Building wheel for keras-position-wise-feed-forward (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1084 | " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1085 | " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
1086 | " Building wheel for syntok (setup.py) ... \u001b[?25l\u001b[?25hdone\n"
1087 | ],
1088 | "name": "stdout"
1089 | }
1090 | ]
1091 | },
1092 | {
1093 | "cell_type": "code",
1094 | "metadata": {
1095 | "id": "qDHDhHzWrDmm",
1096 | "colab": {
1097 | "base_uri": "https://localhost:8080/"
1098 | },
1099 | "outputId": "6ec8c799-c033-43d3-be91-f6e6e3fbd83d"
1100 | },
1101 | "source": [
1102 | "# Download data\r\n",
1103 | "!gdown --id 1gLdjbkhHVZd2RVP7k1lCR2UyBu0YPcsK\r\n",
1104 | "!unzip -q '/content/aai4_data.zip'"
1105 | ],
1106 | "execution_count": 3,
1107 | "outputs": [
1108 | {
1109 | "output_type": "stream",
1110 | "text": [
1111 | "Downloading...\n",
1112 | "From: https://drive.google.com/uc?id=1gLdjbkhHVZd2RVP7k1lCR2UyBu0YPcsK\n",
1113 | "To: /content/aai4_data.zip\n",
1114 | "23.6MB [00:00, 57.1MB/s]\n"
1115 | ],
1116 | "name": "stdout"
1117 | }
1118 | ]
1119 | },
1120 | {
1121 | "cell_type": "code",
1122 | "metadata": {
1123 | "id": "TWw-1GHGqVI1"
1124 | },
1125 | "source": [
1126 | "# Import libraries\r\n",
1127 | "import numpy as np \r\n",
1128 | "import pandas as pd\r\n",
1129 | "import random\r\n",
1130 | "import os\r\n",
1131 | "import re\r\n",
1132 | "import ktrain\r\n",
1133 | "from ktrain import text\r\n",
1134 | "import tensorflow as tf\r\n",
1135 | "from sklearn.model_selection import StratifiedKFold\r\n",
1136 | "import warnings\r\n",
1137 | "warnings.filterwarnings('ignore')"
1138 | ],
1139 | "execution_count": 4,
1140 | "outputs": []
1141 | },
1142 | {
1143 | "cell_type": "code",
1144 | "metadata": {
1145 | "id": "gdIMbg9vnM9b"
1146 | },
1147 | "source": [
1148 | "# Set seed\r\n",
1149 | "SEED = 3031\r\n",
1150 | "\r\n",
1151 | "def set_seeds(seed=SEED):\r\n",
1152 | " os.environ['PYTHONHASHSEED'] = str(seed)\r\n",
1153 | " random.seed(seed)\r\n",
1154 | " tf.random.set_seed(seed)\r\n",
1155 | " np.random.seed(seed)\r\n",
1156 | "\r\n",
1157 | "def set_global_determinism(seed=SEED):\r\n",
1158 | " set_seeds(seed=seed)\r\n",
1159 | "\r\n",
1160 | " os.environ['TF_DETERMINISTIC_OPS'] = '1'\r\n",
1161 | " os.environ['TF_CUDNN_DETERMINISTIC'] = '1'\r\n",
1162 | " \r\n",
1163 | " tf.config.threading.set_inter_op_parallelism_threads(1)\r\n",
1164 | " tf.config.threading.set_intra_op_parallelism_threads(1)\r\n",
1165 | "\r\n",
1166 | "set_global_determinism(seed=SEED)"
1167 | ],
1168 | "execution_count": 5,
1169 | "outputs": []
1170 | },
1171 | {
1172 | "cell_type": "code",
1173 | "metadata": {
1174 | "id": "q0ECuC-bqVGi"
1175 | },
1176 | "source": [
1177 | "# Load data\r\n",
1178 | "train = pd.read_csv('/content/aai4_data/train.csv')\r\n",
1179 | "test = pd.read_csv('/content/aai4_data/test.csv')\r\n",
1180 | "sample = pd.read_csv('/content/aai4_data/sample_submission.csv')"
1181 | ],
1182 | "execution_count": 6,
1183 | "outputs": []
1184 | },
1185 | {
1186 | "cell_type": "code",
1187 | "metadata": {
1188 | "colab": {
1189 | "base_uri": "https://localhost:8080/",
1190 | "height": 195
1191 | },
1192 | "id": "iJu-Skwq_JaM",
1193 | "outputId": "a6b5eb3b-84f2-494b-e638-7fac56ac8b1a"
1194 | },
1195 | "source": [
1196 | "# Preview last five rows in test\r\n",
1197 | "test.tail()"
1198 | ],
1199 | "execution_count": 7,
1200 | "outputs": [
1201 | {
1202 | "output_type": "execute_result",
1203 | "data": {
1204 | "text/html": [
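1205 | "<div>\n",
1206 | "<style scoped>\n",
1207 | "    .dataframe tbody tr th:only-of-type {\n",
1208 | "        vertical-align: middle;\n",
1209 | "    }\n",
1210 | "\n",
1211 | "    .dataframe tbody tr th {\n",
1212 | "        vertical-align: top;\n",
1213 | "    }\n",
1214 | "\n",
1215 | "    .dataframe thead th {\n",
1216 | "        text-align: right;\n",
1217 | "    }\n",
1218 | "</style>\n",
1219 | "<table border=\"1\" class=\"dataframe\">\n",
1220 | "  <thead>\n",
1221 | "    <tr style=\"text-align: right;\">\n",
1222 | "      <th></th>\n",
1223 | "      <th>id</th>\n",
1224 | "      <th>content</th>\n",
1225 | "    </tr>\n",
1226 | "  </thead>\n",
1227 | "  <tbody>\n",
1228 | "    <tr>\n",
1229 | "      <th>7751</th>\n",
1230 | "      <td>SW18887</td>\n",
1231 | "      <td>\\n\\n \\nNa Ibrahim Yassin-Nkasi\\n \\n\\tMWANAFUNZ...</td>\n",
1232 | "    </tr>\n",
1233 | "    <tr>\n",
1234 | "      <th>7752</th>\n",
1235 | "      <td>SW23779</td>\n",
1236 | "      <td>BAADA ya R. Kelly kukumbwa na\\nkashfa ya unyan...</td>\n",
1237 | "    </tr>\n",
1238 | "    <tr>\n",
1239 | "      <th>7753</th>\n",
1240 | "      <td>SW20243</td>\n",
1241 | "      <td>\\n\\tNa JUDITH NYANGE-MWANZA\\n \\n\\n \\n\\tKAMPUNI...</td>\n",
1242 | "    </tr>\n",
1243 | "    <tr>\n",
1244 | "      <th>7754</th>\n",
1245 | "      <td>SW27943</td>\n",
1246 | "      <td>WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wa...</td>\n",
1247 | "    </tr>\n",
1248 | "    <tr>\n",
1249 | "      <th>7755</th>\n",
1250 | "      <td>SW22906</td>\n",
1251 | "      <td>WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy...</td>\n",
1252 | "    </tr>\n",
1253 | "  </tbody>\n",
1254 | "</table>\n",
1255 | "</div>"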
1256 | ],
1257 | "text/plain": [
1258 | " id content\n",
1259 | "7751 SW18887 \\n\\n \\nNa Ibrahim Yassin-Nkasi\\n \\n\\tMWANAFUNZ...\n",
1260 | "7752 SW23779 BAADA ya R. Kelly kukumbwa na\\nkashfa ya unyan...\n",
1261 | "7753 SW20243 \\n\\tNa JUDITH NYANGE-MWANZA\\n \\n\\n \\n\\tKAMPUNI...\n",
1262 | "7754 SW27943 WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wa...\n",
1263 | "7755 SW22906 WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy..."
1264 | ]
1265 | },
1266 | "metadata": {
1267 | "tags": []
1268 | },
1269 | "execution_count": 7
1270 | }
1271 | ]
1272 | },
1273 | {
1274 | "cell_type": "code",
1275 | "metadata": {
1276 | "colab": {
1277 | "base_uri": "https://localhost:8080/",
1278 | "height": 195
1279 | },
1280 | "id": "6DWa0uFBupn_",
1281 | "outputId": "98bd7bd4-bf1d-4454-c51f-0290cfbe4735"
1282 | },
1283 | "source": [
1284 | "# Collapse each run of whitespace (spaces, newlines, tabs) into a single space and strip leading/trailing whitespace\r\n",
1285 | "train.content = train.content.apply(lambda x: (re.sub('\\s+',' ', x)).strip())\r\n",
1286 | "test.content = test.content.apply(lambda x: (re.sub('\\s+',' ', x)).strip())\r\n",
1287 | "test.tail()"
1288 | ],
1289 | "execution_count": 8,
1290 | "outputs": [
1291 | {
1292 | "output_type": "execute_result",
1293 | "data": {
1294 | "text/html": [
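1295 | "<div>\n",
1296 | "<style scoped>\n",
1297 | "    .dataframe tbody tr th:only-of-type {\n",
1298 | "        vertical-align: middle;\n",
1299 | "    }\n",
1300 | "\n",
1301 | "    .dataframe tbody tr th {\n",
1302 | "        vertical-align: top;\n",
1303 | "    }\n",
1304 | "\n",
1305 | "    .dataframe thead th {\n",
1306 | "        text-align: right;\n",
1307 | "    }\n",
1308 | "</style>\n",
1309 | "<table border=\"1\" class=\"dataframe\">\n",
1310 | "  <thead>\n",
1311 | "    <tr style=\"text-align: right;\">\n",
1312 | "      <th></th>\n",
1313 | "      <th>id</th>\n",
1314 | "      <th>content</th>\n",
1315 | "    </tr>\n",
1316 | "  </thead>\n",
1317 | "  <tbody>\n",
1318 | "    <tr>\n",
1319 | "      <th>7751</th>\n",
1320 | "      <td>SW18887</td>\n",
1321 | "      <td>Na Ibrahim Yassin-Nkasi MWANAFUNZI wa kidato c...</td>\n",
1322 | "    </tr>\n",
1323 | "    <tr>\n",
1324 | "      <th>7752</th>\n",
1325 | "      <td>SW23779</td>\n",
1326 | "      <td>BAADA ya R. Kelly kukumbwa na kashfa ya unyany...</td>\n",
1327 | "    </tr>\n",
1328 | "    <tr>\n",
1329 | "      <th>7753</th>\n",
1330 | "      <td>SW20243</td>\n",
1331 | "      <td>Na JUDITH NYANGE-MWANZA KAMPUNI ya Ujenzi wa N...</td>\n",
1332 | "    </tr>\n",
1333 | "    <tr>\n",
1334 | "      <th>7754</th>\n",
1335 | "      <td>SW27943</td>\n",
1336 | "      <td>WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wan...</td>\n",
1337 | "    </tr>\n",
1338 | "    <tr>\n",
1339 | "      <th>7755</th>\n",
1340 | "      <td>SW22906</td>\n",
1341 | "      <td>WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy...</td>\n",
1342 | "    </tr>\n",
1343 | "  </tbody>\n",
1344 | "</table>\n",
1345 | "</div>"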
1346 | ],
1347 | "text/plain": [
1348 | " id content\n",
1349 | "7751 SW18887 Na Ibrahim Yassin-Nkasi MWANAFUNZI wa kidato c...\n",
1350 | "7752 SW23779 BAADA ya R. Kelly kukumbwa na kashfa ya unyany...\n",
1351 | "7753 SW20243 Na JUDITH NYANGE-MWANZA KAMPUNI ya Ujenzi wa N...\n",
1352 | "7754 SW27943 WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wan...\n",
1353 | "7755 SW22906 WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy..."
1354 | ]
1355 | },
1356 | "metadata": {
1357 | "tags": []
1358 | },
1359 | "execution_count": 8
1360 | }
1361 | ]
1362 | },
1363 | {
1364 | "cell_type": "code",
1365 | "metadata": {
1366 | "id": "h5dscE0Vre-J",
1367 | "colab": {
1368 | "base_uri": "https://localhost:8080/",
1369 | "height": 66,
1370 | "referenced_widgets": [
1371 | "fa3a8735e68c41c7accffc74ce9447ae",
1372 | "ec598ba5ac874ce7853c68e8ebe93645",
1373 | "8b748858dc2f43dbbe61e0e7d60364a0",
1374 | "4d3c49b123ea443a875493bb77e162cb",
1375 | "958db85b2ea744dd81c994a50933d1b5",
1376 | "f78f05533cca49f995448596c4231a98",
1377 | "9619e3c1eb3a409899c702486a17e792",
1378 | "28422b1b07ea447db4261b9ea662659c"
1379 | ]
1380 | },
1381 | "outputId": "36f61834-2628-4afe-e44f-ab7a42fcd56a"
1382 | },
1383 | "source": [
1384 | "# Set model parameters\r\n",
1385 | "MODEL_NAME = 'xlm-roberta-base'\r\n",
1386 | "MAX_LEN = 256\r\n",
1387 | "BATCH_SIZE = 16\r\n",
1388 | "FOLDS = 3\r\n",
1389 | "LR = 3e-5\r\n",
1390 | "EPOCHS = 2\r\n",
1391 | "\r\n",
1392 | "# List of class names\r\n",
1393 | "CLASS_NAMES = sorted(train.category.unique().tolist()) # ['afya', 'burudani', 'kimataifa', 'kitaifa', 'michezo', 'uchumi']\r\n",
1394 | "\r\n",
1395 | "# Instantiate transformer with the provided parameters\r\n",
1396 | "t = text.Transformer(model_name=MODEL_NAME, maxlen=MAX_LEN, class_names=CLASS_NAMES, batch_size=BATCH_SIZE)"
1397 | ],
1398 | "execution_count": 9,
1399 | "outputs": [
1400 | {
1401 | "output_type": "display_data",
1402 | "data": {
1403 | "application/vnd.jupyter.widget-view+json": {
1404 | "model_id": "fa3a8735e68c41c7accffc74ce9447ae",
1405 | "version_minor": 0,
1406 | "version_major": 2
1407 | },
1408 | "text/plain": [
1409 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…"
1410 | ]
1411 | },
1412 | "metadata": {
1413 | "tags": []
1414 | }
1415 | },
1416 | {
1417 | "output_type": "stream",
1418 | "text": [
1419 | "\n"
1420 | ],
1421 | "name": "stdout"
1422 | }
1423 | ]
1424 | },
1425 | {
1426 | "cell_type": "code",
1427 | "metadata": {
1428 | "id": "chBN-wZiy1QL",
1429 | "colab": {
1430 | "base_uri": "https://localhost:8080/",
1431 | "height": 1000,
1432 | "referenced_widgets": [
1433 | "560d3fa635694e2a9a9410a011737075",
1434 | "2c5283f30c8f428a8dd509702b90729b",
1435 | "86f143f04251403e8ed50fe52e72df2a",
1436 | "144eea02d8b2441196d91c646be08d45",
1437 | "fdb5ff5ec5ab4fa486f8714cd185d799",
1438 | "901791b19c524307b8105bc71fa3d27b",
1439 | "b06a0c9a224641fabc62ebcc8241bca6",
1440 | "c014fdd5fa564f50be83089471cc21e4",
1441 | "a2f4de745dd648bf97d0916abc6d0df8",
1442 | "ac11c6a44879425e85b78443e4036dcd",
1443 | "e2c4ee3ce4d54982a367d76a3eb665a9",
1444 | "999d3d3939c545dbaf6cfb7db9caf652",
1445 | "90fbdf46e39b4a929d838664c182f190",
1446 | "8def438e71a849b485daab55c4d2dfbd",
1447 | "24f5449bbb924cb086c8376ba0a0217a",
1448 | "53e0c97d7438446480b6b203e61e2763",
1449 | "987cc41166a04adba2b8b78039aee22a",
1450 | "bf3031a3cec04b6683aef27dbf2aa3b0",
1451 | "d2bff619756249ebb8b5fa428faf02a5",
1452 | "c1fafc3f913342cd851b9ba34db8d5d9",
1453 | "13d8e85845ae4ba69bd95904e14a542e",
1454 | "06441c2d907e4bcf9494d7ba9abe759f",
1455 | "0d7e6bc240e64734a98c710e0ab92112",
1456 | "3d12ea39d6764e0f90f02c265cb7d295"
1457 | ]
1458 | },
1459 | "outputId": "8168056b-7d5c-4668-ba9a-51215462bec7"
1460 | },
1461 | "source": [
1462 | "%%time\r\n",
1463 | "# Prepare test data\r\n",
1464 | "test_data = np.asarray(test.content)\r\n",
1465 | "\r\n",
1466 | "# Stratified K-fold splitter with FOLDS splits (no shuffling, so folds follow the original row order)\r\n",
1467 | "folds = StratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=False)\r\n",
1468 | "\r\n",
1469 | "# Lists to store each fold's test-set predictions and final validation loss\r\n",
1470 | "oof_preds = []\r\n",
1471 | "oof_loss_score = []\r\n",
1472 | "\r\n",
1473 | "for train_index, test_index in folds.split(train.content, train.category):\r\n",
1474 | " X_train, X_test = list(train.loc[train_index, 'content']), list(train.loc[test_index, 'content'])\r\n",
1475 | " y_train, y_test = np.asarray(train.loc[train_index, 'category']), np.asarray(train.loc[test_index, 'category'])\r\n",
1476 | "\r\n",
1477 | " # Preprocess training and validation data\r\n",
1478 | " train_set = t.preprocess_train(X_train, y_train)\r\n",
1479 | " val_set = t.preprocess_test(X_test, y_test)\r\n",
1480 | "\r\n",
1481 | " # Instantiate model\r\n",
1482 | " model = t.get_classifier()\r\n",
1483 | " learner = ktrain.get_learner(model, train_data=train_set, val_data=val_set, batch_size=BATCH_SIZE)\r\n",
1484 | "\r\n",
1485 | " # Train model\r\n",
1486 | " history = learner.fit(LR, n_cycles=EPOCHS, checkpoint_folder='/tmp')\r\n",
1487 | " learner.validate(class_names=t.get_classes())\r\n",
1488 | "\r\n",
1489 | " # Record this fold's validation loss from the last epoch\r\n",
1490 | " oof_loss_score.append(history.history['val_loss'][-1])\r\n",
1491 | "\r\n",
1492 | " # Make predictions\r\n",
1493 | " preds = ktrain.get_predictor(learner.model, preproc=t).predict(test_data, return_proba=True)\r\n",
1494 | "\r\n",
1495 | " # Append preds to oof_preds list\r\n",
1496 | " oof_preds.append(preds)\r\n",
1497 | "\r\n",
1498 | "# Report the mean validation loss across folds and write the fold-averaged test probabilities as the submission file\r\n",
1499 | "print(f'Mean Loss: {np.mean(oof_loss_score)}')\r\n",
1500 | "sub = pd.DataFrame(np.mean(oof_preds, axis=0), columns = t.get_classes())\r\n",
1501 | "sub['test_id'] = test.id\r\n",
1502 | "sub = sub[sample.columns]\r\n",
1503 | "sub.to_csv('Submission.csv', index = False)"
1504 | ],
1505 | "execution_count": 10,
1506 | "outputs": [
1507 | {
1508 | "output_type": "stream",
1509 | "text": [
1510 | "preprocessing train...\n",
1511 | "language: sw\n",
1512 | "train sequence lengths:\n",
1513 | "\tmean : 333\n",
1514 | "\t95percentile : 792\n",
1515 | "\t99percentile : 1268\n"
1516 | ],
1517 | "name": "stdout"
1518 | },
1519 | {
1520 | "output_type": "display_data",
1521 | "data": {
1522 | "application/vnd.jupyter.widget-view+json": {
1523 | "model_id": "560d3fa635694e2a9a9410a011737075",
1524 | "version_minor": 0,
1525 | "version_major": 2
1526 | },
1527 | "text/plain": [
1528 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…"
1529 | ]
1530 | },
1531 | "metadata": {
1532 | "tags": []
1533 | }
1534 | },
1535 | {
1536 | "output_type": "stream",
1537 | "text": [
1538 | "\n"
1539 | ],
1540 | "name": "stdout"
1541 | },
1542 | {
1543 | "output_type": "display_data",
1544 | "data": {
1545 | "application/vnd.jupyter.widget-view+json": {
1546 | "model_id": "a2f4de745dd648bf97d0916abc6d0df8",
1547 | "version_minor": 0,
1548 | "version_major": 2
1549 | },
1550 | "text/plain": [
1551 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…"
1552 | ]
1553 | },
1554 | "metadata": {
1555 | "tags": []
1556 | }
1557 | },
1558 | {
1559 | "output_type": "stream",
1560 | "text": [
1561 | "\n"
1562 | ],
1563 | "name": "stdout"
1564 | },
1565 | {
1566 | "output_type": "display_data",
1567 | "data": {
1568 | "text/html": [
1569 | ""
1570 | ],
1571 | "text/plain": [
1572 | ""
1573 | ]
1574 | },
1575 | "metadata": {
1576 | "tags": []
1577 | }
1578 | },
1579 | {
1580 | "output_type": "stream",
1581 | "text": [
1582 | "Is Multi-Label? False\n",
1583 | "preprocessing test...\n",
1584 | "language: sw\n",
1585 | "test sequence lengths:\n",
1586 | "\tmean : 331\n",
1587 | "\t95percentile : 768\n",
1588 | "\t99percentile : 1234\n"
1589 | ],
1590 | "name": "stdout"
1591 | },
1592 | {
1593 | "output_type": "display_data",
1594 | "data": {
1595 | "text/html": [
1596 | ""
1597 | ],
1598 | "text/plain": [
1599 | ""
1600 | ]
1601 | },
1602 | "metadata": {
1603 | "tags": []
1604 | }
1605 | },
1606 | {
1607 | "output_type": "display_data",
1608 | "data": {
1609 | "application/vnd.jupyter.widget-view+json": {
1610 | "model_id": "987cc41166a04adba2b8b78039aee22a",
1611 | "version_minor": 0,
1612 | "version_major": 2
1613 | },
1614 | "text/plain": [
1615 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1885418496.0, style=ProgressStyle(descr…"
1616 | ]
1617 | },
1618 | "metadata": {
1619 | "tags": []
1620 | }
1621 | },
1622 | {
1623 | "output_type": "stream",
1624 | "text": [
1625 | "\n",
1626 | "Epoch 1/2\n",
1627 | "970/970 [==============================] - 1094s 1s/step - loss: 0.8316 - accuracy: 0.7242 - val_loss: 0.3710 - val_accuracy: 0.8945\n",
1628 | "Epoch 2/2\n",
1629 | "970/970 [==============================] - 1081s 1s/step - loss: 0.3307 - accuracy: 0.8989 - val_loss: 0.2845 - val_accuracy: 0.9119\n",
1630 | " precision recall f1-score support\n",
1631 | "\n",
1632 | " afya 0.61 0.42 0.50 286\n",
1633 | " burudani 0.92 0.89 0.90 743\n",
1634 | " kimataifa 0.93 0.89 0.91 635\n",
1635 | " kitaifa 0.90 0.94 0.92 3414\n",
1636 | " michezo 0.95 0.97 0.96 2002\n",
1637 | " uchumi 0.92 0.84 0.88 676\n",
1638 | "\n",
1639 | " accuracy 0.91 7756\n",
1640 | " macro avg 0.87 0.83 0.85 7756\n",
1641 | "weighted avg 0.91 0.91 0.91 7756\n",
1642 | "\n",
1643 | "preprocessing train...\n",
1644 | "language: sw\n",
1645 | "train sequence lengths:\n",
1646 | "\tmean : 332\n",
1647 | "\t95percentile : 782\n",
1648 | "\t99percentile : 1279\n"
1649 | ],
1650 | "name": "stdout"
1651 | },
1652 | {
1653 | "output_type": "display_data",
1654 | "data": {
1655 | "text/html": [
1656 | ""
1657 | ],
1658 | "text/plain": [
1659 | ""
1660 | ]
1661 | },
1662 | "metadata": {
1663 | "tags": []
1664 | }
1665 | },
1666 | {
1667 | "output_type": "stream",
1668 | "text": [
1669 | "Is Multi-Label? False\n",
1670 | "preprocessing test...\n",
1671 | "language: sw\n",
1672 | "test sequence lengths:\n",
1673 | "\tmean : 334\n",
1674 | "\t95percentile : 787\n",
1675 | "\t99percentile : 1239\n"
1676 | ],
1677 | "name": "stdout"
1678 | },
1679 | {
1680 | "output_type": "display_data",
1681 | "data": {
1682 | "text/html": [
1683 | ""
1684 | ],
1685 | "text/plain": [
1686 | ""
1687 | ]
1688 | },
1689 | "metadata": {
1690 | "tags": []
1691 | }
1692 | },
1693 | {
1694 | "output_type": "stream",
1695 | "text": [
1696 | "Epoch 1/2\n",
1697 | "970/970 [==============================] - 1096s 1s/step - loss: 0.7408 - accuracy: 0.7570 - val_loss: 0.3460 - val_accuracy: 0.8813\n",
1698 | "Epoch 2/2\n",
1699 | "970/970 [==============================] - 1080s 1s/step - loss: 0.3208 - accuracy: 0.8940 - val_loss: 0.3023 - val_accuracy: 0.9082\n",
1700 | " precision recall f1-score support\n",
1701 | "\n",
1702 | " afya 0.58 0.50 0.53 286\n",
1703 | " burudani 0.92 0.90 0.91 743\n",
1704 | " kimataifa 0.90 0.87 0.89 636\n",
1705 | " kitaifa 0.91 0.93 0.92 3414\n",
1706 | " michezo 0.94 0.97 0.96 2001\n",
1707 | " uchumi 0.92 0.83 0.88 676\n",
1708 | "\n",
1709 | " accuracy 0.91 7756\n",
1710 | " macro avg 0.86 0.83 0.85 7756\n",
1711 | "weighted avg 0.91 0.91 0.91 7756\n",
1712 | "\n",
1713 | "preprocessing train...\n",
1714 | "language: sw\n",
1715 | "train sequence lengths:\n",
1716 | "\tmean : 332\n",
1717 | "\t95percentile : 778\n",
1718 | "\t99percentile : 1238\n"
1719 | ],
1720 | "name": "stdout"
1721 | },
1722 | {
1723 | "output_type": "display_data",
1724 | "data": {
1725 | "text/html": [
1726 | ""
1727 | ],
1728 | "text/plain": [
1729 | ""
1730 | ]
1731 | },
1732 | "metadata": {
1733 | "tags": []
1734 | }
1735 | },
1736 | {
1737 | "output_type": "stream",
1738 | "text": [
1739 | "Is Multi-Label? False\n",
1740 | "preprocessing test...\n",
1741 | "language: sw\n",
1742 | "test sequence lengths:\n",
1743 | "\tmean : 333\n",
1744 | "\t95percentile : 794\n",
1745 | "\t99percentile : 1299\n"
1746 | ],
1747 | "name": "stdout"
1748 | },
1749 | {
1750 | "output_type": "display_data",
1751 | "data": {
1752 | "text/html": [
1753 | ""
1754 | ],
1755 | "text/plain": [
1756 | ""
1757 | ]
1758 | },
1759 | "metadata": {
1760 | "tags": []
1761 | }
1762 | },
1763 | {
1764 | "output_type": "stream",
1765 | "text": [
1766 | "Epoch 1/2\n",
1767 | "970/970 [==============================] - 1097s 1s/step - loss: 0.7295 - accuracy: 0.7622 - val_loss: 0.2984 - val_accuracy: 0.9045\n",
1768 | "Epoch 2/2\n",
1769 | "970/970 [==============================] - 1082s 1s/step - loss: 0.2952 - accuracy: 0.9076 - val_loss: 0.3021 - val_accuracy: 0.9067\n",
1770 | " precision recall f1-score support\n",
1771 | "\n",
1772 | " afya 0.55 0.66 0.60 287\n",
1773 | " burudani 0.88 0.95 0.91 743\n",
1774 | " kimataifa 0.94 0.86 0.90 635\n",
1775 | " kitaifa 0.93 0.90 0.91 3414\n",
1776 | " michezo 0.93 0.99 0.96 2001\n",
1777 | " uchumi 0.92 0.82 0.86 676\n",
1778 | "\n",
1779 | " accuracy 0.91 7756\n",
1780 | " macro avg 0.86 0.86 0.86 7756\n",
1781 | "weighted avg 0.91 0.91 0.91 7756\n",
1782 | "\n",
1783 | "Mean Loss: 0.2963431775569916\n",
1784 | "CPU times: user 24min 1s, sys: 13min 2s, total: 37min 3s\n",
1785 | "Wall time: 2h 9min 20s\n"
1786 | ],
1787 | "name": "stdout"
1788 | }
1789 | ]
1790 | },
1791 | {
1792 | "cell_type": "code",
1793 | "metadata": {
1794 | "id": "rSzTRo1x-HCj"
1795 | },
1796 | "source": [
1797 | ""
1798 | ],
1799 | "execution_count": 11,
1800 | "outputs": []
1801 | },
1802 | {
1803 | "cell_type": "code",
1804 | "metadata": {
1805 | "id": "9t71teFG-G-_"
1806 | },
1807 | "source": [
1808 | ""
1809 | ],
1810 | "execution_count": null,
1811 | "outputs": []
1812 | },
1813 | {
1814 | "cell_type": "code",
1815 | "metadata": {
1816 | "id": "k_Mml3aA-G7r"
1817 | },
1818 | "source": [
1819 | ""
1820 | ],
1821 | "execution_count": null,
1822 | "outputs": []
1823 | }
1824 | ]
1825 | }
--------------------------------------------------------------------------------