├── README.md └── Brainiac_2nd_place_solution_Swahili_News_Classification_Challenge.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # AI4D Swahili News Classification Challenge 2 | 3 | Kiswahili is a lingua franca spoken by 100-150 million people across East Africa. It is an official language in the DRC, Kenya, Tanzania, and Uganda; in Tanzania it is a first language for most people and the official language of instruction in all schools, while in other countries it is a common second language. News in Kiswahili is an important part of the media sphere in East Africa. 4 | 5 | News contributes to the education, technology, and economic growth of a country, and news in local languages plays an important cultural role in many African countries. In the modern age, African languages in news and other spheres are at risk of being lost as English becomes the dominant language in online spaces. 6 | 7 | The objective of this hackathon was to develop a multi-class classification model to classify news content according to six specific categories. The model is to be used by Swahili online platforms to automatically group news according to their categories and help readers find the specific news they want to read. 8 | 9 | The evaluation metric for this challenge is **Log Loss**. 
10 | --- 11 | -------------------------------------------------------------------------------- /Brainiac_2nd_place_solution_Swahili_News_Classification_Challenge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Brainiac_2nd_place_solution_Swahili_News_Classification_Challenge.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "accelerator": "GPU", 15 | "widgets": { 16 | "application/vnd.jupyter.widget-state+json": { 17 | "fa3a8735e68c41c7accffc74ce9447ae": { 18 | "model_module": "@jupyter-widgets/controls", 19 | "model_name": "HBoxModel", 20 | "state": { 21 | "_view_name": "HBoxView", 22 | "_dom_classes": [], 23 | "_model_name": "HBoxModel", 24 | "_view_module": "@jupyter-widgets/controls", 25 | "_model_module_version": "1.5.0", 26 | "_view_count": null, 27 | "_view_module_version": "1.5.0", 28 | "box_style": "", 29 | "layout": "IPY_MODEL_ec598ba5ac874ce7853c68e8ebe93645", 30 | "_model_module": "@jupyter-widgets/controls", 31 | "children": [ 32 | "IPY_MODEL_8b748858dc2f43dbbe61e0e7d60364a0", 33 | "IPY_MODEL_4d3c49b123ea443a875493bb77e162cb" 34 | ] 35 | } 36 | }, 37 | "ec598ba5ac874ce7853c68e8ebe93645": { 38 | "model_module": "@jupyter-widgets/base", 39 | "model_name": "LayoutModel", 40 | "state": { 41 | "_view_name": "LayoutView", 42 | "grid_template_rows": null, 43 | "right": null, 44 | "justify_content": null, 45 | "_view_module": "@jupyter-widgets/base", 46 | "overflow": null, 47 | "_model_module_version": "1.2.0", 48 | "_view_count": null, 49 | "flex_flow": null, 50 | "width": null, 51 | "min_width": null, 52 | "border": null, 53 | "align_items": null, 54 | "bottom": null, 55 | "_model_module": "@jupyter-widgets/base", 56 | "top": null, 57 | "grid_column": null, 58 | "overflow_y": null, 59 | "overflow_x": null, 60 | 
"grid_auto_flow": null, 61 | "grid_area": null, 62 | "grid_template_columns": null, 63 | "flex": null, 64 | "_model_name": "LayoutModel", 65 | "justify_items": null, 66 | "grid_row": null, 67 | "max_height": null, 68 | "align_content": null, 69 | "visibility": null, 70 | "align_self": null, 71 | "height": null, 72 | "min_height": null, 73 | "padding": null, 74 | "grid_auto_rows": null, 75 | "grid_gap": null, 76 | "max_width": null, 77 | "order": null, 78 | "_view_module_version": "1.2.0", 79 | "grid_template_areas": null, 80 | "object_position": null, 81 | "object_fit": null, 82 | "grid_auto_columns": null, 83 | "margin": null, 84 | "display": null, 85 | "left": null 86 | } 87 | }, 88 | "8b748858dc2f43dbbe61e0e7d60364a0": { 89 | "model_module": "@jupyter-widgets/controls", 90 | "model_name": "FloatProgressModel", 91 | "state": { 92 | "_view_name": "ProgressView", 93 | "style": "IPY_MODEL_958db85b2ea744dd81c994a50933d1b5", 94 | "_dom_classes": [], 95 | "description": "Downloading: 100%", 96 | "_model_name": "FloatProgressModel", 97 | "bar_style": "success", 98 | "max": 512, 99 | "_view_module": "@jupyter-widgets/controls", 100 | "_model_module_version": "1.5.0", 101 | "value": 512, 102 | "_view_count": null, 103 | "_view_module_version": "1.5.0", 104 | "orientation": "horizontal", 105 | "min": 0, 106 | "description_tooltip": null, 107 | "_model_module": "@jupyter-widgets/controls", 108 | "layout": "IPY_MODEL_f78f05533cca49f995448596c4231a98" 109 | } 110 | }, 111 | "4d3c49b123ea443a875493bb77e162cb": { 112 | "model_module": "@jupyter-widgets/controls", 113 | "model_name": "HTMLModel", 114 | "state": { 115 | "_view_name": "HTMLView", 116 | "style": "IPY_MODEL_9619e3c1eb3a409899c702486a17e792", 117 | "_dom_classes": [], 118 | "description": "", 119 | "_model_name": "HTMLModel", 120 | "placeholder": "​", 121 | "_view_module": "@jupyter-widgets/controls", 122 | "_model_module_version": "1.5.0", 123 | "value": " 512/512 [00:00<00:00, 6.18kB/s]", 124 | "_view_count": null, 
125 | "_view_module_version": "1.5.0", 126 | "description_tooltip": null, 127 | "_model_module": "@jupyter-widgets/controls", 128 | "layout": "IPY_MODEL_28422b1b07ea447db4261b9ea662659c" 129 | } 130 | }, 131 | "958db85b2ea744dd81c994a50933d1b5": { 132 | "model_module": "@jupyter-widgets/controls", 133 | "model_name": "ProgressStyleModel", 134 | "state": { 135 | "_view_name": "StyleView", 136 | "_model_name": "ProgressStyleModel", 137 | "description_width": "initial", 138 | "_view_module": "@jupyter-widgets/base", 139 | "_model_module_version": "1.5.0", 140 | "_view_count": null, 141 | "_view_module_version": "1.2.0", 142 | "bar_color": null, 143 | "_model_module": "@jupyter-widgets/controls" 144 | } 145 | }, 146 | "f78f05533cca49f995448596c4231a98": { 147 | "model_module": "@jupyter-widgets/base", 148 | "model_name": "LayoutModel", 149 | "state": { 150 | "_view_name": "LayoutView", 151 | "grid_template_rows": null, 152 | "right": null, 153 | "justify_content": null, 154 | "_view_module": "@jupyter-widgets/base", 155 | "overflow": null, 156 | "_model_module_version": "1.2.0", 157 | "_view_count": null, 158 | "flex_flow": null, 159 | "width": null, 160 | "min_width": null, 161 | "border": null, 162 | "align_items": null, 163 | "bottom": null, 164 | "_model_module": "@jupyter-widgets/base", 165 | "top": null, 166 | "grid_column": null, 167 | "overflow_y": null, 168 | "overflow_x": null, 169 | "grid_auto_flow": null, 170 | "grid_area": null, 171 | "grid_template_columns": null, 172 | "flex": null, 173 | "_model_name": "LayoutModel", 174 | "justify_items": null, 175 | "grid_row": null, 176 | "max_height": null, 177 | "align_content": null, 178 | "visibility": null, 179 | "align_self": null, 180 | "height": null, 181 | "min_height": null, 182 | "padding": null, 183 | "grid_auto_rows": null, 184 | "grid_gap": null, 185 | "max_width": null, 186 | "order": null, 187 | "_view_module_version": "1.2.0", 188 | "grid_template_areas": null, 189 | "object_position": null, 190 | 
"object_fit": null, 191 | "grid_auto_columns": null, 192 | "margin": null, 193 | "display": null, 194 | "left": null 195 | } 196 | }, 197 | "9619e3c1eb3a409899c702486a17e792": { 198 | "model_module": "@jupyter-widgets/controls", 199 | "model_name": "DescriptionStyleModel", 200 | "state": { 201 | "_view_name": "StyleView", 202 | "_model_name": "DescriptionStyleModel", 203 | "description_width": "", 204 | "_view_module": "@jupyter-widgets/base", 205 | "_model_module_version": "1.5.0", 206 | "_view_count": null, 207 | "_view_module_version": "1.2.0", 208 | "_model_module": "@jupyter-widgets/controls" 209 | } 210 | }, 211 | "28422b1b07ea447db4261b9ea662659c": { 212 | "model_module": "@jupyter-widgets/base", 213 | "model_name": "LayoutModel", 214 | "state": { 215 | "_view_name": "LayoutView", 216 | "grid_template_rows": null, 217 | "right": null, 218 | "justify_content": null, 219 | "_view_module": "@jupyter-widgets/base", 220 | "overflow": null, 221 | "_model_module_version": "1.2.0", 222 | "_view_count": null, 223 | "flex_flow": null, 224 | "width": null, 225 | "min_width": null, 226 | "border": null, 227 | "align_items": null, 228 | "bottom": null, 229 | "_model_module": "@jupyter-widgets/base", 230 | "top": null, 231 | "grid_column": null, 232 | "overflow_y": null, 233 | "overflow_x": null, 234 | "grid_auto_flow": null, 235 | "grid_area": null, 236 | "grid_template_columns": null, 237 | "flex": null, 238 | "_model_name": "LayoutModel", 239 | "justify_items": null, 240 | "grid_row": null, 241 | "max_height": null, 242 | "align_content": null, 243 | "visibility": null, 244 | "align_self": null, 245 | "height": null, 246 | "min_height": null, 247 | "padding": null, 248 | "grid_auto_rows": null, 249 | "grid_gap": null, 250 | "max_width": null, 251 | "order": null, 252 | "_view_module_version": "1.2.0", 253 | "grid_template_areas": null, 254 | "object_position": null, 255 | "object_fit": null, 256 | "grid_auto_columns": null, 257 | "margin": null, 258 | "display": null, 
259 | "left": null 260 | } 261 | }, 262 | "560d3fa635694e2a9a9410a011737075": { 263 | "model_module": "@jupyter-widgets/controls", 264 | "model_name": "HBoxModel", 265 | "state": { 266 | "_view_name": "HBoxView", 267 | "_dom_classes": [], 268 | "_model_name": "HBoxModel", 269 | "_view_module": "@jupyter-widgets/controls", 270 | "_model_module_version": "1.5.0", 271 | "_view_count": null, 272 | "_view_module_version": "1.5.0", 273 | "box_style": "", 274 | "layout": "IPY_MODEL_2c5283f30c8f428a8dd509702b90729b", 275 | "_model_module": "@jupyter-widgets/controls", 276 | "children": [ 277 | "IPY_MODEL_86f143f04251403e8ed50fe52e72df2a", 278 | "IPY_MODEL_144eea02d8b2441196d91c646be08d45" 279 | ] 280 | } 281 | }, 282 | "2c5283f30c8f428a8dd509702b90729b": { 283 | "model_module": "@jupyter-widgets/base", 284 | "model_name": "LayoutModel", 285 | "state": { 286 | "_view_name": "LayoutView", 287 | "grid_template_rows": null, 288 | "right": null, 289 | "justify_content": null, 290 | "_view_module": "@jupyter-widgets/base", 291 | "overflow": null, 292 | "_model_module_version": "1.2.0", 293 | "_view_count": null, 294 | "flex_flow": null, 295 | "width": null, 296 | "min_width": null, 297 | "border": null, 298 | "align_items": null, 299 | "bottom": null, 300 | "_model_module": "@jupyter-widgets/base", 301 | "top": null, 302 | "grid_column": null, 303 | "overflow_y": null, 304 | "overflow_x": null, 305 | "grid_auto_flow": null, 306 | "grid_area": null, 307 | "grid_template_columns": null, 308 | "flex": null, 309 | "_model_name": "LayoutModel", 310 | "justify_items": null, 311 | "grid_row": null, 312 | "max_height": null, 313 | "align_content": null, 314 | "visibility": null, 315 | "align_self": null, 316 | "height": null, 317 | "min_height": null, 318 | "padding": null, 319 | "grid_auto_rows": null, 320 | "grid_gap": null, 321 | "max_width": null, 322 | "order": null, 323 | "_view_module_version": "1.2.0", 324 | "grid_template_areas": null, 325 | "object_position": null, 326 | 
"object_fit": null, 327 | "grid_auto_columns": null, 328 | "margin": null, 329 | "display": null, 330 | "left": null 331 | } 332 | }, 333 | "86f143f04251403e8ed50fe52e72df2a": { 334 | "model_module": "@jupyter-widgets/controls", 335 | "model_name": "FloatProgressModel", 336 | "state": { 337 | "_view_name": "ProgressView", 338 | "style": "IPY_MODEL_fdb5ff5ec5ab4fa486f8714cd185d799", 339 | "_dom_classes": [], 340 | "description": "Downloading: 100%", 341 | "_model_name": "FloatProgressModel", 342 | "bar_style": "success", 343 | "max": 512, 344 | "_view_module": "@jupyter-widgets/controls", 345 | "_model_module_version": "1.5.0", 346 | "value": 512, 347 | "_view_count": null, 348 | "_view_module_version": "1.5.0", 349 | "orientation": "horizontal", 350 | "min": 0, 351 | "description_tooltip": null, 352 | "_model_module": "@jupyter-widgets/controls", 353 | "layout": "IPY_MODEL_901791b19c524307b8105bc71fa3d27b" 354 | } 355 | }, 356 | "144eea02d8b2441196d91c646be08d45": { 357 | "model_module": "@jupyter-widgets/controls", 358 | "model_name": "HTMLModel", 359 | "state": { 360 | "_view_name": "HTMLView", 361 | "style": "IPY_MODEL_b06a0c9a224641fabc62ebcc8241bca6", 362 | "_dom_classes": [], 363 | "description": "", 364 | "_model_name": "HTMLModel", 365 | "placeholder": "​", 366 | "_view_module": "@jupyter-widgets/controls", 367 | "_model_module_version": "1.5.0", 368 | "value": " 512/512 [00:03<00:00, 160B/s]", 369 | "_view_count": null, 370 | "_view_module_version": "1.5.0", 371 | "description_tooltip": null, 372 | "_model_module": "@jupyter-widgets/controls", 373 | "layout": "IPY_MODEL_c014fdd5fa564f50be83089471cc21e4" 374 | } 375 | }, 376 | "fdb5ff5ec5ab4fa486f8714cd185d799": { 377 | "model_module": "@jupyter-widgets/controls", 378 | "model_name": "ProgressStyleModel", 379 | "state": { 380 | "_view_name": "StyleView", 381 | "_model_name": "ProgressStyleModel", 382 | "description_width": "initial", 383 | "_view_module": "@jupyter-widgets/base", 384 | 
"_model_module_version": "1.5.0", 385 | "_view_count": null, 386 | "_view_module_version": "1.2.0", 387 | "bar_color": null, 388 | "_model_module": "@jupyter-widgets/controls" 389 | } 390 | }, 391 | "901791b19c524307b8105bc71fa3d27b": { 392 | "model_module": "@jupyter-widgets/base", 393 | "model_name": "LayoutModel", 394 | "state": { 395 | "_view_name": "LayoutView", 396 | "grid_template_rows": null, 397 | "right": null, 398 | "justify_content": null, 399 | "_view_module": "@jupyter-widgets/base", 400 | "overflow": null, 401 | "_model_module_version": "1.2.0", 402 | "_view_count": null, 403 | "flex_flow": null, 404 | "width": null, 405 | "min_width": null, 406 | "border": null, 407 | "align_items": null, 408 | "bottom": null, 409 | "_model_module": "@jupyter-widgets/base", 410 | "top": null, 411 | "grid_column": null, 412 | "overflow_y": null, 413 | "overflow_x": null, 414 | "grid_auto_flow": null, 415 | "grid_area": null, 416 | "grid_template_columns": null, 417 | "flex": null, 418 | "_model_name": "LayoutModel", 419 | "justify_items": null, 420 | "grid_row": null, 421 | "max_height": null, 422 | "align_content": null, 423 | "visibility": null, 424 | "align_self": null, 425 | "height": null, 426 | "min_height": null, 427 | "padding": null, 428 | "grid_auto_rows": null, 429 | "grid_gap": null, 430 | "max_width": null, 431 | "order": null, 432 | "_view_module_version": "1.2.0", 433 | "grid_template_areas": null, 434 | "object_position": null, 435 | "object_fit": null, 436 | "grid_auto_columns": null, 437 | "margin": null, 438 | "display": null, 439 | "left": null 440 | } 441 | }, 442 | "b06a0c9a224641fabc62ebcc8241bca6": { 443 | "model_module": "@jupyter-widgets/controls", 444 | "model_name": "DescriptionStyleModel", 445 | "state": { 446 | "_view_name": "StyleView", 447 | "_model_name": "DescriptionStyleModel", 448 | "description_width": "", 449 | "_view_module": "@jupyter-widgets/base", 450 | "_model_module_version": "1.5.0", 451 | "_view_count": null, 452 | 
"_view_module_version": "1.2.0", 453 | "_model_module": "@jupyter-widgets/controls" 454 | } 455 | }, 456 | "c014fdd5fa564f50be83089471cc21e4": { 457 | "model_module": "@jupyter-widgets/base", 458 | "model_name": "LayoutModel", 459 | "state": { 460 | "_view_name": "LayoutView", 461 | "grid_template_rows": null, 462 | "right": null, 463 | "justify_content": null, 464 | "_view_module": "@jupyter-widgets/base", 465 | "overflow": null, 466 | "_model_module_version": "1.2.0", 467 | "_view_count": null, 468 | "flex_flow": null, 469 | "width": null, 470 | "min_width": null, 471 | "border": null, 472 | "align_items": null, 473 | "bottom": null, 474 | "_model_module": "@jupyter-widgets/base", 475 | "top": null, 476 | "grid_column": null, 477 | "overflow_y": null, 478 | "overflow_x": null, 479 | "grid_auto_flow": null, 480 | "grid_area": null, 481 | "grid_template_columns": null, 482 | "flex": null, 483 | "_model_name": "LayoutModel", 484 | "justify_items": null, 485 | "grid_row": null, 486 | "max_height": null, 487 | "align_content": null, 488 | "visibility": null, 489 | "align_self": null, 490 | "height": null, 491 | "min_height": null, 492 | "padding": null, 493 | "grid_auto_rows": null, 494 | "grid_gap": null, 495 | "max_width": null, 496 | "order": null, 497 | "_view_module_version": "1.2.0", 498 | "grid_template_areas": null, 499 | "object_position": null, 500 | "object_fit": null, 501 | "grid_auto_columns": null, 502 | "margin": null, 503 | "display": null, 504 | "left": null 505 | } 506 | }, 507 | "a2f4de745dd648bf97d0916abc6d0df8": { 508 | "model_module": "@jupyter-widgets/controls", 509 | "model_name": "HBoxModel", 510 | "state": { 511 | "_view_name": "HBoxView", 512 | "_dom_classes": [], 513 | "_model_name": "HBoxModel", 514 | "_view_module": "@jupyter-widgets/controls", 515 | "_model_module_version": "1.5.0", 516 | "_view_count": null, 517 | "_view_module_version": "1.5.0", 518 | "box_style": "", 519 | "layout": "IPY_MODEL_ac11c6a44879425e85b78443e4036dcd", 520 | 
"_model_module": "@jupyter-widgets/controls", 521 | "children": [ 522 | "IPY_MODEL_e2c4ee3ce4d54982a367d76a3eb665a9", 523 | "IPY_MODEL_999d3d3939c545dbaf6cfb7db9caf652" 524 | ] 525 | } 526 | }, 527 | "ac11c6a44879425e85b78443e4036dcd": { 528 | "model_module": "@jupyter-widgets/base", 529 | "model_name": "LayoutModel", 530 | "state": { 531 | "_view_name": "LayoutView", 532 | "grid_template_rows": null, 533 | "right": null, 534 | "justify_content": null, 535 | "_view_module": "@jupyter-widgets/base", 536 | "overflow": null, 537 | "_model_module_version": "1.2.0", 538 | "_view_count": null, 539 | "flex_flow": null, 540 | "width": null, 541 | "min_width": null, 542 | "border": null, 543 | "align_items": null, 544 | "bottom": null, 545 | "_model_module": "@jupyter-widgets/base", 546 | "top": null, 547 | "grid_column": null, 548 | "overflow_y": null, 549 | "overflow_x": null, 550 | "grid_auto_flow": null, 551 | "grid_area": null, 552 | "grid_template_columns": null, 553 | "flex": null, 554 | "_model_name": "LayoutModel", 555 | "justify_items": null, 556 | "grid_row": null, 557 | "max_height": null, 558 | "align_content": null, 559 | "visibility": null, 560 | "align_self": null, 561 | "height": null, 562 | "min_height": null, 563 | "padding": null, 564 | "grid_auto_rows": null, 565 | "grid_gap": null, 566 | "max_width": null, 567 | "order": null, 568 | "_view_module_version": "1.2.0", 569 | "grid_template_areas": null, 570 | "object_position": null, 571 | "object_fit": null, 572 | "grid_auto_columns": null, 573 | "margin": null, 574 | "display": null, 575 | "left": null 576 | } 577 | }, 578 | "e2c4ee3ce4d54982a367d76a3eb665a9": { 579 | "model_module": "@jupyter-widgets/controls", 580 | "model_name": "FloatProgressModel", 581 | "state": { 582 | "_view_name": "ProgressView", 583 | "style": "IPY_MODEL_90fbdf46e39b4a929d838664c182f190", 584 | "_dom_classes": [], 585 | "description": "Downloading: 100%", 586 | "_model_name": "FloatProgressModel", 587 | "bar_style": "success", 
588 | "max": 5069051, 589 | "_view_module": "@jupyter-widgets/controls", 590 | "_model_module_version": "1.5.0", 591 | "value": 5069051, 592 | "_view_count": null, 593 | "_view_module_version": "1.5.0", 594 | "orientation": "horizontal", 595 | "min": 0, 596 | "description_tooltip": null, 597 | "_model_module": "@jupyter-widgets/controls", 598 | "layout": "IPY_MODEL_8def438e71a849b485daab55c4d2dfbd" 599 | } 600 | }, 601 | "999d3d3939c545dbaf6cfb7db9caf652": { 602 | "model_module": "@jupyter-widgets/controls", 603 | "model_name": "HTMLModel", 604 | "state": { 605 | "_view_name": "HTMLView", 606 | "style": "IPY_MODEL_24f5449bbb924cb086c8376ba0a0217a", 607 | "_dom_classes": [], 608 | "description": "", 609 | "_model_name": "HTMLModel", 610 | "placeholder": "​", 611 | "_view_module": "@jupyter-widgets/controls", 612 | "_model_module_version": "1.5.0", 613 | "value": " 5.07M/5.07M [00:02<00:00, 2.43MB/s]", 614 | "_view_count": null, 615 | "_view_module_version": "1.5.0", 616 | "description_tooltip": null, 617 | "_model_module": "@jupyter-widgets/controls", 618 | "layout": "IPY_MODEL_53e0c97d7438446480b6b203e61e2763" 619 | } 620 | }, 621 | "90fbdf46e39b4a929d838664c182f190": { 622 | "model_module": "@jupyter-widgets/controls", 623 | "model_name": "ProgressStyleModel", 624 | "state": { 625 | "_view_name": "StyleView", 626 | "_model_name": "ProgressStyleModel", 627 | "description_width": "initial", 628 | "_view_module": "@jupyter-widgets/base", 629 | "_model_module_version": "1.5.0", 630 | "_view_count": null, 631 | "_view_module_version": "1.2.0", 632 | "bar_color": null, 633 | "_model_module": "@jupyter-widgets/controls" 634 | } 635 | }, 636 | "8def438e71a849b485daab55c4d2dfbd": { 637 | "model_module": "@jupyter-widgets/base", 638 | "model_name": "LayoutModel", 639 | "state": { 640 | "_view_name": "LayoutView", 641 | "grid_template_rows": null, 642 | "right": null, 643 | "justify_content": null, 644 | "_view_module": "@jupyter-widgets/base", 645 | "overflow": null, 646 | 
"_model_module_version": "1.2.0", 647 | "_view_count": null, 648 | "flex_flow": null, 649 | "width": null, 650 | "min_width": null, 651 | "border": null, 652 | "align_items": null, 653 | "bottom": null, 654 | "_model_module": "@jupyter-widgets/base", 655 | "top": null, 656 | "grid_column": null, 657 | "overflow_y": null, 658 | "overflow_x": null, 659 | "grid_auto_flow": null, 660 | "grid_area": null, 661 | "grid_template_columns": null, 662 | "flex": null, 663 | "_model_name": "LayoutModel", 664 | "justify_items": null, 665 | "grid_row": null, 666 | "max_height": null, 667 | "align_content": null, 668 | "visibility": null, 669 | "align_self": null, 670 | "height": null, 671 | "min_height": null, 672 | "padding": null, 673 | "grid_auto_rows": null, 674 | "grid_gap": null, 675 | "max_width": null, 676 | "order": null, 677 | "_view_module_version": "1.2.0", 678 | "grid_template_areas": null, 679 | "object_position": null, 680 | "object_fit": null, 681 | "grid_auto_columns": null, 682 | "margin": null, 683 | "display": null, 684 | "left": null 685 | } 686 | }, 687 | "24f5449bbb924cb086c8376ba0a0217a": { 688 | "model_module": "@jupyter-widgets/controls", 689 | "model_name": "DescriptionStyleModel", 690 | "state": { 691 | "_view_name": "StyleView", 692 | "_model_name": "DescriptionStyleModel", 693 | "description_width": "", 694 | "_view_module": "@jupyter-widgets/base", 695 | "_model_module_version": "1.5.0", 696 | "_view_count": null, 697 | "_view_module_version": "1.2.0", 698 | "_model_module": "@jupyter-widgets/controls" 699 | } 700 | }, 701 | "53e0c97d7438446480b6b203e61e2763": { 702 | "model_module": "@jupyter-widgets/base", 703 | "model_name": "LayoutModel", 704 | "state": { 705 | "_view_name": "LayoutView", 706 | "grid_template_rows": null, 707 | "right": null, 708 | "justify_content": null, 709 | "_view_module": "@jupyter-widgets/base", 710 | "overflow": null, 711 | "_model_module_version": "1.2.0", 712 | "_view_count": null, 713 | "flex_flow": null, 714 | 
"width": null, 715 | "min_width": null, 716 | "border": null, 717 | "align_items": null, 718 | "bottom": null, 719 | "_model_module": "@jupyter-widgets/base", 720 | "top": null, 721 | "grid_column": null, 722 | "overflow_y": null, 723 | "overflow_x": null, 724 | "grid_auto_flow": null, 725 | "grid_area": null, 726 | "grid_template_columns": null, 727 | "flex": null, 728 | "_model_name": "LayoutModel", 729 | "justify_items": null, 730 | "grid_row": null, 731 | "max_height": null, 732 | "align_content": null, 733 | "visibility": null, 734 | "align_self": null, 735 | "height": null, 736 | "min_height": null, 737 | "padding": null, 738 | "grid_auto_rows": null, 739 | "grid_gap": null, 740 | "max_width": null, 741 | "order": null, 742 | "_view_module_version": "1.2.0", 743 | "grid_template_areas": null, 744 | "object_position": null, 745 | "object_fit": null, 746 | "grid_auto_columns": null, 747 | "margin": null, 748 | "display": null, 749 | "left": null 750 | } 751 | }, 752 | "987cc41166a04adba2b8b78039aee22a": { 753 | "model_module": "@jupyter-widgets/controls", 754 | "model_name": "HBoxModel", 755 | "state": { 756 | "_view_name": "HBoxView", 757 | "_dom_classes": [], 758 | "_model_name": "HBoxModel", 759 | "_view_module": "@jupyter-widgets/controls", 760 | "_model_module_version": "1.5.0", 761 | "_view_count": null, 762 | "_view_module_version": "1.5.0", 763 | "box_style": "", 764 | "layout": "IPY_MODEL_bf3031a3cec04b6683aef27dbf2aa3b0", 765 | "_model_module": "@jupyter-widgets/controls", 766 | "children": [ 767 | "IPY_MODEL_d2bff619756249ebb8b5fa428faf02a5", 768 | "IPY_MODEL_c1fafc3f913342cd851b9ba34db8d5d9" 769 | ] 770 | } 771 | }, 772 | "bf3031a3cec04b6683aef27dbf2aa3b0": { 773 | "model_module": "@jupyter-widgets/base", 774 | "model_name": "LayoutModel", 775 | "state": { 776 | "_view_name": "LayoutView", 777 | "grid_template_rows": null, 778 | "right": null, 779 | "justify_content": null, 780 | "_view_module": "@jupyter-widgets/base", 781 | "overflow": null, 782 | 
"_model_module_version": "1.2.0", 783 | "_view_count": null, 784 | "flex_flow": null, 785 | "width": null, 786 | "min_width": null, 787 | "border": null, 788 | "align_items": null, 789 | "bottom": null, 790 | "_model_module": "@jupyter-widgets/base", 791 | "top": null, 792 | "grid_column": null, 793 | "overflow_y": null, 794 | "overflow_x": null, 795 | "grid_auto_flow": null, 796 | "grid_area": null, 797 | "grid_template_columns": null, 798 | "flex": null, 799 | "_model_name": "LayoutModel", 800 | "justify_items": null, 801 | "grid_row": null, 802 | "max_height": null, 803 | "align_content": null, 804 | "visibility": null, 805 | "align_self": null, 806 | "height": null, 807 | "min_height": null, 808 | "padding": null, 809 | "grid_auto_rows": null, 810 | "grid_gap": null, 811 | "max_width": null, 812 | "order": null, 813 | "_view_module_version": "1.2.0", 814 | "grid_template_areas": null, 815 | "object_position": null, 816 | "object_fit": null, 817 | "grid_auto_columns": null, 818 | "margin": null, 819 | "display": null, 820 | "left": null 821 | } 822 | }, 823 | "d2bff619756249ebb8b5fa428faf02a5": { 824 | "model_module": "@jupyter-widgets/controls", 825 | "model_name": "FloatProgressModel", 826 | "state": { 827 | "_view_name": "ProgressView", 828 | "style": "IPY_MODEL_13d8e85845ae4ba69bd95904e14a542e", 829 | "_dom_classes": [], 830 | "description": "Downloading: 100%", 831 | "_model_name": "FloatProgressModel", 832 | "bar_style": "success", 833 | "max": 1885418496, 834 | "_view_module": "@jupyter-widgets/controls", 835 | "_model_module_version": "1.5.0", 836 | "value": 1885418496, 837 | "_view_count": null, 838 | "_view_module_version": "1.5.0", 839 | "orientation": "horizontal", 840 | "min": 0, 841 | "description_tooltip": null, 842 | "_model_module": "@jupyter-widgets/controls", 843 | "layout": "IPY_MODEL_06441c2d907e4bcf9494d7ba9abe759f" 844 | } 845 | }, 846 | "c1fafc3f913342cd851b9ba34db8d5d9": { 847 | "model_module": "@jupyter-widgets/controls", 848 | 
"model_name": "HTMLModel", 849 | "state": { 850 | "_view_name": "HTMLView", 851 | "style": "IPY_MODEL_0d7e6bc240e64734a98c710e0ab92112", 852 | "_dom_classes": [], 853 | "description": "", 854 | "_model_name": "HTMLModel", 855 | "placeholder": "​", 856 | "_view_module": "@jupyter-widgets/controls", 857 | "_model_module_version": "1.5.0", 858 | "value": " 1.89G/1.89G [00:37<00:00, 50.8MB/s]", 859 | "_view_count": null, 860 | "_view_module_version": "1.5.0", 861 | "description_tooltip": null, 862 | "_model_module": "@jupyter-widgets/controls", 863 | "layout": "IPY_MODEL_3d12ea39d6764e0f90f02c265cb7d295" 864 | } 865 | }, 866 | "13d8e85845ae4ba69bd95904e14a542e": { 867 | "model_module": "@jupyter-widgets/controls", 868 | "model_name": "ProgressStyleModel", 869 | "state": { 870 | "_view_name": "StyleView", 871 | "_model_name": "ProgressStyleModel", 872 | "description_width": "initial", 873 | "_view_module": "@jupyter-widgets/base", 874 | "_model_module_version": "1.5.0", 875 | "_view_count": null, 876 | "_view_module_version": "1.2.0", 877 | "bar_color": null, 878 | "_model_module": "@jupyter-widgets/controls" 879 | } 880 | }, 881 | "06441c2d907e4bcf9494d7ba9abe759f": { 882 | "model_module": "@jupyter-widgets/base", 883 | "model_name": "LayoutModel", 884 | "state": { 885 | "_view_name": "LayoutView", 886 | "grid_template_rows": null, 887 | "right": null, 888 | "justify_content": null, 889 | "_view_module": "@jupyter-widgets/base", 890 | "overflow": null, 891 | "_model_module_version": "1.2.0", 892 | "_view_count": null, 893 | "flex_flow": null, 894 | "width": null, 895 | "min_width": null, 896 | "border": null, 897 | "align_items": null, 898 | "bottom": null, 899 | "_model_module": "@jupyter-widgets/base", 900 | "top": null, 901 | "grid_column": null, 902 | "overflow_y": null, 903 | "overflow_x": null, 904 | "grid_auto_flow": null, 905 | "grid_area": null, 906 | "grid_template_columns": null, 907 | "flex": null, 908 | "_model_name": "LayoutModel", 909 | "justify_items": 
null, 910 | "grid_row": null, 911 | "max_height": null, 912 | "align_content": null, 913 | "visibility": null, 914 | "align_self": null, 915 | "height": null, 916 | "min_height": null, 917 | "padding": null, 918 | "grid_auto_rows": null, 919 | "grid_gap": null, 920 | "max_width": null, 921 | "order": null, 922 | "_view_module_version": "1.2.0", 923 | "grid_template_areas": null, 924 | "object_position": null, 925 | "object_fit": null, 926 | "grid_auto_columns": null, 927 | "margin": null, 928 | "display": null, 929 | "left": null 930 | } 931 | }, 932 | "0d7e6bc240e64734a98c710e0ab92112": { 933 | "model_module": "@jupyter-widgets/controls", 934 | "model_name": "DescriptionStyleModel", 935 | "state": { 936 | "_view_name": "StyleView", 937 | "_model_name": "DescriptionStyleModel", 938 | "description_width": "", 939 | "_view_module": "@jupyter-widgets/base", 940 | "_model_module_version": "1.5.0", 941 | "_view_count": null, 942 | "_view_module_version": "1.2.0", 943 | "_model_module": "@jupyter-widgets/controls" 944 | } 945 | }, 946 | "3d12ea39d6764e0f90f02c265cb7d295": { 947 | "model_module": "@jupyter-widgets/base", 948 | "model_name": "LayoutModel", 949 | "state": { 950 | "_view_name": "LayoutView", 951 | "grid_template_rows": null, 952 | "right": null, 953 | "justify_content": null, 954 | "_view_module": "@jupyter-widgets/base", 955 | "overflow": null, 956 | "_model_module_version": "1.2.0", 957 | "_view_count": null, 958 | "flex_flow": null, 959 | "width": null, 960 | "min_width": null, 961 | "border": null, 962 | "align_items": null, 963 | "bottom": null, 964 | "_model_module": "@jupyter-widgets/base", 965 | "top": null, 966 | "grid_column": null, 967 | "overflow_y": null, 968 | "overflow_x": null, 969 | "grid_auto_flow": null, 970 | "grid_area": null, 971 | "grid_template_columns": null, 972 | "flex": null, 973 | "_model_name": "LayoutModel", 974 | "justify_items": null, 975 | "grid_row": null, 976 | "max_height": null, 977 | "align_content": null, 978 | 
"visibility": null, 979 | "align_self": null, 980 | "height": null, 981 | "min_height": null, 982 | "padding": null, 983 | "grid_auto_rows": null, 984 | "grid_gap": null, 985 | "max_width": null, 986 | "order": null, 987 | "_view_module_version": "1.2.0", 988 | "grid_template_areas": null, 989 | "object_position": null, 990 | "object_fit": null, 991 | "grid_auto_columns": null, 992 | "margin": null, 993 | "display": null, 994 | "left": null 995 | } 996 | } 997 | } 998 | } 999 | }, 1000 | "cells": [ 1001 | { 1002 | "cell_type": "code", 1003 | "metadata": { 1004 | "colab": { 1005 | "base_uri": "https://localhost:8080/" 1006 | }, 1007 | "id": "ojSAlH9W_gfv", 1008 | "outputId": "b315bf07-ec72-45c4-91d5-c7a99f775514" 1009 | }, 1010 | "source": [ 1011 | "# Check GPU type\r\n", 1012 | "!nvidia-smi" 1013 | ], 1014 | "execution_count": 1, 1015 | "outputs": [ 1016 | { 1017 | "output_type": "stream", 1018 | "text": [ 1019 | "Sun Feb 28 17:29:50 2021 \n", 1020 | "+-----------------------------------------------------------------------------+\n", 1021 | "| NVIDIA-SMI 460.39 Driver Version: 460.32.03 CUDA Version: 11.2 |\n", 1022 | "|-------------------------------+----------------------+----------------------+\n", 1023 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n", 1024 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n", 1025 | "| | | MIG M. 
|\n", 1026 | "|===============================+======================+======================|\n", 1027 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", 1028 | "| N/A 46C P8 10W / 70W | 0MiB / 15109MiB | 0% Default |\n", 1029 | "| | | N/A |\n", 1030 | "+-------------------------------+----------------------+----------------------+\n", 1031 | " \n", 1032 | "+-----------------------------------------------------------------------------+\n", 1033 | "| Processes: |\n", 1034 | "| GPU GI CI PID Type Process name GPU Memory |\n", 1035 | "| ID ID Usage |\n", 1036 | "|=============================================================================|\n", 1037 | "| No running processes found |\n", 1038 | "+-----------------------------------------------------------------------------+\n" 1039 | ], 1040 | "name": "stdout" 1041 | } 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "metadata": { 1047 | "colab": { 1048 | "base_uri": "https://localhost:8080/" 1049 | }, 1050 | "id": "5016OlJnp2kC", 1051 | "outputId": "021ee1d6-8a7a-42cc-e0f1-813f0473181c" 1052 | }, 1053 | "source": [ 1054 | "# Upgrade pip and install ktrain\r\n", 1055 | "!pip -qq install -U pip\r\n", 1056 | "!pip -qq install ktrain" 1057 | ], 1058 | "execution_count": 2, 1059 | "outputs": [ 1060 | { 1061 | "output_type": "stream", 1062 | "text": [ 1063 | "\u001b[K |████████████████████████████████| 1.5MB 7.6MB/s \n", 1064 | "\u001b[K |████████████████████████████████| 25.3 MB 73 kB/s \n", 1065 | "\u001b[K |████████████████████████████████| 6.8 MB 62.4 MB/s \n", 1066 | "\u001b[K |████████████████████████████████| 981 kB 60.1 MB/s \n", 1067 | "\u001b[K |████████████████████████████████| 263 kB 53.8 MB/s \n", 1068 | "\u001b[K |████████████████████████████████| 1.3 MB 31.5 MB/s \n", 1069 | "\u001b[K |████████████████████████████████| 1.2 MB 56.9 MB/s \n", 1070 | "\u001b[K |████████████████████████████████| 468 kB 53.4 MB/s \n", 1071 | "\u001b[K |████████████████████████████████| 1.1 MB 60.7 MB/s \n", 1072 | 
"\u001b[K |████████████████████████████████| 883 kB 61.2 MB/s \n", 1073 | "\u001b[K |████████████████████████████████| 2.9 MB 65.4 MB/s \n", 1074 | "\u001b[?25h Building wheel for ktrain (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1075 | " Building wheel for seqeval (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1076 | " Building wheel for keras-bert (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1077 | " Building wheel for keras-transformer (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1078 | " Building wheel for keras-embed-sim (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1079 | " Building wheel for keras-layer-normalization (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1080 | " Building wheel for keras-multi-head (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1081 | " Building wheel for keras-self-attention (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1082 | " Building wheel for keras-pos-embd (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1083 | " Building wheel for keras-position-wise-feed-forward (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1084 | " Building wheel for langdetect (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1085 | " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 1086 | " Building wheel for syntok (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" 1087 | ], 1088 | "name": "stdout" 1089 | } 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "code", 1094 | "metadata": { 1095 | "id": "qDHDhHzWrDmm", 1096 | "colab": { 1097 | "base_uri": "https://localhost:8080/" 1098 | }, 1099 | "outputId": "6ec8c799-c033-43d3-be91-f6e6e3fbd83d" 1100 | }, 1101 | "source": [ 1102 | "# Download data\r\n", 1103 | "!gdown --id 1gLdjbkhHVZd2RVP7k1lCR2UyBu0YPcsK\r\n", 1104 | "!unzip -q '/content/aai4_data.zip'" 1105 | ], 1106 | "execution_count": 3, 1107 | "outputs": [ 1108 | { 1109 | "output_type": "stream", 1110 | "text": [ 1111 | "Downloading...\n", 1112 | "From: https://drive.google.com/uc?id=1gLdjbkhHVZd2RVP7k1lCR2UyBu0YPcsK\n", 1113 | "To: /content/aai4_data.zip\n", 1114 | "23.6MB [00:00, 57.1MB/s]\n" 1115 | ], 1116 | "name": "stdout" 1117 | } 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "code", 1122 | "metadata": { 1123 | "id": "TWw-1GHGqVI1" 1124 | }, 1125 | "source": [ 1126 | "# Import libraries\r\n", 1127 | "import numpy as np \r\n", 1128 | "import pandas as pd\r\n", 1129 | "import random\r\n", 1130 | "import os\r\n", 1131 | "import re\r\n", 1132 | "import ktrain\r\n", 1133 | "from ktrain import text\r\n", 1134 | "import tensorflow as tf\r\n", 1135 | "from sklearn.model_selection import StratifiedKFold\r\n", 1136 | "import warnings\r\n", 1137 | "warnings.filterwarnings('ignore')" 1138 | ], 1139 | "execution_count": 4, 1140 | "outputs": [] 1141 | }, 1142 | { 1143 | "cell_type": "code", 1144 | "metadata": { 1145 | "id": "gdIMbg9vnM9b" 1146 | }, 1147 | "source": [ 1148 | "# Set seed\r\n", 1149 | "SEED = 3031\r\n", 1150 | "\r\n", 1151 | "def set_seeds(seed=SEED):\r\n", 1152 | " os.environ['PYTHONHASHSEED'] = str(seed)\r\n", 1153 | " random.seed(seed)\r\n", 1154 | " tf.random.set_seed(seed)\r\n", 1155 | " np.random.seed(seed)\r\n", 1156 | "\r\n", 1157 | "def set_global_determinism(seed=SEED):\r\n", 1158 | " set_seeds(seed=seed)\r\n", 1159 | "\r\n", 1160 | " os.environ['TF_DETERMINISTIC_OPS'] = 
'1'\r\n", 1161 | " os.environ['TF_CUDNN_DETERMINISTIC'] = '1'\r\n", 1162 | " \r\n", 1163 | " tf.config.threading.set_inter_op_parallelism_threads(1)\r\n", 1164 | " tf.config.threading.set_intra_op_parallelism_threads(1)\r\n", 1165 | "\r\n", 1166 | "set_global_determinism(seed=SEED)" 1167 | ], 1168 | "execution_count": 5, 1169 | "outputs": [] 1170 | }, 1171 | { 1172 | "cell_type": "code", 1173 | "metadata": { 1174 | "id": "q0ECuC-bqVGi" 1175 | }, 1176 | "source": [ 1177 | "# Load data\r\n", 1178 | "train = pd.read_csv('/content/aai4_data/train.csv')\r\n", 1179 | "test = pd.read_csv('/content/aai4_data/test.csv')\r\n", 1180 | "sample = pd.read_csv('/content/aai4_data/sample_submission.csv')" 1181 | ], 1182 | "execution_count": 6, 1183 | "outputs": [] 1184 | }, 1185 | { 1186 | "cell_type": "code", 1187 | "metadata": { 1188 | "colab": { 1189 | "base_uri": "https://localhost:8080/", 1190 | "height": 195 1191 | }, 1192 | "id": "iJu-Skwq_JaM", 1193 | "outputId": "a6b5eb3b-84f2-494b-e638-7fac56ac8b1a" 1194 | }, 1195 | "source": [ 1196 | "# Preview last five rows in test\r\n", 1197 | "test.tail()" 1198 | ], 1199 | "execution_count": 7, 1200 | "outputs": [ 1201 | { 1202 | "output_type": "execute_result", 1203 | "data": { 1204 | "text/html": [ 1205 | "
\n", 1206 | "\n", 1219 | "\n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | "
idcontent
7751SW18887\\n\\n \\nNa Ibrahim Yassin-Nkasi\\n \\n\\tMWANAFUNZ...
7752SW23779BAADA ya R. Kelly kukumbwa na\\nkashfa ya unyan...
7753SW20243\\n\\tNa JUDITH NYANGE-MWANZA\\n \\n\\n \\n\\tKAMPUNI...
7754SW27943WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wa...
7755SW22906WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy...
\n", 1255 | "
" 1256 | ], 1257 | "text/plain": [ 1258 | " id content\n", 1259 | "7751 SW18887 \\n\\n \\nNa Ibrahim Yassin-Nkasi\\n \\n\\tMWANAFUNZ...\n", 1260 | "7752 SW23779 BAADA ya R. Kelly kukumbwa na\\nkashfa ya unyan...\n", 1261 | "7753 SW20243 \\n\\tNa JUDITH NYANGE-MWANZA\\n \\n\\n \\n\\tKAMPUNI...\n", 1262 | "7754 SW27943 WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wa...\n", 1263 | "7755 SW22906 WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy..." 1264 | ] 1265 | }, 1266 | "metadata": { 1267 | "tags": [] 1268 | }, 1269 | "execution_count": 7 1270 | } 1271 | ] 1272 | }, 1273 | { 1274 | "cell_type": "code", 1275 | "metadata": { 1276 | "colab": { 1277 | "base_uri": "https://localhost:8080/", 1278 | "height": 195 1279 | }, 1280 | "id": "6DWa0uFBupn_", 1281 | "outputId": "98bd7bd4-bf1d-4454-c51f-0290cfbe4735" 1282 | }, 1283 | "source": [ 1284 | "# Remove trailing spaces, new lines and tab spaces from data\r\n", 1285 | "train.content = train.content.apply(lambda x: (re.sub('\\s+',' ', x)).strip())\r\n", 1286 | "test.content = test.content.apply(lambda x: (re.sub('\\s+',' ', x)).strip())\r\n", 1287 | "test.tail()" 1288 | ], 1289 | "execution_count": 8, 1290 | "outputs": [ 1291 | { 1292 | "output_type": "execute_result", 1293 | "data": { 1294 | "text/html": [ 1295 | "
\n", 1296 | "\n", 1309 | "\n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | "
idcontent
7751SW18887Na Ibrahim Yassin-Nkasi MWANAFUNZI wa kidato c...
7752SW23779BAADA ya R. Kelly kukumbwa na kashfa ya unyany...
7753SW20243Na JUDITH NYANGE-MWANZA KAMPUNI ya Ujenzi wa N...
7754SW27943WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wan...
7755SW22906WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy...
\n", 1345 | "
" 1346 | ], 1347 | "text/plain": [ 1348 | " id content\n", 1349 | "7751 SW18887 Na Ibrahim Yassin-Nkasi MWANAFUNZI wa kidato c...\n", 1350 | "7752 SW23779 BAADA ya R. Kelly kukumbwa na kashfa ya unyany...\n", 1351 | "7753 SW20243 Na JUDITH NYANGE-MWANZA KAMPUNI ya Ujenzi wa N...\n", 1352 | "7754 SW27943 WAZIRI wa Ulinzi, Dk Hussein Mwinyi vijana wan...\n", 1353 | "7755 SW22906 WAANDISHI WETU – DAR/MIKOANI KASI ya virusi vy..." 1354 | ] 1355 | }, 1356 | "metadata": { 1357 | "tags": [] 1358 | }, 1359 | "execution_count": 8 1360 | } 1361 | ] 1362 | }, 1363 | { 1364 | "cell_type": "code", 1365 | "metadata": { 1366 | "id": "h5dscE0Vre-J", 1367 | "colab": { 1368 | "base_uri": "https://localhost:8080/", 1369 | "height": 66, 1370 | "referenced_widgets": [ 1371 | "fa3a8735e68c41c7accffc74ce9447ae", 1372 | "ec598ba5ac874ce7853c68e8ebe93645", 1373 | "8b748858dc2f43dbbe61e0e7d60364a0", 1374 | "4d3c49b123ea443a875493bb77e162cb", 1375 | "958db85b2ea744dd81c994a50933d1b5", 1376 | "f78f05533cca49f995448596c4231a98", 1377 | "9619e3c1eb3a409899c702486a17e792", 1378 | "28422b1b07ea447db4261b9ea662659c" 1379 | ] 1380 | }, 1381 | "outputId": "36f61834-2628-4afe-e44f-ab7a42fcd56a" 1382 | }, 1383 | "source": [ 1384 | "# Set model parameters\r\n", 1385 | "MODEL_NAME = 'xlm-roberta-base'\r\n", 1386 | "MAX_LEN = 256\r\n", 1387 | "BATCH_SIZE = 16\r\n", 1388 | "FOLDS = 3\r\n", 1389 | "LR = 3e-5\r\n", 1390 | "EPOCHS = 2\r\n", 1391 | "\r\n", 1392 | "# List of class names\r\n", 1393 | "CLASS_NAMES = sorted(train.category.unique().tolist()) # ['afya', 'burudani', 'kimataifa', 'kitaifa', 'michezo', 'uchumi']\r\n", 1394 | "\r\n", 1395 | "# Instantiate transformer with the provided parameters\r\n", 1396 | "t = text.Transformer(model_name=MODEL_NAME, maxlen=MAX_LEN, class_names=CLASS_NAMES, batch_size=BATCH_SIZE)" 1397 | ], 1398 | "execution_count": 9, 1399 | "outputs": [ 1400 | { 1401 | "output_type": "display_data", 1402 | "data": { 1403 | "application/vnd.jupyter.widget-view+json": { 1404 | 
"model_id": "fa3a8735e68c41c7accffc74ce9447ae", 1405 | "version_minor": 0, 1406 | "version_major": 2 1407 | }, 1408 | "text/plain": [ 1409 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…" 1410 | ] 1411 | }, 1412 | "metadata": { 1413 | "tags": [] 1414 | } 1415 | }, 1416 | { 1417 | "output_type": "stream", 1418 | "text": [ 1419 | "\n" 1420 | ], 1421 | "name": "stdout" 1422 | } 1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "metadata": { 1428 | "id": "chBN-wZiy1QL", 1429 | "colab": { 1430 | "base_uri": "https://localhost:8080/", 1431 | "height": 1000, 1432 | "referenced_widgets": [ 1433 | "560d3fa635694e2a9a9410a011737075", 1434 | "2c5283f30c8f428a8dd509702b90729b", 1435 | "86f143f04251403e8ed50fe52e72df2a", 1436 | "144eea02d8b2441196d91c646be08d45", 1437 | "fdb5ff5ec5ab4fa486f8714cd185d799", 1438 | "901791b19c524307b8105bc71fa3d27b", 1439 | "b06a0c9a224641fabc62ebcc8241bca6", 1440 | "c014fdd5fa564f50be83089471cc21e4", 1441 | "a2f4de745dd648bf97d0916abc6d0df8", 1442 | "ac11c6a44879425e85b78443e4036dcd", 1443 | "e2c4ee3ce4d54982a367d76a3eb665a9", 1444 | "999d3d3939c545dbaf6cfb7db9caf652", 1445 | "90fbdf46e39b4a929d838664c182f190", 1446 | "8def438e71a849b485daab55c4d2dfbd", 1447 | "24f5449bbb924cb086c8376ba0a0217a", 1448 | "53e0c97d7438446480b6b203e61e2763", 1449 | "987cc41166a04adba2b8b78039aee22a", 1450 | "bf3031a3cec04b6683aef27dbf2aa3b0", 1451 | "d2bff619756249ebb8b5fa428faf02a5", 1452 | "c1fafc3f913342cd851b9ba34db8d5d9", 1453 | "13d8e85845ae4ba69bd95904e14a542e", 1454 | "06441c2d907e4bcf9494d7ba9abe759f", 1455 | "0d7e6bc240e64734a98c710e0ab92112", 1456 | "3d12ea39d6764e0f90f02c265cb7d295" 1457 | ] 1458 | }, 1459 | "outputId": "8168056b-7d5c-4668-ba9a-51215462bec7" 1460 | }, 1461 | "source": [ 1462 | "%%time\r\n", 1463 | "# Prepare test data\r\n", 1464 | "test_data = np.asarray(test.content)\r\n", 1465 | "\r\n", 1466 | "# Set number of folds to 3\r\n", 1467 | "folds = 
StratifiedKFold(n_splits=FOLDS, random_state=SEED, shuffle=False)\r\n", 1468 | "\r\n", 1469 | "# List to store predictions and loss-score per fold\r\n", 1470 | "oof_preds = []\r\n", 1471 | "oof_loss_score = []\r\n", 1472 | "\r\n", 1473 | "for train_index, test_index in folds.split(train.content, train.category):\r\n", 1474 | " X_train, X_test = list(train.loc[train_index, 'content']), list(train.loc[test_index, 'content'])\r\n", 1475 | " y_train, y_test = np.asarray(train.loc[train_index, 'category']), np.asarray(train.loc[test_index, 'category'])\r\n", 1476 | "\r\n", 1477 | " # Preprocess training and validation data\r\n", 1478 | " train_set = t.preprocess_train(X_train, y_train)\r\n", 1479 | " val_set = t.preprocess_test(X_test, y_test)\r\n", 1480 | "\r\n", 1481 | " # Instantiate model\r\n", 1482 | " model = t.get_classifier()\r\n", 1483 | " learner = ktrain.get_learner(model, train_data=train_set, val_data=val_set, batch_size=BATCH_SIZE)\r\n", 1484 | "\r\n", 1485 | " # Train model\r\n", 1486 | " history = learner.fit(LR, n_cycles=EPOCHS, checkpoint_folder='/tmp')\r\n", 1487 | " learner.validate(class_names=t.get_classes())\r\n", 1488 | "\r\n", 1489 | " # Append score of each fold\r\n", 1490 | " oof_loss_score.append(history.history['val_loss'][-1])\r\n", 1491 | "\r\n", 1492 | " # Make predictions\r\n", 1493 | " preds = ktrain.get_predictor(learner.model, preproc=t).predict(test_data, return_proba=True)\r\n", 1494 | "\r\n", 1495 | " # Append preds to oof_preds list\r\n", 1496 | " oof_preds.append(preds)\r\n", 1497 | "\r\n", 1498 | "# Check cv score and prepare submission file\r\n", 1499 | "print(f'Mean Loss: {np.mean(oof_loss_score)}')\r\n", 1500 | "sub = pd.DataFrame(np.mean(oof_preds, axis=0), columns = t.get_classes())\r\n", 1501 | "sub['test_id'] = test.id\r\n", 1502 | "sub = sub[sample.columns]\r\n", 1503 | "sub.to_csv('Submission.csv', index = False)" 1504 | ], 1505 | "execution_count": 10, 1506 | "outputs": [ 1507 | { 1508 | "output_type": "stream", 1509 | 
"text": [ 1510 | "preprocessing train...\n", 1511 | "language: sw\n", 1512 | "train sequence lengths:\n", 1513 | "\tmean : 333\n", 1514 | "\t95percentile : 792\n", 1515 | "\t99percentile : 1268\n" 1516 | ], 1517 | "name": "stdout" 1518 | }, 1519 | { 1520 | "output_type": "display_data", 1521 | "data": { 1522 | "application/vnd.jupyter.widget-view+json": { 1523 | "model_id": "560d3fa635694e2a9a9410a011737075", 1524 | "version_minor": 0, 1525 | "version_major": 2 1526 | }, 1527 | "text/plain": [ 1528 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=512.0, style=ProgressStyle(description_…" 1529 | ] 1530 | }, 1531 | "metadata": { 1532 | "tags": [] 1533 | } 1534 | }, 1535 | { 1536 | "output_type": "stream", 1537 | "text": [ 1538 | "\n" 1539 | ], 1540 | "name": "stdout" 1541 | }, 1542 | { 1543 | "output_type": "display_data", 1544 | "data": { 1545 | "application/vnd.jupyter.widget-view+json": { 1546 | "model_id": "a2f4de745dd648bf97d0916abc6d0df8", 1547 | "version_minor": 0, 1548 | "version_major": 2 1549 | }, 1550 | "text/plain": [ 1551 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…" 1552 | ] 1553 | }, 1554 | "metadata": { 1555 | "tags": [] 1556 | } 1557 | }, 1558 | { 1559 | "output_type": "stream", 1560 | "text": [ 1561 | "\n" 1562 | ], 1563 | "name": "stdout" 1564 | }, 1565 | { 1566 | "output_type": "display_data", 1567 | "data": { 1568 | "text/html": [ 1569 | "" 1570 | ], 1571 | "text/plain": [ 1572 | "" 1573 | ] 1574 | }, 1575 | "metadata": { 1576 | "tags": [] 1577 | } 1578 | }, 1579 | { 1580 | "output_type": "stream", 1581 | "text": [ 1582 | "Is Multi-Label? 
False\n", 1583 | "preprocessing test...\n", 1584 | "language: sw\n", 1585 | "test sequence lengths:\n", 1586 | "\tmean : 331\n", 1587 | "\t95percentile : 768\n", 1588 | "\t99percentile : 1234\n" 1589 | ], 1590 | "name": "stdout" 1591 | }, 1592 | { 1593 | "output_type": "display_data", 1594 | "data": { 1595 | "text/html": [ 1596 | "" 1597 | ], 1598 | "text/plain": [ 1599 | "" 1600 | ] 1601 | }, 1602 | "metadata": { 1603 | "tags": [] 1604 | } 1605 | }, 1606 | { 1607 | "output_type": "display_data", 1608 | "data": { 1609 | "application/vnd.jupyter.widget-view+json": { 1610 | "model_id": "987cc41166a04adba2b8b78039aee22a", 1611 | "version_minor": 0, 1612 | "version_major": 2 1613 | }, 1614 | "text/plain": [ 1615 | "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1885418496.0, style=ProgressStyle(descr…" 1616 | ] 1617 | }, 1618 | "metadata": { 1619 | "tags": [] 1620 | } 1621 | }, 1622 | { 1623 | "output_type": "stream", 1624 | "text": [ 1625 | "\n", 1626 | "Epoch 1/2\n", 1627 | "970/970 [==============================] - 1094s 1s/step - loss: 0.8316 - accuracy: 0.7242 - val_loss: 0.3710 - val_accuracy: 0.8945\n", 1628 | "Epoch 2/2\n", 1629 | "970/970 [==============================] - 1081s 1s/step - loss: 0.3307 - accuracy: 0.8989 - val_loss: 0.2845 - val_accuracy: 0.9119\n", 1630 | " precision recall f1-score support\n", 1631 | "\n", 1632 | " afya 0.61 0.42 0.50 286\n", 1633 | " burudani 0.92 0.89 0.90 743\n", 1634 | " kimataifa 0.93 0.89 0.91 635\n", 1635 | " kitaifa 0.90 0.94 0.92 3414\n", 1636 | " michezo 0.95 0.97 0.96 2002\n", 1637 | " uchumi 0.92 0.84 0.88 676\n", 1638 | "\n", 1639 | " accuracy 0.91 7756\n", 1640 | " macro avg 0.87 0.83 0.85 7756\n", 1641 | "weighted avg 0.91 0.91 0.91 7756\n", 1642 | "\n", 1643 | "preprocessing train...\n", 1644 | "language: sw\n", 1645 | "train sequence lengths:\n", 1646 | "\tmean : 332\n", 1647 | "\t95percentile : 782\n", 1648 | "\t99percentile : 1279\n" 1649 | ], 1650 | "name": "stdout" 1651 | }, 1652 
| { 1653 | "output_type": "display_data", 1654 | "data": { 1655 | "text/html": [ 1656 | "" 1657 | ], 1658 | "text/plain": [ 1659 | "" 1660 | ] 1661 | }, 1662 | "metadata": { 1663 | "tags": [] 1664 | } 1665 | }, 1666 | { 1667 | "output_type": "stream", 1668 | "text": [ 1669 | "Is Multi-Label? False\n", 1670 | "preprocessing test...\n", 1671 | "language: sw\n", 1672 | "test sequence lengths:\n", 1673 | "\tmean : 334\n", 1674 | "\t95percentile : 787\n", 1675 | "\t99percentile : 1239\n" 1676 | ], 1677 | "name": "stdout" 1678 | }, 1679 | { 1680 | "output_type": "display_data", 1681 | "data": { 1682 | "text/html": [ 1683 | "" 1684 | ], 1685 | "text/plain": [ 1686 | "" 1687 | ] 1688 | }, 1689 | "metadata": { 1690 | "tags": [] 1691 | } 1692 | }, 1693 | { 1694 | "output_type": "stream", 1695 | "text": [ 1696 | "Epoch 1/2\n", 1697 | "970/970 [==============================] - 1096s 1s/step - loss: 0.7408 - accuracy: 0.7570 - val_loss: 0.3460 - val_accuracy: 0.8813\n", 1698 | "Epoch 2/2\n", 1699 | "970/970 [==============================] - 1080s 1s/step - loss: 0.3208 - accuracy: 0.8940 - val_loss: 0.3023 - val_accuracy: 0.9082\n", 1700 | " precision recall f1-score support\n", 1701 | "\n", 1702 | " afya 0.58 0.50 0.53 286\n", 1703 | " burudani 0.92 0.90 0.91 743\n", 1704 | " kimataifa 0.90 0.87 0.89 636\n", 1705 | " kitaifa 0.91 0.93 0.92 3414\n", 1706 | " michezo 0.94 0.97 0.96 2001\n", 1707 | " uchumi 0.92 0.83 0.88 676\n", 1708 | "\n", 1709 | " accuracy 0.91 7756\n", 1710 | " macro avg 0.86 0.83 0.85 7756\n", 1711 | "weighted avg 0.91 0.91 0.91 7756\n", 1712 | "\n", 1713 | "preprocessing train...\n", 1714 | "language: sw\n", 1715 | "train sequence lengths:\n", 1716 | "\tmean : 332\n", 1717 | "\t95percentile : 778\n", 1718 | "\t99percentile : 1238\n" 1719 | ], 1720 | "name": "stdout" 1721 | }, 1722 | { 1723 | "output_type": "display_data", 1724 | "data": { 1725 | "text/html": [ 1726 | "" 1727 | ], 1728 | "text/plain": [ 1729 | "" 1730 | ] 1731 | }, 1732 | "metadata": { 
1733 | "tags": [] 1734 | } 1735 | }, 1736 | { 1737 | "output_type": "stream", 1738 | "text": [ 1739 | "Is Multi-Label? False\n", 1740 | "preprocessing test...\n", 1741 | "language: sw\n", 1742 | "test sequence lengths:\n", 1743 | "\tmean : 333\n", 1744 | "\t95percentile : 794\n", 1745 | "\t99percentile : 1299\n" 1746 | ], 1747 | "name": "stdout" 1748 | }, 1749 | { 1750 | "output_type": "display_data", 1751 | "data": { 1752 | "text/html": [ 1753 | "" 1754 | ], 1755 | "text/plain": [ 1756 | "" 1757 | ] 1758 | }, 1759 | "metadata": { 1760 | "tags": [] 1761 | } 1762 | }, 1763 | { 1764 | "output_type": "stream", 1765 | "text": [ 1766 | "Epoch 1/2\n", 1767 | "970/970 [==============================] - 1097s 1s/step - loss: 0.7295 - accuracy: 0.7622 - val_loss: 0.2984 - val_accuracy: 0.9045\n", 1768 | "Epoch 2/2\n", 1769 | "970/970 [==============================] - 1082s 1s/step - loss: 0.2952 - accuracy: 0.9076 - val_loss: 0.3021 - val_accuracy: 0.9067\n", 1770 | " precision recall f1-score support\n", 1771 | "\n", 1772 | " afya 0.55 0.66 0.60 287\n", 1773 | " burudani 0.88 0.95 0.91 743\n", 1774 | " kimataifa 0.94 0.86 0.90 635\n", 1775 | " kitaifa 0.93 0.90 0.91 3414\n", 1776 | " michezo 0.93 0.99 0.96 2001\n", 1777 | " uchumi 0.92 0.82 0.86 676\n", 1778 | "\n", 1779 | " accuracy 0.91 7756\n", 1780 | " macro avg 0.86 0.86 0.86 7756\n", 1781 | "weighted avg 0.91 0.91 0.91 7756\n", 1782 | "\n", 1783 | "Mean Loss: 0.2963431775569916\n", 1784 | "CPU times: user 24min 1s, sys: 13min 2s, total: 37min 3s\n", 1785 | "Wall time: 2h 9min 20s\n" 1786 | ], 1787 | "name": "stdout" 1788 | } 1789 | ] 1790 | }, 1791 | { 1792 | "cell_type": "code", 1793 | "metadata": { 1794 | "id": "rSzTRo1x-HCj" 1795 | }, 1796 | "source": [ 1797 | "" 1798 | ], 1799 | "execution_count": 11, 1800 | "outputs": [] 1801 | }, 1802 | { 1803 | "cell_type": "code", 1804 | "metadata": { 1805 | "id": "9t71teFG-G-_" 1806 | }, 1807 | "source": [ 1808 | "" 1809 | ], 1810 | "execution_count": null, 1811 | 
"outputs": [] 1812 | }, 1813 | { 1814 | "cell_type": "code", 1815 | "metadata": { 1816 | "id": "k_Mml3aA-G7r" 1817 | }, 1818 | "source": [ 1819 | "" 1820 | ], 1821 | "execution_count": null, 1822 | "outputs": [] 1823 | } 1824 | ] 1825 | } --------------------------------------------------------------------------------