├── .gitignore ├── LICENSE ├── PyConES2015.pdf ├── README.md ├── requirements.txt ├── resources ├── badwords.txt ├── execution.ini ├── test │ ├── test.csv │ └── test_with_solutions.csv └── train │ └── train.csv ├── src └── trolling_detection │ ├── __init__.py │ ├── feature_extraction.py │ └── training.py └── trolling_detection.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # PyCharm 60 | .idea/ 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /PyConES2015.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rafaharo/trolling_detection/e919fa3fc6d8cd8251050fd2355fdff51ac0bbdc/PyConES2015.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Trolling Detection with python NLP libraries and Scikit-Learn 2 | A (quite-simple) attempt to solve Kaggle's Trolling Detection Competition (https://www.kaggle.com/c/detecting-insults-in-social-commentary) 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.10.0 2 | appnope==0.1.0 3 | argon2-cffi==20.1.0 4 | astunparse==1.6.3 5 | async-generator==1.10 6 | attrs==20.2.0 7 | backcall==0.2.0 8 | bleach==3.2.1 9 | blis==0.4.1 10 | cachetools==4.1.1 11 | catalogue==1.0.0 12 | certifi==2020.6.20 13 | 
cffi==1.14.3 14 | chardet==3.0.4 15 | click==7.1.2 16 | configparser==5.0.1 17 | cycler==0.10.0 18 | cymem==2.0.3 19 | decorator==4.4.2 20 | defusedxml==0.6.0 21 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz 22 | entrypoints==0.3 23 | filelock==3.0.12 24 | future==0.18.2 25 | gast==0.3.3 26 | gensim==3.8.3 27 | google-auth==1.22.1 28 | google-auth-oauthlib==0.4.1 29 | google-pasta==0.2.0 30 | grpcio==1.32.0 31 | h5py==2.10.0 32 | idna==2.10 33 | ipykernel==5.3.4 34 | ipython==7.18.1 35 | ipython-genutils==0.2.0 36 | ipywidgets==7.5.1 37 | jedi==0.17.2 38 | Jinja2==2.11.2 39 | joblib==0.17.0 40 | jsonschema==3.2.0 41 | jupyter==1.0.0 42 | jupyter-client==6.1.7 43 | jupyter-console==6.2.0 44 | jupyter-core==4.6.3 45 | jupyterlab-pygments==0.1.2 46 | Keras==2.4.3 47 | Keras-Preprocessing==1.1.2 48 | kiwisolver==1.2.0 49 | Markdown==3.3.1 50 | MarkupSafe==1.1.1 51 | matplotlib==3.3.2 52 | mistune==0.8.4 53 | murmurhash==1.0.2 54 | nbclient==0.5.0 55 | nbconvert==6.0.7 56 | nbformat==5.0.8 57 | nest-asyncio==1.4.1 58 | nltk==3.5 59 | notebook==6.1.4 60 | numpy==1.18.5 61 | oauthlib==3.1.0 62 | opt-einsum==3.3.0 63 | packaging==20.4 64 | pandas==1.1.3 65 | pandocfilters==1.4.2 66 | parso==0.7.1 67 | pexpect==4.8.0 68 | pickleshare==0.7.5 69 | Pillow==8.0.0 70 | plac==1.1.3 71 | preshed==3.0.2 72 | prometheus-client==0.8.0 73 | prompt-toolkit==3.0.8 74 | protobuf==3.13.0 75 | ptyprocess==0.6.0 76 | pyasn1==0.4.8 77 | pyasn1-modules==0.2.8 78 | pycparser==2.20 79 | pyenchant==3.1.1 80 | Pygments==2.7.1 81 | pyparsing==2.4.7 82 | pyrsistent==0.17.3 83 | python-dateutil==2.8.1 84 | pytz==2020.1 85 | PyYAML==5.3.1 86 | pyzmq==19.0.2 87 | qtconsole==4.7.7 88 | QtPy==1.9.0 89 | regex==2020.10.11 90 | requests==2.24.0 91 | requests-oauthlib==1.3.0 92 | rsa==4.6 93 | sacremoses==0.0.43 94 | scikit-learn==0.23.2 95 | scipy==1.5.2 96 | Send2Trash==1.5.0 97 | sentencepiece==0.1.91 98 | SentLex @ 
file:///Users/rharo/temp/sentlex 99 | six==1.15.0 100 | smart-open==3.0.0 101 | spacy==2.3.2 102 | srsly==1.0.2 103 | tensorboard==2.3.0 104 | tensorboard-plugin-wit==1.7.0 105 | tensorflow==2.3.1 106 | tensorflow-estimator==2.3.0 107 | termcolor==1.1.0 108 | terminado==0.9.1 109 | testpath==0.4.4 110 | thinc==7.4.1 111 | threadpoolctl==2.1.0 112 | tokenizers==0.8.1rc2 113 | torch==1.6.0 114 | tornado==6.0.4 115 | tqdm==4.50.2 116 | traitlets==5.0.4 117 | transformers==3.3.1 118 | urllib3==1.25.10 119 | wasabi==0.8.0 120 | wcwidth==0.2.5 121 | webencodings==0.5.1 122 | Werkzeug==1.0.1 123 | widgetsnbextension==3.5.1 124 | wrapt==1.12.1 125 | -------------------------------------------------------------------------------- /resources/badwords.txt: -------------------------------------------------------------------------------- 1 | PieceOfShit 2 | fuck 3 | a-hole 4 | dumb 5 | fool 6 | imbecile 7 | loser 8 | nutcase 9 | dipstick 10 | lunatic 11 | weirdo 12 | dork 13 | dope 14 | dimwit 15 | half-wit 16 | oaf 17 | bimbo 18 | jerk 19 | numskull 20 | numbskull 21 | goof 22 | suck 23 | moron 24 | morons 25 | idiot 26 | idi0t 27 | stupid 28 | dump 29 | rape 30 | rapist 31 | hitler 32 | 4r5e 33 | 5h1t 34 | 5hit 35 | a55 36 | anal 37 | anus 38 | ar5e 39 | arrse 40 | arse 41 | ass 42 | ass-fucker 43 | asses 44 | assfucker 45 | assfukka 46 | asshole 47 | assholes 48 | asswhole 49 | a_s_s 50 | b!tch 51 | b00bs 52 | b17ch 53 | b1tch 54 | ballbag 55 | balls 56 | ballsack 57 | bastard 58 | beastial 59 | beastiality 60 | bellend 61 | bestial 62 | bestiality 63 | bi+ch 64 | biatch 65 | bitch 66 | bitcher 67 | bitchers 68 | bitches 69 | bitchin 70 | bitching 71 | bloody 72 | blow job 73 | blowjob 74 | blowjobs 75 | boiolas 76 | bollock 77 | bollok 78 | boner 79 | boob 80 | boobs 81 | booobs 82 | boooobs 83 | booooobs 84 | booooooobs 85 | breasts 86 | buceta 87 | bugger 88 | bum 89 | bunny fucker 90 | butt 91 | butthole 92 | buttmuch 93 | buttplug 94 | c0ck 95 | c0cksucker 96 | carpet 
muncher 97 | cawk 98 | chink 99 | cipa 100 | cl1t 101 | clit 102 | clitoris 103 | clits 104 | cnut 105 | cock 106 | cock-sucker 107 | cockface 108 | cockhead 109 | cockmunch 110 | cockmuncher 111 | cocks 112 | cocksuck 113 | cocksucked 114 | cocksucker 115 | cocksucking 116 | cocksucks 117 | cocksuka 118 | cocksukka 119 | cok 120 | cokmuncher 121 | coksucka 122 | coon 123 | cox 124 | crap 125 | cum 126 | cummer 127 | cumming 128 | cums 129 | cumshot 130 | cunilingus 131 | cunillingus 132 | cunnilingus 133 | cunt 134 | cuntlick 135 | cuntlicker 136 | cuntlicking 137 | cunts 138 | cyalis 139 | cyberfuc 140 | cyberfuck 141 | cyberfucked 142 | cyberfucker 143 | cyberfuckers 144 | cyberfucking 145 | d1ck 146 | damn 147 | dick 148 | dickhead 149 | dildo 150 | dildos 151 | dink 152 | dinks 153 | dirsa 154 | dlck 155 | dog-fucker 156 | doggin 157 | dogging 158 | donkeyribber 159 | doosh 160 | duche 161 | dyke 162 | ejaculate 163 | ejaculated 164 | ejaculates 165 | ejaculating 166 | ejaculatings 167 | ejaculation 168 | ejakulate 169 | f u c k 170 | f u c k e r 171 | f4nny 172 | fag 173 | fagging 174 | faggitt 175 | faggot 176 | faggs 177 | fagot 178 | fagots 179 | fags 180 | fanny 181 | fannyflaps 182 | fannyfucker 183 | fanyy 184 | fatass 185 | fcuk 186 | fcuker 187 | fcuking 188 | feck 189 | fecker 190 | felching 191 | fellate 192 | fellatio 193 | fingerfuck 194 | fingerfucked 195 | fingerfucker 196 | fingerfuckers 197 | fingerfucking 198 | fingerfucks 199 | fistfuck 200 | fistfucked 201 | fistfucker 202 | fistfuckers 203 | fistfucking 204 | fistfuckings 205 | fistfucks 206 | flange 207 | fook 208 | fooker 209 | fucka 210 | fucked 211 | fucker 212 | fuckers 213 | fuckhead 214 | fuckheads 215 | fuckings 216 | fuckingshitmotherfucker 217 | fuckme 218 | fucks 219 | fuckwhit 220 | fuckwit 221 | fudge packer 222 | fudgepacker 223 | fuk 224 | fuker 225 | fukker 226 | fukkin 227 | fuks 228 | fukwhit 229 | fukwit 230 | fux 231 | fux0r 232 | f_u_c_k 233 | gangbang 234 | gangbanged 
235 | gangbangs 236 | gaylord 237 | gaysex 238 | goatse 239 | God 240 | god-dam 241 | god-damned 242 | goddamn 243 | goddamned 244 | hardcoresex 245 | hell 246 | heshe 247 | hoar 248 | hoare 249 | hoer 250 | homo 251 | hore 252 | horniest 253 | horny 254 | hotsex 255 | jack-off 256 | jackoff 257 | jap 258 | jerk-off 259 | jism 260 | jiz 261 | jizm 262 | jizz 263 | kawk 264 | knob 265 | knobead 266 | knobed 267 | knobend 268 | knobhead 269 | knobjocky 270 | knobjokey 271 | kock 272 | kondum 273 | kondums 274 | kum 275 | kummer 276 | kumming 277 | kums 278 | kunilingus 279 | l3i+ch 280 | l3itch 281 | labia 282 | lmfao 283 | lust 284 | lusting 285 | m0f0 286 | m0fo 287 | m45terbate 288 | ma5terb8 289 | ma5terbate 290 | masochist 291 | master-bate 292 | masterb8 293 | masterbat* 294 | masterbat3 295 | masterbate 296 | masterbation 297 | masterbations 298 | masturbate 299 | mo-fo 300 | mof0 301 | mofo 302 | mothafuck 303 | mothafucka 304 | mothafuckas 305 | mothafuckaz 306 | mothafucked 307 | mothafucker 308 | mothafuckers 309 | mothafuckin 310 | mothafucking 311 | mothafuckings 312 | mothafucks 313 | mother fucker 314 | motherfuck 315 | motherfucked 316 | motherfucker 317 | motherfuckers 318 | motherfuckin 319 | motherfucking 320 | motherfuckings 321 | motherfuckka 322 | motherfucks 323 | muff 324 | mutha 325 | muthafecker 326 | muthafuckker 327 | muther 328 | mutherfucker 329 | n1gga 330 | n1gger 331 | nazi 332 | nigg3r 333 | nigg4h 334 | nigga 335 | niggah 336 | niggas 337 | niggaz 338 | nigger 339 | niggers 340 | nob 341 | nob jokey 342 | nobhead 343 | nobjocky 344 | nobjokey 345 | numbnuts 346 | nutsack 347 | orgasim 348 | orgasims 349 | orgasm 350 | orgasms 351 | p0rn 352 | pawn 353 | pecker 354 | penis 355 | penisfucker 356 | phonesex 357 | phuck 358 | phuk 359 | phuked 360 | phuking 361 | phukked 362 | phukking 363 | phuks 364 | phuq 365 | pigfucker 366 | pimpis 367 | piss 368 | pissed 369 | pisser 370 | pissers 371 | pisses 372 | pissflaps 373 | pissin 374 | 
pissing 375 | pissoff 376 | poop 377 | porn 378 | porno 379 | pornography 380 | pornos 381 | prick 382 | pricks 383 | pron 384 | pube 385 | pusse 386 | pussi 387 | pussies 388 | pussy 389 | pussys 390 | rectum 391 | retard 392 | retards 393 | rimjaw 394 | rimming 395 | s hit 396 | s.o.b. 397 | sadist 398 | schlong 399 | screwing 400 | scroat 401 | scrote 402 | scrotum 403 | semen 404 | sex 405 | sh!+ 406 | sh!t 407 | sh1t 408 | shag 409 | shagger 410 | shaggin 411 | shagging 412 | shemale 413 | shi+ 414 | shit 415 | shitdick 416 | shite 417 | shited 418 | shitey 419 | shitfuck 420 | shitfull 421 | shithead 422 | shiting 423 | shitings 424 | shits 425 | shitted 426 | shitter 427 | shitters 428 | shitting 429 | shittings 430 | shitty 431 | skank 432 | slut 433 | sluts 434 | smegma 435 | smut 436 | snatch 437 | son-of-a-bitch 438 | spac 439 | sperm 440 | spunk 441 | s_h_i_t 442 | t1tt1e5 443 | t1tties 444 | teets 445 | teez 446 | testical 447 | testicle 448 | tit 449 | titfuck 450 | tits 451 | titt 452 | tittie5 453 | tittiefucker 454 | titties 455 | tittyfuck 456 | tittywank 457 | titwank 458 | tosser 459 | turd 460 | tw4t 461 | twat 462 | twathead 463 | twatty 464 | twunt 465 | twunter 466 | v14gra 467 | v1gra 468 | vagina 469 | viagra 470 | vulva 471 | w00se 472 | wang 473 | wank 474 | wanker 475 | wanky 476 | whoar 477 | whore 478 | willies 479 | willy 480 | xrated 481 | xxx -------------------------------------------------------------------------------- /resources/execution.ini: -------------------------------------------------------------------------------- 1 | [Execution] 2 | Naive: True 3 | TF-IDF: True 4 | TF-IDF-NGRAM-WORDS: False 5 | WordVec: True 6 | LanguageModel: True 7 | Custom: True 8 | Average: True 9 | Final: True 10 | 11 | [Featuring] 12 | replace_badwords: True -------------------------------------------------------------------------------- /src/trolling_detection/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rafaharo/trolling_detection/e919fa3fc6d8cd8251050fd2355fdff51ac0bbdc/src/trolling_detection/__init__.py -------------------------------------------------------------------------------- /src/trolling_detection/feature_extraction.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import numpy as np 3 | from sklearn.base import BaseEstimator 4 | 5 | nlp = spacy.load("en_core_web_sm") 6 | 7 | def corpus_stats(collection): 8 | import nltk 9 | import pprint 10 | words = tokenize_collection(collection, lowercase=True, stopwords='english', min_length=3) 11 | text = nltk.Text(word.lower() for word in words) 12 | print("Number of Words: " + str(len(text))) 13 | print("Number of unique words: " + str(len(set(text)))) 14 | dist = nltk.FreqDist(text) 15 | pp = pprint.PrettyPrinter(indent=4) 16 | pp.pprint(dist.most_common(20)) 17 | 18 | 19 | def tf_idf_stats(collection, num=20): 20 | from sklearn.feature_extraction.text import TfidfVectorizer 21 | tfidf = TfidfVectorizer(analyzer='word', min_df=3, stop_words='english') 22 | matrix = tfidf.fit_transform(collection) 23 | dense = matrix.todense() 24 | features_names = tfidf.get_feature_names() 25 | print("\nNumber of Features: " + str(len(features_names))) 26 | for index, row in enumerate(dense[0:num]): 27 | print("\nScores for comment: " + str(index)) 28 | comment = row.tolist()[0] 29 | scores = [pair for pair in zip(range(0, len(comment)), comment) if pair[1] > 0] 30 | sorted_scores = sorted(scores, key=lambda t: t[1] * -1) 31 | for phrase, score in [(features_names[word_id], score) for (word_id, score) in sorted_scores][:num]: 32 | print('{0: <20} {1}'.format(phrase, score)) 33 | print("\n") 34 | 35 | 36 | def similar_words(collection, word, num=10): 37 | import nltk 38 | words = tokenize_collection(collection, stopwords='english') 39 | text = 
nltk.Text(word.lower() for word in words) 40 | text.similar(word, num) 41 | 42 | 43 | def language_model(collection): 44 | from nltk import ConditionalProbDist 45 | from nltk import ConditionalFreqDist 46 | from nltk import bigrams 47 | from nltk import MLEProbDist 48 | words = tokenize_collection(collection) 49 | freq_model = ConditionalFreqDist(bigrams(words)) 50 | prob_model = ConditionalProbDist(freq_model, MLEProbDist) 51 | return prob_model 52 | 53 | 54 | def word2vec_model(documents, n_dim=1000): 55 | from gensim.models import Word2Vec 56 | model = Word2Vec(documents, size=n_dim, window=8, min_count=3, workers=4) 57 | return model 58 | 59 | 60 | def tokenize_collection(collection, lowercase=True, stopwords=True, min_length=3): 61 | documents = [tokenize_document(document, lowercase=lowercase, stopwords=stopwords, min_length=min_length) 62 | for document in collection] 63 | words = [token for document in documents for token in document] 64 | return words 65 | 66 | 67 | def tokenize_document(document, lowercase=True, stopwords=None, min_length=3): 68 | import nltk 69 | if not document or len(document) == 0: 70 | raise ValueError("Can't tokenize null or empty texts") 71 | 72 | if lowercase: 73 | document = document.lower() 74 | 75 | tokens = nltk.wordpunct_tokenize(document) 76 | 77 | if stopwords and isinstance(stopwords, str): 78 | stops = set(nltk.corpus.stopwords.words(stopwords)) 79 | elif stopwords and isinstance(stopwords, list): 80 | stops = set(stopwords) 81 | else: 82 | stops = set() 83 | 84 | result = [token for token in tokens if token not in stops and len(token) >= min_length] 85 | return result 86 | 87 | 88 | def replace_badwords(comment, badwords): 89 | comment = comment.lower() 90 | candidates = [insult for insult in badwords if insult in comment] 91 | candidates_sorted = sorted(candidates, key=len) 92 | for candidate in candidates_sorted: 93 | comment = comment.replace(candidate, " fakeinsult ") 94 | return comment 95 | 96 | 97 | class 
CustomTransformer(BaseEstimator): 98 | def __init__(self): 99 | from spacy.matcher import Matcher 100 | self.__matcher = Matcher(nlp.vocab) 101 | 102 | pattern1 = [{"LEMMA": "-PRON-", "LOWER": {"IN": ["you", "your"]}}, 103 | {"LEMMA": {"IN": ["be", "sound"]}}, {"OP": "*", "LENGTH": {"<=": 10}}, 104 | {"LOWER": "fakeinsult"}] 105 | self.__matcher.add("insult1", None, pattern1) 106 | 107 | pattern2 = [{"LEMMA": "-PRON-", "LOWER": {"IN": ["you", "your"]}}, 108 | {"OP": "*", "LENGTH": {"<=": 4}}, 109 | {"LOWER": "fakeinsult"}] 110 | self.__matcher.add("insult2", None, pattern2) 111 | 112 | def get_feature_names(self): 113 | return np.array(['n_words', 'n_chars', 'n_dwords', 'you_re', '!', 'allcaps', '@', 'bad_ratio', 'n_bad', 114 | 'capsratio', 'dicratio' 'sent']) 115 | 116 | def fit(self, documents, y=None): 117 | return self 118 | 119 | def transform(self, documents): 120 | import enchant 121 | import sentlex 122 | from feature_extraction import tokenize_document 123 | 124 | d = enchant.Dict("en_US") 125 | swn = sentlex.SWN3Lexicon() 126 | tokenized_documents = [tokenize_document(document) for document in documents] 127 | n_words = [] 128 | n_chars = [] 129 | # number of uppercase words 130 | all_caps = [] 131 | n_bad = [] 132 | exclamation = [] 133 | addressing = [] 134 | 135 | n_dwords = [sum(1 for word in document if d.check(word)) for document in tokenized_documents] 136 | 137 | sent_pos = [] 138 | sent_neg = [] 139 | n_you_re = [] 140 | for comment in documents: 141 | n_words.append(len(comment.split())) 142 | n_chars.append(len(comment)) 143 | all_caps.append(np.sum([w.isupper() for w in comment.split()])) 144 | n_bad.append(comment.count('fakeinsult')) 145 | exclamation.append(comment.count("!")) 146 | addressing.append(comment.count("@")) 147 | doc = nlp(comment) 148 | count = 0. 149 | pos_sum = 0. 150 | neg_sum = 0. 151 | for token in doc: 152 | if token.text == 'fakeinsult': 153 | pos_sum += 0. 154 | neg_sum += 1. 155 | count += 1. 
156 | continue 157 | if token.pos_.startswith('RB'): 158 | sentiment = swn.getadverb(token.text) 159 | pos_sum += sentiment[0] 160 | neg_sum += sentiment[1] 161 | count += 1. 162 | elif token.pos_.startswith('NN'): 163 | sentiment = swn.getnoun(token.text) 164 | pos_sum += sentiment[0] 165 | neg_sum += sentiment[1] 166 | count += 1. 167 | if token.pos_.startswith('JJ'): 168 | sentiment = swn.getadjective(token.text) 169 | pos_sum += sentiment[0] 170 | neg_sum += sentiment[1] 171 | count += 1. 172 | if token.pos_.startswith('VB'): 173 | sentiment = swn.getverb(token.text) 174 | pos_sum += sentiment[0] 175 | neg_sum += sentiment[1] 176 | count += 1. 177 | if count != 0: 178 | pos_sum /= count 179 | neg_sum /= count 180 | sent_neg.append(neg_sum) 181 | sent_pos.append(pos_sum) 182 | matches = self.__matcher(doc) 183 | n_you_re.append(len(matches)) 184 | 185 | allcaps_ratio = np.array(all_caps) / np.array(n_words, dtype=np.float) 186 | bad_ratio = np.array(n_bad) / np.array(n_words, dtype=np.float) 187 | dic_ratio = np.array(n_dwords) / np.array(n_words, dtype=np.float) 188 | 189 | return np.array([n_words, n_chars, n_dwords, n_you_re, exclamation, all_caps, 190 | addressing, bad_ratio, n_bad, allcaps_ratio, dic_ratio, 191 | sent_pos]).T 192 | 193 | def get_params(self, deep=True): 194 | if not deep: 195 | return super(CustomTransformer, self).get_params(deep=False) 196 | else: 197 | out = super(CustomTransformer, self).get_params(deep=False) 198 | out.update(self.__matcher.copy()) 199 | return out -------------------------------------------------------------------------------- /src/trolling_detection/training.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, ClassifierMixin 3 | 4 | def train_tfidf(comments, categories, class_weight=None): 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from 
sklearn.linear_model import SGDClassifier 8 | 9 | text_clf = Pipeline([('vect', TfidfVectorizer(lowercase=True, ngram_range=(1, 3), analyzer="word", min_df=3)), 10 | ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, 11 | class_weight=class_weight))]) 12 | text_clf = text_clf.fit(comments, categories) 13 | return text_clf 14 | 15 | 16 | class NaiveClassifier(BaseEstimator, ClassifierMixin): 17 | def __init__(self, badwords=[], fake=True): 18 | self.badwords = badwords 19 | self.fake = fake 20 | 21 | def fit(self, X, y): 22 | return self 23 | 24 | def predict(self, X, y=None): 25 | predictions = [] 26 | for x in X: 27 | if not self.fake: 28 | if any(badword in x.lower() for badword in self.badwords): 29 | predictions.append(1) 30 | else: 31 | predictions.append(0) 32 | else: 33 | if "fakeinsult" in x: 34 | predictions.append(1) 35 | else: 36 | predictions.append(0) 37 | return np.array(predictions) 38 | 39 | def build_word_vector(w2vmodel, text, size): 40 | vec = np.zeros(size).reshape((1, size)) 41 | count = 0. 42 | for word in text: 43 | try: 44 | sorted_vec = np.sort(w2vmodel[word]) 45 | vec += sorted_vec.reshape((1, size)) 46 | count += 1. 
47 | except KeyError: 48 | continue 49 | if count != 0: 50 | vec /= count 51 | return vec 52 | 53 | 54 | def w2vectorize(collection, model, n_dim): 55 | from sklearn.preprocessing import scale 56 | vecs = np.concatenate([build_word_vector(model, z, n_dim) for z in collection]) 57 | vecs = scale(vecs) 58 | return vecs 59 | 60 | 61 | def train_word2vec(categories, comments, n_dim): 62 | from feature_extraction import tokenize_document 63 | from feature_extraction import word2vec_model 64 | from sklearn.linear_model import SGDClassifier 65 | documents = [tokenize_document(document) for document in comments] 66 | model = word2vec_model(documents, n_dim) 67 | train_vecs = w2vectorize(documents, model, n_dim) 68 | classifier = SGDClassifier(loss='log', penalty='l1') 69 | classifier.fit(train_vecs, categories) 70 | 71 | return model, classifier 72 | 73 | 74 | def train_custom(comments, categories): 75 | from sklearn.pipeline import Pipeline 76 | from sklearn.svm import LinearSVC 77 | from feature_extraction import CustomTransformer 78 | 79 | text_clf = Pipeline([('vect', CustomTransformer()), 80 | ('clf', LinearSVC(random_state=42, dual=False))]) 81 | text_clf = text_clf.fit(comments, categories) 82 | return text_clf 83 | 84 | 85 | def train_feature_union(comments, categories): 86 | from sklearn.feature_extraction.text import TfidfVectorizer 87 | from sklearn.linear_model import LogisticRegression 88 | from sklearn.pipeline import Pipeline, FeatureUnion 89 | from sklearn.feature_selection import SelectPercentile, chi2 90 | from feature_extraction import CustomTransformer 91 | 92 | select = SelectPercentile(score_func=chi2, percentile=70) 93 | countvect_word = TfidfVectorizer(lowercase=True, ngram_range=(1, 3), analyzer="word", binary=False, min_df=3) 94 | custom = CustomTransformer() 95 | union = FeatureUnion([("custom", custom), ("words", countvect_word)]) 96 | clf = LogisticRegression(tol=1e-8, penalty='l2', C=4, max_iter=10000) 97 | classifier = Pipeline([('vect', 
union), ('select', select), ('clf', clf)]) 98 | classifier = classifier.fit(comments, categories) 99 | return classifier 100 | 101 | 102 | def train_assembling_voting(comments, categories): 103 | from sklearn.feature_extraction.text import TfidfVectorizer 104 | from sklearn.pipeline import Pipeline 105 | from sklearn.linear_model import SGDClassifier 106 | from sklearn.ensemble import VotingClassifier 107 | from feature_extraction import CustomTransformer 108 | 109 | text_clf = Pipeline([('vect', TfidfVectorizer(lowercase=True, ngram_range=(1, 3), analyzer="word", min_df=3)), 110 | ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, 111 | random_state=42))]) 112 | 113 | custom = CustomTransformer() 114 | clf = Pipeline([('vect', custom), 115 | ('clf', SGDClassifier(loss='log', penalty='l2', 116 | alpha=1e-3, random_state=42))]) 117 | 118 | final_classifier = VotingClassifier(estimators=[('text', text_clf), ('custom', clf)], 119 | voting='soft', weights=[3, 1]) 120 | final_classifier = final_classifier.fit(comments, categories) 121 | return final_classifier 122 | -------------------------------------------------------------------------------- /trolling_detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Define Global Variables and Helper Functions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Global Imports\n", 17 | "import numpy as np\n", 18 | "from sklearn import metrics\n", 19 | "import pandas as pd\n", 20 | "import os\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "\n", 23 | "# Helpers\n", 24 | "TRAIN_FILE = \"resources/train/train.csv\"\n", 25 | "TEST_FILE = \"resources/test/test_with_solutions.csv\"\n", 26 | "BAD_WORDS_FILE = \"resources/badwords.txt\"\n", 27 | "NO_INSULT = 'NoInsult'\n", 28 | "INSULT = 
'Insult'\n", 29 | "\n", 30 | "# Comments preprocessing\n", 31 | "def preprocess_comment(comment):\n", 32 | " import re\n", 33 | " comment = comment.strip().strip('\"')\n", 34 | " comment = comment.replace('_', ' ')\n", 35 | " comment = comment.replace(\"\\\\\\\\\", \"\\\\\")\n", 36 | " comment = comment.replace('\\\\n', ' ')\n", 37 | " comment = comment.replace('\\\\n', ' ')\n", 38 | " comment = comment.lower()\n", 39 | " comment = re.sub(r'^https?:\\/\\/.*[\\r\\n]*', 'URL', comment, flags=re.MULTILINE)\n", 40 | " comment = comment.encode('utf-8').decode('unicode-escape')\n", 41 | " return comment\n", 42 | "\n", 43 | "# Predictions Accuracy Report\n", 44 | "def predictions_report(pred, ground_truth):\n", 45 | " print(\"Accuracy: \" + str(np.mean(pred == ground_truth)) + \"\\n\")\n", 46 | " print(metrics.classification_report(ground_truth, pred,\n", 47 | " target_names=['NoInsult', 'Insult']))\n", 48 | "\n", 49 | "# Predictions Proba Accuracy Report\n", 50 | "def predictions_report_proba(pred, ground_truth, threshold=0.5):\n", 51 | " # Extract Plain Estimations\n", 52 | "\n", 53 | " labels = []\n", 54 | " for index, prediction in enumerate(pred):\n", 55 | " if isinstance(prediction, list) or type(prediction) is np.ndarray:\n", 56 | " if len(prediction) == 1:\n", 57 | " if prediction[0] >= threshold:\n", 58 | " labels.append(1)\n", 59 | " else:\n", 60 | " labels.append(0)\n", 61 | " else:\n", 62 | " if prediction[0] > prediction[1]:\n", 63 | " labels.append(0)\n", 64 | " else:\n", 65 | " labels.append(1)\n", 66 | " else:\n", 67 | " if prediction >= threshold:\n", 68 | " labels.append(1)\n", 69 | " else:\n", 70 | " labels.append(0)\n", 71 | "\n", 72 | " labels = np.array(labels)\n", 73 | "\n", 74 | " print(\"Accuracy: \" + str(np.mean(labels==ground_truth)) + \"\\n\")\n", 75 | " print(metrics.classification_report(ground_truth, labels,\n", 76 | " target_names=['NoInsult', 'Insult']))\n", 77 | "\n", 78 | "# Load Bad Words Helper\n", 79 | "def 
load_bad_words(badwords_file):\n", 80 | " with open(badwords_file) as f:\n", 81 | " lines = f.readlines()\n", 82 | " return [badword[:-1] for badword in lines]" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# Load Resources" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 93, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | ":25: DeprecationWarning: invalid escape sequence '\\ '\n", 102 | " comment = comment.encode('utf-8').decode('unicode-escape')\n", 103 | ":25: DeprecationWarning: invalid escape sequence '\\p'\n", 104 | " comment = comment.encode('utf-8').decode('unicode-escape')\n", 105 | ":25: DeprecationWarning: invalid escape sequence '\\/'\n", 106 | " comment = comment.encode('utf-8').decode('unicode-escape')\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# Load list of badwords\n", 112 | "badwords = load_bad_words(BAD_WORDS_FILE)\n", 113 | "\n", 114 | "# Load Train Data\n", 115 | "train_df = pd.read_csv(TRAIN_FILE)\n", 116 | "\n", 117 | "# Load Test Data\n", 118 | "test_df = pd.read_csv(TEST_FILE)\n", 119 | "\n", 120 | "raw_train_comments = train_df['Comment'].apply(lambda comment: preprocess_comment(comment))\n", 121 | "raw_test_comments = test_df['Comment'].apply(lambda comment: preprocess_comment(comment))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 3, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "3947\n" 134 | ] 135 | }, 136 | { 137 | "data": { 138 | "text/html": [ 139 | "
\n", 140 | "\n", 153 | "\n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | "
InsultDateComment
0120120618192155Z\"You fuck your dad.\"
1020120528192215Z\"i really don't understand your point.\\xa0 It ...
20NaN\"A\\\\xc2\\\\xa0majority of Canadians can and has ...
30NaN\"listen if you dont wanna get married to a man...
4020120619094753Z\"C\\xe1c b\\u1ea1n xu\\u1ed1ng \\u0111\\u01b0\\u1edd...
\n", 195 | "
" 196 | ], 197 | "text/plain": [ 198 | " Insult Date Comment\n", 199 | "0 1 20120618192155Z \"You fuck your dad.\"\n", 200 | "1 0 20120528192215Z \"i really don't understand your point.\\xa0 It ...\n", 201 | "2 0 NaN \"A\\\\xc2\\\\xa0majority of Canadians can and has ...\n", 202 | "3 0 NaN \"listen if you dont wanna get married to a man...\n", 203 | "4 0 20120619094753Z \"C\\xe1c b\\u1ea1n xu\\u1ed1ng \\u0111\\u01b0\\u1edd..." 204 | ] 205 | }, 206 | "execution_count": 3, 207 | "metadata": {}, 208 | "output_type": "execute_result" 209 | } 210 | ], 211 | "source": [ 212 | "print(len(train_df.index))\n", 213 | "train_df.head()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 4, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZYAAAD4CAYAAADPccAIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAARC0lEQVR4nO3cf2xd5X3H8feXGJIWEgrNjzqkwzC5asAkaRa1hJFskCW0UBEgKU3FNrdEQoN1hSIEmZDKjz+QqWAEEFAF0eLSjsKyUCrosmYQ2g1GfoExqVNICt7SNAsL5AesLSTh2R/32HVcx9jmsa/v5f2SrHvuc+459/vNuc7H5znHjpQSkiTlcli5C5AkVReDRZKUlcEiScrKYJEkZWWwSJKyqil3AYNp7Nixqa6urtxlSFJF2bBhw86U0riBbl/VwVJXV8f69evLXYYkVZSI+K/3s71TYZKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVlV9S9IvrhtD3VLHi93GcqkvemccpcgqQ88Y5EkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWVZSLL76Y8ePH09DQ0Dn2xhtvMHfuXOrr65k7dy67du06aJt169ZRU1PD8uXLO8euueYaGhoaaGho4KGHHhqy+qUPAoNFFeXLX/4yK1euPGisqamJOXPmsHnzZubMmUNTU1PnugMHDnDNNdcwb968zrHHH3+c5557jpaWFtasWcMtt9zC3r17h6wHqdoZLKoos2fP5thjjz1o7NFHH6WxsRGAxsZGfvjDH3auu/POO1mwYAHjx4/vHGtra2P27NnU1NRw5JFHMmXKlD8IK0kDZ7Co4u3YsYPa2loAPvaxj7Fjxw4Atm3bxiOPPMKll1560OunTp3KypUr+c1vfsPOnTtZvXo1W7duHfK6pWpVM1g7joi3UkpHZdxfHfBYSqkhIqYBE1NKP861f1WHiCAiALjiiiu4+eabOeywg39+mj
dvHuvWreO0005j3LhxzJw5kxEjRpSjXKkqVeoZyzTg7HIXoeFhwoQJbN++HYDt27d3TnutX7+eRYsWUVdXx/Lly7nssss6p8muvfZaWlpaWLVqFSklPvGJT5SrfKnqDHqwRMSfR8RTEbE8In4REd+P4kfKiGiKiLaIaI2IW4qx+yNiYZft3+q2vyOAG4EvRkRLRHxxsHvQ8HbuuefS3NwMQHNzM/Pnzwfg1Vdfpb29nfb2dhYuXMjdd9/Neeedx4EDB3j99dcBaG1tpbW19aCL+5Len0GbCuvmU8DJwK+Bp4E/jYhNwPnAJ1NKKSI+0pcdpZTeiYhvADNSSl/tvj4iLgEuARgxZlym8jVcfOlLX+Kpp55i586dTJo0iRtuuIElS5Zw4YUXct9993H88cfz8MMP97qPffv2MWvWLADGjBnD9773PWpqhupbQap+Q/XdtDal9CuAiGgB6oBngd8B90XEY8BjOd4opbQMWAYwsrY+5dinho8HH3ywx/Ennnii1+3uv//+zuVRo0bR1taWsyxJXQzVNZa3uywfAGpSSvuBTwPLgc8DHfd77u+oKyIOA44YoholSRmU7eJ9RBwFHF3c2fV1YGqxqh34k2L5XODwHjZ/Exg92DVKkvqvnHeFjQYei4hW4D+AK4vxe4E/i4gXgJnA//Ww7WrgJC/eS9LwEylV72WIkbX1qbZxabnLUCbtTeeUuwTpAyEiNqSUZgx0+0r9PRZJ0jBlsEiSsjJYJElZGSySpKwMFklSVgaLJCkrg0WSlJXBIknKymCRJGVlsEiSsjJYJElZGSySpKwMFklSVgaLJCkrg0WSlJXBIknKymCRJGVlsEiSsjJYJElZGSySpKwMFklSVgaLJCkrg0WSlJXBIknKymCRJGVlsEiSsjJYJElZGSySpKwMFklSVgaLJCkrg0WSlJXBIknKymCRJGVlsEiSsjJYJElZGSySpKwMFklSVgaLJCmrmnIXMJhOOe5o1jedU+4yJOkDxTMWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZVVT7gIG04vb9lC35PFylyFJQ6q96Zyyvr9nLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiSVVo69atnHHGGZx00kmcfPLJ3H777QC0tLRw6qmnMm3aNGbMmMHatWsB2LVrF+effz5TpkwBmBwRDR37ioivR8TPI2JjRDwYEaN6e2+DRZKqUE1NDbfeeittbW08++yz3HXXXbS1tXH11Vdz3XXX0dLSwo033sjVV18NwE033cS0adNobW0FeBW4HSAijgO+BsxIKTUAI4BFvb23wSJJVai2tpbp06cDMHr0aCZPnsy2bduICPbu3QvAnj17mDhxIgBtbW2ceeaZHZv/DqiLiAnF8xrgQxFRA3wY+HVv712TuxlJ0vDS3t7O888/z2c+8xmWLl3KWWedxVVXXcW7777LM888A8DUqVNZsWIFs2bNglJ4HA9MSiltiIhbgP8Gfgv8JKX0k97e7z3PWCIiRcStXZ5fFRHXv8c210fEVe+17/6IiPsjYmGxfEVEfDjn/iWpGr311lssWLCApUuXMmbMGO655x5uu+02tm7dym233cbixYsBWLJkCbt372batGkAE4DngQMRcQwwHzgBmAgcGRF/2dt79mUq7G3ggogYO/DWsruCUqJKkg5h3759LFiwgIsuuo
gLLrgAgObm5s7lL3zhC50X78eMGcN3vvMdWlpaoHSNZRzwCvAXwKsppf9NKe0DVgCn9fa+fQmW/cAy4OvdV0REXUQ8GRGtEfFERPxRD695KiJujoi1EfFyRMwqxk8uxlqK7euL/W3ssu0fnB1FxNcopebqiFjdh/ol6QMnpcTixYuZPHkyV155Zef4xIkT+elPfwrAk08+SX19PQC7d+/mnXfe6XjZWOBnKaW9lKbATo2ID0dEAHOATb29d1+vsdwFtEbEN7uN3wk0p5SaI+Ji4A7gvJ7eJ6X06Yg4G7iOUgL+DXB7Sun7EXEEpTsNJvSw7UFSSndExJXAGSmlnd3XR8QlwCUAI8aM62N7klRdnn76aR544AFOOeWUjuktbrrpJu69914uv/xy9u/fz6hRo1i2bBkAmzZtorGxkVJ2cDRwOUBKaU1ELAeeo3Si8Tylk41D6lOwpJT2RsR3Kd1y9tsuq2YCFxTLDwDdg6fDiuJxA1BXLP8ncG1ETAJWpJQ2Fw29LymlZRRNj6ytT+97h5JUgU4//XRS6vm/wA0bNvzB2MyZM3n55ZcBiIhfppR2daxLKV1H6aSgT/pzu/FSYDFwZD+26fB28XiAIsxSSv8InEspqH4cEWdSSsOuNfX6SziSpOGnz8GSUnoDeJhSuHR4ht//osxFwL/3dX8RcSLwSkrpDuBRYAqwAxgfER+NiJHA5w+x+ZvA6L6+lyRp6PT3FyRvpXRRp8PfAV+JiFbgryjm5ProQmBjRLQADcB3izsObgTWAquAXxxi22XASi/eS9LwE4eag6sGI2vrU23j0nKXIUlDqr3pnPe1fURsSCnNGOj2/kkXSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJWBoskKSuDRZKUlcEiScrKYJEkZWWwSJKyqil3AYPplOOOZn3TOeUuQ5I+UDxjkSRlZbBIkrIyWCRJWRkskqSsDBZJUlYGiyQpK4NFkpSVwSJJyspgkSRlFSmlctcwaCLiTeClctcxSMYCO8tdxCCxt8pkb5Wpp96OTymNG+gOq/pPugAvpZRmlLuIwRAR6+2t8thbZbK3/nEqTJKUlcEiScqq2oNlWbkLGET2VpnsrTLZWz9U9cV7SdLQq/YzFknSEDNYJElZVW2wRMRnI+KliNgSEUvKXc9ARER7RLwYES0Rsb4YOzYiVkXE5uLxmGI8IuKOot/WiJhe3uoPFhHfjojXImJjl7F+9xIRjcXrN0dEYzl66e4QvV0fEduKY9cSEWd3Wff3RW8vRcRZXcaH3Wc2Ij4eEasjoi0ifh4RlxfjFX/seumt4o9dRIyKiLUR8ULR2w3F+AkRsaao86GIOKIYH1k831Ksr+uyrx577lVKqeq+gBHAL4ETgSOAF4CTyl3XAPpoB8Z2G/smsKRYXgLcXCyfDfwLEMCpwJpy19+t7tnAdGDjQHsBjgVeKR6PKZaPGaa9XQ9c1cNrTyo+jyOBE4rP6Yjh+pkFaoHpxfJo4OWih4o/dr30VvHHrvj3P6pYPhxYUxyPh4FFxfi3gEuL5cuAbxXLi4CHeuv5vd6/Ws9YPg1sSSm9klJ6B/gBML/MNeUyH2gulpuB87qMfzeVPAt8JCJqy1Bfj1JKPwPe6Dbc317OAlallN5IKe0CVgGfHfTi38MhejuU+cAPUkpvp5ReBbZQ+rwOy89sSml7Sum5YvlNYBNwHFVw7Hrp7VAq5tgV//5vFU8PL74ScCawvBjvftw6judyYE5EBIfuuVfVGizHAVu7PP8VvX9ghqsE/CQiNkTEJcXYhJTS9mL5f4AJxXIl9tzfXiqtx68W00
Hf7pgqooJ7K6ZHPkXpp9+qOnbdeoMqOHYRMSIiWoDXKAX5L4HdKaX9xUu61tnZQ7F+D/BRBthbtQZLtTg9pTQd+BzwtxExu+vKVDpXrYr7xaupl8I9wB8D04DtwK1lreZ9ioijgH8Grkgp7e26rtKPXQ+9VcWxSykdSClNAyZROsv45FC9d7UGyzbg412eTyrGKkpKaVvx+BrwCKUPx46OKa7i8bXi5ZXYc397qZgeU0o7im/sd4F7+f30QcX1FhGHU/qP9/sppRXFcFUcu556q6ZjB5BS2g2sBmZSmprs+BuRXevs7KFYfzTwOgPsrVqDZR1QX9wBcQSli1E/KnNN/RIRR0bE6I5lYB6wkVIfHXfUNAKPFss/Av66uCvnVGBPl6mK4aq/vfwrMC8ijimmJ+YVY8NOt+tb51M6dlDqbVFxF84JQD2wlmH6mS3m2e8DNqWU/qHLqoo/dofqrRqOXUSMi4iPFMsfAuZSuoa0GlhYvKz7ces4nguBJ4sz0UP13Lty3rkwmF+U7k55mdK84rXlrmcA9Z9I6W6MF4Cfd/RAad7zCWAz8G/Asen3d4HcVfT7IjCj3D106+dBStMK+yjN0y4eSC/AxZQuIG4BvlLuvnrp7YGi9tbim7O2y+uvLXp7CfjccP7MAqdTmuZqBVqKr7Or4dj10lvFHztgCvB80cNG4BvF+ImUgmEL8E/AyGJ8VPF8S7H+xPfqubcv/6SLJCmrap0KkySVicEiScrKYJEkZWWwSJKyMlgkSVkZLJKkrAwWSVJW/w/uns6xA9RWrwAAAABJRU5ErkJggg==\n", 224 | "text/plain": [ 225 | "
" 226 | ] 227 | }, 228 | "metadata": { 229 | "needs_background": "light" 230 | }, 231 | "output_type": "display_data" 232 | } 233 | ], 234 | "source": [ 235 | "# Categories Balance\n", 236 | "x = [NO_INSULT, INSULT]\n", 237 | "ni_count, icount = train_df['Insult'].value_counts()\n", 238 | "y = [ni_count, icount]\n", 239 | "plt.barh(x, y)\n", 240 | "for index, value in enumerate(y):\n", 241 | " plt.text(value, index, str(value))" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 5, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/plain": [ 252 | "0 0.734229\n", 253 | "1 0.265771\n", 254 | "Name: Insult, dtype: float64" 255 | ] 256 | }, 257 | "execution_count": 5, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "train_df['Insult'].value_counts(normalize=True)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 6, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "2647\n" 276 | ] 277 | }, 278 | { 279 | "data": { 280 | "text/html": [ 281 | "
\n", 282 | "\n", 295 | "\n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | "
InsultDateCommentUsage
00NaN\"THE DRUDGE REPORT\\\\n\\\\n\\\\n\\\\nYou won't see th...PublicTest
1020120618222256Z\"@ian21\\xa0\"Roger Clemens is the fucking man, ...PublicTest
2120120618213617Z\"Agree with Alan you are an extremest idiot. ...PublicTest
30NaN\"Really?\\\\n\\\\nI see Marc Lamont Hill on variou...PrivateTest
4020120620003825Z\"Really suck isn't the word, when many of our ...PrivateTest
\n", 343 | "
" 344 | ], 345 | "text/plain": [ 346 | " Insult Date Comment \\\n", 347 | "0 0 NaN \"THE DRUDGE REPORT\\\\n\\\\n\\\\n\\\\nYou won't see th... \n", 348 | "1 0 20120618222256Z \"@ian21\\xa0\"Roger Clemens is the fucking man, ... \n", 349 | "2 1 20120618213617Z \"Agree with Alan you are an extremest idiot. ... \n", 350 | "3 0 NaN \"Really?\\\\n\\\\nI see Marc Lamont Hill on variou... \n", 351 | "4 0 20120620003825Z \"Really suck isn't the word, when many of our ... \n", 352 | "\n", 353 | " Usage \n", 354 | "0 PublicTest \n", 355 | "1 PublicTest \n", 356 | "2 PublicTest \n", 357 | "3 PrivateTest \n", 358 | "4 PrivateTest " 359 | ] 360 | }, 361 | "execution_count": 6, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "print(len(test_df.index))\n", 368 | "test_df.head()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 7, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZYAAAD4CAYAAADPccAIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAARC0lEQVR4nO3cf5BV5X3H8fdXERrFn0ETFJMVRgsCugiNYZpg0pbEaDBFmUTGaRRlDJmmwWQyHTvOpE7qjJtQnUQrccxIDKkhKTEOTkwp1ppSW6PyYxFMAI2ixBjJjwbQRA3w9I97oFdkYRe+d8+yvF8zO3v2ufec+7nPvXs/95xzd6OUgiRJWQ6rO4AkqX+xWCRJqSwWSVIqi0WSlMpikSSlGlB3gFYaMmRIaWtrqzuGJB1Uli9f/qtSyon7u36/Lpa2tjaWLVtWdwxJOqhExHMHsr6HwiRJqSwWSVIqi0WSlMpikSSlslgkSaksFklSKotFkpTKYpEkperXfyC5+oXNtF17f90xdAA2dFxYdwRJPeQeiyQplcUiSUplsUiSUlkskqRUFoskKZXFIklKZbFIklJZLJKkVBaLJCmVxSJJSmWxSJJSWSySpFQWiyQplcUiSUplsUiSUlkskqRUFoskKZXFIklKZbFIklJZLJKkVBaLJCmVxSJJSmWxSJJSWSySpFQWiyQplcUiSUplsUiSUlkskqRUFoskKZXFIklKZbFIklJZLJKkVBaLJCmVxSJJSmWxSJJSWSySpFQWiw4Kv/3tb5k2bRojR45k1KhRPPLII6xatYqJEycyduxYpkyZwpYtWwB47LHHaG9vp729nbPPPpt777235vTSocVi0UFh9uzZnH/++axdu5ZVq1YxatQoZs6cSUdHB6tXr2bq1KnMmTMHgDFjxrBs2TI6OztZvHgxn/jEJ9i2bVvN90A6dFgs6vM2b97M0qVLueqqqwAYOHAgxx13HOvXr2fSpEkATJ48mXvuuQeAI488kgEDBgDw
6quvEhH1BJcOURaL+rxnn32WE088kRkzZjBu3DhmzpzJK6+8wujRo1m0aBEACxcuZOPGjbvWefTRRxk9ejRjx47l9ttv31U0klqvZcUSES8nb68tItZUy+0RcUHm9tV3bdu2jRUrVvDJT36SlStXctRRR9HR0cG8efOYO3cu48ePZ+vWrQwcOHDXOueeey5PPvkkjz/+ODfeeCOvvvpqjfdAOrQcrHss7YDFcogYNmwYw4YN49xzzwVg2rRprFixgpEjR7JkyRKWL1/O9OnTGTFixJvWHTVqFIMHD2bNmjW9HVs6ZLW8WCLifRHxw4j4bkSsjYi7ozroHREdEfHjiHgiIv6xGrsrIqY1rf/ybtsbCHwB+FhEdEbEx1p9H1Svt7/97Zx66qmsW7cOgAcffJAzzzyTTZs2AbBjxw5uuOEGZs2aBTQOne08Wf/cc8+xdu1a2traaskuHYp668DzOGA08HPgv4E/jYifAFOBkaWUEhHHdWdDpZTXI+LzwIRSyqd2vzwirgauBjj8mBOT4qtut956K5dddhmvv/46w4cP5+tf/zrz58/ntttuA+Diiy9mxowZADz88MN0dHRwxBFHcNhhhzF37lyGDBlSZ3zpkBKllNZsOOLlUsrgiHgfcF0pZXI1/lUa5fJtYHn19X3g+1Vp3FUtf3e37bRV42Mi4gq6KJZmg4aeXoZe/uVW3D31kg0dF9YdQTrkRMTyUsqE/V2/t86xvNa0vB0YUErZBrwL+C7wYWBxdfm2nbki4jBgIJKkg0ZtJ+8jYjBwbCnlB8BngLOrizYA46vli4Aj9rD6VuDoVmeUJPVcnZ8KOxr4fkQ8ATwMfLYa/xpwXkSsAiYCr+xh3YeAMz15L0l9T8vOsfQFnmM5+HmORep9B8s5FknSIcJikSSlslgkSaksFklSKotFkpTKYpEkpbJYJEmpLBZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlMpikSSlslgkSaksFklSKotFkpTKYpEkpbJYJEmpLBZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlMpikSSlslgkSaksFklSKotFkpTKYpEkpbJYJEmpLBZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlMpikSSlGlB3gFYae8qxLOu4sO4YknRIcY9FkpTKYpEkpbJYJEmpLBZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlMpikSSlslgkSaksFklSKotFkpTKYpEkpbJYJEmpLBZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlGpA3QFaafULm2m79v66Y0hSr9rQcWGtt+8eiyQplcUiSUplsUiSUlkskqRUFoskKZXFIklKZbFIklJZLJKkVBaLJCmVxSJJSmWxSJJSWSySpFQWiyQplcUiSUplsUiSUlkskqRUFoskKZXFIklKZbFIklJZLJKkVBaLJCmVxSJJSmWxSJJSWSySpFQWiyQplcUiSUplsUiSUlkskqRUFoskKZXFIklKZbFIklJZLJKkVBaLJCmVxSJJSmWxSJJSWSySpFQWiyT1Q1deeSUnnXQSY8aM2TW2atUqJk6cyNixY5kyZQpbtmwBYMOGDbzlLW+hvb2d9vZ2gHfsvr2IuC8i1nTnti0WSeqHrrjiChYvXvyGsZkzZ9LR0cHq1auZOnUqc+bM2XXZiBEj6OzspLOzE+D55vUi4mLg5e7etsUiSf3QpEmTOOGEE94wtn79eiZNmgTA5MmTueeee/a5nYgYDHwWuKG7t22xSNIhYvTo0SxatAiAhQsXsnHjxl2XPfvss4wbN47zzjsPYHDTav8A3AT8rru3s89iiYgSETc1/fy5iLh+H+tcHxGf626I7oiIuyJiWrV8TUQcmbl9Serv5s2bx9y5cxk/fjxbt25l4MCBAAwdOpTnn3+elStXcvPNNwMMj4hjIqIdGFFKubcntzOgG9d5Dbg4Im4spfyqh/ejVa4B/pkeNKgkHepG
jhzJkiVLgMZhsfvvvx+AQYMGMWjQIADGjx8Pjdf9M4A/ASZExAYafXFSRPywlPK+vd1Odw6FbQPuAD6z+wUR0RYR/xERT0TEgxGxp08S/DAivhgRj0XE+oh4bzU+uhrrrNY/vdremqZ137R3FBGfBk4GHoqIh7qRX5IEbNq0CYAdO3Zwww03MGvWLAB++ctfsn37dgCeeeYZgEHAM6WUr5ZSTi6ltAHvAdbvq1Sg++dYbgMui4hjdxu/FfhGKeUs4G7gli7WH1BKeReNPY2/r8ZmAV8ppbQDE4CfdSdIKeUW4OfA+0sp79/98oi4OiKWRcSy7b/b3J1NSlK/M336dCZOnMi6desYNmwYd955JwsWLOCMM85g5MiRnHzyycyYMQOApUuXctZZZ9He3s60adMAniul/GZ/b7s7h8IopWyJiPnAp4HfN100Ebi4Wv4m8KUuNvG96vtyoK1afgS4LiKGAd8rpTwVET2I3mXWO2jsYTFo6OnlgDcoSQehBQsW7HF89uzZbxq75JJLuOSSS3b9HBFveldeStkAjNl9fE968qmwLwNXAUf1YJ2dXqu+b6cqs1LKt4CLaBTVDyLiz2gcdmvO9Ef7cVuSpBp1u1iq3aJ/oVEuO/0PcGm1fBnwX93dXkQMp3EM7xZgEXAW8BKNk0NvjYhBwIe7WH0rcHR3b0uS1Ht6+ncsNwFDmn7+G2BGRDwB/BXw5n2srn0UWBMRnTR2r+aXUv4AfAF4DHgAWNvFuncAiz15L0l9T5TSf09DDBp6ehl6+ZfrjiFJvWpDx4UHtH5ELC+lTNjf9f3Le0lSKotFkpTKYpEkpbJYJEmpLBZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlMpikSSlslgkSaksFklSKotFkpTKYpEkpbJYJEmpLBZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlMpikSSlslgkSaksFklSKotFkpTKYpEkpbJYJEmpLBZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlMpikSSlslgkSaksFklSKotFkpRqQN0BWmnsKceyrOPCumNI0iHFPRZJUiqLRZKUymKRJKWyWCRJqSwWSVIqi0WSlMpikSSlslgkSaksFklSqiil1J2hZSJiK7Cu7hz7MAT4Vd0h9sGMOfp6xr6eD8yYZV8Z31lKOXF/N96v/6ULsK6UMqHuEHsTEcvMeODMeOD6ej4wY5ZWZ/RQmCQplcUiSUrV34vljroDdIMZc5jxwPX1fGDGLC3N2K9P3kuSel9/32ORJPUyi0WSlKrfFktEnB8R6yLi6Yi4tqYMp0bEQxHx44h4MiJmV+PXR8QLEdFZfV3QtM7fVZnXRcQHeynnhohYXWVZVo2dEBEPRMRT1ffjq/GIiFuqjE9ExDm9kO+Pm+aqMyK2RMQ1dc9jRMyLiE0RsaZprMfzFhGXV9d/KiIu74WMcyJibZXj3og4rhpvi4jfN83n7U3rjK+eI09X9yNanLHHj22rfue7yPedpmwbIqKzGq9rDrt6rann+VhK6XdfwOHAT4HhwEBgFXBmDTmGAudUy0cD64EzgeuBz+3h+mdWWQcBp1X34fBeyLkBGLLb2JeAa6vla4EvVssXAP8KBPBu4NEaHttfAO+sex6BScA5wJr9nTfgBOCZ6vvx1fLxLc74AWBAtfzFpoxtzdfbbTuPVbmjuh8fanHGHj22rfyd31O+3S6/Cfh8zXPY1WtNLc/H/rrH8i7g6VLKM6WU14FvAx/p7RCllBdLKSuq5a3AT4BT9rLKR4Bvl1JeK6U8CzxN477U4SPAN6rlbwB/2TQ+vzT8CDguIob2Yq4/B35aSnluL9fplXkspSwFfrOH2+7JvH0QeKCU8ptSyv8CDwDntzJjKWVJKWVb9eOPgGF720aV85hSyo9K49VnftP9aknGvejqsW3Z7/ze8lV7HR8FFuxtG70wh1291tTyfOyvxXIKsLHp55+x9xf0louINmAc8Gg1
9KlqF3Tezt1T6stdgCURsTwirq7G3lZKebFa/gXwtpoz7nQpb/wl7kvzCD2ft7rn80oa71x3Oi0iVkbEf0bEe6uxU6pcO/VWxp48tnXN43uBl0opTzWN1TqHu73W1PJ87K/F0qdExGDgHuCaUsoW4KvACKAdeJHGrnSd3lNKOQf4EPDXETGp+cLqHVbtn0uPiIHARcDCaqivzeMb9JV560pEXAdsA+6uhl4E3lFKGQd8FvhWRBxTU7w+/dg2mc4b3+jUOod7eK3ZpTefj/21WF4ATm36eVg11usi4ggaD/TdpZTvAZRSXiqlbC+l7AC+xv8fpqkldynlher7JuDeKs9LOw9xVd831Zmx8iFgRSnlpSpvn5rHSk/nrZasEXEF8GHgsuoFh+rw0q+r5eU0zlmcUeVpPlzW8oz78dj2+jxGxADgYuA7Tblrm8M9vdZQ0/OxvxbL48DpEXFa9S73UuC+3g5RHX+9E/hJKeXmpvHmcxJTgZ2fNrkPuDQiBkXEacDpNE74tTLjURFx9M5lGid211RZdn4i5HJgUVPGj1efKnk3sLlpV7vV3vDusC/NY5Oeztu/AR+IiOOrwz0fqMZaJiLOB/4WuKiU8rum8RMj4vBqeTiNeXumyrklIt5dPac/3nS/WpWxp49tHb/zfwGsLaXsOsRV1xx29VpDXc/HrE8l9LUvGp96WE/jHcN1NWV4D41dzyeAzurrAuCbwOpq/D5gaNM611WZ15H4qZG9ZBxO4xM0q4And84V8FbgQeAp4N+BE6rxAG6rMq4GJvTSXB4F/Bo4tmms1nmkUXIvAn+gcSz6qv2ZNxrnOZ6uvmb0QsanaRxH3/mcvL267iXVc6ATWAFMadrOBBov7j8F/onqv3a0MGOPH9tW/c7vKV81fhcwa7fr1jWHXb3W1PJ89F+6SJJS9ddDYZKkmlgskqRUFoskKZXFIklKZbFIklJZLJKkVBaLJCnV/wFOKL/AxNMYcAAAAABJRU5ErkJggg==\n", 379 | "text/plain": [ 380 | "
" 381 | ] 382 | }, 383 | "metadata": { 384 | "needs_background": "light" 385 | }, 386 | "output_type": "display_data" 387 | } 388 | ], 389 | "source": [ 390 | "ni_count, icount = test_df['Insult'].value_counts()\n", 391 | "y = [ni_count, icount]\n", 392 | "plt.barh(x, y)\n", 393 | "for index, value in enumerate(y):\n", 394 | " plt.text(value, index, str(value))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 8, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "0 0.738194\n", 406 | "1 0.261806\n", 407 | "Name: Insult, dtype: float64" 408 | ] 409 | }, 410 | "execution_count": 8, 411 | "metadata": {}, 412 | "output_type": "execute_result" 413 | } 414 | ], 415 | "source": [ 416 | "# Test Dataset Balance\n", 417 | "test_df['Insult'].value_counts(normalize=True)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 117, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "train_labels = train_df[\"Insult\"]\n", 427 | "train_comments = train_df[\"Comment\"]" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 10, 433 | "metadata": { 434 | "pycharm": { 435 | "name": "#%%\n" 436 | } 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "test_labels = test_df[\"Insult\"]\n", 441 | "test_comments = test_df[\"Comment\"]" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 85, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "assert len(train_labels) == len(train_comments) == 3947" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": { 456 | "pycharm": { 457 | "name": "#%% md\n" 458 | } 459 | }, 460 | "source": [ 461 | "# NLTK: Corpus Stats\n", 462 | "\n", 463 | "## Raw Corpus\n", 464 | "\n", 465 | "First, let's take a look to the Frequency Distribution and TF-IDF distribution of the training set without any preprocessing.\n", 466 | "\n", 467 | "Note: the tokenizer applied remove 
stopwords, normalize the text to lowercase and filter tokens of size < 3 characters" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 12, 473 | "metadata": { 474 | "pycharm": { 475 | "name": "#%%\n" 476 | } 477 | }, 478 | "outputs": [ 479 | { 480 | "name": "stdout", 481 | "output_type": "stream", 482 | "text": [ 483 | "Number of Words: 73786\n", 484 | "Number of unique words: 16372\n", 485 | "[ ('xa0', 1223),\n", 486 | " ('like', 718),\n", 487 | " ('xc2', 615),\n", 488 | " ('...', 437),\n", 489 | " ('people', 419),\n", 490 | " ('.\\\\\\\\', 415),\n", 491 | " ('get', 398),\n", 492 | " ('would', 371),\n", 493 | " ('one', 360),\n", 494 | " ('know', 328),\n", 495 | " ('think', 302),\n", 496 | " (\"\\\\\\\\'\", 263),\n", 497 | " ('://', 245),\n", 498 | " ('fuck', 242),\n", 499 | " ('x80', 224),\n", 500 | " ('right', 223),\n", 501 | " ('time', 219),\n", 502 | " ('xe2', 213),\n", 503 | " ('make', 211),\n", 504 | " ('good', 207)]\n" 505 | ] 506 | } 507 | ], 508 | "source": [ 509 | "from feature_extraction import corpus_stats, tf_idf_stats\n", 510 | "\n", 511 | "corpus_stats(train_comments)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 13, 517 | "metadata": {}, 518 | "outputs": [ 519 | { 520 | "name": "stdout", 521 | "output_type": "stream", 522 | "text": [ 523 | "\n", 524 | "Number of Features: 3820\n", 525 | "\n", 526 | "Scores for comment: 0\n", 527 | "dad 0.8535369792459815\n", 528 | "fuck 0.521032268731645\n", 529 | "\n", 530 | "\n", 531 | "\n", 532 | "Scores for comment: 1\n", 533 | "oranges 0.5306263581464218\n", 534 | "apples 0.5197044492665618\n", 535 | "understand 0.36515999718423015\n", 536 | "point 0.34980838586743224\n", 537 | "really 0.29286527905890025\n", 538 | "don 0.24152415100169672\n", 539 | "xa0 0.22029806147740302\n", 540 | "\n", 541 | "\n", 542 | "\n", 543 | "Scores for comment: 2\n", 544 | "xc2 0.3201457437837055\n", 545 | "inadvertently 0.29957606737309345\n", 546 | "damage 
0.2911085231656127\n", 547 | "regard 0.2911085231656127\n", 548 | "canadians 0.2841900357926315\n", 549 | "mail 0.2841900357926315\n", 550 | "perfect 0.25788744107674183\n", 551 | "daughter 0.24941989686926105\n", 552 | "proof 0.23850333373383326\n", 553 | "son 0.23003578952635248\n", 554 | "xa0the 0.22854749591213686\n", 555 | "idea 0.21950060269617008\n", 556 | "kill 0.21126373775810095\n", 557 | "sorry 0.20449981932779426\n", 558 | "wrong 0.19810115683664803\n", 559 | "like 0.11007715246411283\n", 560 | "\n", 561 | "\n", 562 | "\n", 563 | "Scores for comment: 3\n", 564 | "married 0.45173878274554974\n", 565 | "dont 0.32575508025077876\n", 566 | "wasnt 0.25906529250234656\n", 567 | "quick 0.2517427889521695\n", 568 | "people 0.23790850778601833\n", 569 | "suppose 0.23245444333275292\n", 570 | "bother 0.2289970153882341\n", 571 | "judge 0.22301409435361433\n", 572 | "wanna 0.21569159080343728\n", 573 | "listen 0.21158433539213709\n", 574 | "stay 0.19186981216287324\n", 575 | "nice 0.1782134193691978\n", 576 | "women 0.1761829647101862\n", 577 | "thought 0.17365747162008527\n", 578 | "god 0.1637644918134461\n", 579 | "let 0.1576320706809686\n", 580 | "gay 0.15546764580526237\n", 581 | "man 0.14286795196264415\n", 582 | "got 0.14193031061056438\n", 583 | "like 0.0951917486299911\n", 584 | "\n", 585 | "\n", 586 | "\n", 587 | "Scores for comment: 4\n", 588 | "u0111 0.5653840466112466\n", 589 | "xe0 0.34909770568879844\n", 590 | "u01b0 0.2826920233056233\n", 591 | "xf4ng 0.2826920233056233\n", 592 | "th 0.27597356253225125\n", 593 | "kh 0.2327318037925323\n", 594 | "ho 0.23160774973255066\n", 595 | "xf3 0.16961521398337398\n", 596 | "ch 0.16558413751935075\n", 597 | "bi 0.11636590189626615\n", 598 | "chi 0.11636590189626615\n", 599 | "cu 0.11636590189626615\n", 600 | "xe2n 0.11636590189626615\n", 601 | "xe3 0.11636590189626615\n", 602 | "xf4i 0.11636590189626615\n", 603 | "xec 0.11307680932224932\n", 604 | "ng 0.1103894250129005\n", 605 | "nh 0.1103894250129005\n", 606 
| "gi 0.058182950948133076\n", 607 | "khi 0.058182950948133076\n", 608 | "\n", 609 | "\n", 610 | "\n", 611 | "Scores for comment: 5\n", 612 | "falls 0.32265765609972136\n", 613 | "sdl 0.32265765609972136\n", 614 | "skills 0.3135377087325509\n", 615 | "productive 0.306086169165001\n", 616 | "contract 0.28951468223028054\n", 617 | "hurt 0.25897579380375485\n", 618 | "sign 0.25687942067377567\n", 619 | "chance 0.25488535126580847\n", 620 | "playing 0.25116730597949904\n", 621 | "ok 0.23641257661019505\n", 622 | "start 0.22195919655529\n", 623 | "hope 0.2034074454198284\n", 624 | "old 0.20125587941771544\n", 625 | "year 0.20125587941771544\n", 626 | "time 0.16530090849651477\n", 627 | "\n", 628 | "\n", 629 | "\n", 630 | "Scores for comment: 6\n", 631 | "yeah 1.0\n", 632 | "\n", 633 | "\n", 634 | "\n", 635 | "Scores for comment: 7\n", 636 | "burned 0.47267418513506926\n", 637 | "stake 0.4519427517610451\n", 638 | "faggot 0.3842527465020163\n", 639 | "friends 0.3564040907762901\n", 640 | "rest 0.3564040907762901\n", 641 | "shut 0.3359399641036873\n", 642 | "fuck 0.24036072748490928\n", 643 | "\n", 644 | "\n", 645 | "\n", 646 | "Scores for comment: 8\n", 647 | "extremely 0.6269501175992181\n", 648 | "fake 0.5181457444089772\n", 649 | "maybe 0.44921371996717435\n", 650 | "stupid 0.3696830688393053\n", 651 | "\n", 652 | "\n", 653 | "\n", 654 | "Scores for comment: 9\n", 655 | "health 0.6593542026161987\n", 656 | "women 0.5839627759110696\n", 657 | "idiot 0.473539345611951\n", 658 | "\n", 659 | "\n", 660 | "\n", 661 | "Scores for comment: 10\n", 662 | "injured 0.4351905204398433\n", 663 | "injury 0.4272680710265408\n", 664 | "looked 0.3940659326854057\n", 665 | "happened 0.3700095909914432\n", 666 | "doubt 0.35294134493096807\n", 667 | "wish 0.35294134493096807\n", 668 | "dont 0.29448423344607455\n", 669 | "\n", 670 | "\n", 671 | "\n", 672 | "Scores for comment: 11\n", 673 | "careful 1.0\n", 674 | "\n", 675 | "\n", 676 | "\n", 677 | "Scores for comment: 12\n", 678 | 
"attention 0.6693570693354229\n", 679 | "pay 0.5590168467992129\n", 680 | "don 0.360092546887582\n", 681 | "just 0.3313527371267268\n", 682 | "\n", 683 | "\n", 684 | "\n", 685 | "Scores for comment: 13\n", 686 | "pregnant 0.3466326928626648\n", 687 | "stream 0.3466326928626648\n", 688 | "hmmm 0.32206154212785365\n", 689 | "faced 0.3161985553309461\n", 690 | "zero 0.29839572613310034\n", 691 | "pig 0.29162740459613495\n", 692 | "hold 0.2857644177992274\n", 693 | "chance 0.2738245753982892\n", 694 | "looking 0.259652506193914\n", 695 | "women 0.2357350724372564\n", 696 | "thought 0.23235593020569015\n", 697 | "getting 0.2197246218718172\n", 698 | "\n", 699 | "\n", 700 | "\n", 701 | "Scores for comment: 14\n", 702 | "24 0.40066452096079386\n", 703 | "math 0.40066452096079386\n", 704 | "income 0.38527383519309866\n", 705 | "puts 0.3789725677032502\n", 706 | "21 0.3733358918583078\n", 707 | "huh 0.3553349840531867\n", 708 | "spending 0.3481911961678598\n", 709 | "\n", 710 | "\n", 711 | "\n", 712 | "Scores for comment: 15\n", 713 | "negro 0.5018593055816001\n", 714 | "retarded 0.41709121906139934\n", 715 | "damn 0.3970432475506194\n", 716 | "sound 0.393275414183543\n", 717 | "reason 0.38970089007152\n", 718 | "lol 0.3316571676199919\n", 719 | "\n", 720 | "\n", 721 | "\n", 722 | "Scores for comment: 16\n", 723 | "screen 0.6677578940109922\n", 724 | "racist 0.5459984232876237\n", 725 | "nyou 0.5059497176137503\n", 726 | "\n", 727 | "\n", 728 | "\n", 729 | "Scores for comment: 17\n", 730 | "cheating 0.3831105774247488\n", 731 | "fest 0.3831105774247488\n", 732 | "cheat 0.3722819228558328\n", 733 | "cup 0.35595368212943507\n", 734 | "dare 0.32979739445760264\n", 735 | "shame 0.3223168087378635\n", 736 | "comment 0.24856308385846257\n", 737 | "oh 0.24022357747847356\n", 738 | "post 0.23594792998939906\n", 739 | "lol 0.2309520966299293\n", 740 | "\n", 741 | "\n", 742 | "\n", 743 | "Scores for comment: 18\n", 744 | "dickhead 1.0\n", 745 | "\n", 746 | "\n", 747 | "\n", 748 | 
"Scores for comment: 19\n", 749 | "retard 0.6420313237061857\n", 750 | "head 0.5742606299461356\n", 751 | "post 0.5079571913793038\n", 752 | "\n", 753 | "\n" 754 | ] 755 | } 756 | ], 757 | "source": [ 758 | "tf_idf_stats(train_comments)" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": { 764 | "pycharm": { 765 | "name": "#%% md\n" 766 | } 767 | }, 768 | "source": [ 769 | "## Pre-process comments" 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": 14, 775 | "metadata": {}, 776 | "outputs": [ 777 | { 778 | "name": "stderr", 779 | "output_type": "stream", 780 | "text": [ 781 | ":25: DeprecationWarning: invalid escape sequence '\\ '\n", 782 | " comment = comment.encode('utf-8').decode('unicode-escape')\n" 783 | ] 784 | }, 785 | { 786 | "name": "stdout", 787 | "output_type": "stream", 788 | "text": [ 789 | "Number of Words: 67951\n", 790 | "Number of unique words: 15294\n", 791 | "[ ('like', 722),\n", 792 | " ('...', 552),\n", 793 | " ('people', 426),\n", 794 | " ('get', 409),\n", 795 | " ('would', 375),\n", 796 | " ('one', 371),\n", 797 | " ('know', 328),\n", 798 | " ('think', 305),\n", 799 | " ('fuck', 251),\n", 800 | " ('right', 225),\n", 801 | " ('time', 219),\n", 802 | " ('....', 212),\n", 803 | " ('make', 212),\n", 804 | " ('good', 210),\n", 805 | " ('see', 209),\n", 806 | " ('really', 195),\n", 807 | " ('back', 191),\n", 808 | " ('even', 189),\n", 809 | " ('say', 183),\n", 810 | " ('way', 182)]\n" 811 | ] 812 | } 813 | ], 814 | "source": [ 815 | "train_comments = train_comments.apply(lambda comment: preprocess_comment(comment))\n", 816 | "corpus_stats(train_comments)" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 15, 822 | "metadata": {}, 823 | "outputs": [ 824 | { 825 | "name": "stdout", 826 | "output_type": "stream", 827 | "text": [ 828 | "\n", 829 | "Number of Features: 3682\n", 830 | "\n", 831 | "Scores for comment: 0\n", 832 | "dad 0.8556854567153133\n", 833 | "fuck 
0.5174962793739737\n", 834 | "\n", 835 | "\n", 836 | "\n", 837 | "Scores for comment: 1\n", 838 | "oranges 0.5445066852844984\n", 839 | "apples 0.5332990769366469\n", 840 | "understand 0.3735486785429875\n", 841 | "point 0.35802065834828095\n", 842 | "really 0.300104678423663\n", 843 | "don 0.2476375156592591\n", 844 | "\n", 845 | "\n", 846 | "\n", 847 | "Scores for comment: 2\n", 848 | "inadvertently 0.3093882858732369\n", 849 | "regard 0.3006433984365323\n", 850 | "canadians 0.2934983051454301\n", 851 | "damage 0.2934983051454301\n", 852 | "mail 0.2874572148453\n", 853 | "perfect 0.2663342036718874\n", 854 | "daughter 0.25758931623518283\n", 855 | "majority 0.24631519548944691\n", 856 | "proof 0.24258003581733398\n", 857 | "son 0.2360332673464672\n", 858 | "idea 0.22552013195772022\n", 859 | "unless 0.22220636897468896\n", 860 | "kill 0.21723903117040777\n", 861 | "sorry 0.2104062065791996\n", 862 | "wrong 0.20458969863859225\n", 863 | "like 0.113616217558659\n", 864 | "\n", 865 | "\n", 866 | "\n", 867 | "Scores for comment: 3\n", 868 | "married 0.4526768951558829\n", 869 | "dont 0.3255604709285814\n", 870 | "wasnt 0.2596032856418905\n", 871 | "quick 0.2522655756677412\n", 872 | "people 0.2374891238829603\n", 873 | "suppose 0.23293717451824505\n", 874 | "bother 0.22947256663658896\n", 875 | "judge 0.22347722104891554\n", 876 | "wanna 0.21613951107476623\n", 877 | "listen 0.2083662535265952\n", 878 | "stay 0.19226826245944578\n", 879 | "nice 0.17789120298661099\n", 880 | "women 0.17654883860013515\n", 881 | "thought 0.17341471954148763\n", 882 | "god 0.1627802354642907\n", 883 | "let 0.1561421978664708\n", 884 | "gay 0.15509819395545876\n", 885 | "man 0.14340379433386724\n", 886 | "got 0.14222505303886993\n", 887 | "like 0.09533374315444033\n", 888 | "\n", 889 | "\n", 890 | "\n", 891 | "Scores for comment: 4\n", 892 | "không 0.7220955827639618\n", 893 | "có 0.5415716870729713\n", 894 | "khi 0.18052389569099045\n", 895 | "ta 0.18052389569099045\n", 896 | "được 
0.18052389569099045\n", 897 | "đầu 0.18052389569099045\n", 898 | "sau 0.17542137171210934\n", 899 | "2011 0.1554024188954138\n", 900 | "\n", 901 | "\n", 902 | "\n", 903 | "Scores for comment: 5\n", 904 | "falls 0.3233014303221935\n", 905 | "sdl 0.3233014303221935\n", 906 | "skills 0.314163286619327\n", 907 | "productive 0.3066968795629677\n", 908 | "contract 0.2900923288037419\n", 909 | "hurt 0.25739195259789727\n", 910 | "sign 0.25739195259789727\n", 911 | "chance 0.255393904575264\n", 912 | "playing 0.24992554554153795\n", 913 | "ok 0.23447464462090847\n", 914 | "start 0.22153987853082308\n", 915 | "hope 0.20272117443992987\n", 916 | "old 0.20165742993059105\n", 917 | "year 0.20165742993059105\n", 918 | "time 0.1654134690353827\n", 919 | "\n", 920 | "\n", 921 | "\n", 922 | "Scores for comment: 6\n", 923 | "yeah 1.0\n", 924 | "\n", 925 | "\n", 926 | "\n", 927 | "Scores for comment: 7\n", 928 | "burned 0.47334261598477556\n", 929 | "stake 0.4525818653134214\n", 930 | "faggot 0.3847961364266701\n", 931 | "friends 0.3569080986039406\n", 932 | "rest 0.3569080986039406\n", 933 | "shut 0.33378905211096066\n", 934 | "fuck 0.23846686030961795\n", 935 | "\n", 936 | "\n", 937 | "\n", 938 | "Scores for comment: 8\n", 939 | "extremely 0.6287466263952122\n", 940 | "fake 0.5196304771832563\n", 941 | "maybe 0.4445869947150357\n", 942 | "stupid 0.37014085306703653\n", 943 | "\n", 944 | "\n", 945 | "\n", 946 | "Scores for comment: 9\n", 947 | "health 0.6593542026161987\n", 948 | "women 0.5839627759110696\n", 949 | "idiot 0.473539345611951\n", 950 | "\n", 951 | "\n", 952 | "\n", 953 | "Scores for comment: 10\n", 954 | "injured 0.43529113195797065\n", 955 | "injury 0.42736685095683397\n", 956 | "looked 0.3941570366269384\n", 957 | "happened 0.37009513335719535\n", 958 | "doubt 0.35302294129590617\n", 959 | "wish 0.35302294129590617\n", 960 | "dont 0.29376629099056306\n", 961 | "\n", 962 | "\n", 963 | "\n", 964 | "Scores for comment: 11\n", 965 | "careful 1.0\n", 966 | "\n", 967 | 
"\n", 968 | "\n", 969 | "Scores for comment: 12\n", 970 | "attention 0.6701686571373742\n", 971 | "pay 0.5596946483415167\n", 972 | "don 0.36023165043797994\n", 973 | "just 0.32840376007482597\n", 974 | "\n", 975 | "\n", 976 | "\n", 977 | "Scores for comment: 13\n", 978 | "pregnant 0.34790672285668545\n", 979 | "stream 0.34790672285668545\n", 980 | "faced 0.3173607262740003\n", 981 | "hmmm 0.3173607262740003\n", 982 | "zero 0.29949246372597293\n", 983 | "pig 0.2926992655838147\n", 984 | "hold 0.2816242011779455\n", 985 | "chance 0.27483100303578734\n", 986 | "looking 0.2606068451461396\n", 987 | "women 0.23660150413603065\n", 988 | "thought 0.23240132196945798\n", 989 | "getting 0.2199243561990862\n", 990 | "\n", 991 | "\n", 992 | "\n", 993 | "Scores for comment: 14\n", 994 | "24 0.39955615860341753\n", 995 | "math 0.39955615860341753\n", 996 | "income 0.39133207419890853\n", 997 | "puts 0.37792421201776727\n", 998 | "21 0.37230312896679696\n", 999 | "huh 0.3543520172568279\n", 1000 | "spending 0.3472279913049065\n", 1001 | "\n", 1002 | "\n", 1003 | "\n", 1004 | "Scores for comment: 15\n", 1005 | "negro 0.5035626261187394\n", 1006 | "retarded 0.4185068350146881\n", 1007 | "damn 0.3964749617018476\n", 1008 | "sound 0.392793876853252\n", 1009 | "reason 0.39102354269949313\n", 1010 | "lol 0.3269524043759824\n", 1011 | "\n", 1012 | "\n", 1013 | "\n", 1014 | "Scores for comment: 16\n", 1015 | "screen 0.7753584994692642\n", 1016 | "racist 0.6315213355863531\n", 1017 | "\n", 1018 | "\n", 1019 | "\n", 1020 | "Scores for comment: 17\n", 1021 | "cheating 0.3835964665366246\n", 1022 | "fest 0.3835964665366246\n", 1023 | "cheat 0.37275407826871576\n", 1024 | "cup 0.35640512886223313\n", 1025 | "dare 0.3344434894866884\n", 1026 | "shame 0.3227255947050717\n", 1027 | "comment 0.2481249923850661\n", 1028 | "post 0.2362471765696478\n", 1029 | "oh 0.23396019930984924\n", 1030 | "lol 0.22719355337216132\n", 1031 | "\n", 1032 | "\n", 1033 | "\n", 1034 | "Scores for comment: 18\n", 
1035 | "dickhead 1.0\n", 1036 | "\n", 1037 | "\n", 1038 | "\n", 1039 | "Scores for comment: 19\n", 1040 | "retard 0.642896916296181\n", 1041 | "head 0.5726838975026152\n", 1042 | "post 0.5086420239796117\n", 1043 | "\n", 1044 | "\n" 1045 | ] 1046 | } 1047 | ], 1048 | "source": [ 1049 | "tf_idf_stats(train_comments)" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "execution_count": 16, 1055 | "metadata": { 1056 | "pycharm": { 1057 | "name": "#%%\n" 1058 | } 1059 | }, 1060 | "outputs": [ 1061 | { 1062 | "name": "stderr", 1063 | "output_type": "stream", 1064 | "text": [ 1065 | ":25: DeprecationWarning: invalid escape sequence '\\ '\n", 1066 | " comment = comment.encode('utf-8').decode('unicode-escape')\n", 1067 | ":25: DeprecationWarning: invalid escape sequence '\\p'\n", 1068 | " comment = comment.encode('utf-8').decode('unicode-escape')\n", 1069 | ":25: DeprecationWarning: invalid escape sequence '\\/'\n", 1070 | " comment = comment.encode('utf-8').decode('unicode-escape')\n" 1071 | ] 1072 | } 1073 | ], 1074 | "source": [ 1075 | "test_comments = test_comments.apply(lambda comment: preprocess_comment(comment))" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": 17, 1081 | "metadata": {}, 1082 | "outputs": [ 1083 | { 1084 | "name": "stdout", 1085 | "output_type": "stream", 1086 | "text": [ 1087 | "3947 2647\n" 1088 | ] 1089 | } 1090 | ], 1091 | "source": [ 1092 | "print(len(train_comments), len(test_comments))" 1093 | ] 1094 | }, 1095 | { 1096 | "cell_type": "markdown", 1097 | "metadata": {}, 1098 | "source": [ 1099 | "## Replace Insults\n", 1100 | "\n", 1101 | "We are going to replace any appearance of an insult with a 'fakeinsult' keyword. 
The intention of this is to concentrate the frequency of any insult into the same token" 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "code", 1106 | "execution_count": 18, 1107 | "metadata": { 1108 | "pycharm": { 1109 | "name": "#%%\n" 1110 | } 1111 | }, 1112 | "outputs": [ 1113 | { 1114 | "name": "stdout", 1115 | "output_type": "stream", 1116 | "text": [ 1117 | "Number of Words: 68944\n", 1118 | "Number of unique words: 14893\n", 1119 | "[ ('fakeinsult', 3526),\n", 1120 | " ('like', 722),\n", 1121 | " ('...', 552),\n", 1122 | " ('people', 426),\n", 1123 | " ('get', 409),\n", 1124 | " ('would', 375),\n", 1125 | " ('one', 371),\n", 1126 | " ('know', 328),\n", 1127 | " ('think', 305),\n", 1128 | " ('right', 225),\n", 1129 | " ('time', 219),\n", 1130 | " ('ing', 219),\n", 1131 | " ('....', 212),\n", 1132 | " ('make', 212),\n", 1133 | " ('good', 210),\n", 1134 | " ('see', 209),\n", 1135 | " ('really', 195),\n", 1136 | " ('back', 191),\n", 1137 | " ('even', 189),\n", 1138 | " ('say', 183)]\n" 1139 | ] 1140 | } 1141 | ], 1142 | "source": [ 1143 | "from feature_extraction import replace_badwords\n", 1144 | "\n", 1145 | "train_comments = train_comments.apply(lambda comment: replace_badwords(comment, badwords))\n", 1146 | "test_comments = test_comments.apply(lambda comment: replace_badwords(comment, badwords))\n", 1147 | "\n", 1148 | "corpus_stats(train_comments)" 1149 | ] 1150 | }, 1151 | { 1152 | "cell_type": "markdown", 1153 | "metadata": {}, 1154 | "source": [ 1155 | "**Let's check which percentage of comments of each class contains an insult**" 1156 | ] 1157 | }, 1158 | { 1159 | "cell_type": "code", 1160 | "execution_count": 19, 1161 | "metadata": { 1162 | "pycharm": { 1163 | "name": "#%%\n" 1164 | } 1165 | }, 1166 | "outputs": [ 1167 | { 1168 | "data": { 1169 | "text/plain": [ 1170 | "0 0.563824\n", 1171 | "1 0.436176\n", 1172 | "Name: Insult, dtype: float64" 1173 | ] 1174 | }, 1175 | "execution_count": 19, 1176 | "metadata": {}, 1177 | "output_type": 
"execute_result" 1178 | } 1179 | ], 1180 | "source": [ 1181 | "train_df['Comment'] = train_comments\n", 1182 | "train_df[train_df['Comment'].str.contains('fakeinsult')]['Insult'].value_counts(normalize=True)" 1183 | ] 1184 | }, 1185 | { 1186 | "cell_type": "markdown", 1187 | "metadata": {}, 1188 | "source": [] 1189 | }, 1190 | { 1191 | "cell_type": "markdown", 1192 | "metadata": {}, 1193 | "source": [ 1194 | "# A First Attempt. Naive Classifier using insult presence\n", 1195 | "\n", 1196 | "As baseline, we are going to use a naive classifier which will classify a comment as offensive if the comment contains an insult keyword" 1197 | ] 1198 | }, 1199 | { 1200 | "cell_type": "code", 1201 | "execution_count": 20, 1202 | "metadata": {}, 1203 | "outputs": [ 1204 | { 1205 | "name": "stdout", 1206 | "output_type": "stream", 1207 | "text": [ 1208 | "\n", 1209 | "Naive Model Result\n", 1210 | "\n", 1211 | "Accuracy: 0.6720816018133736\n", 1212 | "\n", 1213 | " precision recall f1-score support\n", 1214 | "\n", 1215 | " NoInsult 0.88 0.64 0.74 1954\n", 1216 | " Insult 0.43 0.76 0.55 693\n", 1217 | "\n", 1218 | " accuracy 0.67 2647\n", 1219 | " macro avg 0.65 0.70 0.65 2647\n", 1220 | "weighted avg 0.76 0.67 0.69 2647\n", 1221 | "\n" 1222 | ] 1223 | }, 1224 | { 1225 | "data": { 1226 | "image/png": "\n", 1227 | "text/plain": [ 1228 | "
" 1229 | ] 1230 | }, 1231 | "metadata": { 1232 | "needs_background": "light" 1233 | }, 1234 | "output_type": "display_data" 1235 | } 1236 | ], 1237 | "source": [ 1238 | "from sklearn.metrics import plot_confusion_matrix\n", 1239 | "from sklearn.base import is_classifier\n", 1240 | "from training import NaiveClassifier\n", 1241 | "naive_classifier = NaiveClassifier(badwords)\n", 1242 | "naive_predictions = naive_classifier.predict(test_comments)\n", 1243 | "print(\"\\nNaive Model Result\\n\")\n", 1244 | "predictions_report(naive_predictions, test_labels)\n", 1245 | "plot_confusion_matrix(naive_classifier, test_comments, test_labels, display_labels=[NO_INSULT, INSULT])\n", 1246 | "plt.show()" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "markdown", 1251 | "metadata": {}, 1252 | "source": [ 1253 | "As suspected from the outcome of the previous section, the naive classifier wrongly flags a large share (35.7%) of the NoInsult comments as insults (false positives), because many comments containing insult keywords are not really offenses directed at a particular user. We will need to address this in the following approaches." 1254 | ] 1255 | }, 1256 | { 1257 | "cell_type": "markdown", 1258 | "metadata": {}, 1259 | "source": [ 1260 | "# A Simple TF-IDF based classifier" 1261 | ] 1262 | }, 1263 | { 1264 | "cell_type": "code", 1265 | "execution_count": 21, 1266 | "metadata": {}, 1267 | "outputs": [ 1268 | { 1269 | "name": "stdout", 1270 | "output_type": "stream", 1271 | "text": [ 1272 | "\n", 1273 | "TF-IDF Model Result\n", 1274 | "\n", 1275 | "Accuracy: 0.8413298073290517\n", 1276 | "\n", 1277 | " precision recall f1-score support\n", 1278 | "\n", 1279 | " NoInsult 0.83 0.98 0.90 1954\n", 1280 | " Insult 0.90 0.44 0.59 693\n", 1281 | "\n", 1282 | " accuracy 0.84 2647\n", 1283 | " macro avg 0.87 0.71 0.75 2647\n", 1284 | "weighted avg 0.85 0.84 0.82 2647\n", 1285 | "\n" 1286 | ] 1287 | }, 1288 | { 1289 | "data": { 1290 | "image/png": "\n", 1291 | "text/plain": [ 1292 | "
" 1293 | ] 1294 | }, 1295 | "metadata": { 1296 | "needs_background": "light" 1297 | }, 1298 | "output_type": "display_data" 1299 | } 1300 | ], 1301 | "source": [ 1302 | "from training import train_tfidf\n", 1303 | "tf_idf_classifier = train_tfidf(train_comments, train_labels)\n", 1304 | "predictions = tf_idf_classifier.predict(test_comments)\n", 1305 | "print(\"\\nTF-IDF Model Result\\n\")\n", 1306 | "predictions_report(predictions, test_labels)\n", 1307 | "plot_confusion_matrix(tf_idf_classifier, test_comments, test_labels, display_labels=[NO_INSULT, INSULT])\n", 1308 | "plt.show()" 1309 | ] 1310 | }, 1311 | { 1312 | "cell_type": "markdown", 1313 | "metadata": {}, 1314 | "source": [ 1315 | "Using a simple TF-IDF based classifier (with a stochastic gradient descent estimator) improves the overall accuracy a lot and, in particular, the model seems to fix the NoInsult category prediction. But now more than 50% of the Insult comments are missed (false negatives).\n", 1316 | "\n", 1317 | "## Working with unbalanced training sets\n", 1318 | "\n", 1319 | "Following https://scikit-learn.org/stable/auto_examples/svm/plot_separating_hyperplane_unbalanced.html we are going to try to tweak the SGD classifier by assigning different weights to each class. Most of the models in scikit-learn have a `class_weight` parameter. This parameter affects the computation of the loss in linear models, or of the split criterion in tree-based models, so that misclassifications of the minority and majority classes are penalized differently." 
1320 | ] 1321 | }, 1322 | { 1323 | "cell_type": "code", 1324 | "execution_count": 22, 1325 | "metadata": {}, 1326 | "outputs": [ 1327 | { 1328 | "name": "stdout", 1329 | "output_type": "stream", 1330 | "text": [ 1331 | "\n", 1332 | "TF-IDF Model Result\n", 1333 | "\n", 1334 | "Accuracy: 0.8367963732527389\n", 1335 | "\n", 1336 | " precision recall f1-score support\n", 1337 | "\n", 1338 | " NoInsult 0.89 0.89 0.89 1954\n", 1339 | " Insult 0.69 0.70 0.69 693\n", 1340 | "\n", 1341 | " accuracy 0.84 2647\n", 1342 | " macro avg 0.79 0.79 0.79 2647\n", 1343 | "weighted avg 0.84 0.84 0.84 2647\n", 1344 | "\n" 1345 | ] 1346 | }, 1347 | { 1348 | "data": { 1349 | "image/png": "\n", 1350 | "text/plain": [ 1351 | "
" 1352 | ] 1353 | }, 1354 | "metadata": { 1355 | "needs_background": "light" 1356 | }, 1357 | "output_type": "display_data" 1358 | } 1359 | ], 1360 | "source": [ 1361 | "tf_idf_classifier = train_tfidf(train_comments, train_labels, class_weight={0: 1, 1: 2})\n", 1362 | "predictions = tf_idf_classifier.predict(test_comments)\n", 1363 | "print(\"\\nTF-IDF Model Result\\n\")\n", 1364 | "predictions_report(predictions, test_labels)\n", 1365 | "plot_confusion_matrix(tf_idf_classifier, test_comments, test_labels, display_labels=[NO_INSULT, INSULT])\n", 1366 | "plt.show()" 1367 | ] 1368 | }, 1369 | { 1370 | "cell_type": "markdown", 1371 | "metadata": {}, 1372 | "source": [ 1373 | "# A Custom Classifier\n", 1374 | "\n", 1375 | "We can continue playing with the class weights and other classifier parameters, but eventually we will realize that we need to capture more of the semantics of the problem in the form of features. We need specific features that capture when a comment is offensive towards another user.\n", 1376 | "\n", 1377 | "### A grammar to detect insults to another user" 1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "code", 1382 | "execution_count": 23, 1383 | "metadata": {}, 1384 | "outputs": [ 1385 | { 1386 | "name": "stderr", 1387 | "output_type": "stream", 1388 | "text": [ 1389 | ":1: FutureWarning: Passing a negative integer is deprecated in version 1.0 and will not be supported in future version. Instead, use None to not limit the column width.\n", 1390 | " pd.set_option('display.max_colwidth', -1)\n" 1391 | ] 1392 | }, 1393 | { 1394 | "data": { 1395 | "text/html": [ 1396 | "
\n", 1397 | "\n", 1410 | "\n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | " \n", 1445 | " \n", 1446 | " \n", 1447 | " \n", 1448 | " \n", 1449 | " \n", 1450 | " \n", 1451 | " \n", 1452 | " \n", 1453 | " \n", 1454 | " \n", 1455 | " \n", 1456 | " \n", 1457 | " \n", 1458 | " \n", 1459 | " \n", 1460 | " \n", 1461 | " \n", 1462 | " \n", 1463 | " \n", 1464 | " \n", 1465 | " \n", 1466 | " \n", 1467 | " \n", 1468 | " \n", 1469 | " \n", 1470 | " \n", 1471 | " \n", 1472 | " \n", 1473 | " \n", 1474 | " \n", 1475 | " \n", 1476 | " \n", 1477 | " \n", 1478 | " \n", 1479 | " \n", 1480 | " \n", 1481 | " \n", 1482 | " \n", 1483 | " \n", 1484 | " \n", 1485 | " \n", 1486 | " \n", 1487 | " \n", 1488 | " \n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | " \n", 1536 | " \n", 1537 | " \n", 1538 | " \n", 1539 | " \n", 1540 | " \n", 1541 | "
InsultDateComment
0120120618192155Zyou fakeinsult your dad.
71NaNshut the fakeinsult up. you and the rest of your fakeinsult got friends should be burned at the stake
8120120502173553Zeither you are fake or extremely fakeinsult ...maybe both...
9120120620160512Zthat you are an fakeinsult who understands neither taxation nor women's health.
15120120611090207Zfor some reason u sound fakeinsult ed. lol. fakeinsult . where u been negro
16120120320162532Zyou with the 'racist' screen name you are a pieceof fakeinsult ..........
18120120320075347Zyour such a fakeinsult head...
19120120320203947Zyour a fakeinsult go post your head up your #%&*
34120120515132156Zallinit123, your hypocrisy is sickening, you ignorant fakeinsult .
37120120620161958Zi can't believe the fakeinsult people on this site that have no idea what is going on.  you fakeinsult s.  you are all useful fakeinsult s.  they are hiding the truth because you fakeinsult s who protect him would look like the useful fakeinsult s you are.
38120120609163743Zfakeinsult ing behave then you fakeinsult !
41120120620075045Zis that it?? your sum total to any conversation is first abuse and then calling them a troll.. you are a typical socialist fakeinsult . keep up your non existent cl fakeinsult warfare, it is as pointless as you.
45120120619074710Zfakeinsult got
47120120529130844Zaimless abe you are a delusion fakeinsult that should be locked up inside and kept away from computers.  anyone who doubts the lunacy of abe the fakeinsult ic prog frog from nyc please read this thread and you will see abe completely come apart from the seams and go off the deep end.
51120120320131007Zeven as a troll you are a pathetic failure.
55120120502173058Zyou're a fakeinsult , truth is beyond your reach
59120120619203151Zyour anti-semitic rants are not welcomed here. you are a racist fakeinsult . fu!
61120120619033424Zgod, you're tiresome. get a life, you fakeinsult .
79120120619194023Zfact : georgia p fakeinsult ed a strict immigration policy and most of the latino farm workers left the area. vidalia georgia now has over 3000 agriculture job openings and they have been able to fill about 250 of them in past year. all you white real americans who are looking for work that the latinos stole from you..where are you ? the jobs are i vadalia just waiting for you..or maybe its the fact that you would rather collect unemployment like the rest of the tea klaners.. you scream..you complain..and you sit at home in your wife beaters and drink beer..typical real white tea klan....
80120120530000452Zof course you would bottom feeder ...
\n", 1542 | "
" 1543 | ], 1544 | "text/plain": [ 1545 | " Insult Date \\\n", 1546 | "0 1 20120618192155Z \n", 1547 | "7 1 NaN \n", 1548 | "8 1 20120502173553Z \n", 1549 | "9 1 20120620160512Z \n", 1550 | "15 1 20120611090207Z \n", 1551 | "16 1 20120320162532Z \n", 1552 | "18 1 20120320075347Z \n", 1553 | "19 1 20120320203947Z \n", 1554 | "34 1 20120515132156Z \n", 1555 | "37 1 20120620161958Z \n", 1556 | "38 1 20120609163743Z \n", 1557 | "41 1 20120620075045Z \n", 1558 | "45 1 20120619074710Z \n", 1559 | "47 1 20120529130844Z \n", 1560 | "51 1 20120320131007Z \n", 1561 | "55 1 20120502173058Z \n", 1562 | "59 1 20120619203151Z \n", 1563 | "61 1 20120619033424Z \n", 1564 | "79 1 20120619194023Z \n", 1565 | "80 1 20120530000452Z \n", 1566 | "\n", 1567 | " Comment \n", 1568 | "0 you fakeinsult your dad. \n", 1569 | "7 shut the fakeinsult up. you and the rest of your fakeinsult got friends should be burned at the stake \n", 1570 | "8 either you are fake or extremely fakeinsult ...maybe both... \n", 1571 | "9 that you are an fakeinsult who understands neither taxation nor women's health. \n", 1572 | "15 for some reason u sound fakeinsult ed. lol. fakeinsult . where u been negro \n", 1573 | "16 you with the 'racist' screen name you are a pieceof fakeinsult .......... \n", 1574 | "18 your such a fakeinsult head... \n", 1575 | "19 your a fakeinsult go post your head up your #%&* \n", 1576 | "34 allinit123, your hypocrisy is sickening, you ignorant fakeinsult . \n", 1577 | "37 i can't believe the fakeinsult people on this site that have no idea what is going on.  you fakeinsult s.  you are all useful fakeinsult s.  they are hiding the truth because you fakeinsult s who protect him would look like the useful fakeinsult s you are. \n", 1578 | "38 fakeinsult ing behave then you fakeinsult ! \n", 1579 | "41 is that it?? your sum total to any conversation is first abuse and then calling them a troll.. you are a typical socialist fakeinsult . 
keep up your non existent cl fakeinsult warfare, it is as pointless as you. \n", 1580 | "45 fakeinsult got \n", 1581 | "47 aimless abe you are a delusion fakeinsult that should be locked up inside and kept away from computers.  anyone who doubts the lunacy of abe the fakeinsult ic prog frog from nyc please read this thread and you will see abe completely come apart from the seams and go off the deep end. \n", 1582 | "51 even as a troll you are a pathetic failure. \n", 1583 | "55 you're a fakeinsult , truth is beyond your reach \n", 1584 | "59 your anti-semitic rants are not welcomed here. you are a racist fakeinsult . fu! \n", 1585 | "61 god, you're tiresome. get a life, you fakeinsult . \n", 1586 | "79 fact : georgia p fakeinsult ed a strict immigration policy and most of the latino farm workers left the area. vidalia georgia now has over 3000 agriculture job openings and they have been able to fill about 250 of them in past year. all you white real americans who are looking for work that the latinos stole from you..where are you ? the jobs are i vadalia just waiting for you..or maybe its the fact that you would rather collect unemployment like the rest of the tea klaners.. you scream..you complain..and you sit at home in your wife beaters and drink beer..typical real white tea klan.... \n", 1587 | "80 of course you would bottom feeder ... 
" 1588 | ] 1589 | }, 1590 | "execution_count": 23, 1591 | "metadata": {}, 1592 | "output_type": "execute_result" 1593 | } 1594 | ], 1595 | "source": [ 1596 | "pd.set_option('display.max_colwidth', -1)\n", 1597 | "train_df[train_df['Insult'] == 1].head(20)" 1598 | ] 1599 | }, 1600 | { 1601 | "cell_type": "code", 1602 | "execution_count": 24, 1603 | "metadata": {}, 1604 | "outputs": [ 1605 | { 1606 | "name": "stdout", 1607 | "output_type": "stream", 1608 | "text": [ 1609 | "7732708342930407699 insult1 1 7 you are fake or extremely fakeinsult\n", 1610 | "15011685821042383993 insult2 0 5 your such a fakeinsult\n" 1611 | ] 1612 | } 1613 | ], 1614 | "source": [ 1615 | "import spacy\n", 1616 | "from spacy.matcher import Matcher\n", 1617 | "\n", 1618 | "nlp = spacy.load(\"en_core_web_sm\")\n", 1619 | "matcher = Matcher(nlp.vocab)\n", 1620 | "\n", 1621 | "pattern1 = [{\"LEMMA\": \"-PRON-\", \"LOWER\": {\"IN\": [\"you\", \"your\"]}},\n", 1622 | " {\"LEMMA\": {\"IN\": [\"be\", \"sound\"]}},{\"OP\": \"*\", \"LENGTH\": {\"<=\": 10}},\n", 1623 | " {\"LOWER\": \"fakeinsult\"}]\n", 1624 | "matcher.add(\"insult1\", None, pattern1)\n", 1625 | "\n", 1626 | "pattern2 = [{\"LEMMA\": \"-PRON-\", \"LOWER\": {\"IN\": [\"you\", \"your\"]}},\n", 1627 | " {\"OP\": \"*\", \"LENGTH\": {\"<=\": 4}},\n", 1628 | " {\"LOWER\": \"fakeinsult\"}]\n", 1629 | "matcher.add(\"insult2\", None, pattern2)\n", 1630 | "\n", 1631 | "doc = nlp(\"Either you are fake or extremely fakeinsult...maybe both...\")\n", 1632 | "matches = matcher(doc)\n", 1633 | "for match_id, start, end in matches:\n", 1634 | " string_id = nlp.vocab.strings[match_id] # Get string representation\n", 1635 | " span = doc[start:end] # The matched span\n", 1636 | " print(match_id, string_id, start, end, span.text)\n", 1637 | " \n", 1638 | "doc = nlp(\"your such a fakeinsult head...\")\n", 1639 | "matches = matcher(doc)\n", 1640 | "for match_id, start, end in matches:\n", 1641 | " string_id = nlp.vocab.strings[match_id] # Get string 
representation\n", 1642 | " span = doc[start:end] # The matched span\n", 1643 | " print(match_id, string_id, start, end, span.text)\n" 1644 | ] 1645 | }, 1646 | { 1647 | "cell_type": "code", 1648 | "execution_count": 25, 1649 | "metadata": {}, 1650 | "outputs": [ 1651 | { 1652 | "name": "stdout", 1653 | "output_type": "stream", 1654 | "text": [ 1655 | "Total offensive comments 1049\n", 1656 | "Total with pattern 463\n" 1657 | ] 1658 | } 1659 | ], 1660 | "source": [ 1661 | "insults = train_df[train_df['Insult'] == 1]['Comment']\n", 1662 | "total_matches = 0\n", 1663 | "for insult in insults:\n", 1664 | " doc = nlp(insult)\n", 1665 | " matches = matcher(doc)\n", 1666 | " if len(matches) > 0:\n", 1667 | " total_matches += 1\n", 1668 | " \n", 1669 | "print(\"Total offensive comments %d\" % len(insults))\n", 1670 | "print(\"Total with pattern %d\" % total_matches)" 1671 | ] 1672 | }, 1673 | { 1674 | "cell_type": "markdown", 1675 | "metadata": {}, 1676 | "source": [ 1677 | "### Custom Features\n", 1678 | "\n", 1679 | "- **n_words**: total number of words (tokens) in the comment\n", 1680 | "- **n_chars**: total number of characters in the comment\n", 1681 | "- **n_dwords**: total number of words in the comment that appear in an English dictionary\n", 1682 | "- **you_re**: number of insult-pattern matches found in the comment\n", 1683 | "- **!**: number of exclamation marks\n", 1684 | "- **allcaps**: number of uppercase characters\n", 1685 | "- **@**: number of 'addressing' symbols\n", 1686 | "- **bad_ratio**: ratio of insults used in the comment\n", 1687 | "- **n_bad**: number of insults in the comment\n", 1688 | "- **capsratio**: ratio of uppercase characters\n", 1689 | "- **dicratio**: ratio of dictionary words in the comment\n", 1690 | "- **sent**: lexicon sentiment score\n" 1691 | ] 1692 | }, 1693 | { 1694 | "cell_type": "code", 1695 | "execution_count": 26, 1696 | "metadata": {}, 1697 | "outputs": [ 1698 | { 1699 | "name": "stdout", 1700 | "output_type": 
"stream", 1701 | "text": [ 1702 | "\n", 1703 | "Custom Model Result\n", 1704 | "\n", 1705 | "Accuracy: 0.8160181337363053\n", 1706 | "\n", 1707 | " precision recall f1-score support\n", 1708 | "\n", 1709 | " NoInsult 0.82 0.95 0.88 1954\n", 1710 | " Insult 0.77 0.43 0.55 693\n", 1711 | "\n", 1712 | " accuracy 0.82 2647\n", 1713 | " macro avg 0.80 0.69 0.72 2647\n", 1714 | "weighted avg 0.81 0.82 0.80 2647\n", 1715 | "\n" 1716 | ] 1717 | }, 1718 | { 1719 | "data": { 1720 | "image/png": "\n", 1721 | "text/plain": [ 1722 | "
" 1723 | ] 1724 | }, 1725 | "metadata": { 1726 | "needs_background": "light" 1727 | }, 1728 | "output_type": "display_data" 1729 | } 1730 | ], 1731 | "source": [ 1732 | "from training import train_custom\n", 1733 | "custom_classifier = train_custom(train_comments, train_labels)\n", 1734 | "predictions = custom_classifier.predict(test_comments)\n", 1735 | "print(\"\\nCustom Model Result\\n\")\n", 1736 | "predictions_report(predictions, test_labels)\n", 1737 | "plot_confusion_matrix(custom_classifier, test_comments, test_labels, display_labels=[NO_INSULT, INSULT])\n", 1738 | "plt.show()" 1739 | ] 1740 | }, 1741 | { 1742 | "cell_type": "markdown", 1743 | "metadata": {}, 1744 | "source": [ 1745 | "# Ensemble Voting Classifier\n", 1746 | "\n", 1747 | "The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator.\n", 1748 | "\n", 1749 | "We are going to combine the TF-IDF classifier and the custom classifier using a majority vote classifier. Most ensemble methods build several instances of a black-box estimator on random subsets of the original training set and then aggregate their individual predictions to form a final prediction. In this case, we need to combine estimators of a different nature. Sklearn [Voting classifier](https://scikit-learn.org/stable/modules/ensemble.html#voting-classifier) can help in this situation. The idea behind the VotingClassifier is to combine conceptually different machine learning classifiers and use a majority vote or the average predicted probabilities (soft vote) to predict the class labels. Such a classifier can be useful for a set of equally well-performing models in order to balance out their individual weaknesses." 
1750 | ] 1751 | }, 1752 | { 1753 | "cell_type": "code", 1754 | "execution_count": 27, 1755 | "metadata": {}, 1756 | "outputs": [ 1757 | { 1758 | "name": "stdout", 1759 | "output_type": "stream", 1760 | "text": [ 1761 | "\n", 1762 | "Majority Vote Model Result\n", 1763 | "\n", 1764 | "Accuracy: 0.8209293539856442\n", 1765 | "\n", 1766 | " precision recall f1-score support\n", 1767 | "\n", 1768 | " NoInsult 0.84 0.94 0.89 1954\n", 1769 | " Insult 0.73 0.50 0.59 693\n", 1770 | "\n", 1771 | " accuracy 0.82 2647\n", 1772 | " macro avg 0.79 0.72 0.74 2647\n", 1773 | "weighted avg 0.81 0.82 0.81 2647\n", 1774 | "\n" 1775 | ] 1776 | }, 1777 | { 1778 | "data": { 1779 | "image/png": "\n", 1780 | "text/plain": [ 1781 | "
" 1782 | ] 1783 | }, 1784 | "metadata": { 1785 | "needs_background": "light" 1786 | }, 1787 | "output_type": "display_data" 1788 | } 1789 | ], 1790 | "source": [ 1791 | "from training import train_assembling_voting\n", 1792 | "voting_classifier = train_assembling_voting(train_comments, train_labels)\n", 1793 | "predictions = voting_classifier.predict_proba(test_comments)\n", 1794 | "print(\"\\nMajority Vote Model Result\\n\")\n", 1795 | "predictions_report_proba(predictions, test_labels)\n", 1796 | "plot_confusion_matrix(voting_classifier, test_comments, test_labels, display_labels=[NO_INSULT, INSULT])\n", 1797 | "plt.show()" 1798 | ] 1799 | }, 1800 | { 1801 | "cell_type": "markdown", 1802 | "metadata": {}, 1803 | "source": [ 1804 | "# Feature Union\n", 1805 | "\n", 1806 | "To finish with the classic estimator-based solutions, we are going to make a final attempt by merging the features of both classifiers using sklearn's [FeatureUnion](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html). This estimator applies a list of transformer objects in parallel to the input data, then concatenates the results. This is useful to combine several feature extraction mechanisms into a single transformer." 
1807 | ] 1808 | }, 1809 | { 1810 | "cell_type": "code", 1811 | "execution_count": 28, 1812 | "metadata": {}, 1813 | "outputs": [ 1814 | { 1815 | "name": "stdout", 1816 | "output_type": "stream", 1817 | "text": [ 1818 | "\n", 1819 | "Feature Union Model Result\n", 1820 | "\n", 1821 | "Accuracy: 0.8466188137514167\n", 1822 | "\n", 1823 | " precision recall f1-score support\n", 1824 | "\n", 1825 | " NoInsult 0.86 0.95 0.90 1954\n", 1826 | " Insult 0.79 0.56 0.66 693\n", 1827 | "\n", 1828 | " accuracy 0.85 2647\n", 1829 | " macro avg 0.83 0.75 0.78 2647\n", 1830 | "weighted avg 0.84 0.85 0.84 2647\n", 1831 | "\n" 1832 | ] 1833 | }, 1834 | { 1835 | "data": { 1836 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWEAAAEGCAYAAAC0DiQ1AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAlOUlEQVR4nO3de5xXVb3/8debm3IXRDkIKGh0UVMzQtEyb8fbsTBPx8sxI7XQ8m6e0vodMc2y0lTM7JByvGReq5N5OKJ4Sa0QURERRchL3BRxEBVBYObz+2Ov0S/DzHy/M3yHPRveTx/7MXuvfVt7vvj5rllr7bUUEZiZWT465J0BM7NNmYOwmVmOHITNzHLkIGxmliMHYTOzHHXKOwPtUb++HWPI4M55Z8Na4MUZ3fLOgrXQOyxdEhFbrc81Dt6ve7xZU1v2uCdnvD8pIg5Zn3u1FQfhRgwZ3JmpkwbnnQ1rgYO32S3vLFgLTY67Xl3fa7xZU8vUSduWPa7jgDn91vdebcVB2MwKK4A66vLOxnpxEDazwgqC1VG+OqI9c8OcmRVaXQX/VULSBEmLJc0sSdtN0hRJ0yVNkzQipUvSOElzJc2QtHvJOaMlzUnL6HL3dRA2s8IKgtoov1ToBqBh491PgR9ExG7ABWkb4FBgWFrGANcCSOoLjAX2AEYAYyX1ae6mDsJmVmh1RNmlEhHxCFDTMBnoldZ7AwvT+ijgpshMAbaQNAA4GLg/ImoiYilwP+sG9rW4TtjMCiuA2sqCbD9J00q2x0fE+ArOOwuYJOkyskLrXil9IDCv5Lj5Ka2p9CY5CJtZoVVY0l0SEcNbcflvAmdHxO8kHQVcDxzYius0ydURZlZYAayOKLush9HA79P6nWT1vAALgNKXCQaltKbSm+QgbGaFFQS1FSzrYSHw+bS+PzAnrd8NfDX1ktgTWBYRi4BJwEGS+qQGuYNSWpNcHWFmxRVQW6V5KSTdCuxLVn88n6yXwzeAqyR1AlaS9YQAmAgcBswF3gNOAIiIGkkXA0+k4y6KiIaNfWtxEDazwsremKvStSKObWLXpxs5NoBTm7jOBGBCpfd1EDazAhO1KO9MrBcHYTMrrKxhzkHYzCwXWT9hB2Ezs9zUuSRsZpYPl4TNzHIUiNqCv+7gIGxmhebqCDOznARiVXTMOxvrxUHYzAore1nD1RFmZrlxw5yZWU4iRG24JGxmlps6l4TNzPKRNcwVO4wVO/dmtklzw5yZWc5q3U/YzCwfG8Mbc8XOvZlt8uqiQ9mlEpImSFosaWaD9NMlvSDpOUk/LUk/X9JcSbMlHVySfkhKm
yvpvHL3dUnYzAorG8CnamXJG4BfADfVJ0jaDxgF7BoR70vaOqXvCBwD7ARsA0yW9NF02jXAP5NNd/+EpLsjYlZTN3UQNrPCCsTqKr22HBGPSBrSIPmbwKUR8X46ZnFKHwXcltJfljSXD2dinhsRLwFIui0d22QQdnWEmRVWBNRGh7IL2eSd00qWMeWunXwU+JykxyX9WdJnUvpAYF7JcfNTWlPpTXJJ2MwKTJW+rLEkIoa34gadgL7AnsBngDskbd+K6zR7AzOzQgpo69eW5wO/T7MrT5VUB/QDFgCDS44blNJoJr1Rro4ws0KrpUPZZT38D7AfQGp46wIsAe4GjpG0maShwDBgKvAEMEzSUEldyBrv7m7uBi4Jm1lhBaraoO6SbgX2Jas/ng+MBSYAE1K3tVXA6FQqfk7SHWQNbmuAUyOiNl3nNGAS0BGYEBHPNXdfB2EzK6xsyvvqhLGIOLaJXV9p4vhLgEsaSZ8ITKz0vg7CZlZg8njCZmZ5Caj4jbj2ykHYzArNJWEzs5xEyCVhM7O8ZA1znm3ZzCwnnmPOzCw3WcOc64TNzHJT9EHdHYTNrLCq+cZcXhyEzazQPNGnmVlOImB1nYOwmVkusuoIB2Ezs9z4jTnL1eVnD+bxyb3Yot8axj80G4C/z+zKuPMGsWplBzp2Ck778Xw+/qn3eOavPbjwhKH80+BVAOx92Ft85ZzXWbygMz87c1veeqMzKDjsK2/ypa8vyfOxNhnn/Pwf7HHgO7y1pBMn7/8xAHpusYbv/epV+g9axevzu3DJydvx7rJO7PelpRx16mIkWLG8A1efN4iXZnXN+QnytTF0UWuzcrykkHR5yfa5ki4sc86Fks6tcj5ukPTltH6WpG7VvH7eDjq6hktueWmttOt+OICvnPMa106ezVf/YxHX/3CbD/btvMe7XDt5NtdOns1XznkdgI6dgjEXLOTXf36Bq+6Zw59u6MerL262QZ9jU3Xf7X35/nFD10o76rTFPP1YD0787Cd4+rEeHH1aNrfk6/O68B//ugOnHPAxbrmiP2f+dH4eWW5nVLUp7/PSlrl7HzhSUr82vEdLnQVsVEH4k3sup2ef2rXSJFj+TvYq5/K3O9K3/+pmr7Fl/zUM22UFAN161DH4I++zZFHntsmwrWXm4z14Z+naf5COPPhtJt/RF4DJd/Rl5CFvAzBrWnfeXZYd+8JT3eg3YNWGzWw7VZfmmWtuqYSkCZIWpwHcG+77dipY9kvbkjRO0lxJMyTtXnLsaElz0jK63H3bMgivAcYDZzfcIWmIpAdT5h+QtG0jxzws6SeSpkp6UdLnUvpOKW16On9Yut7MknPXKXVLOgPYBnhI0kNVftZ25ZSLFnDdxdtw3Kd35NcXb8OJ31v4wb7nn+zOKQd+jO8ftz2vzN58nXNfm9eFv8/sysd3f29DZtlK9Om3mprF2ZdgzeJO9Om37pfoIcfW8MRDvTZ01tqdrHdEx7JLhW4ADmmYKGkwcBDwj5LkQ8mmNBoGjAGuTcf2JZuRYw9gBDBWUp/mbtrW5fRrgOMk9W6QfjVwY0TsAtwCjGvi/E4RMYKsBDs2pZ0CXBURuwHDySbiKysixgELgf0iYr+G+yWNqZ8O+403a9e9QIHcc2M/Tv7BAm55chYnX7iQn5+Tfcd95JPvcfPUWfxq8mxGnfgGPzhx7T+DVyzvwMVfH8IpFy2ge8+6PLJu6xDRoM5z173e5eBja7j+kgE55an9qH9Zo9xS0bUiHgFqGtl1BfAdsiroeqOAmyIzBdhC0gDgYOD+iKiJiKXA/TQS2Eu1aRCOiLeBm4AzGuwaCfw2rd8MfLaJS/w+/XwSGJLW/wZ8T9J3ge0iYkWV8jo+IoZHxPCttiz2qEz339mXzx62DIB9vvAWL07PamC696yja/csuI444B1qV4tlb2bPumY1XPz1Iex/5NIPzrV8LF3Smb5bZ6Xfvluv5q03P6yuGPqJFZx12TwuPGHoOtUYm6pqV
Uc0RtIoYEFEPNNg10BgXsn2/JTWVHqTNkSN9ZXASUD3Vpz7fvpZS+rJERG/Bb4IrAAmStqfrOqj9FnW/Tt7E7Jl/9XM+FsPAKY/1oNthma/xprFnYj0Xf7C092oq4NefWuJgJ9/e1sGD3uffz35jbyybcmU+3px4FFZgezAo2r426Ss2mGrgau44LpX+NkZ27LgJTecwoe9IyooCfer/0s3LWPKXTs14n8PuKAtn6HNv0ojoibNSnoS2cylAH8lmwr6ZuA44NFKrydpe+CliBiX6pJ3SedvLWlL4F3gcODeRk5/B+hJNmX1RuHH39yOGX/rwbKaThz36R05/tuvcdbP5nHtBQOprRVdNqvjrJ9lX8yP3rMF99y0JR07wWab13H+ta8gwczHu/PAXX0Z+okVfPPArJvUCecvZMQB7+T5aJuE8375KruMfJfefdfwm2mzuPny/tz+i635/q9e5ZBjali8IOuiBnDc2a/Ts08tp/04q4GrXSNOP/SjeWa/Xaiw98OSiBjewkvvAAwFnpEEMAh4StIIYAEwuOTYQSltAdmMzaXpDzd3kw3198zlwGkl26cD/y3pP4A3gBNacK2jgOMlrQZeA34UEaslXQRMJfslvNDEueOBeyUtbKxeuIjOv/bVRtOvmfTiOmmjTlzCqBPX/f7ZeY/lTFo4vdpZswpc+q3tGk0/7+gd1km78tzBXHnu4EaO3nRFiDVt1AUtIp4Ftq7flvQKMDwilki6GzhN0m1kjXDLImKRpEnAj0oa4w4Czm/uPm0WhCOiR8n665R0DYuIV4H9GznnwpL1fUvWl5DqhCPiUuDSRs4dRyMNfBHxtZL1q8kaBc1sI1GtlzUk3UpWiu0naT4wNiKub+LwicBhwFzgPVJBMv3lfzHwRDruoohorLHvA67ZN7PCquYbcxFxbJn9Q0rWAzi1ieMm8GHVa1kOwmZWaEV/bdlB2MwKy4O6m5nlbH36AbcHDsJmVlgRsMaDupuZ5cfVEWZmOXGdsJlZzhoOcFQ0DsJmVmhumDMzy0mE64TNzHIkat07wswsP64TNjPLycYw27KDsJkVV/DBRAVF5SBsZoXm3hFmZjkJN8yZmeWr6NURxf4KMbNNXoTKLpWQNEHSYkkzS9J+JukFSTMk/UHSFiX7zpc0V9JsSQeXpB+S0uZKOq/cfR2EzaywIqoXhIEbgEMapN0P7BwRuwAvkuaLk7Qj2WTFO6Vzfimpo6SOwDXAocCOwLHp2CY5CJtZoVU45X1ZEfEIUNMg7b6IWJM2p5DNngwwCrgtIt6PiJfJ5pobkZa5EfFSRKwCbkvHNslB2MwKLaL8QjZ557SSZUwrbnUi8H9pfSAwr2Tf/JTWVHqT3DBnZoUViLrKekcsiYjhrb2PpO8Da4BbWnuNpjgIm1mhtXXnCElfAw4HDkizLAMsAAaXHDYopdFMeqNcHWFmxVXdhrl1SDoE+A7wxYh4r2TX3cAxkjaTNBQYBkwFngCGSRoqqQtZ493dzd3DJWEzK7YqFYUl3QrsS1Z/PB8YS9YbYjPgfkkAUyLilIh4TtIdwCyyaopTI6I2Xec0YBLQEZgQEc81d18HYTMrtGqNohYRxzaSfH0zx18CXNJI+kRgYqX3bTIIS7qaZr5jIuKMSm9iZtYWAqir23jHjpi2wXJhZtYaAWysQ1lGxI2l25K6NaiYNjPL3UY/doSkkZJmAS+k7V0l/bLNc2ZmVomoYGnHKumidiVwMPAmQEQ8A+zThnkyM6tQ+e5p7X36o4p6R0TEvNQ9o15t22THzKyF2nlJt5xKgvA8SXsBIakzcCbwfNtmy8ysAgFR8N4RlVRHnAKcSjYIxUJgt7RtZtYOqIKl/SpbEo6IJcBxGyAvZmYtV/DqiEp6R2wv6U+S3kijzv9R0vYbInNmZmVtAr0jfgvcAQwAtgHuBG5ty0yZmVWk/mWNcks7VkkQ7hYRN0fEmrT8Bti8rTNmZlaJCgd1b7eaGzuib1r9vzRZ3
W1k3ztH04LBKczM2lTBe0c01zD3JFnQrX/Ck0v2BWnCOzOzPKmdl3TLaW7siKEbMiNmZi1WgIa3cip6Y07SzmTTN39QFxwRN7VVpszMKtP+G97KqaSL2ljg6rTsB/wU+GIb58vMrDJV6qImaULqhjuzJK2vpPslzUk/+6R0SRonaa6kGZJ2LzlndDp+jqTR5e5bSe+ILwMHAK9FxAnArkDvyh7LzKyN1VWwVOYG4JAGaecBD0TEMOCBtA1wKNm8csOAMcC18EGHhrHAHsAIYGx94G5KJUF4RUTUAWsk9QIWs/ZsomZm+ahiP+GIeASoaZA8CqgfW/1G4IiS9JsiMwXYQtIAshEn74+ImohYCtzPuoF9LZXUCU+TtAXwa7IeE+8Cf6vgPDOzNldh74h+kkpnCxofEeMrOK9/RCxK668B/dP6QGBeyXHzU1pT6U2qZOyIb6XVX0m6F+gVETPK593MbAOoLAgviYjh63WbiJCq3yGuuZc1dm9uX0Q8Ve3MmJm1M69LGhARi1J1w+KUvoC1q2UHpbQFwL4N0h9u7gbNlYQvb2ZfAPs3d+Eim/NiHw7b78t5Z8NaYkTXvHNgLfX4XVW5TBu/rHE3MBq4NP38Y0n6aZJuI2uEW5YC9STgRyWNcQdR5sW25l7W2G89M29m1raCqr22LOlWslJsP0nzyXo5XArcIekk4FXgqHT4ROAwYC7wHnACQETUSLoYeCIdd1FENGzsW0tFL2uYmbVbVSoJR8SxTew6oJFjgyYmt4iICcCESu/rIGxmhbbRjh1hZlYIBQ/Clby2LElfkXRB2t5W0oi2z5qZWQU2gZk1fgmMBOrrS94BrmmzHJmZVUhR2dKeVVIdsUdE7C7paYCIWCqpSxvny8ysMhvxoO71VkvqSCrUS9qKlgyJYWbWhtp7SbecSqojxgF/ALaWdAnwGPCjNs2VmVmlCl4nXMnYEbdIepKsr5yAIyLi+TbPmZlZOQWo8y2nbBCWtC3ZGyF/Kk2LiH+0ZcbMzCqysQdh4H/5cMLPzYGhwGxgpzbMl5lZRVTwFqpKqiM+WbqdRlf7VhOHm5lZC7T4jbmIeErSHm2RGTOzFtvYqyMknVOy2QHYHVjYZjkyM6vUptAwB/QsWV9DVkf8u7bJjplZC23MQTi9pNEzIs7dQPkxM2uZjTUIS+oUEWsk7b0hM2RmVilR/N4Rzb0xNzX9nC7pbknHSzqyftkQmTMza1YVB/CRdLak5yTNlHSrpM0lDZX0uKS5km6vHzdH0mZpe27aP6S1j1DJa8ubA2+SzSl3OPCF9NPMLH9VeG1Z0kDgDGB4ROwMdASOAX4CXBERHwGWAielU04Clqb0K9JxrdJcEN469YyYCTybfj6Xfs5s7Q3NzKqqemNHdAK6SuoEdAMWkRU+62ckvRE4Iq2PStuk/QdIatVwbs01zHUEepBVuzRU8KpwM9tYVFjd0E/StJLt8RExvn4jIhZIugz4B7ACuA94EngrItakw+YDA9P6QGBeOneNpGXAlsCSlua/uSC8KCIuaukFzcw2qMqC8JKIGN7UzjRF/SiyYRneAu4EDqlC7spqrjqi2CMlm9nGL7LeEeWWChwIvBwRb0TEauD3wN7AFql6AmAQsCCtLwAGQ9aTDOhN1nbWYs0F4XWmeTYza3eqUyf8D2BPSd1S3e4BwCzgIeDL6ZjRwB/T+t1pm7T/wYhoVTVtk9UREVHTmguamW1I1XhtOSIel3QX8BTZm8FPA+PJ3hC+TdIPU9r16ZTrgZslzQVqyHpStIqnvDezYqtSN4GIGAuMbZD8ErDO7PIRsRL4t2rc10HYzIqrANMXleMgbGaFJTaNUdTMzNotB2Ezszw5CJuZ5chB2MwsJ5vIzBpmZu2Xg7CZWX6KPqi7g7CZFZqrI8zM8uKXNczMcuYgbGaWD78xZ2aWM9UVOwo7CJtZcblO2MwsX66OMDPLU8GDcHPTG5mZtXuK8ktF15G2kHSXpBckPS9ppKS+k
u6XNCf97JOOlaRxkuZKmiFp99bm30HYzIqtOnPMAVwF3BsRHwd2BZ4HzgMeiIhhwANpG+BQYFhaxgDXtjb7DsJmVlxVmm1ZUm9gH9IcchGxKiLeAkYBN6bDbgSOSOujgJsiM4VsVuYBrXkEB2EzK6z6fsIVVEf0kzStZBnT4FJDgTeA/5b0tKTrJHUH+kfEonTMa0D/tD4QmFdy/vyU1mJumDOzYqtspvklETG8mf2dgN2B09PMy1fxYdVDuk2EVP2+GC4Jm1mhValhbj4wPyIeT9t3kQXl1+urGdLPxWn/AmBwyfmDUlqLuSS8EencuZafXvVnOnepo2PHOh778yBuuWFH+v/Tcs674HF69lrF3Bf7cNmPPsOaNR9+/+69zwK+/4MpnHny/sx5sU+OT7Dp6dy5lssvuY/OnWvp2DF49K/bcvNtu7LbLov4+uin6NABVqzoxOXj9mLhaz3Zeqt3Oef0KfTutZJ33u3CT6/YmyVvds/7MfJTpZc1IuI1SfMkfSwiZgMHALPSMhq4NP38YzrlbuA0SbcBewDLSqotWqTdBWFJ70ZEjypebwhwT0TsLGk3YJuImFit67cnq1d34Pxz9mHlyk507FjHZVc/zLTH+3PkUXP4w53DeOShwZx29lMcdNjLTLx7BwC6dl3NqCPn8sKsvvlmfhO1enUHvnPBgaxc2ZmOHev4+Y8n8cRT23D6yVO58Mf7Mm9+bw4/dDbHHvUsl4/bi2987SkmPzSUyQ/twK6ffI0Tjp/Oz67cO+/HyFUVxxM+HbhFUhfgJeAEstqCOySdBLwKHJWOnQgcBswF3kvHtsqmVh2xG9kvbiMlVq7Mvlc7daqjY8cAxC6feoPH/py1GUyetB0jP7vwgzOOP3EWd972UVat2tT+KbQXYuXKzgB06pj9BRMhAujWdTUA3butpqamKwDbDV7GM8/+EwDPPNufkSPm55Lr9qQavSMAImJ6RAyPiF0i4oiIWBoRb0bEARExLCIOjIiadGxExKkRsUNEfDIiprU2/+32/zxJ+0p6uKTz9C2SlPZdKmlW6iR9WUq7QdKXS85/t8H1ugAXAUdLmi7p6A35PBtKhw7B1b+ezG//cA9PP7k1ixZ0Z/m7namryz7qJW90Zct+KwHYYdhSttr6PZ6Y0qqeNVYlHTrU8csr/pfbb7yLp58ZwOw5/bjympH88D8f5DfX/Z4D9n2Z23+3EwAvvdKHvffMGuX33nMe3butpmfP9/PMfr6CrGGu3NKOtbvqiAY+BewELAT+Auwt6XngS8DHU2vlFpVcKCJWSboAGB4RpzXcn7qsjAHYvFOvKmV/w6urE6d/40C6d1/F/7t4CoO2fafR46TgG9+awc8vba7B2DaEuroOfOvsf6F791WMPe/PbLftW3zpC8/z/y7en9lz+vHlI55jzIlPcuU1Ixn/37tz6pgn+Of9/86zz23NG0u6UVenvB8hVx47om1NjYj5AJKmA0OAKcBK4HpJ9wD3VONGETEeGA/Qu+uAgn+ssHx5F2ZM34pP7PQm3XuspkOHOurqOtBvqxW8uWRzunZbw3ZD3+YnVz4CQJ++K7ngkr9y0ff3cuNcTpYv78Izz/bnM7svYPuhS5k9px8Af35sCJeMfRCAmqXduPgnnwdg881X89mR81i+vEtueW4XCv5/a7utjkhK/86qBTpFxBpgBFkXksOBe9P+NaTnkdQB2OT+Zfbq/T7du68CoEuXWj716deZ92ovZjy9FZ/9fNZ75sCDX2XKX7bhveWdOfaIL3DCsYdywrGH8sKsvg7AOejda2XJZ7aG3XdbxLz5venebTUDt3kbIKVlf5316rmS+q6qx/zrc9z3wA75ZLydaMHLGu1Wey8Jr0NSD6BbREyU9BeyVkyAV4BPA3cAXwQ6N3L6O0DPDZHPPPTdciXfPu8JOnQI1AEefXgQU6cM4B+v9uS7/zmVr570HH+fswWTJg7JO6uW9O2zgnPP/CsdOgQdFDzyl+14f
NogrrxmD/7zu48QdfDO8i78/OqRAOyy8+ucePx0IuDZWVtzzX+NyPkJchZR+EHdFe2s0rq+i5qkfYFzI+LwlP4LYBowiayv3uZkX4SXRcSNkvqn9K5kpeNT03WG8GEXtb7p/M7AjyPi9sby0LvrgBg55Gtt+JRWbbW9u+adBWuhyY+PfbLMW2xl9dxiUHxqnzPLHvfon76z3vdqK+2uJFzfRzgiHgYeLkkvbUxb5+s/Il4H9ixJ+m5KfwXYOa3XAJ+pcpbNLEftvbqhnHYXhM3MKhZAwasjHITNrNiKHYMdhM2s2FwdYWaWo6L3jnAQNrPi8pT3Zmb5yV7WKHYUdhA2s2Kr3lCWuXAQNrNCc0nYzCwvG0GdcHsfwMfMrBnZ2BHllkpJ6phmW74nbQ+V9LikuZJuT+OSI2mztD037R/S2idwEDazYqvuoO5nAs+XbP8EuCIiPgIsBU5K6ScBS1P6Fem4VnEQNrPiiupNbyRpEPAvwHVpW8D+ZMPmAtwIHJHWR6Vt0v4D6mf+aSkHYTMrtspKwv0kTStZxjRypSuB7/Bhf4stgbfSGOYA84GBaX0gMC+7fawBlqXjW8wNc2ZWbJXVNixpbihLSYcDiyPiyTSM7gbjIGxmhaa6qnQU3hv4oqTDyMYq7wVcBWwhqX5Gn0HAgnT8AmAwMF9SJ6A38GZrbuzqCDMrriCrPCi3lLtMxPkRMSgihgDHAA9GxHHAQ0D9LO6jySaOALg7bZP2PxitnCHDQdjMCksEivLLevgucI6kuWR1vten9OuBLVP6OcB5rb2BqyPMrNiq/MZc6aw+EfESjc/ksxL4t2rcz0HYzIrNry2bmeWkvk64wByEzazQqtQ7IjcOwmZWYC1+LbndcRA2s+IKHITNzHJV7NoIB2EzKzYP6m5mlicHYTOznERAbbHrIxyEzazYXBI2M8uRg7CZWU4CaMEccu2Rg7CZFVhAuE7YzCwfgRvmzMxyVfA6YQ/qbmbFVoUp7yUNlvSQpFmSnpN0ZkrvK+l+SXPSzz4pXZLGSZoraYak3VubfQdhMyuwCgJwZSXlNcC3I2JHYE/gVEk7ks2Y8UBEDAMe4MMZNA4FhqVlDHBta5/AQdjMiiuAurryS7nLRCyKiKfS+jvA82TT2o8CbkyH3QgckdZHATdFZgrZhKADWvMIDsJmVmzVKQl/QNIQ4FPA40D/iFiUdr0G9E/rA4F5JafNT2kt5oY5Myuwil9b7idpWsn2+IgY3/AgST2A3wFnRcTbkj68U0RIqnoroIOwmRVXQFTWT3hJRAxv7gBJnckC8C0R8fuU/LqkARGxKFU3LE7pC4DBJacPSmkt5uoIMyu2uii/lKGsyHs98HxE/Lxk193A6LQ+GvhjSfpXUy+JPYFlJdUWLeKSsJkVW3X6Ce8NHA88K2l6SvsecClwh6STgFeBo9K+icBhwFzgPeCE1t7YQdjMiiuiot4P5S8TjwFqYvcBjRwfwKnrfWMchM2s6Ar+xpyDsJkVWBC1tXlnYr04CJtZcXkoSzOznHkoSzOzfAQQLgmbmeUkPKi7mVmuit4wpyh49462IOkNso7ZG6N+wJK8M2EV25g/r+0iYqv1uYCke8l+R+UsiYhD1udebcVBeBMjaVq5d+it/fDntfHz2BFmZjlyEDYzy5GD8KZnnTFUrV3z57WRc52wmVmOXBI2M8uRg7CZWY4chNs5SSHp8pLtcyVdWOacCyWdW+V83CDpy2n9LEndqnn9TYWkd6t8vSGSZqb13SQdVs3rW9tzEG7/3geOlFRJh/QN5SzAQbj92Y1stgcrEAfh9m8NWQv52Q13pFLQg5JmSHpA0raNHPOwpJ9ImirpRUmfS+k7pbTp6fxhpaWqdMw6pW5JZwDbAA9JeqjKz7rJkLRv+mzukvSCpFvSPGdIulTSrPS5XJbSPvhLJG2/2+B6XYCLgKPTZ3r0hnweaz0H4WK4BjhOUu8G6VcDN0bELsAtwLgmzu8UESPISrBjU
9opwFURsRswHJhfSUYiYhywENgvIvZryUPYOj5F9pnsCGwP7C1pS+BLwE7pc/1hJReKiFXABcDtEbFbRNzeNlm2anMQLoCIeBu4CTijwa6RwG/T+s3AZ5u4RP303U8CQ9L634DvSfou2Tv8K6qWYavU1IiYH9mc7dPJPptlwErgeklHkk0iaRsxB+HiuBI4CejeinPfTz9rSSPnRcRvgS8CK4CJkvYnq/oo/TexeWszaxV5v2S9luwvljXACOAu4HDg3rT/g89GUgegywbMp7UhB+GCiIga4A6yQFzvr8Axaf044NFKrydpe+ClVL3wR2AX4HVga0lbStqMLAg05h2gZ8uewCohqQfQOyImkrUD7Jp2vQJ8Oq1/EejcyOn+XArIQbhYLmftYftOB06QNAM4HjizBdc6CpgpaTqwM3BTRKwma9yZCtwPvNDEueOBe90w1yZ6Avekz/Qx4JyU/mvg85KeIauGWt7IuQ8BO7phrlj82rKZWY5cEjYzy5GDsJlZjhyEzcxy5CBsZpYjB2Ezsxw5CFurSKpNXaFmSrpzfUZVazBC23WSdmzm2H0l7dWKe7zS2CBITaU3OKZFI5+1xSh2tvFyELbWWpHGKNgZWEU2FsUHJHVqzUUj4usRMauZQ/YFWhyEzdorB2GrhkeBj6RS6qOS7gZmSeoo6WeSnkgjgp0MoMwvJM2WNBnYuv5CaWSx4Wn9EElPSXomjRI3hCzYn51K4Z+TtJWk36V7PCFp73TulpLuk/ScpOsAlXsISf8j6cl0zpgG+65I6Q9I2iql7SDp3nTOo5I+XpXfpm1SWlVaMauXSryH8uEYB7sDO0fEyymQLYuIz6TXoP8i6T6y0cM+RjZ6WH9gFjChwXW3IntLbJ90rb4RUSPpV8C7EVE/xONvgSsi4rE0lOck4BNko8U9FhEXSfoX1n7duyknpnt0BZ6Q9LuIeJNsvI5pEXG2pAvStU8je3PwlIiYI2kP4JfA/q34NdomzEHYWqtreuUZspLw9WTVBFMj4uWUfhCwS8k4uL2BYcA+wK0RUQsslPRgI9ffE3ik/lpp7IzGHEj2qm79dq80/sI+wJHp3P+VtLSCZzpD0pfS+uCU1zeBOqB+aMjfAL9P99gLuLPk3ptVcA+ztTgIW2utSGMRfyAFo9IxDQScHhGTGhxXzdkfOgB7RsTKRvJSMUn7kgX0kRHxnqSHaXoUuUj3favh78CspVwnbG1pEvBNSZ0BJH1UUnfgEbIZIDpKGgA0Njj8FGAfSUPTuX1TesORwu4jG8iIdNxuafUR4N9T2qFAnzJ57Q0sTQH442Ql8XodgPrS/L+TVXO8Dbws6d/SPSRpV8xayEHY2tJ1ZPW9TymbNum/yP76+gMwJ+27iWyA+bVExBvAGLI//Z/hw+qAPwFfqm+YIxvofnhq+JvFh700fkAWxJ8jq5b4R5m83gt0kvQ8cCnZl0C95cCI9Az7k400B9nwoSel/D0HjKrgd2K2Fo+iZmaWI5eEzcxy5CBsZpYjB2Ezsxw5CJuZ5chB2MwsRw7CZmY5chA2M8vR/wcz8Lly5zlgPQAAAABJRU5ErkJggg==\n", 1837 | "text/plain": [ 1838 | "
" 1839 | ] 1840 | }, 1841 | "metadata": { 1842 | "needs_background": "light" 1843 | }, 1844 | "output_type": "display_data" 1845 | } 1846 | ], 1847 | "source": [ 1848 | "from training import train_feature_union\n", 1849 | "fu_classifier = train_feature_union(train_comments, train_labels)\n", 1850 | "predictions = fu_classifier.predict(test_comments)\n", 1851 | "print(\"\\nFeature Union Model Result\\n\")\n", 1852 | "predictions_report(predictions, test_labels)\n", 1853 | "plot_confusion_matrix(fu_classifier, test_comments, test_labels, display_labels=[NO_INSULT, INSULT])\n", 1854 | "plt.show()" 1855 | ] 1856 | }, 1857 | { 1858 | "cell_type": "markdown", 1859 | "metadata": {}, 1860 | "source": [ 1861 | "# Word Embeddings" 1862 | ] 1863 | }, 1864 | { 1865 | "cell_type": "markdown", 1866 | "metadata": {}, 1867 | "source": [ 1868 | "Word Embedding is the collective name for feature learning techniques in which words from the vocabulary are mapped to vectors of real numbers. These vectors are calculated from the probability distribution of each word appearing before or after another. To put it another way, words that share a context usually appear together in the corpus, so they end up close to each other in the vector space as well." 
1869 | ] 1870 | }, 1871 | { 1872 | "cell_type": "code", 1873 | "execution_count": 29, 1874 | "metadata": {}, 1875 | "outputs": [], 1876 | "source": [ 1877 | "import gensim\n", 1878 | "from feature_extraction import word2vec_model, tokenize_document\n", 1879 | "raw_tokenized_corpus = [tokenize_document(comment) for comment in raw_train_comments]\n", 1880 | "dim = 300\n", 1881 | "max_len = 15\n", 1882 | "raw_w2v_model = word2vec_model(raw_tokenized_corpus, n_dim=dim)" 1883 | ] 1884 | }, 1885 | { 1886 | "cell_type": "code", 1887 | "execution_count": 30, 1888 | "metadata": {}, 1889 | "outputs": [ 1890 | { 1891 | "name": "stdout", 1892 | "output_type": "stream", 1893 | "text": [ 1894 | "0.99965227\n", 1895 | "[('being', 0.9999071955680847), ('too', 0.9999014735221863), ('someone', 0.9999005794525146), ('....', 0.9999004006385803), ('stop', 0.9998997449874878), ('dont', 0.9998990893363953), ('mean', 0.9998983144760132), ('little', 0.9998980760574341), ('should', 0.9998977184295654), ('make', 0.9998974800109863), ('still', 0.9998961687088013), ('bitch', 0.9998961687088013), ('point', 0.9998945593833923), ('ass', 0.9998944997787476), ('any', 0.999894380569458), ('yourself', 0.9998937845230103), ('going', 0.9998934268951416), ('better', 0.9998930096626282), ('give', 0.9998929500579834), ('never', 0.9998925924301147)]\n" 1896 | ] 1897 | } 1898 | ], 1899 | "source": [ 1900 | "print(raw_w2v_model.wv.similarity('retarded', 'loser'))\n", 1901 | "print(raw_w2v_model.wv.most_similar('retarded', topn=20))" 1902 | ] 1903 | }, 1904 | { 1905 | "cell_type": "code", 1906 | "execution_count": 31, 1907 | "metadata": {}, 1908 | "outputs": [ 1909 | { 1910 | "data": { 1911 | "text/plain": [ 1912 | "[('know', 0.999808669090271),\n", 1913 | " ('are', 0.9997797012329102),\n", 1914 | " ('don', 0.9997004270553589),\n", 1915 | " ('ing', 0.9995821118354797),\n", 1916 | " ('like', 0.9995772838592529),\n", 1917 | " ('your', 0.9995105266571045),\n", 1918 | " ('what', 0.9994930624961853),\n", 1919 | " 
('just', 0.9994739890098572),\n", 1920 | " ('understand', 0.9993782043457031),\n", 1921 | " ('can', 0.9993021488189697),\n", 1922 | " ('how', 0.9992895722389221),\n", 1923 | " ('think', 0.9992784261703491),\n", 1924 | " ('people', 0.9992203712463379),\n", 1925 | " ('and', 0.9992198944091797),\n", 1926 | " ('because', 0.9992059469223022),\n", 1927 | " ('get', 0.9992001056671143),\n", 1928 | " ('you', 0.9991925954818726),\n", 1929 | " ('really', 0.9991372227668762),\n", 1930 | " ('they', 0.9991272687911987),\n", 1931 | " ('that', 0.9991147518157959)]" 1932 | ] 1933 | }, 1934 | "execution_count": 31, 1935 | "metadata": {}, 1936 | "output_type": "execute_result" 1937 | } 1938 | ], 1939 | "source": [ 1940 | "tokenized_corpus = [tokenize_document(comment) for comment in train_comments]\n", 1941 | "test_tokenized_corpus = [tokenize_document(comment) for comment in test_comments]\n", 1942 | "w2v_model = word2vec_model(tokenized_corpus, n_dim=dim)\n", 1943 | "w2v_model.wv.most_similar('fakeinsult', topn=20)" 1944 | ] 1945 | }, 1946 | { 1947 | "cell_type": "code", 1948 | "execution_count": 32, 1949 | "metadata": {}, 1950 | "outputs": [], 1951 | "source": [ 1952 | "# Repeat the process for test corpus\n", 1953 | "test_raw_tokenized_corpus = [tokenize_document(comment) for comment in raw_test_comments]" 1954 | ] 1955 | }, 1956 | { 1957 | "cell_type": "markdown", 1958 | "metadata": {}, 1959 | "source": [ 1960 | "## Word2Vec based classifier with Keras" 1961 | ] 1962 | }, 1963 | { 1964 | "cell_type": "code", 1965 | "execution_count": 42, 1966 | "metadata": {}, 1967 | "outputs": [], 1968 | "source": [ 1969 | "## tokenize text for Keras network\n", 1970 | "import tensorflow as tf\n", 1971 | "ktokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True, split=' ', \n", 1972 | " oov_token=\"NaN\", \n", 1973 | " filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n')\n", 1974 | "ktokenizer.fit_on_texts(raw_tokenized_corpus)\n", 1975 | "dic_vocabulary = ktokenizer.word_index\n", 1976 | 
"## create sequence\n", 1977 | "lst_text2seq= ktokenizer.texts_to_sequences(raw_tokenized_corpus)\n", 1978 | "## padding sequence\n", 1979 | "X_train = tf.keras.preprocessing.sequence.pad_sequences(lst_text2seq, \n", 1980 | " maxlen=max_len)" 1981 | ] 1982 | }, 1983 | { 1984 | "cell_type": "code", 1985 | "execution_count": 43, 1986 | "metadata": {}, 1987 | "outputs": [], 1988 | "source": [ 1989 | "## text to sequence with the fitted tokenizer\n", 1990 | "lst_text2seq = ktokenizer.texts_to_sequences(test_raw_tokenized_corpus)\n", 1991 | "\n", 1992 | "## padding sequence\n", 1993 | "X_test = tf.keras.preprocessing.sequence.pad_sequences(lst_text2seq, maxlen=max_len,\n", 1994 | " padding=\"post\", truncating=\"post\")" 1995 | ] 1996 | }, 1997 | { 1998 | "cell_type": "code", 1999 | "execution_count": 44, 2000 | "metadata": {}, 2001 | "outputs": [], 2002 | "source": [ 2003 | "# Prepare the matrix of embeddings for the embeddings layer\n", 2004 | "embeddings = np.zeros((len(dic_vocabulary)+1, dim))\n", 2005 | "for word,idx in dic_vocabulary.items():\n", 2006 | " ## update the row with vector\n", 2007 | " try:\n", 2008 | " embeddings[idx] = raw_w2v_model.wv[word]\n", 2009 | " ## if word not in model then skip and the row stays all 0s\n", 2010 | " except:\n", 2011 | " pass" 2012 | ] 2013 | }, 2014 | { 2015 | "cell_type": "code", 2016 | "execution_count": 45, 2017 | "metadata": {}, 2018 | "outputs": [ 2019 | { 2020 | "name": "stdout", 2021 | "output_type": "stream", 2022 | "text": [ 2023 | "Model: \"functional_7\"\n", 2024 | "_________________________________________________________________\n", 2025 | "Layer (type) Output Shape Param # \n", 2026 | "=================================================================\n", 2027 | "input_4 (InputLayer) [(None, 15)] 0 \n", 2028 | "_________________________________________________________________\n", 2029 | "embedding_3 (Embedding) (None, 15, 300) 4623000 \n", 2030 | 
"_________________________________________________________________\n", 2031 | "bidirectional_6 (Bidirection (None, 15, 30) 37920 \n", 2032 | "_________________________________________________________________\n", 2033 | "bidirectional_7 (Bidirection (None, 30) 5520 \n", 2034 | "_________________________________________________________________\n", 2035 | "dense_6 (Dense) (None, 64) 1984 \n", 2036 | "_________________________________________________________________\n", 2037 | "dense_7 (Dense) (None, 1) 65 \n", 2038 | "=================================================================\n", 2039 | "Total params: 4,668,489\n", 2040 | "Trainable params: 45,489\n", 2041 | "Non-trainable params: 4,623,000\n", 2042 | "_________________________________________________________________\n" 2043 | ] 2044 | } 2045 | ], 2046 | "source": [ 2047 | "# Build the Network\n", 2048 | "from keras import layers, models\n", 2049 | "import keras\n", 2050 | "\n", 2051 | "x_in = layers.Input(shape=(max_len,))\n", 2052 | "\n", 2053 | "#Embedding\n", 2054 | "x = layers.Embedding(input_dim=embeddings.shape[0], \n", 2055 | " output_dim=embeddings.shape[1], \n", 2056 | " weights=[embeddings],\n", 2057 | " input_length=max_len, trainable=False)(x_in)\n", 2058 | "\n", 2059 | "## LSTM\n", 2060 | "x = layers.Bidirectional(layers.LSTM(max_len, dropout=0.2, return_sequences=True))(x)\n", 2061 | "x = layers.Bidirectional(layers.LSTM(max_len, dropout=0.2))(x)\n", 2062 | "\n", 2063 | "x = layers.Dense(64, activation='relu')(x)\n", 2064 | "y_out = layers.Dense(1, activation='sigmoid')(x)\n", 2065 | "\n", 2066 | "## compile\n", 2067 | "model = models.Model(x_in, y_out) \n", 2068 | "model.compile(loss='binary_crossentropy',\n", 2069 | " optimizer=keras.optimizers.Adam(), metrics=['accuracy'])\n", 2070 | "\n", 2071 | "model.summary()" 2072 | ] 2073 | }, 2074 | { 2075 | "cell_type": "code", 2076 | "execution_count": 50, 2077 | "metadata": {}, 2078 | "outputs": [ 2079 | { 2080 | "name": "stdout", 2081 | 
"output_type": "stream", 2082 | "text": [ 2083 | "Epoch 1/20\n", 2084 | "44/44 [==============================] - 1s 23ms/step - loss: 0.9121 - accuracy: 0.6535 - val_loss: 0.6366 - val_accuracy: 0.6489\n", 2085 | "Epoch 2/20\n", 2086 | "44/44 [==============================] - 1s 21ms/step - loss: 0.9023 - accuracy: 0.6427 - val_loss: 0.6379 - val_accuracy: 0.6456\n", 2087 | "Epoch 3/20\n", 2088 | "44/44 [==============================] - 1s 21ms/step - loss: 0.9087 - accuracy: 0.6343 - val_loss: 0.5720 - val_accuracy: 0.7409\n", 2089 | "Epoch 4/20\n", 2090 | "44/44 [==============================] - 1s 21ms/step - loss: 0.8930 - accuracy: 0.6694 - val_loss: 0.5467 - val_accuracy: 0.7527\n", 2091 | "Epoch 5/20\n", 2092 | "44/44 [==============================] - 1s 21ms/step - loss: 0.9170 - accuracy: 0.6318 - val_loss: 0.6680 - val_accuracy: 0.5899\n", 2093 | "Epoch 6/20\n", 2094 | "44/44 [==============================] - 1s 21ms/step - loss: 0.8811 - accuracy: 0.6861 - val_loss: 0.5655 - val_accuracy: 0.7283\n", 2095 | "Epoch 7/20\n", 2096 | "44/44 [==============================] - 1s 20ms/step - loss: 0.8848 - accuracy: 0.6785 - val_loss: 0.6472 - val_accuracy: 0.6346\n", 2097 | "Epoch 8/20\n", 2098 | "44/44 [==============================] - 1s 21ms/step - loss: 0.8923 - accuracy: 0.6539 - val_loss: 0.6337 - val_accuracy: 0.6743\n", 2099 | "Epoch 9/20\n", 2100 | "44/44 [==============================] - 1s 20ms/step - loss: 0.8711 - accuracy: 0.6850 - val_loss: 0.5802 - val_accuracy: 0.7181\n" 2101 | ] 2102 | } 2103 | ], 2104 | "source": [ 2105 | "# Train the model\n", 2106 | "callbacks = [tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=3)]\n", 2107 | "training = model.fit(x=X_train, y=train_labels, batch_size=64, \n", 2108 | " epochs=20, validation_split=0.3, class_weight={0: 1., 1: 2.5}, callbacks=callbacks)\n" 2109 | ] 2110 | }, 2111 | { 2112 | "cell_type": "code", 2113 | "execution_count": 51, 2114 | "metadata": {}, 2115 | "outputs": [ 2116 
| { 2117 | "name": "stdout", 2118 | "output_type": "stream", 2119 | "text": [ 2120 | "Accuracy: 0.7211938043067624\n", 2121 | "\n", 2122 | " precision recall f1-score support\n", 2123 | "\n", 2124 | " NoInsult 0.78 0.86 0.82 1954\n", 2125 | " Insult 0.45 0.32 0.38 693\n", 2126 | "\n", 2127 | " accuracy 0.72 2647\n", 2128 | " macro avg 0.62 0.59 0.60 2647\n", 2129 | "weighted avg 0.70 0.72 0.70 2647\n", 2130 | "\n" 2131 | ] 2132 | } 2133 | ], 2134 | "source": [ 2135 | "# Evaluate Model\n", 2136 | "predictions = model.predict(X_test)\n", 2137 | "predictions_report_proba(predictions, test_labels)" 2138 | ] 2139 | }, 2140 | { 2141 | "cell_type": "markdown", 2142 | "metadata": {}, 2143 | "source": [ 2144 | "## Same Classifier but replacing insults with 'fakeinsult'" 2145 | ] 2146 | }, 2147 | { 2148 | "cell_type": "code", 2149 | "execution_count": 41, 2150 | "metadata": {}, 2151 | "outputs": [ 2152 | { 2153 | "name": "stdout", 2154 | "output_type": "stream", 2155 | "text": [ 2156 | "Model: \"functional_5\"\n", 2157 | "_________________________________________________________________\n", 2158 | "Layer (type) Output Shape Param # \n", 2159 | "=================================================================\n", 2160 | "input_3 (InputLayer) [(None, 15)] 0 \n", 2161 | "_________________________________________________________________\n", 2162 | "embedding_2 (Embedding) (None, 15, 300) 4502700 \n", 2163 | "_________________________________________________________________\n", 2164 | "bidirectional_4 (Bidirection (None, 15, 30) 37920 \n", 2165 | "_________________________________________________________________\n", 2166 | "bidirectional_5 (Bidirection (None, 30) 5520 \n", 2167 | "_________________________________________________________________\n", 2168 | "dense_4 (Dense) (None, 64) 1984 \n", 2169 | "_________________________________________________________________\n", 2170 | "dense_5 (Dense) (None, 1) 65 \n", 2171 | 
"=================================================================\n", 2172 | "Total params: 4,548,189\n", 2173 | "Trainable params: 45,489\n", 2174 | "Non-trainable params: 4,502,700\n", 2175 | "_________________________________________________________________\n", 2176 | "Epoch 1/20\n", 2177 | "44/44 [==============================] - 2s 46ms/step - loss: 0.9595 - accuracy: 0.6379 - val_loss: 0.6786 - val_accuracy: 0.6363\n", 2178 | "Epoch 2/20\n", 2179 | "44/44 [==============================] - 1s 20ms/step - loss: 0.9517 - accuracy: 0.6531 - val_loss: 0.6805 - val_accuracy: 0.5738\n", 2180 | "Epoch 3/20\n", 2181 | "44/44 [==============================] - 1s 21ms/step - loss: 0.9407 - accuracy: 0.6423 - val_loss: 0.6362 - val_accuracy: 0.6776\n", 2182 | "Epoch 4/20\n", 2183 | "44/44 [==============================] - 1s 20ms/step - loss: 0.9222 - accuracy: 0.6633 - val_loss: 0.6047 - val_accuracy: 0.7004\n", 2184 | "Epoch 5/20\n", 2185 | "44/44 [==============================] - 1s 20ms/step - loss: 0.8981 - accuracy: 0.6629 - val_loss: 0.5712 - val_accuracy: 0.7409\n", 2186 | "Epoch 6/20\n", 2187 | "44/44 [==============================] - 1s 19ms/step - loss: 0.8870 - accuracy: 0.6908 - val_loss: 0.7054 - val_accuracy: 0.5688\n", 2188 | "Epoch 7/20\n", 2189 | "44/44 [==============================] - 1s 19ms/step - loss: 0.8395 - accuracy: 0.7085 - val_loss: 0.5545 - val_accuracy: 0.7435\n", 2190 | "Epoch 8/20\n", 2191 | "44/44 [==============================] - 1s 23ms/step - loss: 0.8442 - accuracy: 0.6970 - val_loss: 0.5112 - val_accuracy: 0.7654\n", 2192 | "Epoch 9/20\n", 2193 | "44/44 [==============================] - 1s 24ms/step - loss: 0.8240 - accuracy: 0.7205 - val_loss: 0.5586 - val_accuracy: 0.7215\n", 2194 | "Epoch 10/20\n", 2195 | "44/44 [==============================] - 1s 23ms/step - loss: 0.8259 - accuracy: 0.7111 - val_loss: 0.5229 - val_accuracy: 0.7376\n", 2196 | "Epoch 11/20\n", 2197 | "44/44 [==============================] - 1s 
22ms/step - loss: 0.7995 - accuracy: 0.7183 - val_loss: 0.5130 - val_accuracy: 0.7527\n", 2198 | "Epoch 12/20\n", 2199 | "44/44 [==============================] - 1s 21ms/step - loss: 0.7961 - accuracy: 0.7104 - val_loss: 0.5019 - val_accuracy: 0.7519\n", 2200 | "Accuracy: 0.7627502833396298\n", 2201 | "\n", 2202 | " precision recall f1-score support\n", 2203 | "\n", 2204 | " NoInsult 0.80 0.90 0.85 1954\n", 2205 | " Insult 0.57 0.37 0.45 693\n", 2206 | "\n", 2207 | " accuracy 0.76 2647\n", 2208 | " macro avg 0.69 0.64 0.65 2647\n", 2209 | "weighted avg 0.74 0.76 0.74 2647\n", 2210 | "\n" 2211 | ] 2212 | } 2213 | ], 2214 | "source": [ 2215 | "ktokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True, split=' ', \n", 2216 | " oov_token=\"NaN\", \n", 2217 | " filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n')\n", 2218 | "ktokenizer.fit_on_texts(tokenized_corpus)\n", 2219 | "dic_vocabulary = ktokenizer.word_index\n", 2220 | "## create sequence\n", 2221 | "lst_text2seq= ktokenizer.texts_to_sequences(tokenized_corpus)\n", 2222 | "## padding sequence\n", 2223 | "X_train = tf.keras.preprocessing.sequence.pad_sequences(lst_text2seq, \n", 2224 | " maxlen=max_len)\n", 2225 | "\n", 2226 | "## text to sequence with the fitted tokenizer\n", 2227 | "lst_text2seq = ktokenizer.texts_to_sequences(test_tokenized_corpus)\n", 2228 | "\n", 2229 | "## padding sequence\n", 2230 | "X_test = tf.keras.preprocessing.sequence.pad_sequences(lst_text2seq, maxlen=max_len,\n", 2231 | " padding=\"post\", truncating=\"post\")\n", 2232 | "\n", 2233 | "# Prepare the matrix of embeddings for the embeddings layer\n", 2234 | "embeddings = np.zeros((len(dic_vocabulary)+1, dim))\n", 2235 | "for word,idx in dic_vocabulary.items():\n", 2236 | " ## update the row with vector\n", 2237 | " try:\n", 2238 | " embeddings[idx] = w2v_model.wv[word]\n", 2239 | " ## if word not in model then skip and the row stays all 0s\n", 2240 | " except:\n", 2241 | " pass\n", 2242 | " \n", 2243 | "x_in = 
layers.Input(shape=(max_len,))\n", 2244 | "\n", 2245 | "#Embedding\n", 2246 | "x = layers.Embedding(input_dim=embeddings.shape[0], \n", 2247 | " output_dim=embeddings.shape[1], \n", 2248 | " weights=[embeddings],\n", 2249 | " input_length=max_len, trainable=False)(x_in)\n", 2250 | "\n", 2251 | "## LSTM\n", 2252 | "x = layers.Bidirectional(layers.LSTM(max_len, dropout=0.2, return_sequences=True))(x)\n", 2253 | "x = layers.Bidirectional(layers.LSTM(max_len, dropout=0.2))(x)\n", 2254 | "\n", 2255 | "x = layers.Dense(64, activation='relu')(x)\n", 2256 | "y_out = layers.Dense(1, activation='sigmoid')(x)\n", 2257 | "\n", 2258 | "## compile\n", 2259 | "model = models.Model(x_in, y_out) \n", 2260 | "model.compile(loss='binary_crossentropy',\n", 2261 | " optimizer=keras.optimizers.Adam(), metrics=['accuracy'])\n", 2262 | "\n", 2263 | "model.summary()\n", 2264 | "\n", 2265 | "callbacks = [tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=3)]\n", 2266 | "training = model.fit(x=X_train, y=train_labels, batch_size=64, \n", 2267 | " epochs=20, validation_split=0.3, class_weight={0: 1., 1: 2.5}, callbacks=callbacks)\n", 2268 | "\n", 2269 | "# Evaluate Model\n", 2270 | "predictions = model.predict(X_test)\n", 2271 | "predictions_report_proba(predictions, test_labels)" 2272 | ] 2273 | }, 2274 | { 2275 | "cell_type": "markdown", 2276 | "metadata": {}, 2277 | "source": [ 2278 | "# Language Models. BERT\n", 2279 | "\n", 2280 | "In the field of computer vision, researchers have repeatedly shown the value of transfer learning — pre-training a neural network model on a known task, for instance ImageNet, and then performing fine-tuning — using the trained neural network as the basis of a new purpose-specific model. 
In recent years, researchers have shown that a similar technique can be useful in many natural language tasks.\n", 2281 | "\n", 2282 | "BERT makes use of the Transformer, an attention-based architecture that learns contextual relations between words (or sub-words) in a text. In its vanilla form, the Transformer includes two separate mechanisms — an encoder that reads the text input and a decoder that produces a prediction for the task. Since BERT’s goal is to generate a language model, only the encoder mechanism is necessary.\n", 2283 | "\n", 2284 | "As opposed to directional models, which read the text input sequentially (left-to-right or right-to-left), the Transformer encoder reads the entire sequence of words at once. Therefore it is considered bidirectional, though it would be more accurate to say that it’s non-directional. This characteristic allows the model to learn the context of a word based on all of its surroundings (left and right of the word)." 2285 | ] 2286 | }, 2287 | { 2288 | "cell_type": "code", 2289 | "execution_count": 119, 2290 | "metadata": {}, 2291 | "outputs": [ 2292 | { 2293 | "name": "stderr", 2294 | "output_type": "stream", 2295 | "text": [ 2296 | "/Users/rharo/.virtualenvs/trolling_detection/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:1767: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 
512 for Bert).\n", 2297 | " warnings.warn(\n" 2298 | ] 2299 | } 2300 | ], 2301 | "source": [ 2302 | "import transformers\n", 2303 | "## bert tokenizer\n", 2304 | "tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)\n", 2305 | "maxlen = 50\n", 2306 | "\n", 2307 | "tokens_train = tokenizer.batch_encode_plus(\n", 2308 | " train_comments,\n", 2309 | " add_special_tokens = True, # add [CLS], [SEP]\n", 2310 | " max_length = maxlen,\n", 2311 | " pad_to_max_length=True, # add [PAD] tokens\n", 2312 | " truncation=True,\n", 2313 | " return_attention_mask = True, # add attention mask to not focus on pad tokens\n", 2314 | ")\n", 2315 | "\n", 2316 | "## feature matrix\n", 2317 | "X_train = [np.asarray(tokens_train['input_ids'], dtype='int32'), \n", 2318 | " np.asarray(tokens_train['attention_mask'], dtype='int32'), \n", 2319 | " np.asarray(tokens_train['token_type_ids'], dtype='int32')]\n", 2320 | "\n", 2321 | "tokens_test = tokenizer.batch_encode_plus(\n", 2322 | " test_comments,\n", 2323 | " add_special_tokens = True, # add [CLS], [SEP]\n", 2324 | " max_length = maxlen,\n", 2325 | " pad_to_max_length=True, # add [PAD] tokens\n", 2326 | " truncation=True,\n", 2327 | " return_attention_mask = True, # add attention mask to not focus on pad tokens\n", 2328 | ")\n", 2329 | "\n", 2330 | "## feature matrix\n", 2331 | "X_test = [np.asarray(tokens_test['input_ids'], dtype='int32'), \n", 2332 | " np.asarray(tokens_test['attention_mask'], dtype='int32'), \n", 2333 | " np.asarray(tokens_test['token_type_ids'], dtype='int32')]" 2334 | ] 2335 | }, 2336 | { 2337 | "cell_type": "code", 2338 | "execution_count": 121, 2339 | "metadata": {}, 2340 | "outputs": [ 2341 | { 2342 | "name": "stderr", 2343 | "output_type": "stream", 2344 | "text": [ 2345 | "Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']\n", 2346 | "- This IS expected if you are initializing TFBertModel from 
the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n", 2347 | "- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", 2348 | "All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.\n", 2349 | "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.\n" 2350 | ] 2351 | }, 2352 | { 2353 | "name": "stdout", 2354 | "output_type": "stream", 2355 | "text": [ 2356 | "Model: \"functional_13\"\n", 2357 | "__________________________________________________________________________________________________\n", 2358 | "Layer (type) Output Shape Param # Connected to \n", 2359 | "==================================================================================================\n", 2360 | "input_idx (InputLayer) [(None, 50)] 0 \n", 2361 | "__________________________________________________________________________________________________\n", 2362 | "input_masks (InputLayer) [(None, 50)] 0 \n", 2363 | "__________________________________________________________________________________________________\n", 2364 | "input_segments (InputLayer) [(None, 50)] 0 \n", 2365 | "__________________________________________________________________________________________________\n", 2366 | "tf_bert_model_3 (TFBertModel) ((None, 50, 768), (N 109482240 input_idx[0][0] \n", 2367 | " input_masks[0][0] \n", 2368 | " input_segments[0][0] \n", 2369 | "__________________________________________________________________________________________________\n", 2370 | "bidirectional_8 (Bidirectional) (None, 50, 128) 426496 tf_bert_model_3[0][0] \n", 2371 | 
"__________________________________________________________________________________________________\n", 2372 | "global_average_pooling1d_3 (Glo (None, 128) 0 bidirectional_8[0][0] \n", 2373 | "__________________________________________________________________________________________________\n", 2374 | "global_max_pooling1d (GlobalMax (None, 128) 0 bidirectional_8[0][0] \n", 2375 | "__________________________________________________________________________________________________\n", 2376 | "concatenate (Concatenate) (None, 256) 0 global_average_pooling1d_3[0][0] \n", 2377 | " global_max_pooling1d[0][0] \n", 2378 | "__________________________________________________________________________________________________\n", 2379 | "dropout_148 (Dropout) (None, 256) 0 concatenate[0][0] \n", 2380 | "__________________________________________________________________________________________________\n", 2381 | "dense_13 (Dense) (None, 1) 257 dropout_148[0][0] \n", 2382 | "==================================================================================================\n", 2383 | "Total params: 109,908,993\n", 2384 | "Trainable params: 426,753\n", 2385 | "Non-trainable params: 109,482,240\n", 2386 | "__________________________________________________________________________________________________\n" 2387 | ] 2388 | } 2389 | ], 2390 | "source": [ 2391 | "## inputs\n", 2392 | "idx = layers.Input((50), dtype=\"int32\", name=\"input_idx\")\n", 2393 | "masks = layers.Input((50), dtype=\"int32\", name=\"input_masks\")\n", 2394 | "segments = layers.Input((50), dtype=\"int32\", name=\"input_segments\")\n", 2395 | "\n", 2396 | "## pre-trained bert\n", 2397 | "bert_model = transformers.TFBertModel.from_pretrained(\"bert-base-uncased\")\n", 2398 | "bert_model.trainable = False\n", 2399 | "bert_out, _ = bert_model([idx, masks, segments])\n", 2400 | "\n", 2401 | "## fine-tuning\n", 2402 | "# Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.\n", 
2403 | "bi_lstm = tf.keras.layers.Bidirectional(\n", 2404 | " tf.keras.layers.LSTM(64, return_sequences=True)\n", 2405 | " )(bert_out)\n", 2406 | "# Applying hybrid pooling approach to bi_lstm sequence output.\n", 2407 | "avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)\n", 2408 | "max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)\n", 2409 | "concat = tf.keras.layers.concatenate([avg_pool, max_pool])\n", 2410 | "dropout = tf.keras.layers.Dropout(0.3)(concat)\n", 2411 | "#x = layers.GlobalAveragePooling1D()(bert_out)\n", 2412 | "#x = layers.Dense(64, activation=\"relu\")(x)\n", 2413 | "y_out = layers.Dense(1, activation='sigmoid')(dropout)\n", 2414 | "## compile\n", 2415 | "model = models.Model([idx, masks, segments], y_out)\n", 2416 | "for layer in model.layers[:4]:\n", 2417 | " layer.trainable = False\n", 2418 | "model.compile(loss='binary_crossentropy', \n", 2419 | " optimizer='adam', metrics=['accuracy'])\n", 2420 | "model.summary()" 2421 | ] 2422 | }, 2423 | { 2424 | "cell_type": "code", 2425 | "execution_count": 122, 2426 | "metadata": {}, 2427 | "outputs": [ 2428 | { 2429 | "name": "stdout", 2430 | "output_type": "stream", 2431 | "text": [ 2432 | "44/44 [==============================] - 233s 5s/step - loss: 0.6711 - accuracy: 0.7875 - val_loss: 0.4103 - val_accuracy: 0.8101\n" 2433 | ] 2434 | } 2435 | ], 2436 | "source": [ 2437 | "training = model.fit(x=X_train, y=train_labels, batch_size=64, \n", 2438 | " epochs=1, validation_split=0.3, class_weight={0: 1., 1: 2.5})\n" 2439 | ] 2440 | }, 2441 | { 2442 | "cell_type": "code", 2443 | "execution_count": 123, 2444 | "metadata": {}, 2445 | "outputs": [ 2446 | { 2447 | "name": "stdout", 2448 | "output_type": "stream", 2449 | "text": [ 2450 | "Accuracy: 0.8273517189270873\n", 2451 | "\n", 2452 | " precision recall f1-score support\n", 2453 | "\n", 2454 | " NoInsult 0.88 0.89 0.88 1954\n", 2455 | " Insult 0.68 0.66 0.67 693\n", 2456 | "\n", 2457 | " accuracy 0.83 2647\n", 2458 | " macro avg 0.78 
0.77 0.77 2647\n", 2459 | "weighted avg 0.83 0.83 0.83 2647\n", 2460 | "\n" 2461 | ] 2462 | } 2463 | ], 2464 | "source": [ 2465 | "# Evaluate Model\n", 2466 | "predictions = model.predict(X_test)\n", 2467 | "predictions_report_proba(predictions, test_labels)" 2468 | ] 2469 | }, 2470 | { 2471 | "cell_type": "code", 2472 | "execution_count": null, 2473 | "metadata": {}, 2474 | "outputs": [], 2475 | "source": [] 2476 | } 2477 | ], 2478 | "metadata": { 2479 | "kernelspec": { 2480 | "display_name": "Python 3", 2481 | "language": "python", 2482 | "name": "python3" 2483 | }, 2484 | "language_info": { 2485 | "codemirror_mode": { 2486 | "name": "ipython", 2487 | "version": 3 2488 | }, 2489 | "file_extension": ".py", 2490 | "mimetype": "text/x-python", 2491 | "name": "python", 2492 | "nbconvert_exporter": "python", 2493 | "pygments_lexer": "ipython3", 2494 | "version": "3.8.6" 2495 | } 2496 | }, 2497 | "nbformat": 4, 2498 | "nbformat_minor": 1 2499 | } 2500 | --------------------------------------------------------------------------------