├── LICENSE
├── MANIFEST.in
├── README.md
├── example.ipynb
├── outrageclf
│   ├── classifier.py
│   ├── helpers.py
│   ├── model_architect.py
│   └── preprocessing.py
├── setup.py
└── training.py

/LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 2.0 2 | 3 | 4 | 5 | 6 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL 7 | 8 | SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT 9 | 10 | RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. 11 | 12 | CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND 13 | 14 | DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. 15 | 16 | 17 | 18 | 19 | License 20 | 21 | 22 | 23 | 24 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS 25 | 26 | PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR 27 | 28 | OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS 29 | 30 | LICENSE OR COPYRIGHT LAW IS PROHIBITED. 31 | 32 | 33 | 34 | 35 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO 36 | 37 | BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS 38 | 39 | CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. 40 | 41 | 42 | 43 | 44 | 1. Definitions 45 | 46 | 47 | 48 | 49 | a. "Collective Work" means a work, such as a periodical issue, anthology or 50 | 51 | encyclopedia, in which the Work in its entirety in unmodified form, along 52 | 53 | with a number of other contributions, constituting separate and independent 54 | 55 | works in themselves, are assembled into a collective whole. A work that constitutes 56 | 57 | a Collective Work will not be considered a Derivative Work (as defined below) 58 | 59 | for the purposes of this License. 60 | 61 | 62 | 63 | 64 | b. "Derivative Work" means a work based upon the Work or upon the Work and 65 | 66 | other pre-existing works, such as a translation, musical arrangement, dramatization, 67 | 68 | fictionalization, motion picture version, sound recording, art reproduction, 69 | 70 | abridgment, condensation, or any other form in which the Work may be recast, 71 | 72 | transformed, or adapted, except that a work that constitutes a Collective 73 | 74 | Work will not be considered a Derivative Work for the purpose of this License. 75 | 76 | For the avoidance of doubt, where the Work is a musical composition or sound 77 | 78 | recording, the synchronization of the Work in timed-relation with a moving 79 | 80 | image ("synching") will be considered a Derivative Work for the purpose of 81 | 82 | this License. 83 | 84 | 85 | 86 | 87 | c. "Licensor" means the individual or entity that offers the Work under the 88 | 89 | terms of this License. 90 | 91 | 92 | 93 | 94 | d. "Original Author" means the individual or entity who created the Work. 95 | 96 | 97 | 98 | 99 | e. "Work" means the copyrightable work of authorship offered under the terms 100 | 101 | of this License. 102 | 103 | 104 | 105 | 106 | f. "You" means an individual or entity exercising rights under this License 107 | 108 | who has not previously violated the terms of this License with respect to 109 | 110 | the Work, or who has received express permission from the Licensor to exercise 111 | 112 | rights under this License despite a previous violation. 113 | 114 | 115 | 116 | 117 | g. 
"License Elements" means the following high-level license attributes as 118 | 119 | selected by Licensor and indicated in the title of this License: Attribution, 120 | 121 | Noncommercial, ShareAlike. 122 | 123 | 124 | 125 | 126 | 2. Fair Use Rights. Nothing in this license is intended to reduce, limit, 127 | 128 | or restrict any rights arising from fair use, first sale or other limitations 129 | 130 | on the exclusive rights of the copyright owner under copyright law or other 131 | 132 | applicable laws. 133 | 134 | 135 | 136 | 137 | 3. License Grant. Subject to the terms and conditions of this License, Licensor 138 | 139 | hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for 140 | 141 | the duration of the applicable copyright) license to exercise the rights in 142 | 143 | the Work as stated below: 144 | 145 | 146 | 147 | 148 | a. to reproduce the Work, to incorporate the Work into one or more Collective 149 | 150 | Works, and to reproduce the Work as incorporated in the Collective Works; 151 | 152 | 153 | 154 | 155 | b. to create and reproduce Derivative Works; 156 | 157 | 158 | 159 | 160 | c. to distribute copies or phonorecords of, display publicly, perform publicly, 161 | 162 | and perform publicly by means of a digital audio transmission the Work including 163 | 164 | as incorporated in Collective Works; 165 | 166 | 167 | 168 | 169 | d. to distribute copies or phonorecords of, display publicly, perform publicly, 170 | 171 | and perform publicly by means of a digital audio transmission Derivative Works; 172 | 173 | 174 | 175 | 176 | The above rights may be exercised in all media and formats whether now known 177 | 178 | or hereafter devised. The above rights include the right to make such modifications 179 | 180 | as are technically necessary to exercise the rights in other media and formats. 181 | 182 | All rights not expressly granted by Licensor are hereby reserved, including 183 | 184 | but not limited to the rights set forth in Sections 4(e) and 4(f). 185 | 186 | 187 | 188 | 189 | 4. Restrictions. The license granted in Section 3 above is expressly made 190 | 191 | subject to and limited by the following restrictions: 192 | 193 | 194 | 195 | 196 | a. You may distribute, publicly display, publicly perform, or publicly digitally 197 | 198 | perform the Work only under the terms of this License, and You must include 199 | 200 | a copy of, or the Uniform Resource Identifier for, this License with every 201 | 202 | copy or phonorecord of the Work You distribute, publicly display, publicly 203 | 204 | perform, or publicly digitally perform. You may not offer or impose any terms 205 | 206 | on the Work that alter or restrict the terms of this License or the recipients' 207 | 208 | exercise of the rights granted hereunder. You may not sublicense the Work. 209 | 210 | You must keep intact all notices that refer to this License and to the disclaimer 211 | 212 | of warranties. You may not distribute, publicly display, publicly perform, 213 | 214 | or publicly digitally perform the Work with any technological measures that 215 | 216 | control access or use of the Work in a manner inconsistent with the terms 217 | 218 | of this License Agreement. The above applies to the Work as incorporated in 219 | 220 | a Collective Work, but this does not require the Collective Work apart from 221 | 222 | the Work itself to be made subject to the terms of this License. 
If You create 223 | 224 | a Collective Work, upon notice from any Licensor You must, to the extent practicable, 225 | 226 | remove from the Collective Work any reference to such Licensor or the Original 227 | 228 | Author, as requested. If You create a Derivative Work, upon notice from any 229 | 230 | Licensor You must, to the extent practicable, remove from the Derivative Work 231 | 232 | any reference to such Licensor or the Original Author, as requested. 233 | 234 | 235 | 236 | 237 | b. You may distribute, publicly display, publicly perform, or publicly digitally 238 | 239 | perform a Derivative Work only under the terms of this License, a later version 240 | 241 | of this License with the same License Elements as this License, or a Creative 242 | 243 | Commons iCommons license that contains the same License Elements as this License 244 | 245 | (e.g. Attribution-NonCommercial-ShareAlike 2.0 Japan). You must include a 246 | 247 | copy of, or the Uniform Resource Identifier for, this License or other license 248 | 249 | specified in the previous sentence with every copy or phonorecord of each 250 | 251 | Derivative Work You distribute, publicly display, publicly perform, or publicly 252 | 253 | digitally perform. You may not offer or impose any terms on the Derivative 254 | 255 | Works that alter or restrict the terms of this License or the recipients' 256 | 257 | exercise of the rights granted hereunder, and You must keep intact all notices 258 | 259 | that refer to this License and to the disclaimer of warranties. You may not 260 | 261 | distribute, publicly display, publicly perform, or publicly digitally perform 262 | 263 | the Derivative Work with any technological measures that control access or 264 | 265 | use of the Work in a manner inconsistent with the terms of this License Agreement. 266 | 267 | The above applies to the Derivative Work as incorporated in a Collective Work, 268 | 269 | but this does not require the Collective Work apart from the Derivative Work 270 | 271 | itself to be made subject to the terms of this License. 272 | 273 | 274 | 275 | 276 | c. You may not exercise any of the rights granted to You in Section 3 above 277 | 278 | in any manner that is primarily intended for or directed toward commercial 279 | 280 | advantage or private monetary compensation. The exchange of the Work for other 281 | 282 | copyrighted works by means of digital file-sharing or otherwise shall not 283 | 284 | be considered to be intended for or directed toward commercial advantage or 285 | 286 | private monetary compensation, provided there is no payment of any monetary 287 | 288 | compensation in connection with the exchange of copyrighted works. 289 | 290 | 291 | 292 | 293 | d. 
If you distribute, publicly display, publicly perform, or publicly digitally 294 | 295 | perform the Work or any Derivative Works or Collective Works, You must keep 296 | 297 | intact all copyright notices for the Work and give the Original Author credit 298 | 299 | reasonable to the medium or means You are utilizing by conveying the name 300 | 301 | (or pseudonym if applicable) of the Original Author if supplied; the title 302 | 303 | of the Work if supplied; to the extent reasonably practicable, the Uniform 304 | 305 | Resource Identifier, if any, that Licensor specifies to be associated with 306 | 307 | the Work, unless such URI does not refer to the copyright notice or licensing 308 | 309 | information for the Work; and in the case of a Derivative Work, a credit identifying 310 | 311 | the use of the Work in the Derivative Work (e.g., "French translation of the 312 | 313 | Work by Original Author," or "Screenplay based on original Work by Original 314 | 315 | Author"). Such credit may be implemented in any reasonable manner; provided, 316 | 317 | however, that in the case of a Derivative Work or Collective Work, at a minimum 318 | 319 | such credit will appear where any other comparable authorship credit appears 320 | 321 | and in a manner at least as prominent as such other comparable authorship 322 | 323 | credit. 324 | 325 | 326 | 327 | 328 | e. For the avoidance of doubt, where the Work is a musical composition: 329 | 330 | 331 | 332 | 333 | i. Performance Royalties Under Blanket Licenses. Licensor reserves the exclusive 334 | 335 | right to collect, whether individually or via a performance rights society 336 | 337 | (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital 338 | 339 | performance (e.g. webcast) of the Work if that performance is primarily intended 340 | 341 | for or directed toward commercial advantage or private monetary compensation. 342 | 343 | 344 | 345 | 346 | ii. Mechanical Rights and Statutory Royalties. Licensor reserves the exclusive 347 | 348 | right to collect, whether individually or via a music rights agency or designated 349 | 350 | agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from 351 | 352 | the Work ("cover version") and distribute, subject to the compulsory license 353 | 354 | created by 17 USC Section 115 of the US Copyright Act (or the equivalent in 355 | 356 | other jurisdictions), if Your distribution of such cover version is primarily 357 | 358 | intended for or directed toward commercial advantage or private monetary compensation. 359 | 360 | 361 | 362 | 363 | f. Webcasting Rights and Statutory Royalties. For the avoidance of doubt, 364 | 365 | where the Work is a sound recording, Licensor reserves the exclusive right 366 | 367 | to collect, whether individually or via a performance-rights society (e.g. 368 | 369 | SoundExchange), royalties for the public digital performance (e.g. webcast) 370 | 371 | of the Work, subject to the compulsory license created by 17 USC Section 114 372 | 373 | of the US Copyright Act (or the equivalent in other jurisdictions), if Your 374 | 375 | public digital performance is primarily intended for or directed toward commercial 376 | 377 | advantage or private monetary compensation. 378 | 379 | 380 | 381 | 382 | 5. 
Representations, Warranties and Disclaimer 383 | 384 | 385 | 386 | 387 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS 388 | 389 | THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING 390 | 391 | THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, 392 | 393 | WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, 394 | 395 | OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE 396 | 397 | OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE 398 | 399 | EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 400 | 401 | 402 | 403 | 404 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, 405 | 406 | IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, 407 | 408 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS 409 | 410 | LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY 411 | 412 | OF SUCH DAMAGES. 413 | 414 | 415 | 416 | 417 | 7. Termination 418 | 419 | 420 | 421 | 422 | a. This License and the rights granted hereunder will terminate automatically 423 | 424 | upon any breach by You of the terms of this License. Individuals or entities 425 | 426 | who have received Derivative Works or Collective Works from You under this 427 | 428 | License, however, will not have their licenses terminated provided such individuals 429 | 430 | or entities remain in full compliance with those licenses. Sections 1, 2, 431 | 432 | 5, 6, 7, and 8 will survive any termination of this License. 433 | 434 | 435 | 436 | 437 | b. Subject to the above terms and conditions, the license granted here is 438 | 439 | perpetual (for the duration of the applicable copyright in the Work). Notwithstanding 440 | 441 | the above, Licensor reserves the right to release the Work under different 442 | 443 | license terms or to stop distributing the Work at any time; provided, however 444 | 445 | that any such election will not serve to withdraw this License (or any other 446 | 447 | license that has been, or is required to be, granted under the terms of this 448 | 449 | License), and this License will continue in full force and effect unless terminated 450 | 451 | as stated above. 452 | 453 | 454 | 455 | 456 | 8. Miscellaneous 457 | 458 | 459 | 460 | 461 | a. Each time You distribute or publicly digitally perform the Work or a Collective 462 | 463 | Work, the Licensor offers to the recipient a license to the Work on the same 464 | 465 | terms and conditions as the license granted to You under this License. 466 | 467 | 468 | 469 | 470 | b. Each time You distribute or publicly digitally perform a Derivative Work, 471 | 472 | Licensor offers to the recipient a license to the original Work on the same 473 | 474 | terms and conditions as the license granted to You under this License. 475 | 476 | 477 | 478 | 479 | c. If any provision of this License is invalid or unenforceable under applicable 480 | 481 | law, it shall not affect the validity or enforceability of the remainder of 482 | 483 | the terms of this License, and without further action by the parties to this 484 | 485 | agreement, such provision shall be reformed to the minimum extent necessary 486 | 487 | to make such provision valid and enforceable. 488 | 489 | 490 | 491 | 492 | d. 
No term or provision of this License shall be deemed waived and no breach 493 | 494 | consented to unless such waiver or consent shall be in writing and signed 495 | 496 | by the party to be charged with such waiver or consent. 497 | 498 | 499 | 500 | 501 | e. This License constitutes the entire agreement between the parties with 502 | 503 | respect to the Work licensed here. There are no understandings, agreements 504 | 505 | or representations with respect to the Work not specified here. Licensor shall 506 | 507 | not be bound by any additional provisions that may appear in any communication 508 | 509 | from You. This License may not be modified without the mutual written agreement 510 | 511 | of the Licensor and You. 512 | 513 | 514 | 515 | 516 | Creative Commons is not a party to this License, and makes no warranty whatsoever 517 | 518 | in connection with the Work. Creative Commons will not be liable to You or 519 | 520 | any party on any legal theory for any damages whatsoever, including without 521 | 522 | limitation any general, special, incidental or consequential damages arising 523 | 524 | in connection to this license. Notwithstanding the foregoing two (2) sentences, 525 | 526 | if Creative Commons has expressly identified itself as the Licensor hereunder, 527 | 528 | it shall have all rights and obligations of Licensor. 529 | 530 | 531 | 532 | 533 | Except for the limited purpose of indicating to the public that the Work is 534 | 535 | licensed under the CCPL, neither party will use the trademark "Creative Commons" 536 | 537 | or any related trademark or logo of Creative Commons without the prior written 538 | 539 | consent of Creative Commons. Any permitted use will be in compliance with 540 | 541 | Creative Commons' then-current trademark usage guidelines, as may be published 542 | 543 | on its website or otherwise made available upon request from time to time. 544 | 545 | 546 | 547 | 548 | Creative Commons may be contacted at http://creativecommons.org/. 549 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include COPYING 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # *DOC*: Digital Outrage Classifier 2 | 3 | > Developed by members of the Crockett Lab at Yale University in the departments of Psychology and Statistics & Data Science, `DOC` is a Python package that allows researchers to predict the probability that tweets contain moral outrage. 4 | 5 | > The development of the code and materials in this repository is described in detail in the paper "[How social learning amplifies moral outrage expression in online social networks](https://psyarxiv.com/gf7t5)" (2021). 6 | 7 | [![made-with-python][made-with-python]](https://www.python.org/) 8 | [![Outrageclf version][outrage-image]](www.google.com) 9 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](www.google.com) 10 | [![CC NC-SA 2.0](https://img.shields.io/badge/License-CC--NC--SA%202.0-lightgrey)](www.google.com) 11 | 12 | 13 | ## Repository Contributors 14 | * William Brady | Postdoctoral Fellow | Yale University | william.brady@yale.edu | [Website](http://williamjbrady.com) 15 | * Killian McLoughlin | Ph.D. 
Student | Princeton University | k.mcloughlin@princeton.edu | [LinkedIn](www.linkedin.com/in/killian-mc-loughlin-5a151032) 16 | * Tuan Nguyen Doan | Data Scientist | Quora | tuan.nguyen.doan@aya.yale.edu | [LinkedIn](https://www.linkedin.com/in/tuan-nguyen-doan) 17 | 18 | 19 | ## Installation 20 | 21 | The first step is to clone the repo into a local directory on your computer. Using the terminal, navigate to the location where you want to store the package and run the following command: 22 | 23 | ```sh 24 | git clone "https://github.com/CrockettLab/outrage_classifier" 25 | ``` 26 | 27 | Then run the command below. The package is compatible with both Python 2 and Python 3. 28 | ```sh 29 | python setup.py install 30 | ``` 31 | 32 | ## Importing 33 | The package can be imported using the following code: 34 | 35 | ```python 36 | import outrageclf as oclf 37 | from outrageclf.preprocessing import WordEmbed, get_lemmatize_hashtag 38 | from outrageclf.classifier import _load_crockett_model 39 | ``` 40 | 41 | For those using macOS, a runtime error (described [here](https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial)) may prevent the package from being successfully imported. If you experience this issue, setting the environment variable `KMP_DUPLICATE_LIB_OK` to `TRUE` should solve the problem: 42 | 43 | ```python 44 | import os 45 | os.environ['KMP_DUPLICATE_LIB_OK']='True' 46 | ``` 47 | 48 | ## Usage 49 | The current version of `outrageclf` allows users to predict moral outrage using a pre-trained deep gated recurrent unit (GRU) model, as described in detail in [this](www.google.com) article. 50 | 51 | To run the pre-trained model used in the article, you will need **model files that are NOT hosted in this repository**. If you would like access to these files, see 'Accessing Model Files' below. The omitted files are: 52 | 53 | - [x] A pre-trained embedding model, stored in a `.joblib` format 54 | - [x] A pre-trained GRU model, stored in a `.h5` format 55 | 56 | In order to predict the probability that a tweet contains moral outrage, we use the following pipeline: 57 | 58 | ```mermaid 59 | graph LR; A[Load pretrained models] --> B[Preprocess text] --> C[Embed text] --> D[Make prediction] 60 | ``` 61 | 62 | Below is a complete coded example of the pipeline. Note that this example **assumes the presence of either our pretrained-model files or similar files generated by the user** (`embedding_url` and `model_url` are the paths to those files): 63 | 64 | ```python 65 | 66 | tweets = [ 67 | "This topic infuriates me because it violates my moral stance", 68 | "This is just a super-normal topic #normal" 69 | ] 70 | 71 | # loading our pre-trained models 72 | word_embed = WordEmbed() 73 | word_embed._get_pretrained_tokenizer(embedding_url) 74 | model = _load_crockett_model(model_url) 75 | 76 | # the tweets are lemmatized and embedded into 50-d space 77 | lemmatized_text = get_lemmatize_hashtag(tweets) 78 | embedded_vector = word_embed._get_embedded_vector(lemmatized_text) 79 | predict = model.predict(embedded_vector) 80 | ``` 81 | 82 | Alternatively, classifications can be generated using the package's model wrapper function, stored in the `classifier` module. This bypasses the need to lemmatize and embed the text input yourself: 83 | 84 | ```python 85 | from outrageclf.classifier import pretrained_model_predict 86 | pretrained_model_predict(tweets, embedding_url, model_url) 87 | ``` 
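The wrapper returns an array of predicted probabilities, one per tweet (see `example.ipynb` for real output). If you need binary outrage labels rather than probabilities, you can threshold the output yourself. Below is a minimal sketch; the `0.7` cutoff mirrors the `threshold_acc` training metric in `model_architect.py`, but the right cutoff for your application is an assumption you should validate on your own data:

```python
import numpy as np
from outrageclf.classifier import pretrained_model_predict

# embedding_url and model_url are the paths to the model files described above
probs = pretrained_model_predict(tweets, embedding_url, model_url)
labels = (np.asarray(probs).ravel() >= 0.7).astype(int)  # 1 = predicted outrage
```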
88 | ## Accessing Model Files 89 | In order to access the pre-trained model files, please fill out [this form](https://forms.gle/sRDbmtGK1dW6z6ff6). The form will ask for your email and a brief description of your use case. We will then email you the model files. Note that the classifier is for use in academic research only. See the license for more information. 90 | 91 | ## Example Notebook 92 | `example.ipynb` demonstrates both of these use cases. 93 | 94 | ## Citation 95 | Brady, W.J., McLoughlin, K.L., Doan, T.N., & Crockett, M.J. (2021). How social learning amplifies moral outrage expression in online social networks. [PsyArXiv](https://psyarxiv.com/gf7t5). doi: 10.31234/osf.io/gf7t5 96 | 97 | ## License 98 | This work is licensed under a 99 | [Creative Commons Attribution-NonCommercial-ShareAlike 2.0 Generic License][cc-nc-sa]. 100 | 101 | [![CC NC-SA 2.0][cc-nc-sa-image]][cc-nc-sa] 102 | 103 | ## Release History 104 | * 0.1.0 105 | * Initial release 106 | * 0.1.5 107 | * Official release with paper 108 | * 0.1.6 109 | * Hotfix 110 | 111 | 112 | [made-with-python]: https://img.shields.io/badge/Made%20with-Python-FF0000.svg 113 | [outrage-image]: https://img.shields.io/badge/DOC-v0.1.6-orange.svg 114 | [cc-nc-sa]: https://creativecommons.org/licenses/by-nc-sa/2.0/ 115 | [cc-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/2.0/88x31.png 116 | [cc-nc-sa-shield]: https://img.shields.io/badge/License-CC--NC--SA%202.0-lightgrey 117 | -------------------------------------------------------------------------------- /example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "example.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "accelerator": "GPU" 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "metadata": { 19 | "id": "SFOg446dh3ki", 20 | "colab": { 21 | "base_uri": "https://localhost:8080/" 22 | }, 23 | "outputId": "d643edf1-967f-49ae-a27f-1f1830277210" 24 | }, 25 | "source": [ 26 | "%ls" 27 | ], 28 | "execution_count": 1, 29 | "outputs": [ 30 | { 31 | "output_type": "stream", 32 | "text": [ 33 | "\u001b[0m\u001b[01;34mdrive\u001b[0m/ \u001b[01;34msample_data\u001b[0m/\n" 34 | ], 35 | "name": "stdout" 36 | } 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "CT-cXbOmhbqv", 43 | "colab": { 44 | "base_uri": "https://localhost:8080/" 45 | }, 46 | "outputId": "11f9bf87-521b-44f4-dfa7-1d09b269134f" 47 | }, 48 | "source": [ 49 | "%cd \"drive/My Drive/outrageclf/\"" 50 | ], 51 | "execution_count": 2, 52 | "outputs": [ 53 | { 54 | "output_type": "stream", 55 | "text": [ 56 | "/content/drive/My Drive/outrageclf\n" 57 | ], 58 | "name": "stdout" 59 | } 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "EF1t_c4FyRSq", 66 | "colab": { 67 | "base_uri": "https://localhost:8080/" 68 | }, 69 | "outputId": "ac6ffa73-afab-49a9-a2cb-96b67181386b" 70 | }, 71 | "source": [ 72 | "!python3 setup.py install" 73 | ], 74 | "execution_count": 3, 75 | "outputs": [ 76 | { 77 | "output_type": "stream", 78 | "text": [ 79 | "running install\n", 80 | "running bdist_egg\n", 81 | "running egg_info\n", 82 | "writing outrageclf.egg-info/PKG-INFO\n", 83 | "writing dependency_links to outrageclf.egg-info/dependency_links.txt\n", 84 | "writing requirements to outrageclf.egg-info/requires.txt\n", 85 | "writing 
top-level names to outrageclf.egg-info/top_level.txt\n", 86 | "reading manifest file 'outrageclf.egg-info/SOURCES.txt'\n", 87 | "reading manifest template 'MANIFEST.in'\n", 88 | "warning: no files found matching 'COPYING'\n", 89 | "writing manifest file 'outrageclf.egg-info/SOURCES.txt'\n", 90 | "installing library code to build/bdist.linux-x86_64/egg\n", 91 | "running install_lib\n", 92 | "running build_py\n", 93 | "copying outrageclf/helpers.py -> build/lib/outrageclf\n", 94 | "creating build/bdist.linux-x86_64/egg\n", 95 | "creating build/bdist.linux-x86_64/egg/outrageclf\n", 96 | "copying build/lib/outrageclf/__init__.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 97 | "copying build/lib/outrageclf/helpers.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 98 | "copying build/lib/outrageclf/model_architect.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 99 | "copying build/lib/outrageclf/classifier.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 100 | "copying build/lib/outrageclf/preprocessing.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 101 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/__init__.py to __init__.cpython-36.pyc\n", 102 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/helpers.py to helpers.cpython-36.pyc\n", 103 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/model_architect.py to model_architect.cpython-36.pyc\n", 104 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/classifier.py to classifier.cpython-36.pyc\n", 105 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/preprocessing.py to preprocessing.cpython-36.pyc\n", 106 | "creating build/bdist.linux-x86_64/egg/EGG-INFO\n", 107 | "copying outrageclf.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 108 | "copying outrageclf.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 109 | "copying outrageclf.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 110 | "copying outrageclf.egg-info/not-zip-safe -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 111 | "copying outrageclf.egg-info/requires.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 112 | "copying outrageclf.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 113 | "creating 'dist/outrageclf-0.1.5-py3.6.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n", 114 | "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n", 115 | "Processing outrageclf-0.1.5-py3.6.egg\n", 116 | "creating /usr/local/lib/python3.6/dist-packages/outrageclf-0.1.5-py3.6.egg\n", 117 | "Extracting outrageclf-0.1.5-py3.6.egg to /usr/local/lib/python3.6/dist-packages\n", 118 | "Adding outrageclf 0.1.5 to easy-install.pth file\n", 119 | "\n", 120 | "Installed /usr/local/lib/python3.6/dist-packages/outrageclf-0.1.5-py3.6.egg\n", 121 | "Processing dependencies for outrageclf==0.1.5\n", 122 | "Searching for tensorflow==2.3.0\n", 123 | "Best match: tensorflow 2.3.0\n", 124 | "Adding tensorflow 2.3.0 to easy-install.pth file\n", 125 | "Installing estimator_ckpt_converter script to /usr/local/bin\n", 126 | "Installing saved_model_cli script to /usr/local/bin\n", 127 | "Installing tensorboard script to /usr/local/bin\n", 128 | "Installing tf_upgrade_v2 script to /usr/local/bin\n", 129 | "Installing tflite_convert script to /usr/local/bin\n", 130 | "Installing toco script to /usr/local/bin\n", 131 | "Installing toco_from_protos script to /usr/local/bin\n", 132 | "\n", 133 | "Using /usr/local/lib/python3.6/dist-packages\n", 134 | "Searching for sklearn==0.0\n", 135 | 
"Best match: sklearn 0.0\n", 136 | "Adding sklearn 0.0 to easy-install.pth file\n", 137 | "\n", 138 | "Using /usr/local/lib/python3.6/dist-packages\n", 139 | "Searching for numpy==1.18.5\n", 140 | "Best match: numpy 1.18.5\n", 141 | "Adding numpy 1.18.5 to easy-install.pth file\n", 142 | "Installing f2py script to /usr/local/bin\n", 143 | "Installing f2py3 script to /usr/local/bin\n", 144 | "Installing f2py3.6 script to /usr/local/bin\n", 145 | "\n", 146 | "Using /usr/local/lib/python3.6/dist-packages\n", 147 | "Searching for nltk==3.2.5\n", 148 | "Best match: nltk 3.2.5\n", 149 | "Adding nltk 3.2.5 to easy-install.pth file\n", 150 | "\n", 151 | "Using /usr/local/lib/python3.6/dist-packages\n", 152 | "Searching for Keras==2.4.3\n", 153 | "Best match: Keras 2.4.3\n", 154 | "Adding Keras 2.4.3 to easy-install.pth file\n", 155 | "\n", 156 | "Using /usr/local/lib/python3.6/dist-packages\n", 157 | "Searching for joblib==0.17.0\n", 158 | "Best match: joblib 0.17.0\n", 159 | "Adding joblib 0.17.0 to easy-install.pth file\n", 160 | "\n", 161 | "Using /usr/local/lib/python3.6/dist-packages\n", 162 | "Searching for emoji==0.6.0\n", 163 | "Best match: emoji 0.6.0\n", 164 | "Adding emoji 0.6.0 to easy-install.pth file\n", 165 | "\n", 166 | "Using /usr/local/lib/python3.6/dist-packages\n", 167 | "Searching for Keras-Preprocessing==1.1.2\n", 168 | "Best match: Keras-Preprocessing 1.1.2\n", 169 | "Adding Keras-Preprocessing 1.1.2 to easy-install.pth file\n", 170 | "\n", 171 | "Using /usr/local/lib/python3.6/dist-packages\n", 172 | "Searching for astunparse==1.6.3\n", 173 | "Best match: astunparse 1.6.3\n", 174 | "Adding astunparse 1.6.3 to easy-install.pth file\n", 175 | "\n", 176 | "Using /usr/local/lib/python3.6/dist-packages\n", 177 | "Searching for scipy==1.4.1\n", 178 | "Best match: scipy 1.4.1\n", 179 | "Adding scipy 1.4.1 to easy-install.pth file\n", 180 | "\n", 181 | "Using /usr/local/lib/python3.6/dist-packages\n", 182 | "Searching for gast==0.3.3\n", 183 | "Best match: gast 0.3.3\n", 184 | "Adding gast 0.3.3 to easy-install.pth file\n", 185 | "\n", 186 | "Using /usr/local/lib/python3.6/dist-packages\n", 187 | "Searching for six==1.15.0\n", 188 | "Best match: six 1.15.0\n", 189 | "Adding six 1.15.0 to easy-install.pth file\n", 190 | "\n", 191 | "Using /usr/local/lib/python3.6/dist-packages\n", 192 | "Searching for termcolor==1.1.0\n", 193 | "Best match: termcolor 1.1.0\n", 194 | "Adding termcolor 1.1.0 to easy-install.pth file\n", 195 | "\n", 196 | "Using /usr/local/lib/python3.6/dist-packages\n", 197 | "Searching for wheel==0.36.1\n", 198 | "Best match: wheel 0.36.1\n", 199 | "Adding wheel 0.36.1 to easy-install.pth file\n", 200 | "Installing wheel script to /usr/local/bin\n", 201 | "\n", 202 | "Using /usr/local/lib/python3.6/dist-packages\n", 203 | "Searching for tensorflow-estimator==2.3.0\n", 204 | "Best match: tensorflow-estimator 2.3.0\n", 205 | "Adding tensorflow-estimator 2.3.0 to easy-install.pth file\n", 206 | "\n", 207 | "Using /usr/local/lib/python3.6/dist-packages\n", 208 | "Searching for wrapt==1.12.1\n", 209 | "Best match: wrapt 1.12.1\n", 210 | "Adding wrapt 1.12.1 to easy-install.pth file\n", 211 | "\n", 212 | "Using /usr/local/lib/python3.6/dist-packages\n", 213 | "Searching for opt-einsum==3.3.0\n", 214 | "Best match: opt-einsum 3.3.0\n", 215 | "Adding opt-einsum 3.3.0 to easy-install.pth file\n", 216 | "\n", 217 | "Using /usr/local/lib/python3.6/dist-packages\n", 218 | "Searching for h5py==2.10.0\n", 219 | "Best match: h5py 2.10.0\n", 220 | "Adding h5py 2.10.0 to 
easy-install.pth file\n", 221 | "\n", 222 | "Using /usr/local/lib/python3.6/dist-packages\n", 223 | "Searching for google-pasta==0.2.0\n", 224 | "Best match: google-pasta 0.2.0\n", 225 | "Adding google-pasta 0.2.0 to easy-install.pth file\n", 226 | "\n", 227 | "Using /usr/local/lib/python3.6/dist-packages\n", 228 | "Searching for protobuf==3.12.4\n", 229 | "Best match: protobuf 3.12.4\n", 230 | "Adding protobuf 3.12.4 to easy-install.pth file\n", 231 | "\n", 232 | "Using /usr/local/lib/python3.6/dist-packages\n", 233 | "Searching for grpcio==1.34.0\n", 234 | "Best match: grpcio 1.34.0\n", 235 | "Adding grpcio 1.34.0 to easy-install.pth file\n", 236 | "\n", 237 | "Using /usr/local/lib/python3.6/dist-packages\n", 238 | "Searching for tensorboard==2.3.0\n", 239 | "Best match: tensorboard 2.3.0\n", 240 | "Adding tensorboard 2.3.0 to easy-install.pth file\n", 241 | "Installing tensorboard script to /usr/local/bin\n", 242 | "\n", 243 | "Using /usr/local/lib/python3.6/dist-packages\n", 244 | "Searching for absl-py==0.10.0\n", 245 | "Best match: absl-py 0.10.0\n", 246 | "Adding absl-py 0.10.0 to easy-install.pth file\n", 247 | "\n", 248 | "Using /usr/local/lib/python3.6/dist-packages\n", 249 | "Searching for scikit-learn==0.22.2.post1\n", 250 | "Best match: scikit-learn 0.22.2.post1\n", 251 | "Adding scikit-learn 0.22.2.post1 to easy-install.pth file\n", 252 | "\n", 253 | "Using /usr/local/lib/python3.6/dist-packages\n", 254 | "Searching for PyYAML==3.13\n", 255 | "Best match: PyYAML 3.13\n", 256 | "Adding PyYAML 3.13 to easy-install.pth file\n", 257 | "\n", 258 | "Using /usr/local/lib/python3.6/dist-packages\n", 259 | "Searching for setuptools==50.3.2\n", 260 | "Best match: setuptools 50.3.2\n", 261 | "Adding setuptools 50.3.2 to easy-install.pth file\n", 262 | "Installing easy_install script to /usr/local/bin\n", 263 | "Installing easy_install-3.8 script to /usr/local/bin\n", 264 | "\n", 265 | "Using /usr/local/lib/python3.6/dist-packages\n", 266 | "Searching for google-auth-oauthlib==0.4.2\n", 267 | "Best match: google-auth-oauthlib 0.4.2\n", 268 | "Adding google-auth-oauthlib 0.4.2 to easy-install.pth file\n", 269 | "Installing google-oauthlib-tool script to /usr/local/bin\n", 270 | "\n", 271 | "Using /usr/local/lib/python3.6/dist-packages\n", 272 | "Searching for tensorboard-plugin-wit==1.7.0\n", 273 | "Best match: tensorboard-plugin-wit 1.7.0\n", 274 | "Adding tensorboard-plugin-wit 1.7.0 to easy-install.pth file\n", 275 | "\n", 276 | "Using /usr/local/lib/python3.6/dist-packages\n", 277 | "Searching for google-auth==1.17.2\n", 278 | "Best match: google-auth 1.17.2\n", 279 | "Adding google-auth 1.17.2 to easy-install.pth file\n", 280 | "\n", 281 | "Using /usr/local/lib/python3.6/dist-packages\n", 282 | "Searching for Werkzeug==1.0.1\n", 283 | "Best match: Werkzeug 1.0.1\n", 284 | "Adding Werkzeug 1.0.1 to easy-install.pth file\n", 285 | "\n", 286 | "Using /usr/local/lib/python3.6/dist-packages\n", 287 | "Searching for requests==2.23.0\n", 288 | "Best match: requests 2.23.0\n", 289 | "Adding requests 2.23.0 to easy-install.pth file\n", 290 | "\n", 291 | "Using /usr/local/lib/python3.6/dist-packages\n", 292 | "Searching for Markdown==3.3.3\n", 293 | "Best match: Markdown 3.3.3\n", 294 | "Adding Markdown 3.3.3 to easy-install.pth file\n", 295 | "Installing markdown_py script to /usr/local/bin\n", 296 | "\n", 297 | "Using /usr/local/lib/python3.6/dist-packages\n", 298 | "Searching for requests-oauthlib==1.3.0\n", 299 | "Best match: requests-oauthlib 1.3.0\n", 300 | "Adding requests-oauthlib 
1.3.0 to easy-install.pth file\n", 301 | "\n", 302 | "Using /usr/local/lib/python3.6/dist-packages\n", 303 | "Searching for pyasn1-modules==0.2.8\n", 304 | "Best match: pyasn1-modules 0.2.8\n", 305 | "Adding pyasn1-modules 0.2.8 to easy-install.pth file\n", 306 | "\n", 307 | "Using /usr/local/lib/python3.6/dist-packages\n", 308 | "Searching for cachetools==4.1.1\n", 309 | "Best match: cachetools 4.1.1\n", 310 | "Adding cachetools 4.1.1 to easy-install.pth file\n", 311 | "\n", 312 | "Using /usr/local/lib/python3.6/dist-packages\n", 313 | "Searching for rsa==4.6\n", 314 | "Best match: rsa 4.6\n", 315 | "Adding rsa 4.6 to easy-install.pth file\n", 316 | "Installing pyrsa-decrypt script to /usr/local/bin\n", 317 | "Installing pyrsa-encrypt script to /usr/local/bin\n", 318 | "Installing pyrsa-keygen script to /usr/local/bin\n", 319 | "Installing pyrsa-priv2pub script to /usr/local/bin\n", 320 | "Installing pyrsa-sign script to /usr/local/bin\n", 321 | "Installing pyrsa-verify script to /usr/local/bin\n", 322 | "\n", 323 | "Using /usr/local/lib/python3.6/dist-packages\n", 324 | "Searching for chardet==3.0.4\n", 325 | "Best match: chardet 3.0.4\n", 326 | "Adding chardet 3.0.4 to easy-install.pth file\n", 327 | "Installing chardetect script to /usr/local/bin\n", 328 | "\n", 329 | "Using /usr/local/lib/python3.6/dist-packages\n", 330 | "Searching for idna==2.10\n", 331 | "Best match: idna 2.10\n", 332 | "Adding idna 2.10 to easy-install.pth file\n", 333 | "\n", 334 | "Using /usr/local/lib/python3.6/dist-packages\n", 335 | "Searching for urllib3==1.24.3\n", 336 | "Best match: urllib3 1.24.3\n", 337 | "Adding urllib3 1.24.3 to easy-install.pth file\n", 338 | "\n", 339 | "Using /usr/local/lib/python3.6/dist-packages\n", 340 | "Searching for certifi==2020.12.5\n", 341 | "Best match: certifi 2020.12.5\n", 342 | "Adding certifi 2020.12.5 to easy-install.pth file\n", 343 | "\n", 344 | "Using /usr/local/lib/python3.6/dist-packages\n", 345 | "Searching for importlib-metadata==3.1.1\n", 346 | "Best match: importlib-metadata 3.1.1\n", 347 | "Adding importlib-metadata 3.1.1 to easy-install.pth file\n", 348 | "\n", 349 | "Using /usr/local/lib/python3.6/dist-packages\n", 350 | "Searching for oauthlib==3.1.0\n", 351 | "Best match: oauthlib 3.1.0\n", 352 | "Adding oauthlib 3.1.0 to easy-install.pth file\n", 353 | "\n", 354 | "Using /usr/local/lib/python3.6/dist-packages\n", 355 | "Searching for pyasn1==0.4.8\n", 356 | "Best match: pyasn1 0.4.8\n", 357 | "Adding pyasn1 0.4.8 to easy-install.pth file\n", 358 | "\n", 359 | "Using /usr/local/lib/python3.6/dist-packages\n", 360 | "Searching for zipp==3.4.0\n", 361 | "Best match: zipp 3.4.0\n", 362 | "Adding zipp 3.4.0 to easy-install.pth file\n", 363 | "\n", 364 | "Using /usr/local/lib/python3.6/dist-packages\n", 365 | "Finished processing dependencies for outrageclf==0.1.5\n" 366 | ], 367 | "name": "stdout" 368 | } 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": { 374 | "id": "ku3BZxXTCHpe" 375 | }, 376 | "source": [ 377 | "**Running the wrapper function**" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "metadata": { 383 | "id": "OiegrAjQCUDo" 384 | }, 385 | "source": [ 386 | "# an joblib embedding file and a model file is required\n", 387 | "# contact the Crockett lab for these model files\n", 388 | "embedding_url = \"/31k.joblib\"\n", 389 | "model_url = \"/31k.h5\"" 390 | ], 391 | "execution_count": 10, 392 | "outputs": [] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "metadata": { 397 | "id": "PMy1QJqvCtES" 398 | }, 399 | 
"source": [ 400 | "# these tweets are created purely for demostration\n", 401 | "# they are not part of, or represent any tweets in the actual training data\n", 402 | "tweets = [\n", 403 | " \"This topic infuriates me because it violates my moral stance\",\n", 404 | " \"This is just a super-normal topic #normal\",\n", 405 | " \"The type of football they play today is atrocious\"\n", 406 | " ]" 407 | ], 408 | "execution_count": 11, 409 | "outputs": [] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "metadata": { 414 | "colab": { 415 | "base_uri": "https://localhost:8080/" 416 | }, 417 | "id": "ihs24aZYCpvs", 418 | "outputId": "da5bf7e5-c164-413e-b4cb-aef971953ac8" 419 | }, 420 | "source": [ 421 | "from outrageclf.classifier import pretrained_model_predict\n", 422 | "pretrained_model_predict(tweets, embedding_url, model_url)" 423 | ], 424 | "execution_count": 12, 425 | "outputs": [ 426 | { 427 | "output_type": "stream", 428 | "text": [ 429 | "Loaded pre-trained tokenizer at: 31k.joblib\n", 430 | "WARNING:tensorflow:Layer gru_1 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 431 | "WARNING:tensorflow:Layer gru_2 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 432 | "WARNING:tensorflow:Layer gru_3 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 433 | "WARNING:tensorflow:Layer gru_4 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 434 | "Loaded pretrained model at: 31k.h5\n" 435 | ], 436 | "name": "stdout" 437 | }, 438 | { 439 | "output_type": "execute_result", 440 | "data": { 441 | "text/plain": [ 442 | "array([[9.9660861e-01],\n", 443 | " [4.0077552e-04],\n", 444 | " [6.3920277e-01]], dtype=float32)" 445 | ] 446 | }, 447 | "metadata": { 448 | "tags": [] 449 | }, 450 | "execution_count": 12 451 | } 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": { 457 | "id": "I8l_1jYuDvSV" 458 | }, 459 | "source": [ 460 | "**A peak into the model**" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": { 466 | "id": "CRoXZklZEE1E" 467 | }, 468 | "source": [ 469 | "This section gives you a closer look at every steps under `pretrained_model_predict`" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "metadata": { 475 | "id": "_pbnBGfRD4Zs" 476 | }, 477 | "source": [ 478 | "from outrageclf.preprocessing import WordEmbed, get_lemmatize_hashtag\n", 479 | "from outrageclf.classifier import _load_crockett_model" 480 | ], 481 | "execution_count": 13, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "colab": { 488 | "base_uri": "https://localhost:8080/" 489 | }, 490 | "id": "eB07TdMrEUgN", 491 | "outputId": "c5e0c61b-de21-422e-b4c5-db7f41376237" 492 | }, 493 | "source": [ 494 | "# loading our pre-trained models\n", 495 | "word_embed = WordEmbed()\n", 496 | "word_embed._get_pretrained_tokenizer(embedding_url)\n", 497 | "model = _load_crockett_model(model_url)" 498 | ], 499 | "execution_count": 14, 500 | "outputs": [ 501 | { 502 | "output_type": "stream", 503 | "text": [ 504 | "Loaded pre-trained tokenizer at: 31k.joblib\n", 505 | "WARNING:tensorflow:Layer gru_1 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. 
It will use generic GPU kernel as fallback when running on GPU\n", 506 | "WARNING:tensorflow:Layer gru_2 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 507 | "WARNING:tensorflow:Layer gru_3 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 508 | "WARNING:tensorflow:Layer gru_4 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n" 509 | ], 510 | "name": "stdout" 511 | } 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "metadata": { 517 | "id": "3En1_lLrEYtj" 518 | }, 519 | "source": [ 520 | "# the text are lemmatized and embedded into 50-d space\n", 521 | "lemmatized_text = get_lemmatize_hashtag(tweets)\n", 522 | "embedded_vector = word_embed._get_embedded_vector(lemmatized_text)" 523 | ], 524 | "execution_count": 15, 525 | "outputs": [] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "metadata": { 530 | "colab": { 531 | "base_uri": "https://localhost:8080/" 532 | }, 533 | "id": "IldpZRMNEiXE", 534 | "outputId": "9acf3f84-2f0e-4124-b0af-6180a03b4c89" 535 | }, 536 | "source": [ 537 | "for idx, tweet in enumerate(tweets):\n", 538 | " print(\"Original tweet:\", tweet)\n", 539 | " print(\"Lemmatize text:\", lemmatized_text[idx])\n", 540 | " print(\"50-d embedded vector:\", embedded_vector[idx])" 541 | ], 542 | "execution_count": 16, 543 | "outputs": [ 544 | { 545 | "output_type": "stream", 546 | "text": [ 547 | "Original tweet: This topic infuriates me because it violates my moral stance\n", 548 | "Lemmatize text: topic infuriate violate moral stance \n", 549 | "50-d embedded vector: [1760 2401 1705 611 3121 0 0 0 0 0 0 0 0 0\n", 550 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 551 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 552 | " 0 0 0 0 0 0 0 0]\n", 553 | "Original tweet: This is just a super-normal topic #normal\n", 554 | "Lemmatize text: super normal topic #normal\n", 555 | "50-d embedded vector: [1427 2033 1760 2033 0 0 0 0 0 0 0 0 0 0\n", 556 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 557 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 558 | " 0 0 0 0 0 0 0 0]\n", 559 | "Original tweet: The type of football they play today is atrocious\n", 560 | "Lemmatize text: type football play today atrocious \n", 561 | "50-d embedded vector: [ 958 2308 250 93 3486 0 0 0 0 0 0 0 0 0\n", 562 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 563 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 564 | " 0 0 0 0 0 0 0 0]\n" 565 | ], 566 | "name": "stdout" 567 | } 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "metadata": { 573 | "id": "fonhG6PW1F2G" 574 | }, 575 | "source": [ 576 | "# the model then makes prediction using the embedded_vector as inputs\n", 577 | "predict = model.predict(embedded_vector)" 578 | ], 579 | "execution_count": 17, 580 | "outputs": [] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "metadata": { 585 | "colab": { 586 | "base_uri": "https://localhost:8080/" 587 | }, 588 | "id": "l9hecInmFNe8", 589 | "outputId": "b71ada2e-d0d4-43c7-a911-ddbd89ac19d8" 590 | }, 591 | "source": [ 592 | "for idx, tweet in enumerate(tweets):\n", 593 | " print(\"Original tweet:\", tweet)\n", 594 | " print(\"Predicted probability of outrage:\", predict[idx])\n", 595 | " print(\"\\n\")" 596 | ], 597 | "execution_count": 18, 598 | "outputs": [ 599 | { 600 | "output_type": "stream", 601 | "text": [ 602 | "Original tweet: This topic infuriates me because it violates my moral 
stance\n", 603 | "Predicted probability of outrage: [0.9966086]\n", 604 | "\n", 605 | "\n", 606 | "Original tweet: This is just a super-normal topic #normal\n", 607 | "Predicted probability of outrage: [0.00040078]\n", 608 | "\n", 609 | "\n", 610 | "Original tweet: The type of football they play today is atrocious\n", 611 | "Predicted probability of outrage: [0.6392028]\n", 612 | "\n", 613 | "\n" 614 | ], 615 | "name": "stdout" 616 | } 617 | ] 618 | } 619 | ] 620 | } -------------------------------------------------------------------------------- /outrageclf/classifier.py: -------------------------------------------------------------------------------- 1 | from .model_architect import threshold_acc 2 | from .preprocessing import WordEmbed, get_lemmatize_hashtag 3 | from keras.models import load_model 4 | from joblib import load 5 | 6 | 7 | ''' 8 | Load pretrained model 9 | 10 | Input: url 11 | - Users responsibility to acquire the h5 model format 12 | - and input correct url link 13 | 14 | Output: 15 | ''' 16 | def _load_crockett_model(url): 17 | return load_model( 18 | url, 19 | custom_objects={'threshold_acc': threshold_acc} 20 | ) 21 | 22 | ''' 23 | Wrapper function for prediction: 24 | In general, if users have to call this function several times 25 | it it more efficient to load the model and use built-in predict method. 26 | 27 | Input: text vector, lemmatize_url, model_url 28 | ''' 29 | def pretrained_model_predict(text_vector, lemmatize_url, model_url): 30 | word_embed = WordEmbed() 31 | word_embed._get_pretrained_tokenizer(lemmatize_url) 32 | model = _load_crockett_model(model_url) 33 | 34 | lemmatized_text = get_lemmatize_hashtag(text_vector) 35 | embedded_vector = word_embed._get_embedded_vector(lemmatized_text) 36 | predict = model.predict(embedded_vector) 37 | 38 | return pretrained_model_predict 39 | -------------------------------------------------------------------------------- /outrageclf/helpers.py: -------------------------------------------------------------------------------- 1 | import emoji, re, collections, string 2 | import nltk 3 | from nltk import pos_tag 4 | from nltk.stem.wordnet import WordNetLemmatizer 5 | from nltk.stem.snowball import SnowballStemmer 6 | from nltk.tokenize import TweetTokenizer 7 | from nltk.tokenize import word_tokenize 8 | from nltk.corpus import wordnet, stopwords 9 | 10 | 11 | # top emojis 12 | # the list is practically derived from our datasets 13 | top_emojis = ['😂','🤣','😡','🖕','😹','🙏','👎','🌊','🙄','🤔'] 14 | lemmatizer = WordNetLemmatizer() 15 | cachedStopWordsPunctuation = set(stopwords.words("english") 16 | + [x for x in list(string.punctuation) if x not in ['!','?']] 17 | + ['',' ',' ']) 18 | 19 | # check if emojis in a string 20 | def char_is_emoji(char): 21 | return char in emoji.UNICODE_EMOJI['en'] 22 | 23 | s = set(emoji.UNICODE_EMOJI['en'].values()) 24 | def string_is_emoji_name(text): 25 | return text in s 26 | 27 | # extract a string of emojis from a string 28 | def extract_emojis(text): 29 | return ' '.join(c for c in text if c in emoji.UNICODE_EMOJI) 30 | 31 | # just get the hashtag 32 | # this function removes the function, even in hashtag 33 | def get_hashtag(text): 34 | text = re.sub(r'[%s]' % re.escape("""!"$%&()*+,-./:;<=>?@[\]^_`{|}~"""), '', text) 35 | return ",".join([i.lower() for i in text.split() if i.startswith("#") ]) 36 | 37 | # word_tokenize as defined in nltk library 38 | def token_postag(text): 39 | tokens = word_tokenize(text) 40 | return pos_tag(tokens) 41 | 42 | # function to simplify POS 43 | # 
/outrageclf/helpers.py: -------------------------------------------------------------------------------- 1 | import emoji, re, collections, string 2 | import nltk 3 | from nltk import pos_tag 4 | from nltk.stem.wordnet import WordNetLemmatizer 5 | from nltk.stem.snowball import SnowballStemmer 6 | from nltk.tokenize import TweetTokenizer 7 | from nltk.tokenize import word_tokenize 8 | from nltk.corpus import wordnet, stopwords 9 | 10 | 11 | # top emojis 12 | # the list is empirically derived from our datasets 13 | top_emojis = ['😂','🤣','😡','🖕','😹','🙏','👎','🌊','🙄','🤔'] 14 | lemmatizer = WordNetLemmatizer() 15 | cachedStopWordsPunctuation = set(stopwords.words("english") 16 |                     + [x for x in list(string.punctuation) if x not in ['!','?']] 17 |                     + ['',' ','  ']) 18 | 19 | # check whether a single character is an emoji 20 | def char_is_emoji(char): 21 |     return char in emoji.UNICODE_EMOJI['en'] 22 | 23 | s = set(emoji.UNICODE_EMOJI['en'].values()) 24 | def string_is_emoji_name(text): 25 |     return text in s 26 | 27 | # extract a string of emojis from a string 28 | def extract_emojis(text): 29 |     return ' '.join(c for c in text if c in emoji.UNICODE_EMOJI) 30 | 31 | # just get the hashtags 32 | # this function removes punctuation, even inside hashtags 33 | def get_hashtag(text): 34 |     text = re.sub(r'[%s]' % re.escape("""!"$%&()*+,-./:;<=>?@[\]^_`{|}~"""), '', text) 35 |     return ",".join([i.lower() for i in text.split() if i.startswith("#") ]) 36 | 37 | # word_tokenize as defined in nltk library 38 | def token_postag(text): 39 |     tokens = word_tokenize(text) 40 |     return pos_tag(tokens) 41 | 42 | # function to simplify POS tags 43 | # exclusively used for lemmatization using WORDNET 44 | def get_wordnet_pos(treebank_tag): 45 |     if treebank_tag.startswith('J'): 46 |         return wordnet.ADJ 47 |     elif treebank_tag.startswith('V'): 48 |         return wordnet.VERB 49 |     elif treebank_tag.startswith('N'): 50 |         return wordnet.NOUN 51 |     elif treebank_tag.startswith('R'): 52 |         return wordnet.ADV 53 |     else: 54 |         return None 55 | 56 | # similar to get_wordnet_pos 57 | # but more granular in order to create more features 58 | def modify_pos(pos_counts): 59 |     result_dic = {} 60 |     for key in pos_counts.keys(): 61 |         if key.startswith('J'): 62 |             if "adj" in result_dic: 63 |                 result_dic["adj"] += pos_counts[key] 64 |             else: 65 |                 result_dic["adj"] = pos_counts[key] 66 |         elif key.startswith('V'): 67 |             if "verb" in result_dic: 68 |                 result_dic["verb"] += pos_counts[key] 69 |             else: 70 |                 result_dic["verb"] = pos_counts[key] 71 |         elif key.startswith('N'): 72 |             if "noun" in result_dic: 73 |                 result_dic["noun"] += pos_counts[key] 74 |             else: 75 |                 result_dic["noun"] = pos_counts[key] 76 |         elif key.startswith('R'): 77 |             if "adv" in result_dic: 78 |                 result_dic["adv"] += pos_counts[key] 79 |             else: 80 |                 result_dic["adv"] = pos_counts[key] 81 |         elif key in ['PRP', 'PRP$']: 82 |             if "pronoun" in result_dic: 83 |                 result_dic["pronoun"] += pos_counts[key] 84 |             else: 85 |                 result_dic["pronoun"] = pos_counts[key] 86 |         elif key.startswith('W'): 87 |             if "wh" in result_dic: 88 |                 result_dic["wh"] += pos_counts[key] 89 |             else: 90 |                 result_dic["wh"] = pos_counts[key] 91 |         else: 92 |             if "other" in result_dic: 93 |                 result_dic["other"] += pos_counts[key] 94 |             else: 95 |                 result_dic["other"] = pos_counts[key] 96 |     return result_dic 97 | 98 | 99 | # tokenize and then lemmatize the string 100 | def token_stem_lemmatize(text): 101 |     tokens_pos = token_postag(text) 102 |     result_string = '' 103 |     for word, tag in tokens_pos: 104 |         wntag = get_wordnet_pos(tag) 105 |         # no POS tag is passed when wntag is None 106 |         if wntag is None: 107 |             result_string += lemmatizer.lemmatize(word.lower()) 108 |         else: 109 |             result_string += lemmatizer.lemmatize(word.lower(), pos=wntag) 110 |         result_string += ' ' 111 |     return result_string 112 | 113 | # remove stop words and short words 114 | def stop_short_process(text): 115 |     text = ' '.join([word for word in text.split() if word not in cachedStopWordsPunctuation]) 116 |     text = re.sub("[^a-zA-Z ]+", '', text) # remove apostrophe now 117 |     text = ' '.join(word for word in text.split() if len(word)>2) 118 |     return text 119 | 120 | # wrapper over the full preprocessing pipeline 121 | def tweet_process(tweet): 122 |     tweet = re.sub('//t.co\S+', ' ', tweet)  # remove link 123 |     tweet = re.sub('http\S+\s*', ' ', tweet)  # remove URLs 124 |     tweet = re.sub('@\S+', ' ', tweet)  # remove mentions 125 |     tweet = re.sub('&', ' ', tweet)  # remove ampersands 126 |     tweet = re.sub('RT@|RT @', ' ', tweet)  # remove RT 127 |     tweet = re.sub('#\S+', ' ', tweet)  # remove hashtags 128 |     tweet = re.sub('[%s]' % re.escape("""!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', tweet)  # remove punctuation, leave behind apostrophe"'" 129 |     tweet = re.sub('\s+', ' ', tweet)  # remove extra whitespace 130 |     tweet = token_stem_lemmatize(tweet) 131 |     tweet = stop_short_process(tweet) 132 |     return tweet 133 | 134 | # check if the tweet has an embedded link 135 | def has_link(tweet): 136 |     short_link = re.findall('//t.co\S+',tweet) 137 |     url_link = re.findall('http\S+\s*',tweet) 138 |     result = 0 if not short_link and not url_link else 1 139 |     return (result) 140 | 141 | 
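# --- Illustrative sketch (editorial addition, not part of the original module) ---
# What the helpers above do to a raw tweet, assuming the required NLTK data
# ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords') is downloaded:
#
#   raw = "RT @user: This infuriates me!! https://t.co/xyz #outrage"
#   tweet_process(raw)   # -> roughly 'infuriate' (links, mentions, hashtags,
#                        #    stopwords, and short words stripped; verbs lemmatized)
#   get_hashtag(raw)     # -> '#outrage' (hashtags only, lowercased, comma-joined)
#   has_link(raw)        # -> 1 (the tweet contains an embedded link)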
142 | def psy_tweet_process(tweet): 143 |     stemmer = SnowballStemmer("english") 144 |     tokenizer = TweetTokenizer() 145 |     tweet_tokenized = tokenizer.tokenize(tweet) 146 |     n = len(tweet_tokenized) 147 |     try: 148 |         tweet_tokenized = [unicode(y.encode("utf-8"), errors='ignore') for y in tweet_tokenized] 149 |         stemmed = [stemmer.stem(y) for y in tweet_tokenized] 150 |     except: 151 |         stemmed = [stemmer.stem(y) for y in tweet_tokenized] 152 |     stemmed = [d for d in stemmed if d not in cachedStopWordsPunctuation] 153 |     return stemmed, n 154 | -------------------------------------------------------------------------------- /outrageclf/model_architect.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Specification of the deep and transfer learning models, as called in training.py. 4 | 5 | Model architectures include: 6 |     - Deep LSTM 7 |     - Deep GRU 8 |     - Bidirectional LSTM 9 |     - Bidirectional with Attention 10 | 11 | We only provide access to our Deep GRU model, as used in the paper. 12 | 13 | """ 14 | 15 | from sklearn import ensemble 16 | import tensorflow as tf 17 | import keras 18 | import keras.layers as layers 19 | import tensorflow.keras.backend as K 20 | from keras import Sequential, optimizers, initializers, regularizers, constraints 21 | from tensorflow.python.keras.layers import Layer 22 | 23 | 24 | embedding_dim = 50 25 | maxlen = 50 26 | 27 | 28 | ''' 29 | Thresholded accuracy metric: a prediction counts as positive when it is >= threshold 30 | ''' 31 | 32 | def threshold_acc(y_true, y_pred, threshold = 0.7): 33 |     if K.backend() == 'tensorflow': 34 |         return K.mean(K.equal(y_true, 35 |                       K.cast(K.greater_equal(y_pred,threshold), y_true.dtype))) 36 |     else: 37 |         return K.mean(K.equal(y_true, 38 |                       K.greater_equal(y_pred,threshold))) 39 | 40 | 41 | 42 | ''' 43 | 3-layer LSTM model: 128, 64, and 1 units respectively, 44 | with 2 dropout layers 45 | ''' 46 | 47 | def lstm_model (embedding_matrix, vocab_size): 48 |     model = Sequential() 49 |     model.add(layers.Embedding(vocab_size, embedding_dim, 50 |                                weights=[embedding_matrix], 51 |                                input_length=maxlen, 52 |                                trainable=True)) 53 |     model.add(layers.LSTM(128)) 54 |     model.add(layers.Dropout(0.5)) 55 |     model.add(layers.Dense(64, activation='relu')) 56 |     model.add(layers.Dropout(0.5)) 57 |     model.add(layers.Dense(1, activation='sigmoid')) 58 |     model.compile(optimizer='adam', 59 |                   loss='binary_crossentropy', 60 |                   metrics=[threshold_acc]) 61 |     return (model) 62 | 63 | 64 | 65 | ''' 66 | Deep GRU model: GRU layers of 256, 128, 64, and 32 units, 67 | with 2 dropout layers 68 | ''' 69 | 70 | def deep_gru_model (embedding_matrix, vocab_size): 71 |     model = Sequential() 72 |     model.add(layers.Embedding(vocab_size, embedding_dim, 73 |                                weights=[embedding_matrix], 74 |                                input_length=maxlen, 75 |                                trainable=True)) 76 |     model.add(layers.GRU(256, return_sequences = True)) 77 |     model.add(layers.GRU(128, return_sequences = True)) 78 |     model.add(layers.GRU(64, return_sequences = True)) 79 |     model.add(layers.GRU(32)) 80 |     model.add(layers.Dropout(0.3)) 81 |     model.add(layers.Dense(64, activation='relu')) 82 |     model.add(layers.Dropout(0.5)) 83 |     model.add(layers.Dense(1, activation='sigmoid')) 84 |     model.compile(optimizer='adam', 85 |                   loss='binary_crossentropy', 86 |                   metrics=[threshold_acc]) 87 |     return (model) 88 | 89 | 90 | 
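# --- Illustrative sketch (editorial addition, not part of the original module) ---
# How these model constructors are meant to be called. The vocabulary size and
# embedding matrix below are made-up stand-ins; in practice they would come from
# the fitted tokenizer and the pretrained 50-d word embeddings:
#
#   import numpy as np
#   vocab_size = 31000                                          # hypothetical
#   embedding_matrix = np.random.rand(vocab_size, embedding_dim)
#   model = deep_gru_model(embedding_matrix, vocab_size)
#   # inputs are integer sequences of shape (n_samples, maxlen), as produced
#   # by the tokenizer in preprocessing.py; outputs are outrage probabilities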
'''
Bi-directional model: 3 stacked bidirectional GRU layers (128, 128, and 64
units), with 2 Dropout layers
'''

def deep_bidirectional_model(embedding_matrix, vocab_size):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim,
                               weights=[embedding_matrix],
                               input_length=maxlen, trainable=True))
    model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
    model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
    model.add(layers.Bidirectional(layers.GRU(64)))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[threshold_acc])
    return model


'''
Architecture for the Attention layer,
including: - a dot_product wrapper
           - a predefined AttentionWithContext layer
'''

def dot_product(x, kernel):
    """
    Wrapper for the dot product operation, compatible with both
    the Theano and TensorFlow backends.
    Args:
        x: input tensor, e.g. of shape (samples, steps, features)
        kernel: weight vector or matrix to contract with the last axis of x
    Returns:
        The dot product of x and kernel, with the trailing singleton axis
        removed on the TensorFlow backend.
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
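The expand_dims/squeeze pair in the TensorFlow branch exists because K.dot wants at least 2-D operands. A quick shape check (illustrative only; the dimensions are arbitrary) makes the contract concrete for both kinds of kernel used below:

import numpy as np
import tensorflow.keras.backend as K

x = K.constant(np.ones((4, 10, 8)))   # (samples, steps, features)
u = K.constant(np.ones(8))            # context vector, shape (features,)
W = K.constant(np.ones((8, 8)))       # weight matrix, shape (features, features)

print(K.int_shape(dot_product(x, u)))  # (4, 10): one score per step
print(K.int_shape(dot_product(x, W)))  # (4, 10, 8): per-step linear map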
154 | """ 155 | 156 | def __init__(self, 157 | W_regularizer=None, u_regularizer=None, b_regularizer=None, 158 | W_constraint=None, u_constraint=None, b_constraint=None, 159 | bias=True, **kwargs): 160 | 161 | self.supports_masking = True 162 | self.init = initializers.get('glorot_uniform') 163 | 164 | self.W_regularizer = regularizers.get(W_regularizer) 165 | self.u_regularizer = regularizers.get(u_regularizer) 166 | self.b_regularizer = regularizers.get(b_regularizer) 167 | 168 | self.W_constraint = constraints.get(W_constraint) 169 | self.u_constraint = constraints.get(u_constraint) 170 | self.b_constraint = constraints.get(b_constraint) 171 | 172 | self.bias = bias 173 | super(AttentionWithContext, self).__init__(**kwargs) 174 | 175 | def build(self, input_shape): 176 | assert len(input_shape) == 3 177 | 178 | self.W = self.add_weight((input_shape[-1], input_shape[-1],), 179 | initializer=self.init, 180 | name='{}_W'.format(self.name), 181 | regularizer=self.W_regularizer, 182 | constraint=self.W_constraint) 183 | if self.bias: 184 | self.b = self.add_weight((input_shape[-1],), 185 | initializer='zero', 186 | name='{}_b'.format(self.name), 187 | regularizer=self.b_regularizer, 188 | constraint=self.b_constraint) 189 | 190 | self.u = self.add_weight((input_shape[-1],), 191 | initializer=self.init, 192 | name='{}_u'.format(self.name), 193 | regularizer=self.u_regularizer, 194 | constraint=self.u_constraint) 195 | 196 | super(AttentionWithContext, self).build(input_shape) 197 | 198 | def compute_mask(self, input, input_mask=None): 199 | # do not pass the mask to the next layers 200 | return None 201 | 202 | def call(self, x, mask=None): 203 | uit = dot_product(x, self.W) 204 | 205 | if self.bias: 206 | uit += self.b 207 | 208 | uit = K.tanh(uit) 209 | ait = dot_product(uit, self.u) 210 | 211 | a = K.exp(ait) 212 | 213 | # apply mask after the exp. will be re-normalized next 214 | if mask is not None: 215 | # Cast the mask to floatX to avoid float64 upcasting in theano 216 | a *= K.cast(mask, K.floatx()) 217 | 218 | # in some cases especially in the early stages of training the sum may be almost zero 219 | # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. 
'''
The Attention model is a bidirectional GRU with an Attention layer on top
'''

def attention_model(embedding_matrix, vocab_size):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim,
                               weights=[embedding_matrix],
                               input_length=maxlen,
                               trainable=True))
    model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
    model.add(layers.Bidirectional(layers.GRU(64, return_sequences=True)))
    model.add(AttentionWithContext())
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[threshold_acc])
    return model
--------------------------------------------------------------------------------
/outrageclf/preprocessing.py:
--------------------------------------------------------------------------------
import urllib.request

import numpy as np
from joblib import dump, load
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from . import helpers

glove_default_url = 'https://worksheets.codalab.org/rest/bundles/'\
                    '0x4090ba96b8a444c2a44b2c47884c25f2/'\
                    'contents/blob/glove.twitter.27B.50d.txt'

'''
Get lemmatized hashtags:
- Create lemmatized + hashtag features for a text vector
- Input: a text vector
- Output: a vector of lemmatized keywords + hashtags (if any exist)
'''

def get_lemmatize_hashtag(text_vector):
    hashtag_ls = [helpers.get_hashtag(text) for text in text_vector]
    wn_lemmatize_ls = [helpers.tweet_process(text) for text in text_vector]
    hashtag_lemmatize = [' '.join(lemma.split(" ") + hashtag.split(" "))
                         for lemma, hashtag in zip(wn_lemmatize_ls, hashtag_ls)]
    return hashtag_lemmatize


'''
Word embedding object:
* MAXLEN defaults to 50; we currently do not support customization.

* TRAINING:
  - It is the user's responsibility to submit a valid tokenizer path in .joblib format.

* USING A PRETRAINED WORD EMBEDDING:
  - Please contact the Crockett Lab for access to the tokenizer.
  - It is the user's responsibility to submit a valid tokenizer path in .joblib format.
'''

class WordEmbed:
    def __init__(self):
        self.tokenizer_path = None
        self.tokenizer = None

    def _get_pretrained_tokenizer(self, path):
        self.tokenizer_path = path
        self.tokenizer = load(self.tokenizer_path)
        print("Loaded pre-trained tokenizer at:", path)

    def _train_new_tokenizer(self, text_vector, saving_path):
        self.tokenizer_path = saving_path
        embedding_tokenizer = Tokenizer()
        embedding_tokenizer.fit_on_texts(text_vector)

        self.tokenizer = embedding_tokenizer
        dump(embedding_tokenizer, self.tokenizer_path)
        print("Trained and saved new tokenizer at:",
              self.tokenizer_path)

    def _get_embedded_vector(self, text_vector):
        embedded = pad_sequences(self.tokenizer.texts_to_sequences(text_vector),
                                 padding='post',
                                 maxlen=50)
        return embedded


'''
Create an embedding matrix
- based on a pre-defined tokenizer
- currently only GloVe Twitter 50d is supported
- will be updated to support different embeddings in the future

Input:  - word_index: from an associated Tokenizer, called from preprocessing.py
        - filepath: path to a pretrained embedding, e.g. GloVe Twitter 50d

Result: - An embedding matrix for embedding-based models such as LSTM and GRU
        * It is strictly tied to the Tokenizer that produced the word_index argument
        * It is the user's responsibility to make sure they match
'''

def create_embedding_matrix(word_index, filepath):
    embedding_dim = 50
    # add 1 because index 0 is reserved for padding
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix


'''
Create an embedding matrix from the default URL

Similar to create_embedding_matrix,
but streams an online copy of the GloVe Twitter 27B 50d embedding.
'''

def create_embedding_matrix_default(word_index):
    embedding_dim = 50
    # add 1 because index 0 is reserved for padding
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    file = urllib.request.urlopen(glove_default_url)

    for line in file:
        word, *vector = line.split()
        if word.decode() in word_index:
            idx = word_index[word.decode()]
            embedding_matrix[idx] = np.array(
                vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix
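To see how these pieces fit together outside of training.py, here is a minimal hypothetical sketch: the texts, the tokenizer save path, and the local GloVe file path are all placeholders, and the local-file variant create_embedding_matrix is used instead of the URL default.

from outrageclf.preprocessing import (WordEmbed, get_lemmatize_hashtag,
                                      create_embedding_matrix)

texts = ["This is absolutely outrageous #shame", "What a lovely morning"]

# lemmatize + extract hashtags, then fit a tokenizer on the processed text
lemmatized = get_lemmatize_hashtag(texts)
word_embed = WordEmbed()
word_embed._train_new_tokenizer(lemmatized, "my_tokenizer.joblib")  # hypothetical path

# integer-encode and pad to maxlen=50, ready for the models above
X = word_embed._get_embedded_vector(lemmatized)
print(X.shape)  # (2, 50)

# build the GloVe-initialized matrix from a local file (hypothetical path)
embedding_matrix = create_embedding_matrix(word_embed.tokenizer.word_index,
                                           "glove.twitter.27B.50d.txt")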
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import pathlib
from setuptools import setup

# The directory containing this file
HERE = pathlib.Path(__file__).parent

# The text of the README file
README = (HERE / "README.md").read_text()

setup(name='outrageclf',
      version='0.1.6',
      description='Outrage Classifier - developed by the Crockett Lab',
      long_description=README,
      url='https://github.com/CrockettLab/outrage_classifier',
      install_requires=[
          'emoji',
          'joblib',
          'keras',
          'nltk',
          'numpy',
          'scikit-learn',  # the PyPI package is named scikit-learn, not sklearn
          'tensorflow'
      ],
      author='Tuan Nguyen Doan',
      author_email='tuan.nguyen.doan@aya.yale.edu',
      license='Creative Commons Attribution-NonCommercial-ShareAlike 2.0',
      packages=['outrageclf'],
      include_package_data=True,
      zip_safe=False)
"./*.csv" 11 | - savepath: path to where models are saved 12 | - filename: name of saved models 13 | - model: specify one of the model architect to train e.g: "LSTM", "GRU" 14 | - text_column: name of training text column e.g. "text" 15 | - class_column: name of class column e.g "outrage" 16 | 17 | Available model: 18 | - LSTM with Glove Twitter 19 | - GRU with Glove Twitter 20 | 21 | """ 22 | 23 | 24 | import pandas as pd 25 | import numpy as np 26 | import tensorflow as tf 27 | import argparse 28 | import keras 29 | import outrageclf 30 | from outrageclf.preprocessing import WordEmbed, get_lemmatize_hashtag, create_embedding_matrix_default 31 | from outrageclf.model_architect import lstm_model, deep_gru_model 32 | 33 | 34 | model = ["LSTM", "GRU"] 35 | 36 | if __name__ == '__main__': 37 | #Initialize the parser 38 | parser = argparse.ArgumentParser(description="Outrage Classifier Training. Developed by The Crockett Lab") 39 | parser.add_argument( 40 | "filepath", 41 | help='specifying the path to the training dataset. This should be in the form of .../*.csv' 42 | ) 43 | 44 | parser.add_argument( 45 | "savepath", 46 | help= ('specifying the path to save the model.', 47 | 'There will be two files being saved to this path: a tokenizer and a trained model' 48 | ) 49 | ) 50 | 51 | parser.add_argument( 52 | "filename", 53 | help= ('name of the training file.', 54 | 'This is used to attached to the name of tokenizer and the trained model.') 55 | ) 56 | 57 | parser.add_argument( 58 | "model", 59 | help= 'specifying the model for the training. Default value is "LSTM". Allowed values are '+', '.join(model), 60 | choices=model, 61 | nargs='?', 62 | default="LSTM", 63 | metavar = "MODEL" 64 | ) 65 | 66 | parser.add_argument( 67 | "text_column", 68 | help= 'name of text column in csv file' 69 | ) 70 | 71 | parser.add_argument( 72 | "class_column", 73 | help= 'name of class column in csv file. This must be in the form of binary 0, 1 data type' 74 | ) 75 | 76 | 77 | args = parser.parse_args() 78 | df = pd.read_csv(args.filepath) 79 | print ("File loaded") 80 | 81 | word_embed = WordEmbed() 82 | tokenizer_path = args.savepath + args.filename + '_tokenizer' + '.joblib' 83 | lemmatize_hashtag = get_lemmatize_hashtag(df[args.text_column]) 84 | # train the new tokenizer and the embedding matrix for the model 85 | word_embed._train_new_tokenizer(lemmatize_hashtag, tokenizer_path) 86 | word_index = word_embed.tokenizer.word_index 87 | embedding_matrix = create_embedding_matrix_default(word_index) 88 | print ("Embedding matrix created") 89 | 90 | # get X and y train 91 | X_train = word_embed._get_embedded_vector(lemmatize_hashtag) 92 | y_train = np.array(df[args.class_column]) 93 | print ("Training data prepared") 94 | 95 | if args.model == 'LSTM': 96 | model = lstm_model( 97 | embedding_matrix, 98 | vocab_size = len(word_index) + 1 99 | ) 100 | elif args.model == 'GRU': 101 | model = deep_gru_model( 102 | embedding_matrix, 103 | vocab_size = len(word_index) + 1 104 | ) 105 | 106 | # train model 107 | history = model.fit( 108 | X_train, 109 | y_train, 110 | epochs = 20, 111 | batch_size = 300, 112 | verbose = 1 113 | ) 114 | 115 | # save model 116 | model_path = args.savepath + args.filename + '.h5' 117 | model.save(model_path) 118 | 119 | print("Finish training and write " + args.model + " model to:" + model_path) 120 | --------------------------------------------------------------------------------