├── LICENSE
├── MANIFEST.in
├── README.md
├── example.ipynb
├── outrageclf
│   ├── classifier.py
│   ├── helpers.py
│   ├── model_architect.py
│   └── preprocessing.py
├── setup.py
└── training.py

/LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Attribution-NonCommercial-ShareAlike 2.0 2 | 3 | 4 | 5 | 6 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL 7 | 8 | SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT 9 | 10 | RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. 11 | 12 | CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND 13 | 14 | DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. 15 | 16 | 17 | 18 | 19 | License 20 | 21 | 22 | 23 | 24 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS 25 | 26 | PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR 27 | 28 | OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS 29 | 30 | LICENSE OR COPYRIGHT LAW IS PROHIBITED. 31 | 32 | 33 | 34 | 35 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO 36 | 37 | BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS 38 | 39 | CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS. 40 | 41 | 42 | 43 | 44 | 1. Definitions 45 | 46 | 47 | 48 | 49 | a. "Collective Work" means a work, such as a periodical issue, anthology or 50 | 51 | encyclopedia, in which the Work in its entirety in unmodified form, along 52 | 53 | with a number of other contributions, constituting separate and independent 54 | 55 | works in themselves, are assembled into a collective whole. A work that constitutes 56 | 57 | a Collective Work will not be considered a Derivative Work (as defined below) 58 | 59 | for the purposes of this License. 60 | 61 | 62 | 63 | 64 | b. "Derivative Work" means a work based upon the Work or upon the Work and 65 | 66 | other pre-existing works, such as a translation, musical arrangement, dramatization, 67 | 68 | fictionalization, motion picture version, sound recording, art reproduction, 69 | 70 | abridgment, condensation, or any other form in which the Work may be recast, 71 | 72 | transformed, or adapted, except that a work that constitutes a Collective 73 | 74 | Work will not be considered a Derivative Work for the purpose of this License. 75 | 76 | For the avoidance of doubt, where the Work is a musical composition or sound 77 | 78 | recording, the synchronization of the Work in timed-relation with a moving 79 | 80 | image ("synching") will be considered a Derivative Work for the purpose of 81 | 82 | this License. 83 | 84 | 85 | 86 | 87 | c. "Licensor" means the individual or entity that offers the Work under the 88 | 89 | terms of this License. 90 | 91 | 92 | 93 | 94 | d. "Original Author" means the individual or entity who created the Work. 95 | 96 | 97 | 98 | 99 | e. "Work" means the copyrightable work of authorship offered under the terms 100 | 101 | of this License. 102 | 103 | 104 | 105 | 106 | f. "You" means an individual or entity exercising rights under this License 107 | 108 | who has not previously violated the terms of this License with respect to 109 | 110 | the Work, or who has received express permission from the Licensor to exercise 111 | 112 | rights under this License despite a previous violation. 113 | 114 | 115 | 116 | 117 | g. 
"License Elements" means the following high-level license attributes as 118 | 119 | selected by Licensor and indicated in the title of this License: Attribution, 120 | 121 | Noncommercial, ShareAlike. 122 | 123 | 124 | 125 | 126 | 2. Fair Use Rights. Nothing in this license is intended to reduce, limit, 127 | 128 | or restrict any rights arising from fair use, first sale or other limitations 129 | 130 | on the exclusive rights of the copyright owner under copyright law or other 131 | 132 | applicable laws. 133 | 134 | 135 | 136 | 137 | 3. License Grant. Subject to the terms and conditions of this License, Licensor 138 | 139 | hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for 140 | 141 | the duration of the applicable copyright) license to exercise the rights in 142 | 143 | the Work as stated below: 144 | 145 | 146 | 147 | 148 | a. to reproduce the Work, to incorporate the Work into one or more Collective 149 | 150 | Works, and to reproduce the Work as incorporated in the Collective Works; 151 | 152 | 153 | 154 | 155 | b. to create and reproduce Derivative Works; 156 | 157 | 158 | 159 | 160 | c. to distribute copies or phonorecords of, display publicly, perform publicly, 161 | 162 | and perform publicly by means of a digital audio transmission the Work including 163 | 164 | as incorporated in Collective Works; 165 | 166 | 167 | 168 | 169 | d. to distribute copies or phonorecords of, display publicly, perform publicly, 170 | 171 | and perform publicly by means of a digital audio transmission Derivative Works; 172 | 173 | 174 | 175 | 176 | The above rights may be exercised in all media and formats whether now known 177 | 178 | or hereafter devised. The above rights include the right to make such modifications 179 | 180 | as are technically necessary to exercise the rights in other media and formats. 181 | 182 | All rights not expressly granted by Licensor are hereby reserved, including 183 | 184 | but not limited to the rights set forth in Sections 4(e) and 4(f). 185 | 186 | 187 | 188 | 189 | 4. Restrictions. The license granted in Section 3 above is expressly made 190 | 191 | subject to and limited by the following restrictions: 192 | 193 | 194 | 195 | 196 | a. You may distribute, publicly display, publicly perform, or publicly digitally 197 | 198 | perform the Work only under the terms of this License, and You must include 199 | 200 | a copy of, or the Uniform Resource Identifier for, this License with every 201 | 202 | copy or phonorecord of the Work You distribute, publicly display, publicly 203 | 204 | perform, or publicly digitally perform. You may not offer or impose any terms 205 | 206 | on the Work that alter or restrict the terms of this License or the recipients' 207 | 208 | exercise of the rights granted hereunder. You may not sublicense the Work. 209 | 210 | You must keep intact all notices that refer to this License and to the disclaimer 211 | 212 | of warranties. You may not distribute, publicly display, publicly perform, 213 | 214 | or publicly digitally perform the Work with any technological measures that 215 | 216 | control access or use of the Work in a manner inconsistent with the terms 217 | 218 | of this License Agreement. The above applies to the Work as incorporated in 219 | 220 | a Collective Work, but this does not require the Collective Work apart from 221 | 222 | the Work itself to be made subject to the terms of this License. 
If You create 223 | 224 | a Collective Work, upon notice from any Licensor You must, to the extent practicable, 225 | 226 | remove from the Collective Work any reference to such Licensor or the Original 227 | 228 | Author, as requested. If You create a Derivative Work, upon notice from any 229 | 230 | Licensor You must, to the extent practicable, remove from the Derivative Work 231 | 232 | any reference to such Licensor or the Original Author, as requested. 233 | 234 | 235 | 236 | 237 | b. You may distribute, publicly display, publicly perform, or publicly digitally 238 | 239 | perform a Derivative Work only under the terms of this License, a later version 240 | 241 | of this License with the same License Elements as this License, or a Creative 242 | 243 | Commons iCommons license that contains the same License Elements as this License 244 | 245 | (e.g. Attribution-NonCommercial-ShareAlike 2.0 Japan). You must include a 246 | 247 | copy of, or the Uniform Resource Identifier for, this License or other license 248 | 249 | specified in the previous sentence with every copy or phonorecord of each 250 | 251 | Derivative Work You distribute, publicly display, publicly perform, or publicly 252 | 253 | digitally perform. You may not offer or impose any terms on the Derivative 254 | 255 | Works that alter or restrict the terms of this License or the recipients' 256 | 257 | exercise of the rights granted hereunder, and You must keep intact all notices 258 | 259 | that refer to this License and to the disclaimer of warranties. You may not 260 | 261 | distribute, publicly display, publicly perform, or publicly digitally perform 262 | 263 | the Derivative Work with any technological measures that control access or 264 | 265 | use of the Work in a manner inconsistent with the terms of this License Agreement. 266 | 267 | The above applies to the Derivative Work as incorporated in a Collective Work, 268 | 269 | but this does not require the Collective Work apart from the Derivative Work 270 | 271 | itself to be made subject to the terms of this License. 272 | 273 | 274 | 275 | 276 | c. You may not exercise any of the rights granted to You in Section 3 above 277 | 278 | in any manner that is primarily intended for or directed toward commercial 279 | 280 | advantage or private monetary compensation. The exchange of the Work for other 281 | 282 | copyrighted works by means of digital file-sharing or otherwise shall not 283 | 284 | be considered to be intended for or directed toward commercial advantage or 285 | 286 | private monetary compensation, provided there is no payment of any monetary 287 | 288 | compensation in connection with the exchange of copyrighted works. 289 | 290 | 291 | 292 | 293 | d. 
If you distribute, publicly display, publicly perform, or publicly digitally 294 | 295 | perform the Work or any Derivative Works or Collective Works, You must keep 296 | 297 | intact all copyright notices for the Work and give the Original Author credit 298 | 299 | reasonable to the medium or means You are utilizing by conveying the name 300 | 301 | (or pseudonym if applicable) of the Original Author if supplied; the title 302 | 303 | of the Work if supplied; to the extent reasonably practicable, the Uniform 304 | 305 | Resource Identifier, if any, that Licensor specifies to be associated with 306 | 307 | the Work, unless such URI does not refer to the copyright notice or licensing 308 | 309 | information for the Work; and in the case of a Derivative Work, a credit identifying 310 | 311 | the use of the Work in the Derivative Work (e.g., "French translation of the 312 | 313 | Work by Original Author," or "Screenplay based on original Work by Original 314 | 315 | Author"). Such credit may be implemented in any reasonable manner; provided, 316 | 317 | however, that in the case of a Derivative Work or Collective Work, at a minimum 318 | 319 | such credit will appear where any other comparable authorship credit appears 320 | 321 | and in a manner at least as prominent as such other comparable authorship 322 | 323 | credit. 324 | 325 | 326 | 327 | 328 | e. For the avoidance of doubt, where the Work is a musical composition: 329 | 330 | 331 | 332 | 333 | i. Performance Royalties Under Blanket Licenses. Licensor reserves the exclusive 334 | 335 | right to collect, whether individually or via a performance rights society 336 | 337 | (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital 338 | 339 | performance (e.g. webcast) of the Work if that performance is primarily intended 340 | 341 | for or directed toward commercial advantage or private monetary compensation. 342 | 343 | 344 | 345 | 346 | ii. Mechanical Rights and Statutory Royalties. Licensor reserves the exclusive 347 | 348 | right to collect, whether individually or via a music rights agency or designated 349 | 350 | agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from 351 | 352 | the Work ("cover version") and distribute, subject to the compulsory license 353 | 354 | created by 17 USC Section 115 of the US Copyright Act (or the equivalent in 355 | 356 | other jurisdictions), if Your distribution of such cover version is primarily 357 | 358 | intended for or directed toward commercial advantage or private monetary compensation. 359 | 360 | 361 | 362 | 363 | f. Webcasting Rights and Statutory Royalties. For the avoidance of doubt, 364 | 365 | where the Work is a sound recording, Licensor reserves the exclusive right 366 | 367 | to collect, whether individually or via a performance-rights society (e.g. 368 | 369 | SoundExchange), royalties for the public digital performance (e.g. webcast) 370 | 371 | of the Work, subject to the compulsory license created by 17 USC Section 114 372 | 373 | of the US Copyright Act (or the equivalent in other jurisdictions), if Your 374 | 375 | public digital performance is primarily intended for or directed toward commercial 376 | 377 | advantage or private monetary compensation. 378 | 379 | 380 | 381 | 382 | 5. 
Representations, Warranties and Disclaimer 383 | 384 | 385 | 386 | 387 | UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS 388 | 389 | THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING 390 | 391 | THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, 392 | 393 | WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, 394 | 395 | OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE 396 | 397 | OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE 398 | 399 | EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU. 400 | 401 | 402 | 403 | 404 | 6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, 405 | 406 | IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, 407 | 408 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS 409 | 410 | LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY 411 | 412 | OF SUCH DAMAGES. 413 | 414 | 415 | 416 | 417 | 7. Termination 418 | 419 | 420 | 421 | 422 | a. This License and the rights granted hereunder will terminate automatically 423 | 424 | upon any breach by You of the terms of this License. Individuals or entities 425 | 426 | who have received Derivative Works or Collective Works from You under this 427 | 428 | License, however, will not have their licenses terminated provided such individuals 429 | 430 | or entities remain in full compliance with those licenses. Sections 1, 2, 431 | 432 | 5, 6, 7, and 8 will survive any termination of this License. 433 | 434 | 435 | 436 | 437 | b. Subject to the above terms and conditions, the license granted here is 438 | 439 | perpetual (for the duration of the applicable copyright in the Work). Notwithstanding 440 | 441 | the above, Licensor reserves the right to release the Work under different 442 | 443 | license terms or to stop distributing the Work at any time; provided, however 444 | 445 | that any such election will not serve to withdraw this License (or any other 446 | 447 | license that has been, or is required to be, granted under the terms of this 448 | 449 | License), and this License will continue in full force and effect unless terminated 450 | 451 | as stated above. 452 | 453 | 454 | 455 | 456 | 8. Miscellaneous 457 | 458 | 459 | 460 | 461 | a. Each time You distribute or publicly digitally perform the Work or a Collective 462 | 463 | Work, the Licensor offers to the recipient a license to the Work on the same 464 | 465 | terms and conditions as the license granted to You under this License. 466 | 467 | 468 | 469 | 470 | b. Each time You distribute or publicly digitally perform a Derivative Work, 471 | 472 | Licensor offers to the recipient a license to the original Work on the same 473 | 474 | terms and conditions as the license granted to You under this License. 475 | 476 | 477 | 478 | 479 | c. If any provision of this License is invalid or unenforceable under applicable 480 | 481 | law, it shall not affect the validity or enforceability of the remainder of 482 | 483 | the terms of this License, and without further action by the parties to this 484 | 485 | agreement, such provision shall be reformed to the minimum extent necessary 486 | 487 | to make such provision valid and enforceable. 488 | 489 | 490 | 491 | 492 | d. 
No term or provision of this License shall be deemed waived and no breach 493 | 494 | consented to unless such waiver or consent shall be in writing and signed 495 | 496 | by the party to be charged with such waiver or consent. 497 | 498 | 499 | 500 | 501 | e. This License constitutes the entire agreement between the parties with 502 | 503 | respect to the Work licensed here. There are no understandings, agreements 504 | 505 | or representations with respect to the Work not specified here. Licensor shall 506 | 507 | not be bound by any additional provisions that may appear in any communication 508 | 509 | from You. This License may not be modified without the mutual written agreement 510 | 511 | of the Licensor and You. 512 | 513 | 514 | 515 | 516 | Creative Commons is not a party to this License, and makes no warranty whatsoever 517 | 518 | in connection with the Work. Creative Commons will not be liable to You or 519 | 520 | any party on any legal theory for any damages whatsoever, including without 521 | 522 | limitation any general, special, incidental or consequential damages arising 523 | 524 | in connection to this license. Notwithstanding the foregoing two (2) sentences, 525 | 526 | if Creative Commons has expressly identified itself as the Licensor hereunder, 527 | 528 | it shall have all rights and obligations of Licensor. 529 | 530 | 531 | 532 | 533 | Except for the limited purpose of indicating to the public that the Work is 534 | 535 | licensed under the CCPL, neither party will use the trademark "Creative Commons" 536 | 537 | or any related trademark or logo of Creative Commons without the prior written 538 | 539 | consent of Creative Commons. Any permitted use will be in compliance with 540 | 541 | Creative Commons' then-current trademark usage guidelines, as may be published 542 | 543 | on its website or otherwise made available upon request from time to time. 544 | 545 | 546 | 547 | 548 | Creative Commons may be contacted at http://creativecommons.org/. 549 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include COPYING 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # *DOC*: Digital Outrage Classifier 2 | 3 | > Developed by members of the Crockett Lab at Yale University in the departments of Psychology and Statistics & Data Science, `DOC` is a Python package that allows researchers to predict the probability that tweets contain moral outrage. 4 | 5 | > The development of the code and materials in this repository is described in detail in the paper "[How social learning amplifies moral outrage expression in online social networks](https://psyarxiv.com/gf7t5)" (2021). 6 | 7 | [![made-with-python][made-with-python]](https://www.python.org/) 8 | [![Outrageclf version][outrage-image]](www.google.com) 9 | [![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-green.svg)](www.google.com) 10 | [![CC NC-SA 2.0](https://img.shields.io/badge/License-CC--NC--SA%202.0-lightgrey)](www.google.com) 11 | 12 | 13 | ## Repository Contributors 14 | * William Brady | Postdoctoral Fellow | Yale University | william.brady@yale.edu | [Website](http://williamjbrady.com) 15 | * Killian McLoughlin | Ph.D. 
Student | Princeton University | k.mcloughlin@princeton.edu | [LinkedIn](www.linkedin.com/in/killian-mc-loughlin-5a151032) 16 | * Tuan Nguyen Doan | Data Scientist | Quora | tuan.nguyen.doan@aya.yale.edu | [LinkedIn](https://www.linkedin.com/in/tuan-nguyen-doan) 17 | 18 | 19 | ## Installation 20 | 21 | The first step is to clone the repo into a local directory on your computer. Using the terminal, navigate to the location where you want to store the package and run the following command: 22 | 23 | ```sh 24 | git clone "https://github.com/CrockettLab/outrage_classifier" 25 | ``` 26 | 27 | Then run the command below. The package is compatible with both Python 2 and Python 3. 28 | ```sh 29 | python setup.py install 30 | ``` 31 | 32 | ## Importing 33 | The package can be imported using the following code: 34 | 35 | ```python 36 | import outrageclf as oclf 37 | from outrageclf.preprocessing import WordEmbed, get_lemmatize_hashtag 38 | from outrageclf.classifier import _load_crockett_model 39 | ``` 40 | 41 | For those using macOS, a runtime error (described [here](https://stackoverflow.com/questions/53014306/error-15-initializing-libiomp5-dylib-but-found-libiomp5-dylib-already-initial)) may prevent the package from being successfully imported. If you experience this issue, setting the environment variable `KMP_DUPLICATE_LIB_OK` to `TRUE` should solve the problem: 42 | 43 | ```python 44 | import os 45 | os.environ['KMP_DUPLICATE_LIB_OK']='True' 46 | ``` 47 | 48 | ## Usage 49 | The current version of `outrageclf` allows users to predict moral outrage using a pre-trained deep gated recurrent unit (GRU) model, as described in detail in [this](www.google.com) article. 50 | 51 | To run the pre-trained model used in the article, you will need **model files that are NOT hosted in this repository**. If you would like access to these files, see 'Accessing Model Files' below. The omitted files are: 52 | 53 | - [x] A pre-trained embedding model, stored in a `.joblib` format 54 | - [x] A pre-trained GRU model, stored in a `.h5` format 55 | 56 | In order to predict the probability that a tweet contains moral outrage, we use the following pipeline: 57 | 58 | ```mermaid 59 | graph LR; A[Load pretrained models] --> B[Preprocess text] --> C[Embed text] --> D[Make prediction] 60 | ``` 61 | 62 | Below is a complete coded example of the pipeline. Note that this example **assumes the presence of either our pretrained-model files or similar files generated by the user** (`embedding_url` and `model_url` are the paths to those files): 63 | 64 | ```python 65 | 66 | tweets = [ 67 | "This topic infuriates me because it violates my moral stance", 68 | "This is just a super-normal topic #normal" 69 | ] 70 | 71 | # loading our pre-trained models 72 | word_embed = WordEmbed() 73 | word_embed._get_pretrained_tokenizer(embedding_url) 74 | model = _load_crockett_model(model_url) 75 | 76 | # the tweets are lemmatized and embedded into 50-d space 77 | lemmatized_text = get_lemmatize_hashtag(tweets) 78 | embedded_vector = word_embed._get_embedded_vector(lemmatized_text) 79 | predict = model.predict(embedded_vector) 80 | ``` 81 | 82 | Alternatively, classifications can be generated using the package's model wrapper function, stored in the `classifier` module. This bypasses the need to lemmatize and embed the text input yourself: 83 | 84 | ```python 85 | from outrageclf.classifier import pretrained_model_predict 86 | pretrained_model_predict(tweets, embedding_url, model_url) 87 | ``` 
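The wrapper returns an array of predicted probabilities, one per tweet (see `example.ipynb` for real output). If you need binary outrage labels rather than probabilities, you can threshold the output yourself. Below is a minimal sketch; the `0.7` cutoff mirrors the `threshold_acc` training metric in `model_architect.py`, but the right cutoff for your application is an assumption you should validate on your own data:

```python
import numpy as np
from outrageclf.classifier import pretrained_model_predict

# embedding_url and model_url are the paths to the model files described above
probs = pretrained_model_predict(tweets, embedding_url, model_url)
labels = (np.asarray(probs).ravel() >= 0.7).astype(int)  # 1 = predicted outrage
```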
88 | ## Accessing Model Files 89 | In order to access the pre-trained model files, please fill out [this form](https://forms.gle/sRDbmtGK1dW6z6ff6). The form will ask for your email and a brief description of your use case. We will then email you the model files. Note that the classifier is for use in academic research only. See the license for more information. 90 | 91 | ## Example Notebook 92 | `example.ipynb` demonstrates both of these use cases. 93 | 94 | ## Citation 95 | Brady, W.J., McLoughlin, K.L., Doan, T.N., & Crockett, M.J. (2021). How social learning amplifies moral outrage expression in online social networks. [PsyArXiv](https://psyarxiv.com/gf7t5). doi: 10.31234/osf.io/gf7t5 96 | 97 | ## License 98 | This work is licensed under a 99 | [Creative Commons Attribution-NonCommercial-ShareAlike 2.0 Generic License][cc-nc-sa]. 100 | 101 | [![CC NC-SA 2.0][cc-nc-sa-image]][cc-nc-sa] 102 | 103 | ## Release History 104 | * 0.1.0 105 | * Initial release 106 | * 0.1.5 107 | * Official release with paper 108 | * 0.1.6 109 | * Hotfix 110 | 111 | 112 | [made-with-python]: https://img.shields.io/badge/Made%20with-Python-FF0000.svg 113 | [outrage-image]: https://img.shields.io/badge/DOC-v0.1.6-orange.svg 114 | [cc-nc-sa]: https://creativecommons.org/licenses/by-nc-sa/2.0/ 115 | [cc-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/2.0/88x31.png 116 | [cc-nc-sa-shield]: https://img.shields.io/badge/License-CC--NC--SA%202.0-lightgrey 117 | -------------------------------------------------------------------------------- /example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "example.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "accelerator": "GPU" 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "code", 18 | "metadata": { 19 | "id": "SFOg446dh3ki", 20 | "colab": { 21 | "base_uri": "https://localhost:8080/" 22 | }, 23 | "outputId": "d643edf1-967f-49ae-a27f-1f1830277210" 24 | }, 25 | "source": [ 26 | "%ls" 27 | ], 28 | "execution_count": 1, 29 | "outputs": [ 30 | { 31 | "output_type": "stream", 32 | "text": [ 33 | "\u001b[0m\u001b[01;34mdrive\u001b[0m/ \u001b[01;34msample_data\u001b[0m/\n" 34 | ], 35 | "name": "stdout" 36 | } 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "CT-cXbOmhbqv", 43 | "colab": { 44 | "base_uri": "https://localhost:8080/" 45 | }, 46 | "outputId": "11f9bf87-521b-44f4-dfa7-1d09b269134f" 47 | }, 48 | "source": [ 49 | "%cd \"drive/My Drive/outrageclf/\"" 50 | ], 51 | "execution_count": 2, 52 | "outputs": [ 53 | { 54 | "output_type": "stream", 55 | "text": [ 56 | "/content/drive/My Drive/outrageclf\n" 57 | ], 58 | "name": "stdout" 59 | } 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "metadata": { 65 | "id": "EF1t_c4FyRSq", 66 | "colab": { 67 | "base_uri": "https://localhost:8080/" 68 | }, 69 | "outputId": "ac6ffa73-afab-49a9-a2cb-96b67181386b" 70 | }, 71 | "source": [ 72 | "!python3 setup.py install" 73 | ], 74 | "execution_count": 3, 75 | "outputs": [ 76 | { 77 | "output_type": "stream", 78 | "text": [ 79 | "running install\n", 80 | "running bdist_egg\n", 81 | "running egg_info\n", 82 | "writing outrageclf.egg-info/PKG-INFO\n", 83 | "writing dependency_links to outrageclf.egg-info/dependency_links.txt\n", 84 | "writing requirements to outrageclf.egg-info/requires.txt\n", 85 | "writing 
top-level names to outrageclf.egg-info/top_level.txt\n", 86 | "reading manifest file 'outrageclf.egg-info/SOURCES.txt'\n", 87 | "reading manifest template 'MANIFEST.in'\n", 88 | "warning: no files found matching 'COPYING'\n", 89 | "writing manifest file 'outrageclf.egg-info/SOURCES.txt'\n", 90 | "installing library code to build/bdist.linux-x86_64/egg\n", 91 | "running install_lib\n", 92 | "running build_py\n", 93 | "copying outrageclf/helpers.py -> build/lib/outrageclf\n", 94 | "creating build/bdist.linux-x86_64/egg\n", 95 | "creating build/bdist.linux-x86_64/egg/outrageclf\n", 96 | "copying build/lib/outrageclf/__init__.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 97 | "copying build/lib/outrageclf/helpers.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 98 | "copying build/lib/outrageclf/model_architect.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 99 | "copying build/lib/outrageclf/classifier.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 100 | "copying build/lib/outrageclf/preprocessing.py -> build/bdist.linux-x86_64/egg/outrageclf\n", 101 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/__init__.py to __init__.cpython-36.pyc\n", 102 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/helpers.py to helpers.cpython-36.pyc\n", 103 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/model_architect.py to model_architect.cpython-36.pyc\n", 104 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/classifier.py to classifier.cpython-36.pyc\n", 105 | "byte-compiling build/bdist.linux-x86_64/egg/outrageclf/preprocessing.py to preprocessing.cpython-36.pyc\n", 106 | "creating build/bdist.linux-x86_64/egg/EGG-INFO\n", 107 | "copying outrageclf.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 108 | "copying outrageclf.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 109 | "copying outrageclf.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 110 | "copying outrageclf.egg-info/not-zip-safe -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 111 | "copying outrageclf.egg-info/requires.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 112 | "copying outrageclf.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO\n", 113 | "creating 'dist/outrageclf-0.1.5-py3.6.egg' and adding 'build/bdist.linux-x86_64/egg' to it\n", 114 | "removing 'build/bdist.linux-x86_64/egg' (and everything under it)\n", 115 | "Processing outrageclf-0.1.5-py3.6.egg\n", 116 | "creating /usr/local/lib/python3.6/dist-packages/outrageclf-0.1.5-py3.6.egg\n", 117 | "Extracting outrageclf-0.1.5-py3.6.egg to /usr/local/lib/python3.6/dist-packages\n", 118 | "Adding outrageclf 0.1.5 to easy-install.pth file\n", 119 | "\n", 120 | "Installed /usr/local/lib/python3.6/dist-packages/outrageclf-0.1.5-py3.6.egg\n", 121 | "Processing dependencies for outrageclf==0.1.5\n", 122 | "Searching for tensorflow==2.3.0\n", 123 | "Best match: tensorflow 2.3.0\n", 124 | "Adding tensorflow 2.3.0 to easy-install.pth file\n", 125 | "Installing estimator_ckpt_converter script to /usr/local/bin\n", 126 | "Installing saved_model_cli script to /usr/local/bin\n", 127 | "Installing tensorboard script to /usr/local/bin\n", 128 | "Installing tf_upgrade_v2 script to /usr/local/bin\n", 129 | "Installing tflite_convert script to /usr/local/bin\n", 130 | "Installing toco script to /usr/local/bin\n", 131 | "Installing toco_from_protos script to /usr/local/bin\n", 132 | "\n", 133 | "Using /usr/local/lib/python3.6/dist-packages\n", 134 | "Searching for sklearn==0.0\n", 135 | 
"Best match: sklearn 0.0\n", 136 | "Adding sklearn 0.0 to easy-install.pth file\n", 137 | "\n", 138 | "Using /usr/local/lib/python3.6/dist-packages\n", 139 | "Searching for numpy==1.18.5\n", 140 | "Best match: numpy 1.18.5\n", 141 | "Adding numpy 1.18.5 to easy-install.pth file\n", 142 | "Installing f2py script to /usr/local/bin\n", 143 | "Installing f2py3 script to /usr/local/bin\n", 144 | "Installing f2py3.6 script to /usr/local/bin\n", 145 | "\n", 146 | "Using /usr/local/lib/python3.6/dist-packages\n", 147 | "Searching for nltk==3.2.5\n", 148 | "Best match: nltk 3.2.5\n", 149 | "Adding nltk 3.2.5 to easy-install.pth file\n", 150 | "\n", 151 | "Using /usr/local/lib/python3.6/dist-packages\n", 152 | "Searching for Keras==2.4.3\n", 153 | "Best match: Keras 2.4.3\n", 154 | "Adding Keras 2.4.3 to easy-install.pth file\n", 155 | "\n", 156 | "Using /usr/local/lib/python3.6/dist-packages\n", 157 | "Searching for joblib==0.17.0\n", 158 | "Best match: joblib 0.17.0\n", 159 | "Adding joblib 0.17.0 to easy-install.pth file\n", 160 | "\n", 161 | "Using /usr/local/lib/python3.6/dist-packages\n", 162 | "Searching for emoji==0.6.0\n", 163 | "Best match: emoji 0.6.0\n", 164 | "Adding emoji 0.6.0 to easy-install.pth file\n", 165 | "\n", 166 | "Using /usr/local/lib/python3.6/dist-packages\n", 167 | "Searching for Keras-Preprocessing==1.1.2\n", 168 | "Best match: Keras-Preprocessing 1.1.2\n", 169 | "Adding Keras-Preprocessing 1.1.2 to easy-install.pth file\n", 170 | "\n", 171 | "Using /usr/local/lib/python3.6/dist-packages\n", 172 | "Searching for astunparse==1.6.3\n", 173 | "Best match: astunparse 1.6.3\n", 174 | "Adding astunparse 1.6.3 to easy-install.pth file\n", 175 | "\n", 176 | "Using /usr/local/lib/python3.6/dist-packages\n", 177 | "Searching for scipy==1.4.1\n", 178 | "Best match: scipy 1.4.1\n", 179 | "Adding scipy 1.4.1 to easy-install.pth file\n", 180 | "\n", 181 | "Using /usr/local/lib/python3.6/dist-packages\n", 182 | "Searching for gast==0.3.3\n", 183 | "Best match: gast 0.3.3\n", 184 | "Adding gast 0.3.3 to easy-install.pth file\n", 185 | "\n", 186 | "Using /usr/local/lib/python3.6/dist-packages\n", 187 | "Searching for six==1.15.0\n", 188 | "Best match: six 1.15.0\n", 189 | "Adding six 1.15.0 to easy-install.pth file\n", 190 | "\n", 191 | "Using /usr/local/lib/python3.6/dist-packages\n", 192 | "Searching for termcolor==1.1.0\n", 193 | "Best match: termcolor 1.1.0\n", 194 | "Adding termcolor 1.1.0 to easy-install.pth file\n", 195 | "\n", 196 | "Using /usr/local/lib/python3.6/dist-packages\n", 197 | "Searching for wheel==0.36.1\n", 198 | "Best match: wheel 0.36.1\n", 199 | "Adding wheel 0.36.1 to easy-install.pth file\n", 200 | "Installing wheel script to /usr/local/bin\n", 201 | "\n", 202 | "Using /usr/local/lib/python3.6/dist-packages\n", 203 | "Searching for tensorflow-estimator==2.3.0\n", 204 | "Best match: tensorflow-estimator 2.3.0\n", 205 | "Adding tensorflow-estimator 2.3.0 to easy-install.pth file\n", 206 | "\n", 207 | "Using /usr/local/lib/python3.6/dist-packages\n", 208 | "Searching for wrapt==1.12.1\n", 209 | "Best match: wrapt 1.12.1\n", 210 | "Adding wrapt 1.12.1 to easy-install.pth file\n", 211 | "\n", 212 | "Using /usr/local/lib/python3.6/dist-packages\n", 213 | "Searching for opt-einsum==3.3.0\n", 214 | "Best match: opt-einsum 3.3.0\n", 215 | "Adding opt-einsum 3.3.0 to easy-install.pth file\n", 216 | "\n", 217 | "Using /usr/local/lib/python3.6/dist-packages\n", 218 | "Searching for h5py==2.10.0\n", 219 | "Best match: h5py 2.10.0\n", 220 | "Adding h5py 2.10.0 to 
easy-install.pth file\n", 221 | "\n", 222 | "Using /usr/local/lib/python3.6/dist-packages\n", 223 | "Searching for google-pasta==0.2.0\n", 224 | "Best match: google-pasta 0.2.0\n", 225 | "Adding google-pasta 0.2.0 to easy-install.pth file\n", 226 | "\n", 227 | "Using /usr/local/lib/python3.6/dist-packages\n", 228 | "Searching for protobuf==3.12.4\n", 229 | "Best match: protobuf 3.12.4\n", 230 | "Adding protobuf 3.12.4 to easy-install.pth file\n", 231 | "\n", 232 | "Using /usr/local/lib/python3.6/dist-packages\n", 233 | "Searching for grpcio==1.34.0\n", 234 | "Best match: grpcio 1.34.0\n", 235 | "Adding grpcio 1.34.0 to easy-install.pth file\n", 236 | "\n", 237 | "Using /usr/local/lib/python3.6/dist-packages\n", 238 | "Searching for tensorboard==2.3.0\n", 239 | "Best match: tensorboard 2.3.0\n", 240 | "Adding tensorboard 2.3.0 to easy-install.pth file\n", 241 | "Installing tensorboard script to /usr/local/bin\n", 242 | "\n", 243 | "Using /usr/local/lib/python3.6/dist-packages\n", 244 | "Searching for absl-py==0.10.0\n", 245 | "Best match: absl-py 0.10.0\n", 246 | "Adding absl-py 0.10.0 to easy-install.pth file\n", 247 | "\n", 248 | "Using /usr/local/lib/python3.6/dist-packages\n", 249 | "Searching for scikit-learn==0.22.2.post1\n", 250 | "Best match: scikit-learn 0.22.2.post1\n", 251 | "Adding scikit-learn 0.22.2.post1 to easy-install.pth file\n", 252 | "\n", 253 | "Using /usr/local/lib/python3.6/dist-packages\n", 254 | "Searching for PyYAML==3.13\n", 255 | "Best match: PyYAML 3.13\n", 256 | "Adding PyYAML 3.13 to easy-install.pth file\n", 257 | "\n", 258 | "Using /usr/local/lib/python3.6/dist-packages\n", 259 | "Searching for setuptools==50.3.2\n", 260 | "Best match: setuptools 50.3.2\n", 261 | "Adding setuptools 50.3.2 to easy-install.pth file\n", 262 | "Installing easy_install script to /usr/local/bin\n", 263 | "Installing easy_install-3.8 script to /usr/local/bin\n", 264 | "\n", 265 | "Using /usr/local/lib/python3.6/dist-packages\n", 266 | "Searching for google-auth-oauthlib==0.4.2\n", 267 | "Best match: google-auth-oauthlib 0.4.2\n", 268 | "Adding google-auth-oauthlib 0.4.2 to easy-install.pth file\n", 269 | "Installing google-oauthlib-tool script to /usr/local/bin\n", 270 | "\n", 271 | "Using /usr/local/lib/python3.6/dist-packages\n", 272 | "Searching for tensorboard-plugin-wit==1.7.0\n", 273 | "Best match: tensorboard-plugin-wit 1.7.0\n", 274 | "Adding tensorboard-plugin-wit 1.7.0 to easy-install.pth file\n", 275 | "\n", 276 | "Using /usr/local/lib/python3.6/dist-packages\n", 277 | "Searching for google-auth==1.17.2\n", 278 | "Best match: google-auth 1.17.2\n", 279 | "Adding google-auth 1.17.2 to easy-install.pth file\n", 280 | "\n", 281 | "Using /usr/local/lib/python3.6/dist-packages\n", 282 | "Searching for Werkzeug==1.0.1\n", 283 | "Best match: Werkzeug 1.0.1\n", 284 | "Adding Werkzeug 1.0.1 to easy-install.pth file\n", 285 | "\n", 286 | "Using /usr/local/lib/python3.6/dist-packages\n", 287 | "Searching for requests==2.23.0\n", 288 | "Best match: requests 2.23.0\n", 289 | "Adding requests 2.23.0 to easy-install.pth file\n", 290 | "\n", 291 | "Using /usr/local/lib/python3.6/dist-packages\n", 292 | "Searching for Markdown==3.3.3\n", 293 | "Best match: Markdown 3.3.3\n", 294 | "Adding Markdown 3.3.3 to easy-install.pth file\n", 295 | "Installing markdown_py script to /usr/local/bin\n", 296 | "\n", 297 | "Using /usr/local/lib/python3.6/dist-packages\n", 298 | "Searching for requests-oauthlib==1.3.0\n", 299 | "Best match: requests-oauthlib 1.3.0\n", 300 | "Adding requests-oauthlib 
1.3.0 to easy-install.pth file\n", 301 | "\n", 302 | "Using /usr/local/lib/python3.6/dist-packages\n", 303 | "Searching for pyasn1-modules==0.2.8\n", 304 | "Best match: pyasn1-modules 0.2.8\n", 305 | "Adding pyasn1-modules 0.2.8 to easy-install.pth file\n", 306 | "\n", 307 | "Using /usr/local/lib/python3.6/dist-packages\n", 308 | "Searching for cachetools==4.1.1\n", 309 | "Best match: cachetools 4.1.1\n", 310 | "Adding cachetools 4.1.1 to easy-install.pth file\n", 311 | "\n", 312 | "Using /usr/local/lib/python3.6/dist-packages\n", 313 | "Searching for rsa==4.6\n", 314 | "Best match: rsa 4.6\n", 315 | "Adding rsa 4.6 to easy-install.pth file\n", 316 | "Installing pyrsa-decrypt script to /usr/local/bin\n", 317 | "Installing pyrsa-encrypt script to /usr/local/bin\n", 318 | "Installing pyrsa-keygen script to /usr/local/bin\n", 319 | "Installing pyrsa-priv2pub script to /usr/local/bin\n", 320 | "Installing pyrsa-sign script to /usr/local/bin\n", 321 | "Installing pyrsa-verify script to /usr/local/bin\n", 322 | "\n", 323 | "Using /usr/local/lib/python3.6/dist-packages\n", 324 | "Searching for chardet==3.0.4\n", 325 | "Best match: chardet 3.0.4\n", 326 | "Adding chardet 3.0.4 to easy-install.pth file\n", 327 | "Installing chardetect script to /usr/local/bin\n", 328 | "\n", 329 | "Using /usr/local/lib/python3.6/dist-packages\n", 330 | "Searching for idna==2.10\n", 331 | "Best match: idna 2.10\n", 332 | "Adding idna 2.10 to easy-install.pth file\n", 333 | "\n", 334 | "Using /usr/local/lib/python3.6/dist-packages\n", 335 | "Searching for urllib3==1.24.3\n", 336 | "Best match: urllib3 1.24.3\n", 337 | "Adding urllib3 1.24.3 to easy-install.pth file\n", 338 | "\n", 339 | "Using /usr/local/lib/python3.6/dist-packages\n", 340 | "Searching for certifi==2020.12.5\n", 341 | "Best match: certifi 2020.12.5\n", 342 | "Adding certifi 2020.12.5 to easy-install.pth file\n", 343 | "\n", 344 | "Using /usr/local/lib/python3.6/dist-packages\n", 345 | "Searching for importlib-metadata==3.1.1\n", 346 | "Best match: importlib-metadata 3.1.1\n", 347 | "Adding importlib-metadata 3.1.1 to easy-install.pth file\n", 348 | "\n", 349 | "Using /usr/local/lib/python3.6/dist-packages\n", 350 | "Searching for oauthlib==3.1.0\n", 351 | "Best match: oauthlib 3.1.0\n", 352 | "Adding oauthlib 3.1.0 to easy-install.pth file\n", 353 | "\n", 354 | "Using /usr/local/lib/python3.6/dist-packages\n", 355 | "Searching for pyasn1==0.4.8\n", 356 | "Best match: pyasn1 0.4.8\n", 357 | "Adding pyasn1 0.4.8 to easy-install.pth file\n", 358 | "\n", 359 | "Using /usr/local/lib/python3.6/dist-packages\n", 360 | "Searching for zipp==3.4.0\n", 361 | "Best match: zipp 3.4.0\n", 362 | "Adding zipp 3.4.0 to easy-install.pth file\n", 363 | "\n", 364 | "Using /usr/local/lib/python3.6/dist-packages\n", 365 | "Finished processing dependencies for outrageclf==0.1.5\n" 366 | ], 367 | "name": "stdout" 368 | } 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": { 374 | "id": "ku3BZxXTCHpe" 375 | }, 376 | "source": [ 377 | "**Running the wrapper function**" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "metadata": { 383 | "id": "OiegrAjQCUDo" 384 | }, 385 | "source": [ 386 | "# an joblib embedding file and a model file is required\n", 387 | "# contact the Crockett lab for these model files\n", 388 | "embedding_url = \"/31k.joblib\"\n", 389 | "model_url = \"/31k.h5\"" 390 | ], 391 | "execution_count": 10, 392 | "outputs": [] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "metadata": { 397 | "id": "PMy1QJqvCtES" 398 | }, 399 | 
"source": [ 400 | "# these tweets are created purely for demostration\n", 401 | "# they are not part of, or represent any tweets in the actual training data\n", 402 | "tweets = [\n", 403 | " \"This topic infuriates me because it violates my moral stance\",\n", 404 | " \"This is just a super-normal topic #normal\",\n", 405 | " \"The type of football they play today is atrocious\"\n", 406 | " ]" 407 | ], 408 | "execution_count": 11, 409 | "outputs": [] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "metadata": { 414 | "colab": { 415 | "base_uri": "https://localhost:8080/" 416 | }, 417 | "id": "ihs24aZYCpvs", 418 | "outputId": "da5bf7e5-c164-413e-b4cb-aef971953ac8" 419 | }, 420 | "source": [ 421 | "from outrageclf.classifier import pretrained_model_predict\n", 422 | "pretrained_model_predict(tweets, embedding_url, model_url)" 423 | ], 424 | "execution_count": 12, 425 | "outputs": [ 426 | { 427 | "output_type": "stream", 428 | "text": [ 429 | "Loaded pre-trained tokenizer at: 31k.joblib\n", 430 | "WARNING:tensorflow:Layer gru_1 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 431 | "WARNING:tensorflow:Layer gru_2 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 432 | "WARNING:tensorflow:Layer gru_3 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 433 | "WARNING:tensorflow:Layer gru_4 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 434 | "Loaded pretrained model at: 31k.h5\n" 435 | ], 436 | "name": "stdout" 437 | }, 438 | { 439 | "output_type": "execute_result", 440 | "data": { 441 | "text/plain": [ 442 | "array([[9.9660861e-01],\n", 443 | " [4.0077552e-04],\n", 444 | " [6.3920277e-01]], dtype=float32)" 445 | ] 446 | }, 447 | "metadata": { 448 | "tags": [] 449 | }, 450 | "execution_count": 12 451 | } 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": { 457 | "id": "I8l_1jYuDvSV" 458 | }, 459 | "source": [ 460 | "**A peak into the model**" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": { 466 | "id": "CRoXZklZEE1E" 467 | }, 468 | "source": [ 469 | "This section gives you a closer look at every steps under `pretrained_model_predict`" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "metadata": { 475 | "id": "_pbnBGfRD4Zs" 476 | }, 477 | "source": [ 478 | "from outrageclf.preprocessing import WordEmbed, get_lemmatize_hashtag\n", 479 | "from outrageclf.classifier import _load_crockett_model" 480 | ], 481 | "execution_count": 13, 482 | "outputs": [] 483 | }, 484 | { 485 | "cell_type": "code", 486 | "metadata": { 487 | "colab": { 488 | "base_uri": "https://localhost:8080/" 489 | }, 490 | "id": "eB07TdMrEUgN", 491 | "outputId": "c5e0c61b-de21-422e-b4c5-db7f41376237" 492 | }, 493 | "source": [ 494 | "# loading our pre-trained models\n", 495 | "word_embed = WordEmbed()\n", 496 | "word_embed._get_pretrained_tokenizer(embedding_url)\n", 497 | "model = _load_crockett_model(model_url)" 498 | ], 499 | "execution_count": 14, 500 | "outputs": [ 501 | { 502 | "output_type": "stream", 503 | "text": [ 504 | "Loaded pre-trained tokenizer at: 31k.joblib\n", 505 | "WARNING:tensorflow:Layer gru_1 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. 
It will use generic GPU kernel as fallback when running on GPU\n", 506 | "WARNING:tensorflow:Layer gru_2 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 507 | "WARNING:tensorflow:Layer gru_3 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n", 508 | "WARNING:tensorflow:Layer gru_4 will not use cuDNN kernel since it doesn't meet the cuDNN kernel criteria. It will use generic GPU kernel as fallback when running on GPU\n" 509 | ], 510 | "name": "stdout" 511 | } 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "metadata": { 517 | "id": "3En1_lLrEYtj" 518 | }, 519 | "source": [ 520 | "# the text are lemmatized and embedded into 50-d space\n", 521 | "lemmatized_text = get_lemmatize_hashtag(tweets)\n", 522 | "embedded_vector = word_embed._get_embedded_vector(lemmatized_text)" 523 | ], 524 | "execution_count": 15, 525 | "outputs": [] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "metadata": { 530 | "colab": { 531 | "base_uri": "https://localhost:8080/" 532 | }, 533 | "id": "IldpZRMNEiXE", 534 | "outputId": "9acf3f84-2f0e-4124-b0af-6180a03b4c89" 535 | }, 536 | "source": [ 537 | "for idx, tweet in enumerate(tweets):\n", 538 | " print(\"Original tweet:\", tweet)\n", 539 | " print(\"Lemmatize text:\", lemmatized_text[idx])\n", 540 | " print(\"50-d embedded vector:\", embedded_vector[idx])" 541 | ], 542 | "execution_count": 16, 543 | "outputs": [ 544 | { 545 | "output_type": "stream", 546 | "text": [ 547 | "Original tweet: This topic infuriates me because it violates my moral stance\n", 548 | "Lemmatize text: topic infuriate violate moral stance \n", 549 | "50-d embedded vector: [1760 2401 1705 611 3121 0 0 0 0 0 0 0 0 0\n", 550 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 551 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 552 | " 0 0 0 0 0 0 0 0]\n", 553 | "Original tweet: This is just a super-normal topic #normal\n", 554 | "Lemmatize text: super normal topic #normal\n", 555 | "50-d embedded vector: [1427 2033 1760 2033 0 0 0 0 0 0 0 0 0 0\n", 556 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 557 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 558 | " 0 0 0 0 0 0 0 0]\n", 559 | "Original tweet: The type of football they play today is atrocious\n", 560 | "Lemmatize text: type football play today atrocious \n", 561 | "50-d embedded vector: [ 958 2308 250 93 3486 0 0 0 0 0 0 0 0 0\n", 562 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 563 | " 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", 564 | " 0 0 0 0 0 0 0 0]\n" 565 | ], 566 | "name": "stdout" 567 | } 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "metadata": { 573 | "id": "fonhG6PW1F2G" 574 | }, 575 | "source": [ 576 | "# the model then makes prediction using the embedded_vector as inputs\n", 577 | "predict = model.predict(embedded_vector)" 578 | ], 579 | "execution_count": 17, 580 | "outputs": [] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "metadata": { 585 | "colab": { 586 | "base_uri": "https://localhost:8080/" 587 | }, 588 | "id": "l9hecInmFNe8", 589 | "outputId": "b71ada2e-d0d4-43c7-a911-ddbd89ac19d8" 590 | }, 591 | "source": [ 592 | "for idx, tweet in enumerate(tweets):\n", 593 | " print(\"Original tweet:\", tweet)\n", 594 | " print(\"Predicted probability of outrage:\", predict[idx])\n", 595 | " print(\"\\n\")" 596 | ], 597 | "execution_count": 18, 598 | "outputs": [ 599 | { 600 | "output_type": "stream", 601 | "text": [ 602 | "Original tweet: This topic infuriates me because it violates my moral 
stance\n", 603 | "Predicted probability of outrage: [0.9966086]\n", 604 | "\n", 605 | "\n", 606 | "Original tweet: This is just a super-normal topic #normal\n", 607 | "Predicted probability of outrage: [0.00040078]\n", 608 | "\n", 609 | "\n", 610 | "Original tweet: The type of football they play today is atrocious\n", 611 | "Predicted probability of outrage: [0.6392028]\n", 612 | "\n", 613 | "\n" 614 | ], 615 | "name": "stdout" 616 | } 617 | ] 618 | } 619 | ] 620 | } -------------------------------------------------------------------------------- /outrageclf/classifier.py: -------------------------------------------------------------------------------- 1 | from .model_architect import threshold_acc 2 | from .preprocessing import WordEmbed, get_lemmatize_hashtag 3 | from keras.models import load_model 4 | from joblib import load 5 | 6 | 7 | ''' 8 | Load pretrained model 9 | 10 | Input: url 11 | - Users responsibility to acquire the h5 model format 12 | - and input correct url link 13 | 14 | Output: 15 | ''' 16 | def _load_crockett_model(url): 17 | return load_model( 18 | url, 19 | custom_objects={'threshold_acc': threshold_acc} 20 | ) 21 | 22 | ''' 23 | Wrapper function for prediction: 24 | In general, if users have to call this function several times 25 | it it more efficient to load the model and use built-in predict method. 26 | 27 | Input: text vector, lemmatize_url, model_url 28 | ''' 29 | def pretrained_model_predict(text_vector, lemmatize_url, model_url): 30 | word_embed = WordEmbed() 31 | word_embed._get_pretrained_tokenizer(lemmatize_url) 32 | model = _load_crockett_model(model_url) 33 | 34 | lemmatized_text = get_lemmatize_hashtag(text_vector) 35 | embedded_vector = word_embed._get_embedded_vector(lemmatized_text) 36 | predict = model.predict(embedded_vector) 37 | 38 | return pretrained_model_predict 39 | -------------------------------------------------------------------------------- /outrageclf/helpers.py: -------------------------------------------------------------------------------- 1 | import emoji, re, collections, string 2 | import nltk 3 | from nltk import pos_tag 4 | from nltk.stem.wordnet import WordNetLemmatizer 5 | from nltk.stem.snowball import SnowballStemmer 6 | from nltk.tokenize import TweetTokenizer 7 | from nltk.tokenize import word_tokenize 8 | from nltk.corpus import wordnet, stopwords 9 | 10 | 11 | # top emojis 12 | # the list is practically derived from our datasets 13 | top_emojis = ['😂','🤣','😡','🖕','😹','🙏','👎','🌊','🙄','🤔'] 14 | lemmatizer = WordNetLemmatizer() 15 | cachedStopWordsPunctuation = set(stopwords.words("english") 16 | + [x for x in list(string.punctuation) if x not in ['!','?']] 17 | + ['',' ',' ']) 18 | 19 | # check if emojis in a string 20 | def char_is_emoji(char): 21 | return char in emoji.UNICODE_EMOJI['en'] 22 | 23 | s = set(emoji.UNICODE_EMOJI['en'].values()) 24 | def string_is_emoji_name(text): 25 | return text in s 26 | 27 | # extract a string of emojis from a string 28 | def extract_emojis(text): 29 | return ' '.join(c for c in text if c in emoji.UNICODE_EMOJI) 30 | 31 | # just get the hashtag 32 | # this function removes the function, even in hashtag 33 | def get_hashtag(text): 34 | text = re.sub(r'[%s]' % re.escape("""!"$%&()*+,-./:;<=>?@[\]^_`{|}~"""), '', text) 35 | return ",".join([i.lower() for i in text.split() if i.startswith("#") ]) 36 | 37 | # word_tokenize as defined in nltk library 38 | def token_postag(text): 39 | tokens = word_tokenize(text) 40 | return pos_tag(tokens) 41 | 42 | # function to simplify POS 43 | # 
/outrageclf/helpers.py: -------------------------------------------------------------------------------- 1 | import emoji, re, collections, string 2 | import nltk 3 | from nltk import pos_tag 4 | from nltk.stem.wordnet import WordNetLemmatizer 5 | from nltk.stem.snowball import SnowballStemmer 6 | from nltk.tokenize import TweetTokenizer 7 | from nltk.tokenize import word_tokenize 8 | from nltk.corpus import wordnet, stopwords 9 | 10 | 11 | # top emojis 12 | # the list is empirically derived from our datasets 13 | top_emojis = ['😂','🤣','😡','🖕','😹','🙏','👎','🌊','🙄','🤔'] 14 | lemmatizer = WordNetLemmatizer() 15 | cachedStopWordsPunctuation = set(stopwords.words("english") 16 |                     + [x for x in list(string.punctuation) if x not in ['!','?']] 17 |                     + ['',' ','  ']) 18 | 19 | # check whether a single character is an emoji 20 | def char_is_emoji(char): 21 |     return char in emoji.UNICODE_EMOJI['en'] 22 | 23 | s = set(emoji.UNICODE_EMOJI['en'].values()) 24 | def string_is_emoji_name(text): 25 |     return text in s 26 | 27 | # extract a string of emojis from a string 28 | def extract_emojis(text): 29 |     return ' '.join(c for c in text if c in emoji.UNICODE_EMOJI) 30 | 31 | # just get the hashtags 32 | # this function removes punctuation, even inside hashtags 33 | def get_hashtag(text): 34 |     text = re.sub(r'[%s]' % re.escape("""!"$%&()*+,-./:;<=>?@[\]^_`{|}~"""), '', text) 35 |     return ",".join([i.lower() for i in text.split() if i.startswith("#") ]) 36 | 37 | # word_tokenize as defined in nltk library 38 | def token_postag(text): 39 |     tokens = word_tokenize(text) 40 |     return pos_tag(tokens) 41 | 42 | # function to simplify POS tags 43 | # exclusively used for lemmatization using WORDNET 44 | def get_wordnet_pos(treebank_tag): 45 |     if treebank_tag.startswith('J'): 46 |         return wordnet.ADJ 47 |     elif treebank_tag.startswith('V'): 48 |         return wordnet.VERB 49 |     elif treebank_tag.startswith('N'): 50 |         return wordnet.NOUN 51 |     elif treebank_tag.startswith('R'): 52 |         return wordnet.ADV 53 |     else: 54 |         return None 55 | 56 | # similar to get_wordnet_pos 57 | # but more granular in order to create more features 58 | def modify_pos(pos_counts): 59 |     result_dic = {} 60 |     for key in pos_counts.keys(): 61 |         if key.startswith('J'): 62 |             if "adj" in result_dic: 63 |                 result_dic["adj"] += pos_counts[key] 64 |             else: 65 |                 result_dic["adj"] = pos_counts[key] 66 |         elif key.startswith('V'): 67 |             if "verb" in result_dic: 68 |                 result_dic["verb"] += pos_counts[key] 69 |             else: 70 |                 result_dic["verb"] = pos_counts[key] 71 |         elif key.startswith('N'): 72 |             if "noun" in result_dic: 73 |                 result_dic["noun"] += pos_counts[key] 74 |             else: 75 |                 result_dic["noun"] = pos_counts[key] 76 |         elif key.startswith('R'): 77 |             if "adv" in result_dic: 78 |                 result_dic["adv"] += pos_counts[key] 79 |             else: 80 |                 result_dic["adv"] = pos_counts[key] 81 |         elif key in ['PRP', 'PRP$']: 82 |             if "pronoun" in result_dic: 83 |                 result_dic["pronoun"] += pos_counts[key] 84 |             else: 85 |                 result_dic["pronoun"] = pos_counts[key] 86 |         elif key.startswith('W'): 87 |             if "wh" in result_dic: 88 |                 result_dic["wh"] += pos_counts[key] 89 |             else: 90 |                 result_dic["wh"] = pos_counts[key] 91 |         else: 92 |             if "other" in result_dic: 93 |                 result_dic["other"] += pos_counts[key] 94 |             else: 95 |                 result_dic["other"] = pos_counts[key] 96 |     return result_dic 97 | 98 | 99 | # tokenize and then lemmatize the string 100 | def token_stem_lemmatize(text): 101 |     tokens_pos = token_postag(text) 102 |     result_string = '' 103 |     for word, tag in tokens_pos: 104 |         wntag = get_wordnet_pos(tag) 105 |         # no POS tag is passed when wntag is None 106 |         if wntag is None: 107 |             result_string += lemmatizer.lemmatize(word.lower()) 108 |         else: 109 |             result_string += lemmatizer.lemmatize(word.lower(), pos=wntag) 110 |         result_string += ' ' 111 |     return result_string 112 | 113 | # remove stop words and short words 114 | def stop_short_process(text): 115 |     text = ' '.join([word for word in text.split() if word not in cachedStopWordsPunctuation]) 116 |     text = re.sub("[^a-zA-Z ]+", '', text) # remove apostrophe now 117 |     text = ' '.join(word for word in text.split() if len(word)>2) 118 |     return text 119 | 120 | # wrapper over the full preprocessing pipeline 121 | def tweet_process(tweet): 122 |     tweet = re.sub('//t.co\S+', ' ', tweet)  # remove link 123 |     tweet = re.sub('http\S+\s*', ' ', tweet)  # remove URLs 124 |     tweet = re.sub('@\S+', ' ', tweet)  # remove mentions 125 |     tweet = re.sub('&', ' ', tweet)  # remove ampersands 126 |     tweet = re.sub('RT@|RT @', ' ', tweet)  # remove RT 127 |     tweet = re.sub('#\S+', ' ', tweet)  # remove hashtags 128 |     tweet = re.sub('[%s]' % re.escape("""!"#$%&()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', tweet)  # remove punctuation, leave behind apostrophe"'" 129 |     tweet = re.sub('\s+', ' ', tweet)  # remove extra whitespace 130 |     tweet = token_stem_lemmatize(tweet) 131 |     tweet = stop_short_process(tweet) 132 |     return tweet 133 | 134 | # check if the tweet has an embedded link 135 | def has_link(tweet): 136 |     short_link = re.findall('//t.co\S+',tweet) 137 |     url_link = re.findall('http\S+\s*',tweet) 138 |     result = 0 if not short_link and not url_link else 1 139 |     return (result) 140 | 141 | 
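# --- Illustrative sketch (editorial addition, not part of the original module) ---
# What the helpers above do to a raw tweet, assuming the required NLTK data
# ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords') is downloaded:
#
#   raw = "RT @user: This infuriates me!! https://t.co/xyz #outrage"
#   tweet_process(raw)   # -> roughly 'infuriate' (links, mentions, hashtags,
#                        #    stopwords, and short words stripped; verbs lemmatized)
#   get_hashtag(raw)     # -> '#outrage' (hashtags only, lowercased, comma-joined)
#   has_link(raw)        # -> 1 (the tweet contains an embedded link)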
142 | def psy_tweet_process(tweet): 143 |     stemmer = SnowballStemmer("english") 144 |     tokenizer = TweetTokenizer() 145 |     tweet_tokenized = tokenizer.tokenize(tweet) 146 |     n = len(tweet_tokenized) 147 |     try: 148 |         tweet_tokenized = [unicode(y.encode("utf-8"), errors='ignore') for y in tweet_tokenized] 149 |         stemmed = [stemmer.stem(y) for y in tweet_tokenized] 150 |     except: 151 |         stemmed = [stemmer.stem(y) for y in tweet_tokenized] 152 |     stemmed = [d for d in stemmed if d not in cachedStopWordsPunctuation] 153 |     return stemmed, n 154 | -------------------------------------------------------------------------------- /outrageclf/model_architect.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Specification of the deep and transfer learning models, as called in training.py. 4 | 5 | Model architectures include: 6 |     - Deep LSTM 7 |     - Deep GRU 8 |     - Bidirectional LSTM 9 |     - Bidirectional with Attention 10 | 11 | We only provide access to our Deep GRU model, as used in the paper. 12 | 13 | """ 14 | 15 | from sklearn import ensemble 16 | import tensorflow as tf 17 | import keras 18 | import keras.layers as layers 19 | import tensorflow.keras.backend as K 20 | from keras import Sequential, optimizers, initializers, regularizers, constraints 21 | from tensorflow.python.keras.layers import Layer 22 | 23 | 24 | embedding_dim = 50 25 | maxlen = 50 26 | 27 | 28 | ''' 29 | Thresholded accuracy metric: a prediction counts as positive when it is >= threshold 30 | ''' 31 | 32 | def threshold_acc(y_true, y_pred, threshold = 0.7): 33 |     if K.backend() == 'tensorflow': 34 |         return K.mean(K.equal(y_true, 35 |                       K.cast(K.greater_equal(y_pred,threshold), y_true.dtype))) 36 |     else: 37 |         return K.mean(K.equal(y_true, 38 |                       K.greater_equal(y_pred,threshold))) 39 | 40 | 41 | 42 | ''' 43 | 3-layer LSTM model: 128, 64, and 1 units respectively, 44 | with 2 dropout layers 45 | ''' 46 | 47 | def lstm_model (embedding_matrix, vocab_size): 48 |     model = Sequential() 49 |     model.add(layers.Embedding(vocab_size, embedding_dim, 50 |                                weights=[embedding_matrix], 51 |                                input_length=maxlen, 52 |                                trainable=True)) 53 |     model.add(layers.LSTM(128)) 54 |     model.add(layers.Dropout(0.5)) 55 |     model.add(layers.Dense(64, activation='relu')) 56 |     model.add(layers.Dropout(0.5)) 57 |     model.add(layers.Dense(1, activation='sigmoid')) 58 |     model.compile(optimizer='adam', 59 |                   loss='binary_crossentropy', 60 |                   metrics=[threshold_acc]) 61 |     return (model) 62 | 63 | 64 | 65 | ''' 66 | Deep GRU model: GRU layers of 256, 128, 64, and 32 units, 67 | with 2 dropout layers 68 | ''' 69 | 70 | def deep_gru_model (embedding_matrix, vocab_size): 71 |     model = Sequential() 72 |     model.add(layers.Embedding(vocab_size, embedding_dim, 73 |                                weights=[embedding_matrix], 74 |                                input_length=maxlen, 75 |                                trainable=True)) 76 |     model.add(layers.GRU(256, return_sequences = True)) 77 |     model.add(layers.GRU(128, return_sequences = True)) 78 |     model.add(layers.GRU(64, return_sequences = True)) 79 |     model.add(layers.GRU(32)) 80 |     model.add(layers.Dropout(0.3)) 81 |     model.add(layers.Dense(64, activation='relu')) 82 |     model.add(layers.Dropout(0.5)) 83 |     model.add(layers.Dense(1, activation='sigmoid')) 84 |     model.compile(optimizer='adam', 85 |                   loss='binary_crossentropy', 86 |                   metrics=[threshold_acc]) 87 |     return (model) 88 | 89 | 90 | 
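# --- Illustrative sketch (editorial addition, not part of the original module) ---
# How these model constructors are meant to be called. The vocabulary size and
# embedding matrix below are made-up stand-ins; in practice they would come from
# the fitted tokenizer and the pretrained 50-d word embeddings:
#
#   import numpy as np
#   vocab_size = 31000                                          # hypothetical
#   embedding_matrix = np.random.rand(vocab_size, embedding_dim)
#   model = deep_gru_model(embedding_matrix, vocab_size)
#   # inputs are integer sequences of shape (n_samples, maxlen), as produced
#   # by the tokenizer in preprocessing.py; outputs are outrage probabilities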
'''
Bi-directional model: 3 stacked bidirectional GRU layers (128, 128, and 64
units), with 2 Dropout layers
'''

def deep_bidirectional_model(embedding_matrix, vocab_size):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim,
                               weights=[embedding_matrix],
                               input_length=maxlen, trainable=True))
    model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
    model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
    model.add(layers.Bidirectional(layers.GRU(64)))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[threshold_acc])
    return model


'''
Architecture for the Attention layer,
including: - a dot_product wrapper
           - a predefined AttentionWithContext layer
'''

def dot_product(x, kernel):
    """
    Wrapper for the dot product operation, compatible with both
    the Theano and TensorFlow backends.
    Args:
        x: input tensor, e.g. of shape (samples, steps, features)
        kernel: weight vector or matrix to contract with the last axis of x
    Returns:
        The dot product of x and kernel, with the trailing singleton axis
        removed on the TensorFlow backend.
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
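The expand_dims/squeeze pair in the TensorFlow branch exists because K.dot wants at least 2-D operands. A quick shape check (illustrative only; the dimensions are arbitrary) makes the contract concrete for both kinds of kernel used below:

import numpy as np
import tensorflow.keras.backend as K

x = K.constant(np.ones((4, 10, 8)))   # (samples, steps, features)
u = K.constant(np.ones(8))            # context vector, shape (features,)
W = K.constant(np.ones((8, 8)))       # weight matrix, shape (features, features)

print(K.int_shape(dot_product(x, u)))  # (4, 10): one score per step
print(K.int_shape(dot_product(x, W)))  # (4, 10, 8): per-step linear map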
154 | """ 155 | 156 | def __init__(self, 157 | W_regularizer=None, u_regularizer=None, b_regularizer=None, 158 | W_constraint=None, u_constraint=None, b_constraint=None, 159 | bias=True, **kwargs): 160 | 161 | self.supports_masking = True 162 | self.init = initializers.get('glorot_uniform') 163 | 164 | self.W_regularizer = regularizers.get(W_regularizer) 165 | self.u_regularizer = regularizers.get(u_regularizer) 166 | self.b_regularizer = regularizers.get(b_regularizer) 167 | 168 | self.W_constraint = constraints.get(W_constraint) 169 | self.u_constraint = constraints.get(u_constraint) 170 | self.b_constraint = constraints.get(b_constraint) 171 | 172 | self.bias = bias 173 | super(AttentionWithContext, self).__init__(**kwargs) 174 | 175 | def build(self, input_shape): 176 | assert len(input_shape) == 3 177 | 178 | self.W = self.add_weight((input_shape[-1], input_shape[-1],), 179 | initializer=self.init, 180 | name='{}_W'.format(self.name), 181 | regularizer=self.W_regularizer, 182 | constraint=self.W_constraint) 183 | if self.bias: 184 | self.b = self.add_weight((input_shape[-1],), 185 | initializer='zero', 186 | name='{}_b'.format(self.name), 187 | regularizer=self.b_regularizer, 188 | constraint=self.b_constraint) 189 | 190 | self.u = self.add_weight((input_shape[-1],), 191 | initializer=self.init, 192 | name='{}_u'.format(self.name), 193 | regularizer=self.u_regularizer, 194 | constraint=self.u_constraint) 195 | 196 | super(AttentionWithContext, self).build(input_shape) 197 | 198 | def compute_mask(self, input, input_mask=None): 199 | # do not pass the mask to the next layers 200 | return None 201 | 202 | def call(self, x, mask=None): 203 | uit = dot_product(x, self.W) 204 | 205 | if self.bias: 206 | uit += self.b 207 | 208 | uit = K.tanh(uit) 209 | ait = dot_product(uit, self.u) 210 | 211 | a = K.exp(ait) 212 | 213 | # apply mask after the exp. will be re-normalized next 214 | if mask is not None: 215 | # Cast the mask to floatX to avoid float64 upcasting in theano 216 | a *= K.cast(mask, K.floatx()) 217 | 218 | # in some cases especially in the early stages of training the sum may be almost zero 219 | # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. 
'''
The Attention model is a bidirectional GRU with an Attention layer on top
'''

def attention_model(embedding_matrix, vocab_size):
    model = Sequential()
    model.add(layers.Embedding(vocab_size, embedding_dim,
                               weights=[embedding_matrix],
                               input_length=maxlen,
                               trainable=True))
    model.add(layers.Bidirectional(layers.GRU(128, return_sequences=True)))
    model.add(layers.Bidirectional(layers.GRU(64, return_sequences=True)))
    model.add(AttentionWithContext())
    model.add(layers.Dense(32, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=[threshold_acc])
    return model
--------------------------------------------------------------------------------
/outrageclf/preprocessing.py:
--------------------------------------------------------------------------------
import urllib.request

import numpy as np
from joblib import dump, load
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from . import helpers

glove_default_url = 'https://worksheets.codalab.org/rest/bundles/'\
                    '0x4090ba96b8a444c2a44b2c47884c25f2/'\
                    'contents/blob/glove.twitter.27B.50d.txt'

'''
Get lemmatized hashtags:
- Create lemmatized + hashtag features for a text vector
- Input: a text vector
- Output: a vector of lemmatized keywords + hashtags (if any exist)
'''

def get_lemmatize_hashtag(text_vector):
    hashtag_ls = [helpers.get_hashtag(text) for text in text_vector]
    wn_lemmatize_ls = [helpers.tweet_process(text) for text in text_vector]
    hashtag_lemmatize = [' '.join(lemma.split(" ") + hashtag.split(" "))
                         for lemma, hashtag in zip(wn_lemmatize_ls, hashtag_ls)]
    return hashtag_lemmatize


'''
Word embedding object:
* MAXLEN defaults to 50; we currently do not support customization.

* TRAINING:
  - It is the user's responsibility to submit a valid tokenizer path in .joblib format.

* USING A PRETRAINED WORD EMBEDDING:
  - Please contact the Crockett Lab for access to the tokenizer.
  - It is the user's responsibility to submit a valid tokenizer path in .joblib format.
'''

class WordEmbed:
    def __init__(self):
        self.tokenizer_path = None
        self.tokenizer = None

    def _get_pretrained_tokenizer(self, path):
        self.tokenizer_path = path
        self.tokenizer = load(self.tokenizer_path)
        print("Loaded pre-trained tokenizer at:", path)

    def _train_new_tokenizer(self, text_vector, saving_path):
        self.tokenizer_path = saving_path
        embedding_tokenizer = Tokenizer()
        embedding_tokenizer.fit_on_texts(text_vector)

        self.tokenizer = embedding_tokenizer
        dump(embedding_tokenizer, self.tokenizer_path)
        print("Trained and saved new tokenizer at:",
              self.tokenizer_path)

    def _get_embedded_vector(self, text_vector):
        embedded = pad_sequences(self.tokenizer.texts_to_sequences(text_vector),
                                 padding='post',
                                 maxlen=50)
        return embedded


'''
Create an embedding matrix
- based on a pre-defined tokenizer
- currently only GloVe Twitter 50d is supported
- will be updated to support different embeddings in the future

Input:  - word_index: from an associated Tokenizer, called from preprocessing.py
        - filepath: path to a pretrained embedding, e.g. GloVe Twitter 50d

Result: - An embedding matrix for embedding-based models such as LSTM and GRU
        * It is strictly tied to the Tokenizer that produced the word_index argument
        * It is the user's responsibility to make sure they match
'''

def create_embedding_matrix(word_index, filepath):
    embedding_dim = 50
    # add 1 because index 0 is reserved for padding
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix


'''
Create an embedding matrix from the default URL

Similar to create_embedding_matrix,
but streams an online copy of the GloVe Twitter 27B 50d embedding.
'''

def create_embedding_matrix_default(word_index):
    embedding_dim = 50
    # add 1 because index 0 is reserved for padding
    vocab_size = len(word_index) + 1
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    file = urllib.request.urlopen(glove_default_url)

    for line in file:
        word, *vector = line.split()
        if word.decode() in word_index:
            idx = word_index[word.decode()]
            embedding_matrix[idx] = np.array(
                vector, dtype=np.float32)[:embedding_dim]
    return embedding_matrix
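To see how these pieces fit together outside of training.py, here is a minimal hypothetical sketch: the texts, the tokenizer save path, and the local GloVe file path are all placeholders, and the local-file variant create_embedding_matrix is used instead of the URL default.

from outrageclf.preprocessing import (WordEmbed, get_lemmatize_hashtag,
                                      create_embedding_matrix)

texts = ["This is absolutely outrageous #shame", "What a lovely morning"]

# lemmatize + extract hashtags, then fit a tokenizer on the processed text
lemmatized = get_lemmatize_hashtag(texts)
word_embed = WordEmbed()
word_embed._train_new_tokenizer(lemmatized, "my_tokenizer.joblib")  # hypothetical path

# integer-encode and pad to maxlen=50, ready for the models above
X = word_embed._get_embedded_vector(lemmatized)
print(X.shape)  # (2, 50)

# build the GloVe-initialized matrix from a local file (hypothetical path)
embedding_matrix = create_embedding_matrix(word_embed.tokenizer.word_index,
                                           "glove.twitter.27B.50d.txt")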
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import pathlib
from setuptools import setup

# The directory containing this file
HERE = pathlib.Path(__file__).parent

# The text of the README file
README = (HERE / "README.md").read_text()

setup(name='outrageclf',
      version='0.1.6',
      description='Outrage Classifier - developed by the Crockett Lab',
      long_description=README,
      url='https://github.com/CrockettLab/outrage_classifier',
      install_requires=[
          'emoji',
          'joblib',
          'keras',
          'nltk',
          'numpy',
          'scikit-learn',  # the PyPI package is named scikit-learn, not sklearn
          'tensorflow'
      ],
      author='Tuan Nguyen Doan',
      author_email='tuan.nguyen.doan@aya.yale.edu',
      license='Creative Commons Attribution-NonCommercial-ShareAlike 2.0',
      packages=['outrageclf'],
      include_package_data=True,
      zip_safe=False)
"./*.csv" 11 | - savepath: path to where models are saved 12 | - filename: name of saved models 13 | - model: specify one of the model architect to train e.g: "LSTM", "GRU" 14 | - text_column: name of training text column e.g. "text" 15 | - class_column: name of class column e.g "outrage" 16 | 17 | Available model: 18 | - LSTM with Glove Twitter 19 | - GRU with Glove Twitter 20 | 21 | """ 22 | 23 | 24 | import pandas as pd 25 | import numpy as np 26 | import tensorflow as tf 27 | import argparse 28 | import keras 29 | import outrageclf 30 | from outrageclf.preprocessing import WordEmbed, get_lemmatize_hashtag, create_embedding_matrix_default 31 | from outrageclf.model_architect import lstm_model, deep_gru_model 32 | 33 | 34 | model = ["LSTM", "GRU"] 35 | 36 | if __name__ == '__main__': 37 | #Initialize the parser 38 | parser = argparse.ArgumentParser(description="Outrage Classifier Training. Developed by The Crockett Lab") 39 | parser.add_argument( 40 | "filepath", 41 | help='specifying the path to the training dataset. This should be in the form of .../*.csv' 42 | ) 43 | 44 | parser.add_argument( 45 | "savepath", 46 | help= ('specifying the path to save the model.', 47 | 'There will be two files being saved to this path: a tokenizer and a trained model' 48 | ) 49 | ) 50 | 51 | parser.add_argument( 52 | "filename", 53 | help= ('name of the training file.', 54 | 'This is used to attached to the name of tokenizer and the trained model.') 55 | ) 56 | 57 | parser.add_argument( 58 | "model", 59 | help= 'specifying the model for the training. Default value is "LSTM". Allowed values are '+', '.join(model), 60 | choices=model, 61 | nargs='?', 62 | default="LSTM", 63 | metavar = "MODEL" 64 | ) 65 | 66 | parser.add_argument( 67 | "text_column", 68 | help= 'name of text column in csv file' 69 | ) 70 | 71 | parser.add_argument( 72 | "class_column", 73 | help= 'name of class column in csv file. This must be in the form of binary 0, 1 data type' 74 | ) 75 | 76 | 77 | args = parser.parse_args() 78 | df = pd.read_csv(args.filepath) 79 | print ("File loaded") 80 | 81 | word_embed = WordEmbed() 82 | tokenizer_path = args.savepath + args.filename + '_tokenizer' + '.joblib' 83 | lemmatize_hashtag = get_lemmatize_hashtag(df[args.text_column]) 84 | # train the new tokenizer and the embedding matrix for the model 85 | word_embed._train_new_tokenizer(lemmatize_hashtag, tokenizer_path) 86 | word_index = word_embed.tokenizer.word_index 87 | embedding_matrix = create_embedding_matrix_default(word_index) 88 | print ("Embedding matrix created") 89 | 90 | # get X and y train 91 | X_train = word_embed._get_embedded_vector(lemmatize_hashtag) 92 | y_train = np.array(df[args.class_column]) 93 | print ("Training data prepared") 94 | 95 | if args.model == 'LSTM': 96 | model = lstm_model( 97 | embedding_matrix, 98 | vocab_size = len(word_index) + 1 99 | ) 100 | elif args.model == 'GRU': 101 | model = deep_gru_model( 102 | embedding_matrix, 103 | vocab_size = len(word_index) + 1 104 | ) 105 | 106 | # train model 107 | history = model.fit( 108 | X_train, 109 | y_train, 110 | epochs = 20, 111 | batch_size = 300, 112 | verbose = 1 113 | ) 114 | 115 | # save model 116 | model_path = args.savepath + args.filename + '.h5' 117 | model.save(model_path) 118 | 119 | print("Finish training and write " + args.model + " model to:" + model_path) 120 | --------------------------------------------------------------------------------