├── .gitignore
├── LICENSE.txt
├── README.md
├── __init__.py
├── resource
│   ├── .directory
│   ├── abbreviations.txt
│   ├── dev
│   │   ├── Dev_context_moods.txt
│   │   ├── Dev_v1.txt
│   │   └── __init__.py
│   ├── emoji_unicode_names_final.txt
│   ├── offensive_words.txt
│   ├── test
│   │   └── Test_v1.txt
│   ├── text_context_awc_model
│   │   └── weights
│   │       └── model.json
│   ├── text_model
│   │   ├── vocab_list.txt
│   │   └── weights
│   │       └── model.json
│   ├── text_model_2D
│   │   ├── vocab_list.txt
│   │   └── weights
│   │       └── model.json
│   ├── train
│   │   ├── .directory
│   │   └── Train_v1.txt
│   ├── word_list.txt
│   ├── word_list_freq.txt
│   └── word_split.txt
└── src
    ├── __init__.py
    ├── data_processing
    │   ├── __init__.py
    │   ├── data_handler.py
    │   └── glove2Word2vecLoader.py
    ├── sarcasm_context_moods.py
    ├── sarcasm_detection_model_CNN_DNN_2D.py
    ├── sarcasm_detection_model_CNN_LSTM_ATTN.py
    ├── sarcasm_detection_model_CNN_LSTM_DNN.py
    ├── sarcasm_detection_model_CNN_LSTM_DNN_fasttext.py
    ├── sarcasm_detection_model_CNN_LSTM_DNN_simpler.py
    ├── sarcasm_detection_model_CNN_LSTM_DNN_word2vec.py
    ├── sarcasm_detection_model_attention.py
    ├── sarcasm_detection_moods_siamese.py
    └── sarcasm_detection_siamese.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SarcasmDetection
2 | Sarcasm detection on tweets using a neural network.
3 | This repository performs semantic modelling of sentences with neural networks for the task of sarcasm detection ([Ghosh & Veale, 2016](http://www.aclweb.org/anthology/W16-0425)).
4 | ## Prerequisites
5 | - nltk (TweetTokenizer)
6 | - Keras
7 | - Tensorflow
8 | - numpy
9 | - scipy
10 | - gensim (if you are using word2vec)
11 | - itertools
12 |
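All of the above except `itertools` (which ships with the Python standard library) are available from PyPI; a minimal install sketch, with no version pins (note that the bundled `model.json` files were exported with Keras 2.1.x, so an older Keras/TensorFlow pairing may be needed to reload them unchanged):
```
pip install nltk keras tensorflow numpy scipy gensim
```
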
13 | ## Cloning the repository
14 | ```
15 | git clone git@github.com:AniSkywalker/SarcasmDetection.git
16 | cd SarcasmDetection/src/
17 | ```
18 | If you want to use the pre-trained model, you'll have to [download it](https://drive.google.com/drive/folders/0B7C_0ZfEBcpRbDZKelBZTFFsV0E?usp=sharing) from Google Drive and save it into `/resource/text_model/weights/`.
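For reference, a minimal sketch of restoring the pre-trained model in Keras, assuming it is run from `src/` (as in the clone step above) and that the downloaded weight file is named `model.json.hdf5`; adjust the name to whatever the Drive folder actually contains, since only `model.json` (the architecture) ships with this repository:
```
from keras.models import model_from_json

# Rebuild the network from the saved architecture.
with open('../resource/text_model/weights/model.json') as f:
    model = model_from_json(f.read())

# Attach the downloaded weights (the file name here is an assumption, see note above).
model.load_weights('../resource/text_model/weights/model.json.hdf5')

# The network expects padded sequences of 30 word indices
# (see "batch_input_shape": [null, 30] in model.json).
```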
19 |
20 | ## Using this package
21 | Run the sarcasm detection model with the following command:
22 | ```
23 | python sarcasm_detection_model_CNN_LSTM_DNN.py
24 | ```
25 |
26 | ### Citation
27 | Please cite the following paper when using this code:
28 |
29 | > **Fracking Sarcasm using Neural Network.**
30 | > Aniruddha Ghosh and Tony Veale. 7th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis (WASSA 2016). NAACL-HLT. 16th June 2016, San Diego, California, U.S.A.
31 |
32 | ## Output
33 | The supplied input is rated as either **0** meaning _non-sarcastic_ or **1** meaning _sarcastic_.
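Internally, the saved architectures end in a two-unit softmax layer (see `/resource/text_model/weights/model.json`), so the class probabilities returned by `model.predict` can be collapsed to this rating with an argmax; a sketch, where `x_test` is a placeholder for your padded index sequences and class index 0 is assumed to correspond to the _non-sarcastic_ label:
```
import numpy as np

probs = np.asarray(model.predict(x_test))   # shape: (n_samples, 2)
labels = np.argmax(probs, axis=1)           # 0 = non-sarcastic, 1 = sarcastic
```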
34 |
35 | ## Training
36 | If you want to train the model with your own data, save your _train_, _development_ and _test_ data into the `/resource/train`, `/resource/dev` and `/resource/test` folders respectively.
37 |
38 | The system accepts datasets in tab-separated format, as shown below. An example can be found in [`/resource/train/Train_v1.txt`](https://github.com/AniSkywalker/SarcasmDetection/tree/master/resource/train).
39 | ```
40 | id<tab>label<tab>tweet
41 | ```
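For illustration only, two hypothetical rows in this layout (columns separated by a single tab character; the tweets are invented, not taken from the released data):
```
1	0	just finished a long run and the weather was perfect
2	1	oh great , another monday morning #sarcasm
```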
42 |
43 | ## Context information
44 | To run the model with context information and psychological dimensions (using TensorFlow), run:
45 | ```
46 | python sarcasm_context_moods.py
47 | ```
48 |
49 | ### Citation
50 | Please cite the following paper when using context information and psychological dimensions:
51 | > **Magnets for Sarcasm: Making Sarcasm Detection Timely, Contextual and Very Personal**
52 | > Aniruddha Ghosh and Tony Veale. Conference on Empirical Methods in Natural Language Processing (EMNLP). 7th-11th September, 2017, Copenhagen, Denmark.
53 |
54 | ## Notes
55 | - Samples of _train_, _dev_, and _test_ files are included for both versions.
56 | - For a test data set, please contact aniruddha.ghosh@ucdconnect.ie
57 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'root'
2 |
--------------------------------------------------------------------------------
/resource/.directory:
--------------------------------------------------------------------------------
1 | [Dolphin]
2 | Timestamp=2017,3,10,1,14,25
3 | Version=3
4 |
--------------------------------------------------------------------------------
/resource/abbreviations.txt:
--------------------------------------------------------------------------------
1 | i've i have
2 | we've we have
3 | can't can not
4 | i'm i am
5 | we're we are
6 | don't do not
7 | didn't did not
8 | tt's it is
9 | that's that is
10 | he's he is
11 | she's she is
12 | let's let us
13 | there's there is
14 | how's how is
15 | i'd i would
16 | 2F4U Too Fast For You
17 | 4YEO FYEO For Your Eyes Only
18 | AAMOF As a Matter of Fact
19 | ACK Acknowledgment
20 | AFAIK As far as I know
21 | AFAIR As far as I remember
22 | AFK Away from Keyboard
23 | AKA Also known as
24 | B2K BTK Back to Keyboard
25 | BTT Back to Topic
26 | BTW By the Way
27 | B/C Because
28 | C&P Copy and Paste
29 | CU See you
30 | CYS Check your Settings
31 | DIY Do it Yourself
32 | EOBD End of Business Day
33 | EOD End of Discussion
34 | EOM End of Message
35 | EOT End of Text
36 | FAQ Frequently asked Questions
37 | FACK Full Acknowledge
38 | FKA Formerly known as
39 | FWIW For what it is Worth
40 | FYI For your Information
41 | JFYI Just For your Information
42 | FTW Fuck the World
43 | HF Have fun
44 | HTH Hope this Helps
45 | IDK I do not know
46 | IIRC If I Recall Correctly
47 | IMHO In my Humble Opinion
48 | IMO In my Opinion
49 | IMNSHO In my not so Humble Opinion
50 | IOW In other Words
51 | ITT In this Thread
52 | LOL Laughing out loud
53 | DGMW Do not get me wrong
54 | MMW Mark my Words
55 | N/A Not Available
56 | NaN Not a Number
57 | NNTR No need to Reply
58 | noob Newbie
59 | n00b Newbie
60 | NOYB None of your Business
61 | NRN No Reply Necessary
62 | OMG Oh my God
63 | OP Original Poster
64 | OT Off Topic
65 | OTOH On the other Hand
66 | PEBKAC Problem exists between Keyboard and Chair
67 | POV Point of View
68 | ROTFL Rolling on the Floor Laughing
69 | RSVP Repondez s'il vous plait
70 | RTFM Read the fine Manual
71 | SCNR Sorry could not Resist
72 | SFLR Sorry for late Reply
73 | SPOC Single Point of Contact
74 | TBA To be Announced
75 | TBC To be Continued
76 | TIA Thanks in Advance
77 | TGIF Thanks God, its Friday
78 | THX TNX Thanks
79 | TQ Thank You
80 | TYVM Thank You Very Much
81 | TYT Take your Time
82 | TTYL Talk to you Later
83 | w00t Hooray
84 | WFM Works for Me
85 | WRT With Regard to
86 | WTH What the Hell
87 | WTF What the Fuck
88 | YMMD You made my Day
89 | YMMV Your Mileage may vary
90 | YAM Yet Another Meeting
91 | ICYMI In Case you missed it
--------------------------------------------------------------------------------
/resource/dev/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/resource/dev/__init__.py
--------------------------------------------------------------------------------
/resource/offensive_words.txt:
--------------------------------------------------------------------------------
1 | abbo
2 | abo
3 | abortion
4 | abuse
5 | addict
6 | addicts
7 | adult
8 | africa
9 | african
10 | alla
11 | allah
12 | alligatorbait
13 | amateur
14 | american
15 | anal
16 | analannie
17 | analsex
18 | angie
19 | angry
20 | anus
21 | arab
22 | arabs
23 | areola
24 | argie
25 | aroused
26 | arse
27 | arsehole
28 | asian
29 | ass
30 | assassin
31 | assassinate
32 | assassination
33 | assault
34 | assbagger
35 | assblaster
36 | assclown
37 | asscowboy
38 | asses
39 | assfuck
40 | assfucker
41 | asshat
42 | asshole
43 | assholes
44 | asshore
45 | assjockey
46 | asskiss
47 | asskisser
48 | assklown
49 | asslick
50 | asslicker
51 | asslover
52 | assman
53 | assmonkey
54 | assmunch
55 | assmuncher
56 | asspacker
57 | asspirate
58 | asspuppies
59 | assranger
60 | asswhore
61 | asswipe
62 | athletesfoot
63 | attack
64 | australian
65 | babe
66 | babies
67 | backdoor
68 | backdoorman
69 | backseat
70 | badfuck
71 | balllicker
72 | balls
73 | ballsack
74 | banging
75 | baptist
76 | barelylegal
77 | barf
78 | barface
79 | barfface
80 | bast
81 | bastard
82 | bazongas
83 | bazooms
84 | beaner
85 | beast
86 | beastality
87 | beastial
88 | beastiality
89 | beatoff
90 | beat-off
91 | beatyourmeat
92 | beaver
93 | bestial
94 | bestiality
95 | bi
96 | biatch
97 | bible
98 | bicurious
99 | bigass
100 | bigbastard
101 | bigbutt
102 | bigger
103 | bisexual
104 | bi-sexual
105 | bitch
106 | bitcher
107 | bitches
108 | bitchez
109 | bitchin
110 | bitching
111 | bitchslap
112 | bitchy
113 | biteme
114 | black
115 | blackman
116 | blackout
117 | blacks
118 | blind
119 | blow
120 | blowjob
121 | boang
122 | bogan
123 | bohunk
124 | bollick
125 | bollock
126 | bomb
127 | bombers
128 | bombing
129 | bombs
130 | bomd
131 | bondage
132 | boner
133 | bong
134 | boob
135 | boobies
136 | boobs
137 | booby
138 | boody
139 | boom
140 | boong
141 | boonga
142 | boonie
143 | booty
144 | bootycall
145 | bountybar
146 | bra
147 | brea5t
148 | breast
149 | breastjob
150 | breastlover
151 | breastman
152 | brothel
153 | bugger
154 | buggered
155 | buggery
156 | bullcrap
157 | bulldike
158 | bulldyke
159 | bullshit
160 | bumblefuck
161 | bumfuck
162 | bunga
163 | bunghole
164 | buried
165 | burn
166 | butchbabes
167 | butchdike
168 | butchdyke
169 | butt
170 | buttbang
171 | butt-bang
172 | buttface
173 | buttfuck
174 | butt-fuck
175 | buttfucker
176 | butt-fucker
177 | buttfuckers
178 | butt-fuckers
179 | butthead
180 | buttman
181 | buttmunch
182 | buttmuncher
183 | buttpirate
184 | buttplug
185 | buttstain
186 | byatch
187 | cacker
188 | cameljockey
189 | cameltoe
190 | canadian
191 | cancer
192 | carpetmuncher
193 | carruth
194 | catholic
195 | catholics
196 | cemetery
197 | chav
198 | cherrypopper
199 | chickslick
200 | children's
201 | chin
202 | chinaman
203 | chinamen
204 | chinese
205 | chink
206 | chinky
207 | choad
208 | chode
209 | christ
210 | christian
211 | church
212 | cigarette
213 | cigs
214 | clamdigger
215 | clamdiver
216 | clit
217 | clitoris
218 | clogwog
219 | cocaine
220 | cock
221 | cockblock
222 | cockblocker
223 | cockcowboy
224 | cockfight
225 | cockhead
226 | cockknob
227 | cocklicker
228 | cocklover
229 | cocknob
230 | cockqueen
231 | cockrider
232 | cocksman
233 | cocksmith
234 | cocksmoker
235 | cocksucer
236 | cocksuck
237 | cocksucked
238 | cocksucker
239 | cocksucking
240 | cocktail
241 | cocktease
242 | cocky
243 | cohee
244 | coitus
245 | color
246 | colored
247 | coloured
248 | commie
249 | communist
250 | condom
251 | conservative
252 | conspiracy
253 | coolie
254 | cooly
255 | coon
256 | coondog
257 | copulate
258 | cornhole
259 | corruption
260 | cra5h
261 | crabs
262 | crack
263 | crackpipe
264 | crackwhore
265 | crack-whore
266 | crap
267 | crapola
268 | crapper
269 | crappy
270 | crash
271 | creamy
272 | crime
273 | crimes
274 | criminal
275 | criminals
276 | crotch
277 | crotchjockey
278 | crotchmonkey
279 | crotchrot
280 | cum
281 | cumbubble
282 | cumfest
283 | cumjockey
284 | cumm
285 | cummer
286 | cumming
287 | cumquat
288 | cumqueen
289 | cumshot
290 | cunilingus
291 | cunillingus
292 | cunn
293 | cunnilingus
294 | cunntt
295 | cunt
296 | cunteyed
297 | cuntfuck
298 | cuntfucker
299 | cuntlick
300 | cuntlicker
301 | cuntlicking
302 | cuntsucker
303 | cybersex
304 | cyberslimer
305 | dago
306 | dahmer
307 | dammit
308 | damn
309 | damnation
310 | damnit
311 | darkie
312 | darky
313 | datnigga
314 | dead
315 | deapthroat
316 | death
317 | deepthroat
318 | defecate
319 | dego
320 | demon
321 | deposit
322 | desire
323 | destroy
324 | deth
325 | devil
326 | devilworshipper
327 | dick
328 | dickbrain
329 | dickforbrains
330 | dickhead
331 | dickless
332 | dicklick
333 | dicklicker
334 | dickman
335 | dickwad
336 | dickweed
337 | diddle
338 | die
339 | died
340 | dies
341 | dike
342 | dildo
343 | dingleberry
344 | dink
345 | dipshit
346 | dipstick
347 | dirty
348 | disease
349 | diseases
350 | disturbed
351 | dive
352 | dix
353 | dixiedike
354 | dixiedyke
355 | doggiestyle
356 | doggystyle
357 | dong
358 | doodoo
359 | doo-doo
360 | doom
361 | dope
362 | dragqueen
363 | dragqween
364 | dripdick
365 | drug
366 | drunk
367 | drunken
368 | dumb
369 | dumbass
370 | dumbbitch
371 | dumbfuck
372 | dyefly
373 | dyke
374 | easyslut
375 | eatballs
376 | eatme
377 | eatpussy
378 | ecstacy
379 | ejaculate
380 | ejaculated
381 | ejaculating
382 | ejaculation
383 | enema
384 | enemy
385 | erect
386 | erection
387 | ero
388 | escort
389 | ethiopian
390 | ethnic
391 | european
392 | evl
393 | excrement
394 | execute
395 | executed
396 | execution
397 | executioner
398 | explosion
399 | facefucker
400 | faeces
401 | fag
402 | fagging
403 | faggot
404 | fagot
405 | failed
406 | failure
407 | fairies
408 | fairy
409 | faith
410 | fannyfucker
411 | fart
412 | farted
413 | farting
414 | farty
415 | fastfuck
416 | fat
417 | fatah
418 | fatass
419 | fatfuck
420 | fatfucker
421 | fatso
422 | fckcum
423 | fear
424 | feces
425 | felatio
426 | felch
427 | felcher
428 | felching
429 | fellatio
430 | feltch
431 | feltcher
432 | feltching
433 | fetish
434 | fight
435 | filipina
436 | filipino
437 | fingerfood
438 | fingerfuck
439 | fingerfucked
440 | fingerfucker
441 | fingerfuckers
442 | fingerfucking
443 | fire
444 | firing
445 | fister
446 | fistfuck
447 | fistfucked
448 | fistfucker
449 | fistfucking
450 | fisting
451 | flange
452 | flasher
453 | flatulence
454 | floo
455 | flydie
456 | flydye
457 | fok
458 | fondle
459 | footaction
460 | footfuck
461 | footfucker
462 | footlicker
463 | footstar
464 | fore
465 | foreskin
466 | forni
467 | fornicate
468 | foursome
469 | fourtwenty
470 | fraud
471 | freakfuck
472 | freakyfucker
473 | freefuck
474 | fu
475 | fubar
476 | fuc
477 | fucck
478 | fuck
479 | fucka
480 | fuckable
481 | fuckbag
482 | fuckbuddy
483 | fucked
484 | fuckedup
485 | fucker
486 | fuckers
487 | fuckface
488 | fuckfest
489 | fuckfreak
490 | fuckfriend
491 | fuckhead
492 | fuckher
493 | fuckin
494 | fuckina
495 | fucking
496 | fuckingbitch
497 | fuckinnuts
498 | fuckinright
499 | fuckit
500 | fuckknob
501 | fuckme
502 | fuckmehard
503 | fuckmonkey
504 | fuckoff
505 | fuckpig
506 | fucks
507 | fucktard
508 | fuckwhore
509 | fuckyou
510 | fudgepacker
511 | fugly
512 | fuk
513 | fuks
514 | funeral
515 | funfuck
516 | fungus
517 | fuuck
518 | gangbang
519 | gangbanged
520 | gangbanger
521 | gangsta
522 | gatorbait
523 | gay
524 | gaymuthafuckinwhore
525 | gaysex
526 | geez
527 | geezer
528 | geni
529 | genital
530 | german
531 | getiton
532 | gin
533 | ginzo
534 | gipp
535 | girls
536 | givehead
537 | glazeddonut
538 | gob
539 | god
540 | godammit
541 | goddamit
542 | goddammit
543 | goddamn
544 | goddamned
545 | goddamnes
546 | goddamnit
547 | goddamnmuthafucker
548 | goldenshower
549 | gonorrehea
550 | gonzagas
551 | gook
552 | gotohell
553 | goy
554 | goyim
555 | greaseball
556 | gringo
557 | groe
558 | gross
559 | grostulation
560 | gubba
561 | gummer
562 | gun
563 | gyp
564 | gypo
565 | gypp
566 | gyppie
567 | gyppo
568 | gyppy
569 | hamas
570 | handjob
571 | hapa
572 | harder
573 | hardon
574 | harem
575 | headfuck
576 | headlights
577 | hebe
578 | heeb
579 | hell
580 | henhouse
581 | heroin
582 | herpes
583 | heterosexual
584 | hijack
585 | hijacker
586 | hijacking
587 | hillbillies
588 | hindoo
589 | hiscock
590 | hitler
591 | hitlerism
592 | hitlerist
593 | hiv
594 | ho
595 | hobo
596 | hodgie
597 | hoes
598 | hole
599 | holestuffer
600 | homicide
601 | homo
602 | homobangers
603 | homosexual
604 | honger
605 | honk
606 | honkers
607 | honkey
608 | honky
609 | hook
610 | hooker
611 | hookers
612 | hooters
613 | hore
614 | hork
615 | horn
616 | horney
617 | horniest
618 | horny
619 | horseshit
620 | hosejob
621 | hoser
622 | hostage
623 | hotdamn
624 | hotpussy
625 | hottotrot
626 | hummer
627 | husky
628 | hussy
629 | hustler
630 | hymen
631 | hymie
632 | iblowu
633 | idiot
634 | ikey
635 | illegal
636 | incest
637 | insest
638 | intercourse
639 | interracial
640 | intheass
641 | inthebuff
642 | israel
643 | israeli
644 | israel's
645 | italiano
646 | itch
647 | jackass
648 | jackoff
649 | jackshit
650 | jacktheripper
651 | jade
652 | jap
653 | japanese
654 | japcrap
655 | jebus
656 | jeez
657 | jerkoff
658 | jesus
659 | jesuschrist
660 | jew
661 | jewish
662 | jiga
663 | jigaboo
664 | jigg
665 | jigga
666 | jiggabo
667 | jigger
668 | jiggy
669 | jihad
670 | jijjiboo
671 | jimfish
672 | jism
673 | jiz
674 | jizim
675 | jizjuice
676 | jizm
677 | jizz
678 | jizzim
679 | jizzum
680 | joint
681 | juggalo
682 | jugs
683 | junglebunny
684 | kaffer
685 | kaffir
686 | kaffre
687 | kafir
688 | kanake
689 | kid
690 | kigger
691 | kike
692 | kill
693 | killed
694 | killer
695 | killing
696 | kills
697 | kink
698 | kinky
699 | kissass
700 | kkk
701 | knife
702 | knockers
703 | kock
704 | kondum
705 | koon
706 | kotex
707 | krap
708 | krappy
709 | kraut
710 | kum
711 | kumbubble
712 | kumbullbe
713 | kummer
714 | kumming
715 | kumquat
716 | kums
717 | kunilingus
718 | kunnilingus
719 | kunt
720 | ky
721 | kyke
722 | lactate
723 | laid
724 | lapdance
725 | latin
726 | lesbain
727 | lesbayn
728 | lesbian
729 | lesbin
730 | lesbo
731 | lez
732 | lezbe
733 | lezbefriends
734 | lezbo
735 | lezz
736 | lezzo
737 | liberal
738 | libido
739 | licker
740 | lickme
741 | lies
742 | limey
743 | limpdick
744 | limy
745 | lingerie
746 | liquor
747 | livesex
748 | loadedgun
749 | lolita
750 | looser
751 | loser
752 | lotion
753 | lovebone
754 | lovegoo
755 | lovegun
756 | lovejuice
757 | lovemuscle
758 | lovepistol
759 | loverocket
760 | lowlife
761 | lsd
762 | lubejob
763 | lucifer
764 | luckycammeltoe
765 | lugan
766 | lynch
767 | macaca
768 | mad
769 | mafia
770 | magicwand
771 | mams
772 | manhater
773 | manpaste
774 | marijuana
775 | mastabate
776 | mastabater
777 | masterbate
778 | masterblaster
779 | mastrabator
780 | masturbate
781 | masturbating
782 | mattressprincess
783 | meatbeatter
784 | meatrack
785 | meth
786 | mexican
787 | mgger
788 | mggor
789 | mickeyfinn
790 | mideast
791 | milf
792 | minority
793 | mockey
794 | mockie
795 | mocky
796 | mofo
797 | moky
798 | moles
799 | molest
800 | molestation
801 | molester
802 | molestor
803 | moneyshot
804 | mooncricket
805 | mormon
806 | moron
807 | moslem
808 | mosshead
809 | mothafuck
810 | mothafucka
811 | mothafuckaz
812 | mothafucked
813 | mothafucker
814 | mothafuckin
815 | mothafucking
816 | mothafuckings
817 | motherfuck
818 | motherfucked
819 | motherfucker
820 | motherfuckin
821 | motherfucking
822 | motherfuckings
823 | motherlovebone
824 | muff
825 | muffdive
826 | muffdiver
827 | muffindiver
828 | mufflikcer
829 | mulatto
830 | muncher
831 | munt
832 | murder
833 | murderer
834 | muslim
835 | naked
836 | narcotic
837 | nasty
838 | nastybitch
839 | nastyho
840 | nastyslut
841 | nastywhore
842 | nazi
843 | necro
844 | negro
845 | negroes
846 | negroid
847 | negro's
848 | nig
849 | niger
850 | nigerian
851 | nigerians
852 | nigg
853 | nigga
854 | niggah
855 | niggaracci
856 | niggard
857 | niggarded
858 | niggarding
859 | niggardliness
860 | niggardliness's
861 | niggardly
862 | niggards
863 | niggard's
864 | niggaz
865 | nigger
866 | niggerhead
867 | niggerhole
868 | niggers
869 | nigger's
870 | niggle
871 | niggled
872 | niggles
873 | niggling
874 | nigglings
875 | niggor
876 | niggur
877 | niglet
878 | nignog
879 | nigr
880 | nigra
881 | nigre
882 | nip
883 | nipple
884 | nipplering
885 | nittit
886 | nlgger
887 | nlggor
888 | nofuckingway
889 | nook
890 | nookey
891 | nookie
892 | noonan
893 | nooner
894 | nude
895 | nudger
896 | nuke
897 | nutfucker
898 | nymph
899 | ontherag
900 | oral
901 | orga
902 | orgasim
903 | orgasm
904 | orgies
905 | orgy
906 | osama
907 | paki
908 | palesimian
909 | palestinian
910 | pansies
911 | pansy
912 | panti
913 | panties
914 | payo
915 | pearlnecklace
916 | peck
917 | pecker
918 | peckerwood
919 | pee
920 | peehole
921 | pee-pee
922 | peepshow
923 | peepshpw
924 | pendy
925 | penetration
926 | peni5
927 | penile
928 | penis
929 | penises
930 | penthouse
931 | period
932 | perv
933 | phonesex
934 | phuk
935 | phuked
936 | phuking
937 | phukked
938 | phukking
939 | phungky
940 | phuq
941 | pi55
942 | picaninny
943 | piccaninny
944 | pickaninny
945 | piker
946 | pikey
947 | piky
948 | pimp
949 | pimped
950 | pimper
951 | pimpjuic
952 | pimpjuice
953 | pimpsimp
954 | pindick
955 | piss
956 | pissed
957 | pisser
958 | pisses
959 | pisshead
960 | pissin
961 | pissing
962 | pissoff
963 | pistol
964 | pixie
965 | pixy
966 | playboy
967 | playgirl
968 | pocha
969 | pocho
970 | pocketpool
971 | pohm
972 | polack
973 | pom
974 | pommie
975 | pommy
976 | poo
977 | poon
978 | poontang
979 | poop
980 | pooper
981 | pooperscooper
982 | pooping
983 | poorwhitetrash
984 | popimp
985 | porchmonkey
986 | porn
987 | pornflick
988 | pornking
989 | porno
990 | pornography
991 | pornprincess
992 | pot
993 | poverty
994 | premature
995 | pric
996 | prick
997 | prickhead
998 | primetime
999 | propaganda
1000 | pros
1001 | prostitute
1002 | protestant
1003 | pu55i
1004 | pu55y
1005 | pube
1006 | pubic
1007 | pubiclice
1008 | pud
1009 | pudboy
1010 | pudd
1011 | puddboy
1012 | puke
1013 | puntang
1014 | purinapricness
1015 | puss
1016 | pussie
1017 | pussies
1018 | pussy
1019 | pussycat
1020 | pussyeater
1021 | pussyfucker
1022 | pussylicker
1023 | pussylips
1024 | pussylover
1025 | pussypounder
1026 | pusy
1027 | quashie
1028 | queef
1029 | queer
1030 | quickie
1031 | quim
1032 | ra8s
1033 | rabbi
1034 | racial
1035 | racist
1036 | radical
1037 | radicals
1038 | raghead
1039 | randy
1040 | rape
1041 | raped
1042 | raper
1043 | rapist
1044 | rearend
1045 | rearentry
1046 | rectum
1047 | redlight
1048 | redneck
1049 | reefer
1050 | reestie
1051 | refugee
1052 | reject
1053 | remains
1054 | rentafuck
1055 | republican
1056 | rere
1057 | retard
1058 | retarded
1059 | ribbed
1060 | rigger
1061 | rimjob
1062 | rimming
1063 | roach
1064 | robber
1065 | roundeye
1066 | rump
1067 | russki
1068 | russkie
1069 | sadis
1070 | sadom
1071 | samckdaddy
1072 | sandm
1073 | sandnigger
1074 | satan
1075 | scag
1076 | scallywag
1077 | scat
1078 | schlong
1079 | screw
1080 | screwyou
1081 | scrotum
1082 | scum
1083 | semen
1084 | seppo
1085 | servant
1086 | sex
1087 | sexed
1088 | sexfarm
1089 | sexhound
1090 | sexhouse
1091 | sexing
1092 | sexkitten
1093 | sexpot
1094 | sexslave
1095 | sextogo
1096 | sextoy
1097 | sextoys
1098 | sexual
1099 | sexually
1100 | sexwhore
1101 | sexy
1102 | sexymoma
1103 | sexy-slim
1104 | shag
1105 | shaggin
1106 | shagging
1107 | shat
1108 | shav
1109 | shawtypimp
1110 | sheeney
1111 | shhit
1112 | shinola
1113 | shit
1114 | shitcan
1115 | shitdick
1116 | shite
1117 | shiteater
1118 | shited
1119 | shitface
1120 | shitfaced
1121 | shitfit
1122 | shitforbrains
1123 | shitfuck
1124 | shitfucker
1125 | shitfull
1126 | shithapens
1127 | shithappens
1128 | shithead
1129 | shithouse
1130 | shiting
1131 | shitlist
1132 | shitola
1133 | shitoutofluck
1134 | shits
1135 | shitstain
1136 | shitted
1137 | shitter
1138 | shitting
1139 | shitty
1140 | shoot
1141 | shooting
1142 | shortfuck
1143 | showtime
1144 | sick
1145 | sissy
1146 | sixsixsix
1147 | sixtynine
1148 | sixtyniner
1149 | skank
1150 | skankbitch
1151 | skankfuck
1152 | skankwhore
1153 | skanky
1154 | skankybitch
1155 | skankywhore
1156 | skinflute
1157 | skum
1158 | skumbag
1159 | slant
1160 | slanteye
1161 | slapper
1162 | slaughter
1163 | slav
1164 | slave
1165 | slavedriver
1166 | sleezebag
1167 | sleezeball
1168 | slideitin
1169 | slime
1170 | slimeball
1171 | slimebucket
1172 | slopehead
1173 | slopey
1174 | slopy
1175 | slut
1176 | sluts
1177 | slutt
1178 | slutting
1179 | slutty
1180 | slutwear
1181 | slutwhore
1182 | smack
1183 | smackthemonkey
1184 | smut
1185 | snatch
1186 | snatchpatch
1187 | snigger
1188 | sniggered
1189 | sniggering
1190 | sniggers
1191 | snigger's
1192 | sniper
1193 | snot
1194 | snowback
1195 | snownigger
1196 | sob
1197 | sodom
1198 | sodomise
1199 | sodomite
1200 | sodomize
1201 | sodomy
1202 | sonofabitch
1203 | sonofbitch
1204 | sooty
1205 | sos
1206 | soviet
1207 | spaghettibender
1208 | spaghettinigger
1209 | spank
1210 | spankthemonkey
1211 | sperm
1212 | spermacide
1213 | spermbag
1214 | spermhearder
1215 | spermherder
1216 | spic
1217 | spick
1218 | spig
1219 | spigotty
1220 | spik
1221 | spit
1222 | spitter
1223 | splittail
1224 | spooge
1225 | spreadeagle
1226 | spunk
1227 | spunky
1228 | squaw
1229 | stagg
1230 | stiffy
1231 | strapon
1232 | stringer
1233 | stripclub
1234 | stroke
1235 | stroking
1236 | stupid
1237 | stupidfuck
1238 | stupidfucker
1239 | suck
1240 | suckdick
1241 | sucker
1242 | suckme
1243 | suckmyass
1244 | suckmydick
1245 | suckmytit
1246 | suckoff
1247 | suicide
1248 | swallow
1249 | swallower
1250 | swalow
1251 | swastika
1252 | sweetness
1253 | syphilis
1254 | taboo
1255 | taff
1256 | tampon
1257 | tang
1258 | tantra
1259 | tarbaby
1260 | tard
1261 | teat
1262 | terror
1263 | terrorist
1264 | teste
1265 | testicle
1266 | testicles
1267 | thicklips
1268 | thirdeye
1269 | thirdleg
1270 | threesome
1271 | threeway
1272 | timbernigger
1273 | tinkle
1274 | tit
1275 | titbitnipply
1276 | titfuck
1277 | titfucker
1278 | titfuckin
1279 | titjob
1280 | titlicker
1281 | titlover
1282 | tits
1283 | tittie
1284 | titties
1285 | titty
1286 | tnt
1287 | toilet
1288 | tongethruster
1289 | tongue
1290 | tonguethrust
1291 | tonguetramp
1292 | tortur
1293 | torture
1294 | tosser
1295 | towelhead
1296 | trailertrash
1297 | tramp
1298 | trannie
1299 | tranny
1300 | transexual
1301 | transsexual
1302 | transvestite
1303 | triplex
1304 | trisexual
1305 | trojan
1306 | trots
1307 | tuckahoe
1308 | tunneloflove
1309 | turd
1310 | turnon
1311 | twat
1312 | twink
1313 | twinkie
1314 | twobitwhore
1315 | uck
1316 | uk
1317 | unfuckable
1318 | upskirt
1319 | uptheass
1320 | upthebutt
1321 | urinary
1322 | urinate
1323 | urine
1324 | usama
1325 | uterus
1326 | vagina
1327 | vaginal
1328 | vatican
1329 | vibr
1330 | vibrater
1331 | vibrator
1332 | vietcong
1333 | violence
1334 | virgin
1335 | virginbreaker
1336 | vomit
1337 | vulva
1338 | wab
1339 | wank
1340 | wanker
1341 | wanking
1342 | waysted
1343 | weapon
1344 | weenie
1345 | weewee
1346 | welcher
1347 | welfare
1348 | wetb
1349 | wetback
1350 | wetspot
1351 | whacker
1352 | whash
1353 | whigger
1354 | whiskey
1355 | whiskeydick
1356 | whiskydick
1357 | whit
1358 | whitenigger
1359 | whites
1360 | whitetrash
1361 | whitey
1362 | whiz
1363 | whop
1364 | whore
1365 | whorefucker
1366 | whorehouse
1367 | wigger
1368 | willie
1369 | williewanker
1370 | willy
1371 | wn
1372 | wog
1373 | women's
1374 | wop
1375 | wtf
1376 | wuss
1377 | wuzzie
1378 | xtc
1379 | xxx
1380 | yankee
1381 | yellowman
1382 | zigabo
1383 | zipperhead
--------------------------------------------------------------------------------
/resource/text_context_awc_model/weights/model.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "context", "class_name": "InputLayer", "config": {"batch_input_shape": [2, 30], "dtype": "float32", "sparse": false, "name": "context"}, "inbound_nodes": []}, {"name": "text", "class_name": "InputLayer", "config": {"batch_input_shape": [2, 30], "dtype": "float32", "sparse": false, "name": "text"}, "inbound_nodes": []}, {"name": "embedding_1", "class_name": "Embedding", "config": {"name": "embedding_1", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 12647, "output_dim": 300, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}, "inbound_nodes": [[["context", 0, 0, {}]]]}, {"name": "embedding_2", "class_name": "Embedding", "config": {"name": "embedding_2", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 12647, "output_dim": 300, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}, "inbound_nodes": [[["text", 0, 0, {}]]]}, {"name": "conv1d_1", "class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 32, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["embedding_1", 0, 0, {}]]]}, {"name": "conv1d_2", "class_name": "Conv1D", "config": {"name": "conv1d_2", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 32, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["embedding_2", 0, 0, {}]]]}, {"name": "lstm_1", "class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, 
"recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_1", 0, 0, {}]]]}, {"name": "lstm_2", "class_name": "LSTM", "config": {"name": "lstm_2", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": true, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_1", 0, 0, {}]]]}, {"name": "lstm_3", "class_name": "LSTM", "config": {"name": "lstm_3", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_2", 0, 0, {}]]]}, {"name": "lstm_4", "class_name": "LSTM", "config": {"name": "lstm_4", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": true, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_2", 0, 0, {}]]]}, {"name": "concatenate_1", "class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "axis": -1}, "inbound_nodes": [[["lstm_1", 0, 0, {}], ["lstm_2", 0, 0, {}]]]}, {"name": "concatenate_2", "class_name": "Concatenate", "config": {"name": "concatenate_2", "trainable": true, "axis": -1}, "inbound_nodes": [[["lstm_3", 0, 0, {}], ["lstm_4", 0, 0, {}]]]}, {"name": "awc", "class_name": "InputLayer", "config": 
{"batch_input_shape": [2, 11], "dtype": "float32", "sparse": false, "name": "awc"}, "inbound_nodes": []}, {"name": "concatenate_3", "class_name": "Concatenate", "config": {"name": "concatenate_3", "trainable": true, "axis": -1}, "inbound_nodes": [[["concatenate_1", 0, 0, {}], ["concatenate_2", 0, 0, {}], ["awc", 0, 0, {}]]]}, {"name": "dense_1", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 64, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["concatenate_3", 0, 0, {}]]]}, {"name": "dropout_1", "class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}, "inbound_nodes": [[["dense_1", 0, 0, {}]]]}, {"name": "dense_2", "class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["dropout_1", 0, 0, {}]]]}, {"name": "activation_1", "class_name": "Activation", "config": {"name": "activation_1", "trainable": true, "activation": "softmax"}, "inbound_nodes": [[["dense_2", 0, 0, {}]]]}], "input_layers": [["context", 0, 0], ["text", 0, 0], ["awc", 0, 0]], "output_layers": [["activation_1", 0, 0]]}, "keras_version": "2.1.6", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/resource/text_model/weights/model.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Sequential", "config": [{"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": true, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 33892, "output_dim": 256, "embeddings_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "normal", "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}}, {"class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 256, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "strides": [3], "pool_size": [3], "padding": "valid"}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 256, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.5, "recurrent_dropout": 0.0, "implementation": 1}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 256, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Activation", "config": {"name": "activation_1", "trainable": true, "activation": "softmax"}}], "keras_version": 
"2.1.6", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/resource/text_model_2D/weights/model.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Sequential", "config": [{"class_name": "Masking", "config": {"name": "masking_1", "trainable": true, "batch_input_shape": [null, 30], "dtype": "float32", "mask_value": 0}}, {"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 34552, "output_dim": 200, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}}, {"class_name": "Reshape", "config": {"name": "reshape_1", "trainable": true, "target_shape": [30, 200, 1]}}, {"class_name": "Conv2D", "config": {"name": "conv2d_1", "trainable": true, "filters": 32, "kernel_size": [5, 1], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling2D", "config": {"name": "max_pooling2d_1", "trainable": true, "pool_size": [2, 1], "padding": "valid", "strides": [2, 1], "data_format": "channels_last"}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Conv2D", "config": {"name": "conv2d_2", "trainable": true, "filters": 64, "kernel_size": [5, 1], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling2D", "config": {"name": "max_pooling2d_2", "trainable": true, "pool_size": [2, 1], "padding": "valid", "strides": [2, 1], "data_format": "channels_last"}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, 
"bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.1.2", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/resource/train/.directory:
--------------------------------------------------------------------------------
1 | [Dolphin]
2 | Timestamp=2017,2,6,16,20,50
3 | Version=3
4 | ViewMode=1
5 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/src/__init__.py
--------------------------------------------------------------------------------
/src/data_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/src/data_processing/__init__.py
--------------------------------------------------------------------------------
/src/data_processing/data_handler.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append('../')
4 | from collections import defaultdict
5 | import re
6 | from gensim.models.keyedvectors import KeyedVectors
7 | from gensim.models.wrappers import FastText
8 | import numpy
9 | from nltk.tokenize import TweetTokenizer
10 | import src.data_processing.glove2Word2vecLoader as glove
11 | import itertools
12 |
13 |
14 | # loading the emoji dataset
15 | def load_unicode_mapping(path):
16 |     emoji_dict = defaultdict()
17 |     with open(path, 'r') as f:
18 |         lines = f.readlines()
19 |         for line in lines:
20 |             tokens = line.strip().split('\t')
21 |             emoji_dict[tokens[0]] = tokens[1]
22 |     return emoji_dict
23 |
24 |
25 | def load_word2vec(path=None):
26 |     word2vecmodel = KeyedVectors.load_word2vec_format(path, binary=True)
27 |     return word2vecmodel
28 |
29 |
30 | def load_fasttext(path=None):
31 |     word2vecmodel = FastText.load_fasttext_format(path)
32 |     return word2vecmodel
33 |
34 |
35 | def InitializeWords(word_file_path):
36 |     word_dictionary = defaultdict()
37 |
38 |     with open(word_file_path, 'r') as f:
39 |         lines = f.readlines()
40 |         for line in lines:
41 |             tokens = line.lower().strip().split('\t')
42 |             word_dictionary[tokens[0]] = int(tokens[1])
43 |
44 |     for alphabet in "bcdefghjklmnopqrstuvwxyz":
45 |         if (alphabet in word_dictionary):
46 |             word_dictionary.__delitem__(alphabet)
47 |
48 |     for word in ['ann', 'assis',
49 |                  'bz',
50 |                  'ch', 'cre', 'ct',
51 |                  'di',
52 |                  'ed', 'ee',
53 |                  'ic',
54 |                  'le',
55 |                  'ng', 'ns',
56 |                  'pr', 'picon',
57 |                  'th', 'tle', 'tl', 'tr',
58 |                  'um',
59 |                  've',
60 |                  'yi'
61 |                  ]:
62 |         if (word in word_dictionary):
63 |             word_dictionary.__delitem__(word)
64 |
65 |     return word_dictionary
66 |
67 |
68 | def normalize_word(word):
69 |     temp = word
70 |     while True:
71 |         w = re.sub(r"([a-zA-Z])\1\1", r"\1\1", temp)
72 |         if (w == temp):
73 |             break
74 |         else:
75 |             temp = w
76 |     return w
77 |
78 |
79 | def load_split_word(split_word_file_path):
80 |     split_word_dictionary = defaultdict()
81 |     with open(split_word_file_path, 'r') as f:
82 |         lines = f.readlines()
83 |         for line in lines:
84 |             tokens = line.lower().strip().split('\t')
85 |             if (len(tokens) >= 2):
86 |                 split_word_dictionary[tokens[0]] = tokens[1]
87 |
88 |     print('split entry found:', len(split_word_dictionary.keys()))
89 |     return split_word_dictionary
90 |
91 |
92 | def split_hashtags(term, wordlist, split_word_list, dump_file=''):
93 | # print('term::',term)
94 |
95 | if (len(term.strip()) == 1):
96 | return ['']
97 |
98 | if (split_word_list != None and term.lower() in split_word_list):
99 | # print('found')
100 | return split_word_list.get(term.lower()).split(' ')
101 | else:
102 | print(term)
103 |
104 | # discarding # if exists
105 | if (term.startswith('#')):
106 | term = term[1:]
107 |
108 | if (wordlist != None and term.lower() in wordlist):
109 | return [term.lower()]
110 |
111 | words = []
112 | # max freq
113 | penalty = -69971
114 | max_coverage = penalty
115 |
116 | split_words_count = 6
117 | # checking camel cases
118 | term = re.sub(r'([0-9]+)', r' \1', term)
119 | term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
120 | term = re.sub(r'([A-Z][^A-Z ]+)', r' \1', term.strip())
121 | term = re.sub(r'([A-Z]{2,})+', r' \1', term)
122 | words = term.strip().split(' ')
123 |
124 | n_splits = 0
125 |
126 | if (len(words) < 3):
127 |         # splitting lower-case/upper-case runs into at most 6 words (up to 5 split points)
128 | chars = [c for c in term.lower()]
129 |
130 | found_all_words = False
131 |
132 | while (n_splits < split_words_count and not found_all_words):
133 | for idx in itertools.combinations(range(0, len(chars)), n_splits):
134 | output = numpy.split(chars, idx)
135 | line = [''.join(o) for o in output]
136 |
137 | score = (1. / len(line)) * sum(
138 | [wordlist.get(
139 | word.strip()) if word.strip() in wordlist else 0. if word.strip().isnumeric() else penalty for
140 | word in line])
141 |
142 | if (score > max_coverage):
143 | words = line
144 | max_coverage = score
145 |
146 | line_is_valid_word = [word.strip() in wordlist if not word.isnumeric() else True for word in line]
147 |
148 | if (all(line_is_valid_word)):
149 | found_all_words = True
150 |
151 | # uncomment to debug hashtag splitting
152 | # print(line, score, line_is_valid_word)
153 |
154 | n_splits = n_splits + 1
155 |
156 |     # forcing plain python strings (numpy.split can yield numpy string types)
157 |     words = [str(s) for s in words]
158 |
159 |     # dumping splits for debugging; skipped when no dump file is supplied
160 |     if (dump_file != '' and term != '' and len(words) > 0):
161 |         with open(dump_file, 'a') as f:
162 |             f.write('#' + str(term).strip() + '\t' + ' '.join(words) + '\t' + str(n_splits) + '\n')
163 |
164 | return words
165 |
166 |
167 | def load_abbreviation(path='../resource/abbreviations.txt'):
168 | abbreviation_dict = defaultdict()
169 | with open(path) as f:
170 | lines = f.readlines()
171 | for line in lines:
172 | token = line.lower().strip().split('\t')
173 | abbreviation_dict[token[0]] = token[1]
174 | return abbreviation_dict
175 |
176 |
177 | def filter_text(text, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=False,
178 | split_hashtag=False,
179 | ignore_profiles=False,
180 | replace_emoji=True):
181 | filtered_text = []
182 |
183 | filter_list = ['/', '-', '=', '+', '…', '\\', '(', ')', '&', ':']
184 |
185 | for t in text:
186 | word_tokens = None
187 |
188 | # discarding symbols
189 | # if (str(t).lower() in filter_list):
190 | # continue
191 |
192 | # ignoring profile information if ignore_profiles is set
193 | if (ignore_profiles and str(t).startswith("@")):
194 | continue
195 |
196 | # ignoring links
197 | if (str(t).startswith('http')):
198 | continue
199 |
200 | # ignoring sarcastic marker
201 | # uncomment the following line for Fracking sarcasm using neural network
202 | # if (str(t).lower() in ['#sarcasm','#sarcastic', '#yeahright','#not']):
203 | # continue
204 |
205 | # for onlinesarcasm
206 | # comment if you are running the code for Fracking sarcasm using neural network
207 | if (str(t).lower() in ['#sarcasm']):
208 | continue
209 |
210 | # replacing emoji with its unicode description
211 | if (replace_emoji):
212 | if (t in emoji_dict):
213 | t = emoji_dict.get(t).split('_')
214 | filtered_text.extend(t)
215 | continue
216 |
217 |         # splitting hashtags
218 | if (split_hashtag and str(t).startswith("#")):
219 | splits = split_hashtags(t, word_list, split_word_list, dump_file='../resource/hastash_split_dump.txt')
220 | # adding the hashtags
221 | if (splits != None):
222 | filtered_text.extend([s for s in splits if (not filtered_text.__contains__(s))])
223 | continue
224 |
225 |         # collapses repeated letters down to two (e.g. "coooool" -> "cool")
226 | if (normalize_text):
227 | t = normalize_word(t)
228 |
229 | # expands the abbreviation
230 | if (t in abbreviation_dict):
231 | tokens = abbreviation_dict.get(t).split(' ')
232 | filtered_text.extend(tokens)
233 | continue
234 |
235 | # appends the text
236 | filtered_text.append(t)
237 |
238 | return filtered_text
239 |
240 |
241 | def parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=False,
242 | split_hashtag=False,
243 | ignore_profiles=False,
244 | lowercase=False, replace_emoji=True, n_grams=None, at_character=False):
245 | data = []
246 | for i, line in enumerate(lines):
247 | if (i % 100 == 0):
248 | print(str(i) + '...', end='', flush=True)
249 |
250 | try:
251 |
252 | # convert the line to lowercase
253 | if (lowercase):
254 | line = line.lower()
255 |
256 | # split into token
257 | token = line.split('\t')
258 |
259 | # ID
260 | id = token[0]
261 |
262 | # label
263 | label = int(token[1].strip())
264 |
265 | # tweet text
266 | target_text = TweetTokenizer().tokenize(token[2].strip())
267 | if (at_character):
268 | target_text = [c for c in token[2].strip()]
269 |
270 | if (n_grams != None):
271 | n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
272 | target_text.extend(['_'.join(n) for n in n_grams_list])
273 |
274 | # filter text
275 | target_text = filter_text(target_text, word_list, split_word_list, emoji_dict, abbreviation_dict,
276 | normalize_text,
277 | split_hashtag,
278 | ignore_profiles, replace_emoji=replace_emoji)
279 |
280 | # awc dimensions
281 | dimensions = []
282 | if (len(token) > 3 and token[3].strip() != 'NA'):
283 | dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')]
284 |
285 | # context tweet
286 | context = []
287 | if (len(token) > 4):
288 | if (token[4] != 'NA'):
289 | context = TweetTokenizer().tokenize(token[4].strip())
290 | context = filter_text(context, word_list, split_word_list, emoji_dict, abbreviation_dict,
291 | normalize_text,
292 | split_hashtag,
293 | ignore_profiles, replace_emoji=replace_emoji)
294 |
295 | # author
296 | author = 'NA'
297 | if (len(token) > 5):
298 | author = token[5]
299 |
300 | if (len(target_text) != 0):
301 | # print((label, target_text, dimensions, context, author))
302 | data.append((id, label, target_text, dimensions, context, author))
303 | except:
304 | raise
305 | print('')
306 | return data
307 |
308 |
309 | def load_resources(word_file_path, split_word_path, emoji_file_path, split_hashtag=False, replace_emoji=True):
310 | word_list = None
311 | emoji_dict = None
312 |
313 | # load split files
314 | split_word_list = load_split_word(split_word_path)
315 |
316 | # load word dictionary
317 | if (split_hashtag):
318 | word_list = InitializeWords(word_file_path)
319 |
320 | if (replace_emoji):
321 | emoji_dict = load_unicode_mapping(emoji_file_path)
322 |
323 | abbreviation_dict = load_abbreviation()
324 |
325 | return word_list, emoji_dict, split_word_list, abbreviation_dict
326 |
327 |
328 | def loaddata(filename, word_file_path, split_word_path, emoji_file_path, normalize_text=False, split_hashtag=False,
329 | ignore_profiles=False,
330 | lowercase=True, replace_emoji=True, n_grams=None, at_character=False):
331 |
332 | word_list, emoji_dict, split_word_list, abbreviation_dict = load_resources(word_file_path, split_word_path,
333 | emoji_file_path,
334 | split_hashtag=split_hashtag,
335 | replace_emoji=replace_emoji)
336 | lines = open(filename, 'r').readlines()
337 |
338 | data = parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=normalize_text,
339 | split_hashtag=split_hashtag,
340 | ignore_profiles=ignore_profiles, lowercase=lowercase, replace_emoji=replace_emoji,
341 | n_grams=n_grams, at_character=at_character)
342 | return data
343 |
344 |
345 | def build_vocab(data, without_dimension=True, ignore_context=False, min_freq=0):
346 | vocab = defaultdict(int)
347 | vocab_freq = defaultdict(int)
348 |
349 | total_words = 1
350 | if (not without_dimension):
351 | for i in range(1, 101):
352 | vocab_freq[str(i)] = 0
353 | # vocab[str(i)] = total_words
354 | # total_words = total_words + 1
355 |
356 | for sentence_no, token in enumerate(data):
357 | for word in token[2]:
358 | if (word not in vocab_freq):
359 | # vocab[word] = total_words
360 | # total_words = total_words + 1
361 | vocab_freq[word] = 0
362 | vocab_freq[word] = vocab_freq.get(word) + 1
363 |
364 | if (not without_dimension):
365 | for word in token[3]:
366 | # if (word not in vocab_freq):
367 | # vocab[word] = total_words
368 | # total_words = total_words + 1
369 | vocab_freq[word] = vocab_freq.get(word) + 1
370 |
371 | if (ignore_context == False):
372 | for word in token[4]:
373 |                 if (word not in vocab_freq):  # check vocab_freq, mirroring the tweet-word loop above
374 | # vocab[word] = total_words
375 | # total_words = total_words + 1
376 | vocab_freq[word] = 0
377 | vocab_freq[word] = vocab_freq.get(word) + 1
378 |
379 | for k, v in vocab_freq.items():
380 | if (v >= min_freq):
381 | vocab[k] = total_words
382 | total_words = total_words + 1
383 |
384 | return vocab
385 |
386 |
387 | def build_reverse_vocab(vocab):
388 | rev_vocab = defaultdict(str)
389 | for k, v in vocab.items():
390 | rev_vocab[v] = k
391 | return rev_vocab
392 |
393 |
394 | def build_auxiliary_feature(data):
395 | aux = []
396 | for id, label, line, dimensions, context, author in data:
397 | aux.append([float(line.count('!')), float(line.count('?')), float(line.count('.')),
398 | sum([1.0 if c.isupper() else 0.0 for c in line]), float(line.count('"'))])
399 |
400 | return numpy.asarray(aux)
401 |
402 |
403 | def vectorize_word_dimension(data, vocab, drop_dimension_index=None, verbose=False):
404 | X = []
405 | Y = []
406 | D = []
407 | C = []
408 | A = []
409 |
410 | known_words_set = set()
411 | unknown_words_set = set()
412 |
413 | tokens = 0
414 | token_coverage = 0
415 |
416 | for id, label, line, dimensions, context, author in data:
417 | vec = []
418 | context_vec = []
419 | if (len(dimensions) != 0):
420 | dvec = [vocab.get(d) for d in dimensions]
421 | else:
422 | dvec = [vocab.get('unk')] * 11
423 |
424 | if drop_dimension_index != None:
425 | dvec.pop(drop_dimension_index)
426 |
427 | # tweet
428 | for words in line:
429 | tokens = tokens + 1
430 | if (words in vocab):
431 | vec.append(vocab[words])
432 | token_coverage = token_coverage + 1
433 | known_words_set.add(words)
434 | else:
435 | vec.append(vocab['unk'])
436 | unknown_words_set.add(words)
437 | # context_tweet
438 | if (len(context) != 0):
439 |             for words in context:  # iterate over the context tweet, not the target text
440 | tokens = tokens + 1
441 | if (words in vocab):
442 | context_vec.append(vocab[words])
443 | token_coverage = token_coverage + 1
444 | known_words_set.add(words)
445 | else:
446 | context_vec.append(vocab['unk'])
447 | unknown_words_set.add(words)
448 | else:
449 | context_vec = [vocab['unk']]
450 |
451 | X.append(vec)
452 | Y.append(label)
453 | D.append(dvec)
454 | C.append(context_vec)
455 | A.append(author)
456 |
457 | if verbose:
458 | print('Token coverage:', token_coverage / float(tokens))
459 | print('Word coverage:', len(known_words_set) / float(len(vocab.keys())))
460 |
461 | return numpy.asarray(X), numpy.asarray(Y), numpy.asarray(D), numpy.asarray(C), numpy.asarray(A)
462 |
463 |
464 | def pad_sequence_1d(sequences, maxlen=None, dtype='float32', padding='pre', truncating='pre', value=0.):
465 | X = [vectors for vectors in sequences]
466 |
467 | nb_samples = len(X)
468 |
469 | x = (numpy.zeros((nb_samples, maxlen)) * value).astype(dtype)
470 |
471 | for idx, s in enumerate(X):
472 | if truncating == 'pre':
473 | trunc = s[-maxlen:]
474 | elif truncating == 'post':
475 | trunc = s[:maxlen]
476 | else:
477 | raise ValueError("Truncating type '%s' not understood" % padding)
478 |
479 | if padding == 'post':
480 | x[idx, :len(trunc)] = trunc
481 | elif padding == 'pre':
482 | x[idx, -len(trunc):] = trunc
483 | else:
484 | raise ValueError("Padding type '%s' not understood" % padding)
485 |
486 | return x
487 |
488 |
489 | def write_vocab(filepath, vocab):
490 | with open(filepath, 'w') as fw:
491 | for key, value in vocab.items():
492 | fw.write(str(key) + '\t' + str(value) + '\n')
493 |
494 |
495 | def get_fasttext_weight(vocab, n=300, path=None):
496 |     word2vecmodel = load_fasttext(path=path)  # load the fastText model rather than the word2vec binary
497 | emb_weights = numpy.zeros((len(vocab.keys()) + 1, n))
498 | for k, v in vocab.items():
499 | if (word2vecmodel.__contains__(k)):
500 | emb_weights[v, :] = word2vecmodel[k][:n]
501 |
502 | return emb_weights
503 |
504 |
505 | def get_word2vec_weight(vocab, n=300, path=None):
506 | word2vecmodel = load_word2vec(path=path)
507 | emb_weights = numpy.zeros((len(vocab.keys()) + 1, n))
508 | for k, v in vocab.items():
509 | if (word2vecmodel.__contains__(k)):
510 | emb_weights[v, :] = word2vecmodel[k][:n]
511 |
512 | return emb_weights
513 |
514 |
515 | def load_glove_model(vocab, n=200, glove_path='/home/glove/glove.twitter.27B/glove.twitter.27B.200d.txt'):
516 | word2vecmodel = glove.load_glove_word2vec(glove_path)
517 |
518 | embedding_matrix = numpy.zeros((len(vocab.keys()) + 1, n))
519 | for k, v in vocab.items():
520 | embedding_vector = word2vecmodel.get(k)
521 | if embedding_vector is not None:
522 | embedding_matrix[v] = embedding_vector
523 |
524 | return embedding_matrix
525 |
526 |
527 | def add_ngram(sequences, token_indice, ngram_range=2):
528 | """
529 | Augment the input list of list (sequences) by appending n-grams values.
530 | Example: adding bi-gram
531 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
532 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
533 | >>> add_ngram(sequences, token_indice, ngram_range=2)
534 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
535 | Example: adding tri-gram
536 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
537 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
538 | >>> add_ngram(sequences, token_indice, ngram_range=3)
539 | [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
540 | """
541 | new_sequences = []
542 | for input_list in sequences:
543 | new_list = input_list[:]
544 | for i in range(len(new_list) - ngram_range + 1):
545 | for ngram_value in range(2, ngram_range + 1):
546 | ngram = tuple(new_list[i:i + ngram_value])
547 | if ngram in token_indice:
548 | new_list.append(token_indice[ngram])
549 | new_sequences.append(new_list)
550 |
551 | return new_sequences
552 |
553 |
554 | def create_ngram_set(input_list, ngram_value=2):
555 | """
556 | Extract a set of n-grams from a list of integers.
557 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
558 | {(4, 9), (4, 1), (1, 4), (9, 4)}
559 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
560 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
561 | """
562 | return set(zip(*[input_list[i:] for i in range(ngram_value)]))
563 |
564 |
565 | def prepare_fasttext(x_train, x_test, max_features=20000, ngram_range=2):
566 | if ngram_range > 1:
567 | print('Adding {}-gram features'.format(ngram_range))
568 | # Create set of unique n-gram from the training set.
569 | ngram_set = set()
570 | for input_list in x_train:
571 | for i in range(2, ngram_range + 1):
572 | set_of_ngram = create_ngram_set(input_list, ngram_value=i)
573 | ngram_set.update(set_of_ngram)
574 |
575 | # Dictionary mapping n-gram token to a unique integer.
576 | # Integer values are greater than max_features in order
577 | # to avoid collision with existing features.
578 | start_index = max_features + 1
579 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
580 | indice_token = {token_indice[k]: k for k in token_indice}
581 |
582 | # max_features is the highest integer that could be found in the dataset.
583 | max_features = numpy.max(list(indice_token.keys())) + 1
584 |
585 | # Augmenting x_train and x_test with n-grams features
586 | x_train = add_ngram(x_train, token_indice, ngram_range)
587 | x_test = add_ngram(x_test, token_indice, ngram_range)
588 | print('Average train sequence length: {}'.format(numpy.mean(list(map(len, x_train)), dtype=int)))
589 | print('Average test sequence length: {}'.format(numpy.mean(list(map(len, x_test)), dtype=int)))
590 |     return x_train, x_test
--------------------------------------------------------------------------------
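The helpers in data_handler.py chain into one preprocessing pipeline: loaddata parses the tab-separated rows (id, label, tweet and, optionally, dimensions, context and author), build_vocab assigns integer ids, vectorize_word_dimension maps tokens to those ids, and pad_sequence_1d fixes the sequence length. A minimal sketch of that flow, assuming the repository's default resource layout and the bundled Train_v1.txt split:

    import src.data_processing.data_handler as dh

    basepath = '..'  # assuming the snippet runs from inside src/
    train_file = basepath + '/resource/train/Train_v1.txt'
    word_file_path = basepath + '/resource/word_list_freq.txt'
    split_word_path = basepath + '/resource/word_split.txt'
    emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'

    # parse and clean the tab-separated tweets
    data = dh.loaddata(train_file, word_file_path, split_word_path, emoji_file_path,
                       normalize_text=True, split_hashtag=True, ignore_profiles=False)

    # vocabulary with an explicit 'unk' entry, exactly as the training scripts do
    vocab = dh.build_vocab(data)
    vocab['unk'] = len(vocab.keys()) + 1

    # integer sequences for tweet (X), label (Y), dimensions (D), context (C), author (A)
    X, Y, D, C, A = dh.vectorize_word_dimension(data, vocab)
    X = dh.pad_sequence_1d(X, maxlen=30)  # 30 matches the models' _line_maxlen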
/src/data_processing/glove2Word2vecLoader.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import shutil
3 | import hashlib
4 | from sys import platform
5 |
6 | import gensim
7 |
8 |
9 | def prepend_line(infile, outfile, line):
10 | with open(infile, 'r') as old:
11 | with open(outfile, 'w') as new:
12 | new.write(str(line) + "\n")
13 | shutil.copyfileobj(old, new)
14 |
15 |
16 | def prepend_slow(infile, outfile, line):
17 | with open(infile, 'r') as fin:
18 | with open(outfile, 'w') as fout:
19 | fout.write(line + "\n")
20 | for line in fin:
21 | fout.write(line)
22 |
23 |
24 | def checksum(filename):
25 | BLOCKSIZE = 65536
26 | hasher = hashlib.md5()
27 | with open(filename, 'rb') as afile:
28 | buf = afile.read(BLOCKSIZE)
29 | while len(buf) > 0:
30 | hasher.update(buf)
31 | buf = afile.read(BLOCKSIZE)
32 | return hasher.hexdigest()
33 |
34 |
35 | # Pre-computed glove files values.
36 | pretrain_num_lines = {"glove.840B.300d.txt": 2196017, "glove.42B.300d.txt": 1917494}
37 |
38 | pretrain_checksum = {
39 | "glove.6B.300d.txt": "b78f53fb56ec1ce9edc367d2e6186ba4",
40 | "glove.twitter.27B.50d.txt": "6e8369db39aa3ea5f7cf06c1f3745b06",
41 | "glove.42B.300d.txt": "01fcdb413b93691a7a26180525a12d6e",
42 | "glove.6B.50d.txt": "0fac3659c38a4c0e9432fe603de60b12",
43 | "glove.6B.100d.txt": "dd7f3ad906768166883176d69cc028de",
44 | "glove.twitter.27B.25d.txt": "f38598c6654cba5e6d0cef9bb833bdb1",
45 | "glove.6B.200d.txt": "49fa83e4a287c42c6921f296a458eb80",
46 | "glove.840B.300d.txt": "eec7d467bccfa914726b51aac484d43a",
47 | "glove.twitter.27B.100d.txt": "ccbdddec6b9610196dd2e187635fee63",
48 | "glove.twitter.27B.200d.txt": "e44cdc3e10806b5137055eeb08850569",
49 | }
50 |
51 |
52 | def check_num_lines_in_glove(filename, check_checksum=False):
53 | if check_checksum:
54 | assert checksum(filename) == pretrain_checksum[filename]
55 | if filename.startswith('glove.6B.'):
56 | return 400000
57 | elif filename.startswith('glove.twitter.27B.'):
58 | return 1193514
59 | else:
60 | return pretrain_num_lines[filename]
61 |
62 |
63 | def load_glove_word2vec(filename):
64 |
65 | # load the whole embedding into memory
66 | embeddings_index = dict()
67 |     with open(filename) as f:  # context manager ensures the file is closed even on error
68 |         for line in f:
69 |             values = line.split()
70 |             word = values[0]
71 |             coefs = np.asarray(values[1:], dtype='float32')
72 |             embeddings_index[word] = coefs
73 |
74 | print('Loaded %s word vectors.' % len(embeddings_index))
75 |
76 | return embeddings_index
77 |
--------------------------------------------------------------------------------
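load_glove_word2vec returns a plain word-to-vector dictionary; the models turn it into an embedding matrix by indexing that dictionary with their vocabulary (see load_glove_model in data_handler.py). A small sketch of that step, with the GloVe path and the toy vocabulary being placeholders:

    import numpy as np
    from src.data_processing.glove2Word2vecLoader import load_glove_word2vec

    glove_path = '/path/to/glove.twitter.27B.200d.txt'         # placeholder location
    vocab = {'sarcasm': 1, 'love': 2, 'mondays': 3, 'unk': 4}  # toy vocabulary

    embeddings_index = load_glove_word2vec(glove_path)      # word -> 200-d vector
    embedding_matrix = np.zeros((len(vocab) + 1, 200))      # row 0 stays zero for padding
    for word, idx in vocab.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[idx] = vector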
/src/sarcasm_context_moods.py:
--------------------------------------------------------------------------------
1 | import os
2 | import collections
3 | import random
4 | import sys
5 |
6 | sys.path.append('../../')
7 |
8 | import time
9 | import numpy
10 |
11 | numpy.random.seed(1337)
12 |
13 | from keras.layers.wrappers import TimeDistributed
14 | from keras import backend as K, regularizers
15 | from sklearn import metrics
16 | from keras.models import model_from_json
17 | from keras.layers.core import Dropout, Dense, Activation, Flatten
18 | from keras.layers.embeddings import Embedding
19 | from keras.layers.recurrent import LSTM
20 | from keras.layers.convolutional import Convolution1D
21 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
22 |
23 | from keras.layers.merge import add, concatenate
24 | from keras.models import Model
25 | from keras.utils import np_utils
26 | from keras.layers import Input
27 | import src.data_processing.data_handler as dh
28 | from collections import defaultdict
29 |
30 |
31 | class sarcasm_model():
32 | _train_file = None
33 | _gold_data_path = None
34 | _validation_file = None
35 | _tweet_file = None
36 | # test_debug = None
37 | _output_file = None
38 | _model_file = None
39 | _word_file_path = None
40 | _vocab_file_path = None
41 | _input_weight_file_path = None
42 | _vocab = None
43 | _line_maxlen = None
44 |
45 | def __init__(self):
46 | self._train_file = None
47 | self._test_file = None
48 | self._validation_file = None
49 | self._tweet_file = None
50 | self._output_file = None
51 | self._model_file = None
52 | self._word_file_path = None
53 | self._vocab_file_path = None
54 | self._input_weight_file_path = None
55 | self._vocab = None
56 |
57 | self._line_maxlen = 30
58 |
59 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256,
60 | dimension_length=11, trainable=True, batch_size=1):
61 |
62 | print('Building model...')
63 |
64 | context_input = Input(name='context', batch_shape=(batch_size, maxlen))
65 |
66 | if (len(c_emb_weights) == 0):
67 | c_emb = Embedding(vocab_size, 64, input_length=maxlen, embeddings_initializer='glorot_normal',
68 | trainable=trainable)(context_input)
69 | else:
70 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights],
71 | trainable=trainable)(context_input)
72 |
73 | c_cnn1 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal', activation='sigmoid',
74 | padding='valid', input_shape=(1, maxlen))(c_emb)
75 |
76 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
77 | dropout=0.25)(c_cnn1)
78 |
79 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.25,
80 | go_backwards=True)(c_cnn1)
81 |
82 | c_merged = concatenate([c_lstm1, c_lstm2], axis=-1)
83 |
84 | print(c_merged)
85 |
86 |
87 | text_input = Input(name='text', batch_shape=(batch_size, maxlen))
88 |
89 | if (len(emb_weights) == 0):
90 | emb = Embedding(vocab_size, 64, input_length=maxlen, embeddings_initializer='glorot_normal',
91 | trainable=trainable)(text_input)
92 | else:
93 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
94 | trainable=trainable)(text_input)
95 |
96 | t_cnn1 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal',
97 | activation='relu', padding='valid', input_shape=(1, maxlen))(emb)
98 |
99 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
100 | bias_initializer='he_normal', activation='sigmoid',
101 | dropout=0.25)(t_cnn1)
102 |
103 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
104 | bias_initializer='he_normal', activation='sigmoid',
105 | dropout=0.25,
106 | go_backwards=True)(t_cnn1)
107 |
108 | t_merged = concatenate([t_lstm1, t_lstm2], axis=-1)
109 |
110 | # t_merged = Reshape((-1,int(hidden_units / 8)))(t_merged)
111 |
112 | awc_input = Input(name='awc', batch_shape=(batch_size, 11))
113 |
114 | eaw = Embedding(101, int(hidden_units / 8), input_length=dimension_length,
115 | embeddings_initializer='glorot_normal',
116 | trainable=True)(awc_input)
117 |
118 | merged = concatenate([c_merged, t_merged, awc_input], axis=-1)
119 |
120 | # flat_model = Flatten()(merged)
121 |
122 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged)
123 | dnn_1 = Dropout(0.25)(dnn_1)
124 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1)
125 |
126 | softmax = Activation('softmax')(dnn_2)
127 |
128 | model = Model(inputs=[context_input, text_input, awc_input], outputs=softmax)
129 |
130 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
131 |         print('Number of parameters:', model.count_params())
132 |
133 | print(model.summary())
134 |
135 | return model
136 |
137 |
138 | class train_model(sarcasm_model):
139 | train = None
140 | validation = None
141 |
142 | def load_train_validation_test_data(self):
143 | print("Loading resource...")
144 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
145 | self._emoji_file_path, normalize_text=True,
146 | split_hashtag=True,
147 | ignore_profiles=False)
148 |
149 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
150 | self._emoji_file_path,
151 | normalize_text=True,
152 | split_hashtag=True,
153 | ignore_profiles=False)
154 |
155 | if (self._test_file != None):
156 | self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
157 | self._emoji_file_path, normalize_text=True,
158 | split_hashtag=True,
159 | ignore_profiles=True)
160 |
161 | def split_train_validation(self, train, ratio=.1):
162 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))])
163 | print(len(test_indices))
164 | train_data = []
165 | validation_data = []
166 | for i, t in enumerate(train):
167 | if (test_indices.__contains__(i)):
168 | validation_data.append(t)
169 | else:
170 | train_data.append(t)
171 | return train_data, validation_data
172 |
173 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
174 | vocab_file,
175 | output_file,
176 | word2vec_path=None):
177 | sarcasm_model.__init__(self)
178 |
179 | self._train_file = train_file
180 | self._validation_file = validation_file
181 | self._word_file_path = word_file_path
182 | self._split_word_file_path = split_word_path
183 | self._emoji_file_path = emoji_file_path
184 | self._model_file = model_file
185 | self._vocab_file_path = vocab_file
186 | self._output_file = output_file
187 |
188 | self.load_train_validation_test_data()
189 |
190 | batch_size = 2
191 |
192 |         self.train = self.train[:-(len(self.train) % batch_size) or None]  # 'or None' keeps everything when already divisible
193 |         self.validation = self.validation[:-(len(self.validation) % batch_size) or None]
194 |
195 | print(self._line_maxlen)
196 |         self._vocab = dh.build_vocab(self.train, without_dimension=False, ignore_context=False)  # keep the awc dimension tokens in the vocabulary
197 | self._vocab['unk'] = len(self._vocab.keys()) + 1
198 |
199 | print(len(self._vocab.keys()) + 1)
200 | print('unk::', self._vocab['unk'])
201 |
202 | dh.write_vocab(self._vocab_file_path, self._vocab)
203 |
204 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None)
205 |
206 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None)
207 |
208 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
209 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
210 | D = dh.pad_sequence_1d(D, maxlen=11)
211 |
212 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
213 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
214 | tD = dh.pad_sequence_1d(tD, maxlen=11)
215 |
216 | hidden_units = 64
217 | dimension_size = 300
218 |
219 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
220 | path=word2vec_path)
221 | cW = W
222 |
223 | print('Word2vec obtained....')
224 |
225 | ratio = self.calculate_label_ratio(Y)
226 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
227 |
228 | print('ratio', ratio)
229 |
230 | dimension_vocab = numpy.unique(D)
231 | print(len(dimension_vocab))
232 |
233 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
234 |
235 | print('train_X', X.shape)
236 | print('train_C', C.shape)
237 | print('train_D', D.shape)
238 | print('train_Y', Y.shape)
239 |
240 | print('validation_X', tX.shape)
241 | print('validation_C', tC.shape)
242 | print('validation_D', tD.shape)
243 | print('validation_Y', tY.shape)
244 |
245 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW,
246 | hidden_units=hidden_units, trainable=False, dimension_length=11,
247 | batch_size=batch_size)
248 |
249 | open(self._model_file + 'model.json', 'w').write(model.to_json())
250 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss')
251 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
252 | # save_best_only=False)
253 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
254 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
255 | epsilon=0.0001,
256 | cooldown=0, min_lr=0.000001)
257 |
258 | model.fit([C, X, D], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX, tD], tY), shuffle=True,
259 | callbacks=[save_best, lr_tuner], class_weight=ratio)
260 |
261 | def get_maxlen(self):
262 |         return max(len(t[2]) for t in self.train + self.validation)  # t[2] is the tokenized tweet text
263 |
264 | def write_vocab(self):
265 | with open(self._vocab_file_path, 'w') as fw:
266 |             for key, value in self._vocab.items():
267 | fw.write(str(key) + '\t' + str(value) + '\n')
268 |
269 |     def calculate_label_ratio(self, labels):
270 | return collections.Counter(labels)
271 |
272 |
273 | class test_model(sarcasm_model):
274 | test = None
275 | model = None
276 |
277 |     def __init__(self, word_file_path, split_word_path, emoji_file_path, model_file, vocab_file_path, output_file):
278 | print('initializing...')
279 | sarcasm_model.__init__(self)
280 |         self._word_file_path = word_file_path
281 |         self._split_word_file_path = split_word_path
282 |         self._emoji_file_path = emoji_file_path
283 |         self._model_file = model_file
284 |         self._vocab_file_path = vocab_file_path
285 |         self._output_file = output_file
286 | # self._line_maxlen = 45
287 | print('test_maxlen', self._line_maxlen)
288 |
289 | def predict_cross_validation(self, tC, tX, tD, test):
290 | self.__predict_model([tC, tX, tD], test)
291 |
292 | def load_trained_model(self, weight_file='model.json.hdf5'):
293 | start = time.time()
294 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
295 | end = time.time()
296 | print('model loading time::', (end - start))
297 |
298 | def __load_model(self, model_path, model_weight_path):
299 | self.model = model_from_json(open(model_path).read())
300 | print('model loaded from file...')
301 | self.model.load_weights(model_weight_path)
302 | print('model weights loaded from file...')
303 |
304 | def load_vocab(self):
305 | vocab = defaultdict()
306 | with open(self._vocab_file_path, 'r') as f:
307 | for line in f.readlines():
308 | key, value = line.split('\t')
309 | vocab[key] = value
310 |
311 | return vocab
312 |
313 | def predict(self, test_file, verbose=False):
314 | start = time.time()
315 |         self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path,
316 |                                 self._emoji_file_path, normalize_text=True, split_hashtag=True,
317 |                                 ignore_profiles=False)
318 | end = time.time()
319 | if (verbose == True):
320 | print('test resource loading time::', (end - start))
321 |
322 | self._vocab = self.load_vocab()
323 |
324 | start = time.time()
325 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
326 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
327 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
328 | tD = dh.pad_sequence_1d(tD, maxlen=11)
329 |
330 | end = time.time()
331 | if (verbose == True):
332 | print('test resource preparation time::', (end - start))
333 |
334 | self.__predict_model([tC, tX, tD], self.test)
335 |
336 | def __predict_model(self, tX, test):
337 |         prediction_probability = self.model.predict(tX, batch_size=8, verbose=1)
338 |
339 | y = []
340 | y_pred = []
341 |
342 | fd = open(self._output_file + '.analysis', 'w')
343 |         for i, label in enumerate(prediction_probability):  # test entries are (id, label, text, dims, context, author)
344 |             gold_label = test[i][1]
345 |             words = test[i][2]
346 |             dimensions = test[i][3]
347 |             context = test[i][4]
348 |             author = test[i][5]
349 |
350 | predicted = numpy.argmax(prediction_probability[i])
351 |
352 | y.append(int(gold_label))
353 | y_pred.append(predicted)
354 |
355 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
356 | + str(gold_label) + '\t'
357 | + str(predicted) + '\t'
358 | + ' '.join(words) + '\t'
359 | + str(dimensions) + '\t'
360 | + ' '.join(context))
361 |
362 | fd.write('\n')
363 |
364 | print('accuracy::', metrics.accuracy_score(y, y_pred))
365 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
366 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
367 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
368 | print('f_score::', metrics.classification_report(y, y_pred))
369 |
370 | fd.close()
371 |
372 |
373 | if __name__ == "__main__":
374 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
375 | train_file = basepath + '/resource/train/Train_context_moods.txt'
376 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt'
377 | test_file = basepath + '/resource/test/Test_context_AW.txt'
378 | word_file_path = basepath + '/resource/word_list_freq.txt'
379 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt'
380 | model_file = basepath + '/resource/text_context_awc_model/weights/'
381 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt'
382 | split_word_path = basepath + '/resource/word_split.txt'
383 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
384 |
385 | # word2vec path
386 | word2vec_path = '/home/aghosh/backups/GoogleNews-vectors-negative300.bin'
387 |
388 | tr = train_model(train_file=train_file, validation_file=validation_file, word_file_path=word_file_path,
389 | split_word_path=split_word_path, emoji_file_path=emoji_file_path, model_file=model_file,
390 | vocab_file=vocab_file_path, output_file=output_file,
391 | word2vec_path=word2vec_path)
392 |
393 | with K.get_session():
394 |         t = test_model(word_file_path, split_word_path, emoji_file_path, model_file, vocab_file_path, output_file)
395 | t.load_trained_model()
396 | t.predict(test_file)
397 |
--------------------------------------------------------------------------------
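The training scripts counter class imbalance by weighting each class with max_count / class_count (calculate_label_ratio is just collections.Counter). Keras' class_weight argument is usually given as a dict keyed by class index, so a dict-based variant of the same weight computation, shown on toy labels, looks like the sketch below (illustration only, not repository code):

    import collections

    labels = [0, 0, 0, 1]                     # toy label list, i.e. Y before to_categorical
    counts = collections.Counter(labels)      # what calculate_label_ratio returns
    class_weight = {cls: max(counts.values()) / count for cls, count in counts.items()}
    # -> {0: 1.0, 1: 3.0}: the minority (sarcastic) class is up-weighted during model.fit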
/src/sarcasm_detection_model_CNN_DNN_2D.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from keras.engine import InputLayer
5 | from keras.layers.normalization import BatchNormalization
6 | from keras.layers.wrappers import TimeDistributed
7 |
8 | sys.path.append('../')
9 | import collections
10 | import time
11 | import numpy
12 |
13 | numpy.random.seed(1337)
14 | from sklearn import metrics
15 | from keras.models import Sequential, model_from_json
16 | from keras.layers import Masking, Bidirectional, GlobalAveragePooling2D
17 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten
18 | from keras.layers.embeddings import Embedding
19 | from keras.layers.recurrent import LSTM
20 | from keras.layers.convolutional import Convolution1D, Convolution2D, MaxPooling2D
21 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
22 | from keras.callbacks import EarlyStopping
23 | from keras.optimizers import Adam
24 | from keras.utils import np_utils
25 | from collections import defaultdict
26 | import src.data_processing.data_handler as dh
27 |
28 | import keras.backend as K
29 |
30 |
31 | class sarcasm_model():
32 | _train_file = None
33 | _test_file = None
34 | _tweet_file = None
35 | _output_file = None
36 | _model_file = None
37 | _word_file_path = None
38 | _vocab_file_path = None
39 | _input_weight_file_path = None
40 | _vocab = None
41 | _line_maxlen = None
42 |
43 | def __init__(self):
44 | self._line_maxlen = 30
45 |
46 | def _build_network(self, vocab_size, maxlen, emb_weights=[], hidden_units=256, trainable=False):
47 | print('Build model...')
48 |
49 | model = Sequential()
50 |
51 | model.add(Masking(mask_value=0, input_shape=(maxlen,)))
52 |
53 | if (len(emb_weights) == 0):
54 | model.add(Embedding(vocab_size, 20, input_length=maxlen, embeddings_initializer='he_normal',
55 | trainable=trainable, mask_zero=True))
56 | else:
57 | model.add(Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
58 | trainable=trainable))
59 |
60 | model.add(Reshape((model.output_shape[1], model.output_shape[2], 1)))
61 |
62 | model.add(Convolution2D(int(hidden_units / 8), (5, 1), kernel_initializer='he_normal', padding='valid',
63 | activation='relu'))
64 | model.add(MaxPooling2D((2, 1)))
65 | model.add(Dropout(0.5))
66 |
67 | model.add(Convolution2D(int(hidden_units / 4), (3, 1), kernel_initializer='he_normal', padding='valid',
68 | activation='relu'))
69 | model.add(MaxPooling2D((2, 1)))
70 | model.add(Dropout(0.5))
71 |         model.add(Flatten())  # collapse the 2-D feature maps to one vector per tweet before the dense layers
72 | model.add(Dense(int(hidden_units / 2), kernel_initializer='he_normal', activation='relu'))
73 | model.add(Dropout(0.5))
74 |
75 | model.add(Dense(2, activation='softmax'))
76 |
77 | adam = Adam(lr=0.001)
78 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
79 |         print('Number of parameters:', model.count_params())
80 |
81 | print(model.summary())
82 |
83 | return model
84 |
85 |
86 | class train_model(sarcasm_model):
87 | train = None
88 | validation = None
89 | print("Loading resource...")
90 |
91 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
92 | vocab_file,
93 | output_file,
94 | word2vec_path=None, test_file=None, input_weight_file_path=None):
95 |
96 | sarcasm_model.__init__(self)
97 |
98 | self._train_file = train_file
99 | self._validation_file = validation_file
100 | self._word_file_path = word_file_path
101 | self._split_word_file_path = split_word_path
102 | self._emoji_file_path = emoji_file_path
103 | self._model_file = model_file
104 | self._vocab_file_path = vocab_file
105 | self._output_file = output_file
106 | self._input_weight_file_path = input_weight_file_path
107 |         self._test_file = test_file  # needed so the test split can be folded into the vocabulary below
108 | self.load_train_validation_test_data()
109 |
110 | print(self._line_maxlen)
111 |
112 | # build vocabulary
113 | if (self._test_file != None):
114 | self._vocab = dh.build_vocab(self.train + self.validation + self.test)
115 | else:
116 | self._vocab = dh.build_vocab(self.train + self.validation)
117 |
118 | self._vocab['unk'] = len(self._vocab.keys()) + 1
119 |
120 | print(len(self._vocab.keys()) + 1)
121 | print('unk::', self._vocab['unk'])
122 |
123 | dh.write_vocab(self._vocab_file_path, self._vocab)
124 |
125 | # prepares input
126 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
127 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
128 |
129 | # prepares input
130 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
131 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
132 |
133 | # embedding dimension
134 | dimension_size = 100
135 | W = []
136 |
137 | W = dh.get_word2vec_weight(self._vocab, n=200,
138 | path=word2vec_path)
139 |
140 | # solving class imbalance
141 | ratio = self.calculate_label_ratio(Y)
142 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
143 | print('class ratio::', ratio)
144 |
145 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
146 |
147 | print('train_X', X.shape)
148 | print('train_Y', Y.shape)
149 | print('validation_X', tX.shape)
150 | print('validation_Y', tY.shape)
151 |
152 | # trainable true if you want word2vec weights to be updated
153 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, trainable=False)
154 |
155 | open(self._model_file + 'model.json', 'w').write(model.to_json())
156 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
157 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
158 | save_best_only=False)
159 | early_stopping = EarlyStopping(monitor='loss', patience=20, verbose=1)
160 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
161 | epsilon=0.0001,
162 | cooldown=0, min_lr=0.000001)
163 |
164 | # training
165 | model.fit(X, Y, batch_size=128, epochs=100, validation_data=(tX, tY), shuffle=True,
166 | callbacks=[save_best], class_weight=ratio)
167 |
168 | def load_train_validation_test_data(self):
169 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
170 | self._emoji_file_path, normalize_text=True,
171 | split_hashtag=True,
172 | ignore_profiles=False, replace_emoji=False)
173 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
174 | self._emoji_file_path, normalize_text=True,
175 | split_hashtag=True,
176 | ignore_profiles=False, replace_emoji=False)
177 | if (self._test_file != None):
178 |             self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
179 |                                     self._emoji_file_path, normalize_text=True, split_hashtag=True,
180 |                                     ignore_profiles=True, replace_emoji=False)
181 |
182 | def get_maxlen(self):
183 |         return max(len(t[2]) for t in self.train + self.validation)  # t[2] is the tokenized tweet text
184 |
185 | def write_vocab(self):
186 | with open(self._vocab_file_path, 'w') as fw:
187 |             for key, value in self._vocab.items():
188 | fw.write(str(key) + '\t' + str(value) + '\n')
189 |
190 | def calculate_label_ratio(self, labels):
191 | return collections.Counter(labels)
192 |
193 |
194 | class test_model(sarcasm_model):
195 | test = None
196 | model = None
197 |
198 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
199 | input_weight_file_path=None):
200 | print('initializing...')
201 | sarcasm_model.__init__(self)
202 |
203 | self._model_file_path = model_file
204 | self._word_file_path = word_file_path
205 | self._split_word_file_path = split_word_path
206 | self._emoji_file_path = emoji_file_path
207 | self._vocab_file_path = vocab_file_path
208 | self._output_file = output_file
209 | self._input_weight_file_path = input_weight_file_path
210 |
211 | print('test_maxlen', self._line_maxlen)
212 |
213 | def load_trained_model(self, weight_file='model.json.hdf5'):
214 | start = time.time()
215 | self.__load_model(self._model_file_path + 'model.json', self._model_file_path + weight_file)
216 | end = time.time()
217 | print('model loading time::', (end - start))
218 |
219 | def __load_model(self, model_path, model_weight_path):
220 | self.model = model_from_json(open(model_path).read())
221 | print('model loaded from file...')
222 | self.model.load_weights(model_weight_path)
223 | print('model weights loaded from file...')
224 |
225 | def load_vocab(self):
226 | vocab = defaultdict()
227 | with open(self._vocab_file_path, 'r') as f:
228 | for line in f.readlines():
229 | key, value = line.split('\t')
230 | vocab[key] = value
231 |
232 | return vocab
233 |
234 | def predict(self, test_file, verbose=False):
235 | try:
236 | start = time.time()
237 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
238 | normalize_text=True, split_hashtag=True,
239 | ignore_profiles=True)
240 | end = time.time()
241 | if (verbose == True):
242 | print('test resource loading time::', (end - start))
243 |
244 | self._vocab = self.load_vocab()
245 |
246 | start = time.time()
247 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
248 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
249 | end = time.time()
250 | if (verbose == True):
251 | print('test resource preparation time::', (end - start))
252 |
253 | self.__predict_model(tX, self.test)
254 | except Exception as e:
255 | print('Error:', e)
256 |
257 | def __predict_model(self, tX, test):
258 | y = []
259 | y_pred = []
260 |
261 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
262 |
263 | try:
264 | fd = open(self._output_file + '.analysis', 'w')
265 | for i, (label) in enumerate(prediction_probability):
266 | id = test[i][0]
267 | gold_label = test[i][1]
268 | words = test[i][2]
269 | dimensions = test[i][3]
270 | context = test[i][4]
271 | author = test[i][5]
272 |
273 | predicted = numpy.argmax(prediction_probability[i])
274 |
275 | y.append(int(gold_label))
276 | y_pred.append(predicted)
277 |
278 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
279 | + str(gold_label) + '\t'
280 | + str(predicted) + '\t'
281 | + ' '.join(words))
282 |
283 | fd.write('\n')
284 |
285 | print()
286 |
287 | print('accuracy::', metrics.accuracy_score(y, y_pred))
288 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
289 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
290 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
291 | print('f_score::', metrics.classification_report(y, y_pred))
292 | fd.close()
293 | except Exception as e:
294 | print(e)
295 |
296 |
297 | if __name__ == "__main__":
298 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
299 | train_file = basepath + '/resource/train/Train_v1.txt'
300 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
301 | test_file = basepath + '/resource/test/Test_v1.txt'
302 | word_file_path = basepath + '/resource/word_list_freq.txt'
303 | split_word_path = basepath + '/resource/word_split.txt'
304 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
305 |
306 | output_file = basepath + '/resource/text_model_2D/TestResults.txt'
307 | model_file = basepath + '/resource/text_model_2D/weights/'
308 | vocab_file_path = basepath + '/resource/text_model_2D/vocab_list.txt'
309 |
310 | # word2vec path
311 | word2vec_path = '/home/ubuntu/word2vec/GoogleNews-vectors-negative300.bin'
312 | glove_path = '/home/striker/word2vec/glove_model_200.txt.bin'
313 |
314 | # test file is passed to build the vocabulary
315 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
316 | # vocab_file_path, output_file,
317 | # word2vec_path=glove_path, test_file=test_file)
318 | #
319 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
320 | t.load_trained_model()
321 | t.predict(test_file)
322 |
--------------------------------------------------------------------------------
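The 2-D model reshapes the embedded tweet into a single-channel (maxlen, embedding_dim, 1) "image" so that kernels of height 5 and 3 slide over word positions only. A shape walk-through under the script's defaults (maxlen=30, 200-d word2vec slices, hidden_units=256), written as an illustrative sketch rather than repository code:

    # Keras 'valid' convolutions: out_len = in_len - kernel_len + 1
    maxlen, emb_dim, hidden_units = 30, 200, 256

    # Embedding + Reshape : (batch, 30) -> (batch, 30, 200) -> (batch, 30, 200, 1)
    # Conv2D(32, (5, 1))  : (batch, 26, 200, 32)   filters span 5 word positions
    # MaxPooling2D((2, 1)): (batch, 13, 200, 32)
    # Conv2D(64, (3, 1))  : (batch, 11, 200, 64)
    # MaxPooling2D((2, 1)): (batch, 5, 200, 64)
    # the feature maps are then collapsed to one vector per tweet
    # before Dense(128) and Dense(2, softmax) produce the sarcasm probabilities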
/src/sarcasm_detection_model_CNN_LSTM_ATTN.py:
--------------------------------------------------------------------------------
1 | # for smaller datasets please use the simpler model sarcasm_detection_model_CNN_LSTM_DNN_simpler.py
2 |
3 | import os
4 | import sys
5 |
6 | from src.data_processing.data_handler import load_glove_model, build_auxiliary_feature
7 |
8 | sys.path.append('../')
9 |
10 | import collections
11 | import time
12 | import numpy
13 |
14 | from keras import backend as K
15 |
16 | from keras import backend as K, regularizers
17 | from sklearn import metrics
18 | from keras.models import model_from_json, load_model
19 | from keras.layers.core import Dropout, Dense, Activation, Flatten
20 | from keras.layers.embeddings import Embedding
21 | from keras.layers.recurrent import LSTM
22 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
23 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
24 |
25 | from keras.layers.merge import concatenate, multiply
26 | from keras.models import Model
27 | from keras.utils import np_utils
28 | from keras.layers import Input, Reshape, Permute, RepeatVector, Lambda, merge
29 | import src.data_processing.data_handler as dh
30 | from collections import defaultdict
31 |
32 |
33 | class sarcasm_model():
34 | _train_file = None
35 | _test_file = None
36 | _tweet_file = None
37 | _output_file = None
38 | _model_file_path = None
39 | _word_file_path = None
40 | _split_word_file_path = None
41 | _emoji_file_path = None
42 | _vocab_file_path = None
43 | _input_weight_file_path = None
44 | _vocab = None
45 | _line_maxlen = None
46 |
47 | def __init__(self):
48 | self._line_maxlen = 30
49 |
50 | def attention_3d_block(self, inputs, SINGLE_ATTENTION_VECTOR=False):
51 | # inputs.shape = (batch_size, time_steps, input_dim)
52 | input_dim = int(inputs.shape[2])
53 | a = Permute((2, 1))(inputs)
54 | a = Reshape((input_dim, self._line_maxlen))(a)
55 |         # the Reshape above is effectively a no-op; it only makes explicit which axis is which
56 | a = Dense(self._line_maxlen, activation='softmax')(a)
57 | if SINGLE_ATTENTION_VECTOR:
58 | a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
59 | a = RepeatVector(input_dim)(a)
60 | a_probs = Permute((2, 1), name='attention_vec')(a)
61 | output_attention_mul = multiply([inputs, a_probs], name='attention_mul')
62 | return output_attention_mul
63 |
64 | def _build_network(self, vocab_size, maxlen, emb_weights=[], embedding_dimension=50, hidden_units=256,
65 | batch_size=1):
66 | print('Build model...')
67 |
68 | text_input = Input(name='text', shape=(maxlen,))
69 |
70 | if (len(emb_weights) == 0):
71 | emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen,
72 | embeddings_initializer='glorot_normal',
73 | trainable=True)(text_input)
74 | else:
75 | emb = Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
76 | trainable=False)(text_input)
77 | emb_dropout = Dropout(0.5)(emb)
78 |
79 | lstm_bwd = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.4,
80 | go_backwards=True, return_sequences=True)(emb_dropout)
81 | lstm_fwd = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.4,
82 | return_sequences=True)(emb_dropout)
83 |
84 | lstm_merged = concatenate([lstm_bwd, lstm_fwd])
85 |
86 | attention_mul = self.attention_3d_block(lstm_merged)
87 |
88 | flat_attention = Flatten()(attention_mul)
89 |
90 | aux_input = Input(name='aux', shape=(5,))
91 |
92 | merged_aux = concatenate([flat_attention, aux_input], axis=1)
93 |
94 |
95 | reshaped = Reshape((-1, 1))(merged_aux)
96 |
97 | print(reshaped.shape)
98 |
99 | cnn1 = Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='relu')(
100 | reshaped)
101 | pool1 = MaxPooling1D(pool_size=3)(cnn1)
102 | print(pool1.shape)
103 |
104 | cnn2 = Convolution1D(2 * hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='relu')(
105 | pool1)
106 | pool2 = MaxPooling1D(pool_size=3)(cnn2)
107 | print(pool2.shape)
108 |
109 | flat_cnn = Flatten()(pool2)
110 |
111 | dnn_1 = Dense(hidden_units)(flat_cnn)
112 | dropout_1 = Dropout(0.25)(dnn_1)
113 | dnn_2 = Dense(2)(dropout_1)
114 | print(dnn_2.shape)
115 |
116 | softmax = Activation('softmax')(dnn_2)
117 |
118 | model = Model(inputs=[text_input, aux_input], outputs=softmax)
119 |
120 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
121 |         print('Number of parameters:', model.count_params())
122 |
123 | print(model.summary())
124 |
125 | return model
126 |
127 |
128 | class train_model(sarcasm_model):
129 | train = None
130 | validation = None
131 | print("Loading resource...")
132 |
133 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
134 | vocab_file,
135 | output_file,
136 | input_weight_file_path=None):
137 | sarcasm_model.__init__(self)
138 |
139 | self._train_file = train_file
140 | self._validation_file = validation_file
141 | self._word_file_path = word_file_path
142 | self._split_word_file_path = split_word_path
143 | self._emoji_file_path = emoji_file_path
144 | self._model_file = model_file
145 | self._vocab_file_path = vocab_file
146 | self._output_file = output_file
147 | self._input_weight_file_path = input_weight_file_path
148 |
149 | self.load_train_validation_data()
150 |
151 | print(self._line_maxlen)
152 | batch_size = 32
153 |
154 | # build vocabulary
155 | # truncates words with min freq=1
156 | self._vocab = dh.build_vocab(self.train, min_freq=1)
157 | if ('unk' not in self._vocab):
158 | self._vocab['unk'] = len(self._vocab.keys()) + 1
159 |
160 | print(len(self._vocab.keys()) + 1)
161 | print('unk::', self._vocab['unk'])
162 |
163 | dh.write_vocab(self._vocab_file_path, self._vocab)
164 |
165 |         self.train = self.train[:-(len(self.train) % batch_size) or None]  # 'or None' keeps everything when already divisible
166 |         self.validation = self.validation[:-(len(self.validation) % batch_size) or None]
167 |
168 | # prepares input
169 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
170 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
171 |
172 | # prepares input
173 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
174 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
175 |
176 | # embedding dimension
177 | dimension_size = 300
178 | emb_weights = load_glove_model(self._vocab, n=dimension_size,
179 | glove_path='/home/aghosh/backups/glove.6B.300d.txt')
180 |
181 | # aux inputs
182 | aux_train = build_auxiliary_feature(self.train)
183 | aux_validation = build_auxiliary_feature(self.validation)
184 |
185 | # solving class imbalance
186 | ratio = self.calculate_label_ratio(Y)
187 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
188 | print('class ratio::', ratio)
189 |
190 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
191 |
192 | print('train_X', X.shape)
193 | print('train_Y', Y.shape)
194 | print('validation_X', tX.shape)
195 | print('validation_Y', tY.shape)
196 |
197 | # trainable true if you want word2vec weights to be updated
198 | # Not applicable in this code
199 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights, hidden_units=32,
200 | embedding_dimension=dimension_size, batch_size=batch_size)
201 |
202 | # open(self._model_file + 'model.json', 'w').write(model.to_json())
203 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
204 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
205 | save_best_only=False)
206 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
207 |
208 | # training
209 | model.fit([X, aux_train], Y, batch_size=batch_size, epochs=10, validation_data=([tX, aux_validation], tY),
210 | shuffle=True,
211 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
212 |
213 | def load_train_validation_data(self):
214 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
215 | self._emoji_file_path, normalize_text=True,
216 | split_hashtag=True,
217 | ignore_profiles=False)
218 | print('Training data loading finished...')
219 |
220 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
221 | self._emoji_file_path,
222 | normalize_text=True,
223 | split_hashtag=True,
224 | ignore_profiles=False)
225 | print('Validation data loading finished...')
226 |
227 | if (self._test_file != None):
228 |             self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
229 |                                     self._emoji_file_path, normalize_text=True, split_hashtag=True,
230 |                                     ignore_profiles=True)
231 |
232 | def get_maxlen(self):
233 |         return max(len(t[2]) for t in self.train + self.validation)  # t[2] is the tokenized tweet text
234 |
235 | def write_vocab(self):
236 | with open(self._vocab_file_path, 'w') as fw:
237 |             for key, value in self._vocab.items():
238 | fw.write(str(key) + '\t' + str(value) + '\n')
239 |
240 | def calculate_label_ratio(self, labels):
241 | return collections.Counter(labels)
242 |
243 |
244 | class test_model(sarcasm_model):
245 | test = None
246 | model = None
247 |
248 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
249 | input_weight_file_path=None):
250 | print('initializing...')
251 | sarcasm_model.__init__(self)
252 |
253 | self._model_file_path = model_file
254 | self._word_file_path = word_file_path
255 | self._split_word_file_path = split_word_path
256 | self._emoji_file_path = emoji_file_path
257 | self._vocab_file_path = vocab_file_path
258 | self._output_file = output_file
259 | self._input_weight_file_path = input_weight_file_path
260 |
261 | print('test_maxlen', self._line_maxlen)
262 |
263 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'):
264 | start = time.time()
265 | self.__load_model(self._model_file_path + weight_file)
266 | end = time.time()
267 | print('model loading time::', (end - start))
268 |
269 | def __load_model(self, model_path):
270 | self.model = load_model(model_path)
271 | print('model loaded from file...')
272 | # self.model.load_weights(model_weight_path)
273 | # print('model weights loaded from file...')
274 |
275 | def load_vocab(self):
276 | vocab = defaultdict()
277 | with open(self._vocab_file_path, 'r') as f:
278 | for line in f.readlines():
279 | key, value = line.split('\t')
280 | vocab[key] = value
281 |
282 | return vocab
283 |
284 | def predict(self, test_file, verbose=False):
285 | try:
286 | start = time.time()
287 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
288 | normalize_text=True, split_hashtag=True,
289 | ignore_profiles=False)
290 | end = time.time()
291 | if (verbose == True):
292 | print('test resource loading time::', (end - start))
293 |
294 | self._vocab = self.load_vocab()
295 | print('vocab loaded...')
296 |
297 | start = time.time()
298 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
299 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
300 |
301 | aux_test = build_auxiliary_feature(self.test)
302 |
303 | end = time.time()
304 | if (verbose == True):
305 | print('test resource preparation time::', (end - start))
306 |
307 | self.__predict_model([tX, aux_test], self.test)
308 | except Exception as e:
309 | print('Error:', e)
310 | raise
311 |
312 | def __predict_model(self, tX, test):
313 | y = []
314 | y_pred = []
315 |
316 | # tX = tX[:-len(tX) % 32]
317 | # test = test[:-len(test) % 32]
318 |
319 |         prediction_probability = self.model.predict(tX, batch_size=1, verbose=1)
320 |
321 | try:
322 | fd = open(self._output_file + '.analysis', 'w')
323 | for i, (label) in enumerate(prediction_probability):
324 | gold_label = test[i][1]
325 | words = test[i][2]
326 | dimensions = test[i][3]
327 | context = test[i][4]
328 | author = test[i][5]
329 |
330 | predicted = numpy.argmax(prediction_probability[i])
331 |
332 | y.append(int(gold_label))
333 | y_pred.append(predicted)
334 |
335 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
336 | + str(gold_label) + '\t'
337 | + str(predicted) + '\t'
338 | + ' '.join(words))
339 |
340 | fd.write('\n')
341 |
342 | print()
343 |
344 | print('accuracy::', metrics.accuracy_score(y, y_pred))
345 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
346 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
347 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
348 | print('f_score::', metrics.classification_report(y, y_pred))
349 | fd.close()
350 | except Exception as e:
351 | print(e)
352 | raise
353 |
354 |
355 | if __name__ == "__main__":
356 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
357 | train_file = basepath + '/resource/train/Train_v1.txt'
358 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
359 | test_file = basepath + '/resource/test/Test_v1.txt'
360 | word_file_path = basepath + '/resource/word_list_freq.txt'
361 | split_word_path = basepath + '/resource/word_split.txt'
362 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
363 |
364 | output_file = basepath + '/resource/text_model/TestResults.txt'
365 | model_file = basepath + '/resource/text_model/weights/'
366 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
367 |
368 | # training
369 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
370 | vocab_file_path, output_file)
371 |
372 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
373 | t.load_trained_model(weight_file='model.json.hdf5')
374 | t.predict(test_file)
375 |
--------------------------------------------------------------------------------
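The test_model above restores the whole network with keras.models.load_model applied to the single model.json.hdf5 checkpoint, while the scripts that follow rebuild the architecture from model.json and then load weights into it. A minimal sketch of the two patterns, assuming model_dir is the weights directory that __main__ calls model_file:

    from keras.models import load_model, model_from_json

    def restore_models(model_dir):
        # single-file checkpoint: architecture and weights restored together, as above
        full = load_model(model_dir + 'model.json.hdf5')

        # split form used by the scripts below: architecture from JSON, weights loaded separately
        split = model_from_json(open(model_dir + 'model.json').read())
        split.load_weights(model_dir + 'model.json.hdf5')
        return full, split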
/src/sarcasm_detection_model_CNN_LSTM_DNN.py:
--------------------------------------------------------------------------------
1 | # for smaller datasets please use the simpler model sarcasm_detection_model_CNN_LSTM_DNN_simpler.py
2 |
3 | import os
4 | import sys
5 |
6 | sys.path.append('../')
7 |
8 | import collections
9 | import time
10 | import numpy
11 |
12 | numpy.random.seed(1337)
13 | from sklearn import metrics
14 | from keras.models import Model
15 | from keras.layers import Input
16 | from keras.models import Sequential, model_from_json
17 | from keras.layers.core import Dropout, Dense, Activation
18 | from keras.layers.embeddings import Embedding
19 | from keras.layers.recurrent import LSTM
20 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
21 | from keras.callbacks import ModelCheckpoint
22 | from keras.callbacks import EarlyStopping
23 | from keras.optimizers import Adam
24 | from keras.utils import np_utils
25 | from collections import defaultdict
26 | import src.data_processing.data_handler as dh
27 |
28 |
29 | class sarcasm_model():
30 | _train_file = None
31 | _test_file = None
32 | _tweet_file = None
33 | _output_file = None
34 | _model_file_path = None
35 | _word_file_path = None
36 | _split_word_file_path = None
37 | _emoji_file_path = None
38 | _vocab_file_path = None
39 | _input_weight_file_path = None
40 | _vocab = None
41 | _line_maxlen = None
42 |
43 | def __init__(self):
44 | self._line_maxlen = 30
45 |
46 | def _build_network(self, vocab_size, maxlen, emb_weights=[], embedding_dimension=256, hidden_units=256):
47 | print('Build model...')
48 |
49 | text_input = Input(name='text', shape=(maxlen,))
50 |
51 | if (len(emb_weights) == 0):
52 | emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen,
53 | embeddings_initializer='glorot_normal',
54 | trainable=True)(text_input)
55 | else:
56 | emb = Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
57 | trainable=False)(text_input)
58 |
59 | cnn1 = Convolution1D(int(hidden_units / 4), 3, kernel_initializer='he_normal', activation='sigmoid',
60 | padding='valid', input_shape=(1, maxlen))(emb)
61 |
62 | cnn2 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal', activation='sigmoid',
63 | padding='valid', input_shape=(1, maxlen - 1))(cnn1)
64 |
65 | lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
66 | dropout=0.25, return_sequences=True)(cnn2)
67 |
68 | lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
69 | dropout=0.25)(lstm1)
70 |
71 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(lstm2)
72 | dnn_2 = Dense(2, activation='softmax')(dnn_1)
73 |
74 | model = Model(inputs=[text_input], outputs=dnn_2)
75 |
76 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
77 | print('No of parameter:', model.count_params())
78 |
79 | print(model.summary())
80 | return model
81 |
82 |
83 | class train_model(sarcasm_model):
84 | train = None
85 | validation = None
86 | print("Loading resource...")
87 |
88 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
89 | vocab_file,
90 | output_file,
91 | word2vec_path=None):
92 | sarcasm_model.__init__(self)
93 |
94 | self._train_file = train_file
95 | self._validation_file = validation_file
96 | self._word_file_path = word_file_path
97 | self._split_word_file_path = split_word_path
98 | self._emoji_file_path = emoji_file_path
99 | self._model_file = model_file
100 | self._vocab_file_path = vocab_file
101 | self._output_file = output_file
102 |
103 | self.load_train_validation_data()
104 |
105 | print(self._line_maxlen)
106 |
107 | # build vocabulary
108 | # truncates words with min freq=1
109 | self._vocab = dh.build_vocab(self.train, min_freq=1)
110 | if ('unk' not in self._vocab):
111 | self._vocab['unk'] = len(self._vocab.keys()) + 1
112 |
113 | print(len(self._vocab.keys()) + 1)
114 | print('unk::', self._vocab['unk'])
115 |
116 | dh.write_vocab(self._vocab_file_path, self._vocab)
117 |
118 | # prepares input
119 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
120 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
121 |
122 | # prepares input
123 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
124 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
125 |
126 | # embedding dimension
127 | dimension_size = 300
128 |
129 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
130 | path=word2vec_path)
131 |
132 | # solving class imbalance
133 | ratio = self.calculate_label_ratio(Y)
134 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
135 | print('class ratio::', ratio)
136 |
137 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
138 |
139 | print('train_X', X.shape)
140 | print('train_Y', Y.shape)
141 | print('validation_X', tX.shape)
142 | print('validation_Y', tY.shape)
143 |
144 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=256, emb_weights=W)
145 |
146 | open(self._model_file + 'model.json', 'w').write(model.to_json())
147 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
148 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}.hdf5',
149 | save_best_only=False)
150 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
151 |
152 | # training
153 | model.fit(X, Y, batch_size=64, epochs=100, validation_data=(tX, tY), shuffle=True,
154 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio, verbose=2)
155 |
156 | def load_train_validation_data(self):
157 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
158 | self._emoji_file_path, normalize_text=True,
159 | split_hashtag=True,
160 | ignore_profiles=False)
161 | print('Training data loading finished...')
162 |
163 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
164 | self._emoji_file_path,
165 | normalize_text=True,
166 | split_hashtag=True,
167 | ignore_profiles=False)
168 | print('Validation data loading finished...')
169 |
170 | if (self._test_file != None):
171 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True,
172 | split_hashtag=True,
173 | ignore_profiles=True)
174 |
175 | def get_maxlen(self):
176 | return max(map(len, (x for _, x in self.train + self.validation)))
177 |
178 | def write_vocab(self):
179 | with open(self._vocab_file_path, 'w') as fw:
180 | for key, value in self._vocab.items():
181 | fw.write(str(key) + '\t' + str(value) + '\n')
182 |
183 | def calculate_label_ratio(self, labels):
184 | return collections.Counter(labels)
185 |
186 |
187 | class test_model(sarcasm_model):
188 | test = None
189 | model = None
190 |
191 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
192 | input_weight_file_path=None):
193 | print('initializing...')
194 | sarcasm_model.__init__(self)
195 |
196 | self._model_file_path = model_file
197 | self._word_file_path = word_file_path
198 | self._split_word_file_path = split_word_path
199 | self._emoji_file_path = emoji_file_path
200 | self._vocab_file_path = vocab_file_path
201 | self._output_file = output_file
202 | self._input_weight_file_path = input_weight_file_path
203 |
204 | print('test_maxlen', self._line_maxlen)
205 |
206 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'):
207 | start = time.time()
208 | self.__load_model(self._model_file_path + model_file, self._model_file_path + weight_file)
209 | end = time.time()
210 | print('model loading time::', (end - start))
211 |
212 | def __load_model(self, model_path, model_weight_path):
213 | self.model = model_from_json(open(model_path).read())
214 | print('model loaded from file...')
215 | self.model.load_weights(model_weight_path)
216 | print('model weights loaded from file...')
217 |
218 | def load_vocab(self):
219 | vocab = defaultdict()
220 | with open(self._vocab_file_path, 'r') as f:
221 | for line in f.readlines():
222 | key, value = line.split('\t')
223 | vocab[key] = value
224 |
225 | return vocab
226 |
227 | def predict(self, test_file, verbose=False):
228 | try:
229 | start = time.time()
230 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
231 | normalize_text=True, split_hashtag=True,
232 | ignore_profiles=False)
233 | end = time.time()
234 | if (verbose == True):
235 | print('test resource loading time::', (end - start))
236 |
237 | self._vocab = self.load_vocab()
238 | print('vocab loaded...')
239 |
240 | start = time.time()
241 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
242 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
243 | end = time.time()
244 | if (verbose == True):
245 | print('test resource preparation time::', (end - start))
246 |
247 | self.__predict_model(tX, self.test)
248 | except Exception as e:
249 | print('Error:', e)
250 | raise
251 |
252 | def __predict_model(self, tX, test):
253 | y = []
254 | y_pred = []
255 |
256 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
257 |
258 | try:
259 | fd = open(self._output_file + '.analysis', 'w')
260 | for i, (label) in enumerate(prediction_probability):
261 | gold_label = test[i][1]
262 | words = test[i][2]
263 | dimensions = test[i][3]
264 | context = test[i][4]
265 | author = test[i][5]
266 |
267 | predicted = numpy.argmax(prediction_probability[i])
268 |
269 | y.append(int(gold_label))
270 | y_pred.append(predicted)
271 |
272 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
273 | + str(gold_label) + '\t'
274 | + str(predicted) + '\t'
275 | + ' '.join(words))
276 |
277 | fd.write('\n')
278 |
279 | print()
280 |
281 | print('accuracy::', metrics.accuracy_score(y, y_pred))
282 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
283 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
284 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
285 | print('f_score::', metrics.classification_report(y, y_pred))
286 | fd.close()
287 | except Exception as e:
288 | print(e)
289 | raise
290 |
291 |
292 | if __name__ == "__main__":
293 | basepath = os.path.abspath(os.path.join(os.getcwd(), '..'))
294 | train_file = basepath + '/resource/train/Train_v1.txt'
295 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
296 | test_file = basepath + '/resource/test/Test_v1.txt'
297 | word_file_path = basepath + '/resource/word_list_freq.txt'
298 | split_word_path = basepath + '/resource/word_split.txt'
299 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
300 |
301 | output_file = basepath + '/resource/text_model/TestResults.txt'
302 | model_file = basepath + '/resource/text_model/weights/'
303 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
304 |
305 | word2vec_path = '/home/aghosh/backups/GoogleNews-vectors-negative300.bin'
306 |
307 | # uncomment for training
308 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
309 | # vocab_file_path, output_file)
310 |
311 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
312 | t.load_trained_model(weight_file='weights.05.hdf5')
313 | t.predict(test_file)
314 |
--------------------------------------------------------------------------------
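The train_model classes in these scripts derive per-class weights from a collections.Counter over the labels and pass the resulting list as class_weight. Keras documents class_weight as a dict mapping class index to weight, so the dict form is the safer one to rely on across versions; a minimal sketch, assuming integer class labels:

    from collections import Counter

    def class_weights(labels):
        # labels: iterable of int class ids, e.g. [0, 0, 1, 0]
        counts = Counter(labels)
        majority = max(counts.values())
        # rarer classes receive proportionally larger weights
        return {cls: majority / count for cls, count in counts.items()}

    # class_weights([0] * 800 + [1] * 200) -> {0: 1.0, 1: 4.0}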
/src/sarcasm_detection_model_CNN_LSTM_DNN_fasttext.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling1D
4 |
5 | sys.path.append('../')
6 |
7 | import collections
8 | import time
9 | import numpy
10 |
11 | numpy.random.seed(1337)
12 | from sklearn import metrics
13 | from keras.models import Sequential, model_from_json
14 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten
15 | from keras.layers.embeddings import Embedding
16 | from keras.layers.recurrent import LSTM
17 | from keras.layers.convolutional import Convolution1D, MaxPooling1D, Convolution2D
18 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
19 | from keras.callbacks import EarlyStopping
20 | from keras.optimizers import Adam
21 | from keras.utils import np_utils
22 | from collections import defaultdict
23 | import src.data_processing.data_handler as dh
24 |
25 |
26 | class sarcasm_model():
27 | _train_file = None
28 | _test_file = None
29 | _tweet_file = None
30 | _output_file = None
31 | _model_file = None
32 | _word_file_path = None
33 | _split_word_file_path = None
34 | _emoji_file_path = None
35 | _vocab_file_path = None
36 | _input_weight_file_path = None
37 | _vocab = None
38 | _line_maxlen = None
39 |
40 | def __init__(self):
41 | self._line_maxlen = 50
42 |
43 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False):
44 | print('Build model...')
45 | model = Sequential()
46 |
47 | model.add(
48 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal'))
49 |
50 | model.add(
51 | Convolution1D(hidden_units, 2, kernel_initializer='he_normal', padding='valid',
52 | activation='sigmoid'))
53 | model.add(MaxPooling1D(pool_size=2))
54 | model.add(Dropout(0.25))
55 |
56 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5,
57 | recurrent_dropout=0.5, unroll=True, return_sequences=True))
58 |
59 | model.add(GlobalAveragePooling1D())
60 | model.add(Dropout(0.5))
61 |
62 | model.add(Dense(2))
63 | model.add(Activation('softmax'))
64 | adam = Adam(lr=0.001)
65 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
66 | print('No of parameter:', model.count_params())
67 |
68 | print(model.summary())
69 | return model
70 |
71 |
72 | class train_model(sarcasm_model):
73 | train = None
74 | validation = None
75 | print("Loading resource...")
76 |
77 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
78 | vocab_file,
79 | output_file,
80 | word2vec_path=None):
81 | sarcasm_model.__init__(self)
82 |
83 | self._train_file = train_file
84 | self._validation_file = validation_file
85 | self._word_file_path = word_file_path
86 | self._split_word_file_path = split_word_path
87 | self._emoji_file_path = emoji_file_path
88 | self._model_file = model_file
89 | self._vocab_file_path = vocab_file
90 | self._output_file = output_file
91 | self._input_weight_file_path = None  # no separate input weight file is used by this trainer
92 |
93 | self.load_train_validation_data()
94 |
95 | print(self._line_maxlen)
96 |
97 | # build vocabulary
98 | # truncates words with min freq=2
99 | self._vocab = dh.build_vocab(self.train, min_freq=2)
100 | if ('unk' not in self._vocab):
101 | self._vocab['unk'] = len(self._vocab.keys()) + 1
102 |
103 | print(len(self._vocab.keys()) + 1)
104 | print('unk::', self._vocab['unk'])
105 |
106 | dh.write_vocab(self._vocab_file_path, self._vocab)
107 |
108 | # prepares input
109 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
110 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
111 |
112 | # prepares input
113 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
114 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
115 |
116 | # embedding dimension
117 | dimension_size = 30
118 |
119 | W = dh.get_fasttext_weight(self._vocab, n=dimension_size,
120 | path=word2vec_path)
121 |
122 | # solving class imbalance
123 | ratio = self.calculate_label_ratio(Y)
124 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
125 | print('class ratio::', ratio)
126 |
127 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
128 |
129 | print('train_X', X.shape)
130 | print('train_Y', Y.shape)
131 | print('validation_X', tX.shape)
132 | print('validation_Y', tY.shape)
133 |
134 | # NOTE: the fasttext matrix W computed above is not passed to _build_network, so the embedding layer here is trained from scratch
135 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=128,
136 | embedding_dimension=dimension_size,
137 | trainable=True)
138 |
139 | open(self._model_file + 'model.json', 'w').write(model.to_json())
140 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
141 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
142 | save_best_only=False)
143 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
144 | lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto',
145 | epsilon=0.0001,
146 | cooldown=0, min_lr=0.000001)
147 |
148 | # training
149 | # model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True,
150 | # callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
151 | model.fit(X, Y, batch_size=32, epochs=100, validation_split=0.1, shuffle=True,
152 | callbacks=[save_best, lr_tuner, early_stopping], class_weight=ratio)
153 |
154 | def load_train_validation_data(self):
155 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
156 | self._emoji_file_path, normalize_text=True,
157 | split_hashtag=True,
158 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
159 | print('Training data loading finished...')
160 |
161 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
162 | self._emoji_file_path,
163 | normalize_text=True,
164 | split_hashtag=False,
165 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
166 | print('Validation data loading finished...')
167 |
168 | def get_maxlen(self):
169 | return max(map(len, (x for _, x in self.train + self.validation)))
170 |
171 | def write_vocab(self):
172 | with open(self._vocab_file_path, 'w') as fw:
173 | for key, value in self._vocab.items():
174 | fw.write(str(key) + '\t' + str(value) + '\n')
175 |
176 | def calculate_label_ratio(self, labels):
177 | return collections.Counter(labels)
178 |
179 |
180 | class test_model(sarcasm_model):
181 | test = None
182 | model = None
183 |
184 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
185 | input_weight_file_path=None):
186 | print('initializing...')
187 | sarcasm_model.__init__(self)
188 |
189 | self._model_file = model_file
190 | self._word_file_path = word_file_path
191 | self._split_word_file_path = split_word_path
192 | self._emoji_file_path = emoji_file_path
193 | self._vocab_file_path = vocab_file_path
194 | self._output_file = output_file
195 | self._input_weight_file_path = input_weight_file_path
196 |
197 | print('test_maxlen', self._line_maxlen)
198 |
199 | def load_trained_model(self, weight_file='model.json.hdf5'):
200 | start = time.time()
201 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
202 | end = time.time()
203 | print('model loading time::', (end - start))
204 |
205 | def __load_model(self, model_path, model_weight_path):
206 | self.model = model_from_json(open(model_path).read())
207 | print('model loaded from file...')
208 | self.model.load_weights(model_weight_path)
209 | print('model weights loaded from file...')
210 |
211 | def load_vocab(self):
212 | vocab = defaultdict()
213 | with open(self._vocab_file_path, 'r') as f:
214 | for line in f.readlines():
215 | key, value = line.split('\t')
216 | vocab[key] = value
217 |
218 | return vocab
219 |
220 | def predict(self, test_file, verbose=False):
221 | try:
222 | start = time.time()
223 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
224 | normalize_text=True, split_hashtag=True,
225 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
226 | end = time.time()
227 | if (verbose == True):
228 | print('test resource loading time::', (end - start))
229 |
230 | self._vocab = self.load_vocab()
231 | print('vocab loaded...')
232 |
233 | start = time.time()
234 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
235 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
236 | end = time.time()
237 | if (verbose == True):
238 | print('test resource preparation time::', (end - start))
239 |
240 | self.__predict_model(tX, self.test)
241 | except Exception as e:
242 | print('Error:', e)
243 |
244 | def __predict_model(self, tX, test):
245 | y = []
246 | y_pred = []
247 |
248 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
249 |
250 | try:
251 | fd = open(self._output_file + '.analysis', 'w')
252 | for i, (label) in enumerate(prediction_probability):
253 | id = test[i][0]
254 | gold_label = test[i][1]
255 | words = test[i][2]
256 | dimensions = test[i][3]
257 | context = test[i][4]
258 | author = test[i][5]
259 |
260 | predicted = numpy.argmax(prediction_probability[i])
261 |
262 | y.append(int(gold_label))
263 | y_pred.append(predicted)
264 |
265 | # fd.write(str(id) + '\t' + str(label[0]) + '\t' + str(label[1]) + '\t'
266 | # + str(gold_label) + '\t'
267 | # + str(predicted) + '\t'
268 | # + ' '.join(words))
269 | fd.write(str(id) + ',' + ','.join([str(l) for l in label]) + '\n')
270 |
271 | print()
272 |
273 | print('accuracy::', metrics.accuracy_score(y, y_pred))
274 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
275 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
276 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
277 | print('f_score::', metrics.classification_report(y, y_pred))
278 | fd.close()
279 | except Exception as e:
280 | print(e)
281 |
282 |
283 | if __name__ == "__main__":
284 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
285 | train_file = basepath + '/resource/train/spooky_train.tsv'
286 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
287 | test_file = basepath + '/resource/test/spooky_test.tsv'
288 | word_file_path = basepath + '/resource/word_list_freq.txt'
289 | split_word_path = basepath + '/resource/word_split.txt'
290 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
291 |
292 | output_file = basepath + '/resource/text_model/TestResults.txt'
293 | model_file = basepath + '/resource/text_model/weights/'
294 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
295 |
296 | # fasttext model path
297 | fasttext_path = '/home/fasttext/en.wiki.bin'
298 |
299 | # training
300 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
301 | vocab_file_path, output_file, fasttext_path)
302 |
303 | # t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
304 | # t.load_trained_model()
305 | # t.predict(test_file)
306 |
--------------------------------------------------------------------------------
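The script above builds the fasttext matrix W but never hands it to _build_network; sarcasm_detection_model_CNN_LSTM_DNN.py above shows the usual way of seeding an Embedding layer with such a matrix. A minimal sketch of that wiring, assuming W has one row per vocabulary index (row 0 reserved for padding):

    from keras.layers.embeddings import Embedding

    def pretrained_embedding(W, maxlen, trainable=False):
        # W: numpy array of shape (vocab_size, embedding_dim)
        vocab_size, dim = W.shape
        return Embedding(vocab_size, dim, input_length=maxlen,
                         weights=[W],          # initialise with the pretrained vectors
                         trainable=trainable)  # True to fine-tune them during training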
/src/sarcasm_detection_model_CNN_LSTM_DNN_simpler.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append('../')
5 |
6 | import collections
7 | import time
8 | import numpy
9 |
10 | numpy.random.seed(1337)
11 | from sklearn import metrics
12 | from keras.models import Sequential, model_from_json
13 | from keras.layers.core import Dropout, Dense, Activation
14 | from keras.layers.embeddings import Embedding
15 | from keras.layers.recurrent import LSTM
16 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
17 | from keras.callbacks import ModelCheckpoint
18 | from keras.callbacks import EarlyStopping
19 | from keras.optimizers import Adam
20 | from keras.utils import np_utils
21 | from collections import defaultdict
22 | import src.data_processing.data_handler as dh
23 |
24 |
25 | class sarcasm_model():
26 | _train_file = None
27 | _test_file = None
28 | _tweet_file = None
29 | _output_file = None
30 | _model_file_path = None
31 | _word_file_path = None
32 | _split_word_file_path = None
33 | _emoji_file_path = None
34 | _vocab_file_path = None
35 | _input_weight_file_path = None
36 | _vocab = None
37 | _line_maxlen = None
38 |
39 | def __init__(self):
40 | self._line_maxlen = 30
41 |
42 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False):
43 | print('Build model...')
44 | model = Sequential()
45 |
46 | model.add(
47 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal'))
48 |
49 | model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='sigmoid',
50 | input_shape=(1, maxlen)))
51 | model.add(MaxPooling1D(pool_size=3))
52 | model.add(Dropout(0.25))
53 |
54 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5))
55 | model.add(Dropout(0.25))
56 |
57 | model.add(Dense(hidden_units, kernel_initializer='he_normal', activation='sigmoid'))
58 | model.add(Dropout(0.25))
59 |
60 | model.add(Dense(2))
61 | model.add(Activation('softmax'))
62 | adam = Adam(lr=0.0001)
63 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
64 | print('No of parameter:', model.count_params())
65 |
66 | print(model.summary())
67 | return model
68 |
69 |
70 | class train_model(sarcasm_model):
71 | train = None
72 | validation = None
73 | print("Loading resource...")
74 |
75 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
76 | vocab_file,
77 | output_file,
78 | input_weight_file_path=None):
79 | sarcasm_model.__init__(self)
80 |
81 | self._train_file = train_file
82 | self._validation_file = validation_file
83 | self._word_file_path = word_file_path
84 | self._split_word_file_path = split_word_path
85 | self._emoji_file_path = emoji_file_path
86 | self._model_file = model_file
87 | self._vocab_file_path = vocab_file
88 | self._output_file = output_file
89 | self._input_weight_file_path = input_weight_file_path
90 |
91 | self.load_train_validation_data()
92 |
93 | print(self._line_maxlen)
94 |
95 | # build vocabulary
96 | # truncates words with min freq=1
97 | self._vocab = dh.build_vocab(self.train, min_freq=1)
98 | if ('unk' not in self._vocab):
99 | self._vocab['unk'] = len(self._vocab.keys()) + 1
100 |
101 | print(len(self._vocab.keys()) + 1)
102 | print('unk::', self._vocab['unk'])
103 |
104 | dh.write_vocab(self._vocab_file_path, self._vocab)
105 |
106 | # prepares input
107 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
108 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
109 |
110 | # prepares input
111 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
112 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
113 |
114 | # embedding dimension
115 | dimension_size = 256
116 |
117 | # solving class imbalance
118 | ratio = self.calculate_label_ratio(Y)
119 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
120 | print('class ratio::', ratio)
121 |
122 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
123 |
124 | print('train_X', X.shape)
125 | print('train_Y', Y.shape)
126 | print('validation_X', tX.shape)
127 | print('validation_Y', tY.shape)
128 |
129 | # trainable true if you want word2vec weights to be updated
130 | # Not applicable in this code
131 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, embedding_dimension=dimension_size,
132 | trainable=True)
133 |
134 | open(self._model_file + 'model.json', 'w').write(model.to_json())
135 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
136 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
137 | save_best_only=False)
138 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
139 |
140 | # training
141 | model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True,
142 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
143 |
144 | def load_train_validation_data(self):
145 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
146 | self._emoji_file_path, normalize_text=True,
147 | split_hashtag=True,
148 | ignore_profiles=False)
149 | print('Training data loading finished...')
150 |
151 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
152 | self._emoji_file_path,
153 | normalize_text=True,
154 | split_hashtag=True,
155 | ignore_profiles=False)
156 | print('Validation data loading finished...')
157 |
158 | def get_maxlen(self):
159 | return max(map(len, (x for _, x in self.train + self.validation)))
160 |
161 | def write_vocab(self):
162 | with open(self._vocab_file_path, 'w') as fw:
163 | for key, value in self._vocab.items():
164 | fw.write(str(key) + '\t' + str(value) + '\n')
165 |
166 | def calculate_label_ratio(self, labels):
167 | return collections.Counter(labels)
168 |
169 |
170 | class test_model(sarcasm_model):
171 | test = None
172 | model = None
173 |
174 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
175 | input_weight_file_path=None):
176 | print('initializing...')
177 | sarcasm_model.__init__(self)
178 |
179 | self._model_file_path = model_file
180 | self._word_file_path = word_file_path
181 | self._split_word_file_path = split_word_path
182 | self._emoji_file_path = emoji_file_path
183 | self._vocab_file_path = vocab_file_path
184 | self._output_file = output_file
185 | self._input_weight_file_path = input_weight_file_path
186 |
187 | print('test_maxlen', self._line_maxlen)
188 |
189 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'):
190 | start = time.time()
191 | self.__load_model(self._model_file_path + model_file, self._model_file_path + weight_file)
192 | end = time.time()
193 | print('model loading time::', (end - start))
194 |
195 | def __load_model(self, model_path, model_weight_path):
196 | self.model = model_from_json(open(model_path).read())
197 | print('model loaded from file...')
198 | self.model.load_weights(model_weight_path)
199 | print('model weights loaded from file...')
200 |
201 | def load_vocab(self):
202 | vocab = defaultdict()
203 | with open(self._vocab_file_path, 'r') as f:
204 | for line in f.readlines():
205 | key, value = line.split('\t')
206 | vocab[key] = value
207 |
208 | return vocab
209 |
210 | def interactive(self, word_file_path, split_word_path, emoji_file_path):
211 | word_list, emoji_dict, split_word_list, abbreviation_dict = dh.load_resources(word_file_path, split_word_path,
212 | emoji_file_path,
213 | split_hashtag=True)
214 | self._vocab = self.load_vocab()
215 | text = ''
216 | while (text != 'exit'):
217 | text = input('Enter a query::')
218 | data = dh.parsedata(['{}\t{}\t{}'.format('id', -1, text)], word_list, split_word_list, emoji_dict,
219 | abbreviation_dict, normalize_text=True,
220 | split_hashtag=True,
221 | ignore_profiles=False)
222 |
223 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(data, self._vocab)
224 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
225 | print(self.__predict_line(tX))
226 |
227 | def predict_file(self, test_file, verbose=False):
228 | try:
229 | start = time.time()
230 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
231 | normalize_text=True, split_hashtag=True,
232 | ignore_profiles=False)
233 | end = time.time()
234 | if (verbose == True):
235 | print('test resource loading time::', (end - start))
236 |
237 | self._vocab = self.load_vocab()
238 | print('vocab loaded...')
239 |
240 | start = time.time()
241 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
242 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
243 | end = time.time()
244 | if (verbose == True):
245 | print('test resource preparation time::', (end - start))
246 |
247 | self.__predict_model(tX, self.test)
248 | except Exception as e:
249 | print('Error:', e)
250 | raise
251 |
252 | def __predict_line(self, tX):
253 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
254 | predicted = numpy.argmax(prediction_probability[0])
255 | return predicted, prediction_probability
256 |
257 | def __predict_model(self, tX, test):
258 | y = []
259 | y_pred = []
260 |
261 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
262 |
263 | try:
264 | fd = open(self._output_file + '.analysis', 'w')
265 | for i, (label) in enumerate(prediction_probability):
266 | gold_label = test[i][1]
267 | words = test[i][2]
268 | dimensions = test[i][3]
269 | context = test[i][4]
270 | author = test[i][5]
271 |
272 | predicted = numpy.argmax(prediction_probability[i])
273 |
274 | y.append(int(gold_label))
275 | y_pred.append(predicted)
276 |
277 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
278 | + str(gold_label) + '\t'
279 | + str(predicted) + '\t'
280 | + ' '.join(words))
281 |
282 | fd.write('\n')
283 |
284 | print()
285 |
286 | print('accuracy::', metrics.accuracy_score(y, y_pred))
287 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
288 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
289 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
290 | print('f_score::', metrics.classification_report(y, y_pred))
291 | fd.close()
292 | except Exception as e:
293 | print(e)
294 | raise
295 |
296 |
297 | if __name__ == "__main__":
298 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
299 | train_file = basepath + '/resource/train/Train_v1.txt'
300 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
301 | test_file = basepath + '/resource/test/Test_v1.txt'
302 | word_file_path = basepath + '/resource/word_list_freq.txt'
303 | split_word_path = basepath + '/resource/word_split.txt'
304 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
305 |
306 | output_file = basepath + '/resource/text_model/TestResults.txt'
307 | model_file = basepath + '/resource/text_model/weights/'
308 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
309 |
310 | # uncomment for training
311 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
312 | # vocab_file_path, output_file)
313 |
314 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
315 | t.load_trained_model()
316 | # t.predict_file(test_file)
317 | t.interactive(word_file_path, split_word_path, emoji_file_path)
318 |
--------------------------------------------------------------------------------
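dh.write_vocab and the write_vocab methods in these scripts store the vocabulary as one word<TAB>index line per entry, while load_vocab reads the index back as a raw string that still carries the trailing newline. A minimal symmetric reader that restores integer indices, assuming that tab-separated format:

    def load_vocab(vocab_file_path):
        vocab = {}
        with open(vocab_file_path, 'r') as f:
            for line in f:
                key, value = line.rstrip('\n').split('\t')
                vocab[key] = int(value)  # index back as an int, newline stripped
        return vocab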
/src/sarcasm_detection_model_CNN_LSTM_DNN_word2vec.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append('../')
5 | import collections
6 | import time
7 | import numpy
8 |
9 | numpy.random.seed(1337)
10 | from sklearn import metrics
11 | from keras.models import Sequential, model_from_json
12 | from keras.layers.core import Dropout, Dense, Activation, Flatten, Reshape
13 | from keras.layers.embeddings import Embedding
14 | from keras.layers.recurrent import LSTM
15 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
16 | from keras.callbacks import ModelCheckpoint
17 | from keras.callbacks import EarlyStopping
18 | from keras.optimizers import Adam
19 | from keras.utils import np_utils
20 | from collections import defaultdict
21 | import src.data_processing.data_handler as dh
22 |
23 |
24 | class sarcasm_model():
25 | _train_file = None
26 | _test_file = None
27 | _tweet_file = None
28 | _output_file = None
29 | _model_file = None
30 | _word_file_path = None
31 | _vocab_file_path = None
32 | _vocab = None
33 | _line_maxlen = None
34 |
35 | def __init__(self):
36 | self._line_maxlen = 30
37 |
38 | def _build_network(self, vocab_size, maxlen, emb_weights=[], hidden_units=256, trainable=False):
39 | print('Build model...')
40 | model = Sequential()
41 |
42 | model.add(Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
43 | trainable=trainable))
44 |
45 | # model.add(Reshape((maxlen, emb_weights.shape[1], 1)))
46 |
47 | model.add(Convolution1D(emb_weights.shape[1], 3, kernel_initializer='he_normal', padding='valid',
48 | activation='sigmoid',
49 | input_shape=(1, maxlen)))
50 | # model.add(MaxPooling1D(pool_size=3))
51 |
52 | model.add(Convolution1D(emb_weights.shape[1], 3, kernel_initializer='he_normal', padding='valid',
53 | activation='sigmoid',
54 | input_shape=(1, maxlen - 2)))
55 | # model.add(MaxPooling1D(pool_size=3))
56 |
57 | model.add(Dropout(0.25))
58 |
59 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5,
60 | return_sequences=True))
61 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5))
62 |
63 | model.add(Dense(hidden_units, kernel_initializer='he_normal', activation='sigmoid'))
64 | model.add(Dense(2, activation='softmax'))
65 | adam = Adam(lr=0.0001)
66 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
67 | print('No of parameter:', model.count_params())
68 |
69 | print(model.summary())
70 | return model
71 |
72 |
73 | class train_model(sarcasm_model):
74 | train = None
75 | validation = None
76 | print("Loading resource...")
77 |
78 | def __init__(self, train_file, validation_file, word_file_path, model_file, vocab_file, output_file,
79 | word2vec_path=None, test_file=None):
80 |
81 | sarcasm_model.__init__(self)
82 |
83 | self._train_file = train_file
84 | self._validation_file = validation_file
85 | self._word_file_path = word_file_path
86 | self._model_file = model_file
87 | self._vocab_file_path = vocab_file
88 | self._output_file = output_file
89 | self._test_file = test_file
90 |
91 | self.load_train_validation_test_data()
92 |
93 | print(self._line_maxlen)
94 |
95 | # build vocabulary
96 | if (self._test_file != None):
97 | self._vocab = dh.build_vocab(self.train + self.validation + self.test, min_freq=2)
98 | else:
99 | self._vocab = dh.build_vocab(self.train + self.validation, min_freq=2)
100 |
101 | self._vocab['unk'] = len(self._vocab.keys()) + 1
102 |
103 | print(len(self._vocab.keys()) + 1)
104 | print('unk::', self._vocab['unk'])
105 |
106 | dh.write_vocab(self._vocab_file_path, self._vocab)
107 |
108 | # prepares input
109 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
110 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
111 |
112 | # prepares input
113 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
114 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
115 |
116 | # embedding dimension
117 | W = dh.get_word2vec_weight(self._vocab, n=300,
118 | path=word2vec_path)
119 |
120 | # solving class imbalance
121 | ratio = self.calculate_label_ratio(Y)
122 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
123 | print('class ratio::', ratio)
124 |
125 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
126 |
127 | print('train_X', X.shape)
128 | print('train_Y', Y.shape)
129 | print('validation_X', tX.shape)
130 | print('validation_Y', tY.shape)
131 |
132 | # trainable true if you want word2vec weights to be updated
133 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, trainable=False)
134 |
135 | open(self._model_file + 'model_wv.json', 'w').write(model.to_json())
136 | save_best = ModelCheckpoint(model_file + 'model_wv.json.hdf5', save_best_only=True)
137 | # save_all = ModelCheckpoint(self._model_file + 'weights_wv.{epoch:02d}.hdf5',
138 | # save_best_only=False)
139 | # early_stopping = EarlyStopping(monitor='val_loss', patience=25, verbose=1)
140 |
141 | # training
142 | model.fit(X, Y, batch_size=8, epochs=100, validation_data=(tX, tY), shuffle=True,
143 | callbacks=[save_best], class_weight=ratio)
144 |
145 | def load_train_validation_test_data(self):
146 | self.train = dh.loaddata(self._train_file, self._word_file_path, normalize_text=True,
147 | split_hashtag=True,
148 | ignore_profiles=False)
149 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, normalize_text=True,
150 | split_hashtag=True,
151 | ignore_profiles=False)
152 | if (self._test_file != None):
153 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True,
154 | split_hashtag=True,
155 | ignore_profiles=True)
156 |
157 | def get_maxlen(self):
158 | return max(map(len, (x for _, x in self.train + self.validation)))
159 |
160 | def write_vocab(self):
161 | with open(self._vocab_file_path, 'w') as fw:
162 | for key, value in self._vocab.items():
163 | fw.write(str(key) + '\t' + str(value) + '\n')
164 |
165 | def calculate_label_ratio(self, labels):
166 | return collections.Counter(labels)
167 |
168 |
169 | class test_model(sarcasm_model):
170 | test = None
171 | model = None
172 |
173 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file, input_weight_file_path=None):
174 | print('initializing...')
175 | sarcasm_model.__init__(self)
176 |
177 | self._word_file_path = word_file_path
178 | self._model_file = model_file
179 | self._vocab_file_path = vocab_file_path
180 | self._output_file = output_file
181 | self._input_weight_file_path = input_weight_file_path
182 |
183 | print('test_maxlen', self._line_maxlen)
184 |
185 | def load_trained_model(self, weight_file='model_wv.json.hdf5'):
186 | start = time.time()
187 | self.__load_model(self._model_file + 'model_wv.json', self._model_file + weight_file)
188 | end = time.time()
189 | print('model loading time::', (end - start))
190 |
191 | def __load_model(self, model_path, model_weight_path):
192 | self.model = model_from_json(open(model_path).read())
193 | print('model loaded from file...')
194 | self.model.load_weights(model_weight_path)
195 | print('model weights loaded from file...')
196 |
197 | def load_vocab(self):
198 | vocab = defaultdict()
199 | with open(self._vocab_file_path, 'r') as f:
200 | for line in f.readlines():
201 | key, value = line.split('\t')
202 | vocab[key] = value
203 |
204 | return vocab
205 |
206 | def predict(self, test_file, verbose=False):
207 | try:
208 | start = time.time()
209 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True, split_hashtag=True,
210 | ignore_profiles=True)
211 | end = time.time()
212 | if (verbose == True):
213 | print('test resource loading time::', (end - start))
214 |
215 | self._vocab = self.load_vocab()
216 |
217 | start = time.time()
218 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
219 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
220 | end = time.time()
221 | if (verbose == True):
222 | print('test resource preparation time::', (end - start))
223 |
224 | self.__predict_model(tX, self.test)
225 | except Exception as e:
226 | print('Error:', e)
227 |
228 | def __predict_model(self, tX, test):
229 | y = []
230 | y_pred = []
231 |
232 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
233 |
234 | try:
235 | fd = open(self._output_file + '_wv.analysis', 'w')
236 | for i, (label) in enumerate(prediction_probability):
237 | gold_label = test[i][0]
238 | words = test[i][1]
239 | dimensions = test[i][2]
240 | context = test[i][3]
241 | author = test[i][4]
242 |
243 | predicted = numpy.argmax(prediction_probability[i])
244 |
245 | y.append(int(gold_label))
246 | y_pred.append(predicted)
247 |
248 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
249 | + str(gold_label) + '\t'
250 | + str(predicted) + '\t'
251 | + ' '.join(words))
252 |
253 | fd.write('\n')
254 |
255 | print()
256 |
257 | print('accuracy::', metrics.accuracy_score(y, y_pred))
258 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
259 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
260 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
261 | print('f_score::', metrics.classification_report(y, y_pred))
262 | fd.close()
263 | except Exception as e:
264 | print(e)
265 |
266 |
267 | if __name__ == "__main__":
268 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
269 | train_file = basepath + '/resource/train/Train_v1.txt'
270 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
271 | test_file = basepath + '/resource/test/Test_v1.txt'
272 | word_file_path = basepath + '/resource/word_list.txt'
273 |
274 | output_file = basepath + '/resource/text_model/TestResults.txt'
275 | model_file = basepath + '/resource/text_model/weights/'
276 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
277 |
278 | # word2vec path
279 | word2vec_path = '/home/striker/word2vec/GoogleNews-vectors-negative300.bin'
280 |
281 | tr = train_model(train_file, validation_file, word_file_path, model_file, vocab_file_path, output_file,
282 | word2vec_path=word2vec_path, test_file=test_file)
283 |
284 | t = test_model(word_file_path, model_file, vocab_file_path, output_file)
285 | t.load_trained_model()
286 | t.predict(test_file)
287 |
--------------------------------------------------------------------------------
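dh.get_word2vec_weight is expected to return a matrix with one 300-dimensional row per vocabulary index, filled from the GoogleNews binary referenced in __main__. A minimal sketch of such a loader built on gensim's KeyedVectors; the helper name and the zero-row fallback are assumptions, not the repository's implementation:

    import numpy
    from gensim.models import KeyedVectors

    def build_embedding_matrix(vocab, path, n=300):
        kv = KeyedVectors.load_word2vec_format(path, binary=True)
        W = numpy.zeros((len(vocab) + 1, n))  # row 0 stays zero for padding
        for word, idx in vocab.items():
            if word in kv:
                W[idx] = kv[word]  # copy the pretrained vector
            # out-of-vocabulary words keep the zero row
        return W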
/src/sarcasm_detection_model_attention.py:
--------------------------------------------------------------------------------
1 | # still working
2 | import os
3 | import sys
4 | from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling1D
5 |
6 | sys.path.append('../')
7 |
8 | import collections
9 | import time
10 | import numpy
11 |
12 | numpy.random.seed(1337)
13 | from sklearn import metrics
14 | from keras import initializers, regularizers, constraints, Input
15 | from keras.models import Sequential, model_from_json
16 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten, Layer
17 | from keras.layers.embeddings import Embedding
18 | from keras.layers.recurrent import LSTM
19 | from keras.layers.convolutional import Convolution1D, MaxPooling1D, Convolution2D
20 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
21 | from keras.callbacks import EarlyStopping
22 | from keras.optimizers import Adam
23 | from keras.utils import np_utils
24 | from collections import defaultdict
25 | import src.data_processing.data_handler as dh
26 |
27 | from keras import backend as K
28 |
29 |
30 | class Attention(Layer):
31 | def __init__(self,
32 | W_regularizer=None, b_regularizer=None,
33 | W_constraint=None, b_constraint=None,
34 | bias=True, **kwargs):
35 | """
36 | Keras Layer that implements an Attention mechanism for temporal data.
37 | Supports Masking.
38 | Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
39 | # Input shape
40 | 3D tensor with shape: `(samples, steps, features)`.
41 | # Output shape
42 | 2D tensor with shape: `(samples, features)`.
43 | :param kwargs:
44 | Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
45 | The dimensions are inferred based on the output shape of the RNN.
46 | Note: The layer has been tested with Keras 2.0.6
47 | Example:
48 | model.add(LSTM(64, return_sequences=True))
49 | model.add(Attention())
50 | # next add a Dense layer (for classification/regression) or whatever...
51 | """
52 | self.supports_masking = True
53 | self.init = initializers.get('glorot_uniform')
54 |
55 | self.W_regularizer = regularizers.get(W_regularizer)
56 | self.b_regularizer = regularizers.get(b_regularizer)
57 |
58 | self.W_constraint = constraints.get(W_constraint)
59 | self.b_constraint = constraints.get(b_constraint)
60 |
61 | self.bias = bias
62 | super(Attention, self).__init__(**kwargs)
63 |
64 | def build(self, input_shape):
65 | assert len(input_shape) == 3
66 |
67 | self.W = self.add_weight((input_shape[-1],),
68 | initializer=self.init,
69 | name='{}_W'.format(self.name),
70 | regularizer=self.W_regularizer,
71 | constraint=self.W_constraint)
72 | if self.bias:
73 | self.b = self.add_weight((input_shape[1],),
74 | initializer='zero',
75 | name='{}_b'.format(self.name),
76 | regularizer=self.b_regularizer,
77 | constraint=self.b_constraint)
78 | else:
79 | self.b = None
80 |
81 | self.built = True
82 |
83 | def compute_mask(self, input, input_mask=None):
84 | # do not pass the mask to the next layers
85 | return None
86 |
87 | def call(self, x, mask=None):
88 | eij = K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1)
89 |
90 | if self.bias:
91 | eij += self.b
92 |
93 | eij = K.tanh(eij)
94 |
95 | a = K.exp(eij)
96 |
97 | # apply mask after the exp. will be re-normalized next
98 | if mask is not None:
99 | # Cast the mask to floatX to avoid float64 upcasting in theano
100 | a *= K.cast(mask, K.floatx())
101 |
102 | # in some cases especially in the early stages of training the sum may be almost zero
103 | # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
104 | # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
105 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
106 |
107 | a = K.expand_dims(a)
108 |
109 | weighted_input = x * a
110 | return K.sum(weighted_input, axis=1)
111 |
112 | def compute_output_shape(self, input_shape):
113 | return (input_shape[0], input_shape[-1])
114 |
115 |
116 | class sarcasm_model():
117 | _train_file = None
118 | _test_file = None
119 | _tweet_file = None
120 | _output_file = None
121 | _model_file = None
122 | _word_file_path = None
123 | _split_word_file_path = None
124 | _emoji_file_path = None
125 | _vocab_file_path = None
126 | _input_weight_file_path = None
127 | _vocab = None
128 | _line_maxlen = None
129 |
130 | def __init__(self):
131 | self._line_maxlen = 50
132 |
133 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False):
134 | print('Build model...')
135 | model = Sequential()
136 |
137 | # input = Input(shape=(maxlen,))
138 |
139 | # emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal')(input)
140 |
141 | model.add(
142 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal'))
143 |
144 | model.add(
145 | Convolution1D(hidden_units, 2, kernel_initializer='he_normal', padding='valid',
146 | activation='sigmoid'))
147 | model.add(MaxPooling1D(pool_size=2))
148 | model.add(Dropout(0.25))
149 |
150 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5,
151 | recurrent_dropout=0.5, unroll=True, return_sequences=True))
152 |
153 | model.add(Attention())
154 |
155 | # model.add(GlobalAveragePooling1D())
156 | # model.add(Dropout(0.5))
157 |
158 | model.add(Dense(2))
159 | model.add(Activation('softmax'))
160 | adam = Adam(lr=0.001)
161 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
162 | print('No of parameter:', model.count_params())
163 |
164 | print(model.summary())
165 | return model
166 |
167 |
168 | class train_model(sarcasm_model):
169 | train = None
170 | validation = None
171 | print("Loading resource...")
172 |
173 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
174 | vocab_file,
175 | output_file,
176 | input_weight_file_path=None):
177 | sarcasm_model.__init__(self)
178 |
179 | self._train_file = train_file
180 | self._validation_file = validation_file
181 | self._word_file_path = word_file_path
182 | self._split_word_file_path = split_word_path
183 | self._emoji_file_path = emoji_file_path
184 | self._model_file = model_file
185 | self._vocab_file_path = vocab_file
186 | self._output_file = output_file
187 | self._input_weight_file_path = input_weight_file_path
188 |
189 | self.load_train_validation_data()
190 |
191 | print(self._line_maxlen)
192 |
193 | # build vocabulary
194 | # truncates words with min freq=2
195 | self._vocab = dh.build_vocab(self.train, min_freq=2)
196 | if ('unk' not in self._vocab):
197 | self._vocab['unk'] = len(self._vocab.keys()) + 1
198 |
199 | print(len(self._vocab.keys()) + 1)
200 | print('unk::', self._vocab['unk'])
201 |
202 | dh.write_vocab(self._vocab_file_path, self._vocab)
203 |
204 | # prepares input
205 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
206 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
207 |
208 | # prepares input
209 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
210 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
211 |
212 | # embedding dimension
213 | dimension_size = 30
214 |
215 | # solving class imbalance
216 | ratio = self.calculate_label_ratio(Y)
217 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
218 | print('class ratio::', ratio)
219 |
220 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
221 |
222 | print('train_X', X.shape)
223 | print('train_Y', Y.shape)
224 | print('validation_X', tX.shape)
225 | print('validation_Y', tY.shape)
226 |
227 | # no pretrained embedding weights are used in this script; the embedding layer is learned from scratch
228 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=128,
229 | embedding_dimension=dimension_size,
230 | trainable=True)
231 |
232 | open(self._model_file + 'model.json', 'w').write(model.to_json())
233 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
234 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
235 | save_best_only=False)
236 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
237 | lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto',
238 | epsilon=0.0001,
239 | cooldown=0, min_lr=0.000001)
240 |
241 | # training
242 | model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True, verbose=2,
243 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
244 | # model.fit(X, Y, batch_size=32, epochs=100, validation_split=0.1, shuffle=True, verbose=1,
245 | # callbacks=[save_best, lr_tuner, early_stopping], class_weight=ratio)
246 |
247 | def load_train_validation_data(self):
248 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
249 | self._emoji_file_path, normalize_text=True,
250 | split_hashtag=True,
251 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
252 | print('Training data loading finished...')
253 |
254 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
255 | self._emoji_file_path,
256 | normalize_text=True,
257 | split_hashtag=False,
258 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
259 | print('Validation data loading finished...')
260 |
261 | def get_maxlen(self):
262 | return max(map(len, (x for _, x in self.train + self.validation)))
263 |
264 | def write_vocab(self):
265 | with open(self._vocab_file_path, 'w') as fw:
266 |             for key, value in self._vocab.items():
267 | fw.write(str(key) + '\t' + str(value) + '\n')
268 |
269 | def calculate_label_ratio(self, labels):
270 | return collections.Counter(labels)
271 |
272 |
273 | class test_model(sarcasm_model):
274 | test = None
275 | model = None
276 |
277 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
278 | input_weight_file_path=None):
279 | print('initializing...')
280 | sarcasm_model.__init__(self)
281 |
282 | self._model_file = model_file
283 | self._word_file_path = word_file_path
284 | self._split_word_file_path = split_word_path
285 | self._emoji_file_path = emoji_file_path
286 | self._vocab_file_path = vocab_file_path
287 | self._output_file = output_file
288 | self._input_weight_file_path = input_weight_file_path
289 |
290 | print('test_maxlen', self._line_maxlen)
291 |
292 | def load_trained_model(self, weight_file='model.json.hdf5'):
293 | start = time.time()
294 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
295 | end = time.time()
296 | print('model loading time::', (end - start))
297 |
298 | def __load_model(self, model_path, model_weight_path):
299 | self.model = model_from_json(open(model_path).read())
300 | print('model loaded from file...')
301 | self.model.load_weights(model_weight_path)
302 | print('model weights loaded from file...')
303 |
304 | def load_vocab(self):
305 | vocab = defaultdict()
306 | with open(self._vocab_file_path, 'r') as f:
307 | for line in f.readlines():
308 | key, value = line.split('\t')
309 | vocab[key] = value
310 |
311 | return vocab
312 |
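    |     # loads and vectorizes a held-out file with the saved vocabulary, then scores it with the trained model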
313 | def predict(self, test_file, verbose=False):
314 | try:
315 | start = time.time()
316 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
317 | normalize_text=True, split_hashtag=True,
318 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
319 | end = time.time()
320 |             if verbose:
321 | print('test resource loading time::', (end - start))
322 |
323 | self._vocab = self.load_vocab()
324 | print('vocab loaded...')
325 |
326 | start = time.time()
327 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
328 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
329 | end = time.time()
330 |             if verbose:
331 | print('test resource preparation time::', (end - start))
332 |
333 | self.__predict_model(tX, self.test)
334 | except Exception as e:
335 | print('Error:', e)
336 |
337 | def __predict_model(self, tX, test):
338 | y = []
339 | y_pred = []
340 |
341 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
342 |
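    |         # write per-instance class probabilities to the .analysis file and print weighted metrics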
343 | try:
344 | fd = open(self._output_file + '.analysis', 'w')
345 | for i, (label) in enumerate(prediction_probability):
346 | id = test[i][0]
347 | gold_label = test[i][1]
348 | words = test[i][2]
349 | dimensions = test[i][3]
350 | context = test[i][4]
351 | author = test[i][5]
352 |
353 | predicted = numpy.argmax(prediction_probability[i])
354 |
355 | y.append(int(gold_label))
356 | y_pred.append(predicted)
357 |
358 | # fd.write(str(id) + '\t' + str(label[0]) + '\t' + str(label[1]) + '\t'
359 | # + str(gold_label) + '\t'
360 | # + str(predicted) + '\t'
361 | # + ' '.join(words))
362 | fd.write(str(id) + ',' + ','.join([str(l) for l in label]) + '\n')
363 |
364 | print()
365 |
366 | print('accuracy::', metrics.accuracy_score(y, y_pred))
367 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
368 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
369 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
370 |             print('classification_report::', metrics.classification_report(y, y_pred))
371 | fd.close()
372 | except Exception as e:
373 | print(e)
374 |
375 |
376 | if __name__ == "__main__":
377 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
378 | train_file = basepath + '/resource/train/Train_v1.txt'
379 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
380 |     test_file = basepath + '/resource/test/Test_v1.txt'
381 | word_file_path = basepath + '/resource/word_list_freq.txt'
382 | split_word_path = basepath + '/resource/word_split.txt'
383 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
384 |
385 | output_file = basepath + '/resource/text_model/TestResults.txt'
386 | model_file = basepath + '/resource/text_model/weights/'
387 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
388 |
389 |     # training the model
390 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
391 | vocab_file_path, output_file)
392 |
393 | # t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
394 | # t.load_trained_model()
395 | # t.predict(test_file)
396 |
--------------------------------------------------------------------------------
/src/sarcasm_detection_moods_siamese.py:
--------------------------------------------------------------------------------
1 | # not finalized
2 | import os
3 | import collections
4 | import random
5 | import sys
6 |
7 | sys.path.append('../')
8 |
9 | import time
10 | import numpy
11 |
12 | numpy.random.seed(1337)
13 |
14 | from keras.layers.wrappers import TimeDistributed
15 | from keras import backend as K, regularizers
16 | from sklearn import metrics
17 | from keras.models import model_from_json
18 | from keras.layers.core import Dropout, Dense, Activation, Flatten, Reshape
19 | from keras.layers.embeddings import Embedding
20 | from keras.layers.recurrent import LSTM
21 | from keras.layers.convolutional import Convolution1D
22 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
23 |
24 | from keras.layers.merge import add, concatenate, subtract, multiply
25 | from keras.models import Model
26 | from keras.utils import np_utils
27 | from keras.layers import Input
28 | import src.data_processing.data_handler as dh
29 | from collections import defaultdict
30 |
31 |
32 | class sarcasm_model():
33 | _train_file = None
34 | _gold_data_path = None
35 | _validation_file = None
36 | _tweet_file = None
37 | # test_debug = None
38 | _output_file = None
39 | _model_file = None
40 | _word_file_path = None
41 | _vocab_file_path = None
42 | _input_weight_file_path = None
43 | _vocab = None
44 | _line_maxlen = None
45 |
46 | def __init__(self):
47 | self._train_file = None
48 | self._test_file = None
49 | self._validation_file = None
50 | self._tweet_file = None
51 | self._output_file = None
52 | self._model_file = None
53 | self._word_file_path = None
54 | self._vocab_file_path = None
55 | self._input_weight_file_path = None
56 | self._vocab = None
57 |
58 | self._line_maxlen = 30
59 |
60 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256, trainable=True,
61 | batch_size=1):
62 |
63 | print('Building model...')
64 |
65 | context_input = Input(name='context', batch_shape=(batch_size, maxlen))
66 |
67 | if (len(c_emb_weights) == 0):
68 | c_emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal',
69 | trainable=trainable)(context_input)
70 | else:
71 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights],
72 | trainable=trainable)(context_input)
73 |
74 | c_cnn1 = Convolution1D(int(hidden_units / 2), 5, kernel_initializer='he_normal', bias_initializer='he_normal',
75 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen))(c_emb)
76 | c_cnn2 = Convolution1D(hidden_units, 5, kernel_initializer='he_normal', bias_initializer='he_normal',
77 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen - 2))(
78 | c_cnn1)
79 |
80 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal',
81 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
82 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
83 | recurrent_regularizer=regularizers.l2(0.01),
84 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False)(c_cnn2)
85 |
86 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal',
87 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
88 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
89 | recurrent_regularizer=regularizers.l2(0.01),
90 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False,
91 | go_backwards=True)(c_cnn2)
92 |
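   |         # sum the forward and backward context encodings (c_lstm2 runs with go_backwards=True)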
93 | c_merged = add([c_lstm1, c_lstm2])
94 | c_merged = Dropout(0.25)(c_merged)
95 |
96 | text_input = Input(name='text', batch_shape=(batch_size, maxlen))
97 |
98 | if (len(emb_weights) == 0):
99 | emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal',
100 | trainable=trainable)(text_input)
101 | else:
102 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
103 | trainable=trainable)(text_input)
104 |
105 | t_cnn1 = Convolution1D(int(hidden_units / 2), 5, kernel_initializer='he_normal', bias_initializer='he_normal',
106 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen))(emb)
107 | t_cnn2 = Convolution1D(hidden_units, 5, kernel_initializer='he_normal', bias_initializer='he_normal',
108 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen - 2))(
109 | t_cnn1)
110 |
111 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
112 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
113 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
114 | recurrent_regularizer=regularizers.l2(0.01),
115 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False)(t_cnn2)
116 |
117 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
118 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
119 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
120 | recurrent_regularizer=regularizers.l2(0.01),
121 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False,
122 | go_backwards=True)(t_cnn2)
123 |
124 | t_merged = add([t_lstm1, t_lstm2])
125 | t_merged = Dropout(0.25)(t_merged)
126 |
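    |         # auxiliary 11-dimensional 'awc' input (mood/affect features); multiplied with the reshaped text encoding below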
127 | awc_input = Input(name='awc', batch_shape=(batch_size, 11))
128 |
129 | t_merged = Reshape((-1, 1))(t_merged)
130 |
131 | t_merged = multiply([t_merged, awc_input])
132 |
133 | t_merged = Flatten()(t_merged)
134 |
135 | merged = concatenate([c_merged, t_merged], axis=1)
136 |
137 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged)
138 | dnn_1 = Dropout(0.25)(dnn_1)
139 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1)
140 |
141 | softmax = Activation('softmax')(dnn_2)
142 |
143 | model = Model(inputs=[context_input, text_input, awc_input], outputs=softmax)
144 |
145 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
146 |         print('No. of parameters:', model.count_params())
147 |
148 | print(model.summary())
149 | return model
150 |
151 |
152 | class train_model(sarcasm_model):
153 | train = None
154 | validation = None
155 |
156 | def load_train_validation_test_data(self):
157 | print("Loading resource...")
158 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
159 | self._emoji_file_path, normalize_text=True,
160 | split_hashtag=True,
161 | ignore_profiles=False)
162 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
163 | self._emoji_file_path,
164 | normalize_text=True,
165 | split_hashtag=True,
166 | ignore_profiles=False)
167 |
168 |         if self._test_file is not None:
169 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True,
170 | split_hashtag=True,
171 | ignore_profiles=True)
172 |
173 | def split_train_validation(self, train, ratio=.1):
174 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))])
175 | print(len(test_indices))
176 | train_data = []
177 | validation_data = []
178 | for i, t in enumerate(train):
179 |             if i in test_indices:
180 | validation_data.append(t)
181 | else:
182 | train_data.append(t)
183 | return train_data, validation_data
184 |
185 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
186 | vocab_file,
187 | output_file,
188 | input_weight_file_path=None):
189 | sarcasm_model.__init__(self)
190 |
191 | self._train_file = train_file
192 | self._validation_file = validation_file
193 | self._word_file_path = word_file_path
194 | self._split_word_file_path = split_word_path
195 | self._emoji_file_path = emoji_file_path
196 | self._model_file = model_file
197 | self._vocab_file_path = vocab_file
198 | self._output_file = output_file
199 | self._input_weight_file_path = input_weight_file_path
200 |
201 | self.load_train_validation_test_data()
202 |
203 | batch_size = 32
204 |
205 | print(self._line_maxlen)
206 | self._vocab = dh.build_vocab(self.train, ignore_context=False)
207 | self._vocab['unk'] = len(self._vocab.keys()) + 1
208 |
209 | print(len(self._vocab.keys()) + 1)
210 | print('unk::', self._vocab['unk'])
211 |
212 | dh.write_vocab(self._vocab_file_path, self._vocab)
213 |
214 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None)
215 |
216 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None)
217 |
218 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
219 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
220 | D = dh.pad_sequence_1d(D, maxlen=11)
221 |
222 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
223 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
224 | tD = dh.pad_sequence_1d(tD, maxlen=11)
225 |
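    |         # 300-dimensional word2vec vectors (Google News binary) initialize both embeddings; trainable=False keeps them frozen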
226 | hidden_units = 128
227 | dimension_size = 300
228 |
229 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
230 | path=word2vec_path)
231 |
232 | cW = W
233 |
234 | print('Word2vec obtained....')
235 |
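    |         # weight each class by the inverse of its frequency to offset label imbalance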
236 | ratio = self.calculate_label_ratio(Y)
237 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
238 |
239 | print('ratio', ratio)
240 |
241 | dimension_vocab = numpy.unique(D)
242 | print(len(dimension_vocab))
243 |
244 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
245 |
246 | print('train_X', X.shape)
247 | print('train_C', C.shape)
248 | print('train_D', D.shape)
249 | print('train_Y', Y.shape)
250 |
251 | print('validation_X', tX.shape)
252 | print('validation_C', tC.shape)
253 | print('validation_D', tD.shape)
254 | print('validation_Y', tY.shape)
255 |
256 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW,
257 |                                     hidden_units=hidden_units, trainable=False,
258 | batch_size=batch_size)
259 |
260 | open(self._model_file + 'model.json', 'w').write(model.to_json())
261 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss')
262 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
263 | # save_best_only=False)
264 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
265 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
266 | epsilon=0.0001,
267 | cooldown=0, min_lr=0.000001)
268 |
269 | model.fit([C, X, D], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX, tD], tY), shuffle=True,
270 | callbacks=[save_best, lr_tuner], class_weight=ratio)
271 |
272 | def get_maxlen(self):
273 | return max(map(len, (x for _, x in self.train + self.validation)))
274 |
275 | def write_vocab(self):
276 | with open(self._vocab_file_path, 'w') as fw:
277 |             for key, value in self._vocab.items():
278 | fw.write(str(key) + '\t' + str(value) + '\n')
279 |
280 |     def calculate_label_ratio(self, labels):
281 | return collections.Counter(labels)
282 |
283 |
284 | class test_model(sarcasm_model):
285 | test = None
286 | model = None
287 |
288 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file):
289 | print('initializing...')
290 | sarcasm_model.__init__(self)
291 |
292 | self._word_file_path = word_file_path
293 | self._model_file = model_file
294 | self._vocab_file_path = vocab_file_path
295 | self._output_file = output_file
296 |
297 | # self._line_maxlen = 45
298 | print('test_maxlen', self._line_maxlen)
299 |
300 | def predict_cross_validation(self, tC, tX, tD, test):
301 | self.__predict_model([tC, tX, tD], test)
302 |
303 | def load_trained_model(self, weight_file='model.json.hdf5'):
304 | start = time.time()
305 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
306 | end = time.time()
307 | print('model loading time::', (end - start))
308 |
309 | def __load_model(self, model_path, model_weight_path):
310 | self.model = model_from_json(open(model_path).read())
311 | print('model loaded from file...')
312 | self.model.load_weights(model_weight_path)
313 | print('model weights loaded from file...')
314 |
315 | def load_vocab(self):
316 | vocab = defaultdict()
317 | with open(self._vocab_file_path, 'r') as f:
318 | for line in f.readlines():
319 | key, value = line.split('\t')
320 | vocab[key] = value
321 |
322 | return vocab
323 |
324 | def predict(self, test_file, verbose=False):
325 | start = time.time()
326 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True,
327 | split_hashtag=True,
328 | ignore_profiles=False)
329 | end = time.time()
330 |         if verbose:
331 | print('test resource loading time::', (end - start))
332 |
333 | self._vocab = self.load_vocab()
334 |
335 | start = time.time()
336 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
337 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
338 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
339 | tD = dh.pad_sequence_1d(tD, maxlen=11)
340 |
341 | end = time.time()
342 |         if verbose:
343 | print('test resource preparation time::', (end - start))
344 |
345 | self.__predict_model([tC, tX, tD], self.test)
346 |
347 | def __predict_model(self, tX, test):
348 |         prediction_probability = self.model.predict(tX, batch_size=8, verbose=1)
349 |
350 | y = []
351 | y_pred = []
352 |
353 | fd = open(self._output_file + '.analysis', 'w')
354 | for i, (label) in enumerate(prediction_probability):
355 | gold_label = test[i][0]
356 | words = test[i][1]
357 | dimensions = test[i][2]
358 | context = test[i][3]
359 | author = test[i][4]
360 |
361 | predicted = numpy.argmax(prediction_probability[i])
362 |
363 | y.append(int(gold_label))
364 | y_pred.append(predicted)
365 |
366 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
367 | + str(gold_label) + '\t'
368 | + str(predicted) + '\t'
369 | + ' '.join(words) + '\t'
370 | + str(dimensions) + '\t'
371 | + ' '.join(context))
372 |
373 | fd.write('\n')
374 |
375 | print('accuracy::', metrics.accuracy_score(y, y_pred))
376 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
377 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
378 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
379 |         print('classification_report::', metrics.classification_report(y, y_pred))
380 |
381 | fd.close()
382 |
383 |
384 | if __name__ == "__main__":
385 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
386 | train_file = basepath + '/resource/train/Train_context_moods_v1.txt'
387 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt'
388 | test_file = basepath + '/resource/test/Test_context_AW.txt'
389 | word_file_path = basepath + '/resource/word_list_freq.txt'
390 | split_word_path = basepath + '/resource/word_split.txt'
391 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
392 |
393 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt'
394 | model_file = basepath + '/resource/text_context_awc_model/weights/'
395 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt'
396 |
397 | # word2vec path
398 | word2vec_path = '/home/word2vec/GoogleNews-vectors-negative300.bin'
399 |
400 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
401 | vocab_file_path, output_file)
402 |
403 | # testing the model
404 | # with K.get_session():
405 | # t = test_model(word_file_path, model_file, vocab_file_path, output_file)
406 | # t.load_trained_model()
407 | # t.predict(test_file)
408 |
--------------------------------------------------------------------------------
/src/sarcasm_detection_siamese.py:
--------------------------------------------------------------------------------
1 | # not finalized
2 | import os
3 | import collections
4 | import random
5 | import sys
6 |
7 | sys.path.append('../')
8 |
9 | import time
10 | import numpy
11 |
12 | numpy.random.seed(1337)
13 |
14 | from keras.layers.wrappers import TimeDistributed
15 | from keras import backend as K, regularizers
16 | from sklearn import metrics
17 | from keras.models import model_from_json
18 | from keras.layers.core import Dropout, Dense, Activation, Flatten
19 | from keras.layers.embeddings import Embedding
20 | from keras.layers.recurrent import LSTM
21 | from keras.layers.convolutional import Convolution1D
22 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
23 |
24 | from keras.layers.merge import add, concatenate, subtract
25 | from keras.models import Model
26 | from keras.utils import np_utils
27 | from keras.layers import Input
28 | import src.data_processing.data_handler as dh
29 | from collections import defaultdict
30 |
31 |
32 | class sarcasm_model():
33 | _train_file = None
34 | _gold_data_path = None
35 | _validation_file = None
36 | _tweet_file = None
37 | # test_debug = None
38 | _output_file = None
39 | _model_file = None
40 | _word_file_path = None
41 | _vocab_file_path = None
42 | _input_weight_file_path = None
43 | _vocab = None
44 | _line_maxlen = None
45 |
46 | def __init__(self):
47 | self._train_file = None
48 | self._test_file = None
49 | self._validation_file = None
50 | self._tweet_file = None
51 | self._output_file = None
52 | self._model_file = None
53 | self._word_file_path = None
54 | self._vocab_file_path = None
55 | self._input_weight_file_path = None
56 | self._vocab = None
57 |
58 | self._line_maxlen = 30
59 |
60 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256, trainable=True,
61 | batch_size=1):
62 |
63 | print('Building model...')
64 |
65 | context_input = Input(name='context', batch_shape=(batch_size, maxlen))
66 |
67 | if (len(c_emb_weights) == 0):
68 | c_emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal',
69 | trainable=trainable)(context_input)
70 | else:
71 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights],
72 | trainable=trainable)(context_input)
73 |
74 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal',
75 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
76 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
77 | recurrent_regularizer=regularizers.l2(0.01),
78 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False)(c_emb)
79 |
80 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal',
81 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
82 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
83 | recurrent_regularizer=regularizers.l2(0.01),
84 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False,
85 | go_backwards=True)(c_emb)
86 |
87 | c_merged = add([c_lstm1, c_lstm2])
88 | c_merged = Dropout(0.25)(c_merged)
89 |
90 | text_input = Input(name='text', batch_shape=(batch_size, maxlen))
91 |
92 | if (len(emb_weights) == 0):
93 | emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal',
94 | trainable=trainable)(text_input)
95 | else:
96 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
97 | trainable=trainable)(text_input)
98 |
99 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
100 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
101 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
102 | recurrent_regularizer=regularizers.l2(0.01),
103 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False)(emb)
104 |
105 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
106 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
107 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
108 | recurrent_regularizer=regularizers.l2(0.01),
109 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False,
110 | go_backwards=True)(emb)
111 |
112 | t_merged = add([t_lstm1, t_lstm2])
113 | t_merged = Dropout(0.25)(t_merged)
114 |
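    |         # siamese-style combination: the classifier sees the difference between the context and text encodings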
115 | merged = subtract([c_merged, t_merged])
116 |
117 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged)
118 | dnn_1 = Dropout(0.25)(dnn_1)
119 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1)
120 |
121 | softmax = Activation('softmax')(dnn_2)
122 |
123 | model = Model(inputs=[context_input, text_input], outputs=softmax)
124 |
125 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
126 |         print('No. of parameters:', model.count_params())
127 |
128 | print(model.summary())
129 | return model
130 |
131 |
132 | class train_model(sarcasm_model):
133 | train = None
134 | validation = None
135 |
136 | def load_train_validation_test_data(self):
137 | print("Loading resource...")
138 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
139 | self._emoji_file_path, normalize_text=True,
140 | split_hashtag=True,
141 | ignore_profiles=False)
142 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
143 | self._emoji_file_path,
144 | normalize_text=True,
145 | split_hashtag=True,
146 | ignore_profiles=False)
147 |
148 |         if self._test_file is not None:
149 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True,
150 | split_hashtag=True,
151 | ignore_profiles=True)
152 |
153 | def split_train_validation(self, train, ratio=.1):
154 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))])
155 | print(len(test_indices))
156 | train_data = []
157 | validation_data = []
158 | for i, t in enumerate(train):
159 |             if i in test_indices:
160 | validation_data.append(t)
161 | else:
162 | train_data.append(t)
163 | return train_data, validation_data
164 |
165 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
166 | vocab_file,
167 | output_file,
168 | input_weight_file_path=None):
169 | sarcasm_model.__init__(self)
170 |
171 | self._train_file = train_file
172 | self._validation_file = validation_file
173 | self._word_file_path = word_file_path
174 | self._split_word_file_path = split_word_path
175 | self._emoji_file_path = emoji_file_path
176 | self._model_file = model_file
177 | self._vocab_file_path = vocab_file
178 | self._output_file = output_file
179 | self._input_weight_file_path = input_weight_file_path
180 |
181 | self.load_train_validation_test_data()
182 |
183 | batch_size = 32
184 |
185 | print(self._line_maxlen)
186 | self._vocab = dh.build_vocab(self.train, ignore_context=False)
187 | self._vocab['unk'] = len(self._vocab.keys()) + 1
188 |
189 | print(len(self._vocab.keys()) + 1)
190 | print('unk::', self._vocab['unk'])
191 |
192 | dh.write_vocab(self._vocab_file_path, self._vocab)
193 |
194 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None)
195 |
196 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None)
197 |
198 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
199 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
200 | D = dh.pad_sequence_1d(D, maxlen=11)
201 |
202 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
203 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
204 | tD = dh.pad_sequence_1d(tD, maxlen=11)
205 |
206 | hidden_units = 128
207 | dimension_size = 300
208 |
209 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
210 | path=word2vec_path)
211 |
212 | cW = W
213 |
214 | print('Word2vec obtained....')
215 |
216 | ratio = self.calculate_label_ratio(Y)
217 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
218 |
219 | print('ratio', ratio)
220 |
221 | dimension_vocab = numpy.unique(D)
222 | print(len(dimension_vocab))
223 |
224 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
225 |
226 | print('train_X', X.shape)
227 | print('train_C', C.shape)
228 | print('train_D', D.shape)
229 | print('train_Y', Y.shape)
230 |
231 | print('validation_X', tX.shape)
232 | print('validation_C', tC.shape)
233 | print('validation_D', tD.shape)
234 | print('validation_Y', tY.shape)
235 |
236 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW,
237 | hidden_units=hidden_units, trainable=False,
238 | batch_size=batch_size)
239 |
240 | open(self._model_file + 'model.json', 'w').write(model.to_json())
241 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss')
242 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
243 | # save_best_only=False)
244 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
245 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
246 | epsilon=0.0001,
247 | cooldown=0, min_lr=0.000001)
248 |
249 | model.fit([C, X], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX], tY), shuffle=True,
250 | callbacks=[save_best, lr_tuner], class_weight=ratio)
251 |
252 | def get_maxlen(self):
253 | return max(map(len, (x for _, x in self.train + self.validation)))
254 |
255 | def write_vocab(self):
256 | with open(self._vocab_file_path, 'w') as fw:
257 |             for key, value in self._vocab.items():
258 | fw.write(str(key) + '\t' + str(value) + '\n')
259 |
260 |     def calculate_label_ratio(self, labels):
261 | return collections.Counter(labels)
262 |
263 |
264 | class test_model(sarcasm_model):
265 | test = None
266 | model = None
267 |
268 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file):
269 | print('initializing...')
270 | sarcasm_model.__init__(self)
271 |
272 | self._word_file_path = word_file_path
273 | self._model_file = model_file
274 | self._vocab_file_path = vocab_file_path
275 | self._output_file = output_file
276 |
277 | # self._line_maxlen = 45
278 | print('test_maxlen', self._line_maxlen)
279 |
280 | def predict_cross_validation(self, tC, tX, tD, test):
281 | self.__predict_model([tC, tX, tD], test)
282 |
283 | def load_trained_model(self, weight_file='model.json.hdf5'):
284 | start = time.time()
285 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
286 | end = time.time()
287 | print('model loading time::', (end - start))
288 |
289 | def __load_model(self, model_path, model_weight_path):
290 | self.model = model_from_json(open(model_path).read())
291 | print('model loaded from file...')
292 | self.model.load_weights(model_weight_path)
293 | print('model weights loaded from file...')
294 |
295 | def load_vocab(self):
296 | vocab = defaultdict()
297 | with open(self._vocab_file_path, 'r') as f:
298 | for line in f.readlines():
299 | key, value = line.split('\t')
300 | vocab[key] = value
301 |
302 | return vocab
303 |
304 | def predict(self, test_file, verbose=False):
305 | start = time.time()
306 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True,
307 | split_hashtag=True,
308 | ignore_profiles=False)
309 | end = time.time()
310 |         if verbose:
311 | print('test resource loading time::', (end - start))
312 |
313 | self._vocab = self.load_vocab()
314 |
315 | start = time.time()
316 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
317 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
318 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
319 | tD = dh.pad_sequence_1d(tD, maxlen=11)
320 |
321 | end = time.time()
322 |         if verbose:
323 | print('test resource preparation time::', (end - start))
324 |
325 | self.__predict_model([tC, tX, tD], self.test)
326 |
327 | def __predict_model(self, tX, test):
328 |         prediction_probability = self.model.predict(tX, batch_size=8, verbose=1)
329 |
330 | y = []
331 | y_pred = []
332 |
333 | fd = open(self._output_file + '.analysis', 'w')
334 | for i, (label) in enumerate(prediction_probability):
335 | gold_label = test[i][0]
336 | words = test[i][1]
337 | dimensions = test[i][2]
338 | context = test[i][3]
339 | author = test[i][4]
340 |
341 | predicted = numpy.argmax(prediction_probability[i])
342 |
343 | y.append(int(gold_label))
344 | y_pred.append(predicted)
345 |
346 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
347 | + str(gold_label) + '\t'
348 | + str(predicted) + '\t'
349 | + ' '.join(words) + '\t'
350 | + str(dimensions) + '\t'
351 | + ' '.join(context))
352 |
353 | fd.write('\n')
354 |
355 | print('accuracy::', metrics.accuracy_score(y, y_pred))
356 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
357 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
358 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
359 |         print('classification_report::', metrics.classification_report(y, y_pred))
360 |
361 | fd.close()
362 |
363 |
364 | if __name__ == "__main__":
365 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
366 | train_file = basepath + '/resource/train/Train_context_moods_v1.txt'
367 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt'
368 | test_file = basepath + '/resource/test/Test_context_AW.txt'
369 | word_file_path = basepath + '/resource/word_list_freq.txt'
370 | split_word_path = basepath + '/resource/word_split.txt'
371 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
372 |
373 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt'
374 | model_file = basepath + '/resource/text_context_awc_model/weights/'
375 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt'
376 |
377 | # word2vec path
378 | word2vec_path = '/home/word2vec/GoogleNews-vectors-negative300.bin'
379 |
380 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
381 | vocab_file_path, output_file)
382 |
383 | # testing the model
384 | # with K.get_session():
385 | # t = test_model(word_file_path, model_file, vocab_file_path, output_file)
386 | # t.load_trained_model()
387 | # t.predict(test_file)
388 |
--------------------------------------------------------------------------------