├── .gitignore ├── LICENSE.txt ├── README.md ├── __init__.py ├── resource ├── .directory ├── abbreviations.txt ├── dev │ ├── Dev_context_moods.txt │ ├── Dev_v1.txt │ └── __init__.py ├── emoji_unicode_names_final.txt ├── offensive_words.txt ├── test │ └── Test_v1.txt ├── text_context_awc_model │ └── weights │ │ └── model.json ├── text_model │ ├── vocab_list.txt │ └── weights │ │ └── model.json ├── text_model_2D │ ├── vocab_list.txt │ └── weights │ │ └── model.json ├── train │ ├── .directory │ └── Train_v1.txt ├── word_list.txt ├── word_list_freq.txt └── word_split.txt └── src ├── __init__.py ├── data_processing ├── __init__.py ├── data_handler.py └── glove2Word2vecLoader.py ├── sarcasm_context_moods.py ├── sarcasm_detection_model_CNN_DNN_2D.py ├── sarcasm_detection_model_CNN_LSTM_ATTN.py ├── sarcasm_detection_model_CNN_LSTM_DNN.py ├── sarcasm_detection_model_CNN_LSTM_DNN_fasttext.py ├── sarcasm_detection_model_CNN_LSTM_DNN_simpler.py ├── sarcasm_detection_model_CNN_LSTM_DNN_word2vec.py ├── sarcasm_detection_model_attention.py ├── sarcasm_detection_moods_siamese.py └── sarcasm_detection_siamese.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. 
Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SarcamDetection 2 | Sarcasm detection on tweets using neural network.
3 | This repository performs semantic modelling of sentences using neural networks for the task of sarcasm detection ([Ghosh & Veale, 2016](http://www.aclweb.org/anthology/W16-0425)). 4 | ## Prerequisites 5 | - nltk (TweetTokenizer) 6 | - Keras 7 | - Tensorflow 8 | - numpy 9 | - scipy 10 | - gensim (if you are using word2vec) 11 | - itertools 12 | 13 | ## Cloning the repository 14 | ``` 15 | git clone git@github.com:AniSkywalker/SarcasmDetection.git 16 | cd SarcasmDetection/src/ 17 | ``` 18 | If you want to use the pre-trained model, you'll have to [download it](https://drive.google.com/drive/folders/0B7C_0ZfEBcpRbDZKelBZTFFsV0E?usp=sharing) from Google Drive and save it into `/resource/text_model/weights/`. 19 |
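Once the downloaded weights are in place, the serialized architecture in `resource/text_model/weights/model.json` can be restored with Keras' `model_from_json`. The snippet below is only a minimal sketch: the weights filename (`model.h5`) is an assumption, so point `load_weights` at whatever file the Google Drive folder actually contains, and the relative paths assume you run it from `src/` like the repository's own scripts.
```
from keras.models import model_from_json

# rebuild the architecture from the serialized config shipped in the repository
with open('../resource/text_model/weights/model.json') as f:
    model = model_from_json(f.read())

# load the pre-trained weights downloaded from Google Drive
# ('model.h5' is a placeholder name, not confirmed by the repository)
model.load_weights('../resource/text_model/weights/model.h5')
```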
20 | ## Using this package 21 | Run the code with the following command: 22 | ``` 23 | python sarcasm_detection_model_CNN_LSTM_DNN.py 24 | ``` 25 | 26 | ### Citation 27 | Please cite the following paper when using this code: 28 | 29 | > **Fracking Sarcasm using Neural Network.** 30 | > Aniruddha Ghosh and Tony Veale. 7th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis (WASSA 2016). NAACL-HLT. 16th June 2016, San Diego, California, U.S.A. 31 | 32 | ## Output 33 | The supplied input is rated as either **0**, meaning _non-sarcastic_, or **1**, meaning _sarcastic_. 34 | 35 | ## Training 36 | If you want to train the model with your own data, save your _train_, _development_ and _test_ data into the `/resource/train`, `/resource/dev` and `/resource/test` folders respectively. 37 | 38 | The system accepts datasets in tab-separated format, as shown below. An example can be found in [`/resource/train/Train_v1.txt`](https://github.com/AniSkywalker/SarcasmDetection/tree/master/resource/train). 39 | ``` 40 | id<tab>label<tab>tweet 41 | ``` 42 | 43 | ## Context information 44 | To run the model with context information and psychological dimensions (using Tensorflow), run: 45 | ``` 46 | python sarcasm_context_moods.py 47 | ``` 48 | 49 | ### Citation 50 | Please cite the following paper when using context information and psychological dimensions: 51 | > **Magnets for Sarcasm: Making Sarcasm Detection Timely, Contextual and Very Personal**
52 | > Aniruddha Ghosh and Tony Veale. Conference on Empirical Methods in Natural Language Processing (EMNLP). 7th-11th September, 2017, Copenhagen, Denmark. 53 | 54 | ## Notes 55 | - Samples of _train_, _dev_, and _test_ files are included for both versions. 56 | - For a test data set, please contact at aniruddha.ghosh@ucdconnect.ie 57 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'root' 2 | -------------------------------------------------------------------------------- /resource/.directory: -------------------------------------------------------------------------------- 1 | [Dolphin] 2 | Timestamp=2017,3,10,1,14,25 3 | Version=3 4 | -------------------------------------------------------------------------------- /resource/abbreviations.txt: -------------------------------------------------------------------------------- 1 | i've i have 2 | we've we have 3 | can't can not 4 | i'm i am 5 | we're we are 6 | don't do not 7 | didn't did not 8 | tt's it is 9 | that's that is 10 | he's he is 11 | she's she is 12 | let's let us 13 | there's there is 14 | how's how is 15 | i'd i would 16 | 2F4U Too Fast For You 17 | 4YEO FYEO For Your Eyes Only 18 | AAMOF As a Matter of Fact 19 | ACK Acknowledgment 20 | AFAIK As far as I know 21 | AFAIR As far as I remember 22 | AFK Away from Keyboard 23 | AKA Also known as 24 | B2K BTK Back to Keyboard 25 | BTT Back to Topic 26 | BTW By the Way 27 | B/C Because 28 | C&P Copy and Paste 29 | CU See you 30 | CYS Check your Settings 31 | DIY Do it Yourself 32 | EOBD End of Business Day 33 | EOD End of Discussion 34 | EOM End of Message 35 | EOT End of Text 36 | FAQ Frequently asked Questions 37 | FACK Full Acknowledge 38 | FKA Formerly known as 39 | FWIW For what it is Worth 40 | FYI For your Information 41 | JFYI Just For your Information 42 | FTW Fuck the World 43 | HF Have fun 44 | HTH Hope this Helps 45 | IDK I do not know 46 | IIRC If I Recall Correctly 47 | IMHO In my Humble Opinion 48 | IMO In my Opinion 49 | IMNSHO In my not so Humble Opinion 50 | IOW In other Words 51 | ITT In this Thread 52 | LOL Laughing out loud 53 | DGMW Do not get me wrong 54 | MMW Mark my Words 55 | N/A Not Available 56 | NaN Not a Number 57 | NNTR No need to Reply 58 | noob Newbie 59 | n00b Newbie 60 | NOYB None of your Business 61 | NRN No Reply Necessary 62 | OMG Oh my God 63 | OP Original Poster 64 | OT Off Topic 65 | OTOH On the other Hand 66 | PEBKAC Problem exists between Keyboard and Chair 67 | POV Point of View 68 | ROTFL Rolling on the Floor Laughing 69 | RSVP Repondez s'il vous plait 70 | RTFM Read the fine Manual 71 | SCNR Sorry could not Resist 72 | SFLR Sorry for late Reply 73 | SPOC Single Point of Contact 74 | TBA To be Announced 75 | TBC To be Continued 76 | TIA Thanks in Advance 77 | TGIF Thanks God, its Friday 78 | THX TNX Thanks 79 | TQ Thank You 80 | TYVM Thank You Very Much 81 | TYT Take your Time 82 | TTYL Talk to you Later 83 | w00t Hooray 84 | WFM Works for Me 85 | WRT With Regard to 86 | WTH What the Hell 87 | WTF What the Fuck 88 | YMMD You made my Day 89 | YMMV Your Mileage may vary 90 | YAM Yet Another Meeting 91 | ICYMI In Case you missed it -------------------------------------------------------------------------------- /resource/dev/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/resource/dev/__init__.py -------------------------------------------------------------------------------- /resource/offensive_words.txt: -------------------------------------------------------------------------------- 1 | abbo 2 | abo 3 | abortion 4 | abuse 5 | addict 6 | addicts 7 | adult 8 | africa 9 | african 10 | alla 11 | allah 12 | alligatorbait 13 | amateur 14 | american 15 | anal 16 | analannie 17 | analsex 18 | angie 19 | angry 20 | anus 21 | arab 22 | arabs 23 | areola 24 | argie 25 | aroused 26 | arse 27 | arsehole 28 | asian 29 | ass 30 | assassin 31 | assassinate 32 | assassination 33 | assault 34 | assbagger 35 | assblaster 36 | assclown 37 | asscowboy 38 | asses 39 | assfuck 40 | assfucker 41 | asshat 42 | asshole 43 | assholes 44 | asshore 45 | assjockey 46 | asskiss 47 | asskisser 48 | assklown 49 | asslick 50 | asslicker 51 | asslover 52 | assman 53 | assmonkey 54 | assmunch 55 | assmuncher 56 | asspacker 57 | asspirate 58 | asspuppies 59 | assranger 60 | asswhore 61 | asswipe 62 | athletesfoot 63 | attack 64 | australian 65 | babe 66 | babies 67 | backdoor 68 | backdoorman 69 | backseat 70 | badfuck 71 | balllicker 72 | balls 73 | ballsack 74 | banging 75 | baptist 76 | barelylegal 77 | barf 78 | barface 79 | barfface 80 | bast 81 | bastard 82 | bazongas 83 | bazooms 84 | beaner 85 | beast 86 | beastality 87 | beastial 88 | beastiality 89 | beatoff 90 | beat-off 91 | beatyourmeat 92 | beaver 93 | bestial 94 | bestiality 95 | bi 96 | biatch 97 | bible 98 | bicurious 99 | bigass 100 | bigbastard 101 | bigbutt 102 | bigger 103 | bisexual 104 | bi-sexual 105 | bitch 106 | bitcher 107 | bitches 108 | bitchez 109 | bitchin 110 | bitching 111 | bitchslap 112 | bitchy 113 | biteme 114 | black 115 | blackman 116 | blackout 117 | blacks 118 | blind 119 | blow 120 | blowjob 121 | boang 122 | bogan 123 | bohunk 124 | bollick 125 | bollock 126 | bomb 127 | bombers 128 | bombing 129 | bombs 130 | bomd 131 | bondage 132 | boner 133 | bong 134 | boob 135 | boobies 136 | boobs 137 | booby 138 | boody 139 | boom 140 | boong 141 | boonga 142 | boonie 143 | booty 144 | bootycall 145 | bountybar 146 | bra 147 | brea5t 148 | breast 149 | breastjob 150 | breastlover 151 | breastman 152 | brothel 153 | bugger 154 | buggered 155 | buggery 156 | bullcrap 157 | bulldike 158 | bulldyke 159 | bullshit 160 | bumblefuck 161 | bumfuck 162 | bunga 163 | bunghole 164 | buried 165 | burn 166 | butchbabes 167 | butchdike 168 | butchdyke 169 | butt 170 | buttbang 171 | butt-bang 172 | buttface 173 | buttfuck 174 | butt-fuck 175 | buttfucker 176 | butt-fucker 177 | buttfuckers 178 | butt-fuckers 179 | butthead 180 | buttman 181 | buttmunch 182 | buttmuncher 183 | buttpirate 184 | buttplug 185 | buttstain 186 | byatch 187 | cacker 188 | cameljockey 189 | cameltoe 190 | canadian 191 | cancer 192 | carpetmuncher 193 | carruth 194 | catholic 195 | catholics 196 | cemetery 197 | chav 198 | cherrypopper 199 | chickslick 200 | children's 201 | chin 202 | chinaman 203 | chinamen 204 | chinese 205 | chink 206 | chinky 207 | choad 208 | chode 209 | christ 210 | christian 211 | church 212 | cigarette 213 | cigs 214 | clamdigger 215 | clamdiver 216 | clit 217 | clitoris 218 | clogwog 219 | cocaine 220 | cock 221 | cockblock 222 | cockblocker 223 | cockcowboy 224 | cockfight 225 | cockhead 226 | cockknob 227 | cocklicker 228 | cocklover 229 | cocknob 230 | cockqueen 231 | cockrider 232 | cocksman 233 | cocksmith 234 | 
cocksmoker 235 | cocksucer 236 | cocksuck 237 | cocksucked 238 | cocksucker 239 | cocksucking 240 | cocktail 241 | cocktease 242 | cocky 243 | cohee 244 | coitus 245 | color 246 | colored 247 | coloured 248 | commie 249 | communist 250 | condom 251 | conservative 252 | conspiracy 253 | coolie 254 | cooly 255 | coon 256 | coondog 257 | copulate 258 | cornhole 259 | corruption 260 | cra5h 261 | crabs 262 | crack 263 | crackpipe 264 | crackwhore 265 | crack-whore 266 | crap 267 | crapola 268 | crapper 269 | crappy 270 | crash 271 | creamy 272 | crime 273 | crimes 274 | criminal 275 | criminals 276 | crotch 277 | crotchjockey 278 | crotchmonkey 279 | crotchrot 280 | cum 281 | cumbubble 282 | cumfest 283 | cumjockey 284 | cumm 285 | cummer 286 | cumming 287 | cumquat 288 | cumqueen 289 | cumshot 290 | cunilingus 291 | cunillingus 292 | cunn 293 | cunnilingus 294 | cunntt 295 | cunt 296 | cunteyed 297 | cuntfuck 298 | cuntfucker 299 | cuntlick 300 | cuntlicker 301 | cuntlicking 302 | cuntsucker 303 | cybersex 304 | cyberslimer 305 | dago 306 | dahmer 307 | dammit 308 | damn 309 | damnation 310 | damnit 311 | darkie 312 | darky 313 | datnigga 314 | dead 315 | deapthroat 316 | death 317 | deepthroat 318 | defecate 319 | dego 320 | demon 321 | deposit 322 | desire 323 | destroy 324 | deth 325 | devil 326 | devilworshipper 327 | dick 328 | dickbrain 329 | dickforbrains 330 | dickhead 331 | dickless 332 | dicklick 333 | dicklicker 334 | dickman 335 | dickwad 336 | dickweed 337 | diddle 338 | die 339 | died 340 | dies 341 | dike 342 | dildo 343 | dingleberry 344 | dink 345 | dipshit 346 | dipstick 347 | dirty 348 | disease 349 | diseases 350 | disturbed 351 | dive 352 | dix 353 | dixiedike 354 | dixiedyke 355 | doggiestyle 356 | doggystyle 357 | dong 358 | doodoo 359 | doo-doo 360 | doom 361 | dope 362 | dragqueen 363 | dragqween 364 | dripdick 365 | drug 366 | drunk 367 | drunken 368 | dumb 369 | dumbass 370 | dumbbitch 371 | dumbfuck 372 | dyefly 373 | dyke 374 | easyslut 375 | eatballs 376 | eatme 377 | eatpussy 378 | ecstacy 379 | ejaculate 380 | ejaculated 381 | ejaculating 382 | ejaculation 383 | enema 384 | enemy 385 | erect 386 | erection 387 | ero 388 | escort 389 | ethiopian 390 | ethnic 391 | european 392 | evl 393 | excrement 394 | execute 395 | executed 396 | execution 397 | executioner 398 | explosion 399 | facefucker 400 | faeces 401 | fag 402 | fagging 403 | faggot 404 | fagot 405 | failed 406 | failure 407 | fairies 408 | fairy 409 | faith 410 | fannyfucker 411 | fart 412 | farted 413 | farting 414 | farty 415 | fastfuck 416 | fat 417 | fatah 418 | fatass 419 | fatfuck 420 | fatfucker 421 | fatso 422 | fckcum 423 | fear 424 | feces 425 | felatio 426 | felch 427 | felcher 428 | felching 429 | fellatio 430 | feltch 431 | feltcher 432 | feltching 433 | fetish 434 | fight 435 | filipina 436 | filipino 437 | fingerfood 438 | fingerfuck 439 | fingerfucked 440 | fingerfucker 441 | fingerfuckers 442 | fingerfucking 443 | fire 444 | firing 445 | fister 446 | fistfuck 447 | fistfucked 448 | fistfucker 449 | fistfucking 450 | fisting 451 | flange 452 | flasher 453 | flatulence 454 | floo 455 | flydie 456 | flydye 457 | fok 458 | fondle 459 | footaction 460 | footfuck 461 | footfucker 462 | footlicker 463 | footstar 464 | fore 465 | foreskin 466 | forni 467 | fornicate 468 | foursome 469 | fourtwenty 470 | fraud 471 | freakfuck 472 | freakyfucker 473 | freefuck 474 | fu 475 | fubar 476 | fuc 477 | fucck 478 | fuck 479 | fucka 480 | fuckable 481 | fuckbag 482 | fuckbuddy 483 | fucked 484 | 
fuckedup 485 | fucker 486 | fuckers 487 | fuckface 488 | fuckfest 489 | fuckfreak 490 | fuckfriend 491 | fuckhead 492 | fuckher 493 | fuckin 494 | fuckina 495 | fucking 496 | fuckingbitch 497 | fuckinnuts 498 | fuckinright 499 | fuckit 500 | fuckknob 501 | fuckme 502 | fuckmehard 503 | fuckmonkey 504 | fuckoff 505 | fuckpig 506 | fucks 507 | fucktard 508 | fuckwhore 509 | fuckyou 510 | fudgepacker 511 | fugly 512 | fuk 513 | fuks 514 | funeral 515 | funfuck 516 | fungus 517 | fuuck 518 | gangbang 519 | gangbanged 520 | gangbanger 521 | gangsta 522 | gatorbait 523 | gay 524 | gaymuthafuckinwhore 525 | gaysex 526 | geez 527 | geezer 528 | geni 529 | genital 530 | german 531 | getiton 532 | gin 533 | ginzo 534 | gipp 535 | girls 536 | givehead 537 | glazeddonut 538 | gob 539 | god 540 | godammit 541 | goddamit 542 | goddammit 543 | goddamn 544 | goddamned 545 | goddamnes 546 | goddamnit 547 | goddamnmuthafucker 548 | goldenshower 549 | gonorrehea 550 | gonzagas 551 | gook 552 | gotohell 553 | goy 554 | goyim 555 | greaseball 556 | gringo 557 | groe 558 | gross 559 | grostulation 560 | gubba 561 | gummer 562 | gun 563 | gyp 564 | gypo 565 | gypp 566 | gyppie 567 | gyppo 568 | gyppy 569 | hamas 570 | handjob 571 | hapa 572 | harder 573 | hardon 574 | harem 575 | headfuck 576 | headlights 577 | hebe 578 | heeb 579 | hell 580 | henhouse 581 | heroin 582 | herpes 583 | heterosexual 584 | hijack 585 | hijacker 586 | hijacking 587 | hillbillies 588 | hindoo 589 | hiscock 590 | hitler 591 | hitlerism 592 | hitlerist 593 | hiv 594 | ho 595 | hobo 596 | hodgie 597 | hoes 598 | hole 599 | holestuffer 600 | homicide 601 | homo 602 | homobangers 603 | homosexual 604 | honger 605 | honk 606 | honkers 607 | honkey 608 | honky 609 | hook 610 | hooker 611 | hookers 612 | hooters 613 | hore 614 | hork 615 | horn 616 | horney 617 | horniest 618 | horny 619 | horseshit 620 | hosejob 621 | hoser 622 | hostage 623 | hotdamn 624 | hotpussy 625 | hottotrot 626 | hummer 627 | husky 628 | hussy 629 | hustler 630 | hymen 631 | hymie 632 | iblowu 633 | idiot 634 | ikey 635 | illegal 636 | incest 637 | insest 638 | intercourse 639 | interracial 640 | intheass 641 | inthebuff 642 | israel 643 | israeli 644 | israel's 645 | italiano 646 | itch 647 | jackass 648 | jackoff 649 | jackshit 650 | jacktheripper 651 | jade 652 | jap 653 | japanese 654 | japcrap 655 | jebus 656 | jeez 657 | jerkoff 658 | jesus 659 | jesuschrist 660 | jew 661 | jewish 662 | jiga 663 | jigaboo 664 | jigg 665 | jigga 666 | jiggabo 667 | jigger 668 | jiggy 669 | jihad 670 | jijjiboo 671 | jimfish 672 | jism 673 | jiz 674 | jizim 675 | jizjuice 676 | jizm 677 | jizz 678 | jizzim 679 | jizzum 680 | joint 681 | juggalo 682 | jugs 683 | junglebunny 684 | kaffer 685 | kaffir 686 | kaffre 687 | kafir 688 | kanake 689 | kid 690 | kigger 691 | kike 692 | kill 693 | killed 694 | killer 695 | killing 696 | kills 697 | kink 698 | kinky 699 | kissass 700 | kkk 701 | knife 702 | knockers 703 | kock 704 | kondum 705 | koon 706 | kotex 707 | krap 708 | krappy 709 | kraut 710 | kum 711 | kumbubble 712 | kumbullbe 713 | kummer 714 | kumming 715 | kumquat 716 | kums 717 | kunilingus 718 | kunnilingus 719 | kunt 720 | ky 721 | kyke 722 | lactate 723 | laid 724 | lapdance 725 | latin 726 | lesbain 727 | lesbayn 728 | lesbian 729 | lesbin 730 | lesbo 731 | lez 732 | lezbe 733 | lezbefriends 734 | lezbo 735 | lezz 736 | lezzo 737 | liberal 738 | libido 739 | licker 740 | lickme 741 | lies 742 | limey 743 | limpdick 744 | limy 745 | lingerie 746 | liquor 747 | livesex 748 | 
loadedgun 749 | lolita 750 | looser 751 | loser 752 | lotion 753 | lovebone 754 | lovegoo 755 | lovegun 756 | lovejuice 757 | lovemuscle 758 | lovepistol 759 | loverocket 760 | lowlife 761 | lsd 762 | lubejob 763 | lucifer 764 | luckycammeltoe 765 | lugan 766 | lynch 767 | macaca 768 | mad 769 | mafia 770 | magicwand 771 | mams 772 | manhater 773 | manpaste 774 | marijuana 775 | mastabate 776 | mastabater 777 | masterbate 778 | masterblaster 779 | mastrabator 780 | masturbate 781 | masturbating 782 | mattressprincess 783 | meatbeatter 784 | meatrack 785 | meth 786 | mexican 787 | mgger 788 | mggor 789 | mickeyfinn 790 | mideast 791 | milf 792 | minority 793 | mockey 794 | mockie 795 | mocky 796 | mofo 797 | moky 798 | moles 799 | molest 800 | molestation 801 | molester 802 | molestor 803 | moneyshot 804 | mooncricket 805 | mormon 806 | moron 807 | moslem 808 | mosshead 809 | mothafuck 810 | mothafucka 811 | mothafuckaz 812 | mothafucked 813 | mothafucker 814 | mothafuckin 815 | mothafucking 816 | mothafuckings 817 | motherfuck 818 | motherfucked 819 | motherfucker 820 | motherfuckin 821 | motherfucking 822 | motherfuckings 823 | motherlovebone 824 | muff 825 | muffdive 826 | muffdiver 827 | muffindiver 828 | mufflikcer 829 | mulatto 830 | muncher 831 | munt 832 | murder 833 | murderer 834 | muslim 835 | naked 836 | narcotic 837 | nasty 838 | nastybitch 839 | nastyho 840 | nastyslut 841 | nastywhore 842 | nazi 843 | necro 844 | negro 845 | negroes 846 | negroid 847 | negro's 848 | nig 849 | niger 850 | nigerian 851 | nigerians 852 | nigg 853 | nigga 854 | niggah 855 | niggaracci 856 | niggard 857 | niggarded 858 | niggarding 859 | niggardliness 860 | niggardliness's 861 | niggardly 862 | niggards 863 | niggard's 864 | niggaz 865 | nigger 866 | niggerhead 867 | niggerhole 868 | niggers 869 | nigger's 870 | niggle 871 | niggled 872 | niggles 873 | niggling 874 | nigglings 875 | niggor 876 | niggur 877 | niglet 878 | nignog 879 | nigr 880 | nigra 881 | nigre 882 | nip 883 | nipple 884 | nipplering 885 | nittit 886 | nlgger 887 | nlggor 888 | nofuckingway 889 | nook 890 | nookey 891 | nookie 892 | noonan 893 | nooner 894 | nude 895 | nudger 896 | nuke 897 | nutfucker 898 | nymph 899 | ontherag 900 | oral 901 | orga 902 | orgasim 903 | orgasm 904 | orgies 905 | orgy 906 | osama 907 | paki 908 | palesimian 909 | palestinian 910 | pansies 911 | pansy 912 | panti 913 | panties 914 | payo 915 | pearlnecklace 916 | peck 917 | pecker 918 | peckerwood 919 | pee 920 | peehole 921 | pee-pee 922 | peepshow 923 | peepshpw 924 | pendy 925 | penetration 926 | peni5 927 | penile 928 | penis 929 | penises 930 | penthouse 931 | period 932 | perv 933 | phonesex 934 | phuk 935 | phuked 936 | phuking 937 | phukked 938 | phukking 939 | phungky 940 | phuq 941 | pi55 942 | picaninny 943 | piccaninny 944 | pickaninny 945 | piker 946 | pikey 947 | piky 948 | pimp 949 | pimped 950 | pimper 951 | pimpjuic 952 | pimpjuice 953 | pimpsimp 954 | pindick 955 | piss 956 | pissed 957 | pisser 958 | pisses 959 | pisshead 960 | pissin 961 | pissing 962 | pissoff 963 | pistol 964 | pixie 965 | pixy 966 | playboy 967 | playgirl 968 | pocha 969 | pocho 970 | pocketpool 971 | pohm 972 | polack 973 | pom 974 | pommie 975 | pommy 976 | poo 977 | poon 978 | poontang 979 | poop 980 | pooper 981 | pooperscooper 982 | pooping 983 | poorwhitetrash 984 | popimp 985 | porchmonkey 986 | porn 987 | pornflick 988 | pornking 989 | porno 990 | pornography 991 | pornprincess 992 | pot 993 | poverty 994 | premature 995 | pric 996 | prick 997 | 
prickhead 998 | primetime 999 | propaganda 1000 | pros 1001 | prostitute 1002 | protestant 1003 | pu55i 1004 | pu55y 1005 | pube 1006 | pubic 1007 | pubiclice 1008 | pud 1009 | pudboy 1010 | pudd 1011 | puddboy 1012 | puke 1013 | puntang 1014 | purinapricness 1015 | puss 1016 | pussie 1017 | pussies 1018 | pussy 1019 | pussycat 1020 | pussyeater 1021 | pussyfucker 1022 | pussylicker 1023 | pussylips 1024 | pussylover 1025 | pussypounder 1026 | pusy 1027 | quashie 1028 | queef 1029 | queer 1030 | quickie 1031 | quim 1032 | ra8s 1033 | rabbi 1034 | racial 1035 | racist 1036 | radical 1037 | radicals 1038 | raghead 1039 | randy 1040 | rape 1041 | raped 1042 | raper 1043 | rapist 1044 | rearend 1045 | rearentry 1046 | rectum 1047 | redlight 1048 | redneck 1049 | reefer 1050 | reestie 1051 | refugee 1052 | reject 1053 | remains 1054 | rentafuck 1055 | republican 1056 | rere 1057 | retard 1058 | retarded 1059 | ribbed 1060 | rigger 1061 | rimjob 1062 | rimming 1063 | roach 1064 | robber 1065 | roundeye 1066 | rump 1067 | russki 1068 | russkie 1069 | sadis 1070 | sadom 1071 | samckdaddy 1072 | sandm 1073 | sandnigger 1074 | satan 1075 | scag 1076 | scallywag 1077 | scat 1078 | schlong 1079 | screw 1080 | screwyou 1081 | scrotum 1082 | scum 1083 | semen 1084 | seppo 1085 | servant 1086 | sex 1087 | sexed 1088 | sexfarm 1089 | sexhound 1090 | sexhouse 1091 | sexing 1092 | sexkitten 1093 | sexpot 1094 | sexslave 1095 | sextogo 1096 | sextoy 1097 | sextoys 1098 | sexual 1099 | sexually 1100 | sexwhore 1101 | sexy 1102 | sexymoma 1103 | sexy-slim 1104 | shag 1105 | shaggin 1106 | shagging 1107 | shat 1108 | shav 1109 | shawtypimp 1110 | sheeney 1111 | shhit 1112 | shinola 1113 | shit 1114 | shitcan 1115 | shitdick 1116 | shite 1117 | shiteater 1118 | shited 1119 | shitface 1120 | shitfaced 1121 | shitfit 1122 | shitforbrains 1123 | shitfuck 1124 | shitfucker 1125 | shitfull 1126 | shithapens 1127 | shithappens 1128 | shithead 1129 | shithouse 1130 | shiting 1131 | shitlist 1132 | shitola 1133 | shitoutofluck 1134 | shits 1135 | shitstain 1136 | shitted 1137 | shitter 1138 | shitting 1139 | shitty 1140 | shoot 1141 | shooting 1142 | shortfuck 1143 | showtime 1144 | sick 1145 | sissy 1146 | sixsixsix 1147 | sixtynine 1148 | sixtyniner 1149 | skank 1150 | skankbitch 1151 | skankfuck 1152 | skankwhore 1153 | skanky 1154 | skankybitch 1155 | skankywhore 1156 | skinflute 1157 | skum 1158 | skumbag 1159 | slant 1160 | slanteye 1161 | slapper 1162 | slaughter 1163 | slav 1164 | slave 1165 | slavedriver 1166 | sleezebag 1167 | sleezeball 1168 | slideitin 1169 | slime 1170 | slimeball 1171 | slimebucket 1172 | slopehead 1173 | slopey 1174 | slopy 1175 | slut 1176 | sluts 1177 | slutt 1178 | slutting 1179 | slutty 1180 | slutwear 1181 | slutwhore 1182 | smack 1183 | smackthemonkey 1184 | smut 1185 | snatch 1186 | snatchpatch 1187 | snigger 1188 | sniggered 1189 | sniggering 1190 | sniggers 1191 | snigger's 1192 | sniper 1193 | snot 1194 | snowback 1195 | snownigger 1196 | sob 1197 | sodom 1198 | sodomise 1199 | sodomite 1200 | sodomize 1201 | sodomy 1202 | sonofabitch 1203 | sonofbitch 1204 | sooty 1205 | sos 1206 | soviet 1207 | spaghettibender 1208 | spaghettinigger 1209 | spank 1210 | spankthemonkey 1211 | sperm 1212 | spermacide 1213 | spermbag 1214 | spermhearder 1215 | spermherder 1216 | spic 1217 | spick 1218 | spig 1219 | spigotty 1220 | spik 1221 | spit 1222 | spitter 1223 | splittail 1224 | spooge 1225 | spreadeagle 1226 | spunk 1227 | spunky 1228 | squaw 1229 | stagg 1230 | stiffy 1231 | strapon 1232 
| stringer 1233 | stripclub 1234 | stroke 1235 | stroking 1236 | stupid 1237 | stupidfuck 1238 | stupidfucker 1239 | suck 1240 | suckdick 1241 | sucker 1242 | suckme 1243 | suckmyass 1244 | suckmydick 1245 | suckmytit 1246 | suckoff 1247 | suicide 1248 | swallow 1249 | swallower 1250 | swalow 1251 | swastika 1252 | sweetness 1253 | syphilis 1254 | taboo 1255 | taff 1256 | tampon 1257 | tang 1258 | tantra 1259 | tarbaby 1260 | tard 1261 | teat 1262 | terror 1263 | terrorist 1264 | teste 1265 | testicle 1266 | testicles 1267 | thicklips 1268 | thirdeye 1269 | thirdleg 1270 | threesome 1271 | threeway 1272 | timbernigger 1273 | tinkle 1274 | tit 1275 | titbitnipply 1276 | titfuck 1277 | titfucker 1278 | titfuckin 1279 | titjob 1280 | titlicker 1281 | titlover 1282 | tits 1283 | tittie 1284 | titties 1285 | titty 1286 | tnt 1287 | toilet 1288 | tongethruster 1289 | tongue 1290 | tonguethrust 1291 | tonguetramp 1292 | tortur 1293 | torture 1294 | tosser 1295 | towelhead 1296 | trailertrash 1297 | tramp 1298 | trannie 1299 | tranny 1300 | transexual 1301 | transsexual 1302 | transvestite 1303 | triplex 1304 | trisexual 1305 | trojan 1306 | trots 1307 | tuckahoe 1308 | tunneloflove 1309 | turd 1310 | turnon 1311 | twat 1312 | twink 1313 | twinkie 1314 | twobitwhore 1315 | uck 1316 | uk 1317 | unfuckable 1318 | upskirt 1319 | uptheass 1320 | upthebutt 1321 | urinary 1322 | urinate 1323 | urine 1324 | usama 1325 | uterus 1326 | vagina 1327 | vaginal 1328 | vatican 1329 | vibr 1330 | vibrater 1331 | vibrator 1332 | vietcong 1333 | violence 1334 | virgin 1335 | virginbreaker 1336 | vomit 1337 | vulva 1338 | wab 1339 | wank 1340 | wanker 1341 | wanking 1342 | waysted 1343 | weapon 1344 | weenie 1345 | weewee 1346 | welcher 1347 | welfare 1348 | wetb 1349 | wetback 1350 | wetspot 1351 | whacker 1352 | whash 1353 | whigger 1354 | whiskey 1355 | whiskeydick 1356 | whiskydick 1357 | whit 1358 | whitenigger 1359 | whites 1360 | whitetrash 1361 | whitey 1362 | whiz 1363 | whop 1364 | whore 1365 | whorefucker 1366 | whorehouse 1367 | wigger 1368 | willie 1369 | williewanker 1370 | willy 1371 | wn 1372 | wog 1373 | women's 1374 | wop 1375 | wtf 1376 | wuss 1377 | wuzzie 1378 | xtc 1379 | xxx 1380 | yankee 1381 | yellowman 1382 | zigabo 1383 | zipperhead -------------------------------------------------------------------------------- /resource/text_context_awc_model/weights/model.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "context", "class_name": "InputLayer", "config": {"batch_input_shape": [2, 30], "dtype": "float32", "sparse": false, "name": "context"}, "inbound_nodes": []}, {"name": "text", "class_name": "InputLayer", "config": {"batch_input_shape": [2, 30], "dtype": "float32", "sparse": false, "name": "text"}, "inbound_nodes": []}, {"name": "embedding_1", "class_name": "Embedding", "config": {"name": "embedding_1", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 12647, "output_dim": 300, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}, "inbound_nodes": [[["context", 0, 0, {}]]]}, {"name": "embedding_2", "class_name": "Embedding", "config": {"name": "embedding_2", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 
12647, "output_dim": 300, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}, "inbound_nodes": [[["text", 0, 0, {}]]]}, {"name": "conv1d_1", "class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 32, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["embedding_1", 0, 0, {}]]]}, {"name": "conv1d_2", "class_name": "Conv1D", "config": {"name": "conv1d_2", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 32, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["embedding_2", 0, 0, {}]]]}, {"name": "lstm_1", "class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_1", 0, 0, {}]]]}, {"name": "lstm_2", "class_name": "LSTM", "config": {"name": "lstm_2", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": true, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_1", 0, 0, {}]]]}, {"name": "lstm_3", "class_name": "LSTM", 
"config": {"name": "lstm_3", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_2", 0, 0, {}]]]}, {"name": "lstm_4", "class_name": "LSTM", "config": {"name": "lstm_4", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": true, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_2", 0, 0, {}]]]}, {"name": "concatenate_1", "class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "axis": -1}, "inbound_nodes": [[["lstm_1", 0, 0, {}], ["lstm_2", 0, 0, {}]]]}, {"name": "concatenate_2", "class_name": "Concatenate", "config": {"name": "concatenate_2", "trainable": true, "axis": -1}, "inbound_nodes": [[["lstm_3", 0, 0, {}], ["lstm_4", 0, 0, {}]]]}, {"name": "awc", "class_name": "InputLayer", "config": {"batch_input_shape": [2, 11], "dtype": "float32", "sparse": false, "name": "awc"}, "inbound_nodes": []}, {"name": "concatenate_3", "class_name": "Concatenate", "config": {"name": "concatenate_3", "trainable": true, "axis": -1}, "inbound_nodes": [[["concatenate_1", 0, 0, {}], ["concatenate_2", 0, 0, {}], ["awc", 0, 0, {}]]]}, {"name": "dense_1", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 64, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["concatenate_3", 0, 0, {}]]]}, {"name": "dropout_1", "class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}, "inbound_nodes": [[["dense_1", 0, 0, {}]]]}, 
{"name": "dense_2", "class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["dropout_1", 0, 0, {}]]]}, {"name": "activation_1", "class_name": "Activation", "config": {"name": "activation_1", "trainable": true, "activation": "softmax"}, "inbound_nodes": [[["dense_2", 0, 0, {}]]]}], "input_layers": [["context", 0, 0], ["text", 0, 0], ["awc", 0, 0]], "output_layers": [["activation_1", 0, 0]]}, "keras_version": "2.1.6", "backend": "tensorflow"} -------------------------------------------------------------------------------- /resource/text_model/weights/model.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": [{"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": true, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 33892, "output_dim": 256, "embeddings_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "normal", "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}}, {"class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 256, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "strides": [3], "pool_size": [3], "padding": "valid"}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 256, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.5, "recurrent_dropout": 0.0, "implementation": 1}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 256, "activation": "sigmoid", 
"use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Activation", "config": {"name": "activation_1", "trainable": true, "activation": "softmax"}}], "keras_version": "2.1.6", "backend": "tensorflow"} -------------------------------------------------------------------------------- /resource/text_model_2D/weights/model.json: -------------------------------------------------------------------------------- 1 | {"class_name": "Sequential", "config": [{"class_name": "Masking", "config": {"name": "masking_1", "trainable": true, "batch_input_shape": [null, 30], "dtype": "float32", "mask_value": 0}}, {"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 34552, "output_dim": 200, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}}, {"class_name": "Reshape", "config": {"name": "reshape_1", "trainable": true, "target_shape": [30, 200, 1]}}, {"class_name": "Conv2D", "config": {"name": "conv2d_1", "trainable": true, "filters": 32, "kernel_size": [5, 1], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling2D", "config": {"name": "max_pooling2d_1", "trainable": true, "pool_size": [2, 1], "padding": "valid", "strides": [2, 1], "data_format": "channels_last"}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Conv2D", "config": {"name": "conv2d_2", "trainable": true, "filters": 64, "kernel_size": [5, 1], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": 
"MaxPooling2D", "config": {"name": "max_pooling2d_2", "trainable": true, "pool_size": [2, 1], "padding": "valid", "strides": [2, 1], "data_format": "channels_last"}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.1.2", "backend": "tensorflow"} -------------------------------------------------------------------------------- /resource/train/.directory: -------------------------------------------------------------------------------- 1 | [Dolphin] 2 | Timestamp=2017,2,6,16,20,50 3 | Version=3 4 | ViewMode=1 5 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/src/__init__.py -------------------------------------------------------------------------------- /src/data_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/src/data_processing/__init__.py -------------------------------------------------------------------------------- /src/data_processing/data_handler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append('../') 4 | from collections import defaultdict 5 | import re 6 | from gensim.models.keyedvectors import KeyedVectors 7 | from gensim.models.wrappers import FastText 8 | import numpy 9 | from nltk.tokenize import TweetTokenizer 10 | import src.data_processing.glove2Word2vecLoader as glove 11 | import itertools 12 | 13 | 14 | # loading the emoji dataset 15 | def load_unicode_mapping(path): 16 | emoji_dict = defaultdict() 17 | with open(path, 'r') as f: 18 | lines = f.readlines() 19 | for line in lines: 20 | tokens = line.strip().split('\t') 21 | emoji_dict[tokens[0]] = tokens[1] 22 | return emoji_dict 23 | 24 | 25 | def load_word2vec(path=None): 26 | word2vecmodel = KeyedVectors.load_word2vec_format(path, binary=True) 27 | return word2vecmodel 28 | 29 | 30 | def load_fasttext(path=None): 31 | word2vecmodel = FastText.load_fasttext_format(path) 32 | return word2vecmodel 33 | 34 | 35 | def 
InitializeWords(word_file_path): 36 | word_dictionary = defaultdict() 37 | 38 | with open(word_file_path, 'r') as f: 39 | lines = f.readlines() 40 | for line in lines: 41 | tokens = line.lower().strip().split('\t') 42 | word_dictionary[tokens[0]] = int(tokens[1]) 43 | 44 | for alphabet in "bcdefghjklmnopqrstuvwxyz": 45 | if (alphabet in word_dictionary): 46 | word_dictionary.__delitem__(alphabet) 47 | 48 | for word in ['ann', 'assis', 49 | 'bz', 50 | 'ch', 'cre', 'ct', 51 | 'di', 52 | 'ed', 'ee', 53 | 'ic', 54 | 'le', 55 | 'ng', 'ns', 56 | 'pr', 'picon', 57 | 'th', 'tle', 'tl', 'tr', 58 | 'um', 59 | 've', 60 | 'yi' 61 | ]: 62 | if (word in word_dictionary): 63 | word_dictionary.__delitem__(word) 64 | 65 | return word_dictionary 66 | 67 | 68 | def normalize_word(word): 69 | temp = word 70 | while True: 71 | w = re.sub(r"([a-zA-Z])\1\1", r"\1\1", temp) 72 | if (w == temp): 73 | break 74 | else: 75 | temp = w 76 | return w 77 | 78 | 79 | def load_split_word(split_word_file_path): 80 | split_word_dictionary = defaultdict() 81 | with open(split_word_file_path, 'r') as f: 82 | lines = f.readlines() 83 | for line in lines: 84 | tokens = line.lower().strip().split('\t') 85 | if (len(tokens) >= 2): 86 | split_word_dictionary[tokens[0]] = tokens[1] 87 | 88 | print('split entry found:', len(split_word_dictionary.keys())) 89 | return split_word_dictionary 90 | 91 | 92 | def split_hashtags(term, wordlist, split_word_list, dump_file=''): 93 | # print('term::',term) 94 | 95 | if (len(term.strip()) == 1): 96 | return [''] 97 | 98 | if (split_word_list != None and term.lower() in split_word_list): 99 | # print('found') 100 | return split_word_list.get(term.lower()).split(' ') 101 | else: 102 | print(term) 103 | 104 | # discarding # if exists 105 | if (term.startswith('#')): 106 | term = term[1:] 107 | 108 | if (wordlist != None and term.lower() in wordlist): 109 | return [term.lower()] 110 | 111 | words = [] 112 | # max freq 113 | penalty = -69971 114 | max_coverage = penalty 115 | 116 | split_words_count = 6 117 | # checking camel cases 118 | term = re.sub(r'([0-9]+)', r' \1', term) 119 | term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term) 120 | term = re.sub(r'([A-Z][^A-Z ]+)', r' \1', term.strip()) 121 | term = re.sub(r'([A-Z]{2,})+', r' \1', term) 122 | words = term.strip().split(' ') 123 | 124 | n_splits = 0 125 | 126 | if (len(words) < 3): 127 | # splitting lower case and uppercase words upto 5 words 128 | chars = [c for c in term.lower()] 129 | 130 | found_all_words = False 131 | 132 | while (n_splits < split_words_count and not found_all_words): 133 | for idx in itertools.combinations(range(0, len(chars)), n_splits): 134 | output = numpy.split(chars, idx) 135 | line = [''.join(o) for o in output] 136 | 137 | score = (1. / len(line)) * sum( 138 | [wordlist.get( 139 | word.strip()) if word.strip() in wordlist else 0. 
if word.strip().isnumeric() else penalty for 140 | word in line]) 141 | 142 | if (score > max_coverage): 143 | words = line 144 | max_coverage = score 145 | 146 | line_is_valid_word = [word.strip() in wordlist if not word.isnumeric() else True for word in line] 147 | 148 | if (all(line_is_valid_word)): 149 | found_all_words = True 150 | 151 | # uncomment to debug hashtag splitting 152 | # print(line, score, line_is_valid_word) 153 | 154 | n_splits = n_splits + 1 155 | 156 | # removing hashtag sign 157 | words = [str(s) for s in words] 158 | 159 | # dumping splits for debug 160 | with open(dump_file, 'a') as f: 161 | if (term != '' and len(words) > 0): 162 | f.write('#' + str(term).strip() + '\t' + ' '.join(words) + '\t' + str(n_splits) + '\n') 163 | 164 | return words 165 | 166 | 167 | def load_abbreviation(path='../resource/abbreviations.txt'): 168 | abbreviation_dict = defaultdict() 169 | with open(path) as f: 170 | lines = f.readlines() 171 | for line in lines: 172 | token = line.lower().strip().split('\t') 173 | abbreviation_dict[token[0]] = token[1] 174 | return abbreviation_dict 175 | 176 | 177 | def filter_text(text, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=False, 178 | split_hashtag=False, 179 | ignore_profiles=False, 180 | replace_emoji=True): 181 | filtered_text = [] 182 | 183 | filter_list = ['/', '-', '=', '+', '…', '\\', '(', ')', '&', ':'] 184 | 185 | for t in text: 186 | word_tokens = None 187 | 188 | # discarding symbols 189 | # if (str(t).lower() in filter_list): 190 | # continue 191 | 192 | # ignoring profile information if ignore_profiles is set 193 | if (ignore_profiles and str(t).startswith("@")): 194 | continue 195 | 196 | # ignoring links 197 | if (str(t).startswith('http')): 198 | continue 199 | 200 | # ignoring sarcastic marker 201 | # uncomment the following line for Fracking sarcasm using neural network 202 | # if (str(t).lower() in ['#sarcasm','#sarcastic', '#yeahright','#not']): 203 | # continue 204 | 205 | # for onlinesarcasm 206 | # comment if you are running the code for Fracking sarcasm using neural network 207 | if (str(t).lower() in ['#sarcasm']): 208 | continue 209 | 210 | # replacing emoji with its unicode description 211 | if (replace_emoji): 212 | if (t in emoji_dict): 213 | t = emoji_dict.get(t).split('_') 214 | filtered_text.extend(t) 215 | continue 216 | 217 | # splitting hastags 218 | if (split_hashtag and str(t).startswith("#")): 219 | splits = split_hashtags(t, word_list, split_word_list, dump_file='../resource/hastash_split_dump.txt') 220 | # adding the hashtags 221 | if (splits != None): 222 | filtered_text.extend([s for s in splits if (not filtered_text.__contains__(s))]) 223 | continue 224 | 225 | # removes repeatation of letters 226 | if (normalize_text): 227 | t = normalize_word(t) 228 | 229 | # expands the abbreviation 230 | if (t in abbreviation_dict): 231 | tokens = abbreviation_dict.get(t).split(' ') 232 | filtered_text.extend(tokens) 233 | continue 234 | 235 | # appends the text 236 | filtered_text.append(t) 237 | 238 | return filtered_text 239 | 240 | 241 | def parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=False, 242 | split_hashtag=False, 243 | ignore_profiles=False, 244 | lowercase=False, replace_emoji=True, n_grams=None, at_character=False): 245 | data = [] 246 | for i, line in enumerate(lines): 247 | if (i % 100 == 0): 248 | print(str(i) + '...', end='', flush=True) 249 | 250 | try: 251 | 252 | # convert the line to lowercase 253 | if (lowercase): 254 | 
line = line.lower() 255 | 256 | # split into token 257 | token = line.split('\t') 258 | 259 | # ID 260 | id = token[0] 261 | 262 | # label 263 | label = int(token[1].strip()) 264 | 265 | # tweet text 266 | target_text = TweetTokenizer().tokenize(token[2].strip()) 267 | if (at_character): 268 | target_text = [c for c in token[2].strip()] 269 | 270 | if (n_grams != None): 271 | n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams)) 272 | target_text.extend(['_'.join(n) for n in n_grams_list]) 273 | 274 | # filter text 275 | target_text = filter_text(target_text, word_list, split_word_list, emoji_dict, abbreviation_dict, 276 | normalize_text, 277 | split_hashtag, 278 | ignore_profiles, replace_emoji=replace_emoji) 279 | 280 | # awc dimensions 281 | dimensions = [] 282 | if (len(token) > 3 and token[3].strip() != 'NA'): 283 | dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')] 284 | 285 | # context tweet 286 | context = [] 287 | if (len(token) > 4): 288 | if (token[4] != 'NA'): 289 | context = TweetTokenizer().tokenize(token[4].strip()) 290 | context = filter_text(context, word_list, split_word_list, emoji_dict, abbreviation_dict, 291 | normalize_text, 292 | split_hashtag, 293 | ignore_profiles, replace_emoji=replace_emoji) 294 | 295 | # author 296 | author = 'NA' 297 | if (len(token) > 5): 298 | author = token[5] 299 | 300 | if (len(target_text) != 0): 301 | # print((label, target_text, dimensions, context, author)) 302 | data.append((id, label, target_text, dimensions, context, author)) 303 | except: 304 | raise 305 | print('') 306 | return data 307 | 308 | 309 | def load_resources(word_file_path, split_word_path, emoji_file_path, split_hashtag=False, replace_emoji=True): 310 | word_list = None 311 | emoji_dict = None 312 | 313 | # load split files 314 | split_word_list = load_split_word(split_word_path) 315 | 316 | # load word dictionary 317 | if (split_hashtag): 318 | word_list = InitializeWords(word_file_path) 319 | 320 | if (replace_emoji): 321 | emoji_dict = load_unicode_mapping(emoji_file_path) 322 | 323 | abbreviation_dict = load_abbreviation() 324 | 325 | return word_list, emoji_dict, split_word_list, abbreviation_dict 326 | 327 | 328 | def loaddata(filename, word_file_path, split_word_path, emoji_file_path, normalize_text=False, split_hashtag=False, 329 | ignore_profiles=False, 330 | lowercase=True, replace_emoji=True, n_grams=None, at_character=False): 331 | 332 | word_list, emoji_dict, split_word_list, abbreviation_dict = load_resources(word_file_path, split_word_path, 333 | emoji_file_path, 334 | split_hashtag=split_hashtag, 335 | replace_emoji=replace_emoji) 336 | lines = open(filename, 'r').readlines() 337 | 338 | data = parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=normalize_text, 339 | split_hashtag=split_hashtag, 340 | ignore_profiles=ignore_profiles, lowercase=lowercase, replace_emoji=replace_emoji, 341 | n_grams=n_grams, at_character=at_character) 342 | return data 343 | 344 | 345 | def build_vocab(data, without_dimension=True, ignore_context=False, min_freq=0): 346 | vocab = defaultdict(int) 347 | vocab_freq = defaultdict(int) 348 | 349 | total_words = 1 350 | if (not without_dimension): 351 | for i in range(1, 101): 352 | vocab_freq[str(i)] = 0 353 | # vocab[str(i)] = total_words 354 | # total_words = total_words + 1 355 | 356 | for sentence_no, token in enumerate(data): 357 | for word in token[2]: 358 | if (word not in vocab_freq): 359 | # vocab[word] = total_words 360 | # 
total_words = total_words + 1 361 | vocab_freq[word] = 0 362 | vocab_freq[word] = vocab_freq.get(word) + 1 363 | 364 | if (not without_dimension): 365 | for word in token[3]: 366 | # if (word not in vocab_freq): 367 | # vocab[word] = total_words 368 | # total_words = total_words + 1 369 | vocab_freq[word] = vocab_freq.get(word) + 1 370 | 371 | if (ignore_context == False): 372 | for word in token[4]: 373 | if (not word in vocab): 374 | # vocab[word] = total_words 375 | # total_words = total_words + 1 376 | vocab_freq[word] = 0 377 | vocab_freq[word] = vocab_freq.get(word) + 1 378 | 379 | for k, v in vocab_freq.items(): 380 | if (v >= min_freq): 381 | vocab[k] = total_words 382 | total_words = total_words + 1 383 | 384 | return vocab 385 | 386 | 387 | def build_reverse_vocab(vocab): 388 | rev_vocab = defaultdict(str) 389 | for k, v in vocab.items(): 390 | rev_vocab[v] = k 391 | return rev_vocab 392 | 393 | 394 | def build_auxiliary_feature(data): 395 | aux = [] 396 | for id, label, line, dimensions, context, author in data: 397 | aux.append([float(line.count('!')), float(line.count('?')), float(line.count('.')), 398 | sum([1.0 if c.isupper() else 0.0 for c in line]), float(line.count('"'))]) 399 | 400 | return numpy.asarray(aux) 401 | 402 | 403 | def vectorize_word_dimension(data, vocab, drop_dimension_index=None, verbose=False): 404 | X = [] 405 | Y = [] 406 | D = [] 407 | C = [] 408 | A = [] 409 | 410 | known_words_set = set() 411 | unknown_words_set = set() 412 | 413 | tokens = 0 414 | token_coverage = 0 415 | 416 | for id, label, line, dimensions, context, author in data: 417 | vec = [] 418 | context_vec = [] 419 | if (len(dimensions) != 0): 420 | dvec = [vocab.get(d) for d in dimensions] 421 | else: 422 | dvec = [vocab.get('unk')] * 11 423 | 424 | if drop_dimension_index != None: 425 | dvec.pop(drop_dimension_index) 426 | 427 | # tweet 428 | for words in line: 429 | tokens = tokens + 1 430 | if (words in vocab): 431 | vec.append(vocab[words]) 432 | token_coverage = token_coverage + 1 433 | known_words_set.add(words) 434 | else: 435 | vec.append(vocab['unk']) 436 | unknown_words_set.add(words) 437 | # context_tweet 438 | if (len(context) != 0): 439 | for words in line: 440 | tokens = tokens + 1 441 | if (words in vocab): 442 | context_vec.append(vocab[words]) 443 | token_coverage = token_coverage + 1 444 | known_words_set.add(words) 445 | else: 446 | context_vec.append(vocab['unk']) 447 | unknown_words_set.add(words) 448 | else: 449 | context_vec = [vocab['unk']] 450 | 451 | X.append(vec) 452 | Y.append(label) 453 | D.append(dvec) 454 | C.append(context_vec) 455 | A.append(author) 456 | 457 | if verbose: 458 | print('Token coverage:', token_coverage / float(tokens)) 459 | print('Word coverage:', len(known_words_set) / float(len(vocab.keys()))) 460 | 461 | return numpy.asarray(X), numpy.asarray(Y), numpy.asarray(D), numpy.asarray(C), numpy.asarray(A) 462 | 463 | 464 | def pad_sequence_1d(sequences, maxlen=None, dtype='float32', padding='pre', truncating='pre', value=0.): 465 | X = [vectors for vectors in sequences] 466 | 467 | nb_samples = len(X) 468 | 469 | x = (numpy.zeros((nb_samples, maxlen)) * value).astype(dtype) 470 | 471 | for idx, s in enumerate(X): 472 | if truncating == 'pre': 473 | trunc = s[-maxlen:] 474 | elif truncating == 'post': 475 | trunc = s[:maxlen] 476 | else: 477 | raise ValueError("Truncating type '%s' not understood" % padding) 478 | 479 | if padding == 'post': 480 | x[idx, :len(trunc)] = trunc 481 | elif padding == 'pre': 482 | x[idx, -len(trunc):] = trunc 483 | 
else: 484 | raise ValueError("Padding type '%s' not understood" % padding) 485 | 486 | return x 487 | 488 | 489 | def write_vocab(filepath, vocab): 490 | with open(filepath, 'w') as fw: 491 | for key, value in vocab.items(): 492 | fw.write(str(key) + '\t' + str(value) + '\n') 493 | 494 | 495 | def get_fasttext_weight(vocab, n=300, path=None): 496 | word2vecmodel = load_word2vec(path=path) 497 | emb_weights = numpy.zeros((len(vocab.keys()) + 1, n)) 498 | for k, v in vocab.items(): 499 | if (word2vecmodel.__contains__(k)): 500 | emb_weights[v, :] = word2vecmodel[k][:n] 501 | 502 | return emb_weights 503 | 504 | 505 | def get_word2vec_weight(vocab, n=300, path=None): 506 | word2vecmodel = load_word2vec(path=path) 507 | emb_weights = numpy.zeros((len(vocab.keys()) + 1, n)) 508 | for k, v in vocab.items(): 509 | if (word2vecmodel.__contains__(k)): 510 | emb_weights[v, :] = word2vecmodel[k][:n] 511 | 512 | return emb_weights 513 | 514 | 515 | def load_glove_model(vocab, n=200, glove_path='/home/glove/glove.twitter.27B/glove.twitter.27B.200d.txt'): 516 | word2vecmodel = glove.load_glove_word2vec(glove_path) 517 | 518 | embedding_matrix = numpy.zeros((len(vocab.keys()) + 1, n)) 519 | for k, v in vocab.items(): 520 | embedding_vector = word2vecmodel.get(k) 521 | if embedding_vector is not None: 522 | embedding_matrix[v] = embedding_vector 523 | 524 | return embedding_matrix 525 | 526 | 527 | def add_ngram(sequences, token_indice, ngram_range=2): 528 | """ 529 | Augment the input list of list (sequences) by appending n-grams values. 530 | Example: adding bi-gram 531 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 532 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017} 533 | >>> add_ngram(sequences, token_indice, ngram_range=2) 534 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]] 535 | Example: adding tri-gram 536 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]] 537 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018} 538 | >>> add_ngram(sequences, token_indice, ngram_range=3) 539 | [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]] 540 | """ 541 | new_sequences = [] 542 | for input_list in sequences: 543 | new_list = input_list[:] 544 | for i in range(len(new_list) - ngram_range + 1): 545 | for ngram_value in range(2, ngram_range + 1): 546 | ngram = tuple(new_list[i:i + ngram_value]) 547 | if ngram in token_indice: 548 | new_list.append(token_indice[ngram]) 549 | new_sequences.append(new_list) 550 | 551 | return new_sequences 552 | 553 | 554 | def create_ngram_set(input_list, ngram_value=2): 555 | """ 556 | Extract a set of n-grams from a list of integers. 557 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) 558 | {(4, 9), (4, 1), (1, 4), (9, 4)} 559 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) 560 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)] 561 | """ 562 | return set(zip(*[input_list[i:] for i in range(ngram_value)])) 563 | 564 | 565 | def prepare_fasttext(x_train, x_test, max_features=20000, ngram_range=2): 566 | if ngram_range > 1: 567 | print('Adding {}-gram features'.format(ngram_range)) 568 | # Create set of unique n-gram from the training set. 569 | ngram_set = set() 570 | for input_list in x_train: 571 | for i in range(2, ngram_range + 1): 572 | set_of_ngram = create_ngram_set(input_list, ngram_value=i) 573 | ngram_set.update(set_of_ngram) 574 | 575 | # Dictionary mapping n-gram token to a unique integer. 576 | # Integer values are greater than max_features in order 577 | # to avoid collision with existing features. 
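        # Illustrative example with the default max_features of 20000: if the
        # training set yields the bigrams (1, 3) and (9, 2), they are mapped to
        # the indices 20001 and 20002 (in arbitrary set order), and max_features
        # is then raised to 20003, one past the largest index actually in use.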
578 | start_index = max_features + 1 579 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)} 580 | indice_token = {token_indice[k]: k for k in token_indice} 581 | 582 | # max_features is the highest integer that could be found in the dataset. 583 | max_features = numpy.max(list(indice_token.keys())) + 1 584 | 585 | # Augmenting x_train and x_test with n-grams features 586 | x_train = add_ngram(x_train, token_indice, ngram_range) 587 | x_test = add_ngram(x_test, token_indice, ngram_range) 588 | print('Average train sequence length: {}'.format(numpy.mean(list(map(len, x_train)), dtype=int))) 589 | print('Average test sequence length: {}'.format(numpy.mean(list(map(len, x_test)), dtype=int))) 590 | -------------------------------------------------------------------------------- /src/data_processing/glove2Word2vecLoader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import shutil 3 | import hashlib 4 | from sys import platform 5 | 6 | import gensim 7 | 8 | 9 | def prepend_line(infile, outfile, line): 10 | with open(infile, 'r') as old: 11 | with open(outfile, 'w') as new: 12 | new.write(str(line) + "\n") 13 | shutil.copyfileobj(old, new) 14 | 15 | 16 | def prepend_slow(infile, outfile, line): 17 | with open(infile, 'r') as fin: 18 | with open(outfile, 'w') as fout: 19 | fout.write(line + "\n") 20 | for line in fin: 21 | fout.write(line) 22 | 23 | 24 | def checksum(filename): 25 | BLOCKSIZE = 65536 26 | hasher = hashlib.md5() 27 | with open(filename, 'rb') as afile: 28 | buf = afile.read(BLOCKSIZE) 29 | while len(buf) > 0: 30 | hasher.update(buf) 31 | buf = afile.read(BLOCKSIZE) 32 | return hasher.hexdigest() 33 | 34 | 35 | # Pre-computed glove files values. 36 | pretrain_num_lines = {"glove.840B.300d.txt": 2196017, "glove.42B.300d.txt": 1917494} 37 | 38 | pretrain_checksum = { 39 | "glove.6B.300d.txt": "b78f53fb56ec1ce9edc367d2e6186ba4", 40 | "glove.twitter.27B.50d.txt": "6e8369db39aa3ea5f7cf06c1f3745b06", 41 | "glove.42B.300d.txt": "01fcdb413b93691a7a26180525a12d6e", 42 | "glove.6B.50d.txt": "0fac3659c38a4c0e9432fe603de60b12", 43 | "glove.6B.100d.txt": "dd7f3ad906768166883176d69cc028de", 44 | "glove.twitter.27B.25d.txt": "f38598c6654cba5e6d0cef9bb833bdb1", 45 | "glove.6B.200d.txt": "49fa83e4a287c42c6921f296a458eb80", 46 | "glove.840B.300d.txt": "eec7d467bccfa914726b51aac484d43a", 47 | "glove.twitter.27B.100d.txt": "ccbdddec6b9610196dd2e187635fee63", 48 | "glove.twitter.27B.200d.txt": "e44cdc3e10806b5137055eeb08850569", 49 | } 50 | 51 | 52 | def check_num_lines_in_glove(filename, check_checksum=False): 53 | if check_checksum: 54 | assert checksum(filename) == pretrain_checksum[filename] 55 | if filename.startswith('glove.6B.'): 56 | return 400000 57 | elif filename.startswith('glove.twitter.27B.'): 58 | return 1193514 59 | else: 60 | return pretrain_num_lines[filename] 61 | 62 | 63 | def load_glove_word2vec(filename): 64 | 65 | # load the whole embedding into memory 66 | embeddings_index = dict() 67 | f = open(filename) 68 | for line in f: 69 | values = line.split() 70 | word = values[0] 71 | coefs = np.asarray(values[1:], dtype='float32') 72 | embeddings_index[word] = coefs 73 | f.close() 74 | print('Loaded %s word vectors.' 
% len(embeddings_index)) 75 | 76 | return embeddings_index 77 | -------------------------------------------------------------------------------- /src/sarcasm_context_moods.py: -------------------------------------------------------------------------------- 1 | import os 2 | import collections 3 | import random 4 | import sys 5 | 6 | sys.path.append('../../') 7 | 8 | import time 9 | import numpy 10 | 11 | numpy.random.seed(1337) 12 | 13 | from keras.layers.wrappers import TimeDistributed 14 | from keras import backend as K, regularizers 15 | from sklearn import metrics 16 | from keras.models import model_from_json 17 | from keras.layers.core import Dropout, Dense, Activation, Flatten 18 | from keras.layers.embeddings import Embedding 19 | from keras.layers.recurrent import LSTM 20 | from keras.layers.convolutional import Convolution1D 21 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 22 | 23 | from keras.layers.merge import add, concatenate 24 | from keras.models import Model 25 | from keras.utils import np_utils 26 | from keras.layers import Input 27 | import src.data_processing.data_handler as dh 28 | from collections import defaultdict 29 | 30 | 31 | class sarcasm_model(): 32 | _train_file = None 33 | _gold_data_path = None 34 | _validation_file = None 35 | _tweet_file = None 36 | # test_debug = None 37 | _output_file = None 38 | _model_file = None 39 | _word_file_path = None 40 | _vocab_file_path = None 41 | _input_weight_file_path = None 42 | _vocab = None 43 | _line_maxlen = None 44 | 45 | def __init__(self): 46 | self._train_file = None 47 | self._test_file = None 48 | self._validation_file = None 49 | self._tweet_file = None 50 | self._output_file = None 51 | self._model_file = None 52 | self._word_file_path = None 53 | self._vocab_file_path = None 54 | self._input_weight_file_path = None 55 | self._vocab = None 56 | 57 | self._line_maxlen = 30 58 | 59 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256, 60 | dimension_length=11, trainable=True, batch_size=1): 61 | 62 | print('Building model...') 63 | 64 | context_input = Input(name='context', batch_shape=(batch_size, maxlen)) 65 | 66 | if (len(c_emb_weights) == 0): 67 | c_emb = Embedding(vocab_size, 64, input_length=maxlen, embeddings_initializer='glorot_normal', 68 | trainable=trainable)(context_input) 69 | else: 70 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights], 71 | trainable=trainable)(context_input) 72 | 73 | c_cnn1 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal', activation='sigmoid', 74 | padding='valid', input_shape=(1, maxlen))(c_emb) 75 | 76 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', 77 | dropout=0.25)(c_cnn1) 78 | 79 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.25, 80 | go_backwards=True)(c_cnn1) 81 | 82 | c_merged = concatenate([c_lstm1, c_lstm2], axis=-1) 83 | 84 | print(c_merged) 85 | 86 | 87 | text_input = Input(name='text', batch_shape=(batch_size, maxlen)) 88 | 89 | if (len(emb_weights) == 0): 90 | emb = Embedding(vocab_size, 64, input_length=maxlen, embeddings_initializer='glorot_normal', 91 | trainable=trainable)(text_input) 92 | else: 93 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights], 94 | trainable=trainable)(text_input) 95 | 96 | t_cnn1 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal', 97 | 
activation='relu', padding='valid', input_shape=(1, maxlen))(emb) 98 | 99 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal', 100 | bias_initializer='he_normal', activation='sigmoid', 101 | dropout=0.25)(t_cnn1) 102 | 103 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal', 104 | bias_initializer='he_normal', activation='sigmoid', 105 | dropout=0.25, 106 | go_backwards=True)(t_cnn1) 107 | 108 | t_merged = concatenate([t_lstm1, t_lstm2], axis=-1) 109 | 110 | # t_merged = Reshape((-1,int(hidden_units / 8)))(t_merged) 111 | 112 | awc_input = Input(name='awc', batch_shape=(batch_size, 11)) 113 | 114 | eaw = Embedding(101, int(hidden_units / 8), input_length=dimension_length, 115 | embeddings_initializer='glorot_normal', 116 | trainable=True)(awc_input) 117 | 118 | merged = concatenate([c_merged, t_merged, awc_input], axis=-1) 119 | 120 | # flat_model = Flatten()(merged) 121 | 122 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged) 123 | dnn_1 = Dropout(0.25)(dnn_1) 124 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1) 125 | 126 | softmax = Activation('softmax')(dnn_2) 127 | 128 | model = Model(inputs=[context_input, text_input, awc_input], outputs=softmax) 129 | 130 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 131 | print('No of parameter:', model.count_params()) 132 | 133 | print(model.summary()) 134 | 135 | return model 136 | 137 | 138 | class train_model(sarcasm_model): 139 | train = None 140 | validation = None 141 | 142 | def load_train_validation_test_data(self): 143 | print("Loading resource...") 144 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 145 | self._emoji_file_path, normalize_text=True, 146 | split_hashtag=True, 147 | ignore_profiles=False) 148 | 149 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 150 | self._emoji_file_path, 151 | normalize_text=True, 152 | split_hashtag=True, 153 | ignore_profiles=False) 154 | 155 | if (self._test_file != None): 156 | self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path, 157 | self._emoji_file_path, normalize_text=True, 158 | split_hashtag=True, 159 | ignore_profiles=True) 160 | 161 | def split_train_validation(self, train, ratio=.1): 162 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))]) 163 | print(len(test_indices)) 164 | train_data = [] 165 | validation_data = [] 166 | for i, t in enumerate(train): 167 | if (test_indices.__contains__(i)): 168 | validation_data.append(t) 169 | else: 170 | train_data.append(t) 171 | return train_data, validation_data 172 | 173 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 174 | vocab_file, 175 | output_file, 176 | word2vec_path=None): 177 | sarcasm_model.__init__(self) 178 | 179 | self._train_file = train_file 180 | self._validation_file = validation_file 181 | self._word_file_path = word_file_path 182 | self._split_word_file_path = split_word_path 183 | self._emoji_file_path = emoji_file_path 184 | self._model_file = model_file 185 | self._vocab_file_path = vocab_file 186 | self._output_file = output_file 187 | 188 | self.load_train_validation_test_data() 189 | 190 | batch_size = 2 191 | 192 | self.train = self.train[:-(len(self.train) % batch_size)] 193 | self.validation = 
self.validation[:-(len(self.validation) % batch_size)] 194 | 195 | print(self._line_maxlen) 196 | self._vocab = dh.build_vocab(self.train, ignore_context=False) 197 | self._vocab['unk'] = len(self._vocab.keys()) + 1 198 | 199 | print(len(self._vocab.keys()) + 1) 200 | print('unk::', self._vocab['unk']) 201 | 202 | dh.write_vocab(self._vocab_file_path, self._vocab) 203 | 204 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None) 205 | 206 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None) 207 | 208 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 209 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen) 210 | D = dh.pad_sequence_1d(D, maxlen=11) 211 | 212 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 213 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen) 214 | tD = dh.pad_sequence_1d(tD, maxlen=11) 215 | 216 | hidden_units = 64 217 | dimension_size = 300 218 | 219 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size, 220 | path=word2vec_path) 221 | cW = W 222 | 223 | print('Word2vec obtained....') 224 | 225 | ratio = self.calculate_label_ratio(Y) 226 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 227 | 228 | print('ratio', ratio) 229 | 230 | dimension_vocab = numpy.unique(D) 231 | print(len(dimension_vocab)) 232 | 233 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 234 | 235 | print('train_X', X.shape) 236 | print('train_C', C.shape) 237 | print('train_D', D.shape) 238 | print('train_Y', Y.shape) 239 | 240 | print('validation_X', tX.shape) 241 | print('validation_C', tC.shape) 242 | print('validation_D', tD.shape) 243 | print('validation_Y', tY.shape) 244 | 245 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW, 246 | hidden_units=hidden_units, trainable=False, dimension_length=11, 247 | batch_size=batch_size) 248 | 249 | open(self._model_file + 'model.json', 'w').write(model.to_json()) 250 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss') 251 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5', 252 | # save_best_only=False) 253 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1) 254 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto', 255 | epsilon=0.0001, 256 | cooldown=0, min_lr=0.000001) 257 | 258 | model.fit([C, X, D], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX, tD], tY), shuffle=True, 259 | callbacks=[save_best, lr_tuner], class_weight=ratio) 260 | 261 | def get_maxlen(self): 262 | return max(map(len, (x for _, x in self.train + self.validation))) 263 | 264 | def write_vocab(self): 265 | with open(self._vocab_file_path, 'w') as fw: 266 | for key, value in self._vocab.iteritems(): 267 | fw.write(str(key) + '\t' + str(value) + '\n') 268 | 269 | def calculate_label_ratio(self, labels, ): 270 | return collections.Counter(labels) 271 | 272 | 273 | class test_model(sarcasm_model): 274 | test = None 275 | model = None 276 | 277 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file): 278 | print('initializing...') 279 | sarcasm_model.__init__(self) 280 | 281 | self._word_file_path = word_file_path 282 | self._model_file = model_file 283 | self._vocab_file_path = vocab_file_path 284 | self._output_file = output_file 285 | 286 | # self._line_maxlen = 45 287 | 
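        # _line_maxlen is inherited from sarcasm_model.__init__ (30 by default);
        # it must match the maxlen used to pad the sequences at training time.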
print('test_maxlen', self._line_maxlen) 288 | 289 | def predict_cross_validation(self, tC, tX, tD, test): 290 | self.__predict_model([tC, tX, tD], test) 291 | 292 | def load_trained_model(self, weight_file='model.json.hdf5'): 293 | start = time.time() 294 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file) 295 | end = time.time() 296 | print('model loading time::', (end - start)) 297 | 298 | def __load_model(self, model_path, model_weight_path): 299 | self.model = model_from_json(open(model_path).read()) 300 | print('model loaded from file...') 301 | self.model.load_weights(model_weight_path) 302 | print('model weights loaded from file...') 303 | 304 | def load_vocab(self): 305 | vocab = defaultdict() 306 | with open(self._vocab_file_path, 'r') as f: 307 | for line in f.readlines(): 308 | key, value = line.split('\t') 309 | vocab[key] = value 310 | 311 | return vocab 312 | 313 | def predict(self, test_file, verbose=False): 314 | start = time.time() 315 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True, 316 | split_hashtag=True, 317 | ignore_profiles=False) 318 | end = time.time() 319 | if (verbose == True): 320 | print('test resource loading time::', (end - start)) 321 | 322 | self._vocab = self.load_vocab() 323 | 324 | start = time.time() 325 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 326 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 327 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen) 328 | tD = dh.pad_sequence_1d(tD, maxlen=11) 329 | 330 | end = time.time() 331 | if (verbose == True): 332 | print('test resource preparation time::', (end - start)) 333 | 334 | self.__predict_model([tC, tX, tD], self.test) 335 | 336 | def __predict_model(self, tX, test): 337 | prediction_probability = self.model.predict_file(tX, batch_size=8, verbose=1) 338 | 339 | y = [] 340 | y_pred = [] 341 | 342 | fd = open(self._output_file + '.analysis', 'w') 343 | for i, (label) in enumerate(prediction_probability): 344 | gold_label = test[i][0] 345 | words = test[i][1] 346 | dimensions = test[i][2] 347 | context = test[i][3] 348 | author = test[i][4] 349 | 350 | predicted = numpy.argmax(prediction_probability[i]) 351 | 352 | y.append(int(gold_label)) 353 | y_pred.append(predicted) 354 | 355 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t' 356 | + str(gold_label) + '\t' 357 | + str(predicted) + '\t' 358 | + ' '.join(words) + '\t' 359 | + str(dimensions) + '\t' 360 | + ' '.join(context)) 361 | 362 | fd.write('\n') 363 | 364 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 365 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 366 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 367 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 368 | print('f_score::', metrics.classification_report(y, y_pred)) 369 | 370 | fd.close() 371 | 372 | 373 | if __name__ == "__main__": 374 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 375 | train_file = basepath + '/resource/train/Train_context_moods.txt' 376 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt' 377 | test_file = basepath + '/resource/test/Test_context_AW.txt' 378 | word_file_path = basepath + '/resource/word_list_freq.txt' 379 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt' 380 | model_file = basepath + '/resource/text_context_awc_model/weights/' 381 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt' 
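    # word_split.txt stores pre-computed hashtag splits (hashtag -> words) and
    # emoji_unicode_names_final.txt maps each emoji to its unicode description;
    # both are consumed by src.data_processing.data_handler during preprocessing.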
382 | split_word_path = basepath + '/resource/word_split.txt' 383 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 384 | 385 | # word2vec path 386 | word2vec_path = '/home/aghosh/backups/GoogleNews-vectors-negative300.bin' 387 | 388 | tr = train_model(train_file=train_file, validation_file=validation_file, word_file_path=word_file_path, 389 | split_word_path=split_word_path, emoji_file_path=emoji_file_path, model_file=model_file, 390 | vocab_file=vocab_file_path, output_file=output_file, 391 | word2vec_path=word2vec_path) 392 | 393 | with K.get_session(): 394 | t = test_model(word_file_path, model_file, vocab_file_path, output_file) 395 | t.load_trained_model() 396 | t.predict(test_file) 397 | -------------------------------------------------------------------------------- /src/sarcasm_detection_model_CNN_DNN_2D.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from keras.engine import InputLayer 5 | from keras.layers.normalization import BatchNormalization 6 | from keras.layers.wrappers import TimeDistributed 7 | 8 | sys.path.append('../') 9 | import collections 10 | import time 11 | import numpy 12 | 13 | numpy.random.seed(1337) 14 | from sklearn import metrics 15 | from keras.models import Sequential, model_from_json 16 | from keras.layers import Masking, Bidirectional, GlobalAveragePooling2D 17 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten 18 | from keras.layers.embeddings import Embedding 19 | from keras.layers.recurrent import LSTM 20 | from keras.layers.convolutional import Convolution1D, Convolution2D, MaxPooling2D 21 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau 22 | from keras.callbacks import EarlyStopping 23 | from keras.optimizers import Adam 24 | from keras.utils import np_utils 25 | from collections import defaultdict 26 | import src.data_processing.data_handler as dh 27 | 28 | import keras.backend as K 29 | 30 | 31 | class sarcasm_model(): 32 | _train_file = None 33 | _test_file = None 34 | _tweet_file = None 35 | _output_file = None 36 | _model_file = None 37 | _word_file_path = None 38 | _vocab_file_path = None 39 | _input_weight_file_path = None 40 | _vocab = None 41 | _line_maxlen = None 42 | 43 | def __init__(self): 44 | self._line_maxlen = 30 45 | 46 | def _build_network(self, vocab_size, maxlen, emb_weights=[], hidden_units=256, trainable=False): 47 | print('Build model...') 48 | 49 | model = Sequential() 50 | 51 | model.add(Masking(mask_value=0, input_shape=(maxlen,))) 52 | 53 | if (len(emb_weights) == 0): 54 | model.add(Embedding(vocab_size, 20, input_length=maxlen, embeddings_initializer='he_normal', 55 | trainable=trainable, mask_zero=True)) 56 | else: 57 | model.add(Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights], 58 | trainable=trainable)) 59 | 60 | model.add(Reshape((model.output_shape[1], model.output_shape[2], 1))) 61 | 62 | model.add(Convolution2D(int(hidden_units / 8), (5, 1), kernel_initializer='he_normal', padding='valid', 63 | activation='relu')) 64 | model.add(MaxPooling2D((2, 1))) 65 | model.add(Dropout(0.5)) 66 | 67 | model.add(Convolution2D(int(hidden_units / 4), (3, 1), kernel_initializer='he_normal', padding='valid', 68 | activation='relu')) 69 | model.add(MaxPooling2D((2, 1))) 70 | model.add(Dropout(0.5)) 71 | 72 | model.add(Dense(int(hidden_units / 2), kernel_initializer='he_normal', activation='relu')) 73 | model.add(Dropout(0.5)) 74 | 75 | model.add(Dense(2, 
activation='softmax')) 76 | 77 | adam = Adam(lr=0.001) 78 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) 79 | print('No of parameter:', model.count_params()) 80 | 81 | print(model.summary()) 82 | 83 | return model 84 | 85 | 86 | class train_model(sarcasm_model): 87 | train = None 88 | validation = None 89 | print("Loading resource...") 90 | 91 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 92 | vocab_file, 93 | output_file, 94 | word2vec_path=None, test_file=None, input_weight_file_path=None): 95 | 96 | sarcasm_model.__init__(self) 97 | 98 | self._train_file = train_file 99 | self._validation_file = validation_file 100 | self._word_file_path = word_file_path 101 | self._split_word_file_path = split_word_path 102 | self._emoji_file_path = emoji_file_path 103 | self._model_file = model_file 104 | self._vocab_file_path = vocab_file 105 | self._output_file = output_file 106 | self._input_weight_file_path = input_weight_file_path 107 | 108 | self.load_train_validation_test_data() 109 | 110 | print(self._line_maxlen) 111 | 112 | # build vocabulary 113 | if (self._test_file != None): 114 | self._vocab = dh.build_vocab(self.train + self.validation + self.test) 115 | else: 116 | self._vocab = dh.build_vocab(self.train + self.validation) 117 | 118 | self._vocab['unk'] = len(self._vocab.keys()) + 1 119 | 120 | print(len(self._vocab.keys()) + 1) 121 | print('unk::', self._vocab['unk']) 122 | 123 | dh.write_vocab(self._vocab_file_path, self._vocab) 124 | 125 | # prepares input 126 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab) 127 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 128 | 129 | # prepares input 130 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab) 131 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 132 | 133 | # embedding dimension 134 | dimension_size = 100 135 | W = [] 136 | 137 | W = dh.get_word2vec_weight(self._vocab, n=200, 138 | path=word2vec_path) 139 | 140 | # solving class imbalance 141 | ratio = self.calculate_label_ratio(Y) 142 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 143 | print('class ratio::', ratio) 144 | 145 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 146 | 147 | print('train_X', X.shape) 148 | print('train_Y', Y.shape) 149 | print('validation_X', tX.shape) 150 | print('validation_Y', tY.shape) 151 | 152 | # trainable true if you want word2vec weights to be updated 153 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, trainable=False) 154 | 155 | open(self._model_file + 'model.json', 'w').write(model.to_json()) 156 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True) 157 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', 158 | save_best_only=False) 159 | early_stopping = EarlyStopping(monitor='loss', patience=20, verbose=1) 160 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto', 161 | epsilon=0.0001, 162 | cooldown=0, min_lr=0.000001) 163 | 164 | # training 165 | model.fit(X, Y, batch_size=128, epochs=100, validation_data=(tX, tY), shuffle=True, 166 | callbacks=[save_best], class_weight=ratio) 167 | 168 | def load_train_validation_test_data(self): 169 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 170 | self._emoji_file_path, normalize_text=True, 171 | split_hashtag=True, 
172 | ignore_profiles=False, replace_emoji=False) 173 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 174 | self._emoji_file_path, normalize_text=True, 175 | split_hashtag=True, 176 | ignore_profiles=False, replace_emoji=False) 177 | if (self._test_file != None): 178 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True, 179 | split_hashtag=True, 180 | ignore_profiles=True) 181 | 182 | def get_maxlen(self): 183 | return max(map(len, (x for _, x in self.train + self.validation))) 184 | 185 | def write_vocab(self): 186 | with open(self._vocab_file_path, 'w') as fw: 187 | for key, value in self._vocab.iteritems(): 188 | fw.write(str(key) + '\t' + str(value) + '\n') 189 | 190 | def calculate_label_ratio(self, labels): 191 | return collections.Counter(labels) 192 | 193 | 194 | class test_model(sarcasm_model): 195 | test = None 196 | model = None 197 | 198 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file, 199 | input_weight_file_path=None): 200 | print('initializing...') 201 | sarcasm_model.__init__(self) 202 | 203 | self._model_file_path = model_file 204 | self._word_file_path = word_file_path 205 | self._split_word_file_path = split_word_path 206 | self._emoji_file_path = emoji_file_path 207 | self._vocab_file_path = vocab_file_path 208 | self._output_file = output_file 209 | self._input_weight_file_path = input_weight_file_path 210 | 211 | print('test_maxlen', self._line_maxlen) 212 | 213 | def load_trained_model(self, weight_file='model.json.hdf5'): 214 | start = time.time() 215 | self.__load_model(self._model_file_path + 'model.json', self._model_file_path + weight_file) 216 | end = time.time() 217 | print('model loading time::', (end - start)) 218 | 219 | def __load_model(self, model_path, model_weight_path): 220 | self.model = model_from_json(open(model_path).read()) 221 | print('model loaded from file...') 222 | self.model.load_weights(model_weight_path) 223 | print('model weights loaded from file...') 224 | 225 | def load_vocab(self): 226 | vocab = defaultdict() 227 | with open(self._vocab_file_path, 'r') as f: 228 | for line in f.readlines(): 229 | key, value = line.split('\t') 230 | vocab[key] = value 231 | 232 | return vocab 233 | 234 | def predict(self, test_file, verbose=False): 235 | try: 236 | start = time.time() 237 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path, 238 | normalize_text=True, split_hashtag=True, 239 | ignore_profiles=True) 240 | end = time.time() 241 | if (verbose == True): 242 | print('test resource loading time::', (end - start)) 243 | 244 | self._vocab = self.load_vocab() 245 | 246 | start = time.time() 247 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 248 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 249 | end = time.time() 250 | if (verbose == True): 251 | print('test resource preparation time::', (end - start)) 252 | 253 | self.__predict_model(tX, self.test) 254 | except Exception as e: 255 | print('Error:', e) 256 | 257 | def __predict_model(self, tX, test): 258 | y = [] 259 | y_pred = [] 260 | 261 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1) 262 | 263 | try: 264 | fd = open(self._output_file + '.analysis', 'w') 265 | for i, (label) in enumerate(prediction_probability): 266 | id = test[i][0] 267 | gold_label = test[i][1] 268 | words = test[i][2] 269 | dimensions = test[i][3] 
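                # test[i][4] (context) and test[i][5] (author) are read below but
                # not used by this text-only model; they are kept so every model
                # script consumes the same parsedata record layout.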
270 | context = test[i][4] 271 | author = test[i][5] 272 | 273 | predicted = numpy.argmax(prediction_probability[i]) 274 | 275 | y.append(int(gold_label)) 276 | y_pred.append(predicted) 277 | 278 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t' 279 | + str(gold_label) + '\t' 280 | + str(predicted) + '\t' 281 | + ' '.join(words)) 282 | 283 | fd.write('\n') 284 | 285 | print() 286 | 287 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 288 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 289 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 290 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 291 | print('f_score::', metrics.classification_report(y, y_pred)) 292 | fd.close() 293 | except Exception as e: 294 | print(e) 295 | 296 | 297 | if __name__ == "__main__": 298 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 299 | train_file = basepath + '/resource/train/Train_v1.txt' 300 | validation_file = basepath + '/resource/dev/Dev_v1.txt' 301 | test_file = basepath + '/resource/test/Test_v1.txt' 302 | word_file_path = basepath + '/resource/word_list_freq.txt' 303 | split_word_path = basepath + '/resource/word_split.txt' 304 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 305 | 306 | output_file = basepath + '/resource/text_model_2D/TestResults.txt' 307 | model_file = basepath + '/resource/text_model_2D/weights/' 308 | vocab_file_path = basepath + '/resource/text_model_2D/vocab_list.txt' 309 | 310 | # word2vec path 311 | word2vec_path = '/home/ubuntu/word2vec/GoogleNews-vectors-negative300.bin' 312 | glove_path = '/home/striker/word2vec/glove_model_200.txt.bin' 313 | 314 | # test file is passed to build the vocabulary 315 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 316 | # vocab_file_path, output_file, 317 | # word2vec_path=glove_path, test_file=test_file) 318 | # 319 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file) 320 | t.load_trained_model() 321 | t.predict(test_file) 322 | -------------------------------------------------------------------------------- /src/sarcasm_detection_model_CNN_LSTM_ATTN.py: -------------------------------------------------------------------------------- 1 | # for smaller datasets please use the simpler model sarcasm_detection_model_CNN_LSTM_DNN_simpler.py 2 | 3 | import os 4 | import sys 5 | 6 | from src.data_processing.data_handler import load_glove_model, build_auxiliary_feature 7 | 8 | sys.path.append('../') 9 | 10 | import collections 11 | import time 12 | import numpy 13 | 14 | from keras import backend as K 15 | 16 | from keras import backend as K, regularizers 17 | from sklearn import metrics 18 | from keras.models import model_from_json, load_model 19 | from keras.layers.core import Dropout, Dense, Activation, Flatten 20 | from keras.layers.embeddings import Embedding 21 | from keras.layers.recurrent import LSTM 22 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 23 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 24 | 25 | from keras.layers.merge import concatenate, multiply 26 | from keras.models import Model 27 | from keras.utils import np_utils 28 | from keras.layers import Input, Reshape, Permute, RepeatVector, Lambda, merge 29 | import src.data_processing.data_handler as dh 30 | from collections import defaultdict 31 | 32 | 33 | class sarcasm_model(): 34 | _train_file 
= None 35 | _test_file = None 36 | _tweet_file = None 37 | _output_file = None 38 | _model_file_path = None 39 | _word_file_path = None 40 | _split_word_file_path = None 41 | _emoji_file_path = None 42 | _vocab_file_path = None 43 | _input_weight_file_path = None 44 | _vocab = None 45 | _line_maxlen = None 46 | 47 | def __init__(self): 48 | self._line_maxlen = 30 49 | 50 | def attention_3d_block(self, inputs, SINGLE_ATTENTION_VECTOR=False): 51 | # inputs.shape = (batch_size, time_steps, input_dim) 52 | input_dim = int(inputs.shape[2]) 53 | a = Permute((2, 1))(inputs) 54 | a = Reshape((input_dim, self._line_maxlen))(a) 55 | # this line is not useful. It's just to know which dimension is what. 56 | a = Dense(self._line_maxlen, activation='softmax')(a) 57 | if SINGLE_ATTENTION_VECTOR: 58 | a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a) 59 | a = RepeatVector(input_dim)(a) 60 | a_probs = Permute((2, 1), name='attention_vec')(a) 61 | output_attention_mul = multiply([inputs, a_probs], name='attention_mul') 62 | return output_attention_mul 63 | 64 | def _build_network(self, vocab_size, maxlen, emb_weights=[], embedding_dimension=50, hidden_units=256, 65 | batch_size=1): 66 | print('Build model...') 67 | 68 | text_input = Input(name='text', shape=(maxlen,)) 69 | 70 | if (len(emb_weights) == 0): 71 | emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen, 72 | embeddings_initializer='glorot_normal', 73 | trainable=True)(text_input) 74 | else: 75 | emb = Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights], 76 | trainable=False)(text_input) 77 | emb_dropout = Dropout(0.5)(emb) 78 | 79 | lstm_bwd = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.4, 80 | go_backwards=True, return_sequences=True)(emb_dropout) 81 | lstm_fwd = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.4, 82 | return_sequences=True)(emb_dropout) 83 | 84 | lstm_merged = concatenate([lstm_bwd, lstm_fwd]) 85 | 86 | attention_mul = self.attention_3d_block(lstm_merged) 87 | 88 | flat_attention = Flatten()(attention_mul) 89 | 90 | aux_input = Input(name='aux', shape=(5,)) 91 | 92 | merged_aux = concatenate([flat_attention, aux_input], axis=1) 93 | 94 | 95 | reshaped = Reshape((-1, 1))(merged_aux) 96 | 97 | print(reshaped.shape) 98 | 99 | cnn1 = Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='relu')( 100 | reshaped) 101 | pool1 = MaxPooling1D(pool_size=3)(cnn1) 102 | print(pool1.shape) 103 | 104 | cnn2 = Convolution1D(2 * hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='relu')( 105 | pool1) 106 | pool2 = MaxPooling1D(pool_size=3)(cnn2) 107 | print(pool2.shape) 108 | 109 | flat_cnn = Flatten()(pool2) 110 | 111 | dnn_1 = Dense(hidden_units)(flat_cnn) 112 | dropout_1 = Dropout(0.25)(dnn_1) 113 | dnn_2 = Dense(2)(dropout_1) 114 | print(dnn_2.shape) 115 | 116 | softmax = Activation('softmax')(dnn_2) 117 | 118 | model = Model(inputs=[text_input, aux_input], outputs=softmax) 119 | 120 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 121 | print('No of parameter:', model.count_params()) 122 | 123 | print(model.summary()) 124 | 125 | return model 126 | 127 | 128 | class train_model(sarcasm_model): 129 | train = None 130 | validation = None 131 | print("Loading resource...") 132 | 133 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 134 | 
vocab_file, 135 | output_file, 136 | input_weight_file_path=None): 137 | sarcasm_model.__init__(self) 138 | 139 | self._train_file = train_file 140 | self._validation_file = validation_file 141 | self._word_file_path = word_file_path 142 | self._split_word_file_path = split_word_path 143 | self._emoji_file_path = emoji_file_path 144 | self._model_file = model_file 145 | self._vocab_file_path = vocab_file 146 | self._output_file = output_file 147 | self._input_weight_file_path = input_weight_file_path 148 | 149 | self.load_train_validation_data() 150 | 151 | print(self._line_maxlen) 152 | batch_size = 32 153 | 154 | # build vocabulary 155 | # truncates words with min freq=1 156 | self._vocab = dh.build_vocab(self.train, min_freq=1) 157 | if ('unk' not in self._vocab): 158 | self._vocab['unk'] = len(self._vocab.keys()) + 1 159 | 160 | print(len(self._vocab.keys()) + 1) 161 | print('unk::', self._vocab['unk']) 162 | 163 | dh.write_vocab(self._vocab_file_path, self._vocab) 164 | 165 | self.train = self.train[:-(len(self.train) % batch_size)] 166 | self.validation = self.validation[:-(len(self.validation) % batch_size)] 167 | 168 | # prepares input 169 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab) 170 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 171 | 172 | # prepares input 173 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab) 174 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 175 | 176 | # embedding dimension 177 | dimension_size = 300 178 | emb_weights = load_glove_model(self._vocab, n=dimension_size, 179 | glove_path='/home/aghosh/backups/glove.6B.300d.txt') 180 | 181 | # aux inputs 182 | aux_train = build_auxiliary_feature(self.train) 183 | aux_validation = build_auxiliary_feature(self.validation) 184 | 185 | # solving class imbalance 186 | ratio = self.calculate_label_ratio(Y) 187 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 188 | print('class ratio::', ratio) 189 | 190 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 191 | 192 | print('train_X', X.shape) 193 | print('train_Y', Y.shape) 194 | print('validation_X', tX.shape) 195 | print('validation_Y', tY.shape) 196 | 197 | # trainable true if you want word2vec weights to be updated 198 | # Not applicable in this code 199 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights, hidden_units=32, 200 | embedding_dimension=dimension_size, batch_size=batch_size) 201 | 202 | # open(self._model_file + 'model.json', 'w').write(model.to_json()) 203 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True) 204 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', 205 | save_best_only=False) 206 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1) 207 | 208 | # training 209 | model.fit([X, aux_train], Y, batch_size=batch_size, epochs=10, validation_data=([tX, aux_validation], tY), 210 | shuffle=True, 211 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio) 212 | 213 | def load_train_validation_data(self): 214 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 215 | self._emoji_file_path, normalize_text=True, 216 | split_hashtag=True, 217 | ignore_profiles=False) 218 | print('Training data loading finished...') 219 | 220 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 221 | self._emoji_file_path, 222 | normalize_text=True, 
223 | split_hashtag=True, 224 | ignore_profiles=False) 225 | print('Validation data loading finished...') 226 | 227 | if (self._test_file != None): 228 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True, 229 | split_hashtag=True, 230 | ignore_profiles=True) 231 | 232 | def get_maxlen(self): 233 | return max(map(len, (x for _, x in self.train + self.validation))) 234 | 235 | def write_vocab(self): 236 | with open(self._vocab_file_path, 'w') as fw: 237 | for key, value in self._vocab.iteritems(): 238 | fw.write(str(key) + '\t' + str(value) + '\n') 239 | 240 | def calculate_label_ratio(self, labels): 241 | return collections.Counter(labels) 242 | 243 | 244 | class test_model(sarcasm_model): 245 | test = None 246 | model = None 247 | 248 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file, 249 | input_weight_file_path=None): 250 | print('initializing...') 251 | sarcasm_model.__init__(self) 252 | 253 | self._model_file_path = model_file 254 | self._word_file_path = word_file_path 255 | self._split_word_file_path = split_word_path 256 | self._emoji_file_path = emoji_file_path 257 | self._vocab_file_path = vocab_file_path 258 | self._output_file = output_file 259 | self._input_weight_file_path = input_weight_file_path 260 | 261 | print('test_maxlen', self._line_maxlen) 262 | 263 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'): 264 | start = time.time() 265 | self.__load_model(self._model_file_path + weight_file) 266 | end = time.time() 267 | print('model loading time::', (end - start)) 268 | 269 | def __load_model(self, model_path): 270 | self.model = load_model(model_path) 271 | print('model loaded from file...') 272 | # self.model.load_weights(model_weight_path) 273 | # print('model weights loaded from file...') 274 | 275 | def load_vocab(self): 276 | vocab = defaultdict() 277 | with open(self._vocab_file_path, 'r') as f: 278 | for line in f.readlines(): 279 | key, value = line.split('\t') 280 | vocab[key] = value 281 | 282 | return vocab 283 | 284 | def predict(self, test_file, verbose=False): 285 | try: 286 | start = time.time() 287 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path, 288 | normalize_text=True, split_hashtag=True, 289 | ignore_profiles=False) 290 | end = time.time() 291 | if (verbose == True): 292 | print('test resource loading time::', (end - start)) 293 | 294 | self._vocab = self.load_vocab() 295 | print('vocab loaded...') 296 | 297 | start = time.time() 298 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 299 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 300 | 301 | aux_test = build_auxiliary_feature(self.test) 302 | 303 | end = time.time() 304 | if (verbose == True): 305 | print('test resource preparation time::', (end - start)) 306 | 307 | self.__predict_model([tX, aux_test], self.test) 308 | except Exception as e: 309 | print('Error:', e) 310 | raise 311 | 312 | def __predict_model(self, tX, test): 313 | y = [] 314 | y_pred = [] 315 | 316 | # tX = tX[:-len(tX) % 32] 317 | # test = test[:-len(test) % 32] 318 | 319 | prediction_probability = self.model.predict_file(tX, batch_size=1, verbose=1) 320 | 321 | try: 322 | fd = open(self._output_file + '.analysis', 'w') 323 | for i, (label) in enumerate(prediction_probability): 324 | gold_label = test[i][1] 325 | words = test[i][2] 326 | dimensions = test[i][3] 327 | context = test[i][4] 328 | author = 
test[i][5] 329 | 330 | predicted = numpy.argmax(prediction_probability[i]) 331 | 332 | y.append(int(gold_label)) 333 | y_pred.append(predicted) 334 | 335 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t' 336 | + str(gold_label) + '\t' 337 | + str(predicted) + '\t' 338 | + ' '.join(words)) 339 | 340 | fd.write('\n') 341 | 342 | print() 343 | 344 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 345 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 346 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 347 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 348 | print('f_score::', metrics.classification_report(y, y_pred)) 349 | fd.close() 350 | except Exception as e: 351 | print(e) 352 | raise 353 | 354 | 355 | if __name__ == "__main__": 356 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 357 | train_file = basepath + '/resource/train/Train_v1.txt' 358 | validation_file = basepath + '/resource/dev/Dev_v1.txt' 359 | test_file = basepath + '/resource/test/Test_v1.txt' 360 | word_file_path = basepath + '/resource/word_list_freq.txt' 361 | split_word_path = basepath + '/resource/word_split.txt' 362 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 363 | 364 | output_file = basepath + '/resource/text_model/TestResults.txt' 365 | model_file = basepath + '/resource/text_model/weights/' 366 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt' 367 | 368 | # uncomment for training 369 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 370 | vocab_file_path, output_file) 371 | 372 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file) 373 | t.load_trained_model(weight_file='model.json.hdf5') 374 | t.predict(test_file) 375 | -------------------------------------------------------------------------------- /src/sarcasm_detection_model_CNN_LSTM_DNN.py: -------------------------------------------------------------------------------- 1 | # for smaller datasets please use the simpler model sarcasm_detection_model_CNN_LSTM_DNN_simpler.py 2 | 3 | import os 4 | import sys 5 | 6 | sys.path.append('../') 7 | 8 | import collections 9 | import time 10 | import numpy 11 | 12 | numpy.random.seed(1337) 13 | from sklearn import metrics 14 | from keras.models import Model 15 | from keras.layers import Input 16 | from keras.models import Sequential, model_from_json 17 | from keras.layers.core import Dropout, Dense, Activation 18 | from keras.layers.embeddings import Embedding 19 | from keras.layers.recurrent import LSTM 20 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 21 | from keras.callbacks import ModelCheckpoint 22 | from keras.callbacks import EarlyStopping 23 | from keras.optimizers import Adam 24 | from keras.utils import np_utils 25 | from collections import defaultdict 26 | import src.data_processing.data_handler as dh 27 | 28 | 29 | class sarcasm_model(): 30 | _train_file = None 31 | _test_file = None 32 | _tweet_file = None 33 | _output_file = None 34 | _model_file_path = None 35 | _word_file_path = None 36 | _split_word_file_path = None 37 | _emoji_file_path = None 38 | _vocab_file_path = None 39 | _input_weight_file_path = None 40 | _vocab = None 41 | _line_maxlen = None 42 | 43 | def __init__(self): 44 | self._line_maxlen = 30 45 | 46 | def _build_network(self, vocab_size, maxlen, emb_weights=[], embedding_dimension=256, hidden_units=256): 47 | 
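        """Build the CNN-LSTM-DNN classifier.

        Embedding (frozen pretrained weights when emb_weights is supplied,
        otherwise a trainable glorot-normal embedding) -> two Conv1D layers ->
        two stacked LSTMs -> Dense sigmoid layer -> 2-unit softmax output,
        compiled with categorical_crossentropy and the Adam optimizer.
        """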
print('Build model...') 48 | 49 | text_input = Input(name='text', shape=(maxlen,)) 50 | 51 | if (len(emb_weights) == 0): 52 | emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen, 53 | embeddings_initializer='glorot_normal', 54 | trainable=True)(text_input) 55 | else: 56 | emb = Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights], 57 | trainable=False)(text_input) 58 | 59 | cnn1 = Convolution1D(int(hidden_units / 4), 3, kernel_initializer='he_normal', activation='sigmoid', 60 | padding='valid', input_shape=(1, maxlen))(emb) 61 | 62 | cnn2 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal', activation='sigmoid', 63 | padding='valid', input_shape=(1, maxlen - 1))(cnn1) 64 | 65 | lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', 66 | dropout=0.25, return_sequences=True)(cnn2) 67 | 68 | lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', 69 | dropout=0.25)(lstm1) 70 | 71 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(lstm2) 72 | dnn_2 = Dense(2, activation='softmax')(dnn_1) 73 | 74 | model = Model(inputs=[text_input], outputs=dnn_2) 75 | 76 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 77 | print('No of parameter:', model.count_params()) 78 | 79 | print(model.summary()) 80 | return model 81 | 82 | 83 | class train_model(sarcasm_model): 84 | train = None 85 | validation = None 86 | print("Loading resource...") 87 | 88 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 89 | vocab_file, 90 | output_file, 91 | word2vec_path=None): 92 | sarcasm_model.__init__(self) 93 | 94 | self._train_file = train_file 95 | self._validation_file = validation_file 96 | self._word_file_path = word_file_path 97 | self._split_word_file_path = split_word_path 98 | self._emoji_file_path = emoji_file_path 99 | self._model_file = model_file 100 | self._vocab_file_path = vocab_file 101 | self._output_file = output_file 102 | 103 | self.load_train_validation_data() 104 | 105 | print(self._line_maxlen) 106 | 107 | # build vocabulary 108 | # truncates words with min freq=1 109 | self._vocab = dh.build_vocab(self.train, min_freq=1) 110 | if ('unk' not in self._vocab): 111 | self._vocab['unk'] = len(self._vocab.keys()) + 1 112 | 113 | print(len(self._vocab.keys()) + 1) 114 | print('unk::', self._vocab['unk']) 115 | 116 | dh.write_vocab(self._vocab_file_path, self._vocab) 117 | 118 | # prepares input 119 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab) 120 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 121 | 122 | # prepares input 123 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab) 124 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 125 | 126 | # embedding dimension 127 | dimension_size = 300 128 | 129 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size, 130 | path=word2vec_path) 131 | 132 | # solving class imbalance 133 | ratio = self.calculate_label_ratio(Y) 134 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 135 | print('class ratio::', ratio) 136 | 137 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 138 | 139 | print('train_X', X.shape) 140 | print('train_Y', Y.shape) 141 | print('validation_X', tX.shape) 142 | print('validation_Y', tY.shape) 143 | 144 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=256, 
emb_weights=W) 145 | 146 | open(self._model_file + 'model.json', 'w').write(model.to_json()) 147 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True) 148 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}.hdf5', 149 | save_best_only=False) 150 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1) 151 | 152 | # training 153 | model.fit(X, Y, batch_size=64, epochs=100, validation_data=(tX, tY), shuffle=True, 154 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio, verbose=2) 155 | 156 | def load_train_validation_data(self): 157 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 158 | self._emoji_file_path, normalize_text=True, 159 | split_hashtag=True, 160 | ignore_profiles=False) 161 | print('Training data loading finished...') 162 | 163 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 164 | self._emoji_file_path, 165 | normalize_text=True, 166 | split_hashtag=True, 167 | ignore_profiles=False) 168 | print('Validation data loading finished...') 169 | 170 | if (self._test_file != None): 171 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True, 172 | split_hashtag=True, 173 | ignore_profiles=True) 174 | 175 | def get_maxlen(self): 176 | return max(map(len, (x for _, x in self.train + self.validation))) 177 | 178 | def write_vocab(self): 179 | with open(self._vocab_file_path, 'w') as fw: 180 | for key, value in self._vocab.iteritems(): 181 | fw.write(str(key) + '\t' + str(value) + '\n') 182 | 183 | def calculate_label_ratio(self, labels): 184 | return collections.Counter(labels) 185 | 186 | 187 | class test_model(sarcasm_model): 188 | test = None 189 | model = None 190 | 191 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file, 192 | input_weight_file_path=None): 193 | print('initializing...') 194 | sarcasm_model.__init__(self) 195 | 196 | self._model_file_path = model_file 197 | self._word_file_path = word_file_path 198 | self._split_word_file_path = split_word_path 199 | self._emoji_file_path = emoji_file_path 200 | self._vocab_file_path = vocab_file_path 201 | self._output_file = output_file 202 | self._input_weight_file_path = input_weight_file_path 203 | 204 | print('test_maxlen', self._line_maxlen) 205 | 206 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'): 207 | start = time.time() 208 | self.__load_model(self._model_file_path + model_file, self._model_file_path + weight_file) 209 | end = time.time() 210 | print('model loading time::', (end - start)) 211 | 212 | def __load_model(self, model_path, model_weight_path): 213 | self.model = model_from_json(open(model_path).read()) 214 | print('model loaded from file...') 215 | self.model.load_weights(model_weight_path) 216 | print('model weights loaded from file...') 217 | 218 | def load_vocab(self): 219 | vocab = defaultdict() 220 | with open(self._vocab_file_path, 'r') as f: 221 | for line in f.readlines(): 222 | key, value = line.split('\t') 223 | vocab[key] = value 224 | 225 | return vocab 226 | 227 | def predict(self, test_file, verbose=False): 228 | try: 229 | start = time.time() 230 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path, 231 | normalize_text=True, split_hashtag=True, 232 | ignore_profiles=False) 233 | end = time.time() 234 | if (verbose == True): 235 | 
print('test resource loading time::', (end - start)) 236 | 237 | self._vocab = self.load_vocab() 238 | print('vocab loaded...') 239 | 240 | start = time.time() 241 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 242 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 243 | end = time.time() 244 | if (verbose == True): 245 | print('test resource preparation time::', (end - start)) 246 | 247 | self.__predict_model(tX, self.test) 248 | except Exception as e: 249 | print('Error:', e) 250 | raise 251 | 252 | def __predict_model(self, tX, test): 253 | y = [] 254 | y_pred = [] 255 | 256 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1) 257 | 258 | try: 259 | fd = open(self._output_file + '.analysis', 'w') 260 | for i, (label) in enumerate(prediction_probability): 261 | gold_label = test[i][1] 262 | words = test[i][2] 263 | dimensions = test[i][3] 264 | context = test[i][4] 265 | author = test[i][5] 266 | 267 | predicted = numpy.argmax(prediction_probability[i]) 268 | 269 | y.append(int(gold_label)) 270 | y_pred.append(predicted) 271 | 272 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t' 273 | + str(gold_label) + '\t' 274 | + str(predicted) + '\t' 275 | + ' '.join(words)) 276 | 277 | fd.write('\n') 278 | 279 | print() 280 | 281 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 282 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 283 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 284 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 285 | print('f_score::', metrics.classification_report(y, y_pred)) 286 | fd.close() 287 | except Exception as e: 288 | print(e) 289 | raise 290 | 291 | 292 | if __name__ == "__main__": 293 | basepath = os.path.abspath(os.path.join(os.getcwd(), '..')) 294 | train_file = basepath + '/resource/train/Train_v1.txt' 295 | validation_file = basepath + '/resource/dev/Dev_v1.txt' 296 | test_file = basepath + '/resource/test/Test_v1.txt' 297 | word_file_path = basepath + '/resource/word_list_freq.txt' 298 | split_word_path = basepath + '/resource/word_split.txt' 299 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 300 | 301 | output_file = basepath + '/resource/text_model/TestResults.txt' 302 | model_file = basepath + '/resource/text_model/weights/' 303 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt' 304 | 305 | word2vec_path = '/home/aghosh/backups/GoogleNews-vectors-negative300.bin' 306 | 307 | # uncomment for training 308 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 309 | # vocab_file_path, output_file) 310 | 311 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file) 312 | t.load_trained_model(weight_file='weights.05__.hdf5') 313 | t.predict(test_file) 314 | -------------------------------------------------------------------------------- /src/sarcasm_detection_model_CNN_LSTM_DNN_fasttext.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling1D 4 | 5 | sys.path.append('../') 6 | 7 | import collections 8 | import time 9 | import numpy 10 | 11 | numpy.random.seed(1337) 12 | from sklearn import metrics 13 | from keras.models import Sequential, model_from_json 14 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten 15 
| from keras.layers.embeddings import Embedding 16 | from keras.layers.recurrent import LSTM 17 | from keras.layers.convolutional import Convolution1D, MaxPooling1D, Convolution2D 18 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau 19 | from keras.callbacks import EarlyStopping 20 | from keras.optimizers import Adam 21 | from keras.utils import np_utils 22 | from collections import defaultdict 23 | import src.data_processing.data_handler as dh 24 | 25 | 26 | class sarcasm_model(): 27 | _train_file = None 28 | _test_file = None 29 | _tweet_file = None 30 | _output_file = None 31 | _model_file = None 32 | _word_file_path = None 33 | _split_word_file_path = None 34 | _emoji_file_path = None 35 | _vocab_file_path = None 36 | _input_weight_file_path = None 37 | _vocab = None 38 | _line_maxlen = None 39 | 40 | def __init__(self): 41 | self._line_maxlen = 50 42 | 43 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False): 44 | print('Build model...') 45 | model = Sequential() 46 | 47 | model.add( 48 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal')) 49 | 50 | model.add( 51 | Convolution1D(hidden_units, 2, kernel_initializer='he_normal', padding='valid', 52 | activation='sigmoid')) 53 | model.add(MaxPooling1D(pool_size=2)) 54 | model.add(Dropout(0.25)) 55 | 56 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5, 57 | recurrent_activation=0.5, unroll=True, return_sequences=True)) 58 | 59 | model.add(GlobalAveragePooling1D()) 60 | model.add(Dropout(0.5)) 61 | 62 | model.add(Dense(2)) 63 | model.add(Activation('softmax')) 64 | adam = Adam(lr=0.001) 65 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) 66 | print('No of parameter:', model.count_params()) 67 | 68 | print(model.summary()) 69 | return model 70 | 71 | 72 | class train_model(sarcasm_model): 73 | train = None 74 | validation = None 75 | print("Loading resource...") 76 | 77 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 78 | vocab_file, 79 | output_file, 80 | word2vec_path=None): 81 | sarcasm_model.__init__(self) 82 | 83 | self._train_file = train_file 84 | self._validation_file = validation_file 85 | self._word_file_path = word_file_path 86 | self._split_word_file_path = split_word_path 87 | self._emoji_file_path = emoji_file_path 88 | self._model_file = model_file 89 | self._vocab_file_path = vocab_file 90 | self._output_file = output_file 91 | self._input_weight_file_path = input_weight_file_path 92 | 93 | self.load_train_validation_data() 94 | 95 | print(self._line_maxlen) 96 | 97 | # build vocabulary 98 | # truncates words with min freq=10 99 | self._vocab = dh.build_vocab(self.train, min_freq=2) 100 | if ('unk' not in self._vocab): 101 | self._vocab['unk'] = len(self._vocab.keys()) + 1 102 | 103 | print(len(self._vocab.keys()) + 1) 104 | print('unk::', self._vocab['unk']) 105 | 106 | dh.write_vocab(self._vocab_file_path, self._vocab) 107 | 108 | # prepares input 109 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab) 110 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 111 | 112 | # prepares input 113 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab) 114 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 115 | 116 | # embedding dimension 117 | dimension_size = 30 118 | 119 | W = 
dh.get_fasttext_weight(self._vocab, n=dimension_size, 120 | path=word2vec_path) 121 | 122 | # solving class imbalance 123 | ratio = self.calculate_label_ratio(Y) 124 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 125 | print('class ratio::', ratio) 126 | 127 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 128 | 129 | print('train_X', X.shape) 130 | print('train_Y', Y.shape) 131 | print('validation_X', tX.shape) 132 | print('validation_Y', tY.shape) 133 | 134 | # trainable true if you want word2vec weights to be updated 135 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=128, 136 | embedding_dimension=dimension_size, 137 | trainable=True) 138 | 139 | open(self._model_file + 'model.json', 'w').write(model.to_json()) 140 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True) 141 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', 142 | save_best_only=False) 143 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1) 144 | lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', 145 | epsilon=0.0001, 146 | cooldown=0, min_lr=0.000001) 147 | 148 | # training 149 | # model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True, 150 | # callbacks=[save_best, save_all, early_stopping], class_weight=ratio) 151 | model.fit(X, Y, batch_size=32, epochs=100, validation_split=0.1, shuffle=True, 152 | callbacks=[save_best, lr_tuner, early_stopping], class_weight=ratio) 153 | 154 | def load_train_validation_data(self): 155 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 156 | self._emoji_file_path, normalize_text=True, 157 | split_hashtag=True, 158 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True) 159 | print('Training data loading finished...') 160 | 161 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 162 | self._emoji_file_path, 163 | normalize_text=True, 164 | split_hashtag=False, 165 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True) 166 | print('Validation data loading finished...') 167 | 168 | def get_maxlen(self): 169 | return max(map(len, (x for _, x in self.train + self.validation))) 170 | 171 | def write_vocab(self): 172 | with open(self._vocab_file_path, 'w') as fw: 173 | for key, value in self._vocab.iteritems(): 174 | fw.write(str(key) + '\t' + str(value) + '\n') 175 | 176 | def calculate_label_ratio(self, labels): 177 | return collections.Counter(labels) 178 | 179 | 180 | class test_model(sarcasm_model): 181 | test = None 182 | model = None 183 | 184 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file, 185 | input_weight_file_path=None): 186 | print('initializing...') 187 | sarcasm_model.__init__(self) 188 | 189 | self._model_file = model_file 190 | self._word_file_path = word_file_path 191 | self._split_word_file_path = split_word_path 192 | self._emoji_file_path = emoji_file_path 193 | self._vocab_file_path = vocab_file_path 194 | self._output_file = output_file 195 | self._input_weight_file_path = input_weight_file_path 196 | 197 | print('test_maxlen', self._line_maxlen) 198 | 199 | def load_trained_model(self, weight_file='model.json.hdf5'): 200 | start = time.time() 201 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file) 202 | end = time.time() 
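        # Descriptive note: train_model above saves the architecture to '<model_file>model.json'
        # and the best weights (ModelCheckpoint with save_best_only=True) to
        # '<model_file>model.json.hdf5'; __load_model below rebuilds the network with
        # model_from_json() and then restores the weights with load_weights().
        # Illustrative usage, mirroring the commented-out lines in __main__ at the bottom
        # of this file:
        #   t = test_model(model_file, word_file_path, split_word_path, emoji_file_path,
        #                  vocab_file_path, output_file)
        #   t.load_trained_model()            # weight_file defaults to 'model.json.hdf5'
        #   t.predict(test_file)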
203 | print('model loading time::', (end - start)) 204 | 205 | def __load_model(self, model_path, model_weight_path): 206 | self.model = model_from_json(open(model_path).read()) 207 | print('model loaded from file...') 208 | self.model.load_weights(model_weight_path) 209 | print('model weights loaded from file...') 210 | 211 | def load_vocab(self): 212 | vocab = defaultdict() 213 | with open(self._vocab_file_path, 'r') as f: 214 | for line in f.readlines(): 215 | key, value = line.split('\t') 216 | vocab[key] = value 217 | 218 | return vocab 219 | 220 | def predict(self, test_file, verbose=False): 221 | try: 222 | start = time.time() 223 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path, 224 | normalize_text=True, split_hashtag=True, 225 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True) 226 | end = time.time() 227 | if (verbose == True): 228 | print('test resource loading time::', (end - start)) 229 | 230 | self._vocab = self.load_vocab() 231 | print('vocab loaded...') 232 | 233 | start = time.time() 234 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 235 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 236 | end = time.time() 237 | if (verbose == True): 238 | print('test resource preparation time::', (end - start)) 239 | 240 | self.__predict_model(tX, self.test) 241 | except Exception as e: 242 | print('Error:', e) 243 | 244 | def __predict_model(self, tX, test): 245 | y = [] 246 | y_pred = [] 247 | 248 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1) 249 | 250 | try: 251 | fd = open(self._output_file + '.analysis', 'w') 252 | for i, (label) in enumerate(prediction_probability): 253 | id = test[i][0] 254 | gold_label = test[i][1] 255 | words = test[i][2] 256 | dimensions = test[i][3] 257 | context = test[i][4] 258 | author = test[i][5] 259 | 260 | predicted = numpy.argmax(prediction_probability[i]) 261 | 262 | y.append(int(gold_label)) 263 | y_pred.append(predicted) 264 | 265 | # fd.write(str(id) + '\t' + str(label[0]) + '\t' + str(label[1]) + '\t' 266 | # + str(gold_label) + '\t' 267 | # + str(predicted) + '\t' 268 | # + ' '.join(words)) 269 | fd.write(str(id) + ',' + ','.join([str(l) for l in label]) + '\n') 270 | 271 | print() 272 | 273 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 274 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 275 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 276 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 277 | print('f_score::', metrics.classification_report(y, y_pred)) 278 | fd.close() 279 | except Exception as e: 280 | print(e) 281 | 282 | 283 | if __name__ == "__main__": 284 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 285 | train_file = basepath + '/resource/train/spooky_train.tsv' 286 | validation_file = basepath + '/resource/dev/Dev_v1.txt' 287 | test_file = basepath + '/resource/test/spooky_test.tsv' 288 | word_file_path = basepath + '/resource/word_list_freq.txt' 289 | split_word_path = basepath + '/resource/word_split.txt' 290 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 291 | 292 | output_file = basepath + '/resource/text_model/TestResults.txt' 293 | model_file = basepath + '/resource/text_model/weights/' 294 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt' 295 | 296 | #fastext model path 297 | fasttext_path = '/home/fasttext/en.wiki.bin' 298 | 299 | # uncomment 
for training 300 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 301 | vocab_file_path, output_file, fasttext_path) 302 | 303 | # t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file) 304 | # t.load_trained_model() 305 | # t.predict(test_file) 306 | -------------------------------------------------------------------------------- /src/sarcasm_detection_model_CNN_LSTM_DNN_simpler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append('../') 5 | 6 | import collections 7 | import time 8 | import numpy 9 | 10 | numpy.random.seed(1337) 11 | from sklearn import metrics 12 | from keras.models import Sequential, model_from_json 13 | from keras.layers.core import Dropout, Dense, Activation 14 | from keras.layers.embeddings import Embedding 15 | from keras.layers.recurrent import LSTM 16 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 17 | from keras.callbacks import ModelCheckpoint 18 | from keras.callbacks import EarlyStopping 19 | from keras.optimizers import Adam 20 | from keras.utils import np_utils 21 | from collections import defaultdict 22 | import src.data_processing.data_handler as dh 23 | 24 | 25 | class sarcasm_model(): 26 | _train_file = None 27 | _test_file = None 28 | _tweet_file = None 29 | _output_file = None 30 | _model_file_path = None 31 | _word_file_path = None 32 | _split_word_file_path = None 33 | _emoji_file_path = None 34 | _vocab_file_path = None 35 | _input_weight_file_path = None 36 | _vocab = None 37 | _line_maxlen = None 38 | 39 | def __init__(self): 40 | self._line_maxlen = 30 41 | 42 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False): 43 | print('Build model...') 44 | model = Sequential() 45 | 46 | model.add( 47 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal')) 48 | 49 | model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='sigmoid', 50 | input_shape=(1, maxlen))) 51 | model.add(MaxPooling1D(pool_size=3)) 52 | model.add(Dropout(0.25)) 53 | 54 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5)) 55 | model.add(Dropout(0.25)) 56 | 57 | model.add(Dense(hidden_units, kernel_initializer='he_normal', activation='sigmoid')) 58 | model.add(Dropout(0.25)) 59 | 60 | model.add(Dense(2)) 61 | model.add(Activation('softmax')) 62 | adam = Adam(lr=0.0001) 63 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) 64 | print('No of parameter:', model.count_params()) 65 | 66 | print(model.summary()) 67 | return model 68 | 69 | 70 | class train_model(sarcasm_model): 71 | train = None 72 | validation = None 73 | print("Loading resource...") 74 | 75 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 76 | vocab_file, 77 | output_file, 78 | input_weight_file_path=None): 79 | sarcasm_model.__init__(self) 80 | 81 | self._train_file = train_file 82 | self._validation_file = validation_file 83 | self._word_file_path = word_file_path 84 | self._split_word_file_path = split_word_path 85 | self._emoji_file_path = emoji_file_path 86 | self._model_file = model_file 87 | self._vocab_file_path = vocab_file 88 | self._output_file = output_file 89 | self._input_weight_file_path = 
input_weight_file_path 90 | 91 | self.load_train_validation_data() 92 | 93 | print(self._line_maxlen) 94 | 95 | # build vocabulary 96 | # truncates words with min freq=1 97 | self._vocab = dh.build_vocab(self.train, min_freq=1) 98 | if ('unk' not in self._vocab): 99 | self._vocab['unk'] = len(self._vocab.keys()) + 1 100 | 101 | print(len(self._vocab.keys()) + 1) 102 | print('unk::', self._vocab['unk']) 103 | 104 | dh.write_vocab(self._vocab_file_path, self._vocab) 105 | 106 | # prepares input 107 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab) 108 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 109 | 110 | # prepares input 111 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab) 112 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 113 | 114 | # embedding dimension 115 | dimension_size = 256 116 | 117 | # solving class imbalance 118 | ratio = self.calculate_label_ratio(Y) 119 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 120 | print('class ratio::', ratio) 121 | 122 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 123 | 124 | print('train_X', X.shape) 125 | print('train_Y', Y.shape) 126 | print('validation_X', tX.shape) 127 | print('validation_Y', tY.shape) 128 | 129 | # trainable true if you want word2vec weights to be updated 130 | # Not applicable in this code 131 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, embedding_dimension=dimension_size, 132 | trainable=True) 133 | 134 | open(self._model_file + 'model.json', 'w').write(model.to_json()) 135 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True) 136 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', 137 | save_best_only=False) 138 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1) 139 | 140 | # training 141 | model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True, 142 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio) 143 | 144 | def load_train_validation_data(self): 145 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 146 | self._emoji_file_path, normalize_text=True, 147 | split_hashtag=True, 148 | ignore_profiles=False) 149 | print('Training data loading finished...') 150 | 151 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 152 | self._emoji_file_path, 153 | normalize_text=True, 154 | split_hashtag=True, 155 | ignore_profiles=False) 156 | print('Validation data loading finished...') 157 | 158 | def get_maxlen(self): 159 | return max(map(len, (x for _, x in self.train + self.validation))) 160 | 161 | def write_vocab(self): 162 | with open(self._vocab_file_path, 'w') as fw: 163 | for key, value in self._vocab.iteritems(): 164 | fw.write(str(key) + '\t' + str(value) + '\n') 165 | 166 | def calculate_label_ratio(self, labels): 167 | return collections.Counter(labels) 168 | 169 | 170 | class test_model(sarcasm_model): 171 | test = None 172 | model = None 173 | 174 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file, 175 | input_weight_file_path=None): 176 | print('initializing...') 177 | sarcasm_model.__init__(self) 178 | 179 | self._model_file_path = model_file 180 | self._word_file_path = word_file_path 181 | self._split_word_file_path = split_word_path 182 | self._emoji_file_path = emoji_file_path 183 | 
self._vocab_file_path = vocab_file_path 184 | self._output_file = output_file 185 | self._input_weight_file_path = input_weight_file_path 186 | 187 | print('test_maxlen', self._line_maxlen) 188 | 189 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'): 190 | start = time.time() 191 | self.__load_model(self._model_file_path + model_file, self._model_file_path + weight_file) 192 | end = time.time() 193 | print('model loading time::', (end - start)) 194 | 195 | def __load_model(self, model_path, model_weight_path): 196 | self.model = model_from_json(open(model_path).read()) 197 | print('model loaded from file...') 198 | self.model.load_weights(model_weight_path) 199 | print('model weights loaded from file...') 200 | 201 | def load_vocab(self): 202 | vocab = defaultdict() 203 | with open(self._vocab_file_path, 'r') as f: 204 | for line in f.readlines(): 205 | key, value = line.split('\t') 206 | vocab[key] = value 207 | 208 | return vocab 209 | 210 | def interactive(self, word_file_path, split_word_path, emoji_file_path): 211 | word_list, emoji_dict, split_word_list, abbreviation_dict = dh.load_resources(word_file_path, split_word_path, 212 | emoji_file_path, 213 | split_hashtag=True) 214 | self._vocab = self.load_vocab() 215 | text = '' 216 | while (text != 'exit'): 217 | text = input('Enter a query::') 218 | data = dh.parsedata(['{}\t{}\t{}'.format('id', -1, text)], word_list, split_word_list, emoji_dict, 219 | abbreviation_dict, normalize_text=True, 220 | split_hashtag=True, 221 | ignore_profiles=False) 222 | 223 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(data, self._vocab) 224 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 225 | print(self.__predict_line(tX)) 226 | 227 | def predict_file(self, test_file, verbose=False): 228 | try: 229 | start = time.time() 230 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path, 231 | normalize_text=True, split_hashtag=True, 232 | ignore_profiles=False) 233 | end = time.time() 234 | if (verbose == True): 235 | print('test resource loading time::', (end - start)) 236 | 237 | self._vocab = self.load_vocab() 238 | print('vocab loaded...') 239 | 240 | start = time.time() 241 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 242 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 243 | end = time.time() 244 | if (verbose == True): 245 | print('test resource preparation time::', (end - start)) 246 | 247 | self.__predict_model(tX, self.test) 248 | except Exception as e: 249 | print('Error:', e) 250 | raise 251 | 252 | def __predict_line(self, tX): 253 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1) 254 | predicted = numpy.argmax(prediction_probability[0]) 255 | return predicted, prediction_probability 256 | 257 | def __predict_model(self, tX, test): 258 | y = [] 259 | y_pred = [] 260 | 261 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1) 262 | 263 | try: 264 | fd = open(self._output_file + '.analysis', 'w') 265 | for i, (label) in enumerate(prediction_probability): 266 | gold_label = test[i][1] 267 | words = test[i][2] 268 | dimensions = test[i][3] 269 | context = test[i][4] 270 | author = test[i][5] 271 | 272 | predicted = numpy.argmax(prediction_probability[i]) 273 | 274 | y.append(int(gold_label)) 275 | y_pred.append(predicted) 276 | 277 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t' 278 | + str(gold_label) + '\t' 279 | + str(predicted) + '\t' 280 | + ' 
'.join(words)) 281 | 282 | fd.write('\n') 283 | 284 | print() 285 | 286 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 287 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 288 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 289 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 290 | print('f_score::', metrics.classification_report(y, y_pred)) 291 | fd.close() 292 | except Exception as e: 293 | print(e) 294 | raise 295 | 296 | 297 | if __name__ == "__main__": 298 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 299 | train_file = basepath + '/resource/train/Train_v1.txt' 300 | validation_file = basepath + '/resource/dev/Dev_v1.txt' 301 | test_file = basepath + '/resource/test/Test_v1.txt' 302 | word_file_path = basepath + '/resource/word_list_freq.txt' 303 | split_word_path = basepath + '/resource/word_split.txt' 304 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 305 | 306 | output_file = basepath + '/resource/text_model/TestResults.txt' 307 | model_file = basepath + '/resource/text_model/weights/' 308 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt' 309 | 310 | # uncomment for training 311 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 312 | # vocab_file_path, output_file) 313 | 314 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file) 315 | t.load_trained_model() 316 | # t.predict_file(test_file) 317 | t.interactive(word_file_path, split_word_path, emoji_file_path) 318 | -------------------------------------------------------------------------------- /src/sarcasm_detection_model_CNN_LSTM_DNN_word2vec.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append('../') 5 | import collections 6 | import time 7 | import numpy 8 | 9 | numpy.random.seed(1337) 10 | from sklearn import metrics 11 | from keras.models import Sequential, model_from_json 12 | from keras.layers.core import Dropout, Dense, Activation, Flatten, Reshape 13 | from keras.layers.embeddings import Embedding 14 | from keras.layers.recurrent import LSTM 15 | from keras.layers.convolutional import Convolution1D, MaxPooling1D 16 | from keras.callbacks import ModelCheckpoint 17 | from keras.callbacks import EarlyStopping 18 | from keras.optimizers import Adam 19 | from keras.utils import np_utils 20 | from collections import defaultdict 21 | import src.data_processing.data_handler as dh 22 | 23 | 24 | class sarcasm_model(): 25 | _train_file = None 26 | _test_file = None 27 | _tweet_file = None 28 | _output_file = None 29 | _model_file = None 30 | _word_file_path = None 31 | _vocab_file_path = None 32 | _vocab = None 33 | _line_maxlen = None 34 | 35 | def __init__(self): 36 | self._line_maxlen = 30 37 | 38 | def _build_network(self, vocab_size, maxlen, emb_weights=[], hidden_units=256, trainable=False): 39 | print('Build model...') 40 | model = Sequential() 41 | 42 | model.add(Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights], 43 | trainable=trainable)) 44 | 45 | # model.add(Reshape((maxlen, emb_weights.shape[1], 1))) 46 | 47 | model.add(Convolution1D(emb_weights.shape[1], 3, kernel_initializer='he_normal', padding='valid', 48 | activation='sigmoid', 49 | input_shape=(1, maxlen))) 50 | # model.add(MaxPooling1D(pool_size=3)) 51 | 52 | 
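        # Shape note: with 'valid' padding and kernel size 3, each Convolution1D shortens the
        # sequence by 2, so for maxlen=30 the embedding output (30, 300) becomes (28, 300)
        # after the first convolution and (26, 300) after the one added below; the
        # input_shape arguments on these non-first layers of a Sequential model are
        # effectively ignored, since shapes are inferred from the previous layer.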
model.add(Convolution1D(emb_weights.shape[1], 3, kernel_initializer='he_normal', padding='valid', 53 | activation='sigmoid', 54 | input_shape=(1, maxlen - 2))) 55 | # model.add(MaxPooling1D(pool_size=3)) 56 | 57 | model.add(Dropout(0.25)) 58 | 59 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5, 60 | return_sequences=True)) 61 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5)) 62 | 63 | model.add(Dense(hidden_units, kernel_initializer='he_normal', activation='sigmoid')) 64 | model.add(Dense(2, activation='softmax')) 65 | adam = Adam(lr=0.0001) 66 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) 67 | print('No of parameter:', model.count_params()) 68 | 69 | print(model.summary()) 70 | return model 71 | 72 | 73 | class train_model(sarcasm_model): 74 | train = None 75 | validation = None 76 | print("Loading resource...") 77 | 78 | def __init__(self, train_file, validation_file, word_file_path, model_file, vocab_file, output_file, 79 | word2vec_path=None, test_file=None): 80 | 81 | sarcasm_model.__init__(self) 82 | 83 | self._train_file = train_file 84 | self._validation_file = validation_file 85 | self._word_file_path = word_file_path 86 | self._model_file = model_file 87 | self._vocab_file_path = vocab_file 88 | self._output_file = output_file 89 | self._test_file = test_file 90 | 91 | self.load_train_validation_test_data() 92 | 93 | print(self._line_maxlen) 94 | 95 | # build vocabulary 96 | if (self._test_file != None): 97 | self._vocab = dh.build_vocab(self.train + self.validation + self.test, min_freq=2) 98 | else: 99 | self._vocab = dh.build_vocab(self.train + self.validation, min_freq=2) 100 | 101 | self._vocab['unk'] = len(self._vocab.keys()) + 1 102 | 103 | print(len(self._vocab.keys()) + 1) 104 | print('unk::', self._vocab['unk']) 105 | 106 | dh.write_vocab(self._vocab_file_path, self._vocab) 107 | 108 | # prepares input 109 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab) 110 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 111 | 112 | # prepares input 113 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab) 114 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 115 | 116 | # embedding dimension 117 | W = dh.get_word2vec_weight(self._vocab, n=300, 118 | path=word2vec_path) 119 | 120 | # solving class imbalance 121 | ratio = self.calculate_label_ratio(Y) 122 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 123 | print('class ratio::', ratio) 124 | 125 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 126 | 127 | print('train_X', X.shape) 128 | print('train_Y', Y.shape) 129 | print('validation_X', tX.shape) 130 | print('validation_Y', tY.shape) 131 | 132 | # trainable true if you want word2vec weights to be updated 133 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, trainable=False) 134 | 135 | open(self._model_file + 'model_wv.json', 'w').write(model.to_json()) 136 | save_best = ModelCheckpoint(model_file + 'model_wv.json.hdf5', save_best_only=True) 137 | # save_all = ModelCheckpoint(self._model_file + 'weights_wv.{epoch:02d}.hdf5', 138 | # save_best_only=False) 139 | # early_stopping = EarlyStopping(monitor='val_loss', patience=25, verbose=1) 140 | 141 | # training 142 | model.fit(X, Y, batch_size=8, epochs=100, validation_data=(tX, tY), shuffle=True, 143 | callbacks=[save_best], class_weight=ratio) 144 | 145 | def 
load_train_validation_test_data(self): 146 | self.train = dh.loaddata(self._train_file, self._word_file_path, normalize_text=True, 147 | split_hashtag=True, 148 | ignore_profiles=False) 149 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, normalize_text=True, 150 | split_hashtag=True, 151 | ignore_profiles=False) 152 | if (self._test_file != None): 153 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True, 154 | split_hashtag=True, 155 | ignore_profiles=True) 156 | 157 | def get_maxlen(self): 158 | return max(map(len, (x for _, x in self.train + self.validation))) 159 | 160 | def write_vocab(self): 161 | with open(self._vocab_file_path, 'w') as fw: 162 | for key, value in self._vocab.iteritems(): 163 | fw.write(str(key) + '\t' + str(value) + '\n') 164 | 165 | def calculate_label_ratio(self, labels): 166 | return collections.Counter(labels) 167 | 168 | 169 | class test_model(sarcasm_model): 170 | test = None 171 | model = None 172 | 173 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file, input_weight_file_path=None): 174 | print('initializing...') 175 | sarcasm_model.__init__(self) 176 | 177 | self._word_file_path = word_file_path 178 | self._model_file = model_file 179 | self._vocab_file_path = vocab_file_path 180 | self._output_file = output_file 181 | self._input_weight_file_path = input_weight_file_path 182 | 183 | print('test_maxlen', self._line_maxlen) 184 | 185 | def load_trained_model(self, weight_file='model_wv.json.hdf5'): 186 | start = time.time() 187 | self.__load_model(self._model_file + 'model_wv.json', self._model_file + weight_file) 188 | end = time.time() 189 | print('model loading time::', (end - start)) 190 | 191 | def __load_model(self, model_path, model_weight_path): 192 | self.model = model_from_json(open(model_path).read()) 193 | print('model loaded from file...') 194 | self.model.load_weights(model_weight_path) 195 | print('model weights loaded from file...') 196 | 197 | def load_vocab(self): 198 | vocab = defaultdict() 199 | with open(self._vocab_file_path, 'r') as f: 200 | for line in f.readlines(): 201 | key, value = line.split('\t') 202 | vocab[key] = value 203 | 204 | return vocab 205 | 206 | def predict(self, test_file, verbose=False): 207 | try: 208 | start = time.time() 209 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True, split_hashtag=True, 210 | ignore_profiles=True) 211 | end = time.time() 212 | if (verbose == True): 213 | print('test resource loading time::', (end - start)) 214 | 215 | self._vocab = self.load_vocab() 216 | 217 | start = time.time() 218 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 219 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 220 | end = time.time() 221 | if (verbose == True): 222 | print('test resource preparation time::', (end - start)) 223 | 224 | self.__predict_model(tX, self.test) 225 | except Exception as e: 226 | print('Error:', e) 227 | 228 | def __predict_model(self, tX, test): 229 | y = [] 230 | y_pred = [] 231 | 232 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1) 233 | 234 | try: 235 | fd = open(self._output_file + '_wv.analysis', 'w') 236 | for i, (label) in enumerate(prediction_probability): 237 | gold_label = test[i][0] 238 | words = test[i][1] 239 | dimensions = test[i][2] 240 | context = test[i][3] 241 | author = test[i][4] 242 | 243 | predicted = numpy.argmax(prediction_probability[i]) 244 | 245 | y.append(int(gold_label)) 246 | 
y_pred.append(predicted) 247 | 248 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t' 249 | + str(gold_label) + '\t' 250 | + str(predicted) + '\t' 251 | + ' '.join(words)) 252 | 253 | fd.write('\n') 254 | 255 | print() 256 | 257 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 258 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 259 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 260 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 261 | print('f_score::', metrics.classification_report(y, y_pred)) 262 | fd.close() 263 | except Exception as e: 264 | print(e) 265 | 266 | 267 | if __name__ == "__main__": 268 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 269 | train_file = basepath + '/resource/train/Train_v1.txt' 270 | validation_file = basepath + '/resource/dev/Dev_v1.txt' 271 | test_file = basepath + '/resource/test/Test_v1.txt' 272 | word_file_path = basepath + '/resource/word_list.txt' 273 | 274 | output_file = basepath + '/resource/text_model/TestResults.txt' 275 | model_file = basepath + '/resource/text_model/weights/' 276 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt' 277 | 278 | # word2vec path 279 | word2vec_path = '/home/striker/word2vec/GoogleNews-vectors-negative300.bin' 280 | 281 | tr = train_model(train_file, validation_file, word_file_path, model_file, vocab_file_path, output_file, 282 | word2vec_path=word2vec_path, test_file=test_file) 283 | 284 | t = test_model(word_file_path, model_file, vocab_file_path, output_file) 285 | t.load_trained_model() 286 | t.predict(test_file) 287 | -------------------------------------------------------------------------------- /src/sarcasm_detection_model_attention.py: -------------------------------------------------------------------------------- 1 | # still working 2 | import os 3 | import sys 4 | from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling1D 5 | 6 | sys.path.append('../') 7 | 8 | import collections 9 | import time 10 | import numpy 11 | 12 | numpy.random.seed(1337) 13 | from sklearn import metrics 14 | from keras import initializers, regularizers, constraints, Input 15 | from keras.models import Sequential, model_from_json 16 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten, Layer 17 | from keras.layers.embeddings import Embedding 18 | from keras.layers.recurrent import LSTM 19 | from keras.layers.convolutional import Convolution1D, MaxPooling1D, Convolution2D 20 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau 21 | from keras.callbacks import EarlyStopping 22 | from keras.optimizers import Adam 23 | from keras.utils import np_utils 24 | from collections import defaultdict 25 | import src.data_processing.data_handler as dh 26 | 27 | from keras import backend as K 28 | 29 | 30 | class Attention(Layer): 31 | def __init__(self, 32 | W_regularizer=None, b_regularizer=None, 33 | W_constraint=None, b_constraint=None, 34 | bias=True, **kwargs): 35 | """ 36 | Keras Layer that implements an Attention mechanism for temporal data. 37 | Supports Masking. 38 | Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756] 39 | # Input shape 40 | 3D tensor with shape: `(samples, steps, features)`. 41 | # Output shape 42 | 2D tensor with shape: `(samples, features)`. 43 | :param kwargs: 44 | Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True. 45 | The dimensions are inferred based on the output shape of the RNN. 
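        Computation (summary of call() below): for an input x of shape (samples, steps, features)
        the layer scores each time step as e_t = tanh(x_t . W + b), converts the scores into
        weights a_t = exp(e_t) / sum_s exp(e_s) (masked positions contribute zero), and returns
        the attention-weighted sum over time, sum_t a_t * x_t, of shape (samples, features).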
46 | Note: The layer has been tested with Keras 2.0.6 47 | Example: 48 | model.add(LSTM(64, return_sequences=True)) 49 | model.add(Attention()) 50 | # next add a Dense layer (for classification/regression) or whatever... 51 | """ 52 | self.supports_masking = True 53 | self.init = initializers.get('glorot_uniform') 54 | 55 | self.W_regularizer = regularizers.get(W_regularizer) 56 | self.b_regularizer = regularizers.get(b_regularizer) 57 | 58 | self.W_constraint = constraints.get(W_constraint) 59 | self.b_constraint = constraints.get(b_constraint) 60 | 61 | self.bias = bias 62 | super(Attention, self).__init__(**kwargs) 63 | 64 | def build(self, input_shape): 65 | assert len(input_shape) == 3 66 | 67 | self.W = self.add_weight((input_shape[-1],), 68 | initializer=self.init, 69 | name='{}_W'.format(self.name), 70 | regularizer=self.W_regularizer, 71 | constraint=self.W_constraint) 72 | if self.bias: 73 | self.b = self.add_weight((input_shape[1],), 74 | initializer='zero', 75 | name='{}_b'.format(self.name), 76 | regularizer=self.b_regularizer, 77 | constraint=self.b_constraint) 78 | else: 79 | self.b = None 80 | 81 | self.built = True 82 | 83 | def compute_mask(self, input, input_mask=None): 84 | # do not pass the mask to the next layers 85 | return None 86 | 87 | def call(self, x, mask=None): 88 | eij = K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1) 89 | 90 | if self.bias: 91 | eij += self.b 92 | 93 | eij = K.tanh(eij) 94 | 95 | a = K.exp(eij) 96 | 97 | # apply mask after the exp. will be re-normalized next 98 | if mask is not None: 99 | # Cast the mask to floatX to avoid float64 upcasting in theano 100 | a *= K.cast(mask, K.floatx()) 101 | 102 | # in some cases especially in the early stages of training the sum may be almost zero 103 | # and this results in NaN's. A workaround is to add a very small positive number ε to the sum. 
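        # (K.epsilon() used below is Keras' fuzz factor, typically 1e-7, so the denominator
        # cannot be exactly zero even when every time step of a sample is masked out.)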
104 | # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx()) 105 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx()) 106 | 107 | a = K.expand_dims(a) 108 | 109 | weighted_input = x * a 110 | return K.sum(weighted_input, axis=1) 111 | 112 | def compute_output_shape(self, input_shape): 113 | return (input_shape[0], input_shape[-1]) 114 | 115 | 116 | class sarcasm_model(): 117 | _train_file = None 118 | _test_file = None 119 | _tweet_file = None 120 | _output_file = None 121 | _model_file = None 122 | _word_file_path = None 123 | _split_word_file_path = None 124 | _emoji_file_path = None 125 | _vocab_file_path = None 126 | _input_weight_file_path = None 127 | _vocab = None 128 | _line_maxlen = None 129 | 130 | def __init__(self): 131 | self._line_maxlen = 50 132 | 133 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False): 134 | print('Build model...') 135 | model = Sequential() 136 | 137 | # input = Input(shape=(maxlen,)) 138 | 139 | # emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal')(input) 140 | 141 | model.add( 142 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal')) 143 | 144 | model.add( 145 | Convolution1D(hidden_units, 2, kernel_initializer='he_normal', padding='valid', 146 | activation='sigmoid')) 147 | model.add(MaxPooling1D(pool_size=2)) 148 | model.add(Dropout(0.25)) 149 | 150 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5, 151 | recurrent_dropout=0.5, unroll=True, return_sequences=True)) 152 | 153 | model.add(Attention()) 154 | 155 | # model.add(GlobalAveragePooling1D()) 156 | # model.add(Dropout(0.5)) 157 | 158 | model.add(Dense(2)) 159 | model.add(Activation('softmax')) 160 | adam = Adam(lr=0.001) 161 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) 162 | print('No of parameter:', model.count_params()) 163 | 164 | print(model.summary()) 165 | return model 166 | 167 | 168 | class train_model(sarcasm_model): 169 | train = None 170 | validation = None 171 | print("Loading resource...") 172 | 173 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 174 | vocab_file, 175 | output_file, 176 | input_weight_file_path=None): 177 | sarcasm_model.__init__(self) 178 | 179 | self._train_file = train_file 180 | self._validation_file = validation_file 181 | self._word_file_path = word_file_path 182 | self._split_word_file_path = split_word_path 183 | self._emoji_file_path = emoji_file_path 184 | self._model_file = model_file 185 | self._vocab_file_path = vocab_file 186 | self._output_file = output_file 187 | self._input_weight_file_path = input_weight_file_path 188 | 189 | self.load_train_validation_data() 190 | 191 | print(self._line_maxlen) 192 | 193 | # build vocabulary 194 | # truncates words with min freq=10 195 | self._vocab = dh.build_vocab(self.train, min_freq=2) 196 | if ('unk' not in self._vocab): 197 | self._vocab['unk'] = len(self._vocab.keys()) + 1 198 | 199 | print(len(self._vocab.keys()) + 1) 200 | print('unk::', self._vocab['unk']) 201 | 202 | dh.write_vocab(self._vocab_file_path, self._vocab) 203 | 204 | # prepares input 205 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab) 206 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 207 | 208 | # prepares input 209 | tX, tY, tD, tC, tA = 
dh.vectorize_word_dimension(self.validation, self._vocab) 210 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 211 | 212 | # embedding dimension 213 | dimension_size = 30 214 | 215 | # solving class imbalance 216 | ratio = self.calculate_label_ratio(Y) 217 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 218 | print('class ratio::', ratio) 219 | 220 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 221 | 222 | print('train_X', X.shape) 223 | print('train_Y', Y.shape) 224 | print('validation_X', tX.shape) 225 | print('validation_Y', tY.shape) 226 | 227 | # trainable true if you want word2vec weights to be updated 228 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=128, 229 | embedding_dimension=dimension_size, 230 | trainable=True) 231 | 232 | open(self._model_file + 'model.json', 'w').write(model.to_json()) 233 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True) 234 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5', 235 | save_best_only=False) 236 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1) 237 | lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto', 238 | epsilon=0.0001, 239 | cooldown=0, min_lr=0.000001) 240 | 241 | # training 242 | model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True, verbose=2, 243 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio) 244 | # model.fit(X, Y, batch_size=32, epochs=100, validation_split=0.1, shuffle=True, verbose=1, 245 | # callbacks=[save_best, lr_tuner, early_stopping], class_weight=ratio) 246 | 247 | def load_train_validation_data(self): 248 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 249 | self._emoji_file_path, normalize_text=True, 250 | split_hashtag=True, 251 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True) 252 | print('Training data loading finished...') 253 | 254 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 255 | self._emoji_file_path, 256 | normalize_text=True, 257 | split_hashtag=False, 258 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True) 259 | print('Validation data loading finished...') 260 | 261 | def get_maxlen(self): 262 | return max(map(len, (x for _, x in self.train + self.validation))) 263 | 264 | def write_vocab(self): 265 | with open(self._vocab_file_path, 'w') as fw: 266 | for key, value in self._vocab.iteritems(): 267 | fw.write(str(key) + '\t' + str(value) + '\n') 268 | 269 | def calculate_label_ratio(self, labels): 270 | return collections.Counter(labels) 271 | 272 | 273 | class test_model(sarcasm_model): 274 | test = None 275 | model = None 276 | 277 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file, 278 | input_weight_file_path=None): 279 | print('initializing...') 280 | sarcasm_model.__init__(self) 281 | 282 | self._model_file = model_file 283 | self._word_file_path = word_file_path 284 | self._split_word_file_path = split_word_path 285 | self._emoji_file_path = emoji_file_path 286 | self._vocab_file_path = vocab_file_path 287 | self._output_file = output_file 288 | self._input_weight_file_path = input_weight_file_path 289 | 290 | print('test_maxlen', self._line_maxlen) 291 | 292 | def load_trained_model(self, weight_file='model.json.hdf5'): 293 | start = 
time.time() 294 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file) 295 | end = time.time() 296 | print('model loading time::', (end - start)) 297 | 298 | def __load_model(self, model_path, model_weight_path): 299 | self.model = model_from_json(open(model_path).read()) 300 | print('model loaded from file...') 301 | self.model.load_weights(model_weight_path) 302 | print('model weights loaded from file...') 303 | 304 | def load_vocab(self): 305 | vocab = defaultdict() 306 | with open(self._vocab_file_path, 'r') as f: 307 | for line in f.readlines(): 308 | key, value = line.split('\t') 309 | vocab[key] = value 310 | 311 | return vocab 312 | 313 | def predict(self, test_file, verbose=False): 314 | try: 315 | start = time.time() 316 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path, 317 | normalize_text=True, split_hashtag=True, 318 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True) 319 | end = time.time() 320 | if (verbose == True): 321 | print('test resource loading time::', (end - start)) 322 | 323 | self._vocab = self.load_vocab() 324 | print('vocab loaded...') 325 | 326 | start = time.time() 327 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 328 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 329 | end = time.time() 330 | if (verbose == True): 331 | print('test resource preparation time::', (end - start)) 332 | 333 | self.__predict_model(tX, self.test) 334 | except Exception as e: 335 | print('Error:', e) 336 | 337 | def __predict_model(self, tX, test): 338 | y = [] 339 | y_pred = [] 340 | 341 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1) 342 | 343 | try: 344 | fd = open(self._output_file + '.analysis', 'w') 345 | for i, (label) in enumerate(prediction_probability): 346 | id = test[i][0] 347 | gold_label = test[i][1] 348 | words = test[i][2] 349 | dimensions = test[i][3] 350 | context = test[i][4] 351 | author = test[i][5] 352 | 353 | predicted = numpy.argmax(prediction_probability[i]) 354 | 355 | y.append(int(gold_label)) 356 | y_pred.append(predicted) 357 | 358 | # fd.write(str(id) + '\t' + str(label[0]) + '\t' + str(label[1]) + '\t' 359 | # + str(gold_label) + '\t' 360 | # + str(predicted) + '\t' 361 | # + ' '.join(words)) 362 | fd.write(str(id) + ',' + ','.join([str(l) for l in label]) + '\n') 363 | 364 | print() 365 | 366 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 367 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 368 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 369 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 370 | print('f_score::', metrics.classification_report(y, y_pred)) 371 | fd.close() 372 | except Exception as e: 373 | print(e) 374 | 375 | 376 | if __name__ == "__main__": 377 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 378 | train_file = basepath + '/resource/train/Train_v1.txt' 379 | validation_file = basepath + '/resource/dev/Dev_v1.txt' 380 | test_file = basepath + '/resource/test/Test_v1.tsv' 381 | word_file_path = basepath + '/resource/word_list_freq.txt' 382 | split_word_path = basepath + '/resource/word_split.txt' 383 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 384 | 385 | output_file = basepath + '/resource/text_model/TestResults.txt' 386 | model_file = basepath + '/resource/text_model/weights/' 387 | vocab_file_path = basepath + 
'/resource/text_model/vocab_list.txt' 388 | 389 | # uncomment for training 390 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 391 | vocab_file_path, output_file) 392 | 393 | # t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file) 394 | # t.load_trained_model() 395 | # t.predict(test_file) 396 | -------------------------------------------------------------------------------- /src/sarcasm_detection_moods_siamese.py: -------------------------------------------------------------------------------- 1 | # not finalized 2 | import os 3 | import collections 4 | import random 5 | import sys 6 | 7 | sys.path.append('../') 8 | 9 | import time 10 | import numpy 11 | 12 | numpy.random.seed(1337) 13 | 14 | from keras.layers.wrappers import TimeDistributed 15 | from keras import backend as K, regularizers 16 | from sklearn import metrics 17 | from keras.models import model_from_json 18 | from keras.layers.core import Dropout, Dense, Activation, Flatten, Reshape 19 | from keras.layers.embeddings import Embedding 20 | from keras.layers.recurrent import LSTM 21 | from keras.layers.convolutional import Convolution1D 22 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 23 | 24 | from keras.layers.merge import add, concatenate, subtract, multiply 25 | from keras.models import Model 26 | from keras.utils import np_utils 27 | from keras.layers import Input 28 | import src.data_processing.data_handler as dh 29 | from collections import defaultdict 30 | 31 | 32 | class sarcasm_model(): 33 | _train_file = None 34 | _gold_data_path = None 35 | _validation_file = None 36 | _tweet_file = None 37 | # test_debug = None 38 | _output_file = None 39 | _model_file = None 40 | _word_file_path = None 41 | _vocab_file_path = None 42 | _input_weight_file_path = None 43 | _vocab = None 44 | _line_maxlen = None 45 | 46 | def __init__(self): 47 | self._train_file = None 48 | self._test_file = None 49 | self._validation_file = None 50 | self._tweet_file = None 51 | self._output_file = None 52 | self._model_file = None 53 | self._word_file_path = None 54 | self._vocab_file_path = None 55 | self._input_weight_file_path = None 56 | self._vocab = None 57 | 58 | self._line_maxlen = 30 59 | 60 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256, trainable=True, 61 | batch_size=1): 62 | 63 | print('Building model...') 64 | 65 | context_input = Input(name='context', batch_shape=(batch_size, maxlen)) 66 | 67 | if (len(c_emb_weights) == 0): 68 | c_emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal', 69 | trainable=trainable)(context_input) 70 | else: 71 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights], 72 | trainable=trainable)(context_input) 73 | 74 | c_cnn1 = Convolution1D(int(hidden_units / 2), 5, kernel_initializer='he_normal', bias_initializer='he_normal', 75 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen))(c_emb) 76 | c_cnn2 = Convolution1D(hidden_units, 5, kernel_initializer='he_normal', bias_initializer='he_normal', 77 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen - 2))( 78 | c_cnn1) 79 | 80 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal', 81 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid', 82 | 
kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01), 83 | recurrent_regularizer=regularizers.l2(0.01), 84 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False)(c_cnn2) 85 | 86 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal', 87 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid', 88 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01), 89 | recurrent_regularizer=regularizers.l2(0.01), 90 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False, 91 | go_backwards=True)(c_cnn2) 92 | 93 | c_merged = add([c_lstm1, c_lstm2]) 94 | c_merged = Dropout(0.25)(c_merged) 95 | 96 | text_input = Input(name='text', batch_shape=(batch_size, maxlen)) 97 | 98 | if (len(emb_weights) == 0): 99 | emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal', 100 | trainable=trainable)(text_input) 101 | else: 102 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights], 103 | trainable=trainable)(text_input) 104 | 105 | t_cnn1 = Convolution1D(int(hidden_units / 2), 5, kernel_initializer='he_normal', bias_initializer='he_normal', 106 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen))(emb) 107 | t_cnn2 = Convolution1D(hidden_units, 5, kernel_initializer='he_normal', bias_initializer='he_normal', 108 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen - 2))( 109 | t_cnn1) 110 | 111 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal', 112 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid', 113 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01), 114 | recurrent_regularizer=regularizers.l2(0.01), 115 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False)(t_cnn2) 116 | 117 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal', 118 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid', 119 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01), 120 | recurrent_regularizer=regularizers.l2(0.01), 121 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False, 122 | go_backwards=True)(t_cnn2) 123 | 124 | t_merged = add([t_lstm1, t_lstm2]) 125 | t_merged = Dropout(0.25)(t_merged) 126 | 127 | awc_input = Input(name='awc', batch_shape=(batch_size, 11)) 128 | 129 | t_merged = Reshape((-1, 1))(t_merged) 130 | 131 | t_merged = multiply([t_merged, awc_input]) 132 | 133 | t_merged = Flatten()(t_merged) 134 | 135 | merged = concatenate([c_merged, t_merged], axis=1) 136 | 137 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged) 138 | dnn_1 = Dropout(0.25)(dnn_1) 139 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1) 140 | 141 | softmax = Activation('softmax')(dnn_2) 142 | 143 | model = Model(inputs=[context_input, text_input, awc_input], outputs=softmax) 144 | 145 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 146 | print('No of parameter:', model.count_params()) 147 | 148 | print(model.summary()) 149 | return model 150 | 151 | 152 | class train_model(sarcasm_model): 153 | train = None 154 | validation = None 155 | 156 | def 
load_train_validation_test_data(self): 157 | print("Loading resource...") 158 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 159 | self._emoji_file_path, normalize_text=True, 160 | split_hashtag=True, 161 | ignore_profiles=False) 162 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 163 | self._emoji_file_path, 164 | normalize_text=True, 165 | split_hashtag=True, 166 | ignore_profiles=False) 167 | 168 | if (self._test_file != None): 169 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True, 170 | split_hashtag=True, 171 | ignore_profiles=True) 172 | 173 | def split_train_validation(self, train, ratio=.1): 174 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))]) 175 | print(len(test_indices)) 176 | train_data = [] 177 | validation_data = [] 178 | for i, t in enumerate(train): 179 | if (test_indices.__contains__(i)): 180 | validation_data.append(t) 181 | else: 182 | train_data.append(t) 183 | return train_data, validation_data 184 | 185 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 186 | vocab_file, 187 | output_file, 188 | input_weight_file_path=None): 189 | sarcasm_model.__init__(self) 190 | 191 | self._train_file = train_file 192 | self._validation_file = validation_file 193 | self._word_file_path = word_file_path 194 | self._split_word_file_path = split_word_path 195 | self._emoji_file_path = emoji_file_path 196 | self._model_file = model_file 197 | self._vocab_file_path = vocab_file 198 | self._output_file = output_file 199 | self._input_weight_file_path = input_weight_file_path 200 | 201 | self.load_train_validation_test_data() 202 | 203 | batch_size = 32 204 | 205 | print(self._line_maxlen) 206 | self._vocab = dh.build_vocab(self.train, ignore_context=False) 207 | self._vocab['unk'] = len(self._vocab.keys()) + 1 208 | 209 | print(len(self._vocab.keys()) + 1) 210 | print('unk::', self._vocab['unk']) 211 | 212 | dh.write_vocab(self._vocab_file_path, self._vocab) 213 | 214 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None) 215 | 216 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None) 217 | 218 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 219 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen) 220 | D = dh.pad_sequence_1d(D, maxlen=11) 221 | 222 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 223 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen) 224 | tD = dh.pad_sequence_1d(tD, maxlen=11) 225 | 226 | hidden_units = 128 227 | dimension_size = 300 228 | 229 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size, 230 | path=word2vec_path) 231 | 232 | cW = W 233 | 234 | print('Word2vec obtained....') 235 | 236 | ratio = self.calculate_label_ratio(Y) 237 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 238 | 239 | print('ratio', ratio) 240 | 241 | dimension_vocab = numpy.unique(D) 242 | print(len(dimension_vocab)) 243 | 244 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 245 | 246 | print('train_X', X.shape) 247 | print('train_C', C.shape) 248 | print('train_D', D.shape) 249 | print('train_Y', Y.shape) 250 | 251 | print('validation_X', tX.shape) 252 | print('validation_C', tC.shape) 253 | print('validation_D', tD.shape) 254 | print('validation_Y', tY.shape) 255 | 256 | model = 
self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW, 257 | hidden_units=hidden_units, trainable=False, dimension_length=11, 258 | batch_size=batch_size) 259 | 260 | open(self._model_file + 'model.json', 'w').write(model.to_json()) 261 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss') 262 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5', 263 | # save_best_only=False) 264 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1) 265 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto', 266 | epsilon=0.0001, 267 | cooldown=0, min_lr=0.000001) 268 | 269 | model.fit([C, X, D], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX, tD], tY), shuffle=True, 270 | callbacks=[save_best, lr_tuner], class_weight=ratio) 271 | 272 | def get_maxlen(self): 273 | return max(map(len, (x for _, x in self.train + self.validation))) 274 | 275 | def write_vocab(self): 276 | with open(self._vocab_file_path, 'w') as fw: 277 | for key, value in self._vocab.iteritems(): 278 | fw.write(str(key) + '\t' + str(value) + '\n') 279 | 280 | def calculate_label_ratio(self, labels, ): 281 | return collections.Counter(labels) 282 | 283 | 284 | class test_model(sarcasm_model): 285 | test = None 286 | model = None 287 | 288 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file): 289 | print('initializing...') 290 | sarcasm_model.__init__(self) 291 | 292 | self._word_file_path = word_file_path 293 | self._model_file = model_file 294 | self._vocab_file_path = vocab_file_path 295 | self._output_file = output_file 296 | 297 | # self._line_maxlen = 45 298 | print('test_maxlen', self._line_maxlen) 299 | 300 | def predict_cross_validation(self, tC, tX, tD, test): 301 | self.__predict_model([tC, tX, tD], test) 302 | 303 | def load_trained_model(self, weight_file='model.json.hdf5'): 304 | start = time.time() 305 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file) 306 | end = time.time() 307 | print('model loading time::', (end - start)) 308 | 309 | def __load_model(self, model_path, model_weight_path): 310 | self.model = model_from_json(open(model_path).read()) 311 | print('model loaded from file...') 312 | self.model.load_weights(model_weight_path) 313 | print('model weights loaded from file...') 314 | 315 | def load_vocab(self): 316 | vocab = defaultdict() 317 | with open(self._vocab_file_path, 'r') as f: 318 | for line in f.readlines(): 319 | key, value = line.split('\t') 320 | vocab[key] = value 321 | 322 | return vocab 323 | 324 | def predict(self, test_file, verbose=False): 325 | start = time.time() 326 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True, 327 | split_hashtag=True, 328 | ignore_profiles=False) 329 | end = time.time() 330 | if (verbose == True): 331 | print('test resource loading time::', (end - start)) 332 | 333 | self._vocab = self.load_vocab() 334 | 335 | start = time.time() 336 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 337 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 338 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen) 339 | tD = dh.pad_sequence_1d(tD, maxlen=11) 340 | 341 | end = time.time() 342 | if (verbose == True): 343 | print('test resource preparation time::', (end - start)) 344 | 345 | self.__predict_model([tC, tX, tD], self.test) 346 | 347 | def 
__predict_model(self, tX, test): 348 | prediction_probability = self.model.predict_file(tX, batch_size=8, verbose=1) 349 | 350 | y = [] 351 | y_pred = [] 352 | 353 | fd = open(self._output_file + '.analysis', 'w') 354 | for i, (label) in enumerate(prediction_probability): 355 | gold_label = test[i][0] 356 | words = test[i][1] 357 | dimensions = test[i][2] 358 | context = test[i][3] 359 | author = test[i][4] 360 | 361 | predicted = numpy.argmax(prediction_probability[i]) 362 | 363 | y.append(int(gold_label)) 364 | y_pred.append(predicted) 365 | 366 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t' 367 | + str(gold_label) + '\t' 368 | + str(predicted) + '\t' 369 | + ' '.join(words) + '\t' 370 | + str(dimensions) + '\t' 371 | + ' '.join(context)) 372 | 373 | fd.write('\n') 374 | 375 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 376 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 377 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 378 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 379 | print('f_score::', metrics.classification_report(y, y_pred)) 380 | 381 | fd.close() 382 | 383 | 384 | if __name__ == "__main__": 385 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 386 | train_file = basepath + '/resource/train/Train_context_moods_v1.txt' 387 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt' 388 | test_file = basepath + '/resource/test/Test_context_AW.txt' 389 | word_file_path = basepath + '/resource/word_list_freq.txt' 390 | split_word_path = basepath + '/resource/word_split.txt' 391 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 392 | 393 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt' 394 | model_file = basepath + '/resource/text_context_awc_model/weights/' 395 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt' 396 | 397 | # word2vec path 398 | word2vec_path = '/home/word2vec/GoogleNews-vectors-negative300.bin' 399 | 400 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 401 | vocab_file_path, output_file) 402 | 403 | # testing the model 404 | # with K.get_session(): 405 | # t = test_model(word_file_path, model_file, vocab_file_path, output_file) 406 | # t.load_trained_model() 407 | # t.predict(test_file) 408 | -------------------------------------------------------------------------------- /src/sarcasm_detection_siamese.py: -------------------------------------------------------------------------------- 1 | # not finalized 2 | import os 3 | import collections 4 | import random 5 | import sys 6 | 7 | sys.path.append('../') 8 | 9 | import time 10 | import numpy 11 | 12 | numpy.random.seed(1337) 13 | 14 | from keras.layers.wrappers import TimeDistributed 15 | from keras import backend as K, regularizers 16 | from sklearn import metrics 17 | from keras.models import model_from_json 18 | from keras.layers.core import Dropout, Dense, Activation, Flatten 19 | from keras.layers.embeddings import Embedding 20 | from keras.layers.recurrent import LSTM 21 | from keras.layers.convolutional import Convolution1D 22 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau 23 | 24 | from keras.layers.merge import add, concatenate, subtract 25 | from keras.models import Model 26 | from keras.utils import np_utils 27 | from keras.layers import Input 28 | import src.data_processing.data_handler as dh 29 | from collections import 
defaultdict 30 | 31 | 32 | class sarcasm_model(): 33 | _train_file = None 34 | _gold_data_path = None 35 | _validation_file = None 36 | _tweet_file = None 37 | # test_debug = None 38 | _output_file = None 39 | _model_file = None 40 | _word_file_path = None 41 | _vocab_file_path = None 42 | _input_weight_file_path = None 43 | _vocab = None 44 | _line_maxlen = None 45 | 46 | def __init__(self): 47 | self._train_file = None 48 | self._test_file = None 49 | self._validation_file = None 50 | self._tweet_file = None 51 | self._output_file = None 52 | self._model_file = None 53 | self._word_file_path = None 54 | self._vocab_file_path = None 55 | self._input_weight_file_path = None 56 | self._vocab = None 57 | 58 | self._line_maxlen = 30 59 | 60 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256, trainable=True, 61 | batch_size=1): 62 | 63 | print('Building model...') 64 | 65 | context_input = Input(name='context', batch_shape=(batch_size, maxlen)) 66 | 67 | if (len(c_emb_weights) == 0): 68 | c_emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal', 69 | trainable=trainable)(context_input) 70 | else: 71 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights], 72 | trainable=trainable)(context_input) 73 | 74 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal', 75 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid', 76 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01), 77 | recurrent_regularizer=regularizers.l2(0.01), 78 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False)(c_emb) 79 | 80 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal', 81 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid', 82 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01), 83 | recurrent_regularizer=regularizers.l2(0.01), 84 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False, 85 | go_backwards=True)(c_emb) 86 | 87 | c_merged = add([c_lstm1, c_lstm2]) 88 | c_merged = Dropout(0.25)(c_merged) 89 | 90 | text_input = Input(name='text', batch_shape=(batch_size, maxlen)) 91 | 92 | if (len(emb_weights) == 0): 93 | emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal', 94 | trainable=trainable)(text_input) 95 | else: 96 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights], 97 | trainable=trainable)(text_input) 98 | 99 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal', 100 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid', 101 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01), 102 | recurrent_regularizer=regularizers.l2(0.01), 103 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False)(emb) 104 | 105 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal', 106 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid', 107 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01), 108 | recurrent_regularizer=regularizers.l2(0.01), 109 | dropout=0.25, recurrent_dropout=0.25, 
unit_forget_bias=False, return_sequences=False, 110 | go_backwards=True)(emb) 111 | 112 | t_merged = add([t_lstm1, t_lstm2]) 113 | t_merged = Dropout(0.25)(t_merged) 114 | 115 | merged = subtract([c_merged, t_merged]) 116 | 117 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged) 118 | dnn_1 = Dropout(0.25)(dnn_1) 119 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1) 120 | 121 | softmax = Activation('softmax')(dnn_2) 122 | 123 | model = Model(inputs=[context_input, text_input], outputs=softmax) 124 | 125 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 126 | print('No of parameter:', model.count_params()) 127 | 128 | print(model.summary()) 129 | return model 130 | 131 | 132 | class train_model(sarcasm_model): 133 | train = None 134 | validation = None 135 | 136 | def load_train_validation_test_data(self): 137 | print("Loading resource...") 138 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path, 139 | self._emoji_file_path, normalize_text=True, 140 | split_hashtag=True, 141 | ignore_profiles=False) 142 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path, 143 | self._emoji_file_path, 144 | normalize_text=True, 145 | split_hashtag=True, 146 | ignore_profiles=False) 147 | 148 | if (self._test_file != None): 149 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True, 150 | split_hashtag=True, 151 | ignore_profiles=True) 152 | 153 | def split_train_validation(self, train, ratio=.1): 154 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))]) 155 | print(len(test_indices)) 156 | train_data = [] 157 | validation_data = [] 158 | for i, t in enumerate(train): 159 | if (test_indices.__contains__(i)): 160 | validation_data.append(t) 161 | else: 162 | train_data.append(t) 163 | return train_data, validation_data 164 | 165 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 166 | vocab_file, 167 | output_file, 168 | input_weight_file_path=None): 169 | sarcasm_model.__init__(self) 170 | 171 | self._train_file = train_file 172 | self._validation_file = validation_file 173 | self._word_file_path = word_file_path 174 | self._split_word_file_path = split_word_path 175 | self._emoji_file_path = emoji_file_path 176 | self._model_file = model_file 177 | self._vocab_file_path = vocab_file 178 | self._output_file = output_file 179 | self._input_weight_file_path = input_weight_file_path 180 | 181 | self.load_train_validation_test_data() 182 | 183 | batch_size = 32 184 | 185 | print(self._line_maxlen) 186 | self._vocab = dh.build_vocab(self.train, ignore_context=False) 187 | self._vocab['unk'] = len(self._vocab.keys()) + 1 188 | 189 | print(len(self._vocab.keys()) + 1) 190 | print('unk::', self._vocab['unk']) 191 | 192 | dh.write_vocab(self._vocab_file_path, self._vocab) 193 | 194 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None) 195 | 196 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None) 197 | 198 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen) 199 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen) 200 | D = dh.pad_sequence_1d(D, maxlen=11) 201 | 202 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 203 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen) 204 | tD = dh.pad_sequence_1d(tD, maxlen=11) 
205 | 206 | hidden_units = 128 207 | dimension_size = 300 208 | 209 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size, 210 | path=word2vec_path) 211 | 212 | cW = W 213 | 214 | print('Word2vec obtained....') 215 | 216 | ratio = self.calculate_label_ratio(Y) 217 | ratio = [max(ratio.values()) / value for key, value in ratio.items()] 218 | 219 | print('ratio', ratio) 220 | 221 | dimension_vocab = numpy.unique(D) 222 | print(len(dimension_vocab)) 223 | 224 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)] 225 | 226 | print('train_X', X.shape) 227 | print('train_C', C.shape) 228 | print('train_D', D.shape) 229 | print('train_Y', Y.shape) 230 | 231 | print('validation_X', tX.shape) 232 | print('validation_C', tC.shape) 233 | print('validation_D', tD.shape) 234 | print('validation_Y', tY.shape) 235 | 236 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW, 237 | hidden_units=hidden_units, trainable=False, 238 | batch_size=batch_size) 239 | 240 | open(self._model_file + 'model.json', 'w').write(model.to_json()) 241 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss') 242 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5', 243 | # save_best_only=False) 244 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1) 245 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto', 246 | epsilon=0.0001, 247 | cooldown=0, min_lr=0.000001) 248 | 249 | model.fit([C, X], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX], tY), shuffle=True, 250 | callbacks=[save_best, lr_tuner], class_weight=ratio) 251 | 252 | def get_maxlen(self): 253 | return max(map(len, (x for _, x in self.train + self.validation))) 254 | 255 | def write_vocab(self): 256 | with open(self._vocab_file_path, 'w') as fw: 257 | for key, value in self._vocab.iteritems(): 258 | fw.write(str(key) + '\t' + str(value) + '\n') 259 | 260 | def calculate_label_ratio(self, labels, ): 261 | return collections.Counter(labels) 262 | 263 | 264 | class test_model(sarcasm_model): 265 | test = None 266 | model = None 267 | 268 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file): 269 | print('initializing...') 270 | sarcasm_model.__init__(self) 271 | 272 | self._word_file_path = word_file_path 273 | self._model_file = model_file 274 | self._vocab_file_path = vocab_file_path 275 | self._output_file = output_file 276 | 277 | # self._line_maxlen = 45 278 | print('test_maxlen', self._line_maxlen) 279 | 280 | def predict_cross_validation(self, tC, tX, tD, test): 281 | self.__predict_model([tC, tX, tD], test) 282 | 283 | def load_trained_model(self, weight_file='model.json.hdf5'): 284 | start = time.time() 285 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file) 286 | end = time.time() 287 | print('model loading time::', (end - start)) 288 | 289 | def __load_model(self, model_path, model_weight_path): 290 | self.model = model_from_json(open(model_path).read()) 291 | print('model loaded from file...') 292 | self.model.load_weights(model_weight_path) 293 | print('model weights loaded from file...') 294 | 295 | def load_vocab(self): 296 | vocab = defaultdict() 297 | with open(self._vocab_file_path, 'r') as f: 298 | for line in f.readlines(): 299 | key, value = line.split('\t') 300 | vocab[key] = value 301 | 302 | return vocab 303 | 304 | def predict(self, test_file, 
verbose=False): 305 | start = time.time() 306 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True, 307 | split_hashtag=True, 308 | ignore_profiles=False) 309 | end = time.time() 310 | if (verbose == True): 311 | print('test resource loading time::', (end - start)) 312 | 313 | self._vocab = self.load_vocab() 314 | 315 | start = time.time() 316 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab) 317 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen) 318 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen) 319 | tD = dh.pad_sequence_1d(tD, maxlen=11) 320 | 321 | end = time.time() 322 | if (verbose == True): 323 | print('test resource preparation time::', (end - start)) 324 | 325 | self.__predict_model([tC, tX, tD], self.test) 326 | 327 | def __predict_model(self, tX, test): 328 | prediction_probability = self.model.predict_file(tX, batch_size=8, verbose=1) 329 | 330 | y = [] 331 | y_pred = [] 332 | 333 | fd = open(self._output_file + '.analysis', 'w') 334 | for i, (label) in enumerate(prediction_probability): 335 | gold_label = test[i][0] 336 | words = test[i][1] 337 | dimensions = test[i][2] 338 | context = test[i][3] 339 | author = test[i][4] 340 | 341 | predicted = numpy.argmax(prediction_probability[i]) 342 | 343 | y.append(int(gold_label)) 344 | y_pred.append(predicted) 345 | 346 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t' 347 | + str(gold_label) + '\t' 348 | + str(predicted) + '\t' 349 | + ' '.join(words) + '\t' 350 | + str(dimensions) + '\t' 351 | + ' '.join(context)) 352 | 353 | fd.write('\n') 354 | 355 | print('accuracy::', metrics.accuracy_score(y, y_pred)) 356 | print('precision::', metrics.precision_score(y, y_pred, average='weighted')) 357 | print('recall::', metrics.recall_score(y, y_pred, average='weighted')) 358 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted')) 359 | print('f_score::', metrics.classification_report(y, y_pred)) 360 | 361 | fd.close() 362 | 363 | 364 | if __name__ == "__main__": 365 | basepath = os.getcwd()[:os.getcwd().rfind('/')] 366 | train_file = basepath + '/resource/train/Train_context_moods_v1.txt' 367 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt' 368 | test_file = basepath + '/resource/test/Test_context_AW.txt' 369 | word_file_path = basepath + '/resource/word_list_freq.txt' 370 | split_word_path = basepath + '/resource/word_split.txt' 371 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt' 372 | 373 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt' 374 | model_file = basepath + '/resource/text_context_awc_model/weights/' 375 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt' 376 | 377 | # word2vec path 378 | word2vec_path = '/home/word2vec/GoogleNews-vectors-negative300.bin' 379 | 380 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file, 381 | vocab_file_path, output_file) 382 | 383 | # testing the model 384 | # with K.get_session(): 385 | # t = test_model(word_file_path, model_file, vocab_file_path, output_file) 386 | # t.load_trained_model() 387 | # t.predict(test_file) 388 | --------------------------------------------------------------------------------
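Usage sketch (not part of the repository files): the scripts above share the same driver pattern shown in their `__main__` blocks — construct a `train_model` to fit the network and write `model.json`, the `.hdf5` weights and the vocab list under `resource/`, then construct a `test_model` and call `load_trained_model()` followed by `predict()`. The snippet below simply mirrors the commented-out testing block of the first (text-only) script; the module name on the import line is an assumption (use whichever `sarcasm_detection_model_*` script actually produced the weights), and the paths assume the script is launched from inside `src/` so that `basepath` resolves to the repository root.

    # Hedged sketch of running the text-only test pipeline.
    # The imported module name is an assumption -- substitute the script that
    # wrote the weights found under resource/text_model/weights/.
    import os
    from sarcasm_detection_model_CNN_LSTM_DNN import test_model  # assumed module name

    basepath = os.getcwd()[:os.getcwd().rfind('/')]  # repo root, assuming cwd is <repo>/src
    word_file_path = basepath + '/resource/word_list_freq.txt'
    split_word_path = basepath + '/resource/word_split.txt'
    emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
    model_file = basepath + '/resource/text_model/weights/'            # holds model.json + model.json.hdf5
    vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
    output_file = basepath + '/resource/text_model/TestResults.txt'
    test_file = basepath + '/resource/test/Test_v1.tsv'                # path used by the script's __main__

    t = test_model(model_file, word_file_path, split_word_path, emoji_file_path,
                   vocab_file_path, output_file)
    t.load_trained_model()   # reads model.json and model.json.hdf5 from model_file
    t.predict(test_file)     # writes per-line prediction probabilities to output_file + '.analysis'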