├── .gitignore
├── LICENSE.txt
├── README.md
├── __init__.py
├── resource
│   ├── .directory
│   ├── abbreviations.txt
│   ├── dev
│   │   ├── Dev_context_moods.txt
│   │   ├── Dev_v1.txt
│   │   └── __init__.py
│   ├── emoji_unicode_names_final.txt
│   ├── offensive_words.txt
│   ├── test
│   │   └── Test_v1.txt
│   ├── text_context_awc_model
│   │   └── weights
│   │       └── model.json
│   ├── text_model
│   │   ├── vocab_list.txt
│   │   └── weights
│   │       └── model.json
│   ├── text_model_2D
│   │   ├── vocab_list.txt
│   │   └── weights
│   │       └── model.json
│   ├── train
│   │   ├── .directory
│   │   └── Train_v1.txt
│   ├── word_list.txt
│   ├── word_list_freq.txt
│   └── word_split.txt
└── src
    ├── __init__.py
    ├── data_processing
    │   ├── __init__.py
    │   ├── data_handler.py
    │   └── glove2Word2vecLoader.py
    ├── sarcasm_context_moods.py
    ├── sarcasm_detection_model_CNN_DNN_2D.py
    ├── sarcasm_detection_model_CNN_LSTM_ATTN.py
    ├── sarcasm_detection_model_CNN_LSTM_DNN.py
    ├── sarcasm_detection_model_CNN_LSTM_DNN_fasttext.py
    ├── sarcasm_detection_model_CNN_LSTM_DNN_simpler.py
    ├── sarcasm_detection_model_CNN_LSTM_DNN_word2vec.py
    ├── sarcasm_detection_model_attention.py
    ├── sarcasm_detection_moods_siamese.py
    └── sarcasm_detection_siamese.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SarcasmDetection
2 | Sarcasm detection on tweets using a neural network.
3 | This repository performs semantic modelling of sentences with neural networks for the task of sarcasm detection ([Ghosh & Veale, 2016](http://www.aclweb.org/anthology/W16-0425)).
4 | ## Prerequisites
5 | - nltk (TweetTokenizer)
6 | - Keras
7 | - Tensorflow
8 | - numpy
9 | - scipy
10 | - gensim (if you are using word2vec)
11 | - itertools
12 |
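All of the above except `itertools` (which ships with the Python standard library) are available from PyPI; a minimal install sketch, with no version pins (note that the bundled `model.json` files were exported with Keras 2.1.x, so an older Keras/TensorFlow pairing may be needed to reload them unchanged):
```
pip install nltk keras tensorflow numpy scipy gensim
```
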
13 | ## Cloning the repository
14 | ```
15 | git clone git@github.com:AniSkywalker/SarcasmDetection.git
16 | cd SarcasmDetection/src/
17 | ```
18 | If you want to use the pre-trained model, you'll have to [download it](https://drive.google.com/drive/folders/0B7C_0ZfEBcpRbDZKelBZTFFsV0E?usp=sharing) from Google Drive and save it into `/resource/text_model/weights/`.
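For reference, a minimal sketch of restoring the pre-trained model in Keras, assuming it is run from `src/` (as in the clone step above) and that the downloaded weight file is named `model.json.hdf5`; adjust the name to whatever the Drive folder actually contains, since only `model.json` (the architecture) ships with this repository:
```
from keras.models import model_from_json

# Rebuild the network from the saved architecture.
with open('../resource/text_model/weights/model.json') as f:
    model = model_from_json(f.read())

# Attach the downloaded weights (the file name here is an assumption, see note above).
model.load_weights('../resource/text_model/weights/model.json.hdf5')

# The network expects padded sequences of 30 word indices
# (see "batch_input_shape": [null, 30] in model.json).
```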
19 |
20 | ## Using this package
21 | Run the sarcasm detection model with the following command:
22 | ```
23 | python sarcasm_detection_model_CNN_LSTM_DNN.py
24 | ```
25 |
26 | ### Citation
27 | Please cite the following paper when using this code:
28 |
29 | > **Fracking Sarcasm using Neural Network.**
30 | > Aniruddha Ghosh and Tony Veale. 7th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis (WASSA 2016). NAACL-HLT. 16th June 2016, San Diego, California, U.S.A.
31 |
32 | ## Output
33 | The supplied input is rated as either **0** meaning _non-sarcastic_ or **1** meaning _sarcastic_.
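Internally, the saved architectures end in a two-unit softmax layer (see `/resource/text_model/weights/model.json`), so the class probabilities returned by `model.predict` can be collapsed to this rating with an argmax; a sketch, where `x_test` is a placeholder for your padded index sequences and class index 0 is assumed to correspond to the _non-sarcastic_ label:
```
import numpy as np

probs = np.asarray(model.predict(x_test))   # shape: (n_samples, 2)
labels = np.argmax(probs, axis=1)           # 0 = non-sarcastic, 1 = sarcastic
```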
34 |
35 | ## Training
36 | If you want to train the model with your own data, save your _train_, _development_ and _test_ data into the `/resource/train`, `/resource/dev` and `/resource/test` folders respectively.
37 |
38 | The system accepts datasets in tab-separated format, as shown below. An example can be found in [`/resource/train/Train_v1.txt`](https://github.com/AniSkywalker/SarcasmDetection/tree/master/resource/train).
39 | ```
40 | id<tab>label<tab>tweet
41 | ```
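For illustration only, two hypothetical rows in this layout (columns separated by a single tab character; the tweets are invented, not taken from the released data):
```
1	0	just finished a long run and the weather was perfect
2	1	oh great , another monday morning #sarcasm
```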
42 |
43 | ## Context information
44 | To run the model with context information and psychological dimensions (using TensorFlow), run:
45 | ```
46 | python sarcasm_context_moods.py
47 | ```
48 |
49 | ### Citation
50 | Please cite the following paper when using context information and psychological dimensions:
51 | > **Magnets for Sarcasm: Making Sarcasm Detection Timely, Contextual and Very Personal**
52 | > Aniruddha Ghosh and Tony Veale. Conference on Empirical Methods in Natural Language Processing (EMNLP). 7th-11th September, 2017, Copenhagen, Denmark.
53 |
54 | ## Notes
55 | - Samples of _train_, _dev_, and _test_ files are included for both versions.
56 | - For a test data set, please contact aniruddha.ghosh@ucdconnect.ie
57 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'root'
2 |
--------------------------------------------------------------------------------
/resource/.directory:
--------------------------------------------------------------------------------
1 | [Dolphin]
2 | Timestamp=2017,3,10,1,14,25
3 | Version=3
4 |
--------------------------------------------------------------------------------
/resource/abbreviations.txt:
--------------------------------------------------------------------------------
1 | i've i have
2 | we've we have
3 | can't can not
4 | i'm i am
5 | we're we are
6 | don't do not
7 | didn't did not
8 | tt's it is
9 | that's that is
10 | he's he is
11 | she's she is
12 | let's let us
13 | there's there is
14 | how's how is
15 | i'd i would
16 | 2F4U Too Fast For You
17 | 4YEO FYEO For Your Eyes Only
18 | AAMOF As a Matter of Fact
19 | ACK Acknowledgment
20 | AFAIK As far as I know
21 | AFAIR As far as I remember
22 | AFK Away from Keyboard
23 | AKA Also known as
24 | B2K BTK Back to Keyboard
25 | BTT Back to Topic
26 | BTW By the Way
27 | B/C Because
28 | C&P Copy and Paste
29 | CU See you
30 | CYS Check your Settings
31 | DIY Do it Yourself
32 | EOBD End of Business Day
33 | EOD End of Discussion
34 | EOM End of Message
35 | EOT End of Text
36 | FAQ Frequently asked Questions
37 | FACK Full Acknowledge
38 | FKA Formerly known as
39 | FWIW For what it is Worth
40 | FYI For your Information
41 | JFYI Just For your Information
42 | FTW Fuck the World
43 | HF Have fun
44 | HTH Hope this Helps
45 | IDK I do not know
46 | IIRC If I Recall Correctly
47 | IMHO In my Humble Opinion
48 | IMO In my Opinion
49 | IMNSHO In my not so Humble Opinion
50 | IOW In other Words
51 | ITT In this Thread
52 | LOL Laughing out loud
53 | DGMW Do not get me wrong
54 | MMW Mark my Words
55 | N/A Not Available
56 | NaN Not a Number
57 | NNTR No need to Reply
58 | noob Newbie
59 | n00b Newbie
60 | NOYB None of your Business
61 | NRN No Reply Necessary
62 | OMG Oh my God
63 | OP Original Poster
64 | OT Off Topic
65 | OTOH On the other Hand
66 | PEBKAC Problem exists between Keyboard and Chair
67 | POV Point of View
68 | ROTFL Rolling on the Floor Laughing
69 | RSVP Repondez s'il vous plait
70 | RTFM Read the fine Manual
71 | SCNR Sorry could not Resist
72 | SFLR Sorry for late Reply
73 | SPOC Single Point of Contact
74 | TBA To be Announced
75 | TBC To be Continued
76 | TIA Thanks in Advance
77 | TGIF Thanks God, its Friday
78 | THX TNX Thanks
79 | TQ Thank You
80 | TYVM Thank You Very Much
81 | TYT Take your Time
82 | TTYL Talk to you Later
83 | w00t Hooray
84 | WFM Works for Me
85 | WRT With Regard to
86 | WTH What the Hell
87 | WTF What the Fuck
88 | YMMD You made my Day
89 | YMMV Your Mileage may vary
90 | YAM Yet Another Meeting
91 | ICYMI In Case you missed it
--------------------------------------------------------------------------------
/resource/dev/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/resource/dev/__init__.py
--------------------------------------------------------------------------------
/resource/offensive_words.txt:
--------------------------------------------------------------------------------
1 | abbo
2 | abo
3 | abortion
4 | abuse
5 | addict
6 | addicts
7 | adult
8 | africa
9 | african
10 | alla
11 | allah
12 | alligatorbait
13 | amateur
14 | american
15 | anal
16 | analannie
17 | analsex
18 | angie
19 | angry
20 | anus
21 | arab
22 | arabs
23 | areola
24 | argie
25 | aroused
26 | arse
27 | arsehole
28 | asian
29 | ass
30 | assassin
31 | assassinate
32 | assassination
33 | assault
34 | assbagger
35 | assblaster
36 | assclown
37 | asscowboy
38 | asses
39 | assfuck
40 | assfucker
41 | asshat
42 | asshole
43 | assholes
44 | asshore
45 | assjockey
46 | asskiss
47 | asskisser
48 | assklown
49 | asslick
50 | asslicker
51 | asslover
52 | assman
53 | assmonkey
54 | assmunch
55 | assmuncher
56 | asspacker
57 | asspirate
58 | asspuppies
59 | assranger
60 | asswhore
61 | asswipe
62 | athletesfoot
63 | attack
64 | australian
65 | babe
66 | babies
67 | backdoor
68 | backdoorman
69 | backseat
70 | badfuck
71 | balllicker
72 | balls
73 | ballsack
74 | banging
75 | baptist
76 | barelylegal
77 | barf
78 | barface
79 | barfface
80 | bast
81 | bastard
82 | bazongas
83 | bazooms
84 | beaner
85 | beast
86 | beastality
87 | beastial
88 | beastiality
89 | beatoff
90 | beat-off
91 | beatyourmeat
92 | beaver
93 | bestial
94 | bestiality
95 | bi
96 | biatch
97 | bible
98 | bicurious
99 | bigass
100 | bigbastard
101 | bigbutt
102 | bigger
103 | bisexual
104 | bi-sexual
105 | bitch
106 | bitcher
107 | bitches
108 | bitchez
109 | bitchin
110 | bitching
111 | bitchslap
112 | bitchy
113 | biteme
114 | black
115 | blackman
116 | blackout
117 | blacks
118 | blind
119 | blow
120 | blowjob
121 | boang
122 | bogan
123 | bohunk
124 | bollick
125 | bollock
126 | bomb
127 | bombers
128 | bombing
129 | bombs
130 | bomd
131 | bondage
132 | boner
133 | bong
134 | boob
135 | boobies
136 | boobs
137 | booby
138 | boody
139 | boom
140 | boong
141 | boonga
142 | boonie
143 | booty
144 | bootycall
145 | bountybar
146 | bra
147 | brea5t
148 | breast
149 | breastjob
150 | breastlover
151 | breastman
152 | brothel
153 | bugger
154 | buggered
155 | buggery
156 | bullcrap
157 | bulldike
158 | bulldyke
159 | bullshit
160 | bumblefuck
161 | bumfuck
162 | bunga
163 | bunghole
164 | buried
165 | burn
166 | butchbabes
167 | butchdike
168 | butchdyke
169 | butt
170 | buttbang
171 | butt-bang
172 | buttface
173 | buttfuck
174 | butt-fuck
175 | buttfucker
176 | butt-fucker
177 | buttfuckers
178 | butt-fuckers
179 | butthead
180 | buttman
181 | buttmunch
182 | buttmuncher
183 | buttpirate
184 | buttplug
185 | buttstain
186 | byatch
187 | cacker
188 | cameljockey
189 | cameltoe
190 | canadian
191 | cancer
192 | carpetmuncher
193 | carruth
194 | catholic
195 | catholics
196 | cemetery
197 | chav
198 | cherrypopper
199 | chickslick
200 | children's
201 | chin
202 | chinaman
203 | chinamen
204 | chinese
205 | chink
206 | chinky
207 | choad
208 | chode
209 | christ
210 | christian
211 | church
212 | cigarette
213 | cigs
214 | clamdigger
215 | clamdiver
216 | clit
217 | clitoris
218 | clogwog
219 | cocaine
220 | cock
221 | cockblock
222 | cockblocker
223 | cockcowboy
224 | cockfight
225 | cockhead
226 | cockknob
227 | cocklicker
228 | cocklover
229 | cocknob
230 | cockqueen
231 | cockrider
232 | cocksman
233 | cocksmith
234 | cocksmoker
235 | cocksucer
236 | cocksuck
237 | cocksucked
238 | cocksucker
239 | cocksucking
240 | cocktail
241 | cocktease
242 | cocky
243 | cohee
244 | coitus
245 | color
246 | colored
247 | coloured
248 | commie
249 | communist
250 | condom
251 | conservative
252 | conspiracy
253 | coolie
254 | cooly
255 | coon
256 | coondog
257 | copulate
258 | cornhole
259 | corruption
260 | cra5h
261 | crabs
262 | crack
263 | crackpipe
264 | crackwhore
265 | crack-whore
266 | crap
267 | crapola
268 | crapper
269 | crappy
270 | crash
271 | creamy
272 | crime
273 | crimes
274 | criminal
275 | criminals
276 | crotch
277 | crotchjockey
278 | crotchmonkey
279 | crotchrot
280 | cum
281 | cumbubble
282 | cumfest
283 | cumjockey
284 | cumm
285 | cummer
286 | cumming
287 | cumquat
288 | cumqueen
289 | cumshot
290 | cunilingus
291 | cunillingus
292 | cunn
293 | cunnilingus
294 | cunntt
295 | cunt
296 | cunteyed
297 | cuntfuck
298 | cuntfucker
299 | cuntlick
300 | cuntlicker
301 | cuntlicking
302 | cuntsucker
303 | cybersex
304 | cyberslimer
305 | dago
306 | dahmer
307 | dammit
308 | damn
309 | damnation
310 | damnit
311 | darkie
312 | darky
313 | datnigga
314 | dead
315 | deapthroat
316 | death
317 | deepthroat
318 | defecate
319 | dego
320 | demon
321 | deposit
322 | desire
323 | destroy
324 | deth
325 | devil
326 | devilworshipper
327 | dick
328 | dickbrain
329 | dickforbrains
330 | dickhead
331 | dickless
332 | dicklick
333 | dicklicker
334 | dickman
335 | dickwad
336 | dickweed
337 | diddle
338 | die
339 | died
340 | dies
341 | dike
342 | dildo
343 | dingleberry
344 | dink
345 | dipshit
346 | dipstick
347 | dirty
348 | disease
349 | diseases
350 | disturbed
351 | dive
352 | dix
353 | dixiedike
354 | dixiedyke
355 | doggiestyle
356 | doggystyle
357 | dong
358 | doodoo
359 | doo-doo
360 | doom
361 | dope
362 | dragqueen
363 | dragqween
364 | dripdick
365 | drug
366 | drunk
367 | drunken
368 | dumb
369 | dumbass
370 | dumbbitch
371 | dumbfuck
372 | dyefly
373 | dyke
374 | easyslut
375 | eatballs
376 | eatme
377 | eatpussy
378 | ecstacy
379 | ejaculate
380 | ejaculated
381 | ejaculating
382 | ejaculation
383 | enema
384 | enemy
385 | erect
386 | erection
387 | ero
388 | escort
389 | ethiopian
390 | ethnic
391 | european
392 | evl
393 | excrement
394 | execute
395 | executed
396 | execution
397 | executioner
398 | explosion
399 | facefucker
400 | faeces
401 | fag
402 | fagging
403 | faggot
404 | fagot
405 | failed
406 | failure
407 | fairies
408 | fairy
409 | faith
410 | fannyfucker
411 | fart
412 | farted
413 | farting
414 | farty
415 | fastfuck
416 | fat
417 | fatah
418 | fatass
419 | fatfuck
420 | fatfucker
421 | fatso
422 | fckcum
423 | fear
424 | feces
425 | felatio
426 | felch
427 | felcher
428 | felching
429 | fellatio
430 | feltch
431 | feltcher
432 | feltching
433 | fetish
434 | fight
435 | filipina
436 | filipino
437 | fingerfood
438 | fingerfuck
439 | fingerfucked
440 | fingerfucker
441 | fingerfuckers
442 | fingerfucking
443 | fire
444 | firing
445 | fister
446 | fistfuck
447 | fistfucked
448 | fistfucker
449 | fistfucking
450 | fisting
451 | flange
452 | flasher
453 | flatulence
454 | floo
455 | flydie
456 | flydye
457 | fok
458 | fondle
459 | footaction
460 | footfuck
461 | footfucker
462 | footlicker
463 | footstar
464 | fore
465 | foreskin
466 | forni
467 | fornicate
468 | foursome
469 | fourtwenty
470 | fraud
471 | freakfuck
472 | freakyfucker
473 | freefuck
474 | fu
475 | fubar
476 | fuc
477 | fucck
478 | fuck
479 | fucka
480 | fuckable
481 | fuckbag
482 | fuckbuddy
483 | fucked
484 | fuckedup
485 | fucker
486 | fuckers
487 | fuckface
488 | fuckfest
489 | fuckfreak
490 | fuckfriend
491 | fuckhead
492 | fuckher
493 | fuckin
494 | fuckina
495 | fucking
496 | fuckingbitch
497 | fuckinnuts
498 | fuckinright
499 | fuckit
500 | fuckknob
501 | fuckme
502 | fuckmehard
503 | fuckmonkey
504 | fuckoff
505 | fuckpig
506 | fucks
507 | fucktard
508 | fuckwhore
509 | fuckyou
510 | fudgepacker
511 | fugly
512 | fuk
513 | fuks
514 | funeral
515 | funfuck
516 | fungus
517 | fuuck
518 | gangbang
519 | gangbanged
520 | gangbanger
521 | gangsta
522 | gatorbait
523 | gay
524 | gaymuthafuckinwhore
525 | gaysex
526 | geez
527 | geezer
528 | geni
529 | genital
530 | german
531 | getiton
532 | gin
533 | ginzo
534 | gipp
535 | girls
536 | givehead
537 | glazeddonut
538 | gob
539 | god
540 | godammit
541 | goddamit
542 | goddammit
543 | goddamn
544 | goddamned
545 | goddamnes
546 | goddamnit
547 | goddamnmuthafucker
548 | goldenshower
549 | gonorrehea
550 | gonzagas
551 | gook
552 | gotohell
553 | goy
554 | goyim
555 | greaseball
556 | gringo
557 | groe
558 | gross
559 | grostulation
560 | gubba
561 | gummer
562 | gun
563 | gyp
564 | gypo
565 | gypp
566 | gyppie
567 | gyppo
568 | gyppy
569 | hamas
570 | handjob
571 | hapa
572 | harder
573 | hardon
574 | harem
575 | headfuck
576 | headlights
577 | hebe
578 | heeb
579 | hell
580 | henhouse
581 | heroin
582 | herpes
583 | heterosexual
584 | hijack
585 | hijacker
586 | hijacking
587 | hillbillies
588 | hindoo
589 | hiscock
590 | hitler
591 | hitlerism
592 | hitlerist
593 | hiv
594 | ho
595 | hobo
596 | hodgie
597 | hoes
598 | hole
599 | holestuffer
600 | homicide
601 | homo
602 | homobangers
603 | homosexual
604 | honger
605 | honk
606 | honkers
607 | honkey
608 | honky
609 | hook
610 | hooker
611 | hookers
612 | hooters
613 | hore
614 | hork
615 | horn
616 | horney
617 | horniest
618 | horny
619 | horseshit
620 | hosejob
621 | hoser
622 | hostage
623 | hotdamn
624 | hotpussy
625 | hottotrot
626 | hummer
627 | husky
628 | hussy
629 | hustler
630 | hymen
631 | hymie
632 | iblowu
633 | idiot
634 | ikey
635 | illegal
636 | incest
637 | insest
638 | intercourse
639 | interracial
640 | intheass
641 | inthebuff
642 | israel
643 | israeli
644 | israel's
645 | italiano
646 | itch
647 | jackass
648 | jackoff
649 | jackshit
650 | jacktheripper
651 | jade
652 | jap
653 | japanese
654 | japcrap
655 | jebus
656 | jeez
657 | jerkoff
658 | jesus
659 | jesuschrist
660 | jew
661 | jewish
662 | jiga
663 | jigaboo
664 | jigg
665 | jigga
666 | jiggabo
667 | jigger
668 | jiggy
669 | jihad
670 | jijjiboo
671 | jimfish
672 | jism
673 | jiz
674 | jizim
675 | jizjuice
676 | jizm
677 | jizz
678 | jizzim
679 | jizzum
680 | joint
681 | juggalo
682 | jugs
683 | junglebunny
684 | kaffer
685 | kaffir
686 | kaffre
687 | kafir
688 | kanake
689 | kid
690 | kigger
691 | kike
692 | kill
693 | killed
694 | killer
695 | killing
696 | kills
697 | kink
698 | kinky
699 | kissass
700 | kkk
701 | knife
702 | knockers
703 | kock
704 | kondum
705 | koon
706 | kotex
707 | krap
708 | krappy
709 | kraut
710 | kum
711 | kumbubble
712 | kumbullbe
713 | kummer
714 | kumming
715 | kumquat
716 | kums
717 | kunilingus
718 | kunnilingus
719 | kunt
720 | ky
721 | kyke
722 | lactate
723 | laid
724 | lapdance
725 | latin
726 | lesbain
727 | lesbayn
728 | lesbian
729 | lesbin
730 | lesbo
731 | lez
732 | lezbe
733 | lezbefriends
734 | lezbo
735 | lezz
736 | lezzo
737 | liberal
738 | libido
739 | licker
740 | lickme
741 | lies
742 | limey
743 | limpdick
744 | limy
745 | lingerie
746 | liquor
747 | livesex
748 | loadedgun
749 | lolita
750 | looser
751 | loser
752 | lotion
753 | lovebone
754 | lovegoo
755 | lovegun
756 | lovejuice
757 | lovemuscle
758 | lovepistol
759 | loverocket
760 | lowlife
761 | lsd
762 | lubejob
763 | lucifer
764 | luckycammeltoe
765 | lugan
766 | lynch
767 | macaca
768 | mad
769 | mafia
770 | magicwand
771 | mams
772 | manhater
773 | manpaste
774 | marijuana
775 | mastabate
776 | mastabater
777 | masterbate
778 | masterblaster
779 | mastrabator
780 | masturbate
781 | masturbating
782 | mattressprincess
783 | meatbeatter
784 | meatrack
785 | meth
786 | mexican
787 | mgger
788 | mggor
789 | mickeyfinn
790 | mideast
791 | milf
792 | minority
793 | mockey
794 | mockie
795 | mocky
796 | mofo
797 | moky
798 | moles
799 | molest
800 | molestation
801 | molester
802 | molestor
803 | moneyshot
804 | mooncricket
805 | mormon
806 | moron
807 | moslem
808 | mosshead
809 | mothafuck
810 | mothafucka
811 | mothafuckaz
812 | mothafucked
813 | mothafucker
814 | mothafuckin
815 | mothafucking
816 | mothafuckings
817 | motherfuck
818 | motherfucked
819 | motherfucker
820 | motherfuckin
821 | motherfucking
822 | motherfuckings
823 | motherlovebone
824 | muff
825 | muffdive
826 | muffdiver
827 | muffindiver
828 | mufflikcer
829 | mulatto
830 | muncher
831 | munt
832 | murder
833 | murderer
834 | muslim
835 | naked
836 | narcotic
837 | nasty
838 | nastybitch
839 | nastyho
840 | nastyslut
841 | nastywhore
842 | nazi
843 | necro
844 | negro
845 | negroes
846 | negroid
847 | negro's
848 | nig
849 | niger
850 | nigerian
851 | nigerians
852 | nigg
853 | nigga
854 | niggah
855 | niggaracci
856 | niggard
857 | niggarded
858 | niggarding
859 | niggardliness
860 | niggardliness's
861 | niggardly
862 | niggards
863 | niggard's
864 | niggaz
865 | nigger
866 | niggerhead
867 | niggerhole
868 | niggers
869 | nigger's
870 | niggle
871 | niggled
872 | niggles
873 | niggling
874 | nigglings
875 | niggor
876 | niggur
877 | niglet
878 | nignog
879 | nigr
880 | nigra
881 | nigre
882 | nip
883 | nipple
884 | nipplering
885 | nittit
886 | nlgger
887 | nlggor
888 | nofuckingway
889 | nook
890 | nookey
891 | nookie
892 | noonan
893 | nooner
894 | nude
895 | nudger
896 | nuke
897 | nutfucker
898 | nymph
899 | ontherag
900 | oral
901 | orga
902 | orgasim
903 | orgasm
904 | orgies
905 | orgy
906 | osama
907 | paki
908 | palesimian
909 | palestinian
910 | pansies
911 | pansy
912 | panti
913 | panties
914 | payo
915 | pearlnecklace
916 | peck
917 | pecker
918 | peckerwood
919 | pee
920 | peehole
921 | pee-pee
922 | peepshow
923 | peepshpw
924 | pendy
925 | penetration
926 | peni5
927 | penile
928 | penis
929 | penises
930 | penthouse
931 | period
932 | perv
933 | phonesex
934 | phuk
935 | phuked
936 | phuking
937 | phukked
938 | phukking
939 | phungky
940 | phuq
941 | pi55
942 | picaninny
943 | piccaninny
944 | pickaninny
945 | piker
946 | pikey
947 | piky
948 | pimp
949 | pimped
950 | pimper
951 | pimpjuic
952 | pimpjuice
953 | pimpsimp
954 | pindick
955 | piss
956 | pissed
957 | pisser
958 | pisses
959 | pisshead
960 | pissin
961 | pissing
962 | pissoff
963 | pistol
964 | pixie
965 | pixy
966 | playboy
967 | playgirl
968 | pocha
969 | pocho
970 | pocketpool
971 | pohm
972 | polack
973 | pom
974 | pommie
975 | pommy
976 | poo
977 | poon
978 | poontang
979 | poop
980 | pooper
981 | pooperscooper
982 | pooping
983 | poorwhitetrash
984 | popimp
985 | porchmonkey
986 | porn
987 | pornflick
988 | pornking
989 | porno
990 | pornography
991 | pornprincess
992 | pot
993 | poverty
994 | premature
995 | pric
996 | prick
997 | prickhead
998 | primetime
999 | propaganda
1000 | pros
1001 | prostitute
1002 | protestant
1003 | pu55i
1004 | pu55y
1005 | pube
1006 | pubic
1007 | pubiclice
1008 | pud
1009 | pudboy
1010 | pudd
1011 | puddboy
1012 | puke
1013 | puntang
1014 | purinapricness
1015 | puss
1016 | pussie
1017 | pussies
1018 | pussy
1019 | pussycat
1020 | pussyeater
1021 | pussyfucker
1022 | pussylicker
1023 | pussylips
1024 | pussylover
1025 | pussypounder
1026 | pusy
1027 | quashie
1028 | queef
1029 | queer
1030 | quickie
1031 | quim
1032 | ra8s
1033 | rabbi
1034 | racial
1035 | racist
1036 | radical
1037 | radicals
1038 | raghead
1039 | randy
1040 | rape
1041 | raped
1042 | raper
1043 | rapist
1044 | rearend
1045 | rearentry
1046 | rectum
1047 | redlight
1048 | redneck
1049 | reefer
1050 | reestie
1051 | refugee
1052 | reject
1053 | remains
1054 | rentafuck
1055 | republican
1056 | rere
1057 | retard
1058 | retarded
1059 | ribbed
1060 | rigger
1061 | rimjob
1062 | rimming
1063 | roach
1064 | robber
1065 | roundeye
1066 | rump
1067 | russki
1068 | russkie
1069 | sadis
1070 | sadom
1071 | samckdaddy
1072 | sandm
1073 | sandnigger
1074 | satan
1075 | scag
1076 | scallywag
1077 | scat
1078 | schlong
1079 | screw
1080 | screwyou
1081 | scrotum
1082 | scum
1083 | semen
1084 | seppo
1085 | servant
1086 | sex
1087 | sexed
1088 | sexfarm
1089 | sexhound
1090 | sexhouse
1091 | sexing
1092 | sexkitten
1093 | sexpot
1094 | sexslave
1095 | sextogo
1096 | sextoy
1097 | sextoys
1098 | sexual
1099 | sexually
1100 | sexwhore
1101 | sexy
1102 | sexymoma
1103 | sexy-slim
1104 | shag
1105 | shaggin
1106 | shagging
1107 | shat
1108 | shav
1109 | shawtypimp
1110 | sheeney
1111 | shhit
1112 | shinola
1113 | shit
1114 | shitcan
1115 | shitdick
1116 | shite
1117 | shiteater
1118 | shited
1119 | shitface
1120 | shitfaced
1121 | shitfit
1122 | shitforbrains
1123 | shitfuck
1124 | shitfucker
1125 | shitfull
1126 | shithapens
1127 | shithappens
1128 | shithead
1129 | shithouse
1130 | shiting
1131 | shitlist
1132 | shitola
1133 | shitoutofluck
1134 | shits
1135 | shitstain
1136 | shitted
1137 | shitter
1138 | shitting
1139 | shitty
1140 | shoot
1141 | shooting
1142 | shortfuck
1143 | showtime
1144 | sick
1145 | sissy
1146 | sixsixsix
1147 | sixtynine
1148 | sixtyniner
1149 | skank
1150 | skankbitch
1151 | skankfuck
1152 | skankwhore
1153 | skanky
1154 | skankybitch
1155 | skankywhore
1156 | skinflute
1157 | skum
1158 | skumbag
1159 | slant
1160 | slanteye
1161 | slapper
1162 | slaughter
1163 | slav
1164 | slave
1165 | slavedriver
1166 | sleezebag
1167 | sleezeball
1168 | slideitin
1169 | slime
1170 | slimeball
1171 | slimebucket
1172 | slopehead
1173 | slopey
1174 | slopy
1175 | slut
1176 | sluts
1177 | slutt
1178 | slutting
1179 | slutty
1180 | slutwear
1181 | slutwhore
1182 | smack
1183 | smackthemonkey
1184 | smut
1185 | snatch
1186 | snatchpatch
1187 | snigger
1188 | sniggered
1189 | sniggering
1190 | sniggers
1191 | snigger's
1192 | sniper
1193 | snot
1194 | snowback
1195 | snownigger
1196 | sob
1197 | sodom
1198 | sodomise
1199 | sodomite
1200 | sodomize
1201 | sodomy
1202 | sonofabitch
1203 | sonofbitch
1204 | sooty
1205 | sos
1206 | soviet
1207 | spaghettibender
1208 | spaghettinigger
1209 | spank
1210 | spankthemonkey
1211 | sperm
1212 | spermacide
1213 | spermbag
1214 | spermhearder
1215 | spermherder
1216 | spic
1217 | spick
1218 | spig
1219 | spigotty
1220 | spik
1221 | spit
1222 | spitter
1223 | splittail
1224 | spooge
1225 | spreadeagle
1226 | spunk
1227 | spunky
1228 | squaw
1229 | stagg
1230 | stiffy
1231 | strapon
1232 | stringer
1233 | stripclub
1234 | stroke
1235 | stroking
1236 | stupid
1237 | stupidfuck
1238 | stupidfucker
1239 | suck
1240 | suckdick
1241 | sucker
1242 | suckme
1243 | suckmyass
1244 | suckmydick
1245 | suckmytit
1246 | suckoff
1247 | suicide
1248 | swallow
1249 | swallower
1250 | swalow
1251 | swastika
1252 | sweetness
1253 | syphilis
1254 | taboo
1255 | taff
1256 | tampon
1257 | tang
1258 | tantra
1259 | tarbaby
1260 | tard
1261 | teat
1262 | terror
1263 | terrorist
1264 | teste
1265 | testicle
1266 | testicles
1267 | thicklips
1268 | thirdeye
1269 | thirdleg
1270 | threesome
1271 | threeway
1272 | timbernigger
1273 | tinkle
1274 | tit
1275 | titbitnipply
1276 | titfuck
1277 | titfucker
1278 | titfuckin
1279 | titjob
1280 | titlicker
1281 | titlover
1282 | tits
1283 | tittie
1284 | titties
1285 | titty
1286 | tnt
1287 | toilet
1288 | tongethruster
1289 | tongue
1290 | tonguethrust
1291 | tonguetramp
1292 | tortur
1293 | torture
1294 | tosser
1295 | towelhead
1296 | trailertrash
1297 | tramp
1298 | trannie
1299 | tranny
1300 | transexual
1301 | transsexual
1302 | transvestite
1303 | triplex
1304 | trisexual
1305 | trojan
1306 | trots
1307 | tuckahoe
1308 | tunneloflove
1309 | turd
1310 | turnon
1311 | twat
1312 | twink
1313 | twinkie
1314 | twobitwhore
1315 | uck
1316 | uk
1317 | unfuckable
1318 | upskirt
1319 | uptheass
1320 | upthebutt
1321 | urinary
1322 | urinate
1323 | urine
1324 | usama
1325 | uterus
1326 | vagina
1327 | vaginal
1328 | vatican
1329 | vibr
1330 | vibrater
1331 | vibrator
1332 | vietcong
1333 | violence
1334 | virgin
1335 | virginbreaker
1336 | vomit
1337 | vulva
1338 | wab
1339 | wank
1340 | wanker
1341 | wanking
1342 | waysted
1343 | weapon
1344 | weenie
1345 | weewee
1346 | welcher
1347 | welfare
1348 | wetb
1349 | wetback
1350 | wetspot
1351 | whacker
1352 | whash
1353 | whigger
1354 | whiskey
1355 | whiskeydick
1356 | whiskydick
1357 | whit
1358 | whitenigger
1359 | whites
1360 | whitetrash
1361 | whitey
1362 | whiz
1363 | whop
1364 | whore
1365 | whorefucker
1366 | whorehouse
1367 | wigger
1368 | willie
1369 | williewanker
1370 | willy
1371 | wn
1372 | wog
1373 | women's
1374 | wop
1375 | wtf
1376 | wuss
1377 | wuzzie
1378 | xtc
1379 | xxx
1380 | yankee
1381 | yellowman
1382 | zigabo
1383 | zipperhead
--------------------------------------------------------------------------------
/resource/text_context_awc_model/weights/model.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Model", "config": {"name": "model_1", "layers": [{"name": "context", "class_name": "InputLayer", "config": {"batch_input_shape": [2, 30], "dtype": "float32", "sparse": false, "name": "context"}, "inbound_nodes": []}, {"name": "text", "class_name": "InputLayer", "config": {"batch_input_shape": [2, 30], "dtype": "float32", "sparse": false, "name": "text"}, "inbound_nodes": []}, {"name": "embedding_1", "class_name": "Embedding", "config": {"name": "embedding_1", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 12647, "output_dim": 300, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}, "inbound_nodes": [[["context", 0, 0, {}]]]}, {"name": "embedding_2", "class_name": "Embedding", "config": {"name": "embedding_2", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 12647, "output_dim": 300, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}, "inbound_nodes": [[["text", 0, 0, {}]]]}, {"name": "conv1d_1", "class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 32, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["embedding_1", 0, 0, {}]]]}, {"name": "conv1d_2", "class_name": "Conv1D", "config": {"name": "conv1d_2", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 32, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["embedding_2", 0, 0, {}]]]}, {"name": "lstm_1", "class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, 
"recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_1", 0, 0, {}]]]}, {"name": "lstm_2", "class_name": "LSTM", "config": {"name": "lstm_2", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": true, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_1", 0, 0, {}]]]}, {"name": "lstm_3", "class_name": "LSTM", "config": {"name": "lstm_3", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_2", 0, 0, {}]]]}, {"name": "lstm_4", "class_name": "LSTM", "config": {"name": "lstm_4", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": true, "stateful": false, "unroll": false, "units": 64, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.25, "recurrent_dropout": 0.0, "implementation": 1}, "inbound_nodes": [[["conv1d_2", 0, 0, {}]]]}, {"name": "concatenate_1", "class_name": "Concatenate", "config": {"name": "concatenate_1", "trainable": true, "axis": -1}, "inbound_nodes": [[["lstm_1", 0, 0, {}], ["lstm_2", 0, 0, {}]]]}, {"name": "concatenate_2", "class_name": "Concatenate", "config": {"name": "concatenate_2", "trainable": true, "axis": -1}, "inbound_nodes": [[["lstm_3", 0, 0, {}], ["lstm_4", 0, 0, {}]]]}, {"name": "awc", "class_name": "InputLayer", "config": 
{"batch_input_shape": [2, 11], "dtype": "float32", "sparse": false, "name": "awc"}, "inbound_nodes": []}, {"name": "concatenate_3", "class_name": "Concatenate", "config": {"name": "concatenate_3", "trainable": true, "axis": -1}, "inbound_nodes": [[["concatenate_1", 0, 0, {}], ["concatenate_2", 0, 0, {}], ["awc", 0, 0, {}]]]}, {"name": "dense_1", "class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 64, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["concatenate_3", 0, 0, {}]]]}, {"name": "dropout_1", "class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}, "inbound_nodes": [[["dense_1", 0, 0, {}]]]}, {"name": "dense_2", "class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "inbound_nodes": [[["dropout_1", 0, 0, {}]]]}, {"name": "activation_1", "class_name": "Activation", "config": {"name": "activation_1", "trainable": true, "activation": "softmax"}, "inbound_nodes": [[["dense_2", 0, 0, {}]]]}], "input_layers": [["context", 0, 0], ["text", 0, 0], ["awc", 0, 0]], "output_layers": [["activation_1", 0, 0]]}, "keras_version": "2.1.6", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/resource/text_model/weights/model.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Sequential", "config": [{"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": true, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 33892, "output_dim": 256, "embeddings_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "normal", "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}}, {"class_name": "Conv1D", "config": {"name": "conv1d_1", "trainable": true, "batch_input_shape": [null, 1, 30], "dtype": "float32", "filters": 256, "kernel_size": [3], "strides": [1], "padding": "valid", "dilation_rate": [1], "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling1D", "config": {"name": "max_pooling1d_1", "trainable": true, "strides": [3], "pool_size": [3], "padding": "valid"}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "units": 256, "activation": "sigmoid", "recurrent_activation": "hard_sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "recurrent_initializer": {"class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.5, "recurrent_dropout": 0.0, "implementation": 1}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 256, "activation": "sigmoid", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.25, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "linear", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Activation", "config": {"name": "activation_1", "trainable": true, "activation": "softmax"}}], "keras_version": 
"2.1.6", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/resource/text_model_2D/weights/model.json:
--------------------------------------------------------------------------------
1 | {"class_name": "Sequential", "config": [{"class_name": "Masking", "config": {"name": "masking_1", "trainable": true, "batch_input_shape": [null, 30], "dtype": "float32", "mask_value": 0}}, {"class_name": "Embedding", "config": {"name": "embedding_1", "trainable": false, "batch_input_shape": [null, 30], "dtype": "float32", "input_dim": 34552, "output_dim": 200, "embeddings_initializer": {"class_name": "RandomUniform", "config": {"minval": -0.05, "maxval": 0.05, "seed": null}}, "embeddings_regularizer": null, "activity_regularizer": null, "embeddings_constraint": null, "mask_zero": false, "input_length": 30}}, {"class_name": "Reshape", "config": {"name": "reshape_1", "trainable": true, "target_shape": [30, 200, 1]}}, {"class_name": "Conv2D", "config": {"name": "conv2d_1", "trainable": true, "filters": 32, "kernel_size": [5, 1], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling2D", "config": {"name": "max_pooling2d_1", "trainable": true, "pool_size": [2, 1], "padding": "valid", "strides": [2, 1], "data_format": "channels_last"}}, {"class_name": "Dropout", "config": {"name": "dropout_1", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Conv2D", "config": {"name": "conv2d_2", "trainable": true, "filters": 64, "kernel_size": [5, 1], "strides": [1, 1], "padding": "valid", "data_format": "channels_last", "dilation_rate": [1, 1], "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "MaxPooling2D", "config": {"name": "max_pooling2d_2", "trainable": true, "pool_size": [2, 1], "padding": "valid", "strides": [2, 1], "data_format": "channels_last"}}, {"class_name": "Dropout", "config": {"name": "dropout_2", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Flatten", "config": {"name": "flatten_1", "trainable": true}}, {"class_name": "Dense", "config": {"name": "dense_1", "trainable": true, "units": 128, "activation": "relu", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 2.0, "mode": "fan_in", "distribution": "normal", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}, {"class_name": "Dropout", "config": {"name": "dropout_3", "trainable": true, "rate": 0.5, "noise_shape": null, "seed": null}}, {"class_name": "Dense", "config": {"name": "dense_2", "trainable": true, "units": 2, "activation": "softmax", "use_bias": true, "kernel_initializer": {"class_name": "VarianceScaling", "config": {"scale": 1.0, "mode": "fan_avg", "distribution": "uniform", "seed": null}}, "bias_initializer": {"class_name": "Zeros", "config": {}}, "kernel_regularizer": null, 
"bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}}], "keras_version": "2.1.2", "backend": "tensorflow"}
--------------------------------------------------------------------------------
/resource/train/.directory:
--------------------------------------------------------------------------------
1 | [Dolphin]
2 | Timestamp=2017,2,6,16,20,50
3 | Version=3
4 | ViewMode=1
5 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/src/__init__.py
--------------------------------------------------------------------------------
/src/data_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AniSkywalker/SarcasmDetection/c830b82fbe59ec7f6e02e29f14ebbe845b618d3d/src/data_processing/__init__.py
--------------------------------------------------------------------------------
/src/data_processing/data_handler.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append('../')
4 | from collections import defaultdict
5 | import re
6 | from gensim.models.keyedvectors import KeyedVectors
7 | from gensim.models.wrappers import FastText
8 | import numpy
9 | from nltk.tokenize import TweetTokenizer
10 | import src.data_processing.glove2Word2vecLoader as glove
11 | import itertools
12 |
13 |
14 | # loading the emoji dataset
15 | def load_unicode_mapping(path):
16 |     emoji_dict = defaultdict()
17 |     with open(path, 'r') as f:
18 |         lines = f.readlines()
19 |         for line in lines:
20 |             tokens = line.strip().split('\t')
21 |             emoji_dict[tokens[0]] = tokens[1]
22 |     return emoji_dict
23 |
24 |
25 | def load_word2vec(path=None):
26 |     word2vecmodel = KeyedVectors.load_word2vec_format(path, binary=True)
27 |     return word2vecmodel
28 |
29 |
30 | def load_fasttext(path=None):
31 |     word2vecmodel = FastText.load_fasttext_format(path)
32 |     return word2vecmodel
33 |
34 |
35 | def InitializeWords(word_file_path):
36 |     word_dictionary = defaultdict()
37 |
38 |     with open(word_file_path, 'r') as f:
39 |         lines = f.readlines()
40 |         for line in lines:
41 |             tokens = line.lower().strip().split('\t')
42 |             word_dictionary[tokens[0]] = int(tokens[1])
43 |
44 |     for alphabet in "bcdefghjklmnopqrstuvwxyz":
45 |         if (alphabet in word_dictionary):
46 |             word_dictionary.__delitem__(alphabet)
47 |
48 |     for word in ['ann', 'assis',
49 |                  'bz',
50 |                  'ch', 'cre', 'ct',
51 |                  'di',
52 |                  'ed', 'ee',
53 |                  'ic',
54 |                  'le',
55 |                  'ng', 'ns',
56 |                  'pr', 'picon',
57 |                  'th', 'tle', 'tl', 'tr',
58 |                  'um',
59 |                  've',
60 |                  'yi'
61 |                  ]:
62 |         if (word in word_dictionary):
63 |             word_dictionary.__delitem__(word)
64 |
65 |     return word_dictionary
66 |
67 |
68 | def normalize_word(word):
69 |     temp = word
70 |     while True:
71 |         w = re.sub(r"([a-zA-Z])\1\1", r"\1\1", temp)
72 |         if (w == temp):
73 |             break
74 |         else:
75 |             temp = w
76 |     return w
77 |
78 |
79 | def load_split_word(split_word_file_path):
80 |     split_word_dictionary = defaultdict()
81 |     with open(split_word_file_path, 'r') as f:
82 |         lines = f.readlines()
83 |         for line in lines:
84 |             tokens = line.lower().strip().split('\t')
85 |             if (len(tokens) >= 2):
86 |                 split_word_dictionary[tokens[0]] = tokens[1]
87 |
88 |     print('split entry found:', len(split_word_dictionary.keys()))
89 |     return split_word_dictionary
90 |
91 |
92 | def split_hashtags(term, wordlist, split_word_list, dump_file=''):
93 | # print('term::',term)
94 |
95 | if (len(term.strip()) == 1):
96 | return ['']
97 |
98 | if (split_word_list != None and term.lower() in split_word_list):
99 | # print('found')
100 | return split_word_list.get(term.lower()).split(' ')
101 | else:
102 | print(term)
103 |
104 | # discarding # if exists
105 | if (term.startswith('#')):
106 | term = term[1:]
107 |
108 | if (wordlist != None and term.lower() in wordlist):
109 | return [term.lower()]
110 |
111 | words = []
112 | # max freq
113 | penalty = -69971
114 | max_coverage = penalty
115 |
116 | split_words_count = 6
117 | # checking camel cases
118 | term = re.sub(r'([0-9]+)', r' \1', term)
119 | term = re.sub(r'(1st|2nd|3rd|4th|5th|6th|7th|8th|9th|0th)', r'\1 ', term)
120 | term = re.sub(r'([A-Z][^A-Z ]+)', r' \1', term.strip())
121 | term = re.sub(r'([A-Z]{2,})+', r' \1', term)
122 | words = term.strip().split(' ')
123 |
124 | n_splits = 0
125 |
126 | if (len(words) < 3):
127 |         # splitting lower-case/upper-case runs into at most 6 words (up to 5 split points)
128 | chars = [c for c in term.lower()]
129 |
130 | found_all_words = False
131 |
132 | while (n_splits < split_words_count and not found_all_words):
133 | for idx in itertools.combinations(range(0, len(chars)), n_splits):
134 | output = numpy.split(chars, idx)
135 | line = [''.join(o) for o in output]
136 |
137 | score = (1. / len(line)) * sum(
138 | [wordlist.get(
139 | word.strip()) if word.strip() in wordlist else 0. if word.strip().isnumeric() else penalty for
140 | word in line])
141 |
142 | if (score > max_coverage):
143 | words = line
144 | max_coverage = score
145 |
146 | line_is_valid_word = [word.strip() in wordlist if not word.isnumeric() else True for word in line]
147 |
148 | if (all(line_is_valid_word)):
149 | found_all_words = True
150 |
151 | # uncomment to debug hashtag splitting
152 | # print(line, score, line_is_valid_word)
153 |
154 | n_splits = n_splits + 1
155 |
156 |     # forcing plain python strings (numpy.split can yield numpy string types)
157 |     words = [str(s) for s in words]
158 |
159 |     # dumping splits for debugging; skipped when no dump file is supplied
160 |     if (dump_file != '' and term != '' and len(words) > 0):
161 |         with open(dump_file, 'a') as f:
162 |             f.write('#' + str(term).strip() + '\t' + ' '.join(words) + '\t' + str(n_splits) + '\n')
163 |
164 | return words
165 |
166 |
167 | def load_abbreviation(path='../resource/abbreviations.txt'):
168 | abbreviation_dict = defaultdict()
169 | with open(path) as f:
170 | lines = f.readlines()
171 | for line in lines:
172 | token = line.lower().strip().split('\t')
173 | abbreviation_dict[token[0]] = token[1]
174 | return abbreviation_dict
175 |
176 |
177 | def filter_text(text, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=False,
178 | split_hashtag=False,
179 | ignore_profiles=False,
180 | replace_emoji=True):
181 | filtered_text = []
182 |
183 | filter_list = ['/', '-', '=', '+', '…', '\\', '(', ')', '&', ':']
184 |
185 | for t in text:
186 | word_tokens = None
187 |
188 | # discarding symbols
189 | # if (str(t).lower() in filter_list):
190 | # continue
191 |
192 | # ignoring profile information if ignore_profiles is set
193 | if (ignore_profiles and str(t).startswith("@")):
194 | continue
195 |
196 | # ignoring links
197 | if (str(t).startswith('http')):
198 | continue
199 |
200 | # ignoring sarcastic marker
201 | # uncomment the following line for Fracking sarcasm using neural network
202 | # if (str(t).lower() in ['#sarcasm','#sarcastic', '#yeahright','#not']):
203 | # continue
204 |
205 | # for onlinesarcasm
206 | # comment if you are running the code for Fracking sarcasm using neural network
207 | if (str(t).lower() in ['#sarcasm']):
208 | continue
209 |
210 | # replacing emoji with its unicode description
211 | if (replace_emoji):
212 | if (t in emoji_dict):
213 | t = emoji_dict.get(t).split('_')
214 | filtered_text.extend(t)
215 | continue
216 |
217 |         # splitting hashtags
218 | if (split_hashtag and str(t).startswith("#")):
219 | splits = split_hashtags(t, word_list, split_word_list, dump_file='../resource/hastash_split_dump.txt')
220 | # adding the hashtags
221 | if (splits != None):
222 | filtered_text.extend([s for s in splits if (not filtered_text.__contains__(s))])
223 | continue
224 |
225 |         # collapses repeated letters down to two (e.g. "coooool" -> "cool")
226 | if (normalize_text):
227 | t = normalize_word(t)
228 |
229 | # expands the abbreviation
230 | if (t in abbreviation_dict):
231 | tokens = abbreviation_dict.get(t).split(' ')
232 | filtered_text.extend(tokens)
233 | continue
234 |
235 | # appends the text
236 | filtered_text.append(t)
237 |
238 | return filtered_text
239 |
240 |
241 | def parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=False,
242 | split_hashtag=False,
243 | ignore_profiles=False,
244 | lowercase=False, replace_emoji=True, n_grams=None, at_character=False):
245 | data = []
246 | for i, line in enumerate(lines):
247 | if (i % 100 == 0):
248 | print(str(i) + '...', end='', flush=True)
249 |
250 | try:
251 |
252 | # convert the line to lowercase
253 | if (lowercase):
254 | line = line.lower()
255 |
256 | # split into token
257 | token = line.split('\t')
258 |
259 | # ID
260 | id = token[0]
261 |
262 | # label
263 | label = int(token[1].strip())
264 |
265 | # tweet text
266 | target_text = TweetTokenizer().tokenize(token[2].strip())
267 | if (at_character):
268 | target_text = [c for c in token[2].strip()]
269 |
270 | if (n_grams != None):
271 | n_grams_list = list(create_ngram_set(target_text, ngram_value=n_grams))
272 | target_text.extend(['_'.join(n) for n in n_grams_list])
273 |
274 | # filter text
275 | target_text = filter_text(target_text, word_list, split_word_list, emoji_dict, abbreviation_dict,
276 | normalize_text,
277 | split_hashtag,
278 | ignore_profiles, replace_emoji=replace_emoji)
279 |
280 | # awc dimensions
281 | dimensions = []
282 | if (len(token) > 3 and token[3].strip() != 'NA'):
283 | dimensions = [dimension.split('@@')[1] for dimension in token[3].strip().split('|')]
284 |
285 | # context tweet
286 | context = []
287 | if (len(token) > 4):
288 | if (token[4] != 'NA'):
289 | context = TweetTokenizer().tokenize(token[4].strip())
290 | context = filter_text(context, word_list, split_word_list, emoji_dict, abbreviation_dict,
291 | normalize_text,
292 | split_hashtag,
293 | ignore_profiles, replace_emoji=replace_emoji)
294 |
295 | # author
296 | author = 'NA'
297 | if (len(token) > 5):
298 | author = token[5]
299 |
300 | if (len(target_text) != 0):
301 | # print((label, target_text, dimensions, context, author))
302 | data.append((id, label, target_text, dimensions, context, author))
303 | except:
304 | raise
305 | print('')
306 | return data
307 |
308 |
309 | def load_resources(word_file_path, split_word_path, emoji_file_path, split_hashtag=False, replace_emoji=True):
310 | word_list = None
311 | emoji_dict = None
312 |
313 | # load split files
314 | split_word_list = load_split_word(split_word_path)
315 |
316 | # load word dictionary
317 | if (split_hashtag):
318 | word_list = InitializeWords(word_file_path)
319 |
320 | if (replace_emoji):
321 | emoji_dict = load_unicode_mapping(emoji_file_path)
322 |
323 | abbreviation_dict = load_abbreviation()
324 |
325 | return word_list, emoji_dict, split_word_list, abbreviation_dict
326 |
327 |
328 | def loaddata(filename, word_file_path, split_word_path, emoji_file_path, normalize_text=False, split_hashtag=False,
329 | ignore_profiles=False,
330 | lowercase=True, replace_emoji=True, n_grams=None, at_character=False):
331 |
332 | word_list, emoji_dict, split_word_list, abbreviation_dict = load_resources(word_file_path, split_word_path,
333 | emoji_file_path,
334 | split_hashtag=split_hashtag,
335 | replace_emoji=replace_emoji)
336 | lines = open(filename, 'r').readlines()
337 |
338 | data = parsedata(lines, word_list, split_word_list, emoji_dict, abbreviation_dict, normalize_text=normalize_text,
339 | split_hashtag=split_hashtag,
340 | ignore_profiles=ignore_profiles, lowercase=lowercase, replace_emoji=replace_emoji,
341 | n_grams=n_grams, at_character=at_character)
342 | return data
343 |
344 |
345 | def build_vocab(data, without_dimension=True, ignore_context=False, min_freq=0):
346 | vocab = defaultdict(int)
347 | vocab_freq = defaultdict(int)
348 |
349 | total_words = 1
350 | if (not without_dimension):
351 | for i in range(1, 101):
352 | vocab_freq[str(i)] = 0
353 | # vocab[str(i)] = total_words
354 | # total_words = total_words + 1
355 |
356 | for sentence_no, token in enumerate(data):
357 | for word in token[2]:
358 | if (word not in vocab_freq):
359 | # vocab[word] = total_words
360 | # total_words = total_words + 1
361 | vocab_freq[word] = 0
362 | vocab_freq[word] = vocab_freq.get(word) + 1
363 |
364 | if (not without_dimension):
365 | for word in token[3]:
366 | # if (word not in vocab_freq):
367 | # vocab[word] = total_words
368 | # total_words = total_words + 1
369 | vocab_freq[word] = vocab_freq.get(word) + 1
370 |
371 | if (ignore_context == False):
372 | for word in token[4]:
373 |                 if (word not in vocab_freq):  # check vocab_freq, mirroring the tweet-word loop above
374 | # vocab[word] = total_words
375 | # total_words = total_words + 1
376 | vocab_freq[word] = 0
377 | vocab_freq[word] = vocab_freq.get(word) + 1
378 |
379 | for k, v in vocab_freq.items():
380 | if (v >= min_freq):
381 | vocab[k] = total_words
382 | total_words = total_words + 1
383 |
384 | return vocab
385 |
386 |
387 | def build_reverse_vocab(vocab):
388 | rev_vocab = defaultdict(str)
389 | for k, v in vocab.items():
390 | rev_vocab[v] = k
391 | return rev_vocab
392 |
393 |
394 | def build_auxiliary_feature(data):
395 | aux = []
396 | for id, label, line, dimensions, context, author in data:
397 | aux.append([float(line.count('!')), float(line.count('?')), float(line.count('.')),
398 | sum([1.0 if c.isupper() else 0.0 for c in line]), float(line.count('"'))])
399 |
400 | return numpy.asarray(aux)
401 |
402 |
403 | def vectorize_word_dimension(data, vocab, drop_dimension_index=None, verbose=False):
404 | X = []
405 | Y = []
406 | D = []
407 | C = []
408 | A = []
409 |
410 | known_words_set = set()
411 | unknown_words_set = set()
412 |
413 | tokens = 0
414 | token_coverage = 0
415 |
416 | for id, label, line, dimensions, context, author in data:
417 | vec = []
418 | context_vec = []
419 | if (len(dimensions) != 0):
420 | dvec = [vocab.get(d) for d in dimensions]
421 | else:
422 | dvec = [vocab.get('unk')] * 11
423 |
424 | if drop_dimension_index != None:
425 | dvec.pop(drop_dimension_index)
426 |
427 | # tweet
428 | for words in line:
429 | tokens = tokens + 1
430 | if (words in vocab):
431 | vec.append(vocab[words])
432 | token_coverage = token_coverage + 1
433 | known_words_set.add(words)
434 | else:
435 | vec.append(vocab['unk'])
436 | unknown_words_set.add(words)
437 | # context_tweet
438 | if (len(context) != 0):
439 |             for words in context:  # iterate over the context tweet, not the target text
440 | tokens = tokens + 1
441 | if (words in vocab):
442 | context_vec.append(vocab[words])
443 | token_coverage = token_coverage + 1
444 | known_words_set.add(words)
445 | else:
446 | context_vec.append(vocab['unk'])
447 | unknown_words_set.add(words)
448 | else:
449 | context_vec = [vocab['unk']]
450 |
451 | X.append(vec)
452 | Y.append(label)
453 | D.append(dvec)
454 | C.append(context_vec)
455 | A.append(author)
456 |
457 | if verbose:
458 | print('Token coverage:', token_coverage / float(tokens))
459 | print('Word coverage:', len(known_words_set) / float(len(vocab.keys())))
460 |
461 | return numpy.asarray(X), numpy.asarray(Y), numpy.asarray(D), numpy.asarray(C), numpy.asarray(A)
462 |
463 |
464 | def pad_sequence_1d(sequences, maxlen=None, dtype='float32', padding='pre', truncating='pre', value=0.):
465 | X = [vectors for vectors in sequences]
466 |
467 | nb_samples = len(X)
468 |
469 | x = (numpy.zeros((nb_samples, maxlen)) * value).astype(dtype)
470 |
471 | for idx, s in enumerate(X):
472 | if truncating == 'pre':
473 | trunc = s[-maxlen:]
474 | elif truncating == 'post':
475 | trunc = s[:maxlen]
476 | else:
477 | raise ValueError("Truncating type '%s' not understood" % padding)
478 |
479 | if padding == 'post':
480 | x[idx, :len(trunc)] = trunc
481 | elif padding == 'pre':
482 | x[idx, -len(trunc):] = trunc
483 | else:
484 | raise ValueError("Padding type '%s' not understood" % padding)
485 |
486 | return x
487 |
488 |
489 | def write_vocab(filepath, vocab):
490 | with open(filepath, 'w') as fw:
491 | for key, value in vocab.items():
492 | fw.write(str(key) + '\t' + str(value) + '\n')
493 |
494 |
495 | def get_fasttext_weight(vocab, n=300, path=None):
496 |     word2vecmodel = load_fasttext(path=path)  # load the fastText model rather than the word2vec binary
497 | emb_weights = numpy.zeros((len(vocab.keys()) + 1, n))
498 | for k, v in vocab.items():
499 | if (word2vecmodel.__contains__(k)):
500 | emb_weights[v, :] = word2vecmodel[k][:n]
501 |
502 | return emb_weights
503 |
504 |
505 | def get_word2vec_weight(vocab, n=300, path=None):
506 | word2vecmodel = load_word2vec(path=path)
507 | emb_weights = numpy.zeros((len(vocab.keys()) + 1, n))
508 | for k, v in vocab.items():
509 | if (word2vecmodel.__contains__(k)):
510 | emb_weights[v, :] = word2vecmodel[k][:n]
511 |
512 | return emb_weights
513 |
514 |
515 | def load_glove_model(vocab, n=200, glove_path='/home/glove/glove.twitter.27B/glove.twitter.27B.200d.txt'):
516 | word2vecmodel = glove.load_glove_word2vec(glove_path)
517 |
518 | embedding_matrix = numpy.zeros((len(vocab.keys()) + 1, n))
519 | for k, v in vocab.items():
520 | embedding_vector = word2vecmodel.get(k)
521 | if embedding_vector is not None:
522 | embedding_matrix[v] = embedding_vector
523 |
524 | return embedding_matrix
525 |
526 |
527 | def add_ngram(sequences, token_indice, ngram_range=2):
528 | """
529 | Augment the input list of list (sequences) by appending n-grams values.
530 | Example: adding bi-gram
531 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
532 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
533 | >>> add_ngram(sequences, token_indice, ngram_range=2)
534 | [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
535 | Example: adding tri-gram
536 | >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
537 | >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
538 | >>> add_ngram(sequences, token_indice, ngram_range=3)
539 | [[1, 3, 4, 5, 1337], [1, 3, 7, 9, 2, 1337, 2018]]
540 | """
541 | new_sequences = []
542 | for input_list in sequences:
543 | new_list = input_list[:]
544 | for i in range(len(new_list) - ngram_range + 1):
545 | for ngram_value in range(2, ngram_range + 1):
546 | ngram = tuple(new_list[i:i + ngram_value])
547 | if ngram in token_indice:
548 | new_list.append(token_indice[ngram])
549 | new_sequences.append(new_list)
550 |
551 | return new_sequences
552 |
553 |
554 | def create_ngram_set(input_list, ngram_value=2):
555 | """
556 | Extract a set of n-grams from a list of integers.
557 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
558 | {(4, 9), (4, 1), (1, 4), (9, 4)}
559 | >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
560 | [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
561 | """
562 | return set(zip(*[input_list[i:] for i in range(ngram_value)]))
563 |
564 |
565 | def prepare_fasttext(x_train, x_test, max_features=20000, ngram_range=2):
566 | if ngram_range > 1:
567 | print('Adding {}-gram features'.format(ngram_range))
568 | # Create set of unique n-gram from the training set.
569 | ngram_set = set()
570 | for input_list in x_train:
571 | for i in range(2, ngram_range + 1):
572 | set_of_ngram = create_ngram_set(input_list, ngram_value=i)
573 | ngram_set.update(set_of_ngram)
574 |
575 | # Dictionary mapping n-gram token to a unique integer.
576 | # Integer values are greater than max_features in order
577 | # to avoid collision with existing features.
578 | start_index = max_features + 1
579 | token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
580 | indice_token = {token_indice[k]: k for k in token_indice}
581 |
582 | # max_features is the highest integer that could be found in the dataset.
583 | max_features = numpy.max(list(indice_token.keys())) + 1
584 |
585 | # Augmenting x_train and x_test with n-grams features
586 | x_train = add_ngram(x_train, token_indice, ngram_range)
587 | x_test = add_ngram(x_test, token_indice, ngram_range)
588 | print('Average train sequence length: {}'.format(numpy.mean(list(map(len, x_train)), dtype=int)))
589 | print('Average test sequence length: {}'.format(numpy.mean(list(map(len, x_test)), dtype=int)))
590 |     return x_train, x_test
--------------------------------------------------------------------------------
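The helpers in data_handler.py chain into one preprocessing pipeline: loaddata parses the tab-separated rows (id, label, tweet and, optionally, dimensions, context and author), build_vocab assigns integer ids, vectorize_word_dimension maps tokens to those ids, and pad_sequence_1d fixes the sequence length. A minimal sketch of that flow, assuming the repository's default resource layout and the bundled Train_v1.txt split:

    import src.data_processing.data_handler as dh

    basepath = '..'  # assuming the snippet runs from inside src/
    train_file = basepath + '/resource/train/Train_v1.txt'
    word_file_path = basepath + '/resource/word_list_freq.txt'
    split_word_path = basepath + '/resource/word_split.txt'
    emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'

    # parse and clean the tab-separated tweets
    data = dh.loaddata(train_file, word_file_path, split_word_path, emoji_file_path,
                       normalize_text=True, split_hashtag=True, ignore_profiles=False)

    # vocabulary with an explicit 'unk' entry, exactly as the training scripts do
    vocab = dh.build_vocab(data)
    vocab['unk'] = len(vocab.keys()) + 1

    # integer sequences for tweet (X), label (Y), dimensions (D), context (C), author (A)
    X, Y, D, C, A = dh.vectorize_word_dimension(data, vocab)
    X = dh.pad_sequence_1d(X, maxlen=30)  # 30 matches the models' _line_maxlen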
/src/data_processing/glove2Word2vecLoader.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import shutil
3 | import hashlib
4 | from sys import platform
5 |
6 | import gensim
7 |
8 |
9 | def prepend_line(infile, outfile, line):
10 | with open(infile, 'r') as old:
11 | with open(outfile, 'w') as new:
12 | new.write(str(line) + "\n")
13 | shutil.copyfileobj(old, new)
14 |
15 |
16 | def prepend_slow(infile, outfile, line):
17 | with open(infile, 'r') as fin:
18 | with open(outfile, 'w') as fout:
19 | fout.write(line + "\n")
20 | for line in fin:
21 | fout.write(line)
22 |
23 |
24 | def checksum(filename):
25 | BLOCKSIZE = 65536
26 | hasher = hashlib.md5()
27 | with open(filename, 'rb') as afile:
28 | buf = afile.read(BLOCKSIZE)
29 | while len(buf) > 0:
30 | hasher.update(buf)
31 | buf = afile.read(BLOCKSIZE)
32 | return hasher.hexdigest()
33 |
34 |
35 | # Pre-computed glove files values.
36 | pretrain_num_lines = {"glove.840B.300d.txt": 2196017, "glove.42B.300d.txt": 1917494}
37 |
38 | pretrain_checksum = {
39 | "glove.6B.300d.txt": "b78f53fb56ec1ce9edc367d2e6186ba4",
40 | "glove.twitter.27B.50d.txt": "6e8369db39aa3ea5f7cf06c1f3745b06",
41 | "glove.42B.300d.txt": "01fcdb413b93691a7a26180525a12d6e",
42 | "glove.6B.50d.txt": "0fac3659c38a4c0e9432fe603de60b12",
43 | "glove.6B.100d.txt": "dd7f3ad906768166883176d69cc028de",
44 | "glove.twitter.27B.25d.txt": "f38598c6654cba5e6d0cef9bb833bdb1",
45 | "glove.6B.200d.txt": "49fa83e4a287c42c6921f296a458eb80",
46 | "glove.840B.300d.txt": "eec7d467bccfa914726b51aac484d43a",
47 | "glove.twitter.27B.100d.txt": "ccbdddec6b9610196dd2e187635fee63",
48 | "glove.twitter.27B.200d.txt": "e44cdc3e10806b5137055eeb08850569",
49 | }
50 |
51 |
52 | def check_num_lines_in_glove(filename, check_checksum=False):
53 | if check_checksum:
54 | assert checksum(filename) == pretrain_checksum[filename]
55 | if filename.startswith('glove.6B.'):
56 | return 400000
57 | elif filename.startswith('glove.twitter.27B.'):
58 | return 1193514
59 | else:
60 | return pretrain_num_lines[filename]
61 |
62 |
63 | def load_glove_word2vec(filename):
64 |
65 | # load the whole embedding into memory
66 | embeddings_index = dict()
67 |     with open(filename) as f:  # context manager ensures the file is closed even on error
68 |         for line in f:
69 |             values = line.split()
70 |             word = values[0]
71 |             coefs = np.asarray(values[1:], dtype='float32')
72 |             embeddings_index[word] = coefs
73 |
74 | print('Loaded %s word vectors.' % len(embeddings_index))
75 |
76 | return embeddings_index
77 |
--------------------------------------------------------------------------------
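load_glove_word2vec returns a plain word-to-vector dictionary; the models turn it into an embedding matrix by indexing that dictionary with their vocabulary (see load_glove_model in data_handler.py). A small sketch of that step, with the GloVe path and the toy vocabulary being placeholders:

    import numpy as np
    from src.data_processing.glove2Word2vecLoader import load_glove_word2vec

    glove_path = '/path/to/glove.twitter.27B.200d.txt'         # placeholder location
    vocab = {'sarcasm': 1, 'love': 2, 'mondays': 3, 'unk': 4}  # toy vocabulary

    embeddings_index = load_glove_word2vec(glove_path)      # word -> 200-d vector
    embedding_matrix = np.zeros((len(vocab) + 1, 200))      # row 0 stays zero for padding
    for word, idx in vocab.items():
        vector = embeddings_index.get(word)
        if vector is not None:
            embedding_matrix[idx] = vector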
/src/sarcasm_context_moods.py:
--------------------------------------------------------------------------------
1 | import os
2 | import collections
3 | import random
4 | import sys
5 |
6 | sys.path.append('../../')
7 |
8 | import time
9 | import numpy
10 |
11 | numpy.random.seed(1337)
12 |
13 | from keras.layers.wrappers import TimeDistributed
14 | from keras import backend as K, regularizers
15 | from sklearn import metrics
16 | from keras.models import model_from_json
17 | from keras.layers.core import Dropout, Dense, Activation, Flatten
18 | from keras.layers.embeddings import Embedding
19 | from keras.layers.recurrent import LSTM
20 | from keras.layers.convolutional import Convolution1D
21 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
22 |
23 | from keras.layers.merge import add, concatenate
24 | from keras.models import Model
25 | from keras.utils import np_utils
26 | from keras.layers import Input
27 | import src.data_processing.data_handler as dh
28 | from collections import defaultdict
29 |
30 |
31 | class sarcasm_model():
32 | _train_file = None
33 | _gold_data_path = None
34 | _validation_file = None
35 | _tweet_file = None
36 | # test_debug = None
37 | _output_file = None
38 | _model_file = None
39 | _word_file_path = None
40 | _vocab_file_path = None
41 | _input_weight_file_path = None
42 | _vocab = None
43 | _line_maxlen = None
44 |
45 | def __init__(self):
46 | self._train_file = None
47 | self._test_file = None
48 | self._validation_file = None
49 | self._tweet_file = None
50 | self._output_file = None
51 | self._model_file = None
52 | self._word_file_path = None
53 | self._vocab_file_path = None
54 | self._input_weight_file_path = None
55 | self._vocab = None
56 |
57 | self._line_maxlen = 30
58 |
59 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256,
60 | dimension_length=11, trainable=True, batch_size=1):
61 |
62 | print('Building model...')
63 |
64 | context_input = Input(name='context', batch_shape=(batch_size, maxlen))
65 |
66 | if (len(c_emb_weights) == 0):
67 | c_emb = Embedding(vocab_size, 64, input_length=maxlen, embeddings_initializer='glorot_normal',
68 | trainable=trainable)(context_input)
69 | else:
70 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights],
71 | trainable=trainable)(context_input)
72 |
73 | c_cnn1 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal', activation='sigmoid',
74 | padding='valid', input_shape=(1, maxlen))(c_emb)
75 |
76 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
77 | dropout=0.25)(c_cnn1)
78 |
79 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.25,
80 | go_backwards=True)(c_cnn1)
81 |
82 | c_merged = concatenate([c_lstm1, c_lstm2], axis=-1)
83 |
84 | print(c_merged)
85 |
86 |
87 | text_input = Input(name='text', batch_shape=(batch_size, maxlen))
88 |
89 | if (len(emb_weights) == 0):
90 | emb = Embedding(vocab_size, 64, input_length=maxlen, embeddings_initializer='glorot_normal',
91 | trainable=trainable)(text_input)
92 | else:
93 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
94 | trainable=trainable)(text_input)
95 |
96 | t_cnn1 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal',
97 | activation='relu', padding='valid', input_shape=(1, maxlen))(emb)
98 |
99 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
100 | bias_initializer='he_normal', activation='sigmoid',
101 | dropout=0.25)(t_cnn1)
102 |
103 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
104 | bias_initializer='he_normal', activation='sigmoid',
105 | dropout=0.25,
106 | go_backwards=True)(t_cnn1)
107 |
108 | t_merged = concatenate([t_lstm1, t_lstm2], axis=-1)
109 |
110 | # t_merged = Reshape((-1,int(hidden_units / 8)))(t_merged)
111 |
112 | awc_input = Input(name='awc', batch_shape=(batch_size, 11))
113 |
114 | eaw = Embedding(101, int(hidden_units / 8), input_length=dimension_length,
115 | embeddings_initializer='glorot_normal',
116 | trainable=True)(awc_input)
117 |
118 | merged = concatenate([c_merged, t_merged, awc_input], axis=-1)
119 |
120 | # flat_model = Flatten()(merged)
121 |
122 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged)
123 | dnn_1 = Dropout(0.25)(dnn_1)
124 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1)
125 |
126 | softmax = Activation('softmax')(dnn_2)
127 |
128 | model = Model(inputs=[context_input, text_input, awc_input], outputs=softmax)
129 |
130 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
131 |         print('Number of parameters:', model.count_params())
132 |
133 | print(model.summary())
134 |
135 | return model
136 |
137 |
138 | class train_model(sarcasm_model):
139 | train = None
140 | validation = None
141 |
142 | def load_train_validation_test_data(self):
143 | print("Loading resource...")
144 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
145 | self._emoji_file_path, normalize_text=True,
146 | split_hashtag=True,
147 | ignore_profiles=False)
148 |
149 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
150 | self._emoji_file_path,
151 | normalize_text=True,
152 | split_hashtag=True,
153 | ignore_profiles=False)
154 |
155 | if (self._test_file != None):
156 | self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
157 | self._emoji_file_path, normalize_text=True,
158 | split_hashtag=True,
159 | ignore_profiles=True)
160 |
161 | def split_train_validation(self, train, ratio=.1):
162 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))])
163 | print(len(test_indices))
164 | train_data = []
165 | validation_data = []
166 | for i, t in enumerate(train):
167 | if (test_indices.__contains__(i)):
168 | validation_data.append(t)
169 | else:
170 | train_data.append(t)
171 | return train_data, validation_data
172 |
173 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
174 | vocab_file,
175 | output_file,
176 | word2vec_path=None):
177 | sarcasm_model.__init__(self)
178 |
179 | self._train_file = train_file
180 | self._validation_file = validation_file
181 | self._word_file_path = word_file_path
182 | self._split_word_file_path = split_word_path
183 | self._emoji_file_path = emoji_file_path
184 | self._model_file = model_file
185 | self._vocab_file_path = vocab_file
186 | self._output_file = output_file
187 |
188 | self.load_train_validation_test_data()
189 |
190 | batch_size = 2
191 |
192 |         self.train = self.train[:-(len(self.train) % batch_size) or None]  # 'or None' keeps everything when already divisible
193 |         self.validation = self.validation[:-(len(self.validation) % batch_size) or None]
194 |
195 | print(self._line_maxlen)
196 |         self._vocab = dh.build_vocab(self.train, without_dimension=False, ignore_context=False)  # keep the awc dimension tokens in the vocabulary
197 | self._vocab['unk'] = len(self._vocab.keys()) + 1
198 |
199 | print(len(self._vocab.keys()) + 1)
200 | print('unk::', self._vocab['unk'])
201 |
202 | dh.write_vocab(self._vocab_file_path, self._vocab)
203 |
204 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None)
205 |
206 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None)
207 |
208 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
209 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
210 | D = dh.pad_sequence_1d(D, maxlen=11)
211 |
212 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
213 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
214 | tD = dh.pad_sequence_1d(tD, maxlen=11)
215 |
216 | hidden_units = 64
217 | dimension_size = 300
218 |
219 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
220 | path=word2vec_path)
221 | cW = W
222 |
223 | print('Word2vec obtained....')
224 |
225 | ratio = self.calculate_label_ratio(Y)
226 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
227 |
228 | print('ratio', ratio)
229 |
230 | dimension_vocab = numpy.unique(D)
231 | print(len(dimension_vocab))
232 |
233 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
234 |
235 | print('train_X', X.shape)
236 | print('train_C', C.shape)
237 | print('train_D', D.shape)
238 | print('train_Y', Y.shape)
239 |
240 | print('validation_X', tX.shape)
241 | print('validation_C', tC.shape)
242 | print('validation_D', tD.shape)
243 | print('validation_Y', tY.shape)
244 |
245 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW,
246 | hidden_units=hidden_units, trainable=False, dimension_length=11,
247 | batch_size=batch_size)
248 |
249 | open(self._model_file + 'model.json', 'w').write(model.to_json())
250 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss')
251 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
252 | # save_best_only=False)
253 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
254 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
255 | epsilon=0.0001,
256 | cooldown=0, min_lr=0.000001)
257 |
258 | model.fit([C, X, D], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX, tD], tY), shuffle=True,
259 | callbacks=[save_best, lr_tuner], class_weight=ratio)
260 |
261 | def get_maxlen(self):
262 |         return max(len(t[2]) for t in self.train + self.validation)  # t[2] is the tokenized tweet text
263 |
264 | def write_vocab(self):
265 | with open(self._vocab_file_path, 'w') as fw:
266 |             for key, value in self._vocab.items():
267 | fw.write(str(key) + '\t' + str(value) + '\n')
268 |
269 |     def calculate_label_ratio(self, labels):
270 | return collections.Counter(labels)
271 |
272 |
273 | class test_model(sarcasm_model):
274 | test = None
275 | model = None
276 |
277 |     def __init__(self, word_file_path, split_word_path, emoji_file_path, model_file, vocab_file_path, output_file):
278 | print('initializing...')
279 | sarcasm_model.__init__(self)
280 |         self._word_file_path = word_file_path
281 |         self._split_word_file_path = split_word_path
282 |         self._emoji_file_path = emoji_file_path
283 |         self._model_file = model_file
284 |         self._vocab_file_path = vocab_file_path
285 |         self._output_file = output_file
286 | # self._line_maxlen = 45
287 | print('test_maxlen', self._line_maxlen)
288 |
289 | def predict_cross_validation(self, tC, tX, tD, test):
290 | self.__predict_model([tC, tX, tD], test)
291 |
292 | def load_trained_model(self, weight_file='model.json.hdf5'):
293 | start = time.time()
294 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
295 | end = time.time()
296 | print('model loading time::', (end - start))
297 |
298 | def __load_model(self, model_path, model_weight_path):
299 | self.model = model_from_json(open(model_path).read())
300 | print('model loaded from file...')
301 | self.model.load_weights(model_weight_path)
302 | print('model weights loaded from file...')
303 |
304 | def load_vocab(self):
305 | vocab = defaultdict()
306 | with open(self._vocab_file_path, 'r') as f:
307 | for line in f.readlines():
308 | key, value = line.split('\t')
309 | vocab[key] = value
310 |
311 | return vocab
312 |
313 | def predict(self, test_file, verbose=False):
314 | start = time.time()
315 |         self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path,
316 |                                 self._emoji_file_path, normalize_text=True, split_hashtag=True,
317 |                                 ignore_profiles=False)
318 | end = time.time()
319 | if (verbose == True):
320 | print('test resource loading time::', (end - start))
321 |
322 | self._vocab = self.load_vocab()
323 |
324 | start = time.time()
325 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
326 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
327 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
328 | tD = dh.pad_sequence_1d(tD, maxlen=11)
329 |
330 | end = time.time()
331 | if (verbose == True):
332 | print('test resource preparation time::', (end - start))
333 |
334 | self.__predict_model([tC, tX, tD], self.test)
335 |
336 | def __predict_model(self, tX, test):
337 |         prediction_probability = self.model.predict(tX, batch_size=8, verbose=1)
338 |
339 | y = []
340 | y_pred = []
341 |
342 | fd = open(self._output_file + '.analysis', 'w')
343 |         for i, label in enumerate(prediction_probability):  # test entries are (id, label, text, dims, context, author)
344 |             gold_label = test[i][1]
345 |             words = test[i][2]
346 |             dimensions = test[i][3]
347 |             context = test[i][4]
348 |             author = test[i][5]
349 |
350 | predicted = numpy.argmax(prediction_probability[i])
351 |
352 | y.append(int(gold_label))
353 | y_pred.append(predicted)
354 |
355 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
356 | + str(gold_label) + '\t'
357 | + str(predicted) + '\t'
358 | + ' '.join(words) + '\t'
359 | + str(dimensions) + '\t'
360 | + ' '.join(context))
361 |
362 | fd.write('\n')
363 |
364 | print('accuracy::', metrics.accuracy_score(y, y_pred))
365 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
366 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
367 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
368 | print('f_score::', metrics.classification_report(y, y_pred))
369 |
370 | fd.close()
371 |
372 |
373 | if __name__ == "__main__":
374 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
375 | train_file = basepath + '/resource/train/Train_context_moods.txt'
376 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt'
377 | test_file = basepath + '/resource/test/Test_context_AW.txt'
378 | word_file_path = basepath + '/resource/word_list_freq.txt'
379 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt'
380 | model_file = basepath + '/resource/text_context_awc_model/weights/'
381 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt'
382 | split_word_path = basepath + '/resource/word_split.txt'
383 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
384 |
385 | # word2vec path
386 | word2vec_path = '/home/aghosh/backups/GoogleNews-vectors-negative300.bin'
387 |
388 | tr = train_model(train_file=train_file, validation_file=validation_file, word_file_path=word_file_path,
389 | split_word_path=split_word_path, emoji_file_path=emoji_file_path, model_file=model_file,
390 | vocab_file=vocab_file_path, output_file=output_file,
391 | word2vec_path=word2vec_path)
392 |
393 | with K.get_session():
394 |         t = test_model(word_file_path, split_word_path, emoji_file_path, model_file, vocab_file_path, output_file)
395 | t.load_trained_model()
396 | t.predict(test_file)
397 |
--------------------------------------------------------------------------------
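The training scripts counter class imbalance by weighting each class with max_count / class_count (calculate_label_ratio is just collections.Counter). Keras' class_weight argument is usually given as a dict keyed by class index, so a dict-based variant of the same weight computation, shown on toy labels, looks like the sketch below (illustration only, not repository code):

    import collections

    labels = [0, 0, 0, 1]                     # toy label list, i.e. Y before to_categorical
    counts = collections.Counter(labels)      # what calculate_label_ratio returns
    class_weight = {cls: max(counts.values()) / count for cls, count in counts.items()}
    # -> {0: 1.0, 1: 3.0}: the minority (sarcastic) class is up-weighted during model.fit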
/src/sarcasm_detection_model_CNN_DNN_2D.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | from keras.engine import InputLayer
5 | from keras.layers.normalization import BatchNormalization
6 | from keras.layers.wrappers import TimeDistributed
7 |
8 | sys.path.append('../')
9 | import collections
10 | import time
11 | import numpy
12 |
13 | numpy.random.seed(1337)
14 | from sklearn import metrics
15 | from keras.models import Sequential, model_from_json
16 | from keras.layers import Masking, Bidirectional, GlobalAveragePooling2D
17 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten
18 | from keras.layers.embeddings import Embedding
19 | from keras.layers.recurrent import LSTM
20 | from keras.layers.convolutional import Convolution1D, Convolution2D, MaxPooling2D
21 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
22 | from keras.callbacks import EarlyStopping
23 | from keras.optimizers import Adam
24 | from keras.utils import np_utils
25 | from collections import defaultdict
26 | import src.data_processing.data_handler as dh
27 |
28 | import keras.backend as K
29 |
30 |
31 | class sarcasm_model():
32 | _train_file = None
33 | _test_file = None
34 | _tweet_file = None
35 | _output_file = None
36 | _model_file = None
37 | _word_file_path = None
38 | _vocab_file_path = None
39 | _input_weight_file_path = None
40 | _vocab = None
41 | _line_maxlen = None
42 |
43 | def __init__(self):
44 | self._line_maxlen = 30
45 |
46 | def _build_network(self, vocab_size, maxlen, emb_weights=[], hidden_units=256, trainable=False):
47 | print('Build model...')
48 |
49 | model = Sequential()
50 |
51 | model.add(Masking(mask_value=0, input_shape=(maxlen,)))
52 |
53 | if (len(emb_weights) == 0):
54 | model.add(Embedding(vocab_size, 20, input_length=maxlen, embeddings_initializer='he_normal',
55 | trainable=trainable, mask_zero=True))
56 | else:
57 | model.add(Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
58 | trainable=trainable))
59 |
60 | model.add(Reshape((model.output_shape[1], model.output_shape[2], 1)))
61 |
62 | model.add(Convolution2D(int(hidden_units / 8), (5, 1), kernel_initializer='he_normal', padding='valid',
63 | activation='relu'))
64 | model.add(MaxPooling2D((2, 1)))
65 | model.add(Dropout(0.5))
66 |
67 | model.add(Convolution2D(int(hidden_units / 4), (3, 1), kernel_initializer='he_normal', padding='valid',
68 | activation='relu'))
69 | model.add(MaxPooling2D((2, 1)))
70 | model.add(Dropout(0.5))
71 |         model.add(Flatten())  # collapse the 2-D feature maps to one vector per tweet before the dense layers
72 | model.add(Dense(int(hidden_units / 2), kernel_initializer='he_normal', activation='relu'))
73 | model.add(Dropout(0.5))
74 |
75 | model.add(Dense(2, activation='softmax'))
76 |
77 | adam = Adam(lr=0.001)
78 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
79 |         print('Number of parameters:', model.count_params())
80 |
81 | print(model.summary())
82 |
83 | return model
84 |
85 |
86 | class train_model(sarcasm_model):
87 | train = None
88 | validation = None
89 | print("Loading resource...")
90 |
91 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
92 | vocab_file,
93 | output_file,
94 | word2vec_path=None, test_file=None, input_weight_file_path=None):
95 |
96 | sarcasm_model.__init__(self)
97 |
98 | self._train_file = train_file
99 | self._validation_file = validation_file
100 | self._word_file_path = word_file_path
101 | self._split_word_file_path = split_word_path
102 | self._emoji_file_path = emoji_file_path
103 | self._model_file = model_file
104 | self._vocab_file_path = vocab_file
105 | self._output_file = output_file
106 | self._input_weight_file_path = input_weight_file_path
107 |         self._test_file = test_file  # needed so the test split can be folded into the vocabulary below
108 | self.load_train_validation_test_data()
109 |
110 | print(self._line_maxlen)
111 |
112 | # build vocabulary
113 | if (self._test_file != None):
114 | self._vocab = dh.build_vocab(self.train + self.validation + self.test)
115 | else:
116 | self._vocab = dh.build_vocab(self.train + self.validation)
117 |
118 | self._vocab['unk'] = len(self._vocab.keys()) + 1
119 |
120 | print(len(self._vocab.keys()) + 1)
121 | print('unk::', self._vocab['unk'])
122 |
123 | dh.write_vocab(self._vocab_file_path, self._vocab)
124 |
125 | # prepares input
126 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
127 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
128 |
129 | # prepares input
130 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
131 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
132 |
133 | # embedding dimension
134 | dimension_size = 100
135 | W = []
136 |
137 | W = dh.get_word2vec_weight(self._vocab, n=200,
138 | path=word2vec_path)
139 |
140 | # solving class imbalance
141 | ratio = self.calculate_label_ratio(Y)
142 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
143 | print('class ratio::', ratio)
144 |
145 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
146 |
147 | print('train_X', X.shape)
148 | print('train_Y', Y.shape)
149 | print('validation_X', tX.shape)
150 | print('validation_Y', tY.shape)
151 |
152 | # trainable true if you want word2vec weights to be updated
153 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, trainable=False)
154 |
155 | open(self._model_file + 'model.json', 'w').write(model.to_json())
156 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
157 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
158 | save_best_only=False)
159 | early_stopping = EarlyStopping(monitor='loss', patience=20, verbose=1)
160 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
161 | epsilon=0.0001,
162 | cooldown=0, min_lr=0.000001)
163 |
164 | # training
165 | model.fit(X, Y, batch_size=128, epochs=100, validation_data=(tX, tY), shuffle=True,
166 | callbacks=[save_best], class_weight=ratio)
167 |
168 | def load_train_validation_test_data(self):
169 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
170 | self._emoji_file_path, normalize_text=True,
171 | split_hashtag=True,
172 | ignore_profiles=False, replace_emoji=False)
173 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
174 | self._emoji_file_path, normalize_text=True,
175 | split_hashtag=True,
176 | ignore_profiles=False, replace_emoji=False)
177 | if (self._test_file != None):
178 |             self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
179 |                                     self._emoji_file_path, normalize_text=True, split_hashtag=True,
180 |                                     ignore_profiles=True, replace_emoji=False)
181 |
182 | def get_maxlen(self):
183 |         return max(len(t[2]) for t in self.train + self.validation)  # t[2] is the tokenized tweet text
184 |
185 | def write_vocab(self):
186 | with open(self._vocab_file_path, 'w') as fw:
187 |             for key, value in self._vocab.items():
188 | fw.write(str(key) + '\t' + str(value) + '\n')
189 |
190 | def calculate_label_ratio(self, labels):
191 | return collections.Counter(labels)
192 |
193 |
194 | class test_model(sarcasm_model):
195 | test = None
196 | model = None
197 |
198 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
199 | input_weight_file_path=None):
200 | print('initializing...')
201 | sarcasm_model.__init__(self)
202 |
203 | self._model_file_path = model_file
204 | self._word_file_path = word_file_path
205 | self._split_word_file_path = split_word_path
206 | self._emoji_file_path = emoji_file_path
207 | self._vocab_file_path = vocab_file_path
208 | self._output_file = output_file
209 | self._input_weight_file_path = input_weight_file_path
210 |
211 | print('test_maxlen', self._line_maxlen)
212 |
213 | def load_trained_model(self, weight_file='model.json.hdf5'):
214 | start = time.time()
215 | self.__load_model(self._model_file_path + 'model.json', self._model_file_path + weight_file)
216 | end = time.time()
217 | print('model loading time::', (end - start))
218 |
219 | def __load_model(self, model_path, model_weight_path):
220 | self.model = model_from_json(open(model_path).read())
221 | print('model loaded from file...')
222 | self.model.load_weights(model_weight_path)
223 | print('model weights loaded from file...')
224 |
225 | def load_vocab(self):
226 | vocab = defaultdict()
227 | with open(self._vocab_file_path, 'r') as f:
228 | for line in f.readlines():
229 | key, value = line.split('\t')
230 | vocab[key] = value
231 |
232 | return vocab
233 |
234 | def predict(self, test_file, verbose=False):
235 | try:
236 | start = time.time()
237 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
238 | normalize_text=True, split_hashtag=True,
239 | ignore_profiles=True)
240 | end = time.time()
241 | if (verbose == True):
242 | print('test resource loading time::', (end - start))
243 |
244 | self._vocab = self.load_vocab()
245 |
246 | start = time.time()
247 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
248 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
249 | end = time.time()
250 | if (verbose == True):
251 | print('test resource preparation time::', (end - start))
252 |
253 | self.__predict_model(tX, self.test)
254 | except Exception as e:
255 | print('Error:', e)
256 |
257 | def __predict_model(self, tX, test):
258 | y = []
259 | y_pred = []
260 |
261 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
262 |
263 | try:
264 | fd = open(self._output_file + '.analysis', 'w')
265 | for i, (label) in enumerate(prediction_probability):
266 | id = test[i][0]
267 | gold_label = test[i][1]
268 | words = test[i][2]
269 | dimensions = test[i][3]
270 | context = test[i][4]
271 | author = test[i][5]
272 |
273 | predicted = numpy.argmax(prediction_probability[i])
274 |
275 | y.append(int(gold_label))
276 | y_pred.append(predicted)
277 |
278 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
279 | + str(gold_label) + '\t'
280 | + str(predicted) + '\t'
281 | + ' '.join(words))
282 |
283 | fd.write('\n')
284 |
285 | print()
286 |
287 | print('accuracy::', metrics.accuracy_score(y, y_pred))
288 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
289 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
290 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
291 | print('f_score::', metrics.classification_report(y, y_pred))
292 | fd.close()
293 | except Exception as e:
294 | print(e)
295 |
296 |
297 | if __name__ == "__main__":
298 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
299 | train_file = basepath + '/resource/train/Train_v1.txt'
300 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
301 | test_file = basepath + '/resource/test/Test_v1.txt'
302 | word_file_path = basepath + '/resource/word_list_freq.txt'
303 | split_word_path = basepath + '/resource/word_split.txt'
304 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
305 |
306 | output_file = basepath + '/resource/text_model_2D/TestResults.txt'
307 | model_file = basepath + '/resource/text_model_2D/weights/'
308 | vocab_file_path = basepath + '/resource/text_model_2D/vocab_list.txt'
309 |
310 | # word2vec path
311 | word2vec_path = '/home/ubuntu/word2vec/GoogleNews-vectors-negative300.bin'
312 | glove_path = '/home/striker/word2vec/glove_model_200.txt.bin'
313 |
314 | # test file is passed to build the vocabulary
315 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
316 | # vocab_file_path, output_file,
317 | # word2vec_path=glove_path, test_file=test_file)
318 | #
319 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
320 | t.load_trained_model()
321 | t.predict(test_file)
322 |
--------------------------------------------------------------------------------
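The 2-D model reshapes the embedded tweet into a single-channel (maxlen, embedding_dim, 1) "image" so that kernels of height 5 and 3 slide over word positions only. A shape walk-through under the script's defaults (maxlen=30, 200-d word2vec slices, hidden_units=256), written as an illustrative sketch rather than repository code:

    # Keras 'valid' convolutions: out_len = in_len - kernel_len + 1
    maxlen, emb_dim, hidden_units = 30, 200, 256

    # Embedding + Reshape : (batch, 30) -> (batch, 30, 200) -> (batch, 30, 200, 1)
    # Conv2D(32, (5, 1))  : (batch, 26, 200, 32)   filters span 5 word positions
    # MaxPooling2D((2, 1)): (batch, 13, 200, 32)
    # Conv2D(64, (3, 1))  : (batch, 11, 200, 64)
    # MaxPooling2D((2, 1)): (batch, 5, 200, 64)
    # the feature maps are then collapsed to one vector per tweet
    # before Dense(128) and Dense(2, softmax) produce the sarcasm probabilities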
/src/sarcasm_detection_model_CNN_LSTM_ATTN.py:
--------------------------------------------------------------------------------
1 | # for smaller datasets please use the simpler model sarcasm_detection_model_CNN_LSTM_DNN_simpler.py
2 |
3 | import os
4 | import sys
5 |
6 | from src.data_processing.data_handler import load_glove_model, build_auxiliary_feature
7 |
8 | sys.path.append('../')
9 |
10 | import collections
11 | import time
12 | import numpy
13 |
14 | from keras import backend as K
15 |
16 | from keras import backend as K, regularizers
17 | from sklearn import metrics
18 | from keras.models import model_from_json, load_model
19 | from keras.layers.core import Dropout, Dense, Activation, Flatten
20 | from keras.layers.embeddings import Embedding
21 | from keras.layers.recurrent import LSTM
22 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
23 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
24 |
25 | from keras.layers.merge import concatenate, multiply
26 | from keras.models import Model
27 | from keras.utils import np_utils
28 | from keras.layers import Input, Reshape, Permute, RepeatVector, Lambda, merge
29 | import src.data_processing.data_handler as dh
30 | from collections import defaultdict
31 |
32 |
33 | class sarcasm_model():
34 | _train_file = None
35 | _test_file = None
36 | _tweet_file = None
37 | _output_file = None
38 | _model_file_path = None
39 | _word_file_path = None
40 | _split_word_file_path = None
41 | _emoji_file_path = None
42 | _vocab_file_path = None
43 | _input_weight_file_path = None
44 | _vocab = None
45 | _line_maxlen = None
46 |
47 | def __init__(self):
48 | self._line_maxlen = 30
49 |
50 | def attention_3d_block(self, inputs, SINGLE_ATTENTION_VECTOR=False):
51 | # inputs.shape = (batch_size, time_steps, input_dim)
52 | input_dim = int(inputs.shape[2])
53 | a = Permute((2, 1))(inputs)
54 | a = Reshape((input_dim, self._line_maxlen))(a)
55 |         # the Reshape above is effectively a no-op; it only makes explicit which axis is which
56 | a = Dense(self._line_maxlen, activation='softmax')(a)
57 | if SINGLE_ATTENTION_VECTOR:
58 | a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
59 | a = RepeatVector(input_dim)(a)
60 | a_probs = Permute((2, 1), name='attention_vec')(a)
61 | output_attention_mul = multiply([inputs, a_probs], name='attention_mul')
62 | return output_attention_mul
63 |
64 | def _build_network(self, vocab_size, maxlen, emb_weights=[], embedding_dimension=50, hidden_units=256,
65 | batch_size=1):
66 | print('Build model...')
67 |
68 | text_input = Input(name='text', shape=(maxlen,))
69 |
70 | if (len(emb_weights) == 0):
71 | emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen,
72 | embeddings_initializer='glorot_normal',
73 | trainable=True)(text_input)
74 | else:
75 | emb = Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
76 | trainable=False)(text_input)
77 | emb_dropout = Dropout(0.5)(emb)
78 |
79 | lstm_bwd = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.4,
80 | go_backwards=True, return_sequences=True)(emb_dropout)
81 | lstm_fwd = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.4,
82 | return_sequences=True)(emb_dropout)
83 |
84 | lstm_merged = concatenate([lstm_bwd, lstm_fwd])
85 |
86 | attention_mul = self.attention_3d_block(lstm_merged)
87 |
88 | flat_attention = Flatten()(attention_mul)
89 |
90 | aux_input = Input(name='aux', shape=(5,))
91 |
92 | merged_aux = concatenate([flat_attention, aux_input], axis=1)
93 |
94 |
95 | reshaped = Reshape((-1, 1))(merged_aux)
96 |
97 | print(reshaped.shape)
98 |
99 | cnn1 = Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='relu')(
100 | reshaped)
101 | pool1 = MaxPooling1D(pool_size=3)(cnn1)
102 | print(pool1.shape)
103 |
104 | cnn2 = Convolution1D(2 * hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='relu')(
105 | pool1)
106 | pool2 = MaxPooling1D(pool_size=3)(cnn2)
107 | print(pool2.shape)
108 |
109 | flat_cnn = Flatten()(pool2)
110 |
111 | dnn_1 = Dense(hidden_units)(flat_cnn)
112 | dropout_1 = Dropout(0.25)(dnn_1)
113 | dnn_2 = Dense(2)(dropout_1)
114 | print(dnn_2.shape)
115 |
116 | softmax = Activation('softmax')(dnn_2)
117 |
118 | model = Model(inputs=[text_input, aux_input], outputs=softmax)
119 |
120 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
121 |         print('Number of parameters:', model.count_params())
122 |
123 | print(model.summary())
124 |
125 | return model
126 |
127 |
128 | class train_model(sarcasm_model):
129 | train = None
130 | validation = None
131 | print("Loading resource...")
132 |
133 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
134 | vocab_file,
135 | output_file,
136 | input_weight_file_path=None):
137 | sarcasm_model.__init__(self)
138 |
139 | self._train_file = train_file
140 | self._validation_file = validation_file
141 | self._word_file_path = word_file_path
142 | self._split_word_file_path = split_word_path
143 | self._emoji_file_path = emoji_file_path
144 | self._model_file = model_file
145 | self._vocab_file_path = vocab_file
146 | self._output_file = output_file
147 | self._input_weight_file_path = input_weight_file_path
148 |
149 | self.load_train_validation_data()
150 |
151 | print(self._line_maxlen)
152 | batch_size = 32
153 |
154 | # build vocabulary
155 | # truncates words with min freq=1
156 | self._vocab = dh.build_vocab(self.train, min_freq=1)
157 | if ('unk' not in self._vocab):
158 | self._vocab['unk'] = len(self._vocab.keys()) + 1
159 |
160 | print(len(self._vocab.keys()) + 1)
161 | print('unk::', self._vocab['unk'])
162 |
163 | dh.write_vocab(self._vocab_file_path, self._vocab)
164 |
165 |         self.train = self.train[:-(len(self.train) % batch_size) or None]  # 'or None' keeps everything when already divisible
166 |         self.validation = self.validation[:-(len(self.validation) % batch_size) or None]
167 |
168 | # prepares input
169 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
170 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
171 |
172 | # prepares input
173 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
174 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
175 |
176 | # embedding dimension
177 | dimension_size = 300
178 | emb_weights = load_glove_model(self._vocab, n=dimension_size,
179 | glove_path='/home/aghosh/backups/glove.6B.300d.txt')
180 |
181 | # aux inputs
182 | aux_train = build_auxiliary_feature(self.train)
183 | aux_validation = build_auxiliary_feature(self.validation)
184 |
185 | # solving class imbalance
186 | ratio = self.calculate_label_ratio(Y)
187 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
188 | print('class ratio::', ratio)
189 |
190 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
191 |
192 | print('train_X', X.shape)
193 | print('train_Y', Y.shape)
194 | print('validation_X', tX.shape)
195 | print('validation_Y', tY.shape)
196 |
197 | # trainable true if you want word2vec weights to be updated
198 | # Not applicable in this code
199 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights, hidden_units=32,
200 | embedding_dimension=dimension_size, batch_size=batch_size)
201 |
202 | # open(self._model_file + 'model.json', 'w').write(model.to_json())
203 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
204 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
205 | save_best_only=False)
206 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
207 |
208 | # training
209 | model.fit([X, aux_train], Y, batch_size=batch_size, epochs=10, validation_data=([tX, aux_validation], tY),
210 | shuffle=True,
211 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
212 |
213 | def load_train_validation_data(self):
214 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
215 | self._emoji_file_path, normalize_text=True,
216 | split_hashtag=True,
217 | ignore_profiles=False)
218 | print('Training data loading finished...')
219 |
220 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
221 | self._emoji_file_path,
222 | normalize_text=True,
223 | split_hashtag=True,
224 | ignore_profiles=False)
225 | print('Validation data loading finished...')
226 |
227 | if (self._test_file != None):
228 |             self.test = dh.loaddata(self._test_file, self._word_file_path, self._split_word_file_path,
229 |                                     self._emoji_file_path, normalize_text=True, split_hashtag=True,
230 |                                     ignore_profiles=True)
231 |
232 | def get_maxlen(self):
233 |         return max(len(t[2]) for t in self.train + self.validation)  # t[2] is the tokenized tweet text
234 |
235 | def write_vocab(self):
236 | with open(self._vocab_file_path, 'w') as fw:
237 |             for key, value in self._vocab.items():
238 | fw.write(str(key) + '\t' + str(value) + '\n')
239 |
240 | def calculate_label_ratio(self, labels):
241 | return collections.Counter(labels)
242 |
243 |
244 | class test_model(sarcasm_model):
245 | test = None
246 | model = None
247 |
248 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
249 | input_weight_file_path=None):
250 | print('initializing...')
251 | sarcasm_model.__init__(self)
252 |
253 | self._model_file_path = model_file
254 | self._word_file_path = word_file_path
255 | self._split_word_file_path = split_word_path
256 | self._emoji_file_path = emoji_file_path
257 | self._vocab_file_path = vocab_file_path
258 | self._output_file = output_file
259 | self._input_weight_file_path = input_weight_file_path
260 |
261 | print('test_maxlen', self._line_maxlen)
262 |
263 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'):
264 | start = time.time()
265 | self.__load_model(self._model_file_path + weight_file)
266 | end = time.time()
267 | print('model loading time::', (end - start))
268 |
269 | def __load_model(self, model_path):
270 | self.model = load_model(model_path)
271 | print('model loaded from file...')
272 | # self.model.load_weights(model_weight_path)
273 | # print('model weights loaded from file...')
274 |
275 | def load_vocab(self):
276 | vocab = defaultdict()
277 | with open(self._vocab_file_path, 'r') as f:
278 | for line in f.readlines():
279 | key, value = line.split('\t')
280 | vocab[key] = value
281 |
282 | return vocab
283 |
284 | def predict(self, test_file, verbose=False):
285 | try:
286 | start = time.time()
287 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
288 | normalize_text=True, split_hashtag=True,
289 | ignore_profiles=False)
290 | end = time.time()
291 | if (verbose == True):
292 | print('test resource loading time::', (end - start))
293 |
294 | self._vocab = self.load_vocab()
295 | print('vocab loaded...')
296 |
297 | start = time.time()
298 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
299 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
300 |
301 | aux_test = build_auxiliary_feature(self.test)
302 |
303 | end = time.time()
304 | if (verbose == True):
305 | print('test resource preparation time::', (end - start))
306 |
307 | self.__predict_model([tX, aux_test], self.test)
308 | except Exception as e:
309 | print('Error:', e)
310 | raise
311 |
312 | def __predict_model(self, tX, test):
313 | y = []
314 | y_pred = []
315 |
316 | # tX = tX[:-len(tX) % 32]
317 | # test = test[:-len(test) % 32]
318 |
319 |         prediction_probability = self.model.predict(tX, batch_size=1, verbose=1)
320 |
321 | try:
322 | fd = open(self._output_file + '.analysis', 'w')
323 | for i, (label) in enumerate(prediction_probability):
324 | gold_label = test[i][1]
325 | words = test[i][2]
326 | dimensions = test[i][3]
327 | context = test[i][4]
328 | author = test[i][5]
329 |
330 | predicted = numpy.argmax(prediction_probability[i])
331 |
332 | y.append(int(gold_label))
333 | y_pred.append(predicted)
334 |
335 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
336 | + str(gold_label) + '\t'
337 | + str(predicted) + '\t'
338 | + ' '.join(words))
339 |
340 | fd.write('\n')
341 |
342 | print()
343 |
344 | print('accuracy::', metrics.accuracy_score(y, y_pred))
345 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
346 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
347 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
348 | print('f_score::', metrics.classification_report(y, y_pred))
349 | fd.close()
350 | except Exception as e:
351 | print(e)
352 | raise
353 |
354 |
355 | if __name__ == "__main__":
356 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
357 | train_file = basepath + '/resource/train/Train_v1.txt'
358 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
359 | test_file = basepath + '/resource/test/Test_v1.txt'
360 | word_file_path = basepath + '/resource/word_list_freq.txt'
361 | split_word_path = basepath + '/resource/word_split.txt'
362 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
363 |
364 | output_file = basepath + '/resource/text_model/TestResults.txt'
365 | model_file = basepath + '/resource/text_model/weights/'
366 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
367 |
368 | # training
369 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
370 | vocab_file_path, output_file)
371 |
372 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
373 | t.load_trained_model(weight_file='model.json.hdf5')
374 | t.predict(test_file)
375 |
--------------------------------------------------------------------------------
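The test_model above restores the whole network with keras.models.load_model applied to the single model.json.hdf5 checkpoint, while the scripts that follow rebuild the architecture from model.json and then load weights into it. A minimal sketch of the two patterns, assuming model_dir is the weights directory that __main__ calls model_file:

    from keras.models import load_model, model_from_json

    def restore_models(model_dir):
        # single-file checkpoint: architecture and weights restored together, as above
        full = load_model(model_dir + 'model.json.hdf5')

        # split form used by the scripts below: architecture from JSON, weights loaded separately
        split = model_from_json(open(model_dir + 'model.json').read())
        split.load_weights(model_dir + 'model.json.hdf5')
        return full, split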
/src/sarcasm_detection_model_CNN_LSTM_DNN.py:
--------------------------------------------------------------------------------
1 | # for smaller datasets please use the simpler model sarcasm_detection_model_CNN_LSTM_DNN_simpler.py
2 |
3 | import os
4 | import sys
5 |
6 | sys.path.append('../')
7 |
8 | import collections
9 | import time
10 | import numpy
11 |
12 | numpy.random.seed(1337)
13 | from sklearn import metrics
14 | from keras.models import Model
15 | from keras.layers import Input
16 | from keras.models import Sequential, model_from_json
17 | from keras.layers.core import Dropout, Dense, Activation
18 | from keras.layers.embeddings import Embedding
19 | from keras.layers.recurrent import LSTM
20 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
21 | from keras.callbacks import ModelCheckpoint
22 | from keras.callbacks import EarlyStopping
23 | from keras.optimizers import Adam
24 | from keras.utils import np_utils
25 | from collections import defaultdict
26 | import src.data_processing.data_handler as dh
27 |
28 |
29 | class sarcasm_model():
30 | _train_file = None
31 | _test_file = None
32 | _tweet_file = None
33 | _output_file = None
34 | _model_file_path = None
35 | _word_file_path = None
36 | _split_word_file_path = None
37 | _emoji_file_path = None
38 | _vocab_file_path = None
39 | _input_weight_file_path = None
40 | _vocab = None
41 | _line_maxlen = None
42 |
43 | def __init__(self):
44 | self._line_maxlen = 30
45 |
46 | def _build_network(self, vocab_size, maxlen, emb_weights=[], embedding_dimension=256, hidden_units=256):
47 | print('Build model...')
48 |
49 | text_input = Input(name='text', shape=(maxlen,))
50 |
51 | if (len(emb_weights) == 0):
52 | emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen,
53 | embeddings_initializer='glorot_normal',
54 | trainable=True)(text_input)
55 | else:
56 | emb = Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
57 | trainable=False)(text_input)
58 |
59 | cnn1 = Convolution1D(int(hidden_units / 4), 3, kernel_initializer='he_normal', activation='sigmoid',
60 | padding='valid', input_shape=(1, maxlen))(emb)
61 |
62 | cnn2 = Convolution1D(int(hidden_units / 2), 3, kernel_initializer='he_normal', activation='sigmoid',
63 | padding='valid', input_shape=(1, maxlen - 1))(cnn1)
64 |
65 | lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
66 | dropout=0.25, return_sequences=True)(cnn2)
67 |
68 | lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid',
69 | dropout=0.25)(lstm1)
70 |
71 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(lstm2)
72 | dnn_2 = Dense(2, activation='softmax')(dnn_1)
73 |
74 | model = Model(inputs=[text_input], outputs=dnn_2)
75 |
76 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
77 | print('No of parameter:', model.count_params())
78 |
79 | print(model.summary())
80 | return model
81 |
82 |
83 | class train_model(sarcasm_model):
84 | train = None
85 | validation = None
86 | print("Loading resource...")
87 |
88 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
89 | vocab_file,
90 | output_file,
91 | word2vec_path=None):
92 | sarcasm_model.__init__(self)
93 |
94 | self._train_file = train_file
95 | self._validation_file = validation_file
96 | self._word_file_path = word_file_path
97 | self._split_word_file_path = split_word_path
98 | self._emoji_file_path = emoji_file_path
99 | self._model_file = model_file
100 | self._vocab_file_path = vocab_file
101 | self._output_file = output_file
102 |
103 | self.load_train_validation_data()
104 |
105 | print(self._line_maxlen)
106 |
107 | # build vocabulary
108 | # truncates words with min freq=1
109 | self._vocab = dh.build_vocab(self.train, min_freq=1)
110 | if ('unk' not in self._vocab):
111 | self._vocab['unk'] = len(self._vocab.keys()) + 1
112 |
113 | print(len(self._vocab.keys()) + 1)
114 | print('unk::', self._vocab['unk'])
115 |
116 | dh.write_vocab(self._vocab_file_path, self._vocab)
117 |
118 | # prepares input
119 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
120 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
121 |
122 | # prepares input
123 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
124 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
125 |
126 | # embedding dimension
127 | dimension_size = 300
128 |
129 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
130 | path=word2vec_path)
131 |
132 | # solving class imbalance
133 | ratio = self.calculate_label_ratio(Y)
134 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
135 | print('class ratio::', ratio)
136 |
137 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
138 |
139 | print('train_X', X.shape)
140 | print('train_Y', Y.shape)
141 | print('validation_X', tX.shape)
142 | print('validation_Y', tY.shape)
143 |
144 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=256, emb_weights=W)
145 |
146 | open(self._model_file + 'model.json', 'w').write(model.to_json())
147 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
148 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}.hdf5',
149 | save_best_only=False)
150 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
151 |
152 | # training
153 | model.fit(X, Y, batch_size=64, epochs=100, validation_data=(tX, tY), shuffle=True,
154 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio, verbose=2)
155 |
156 | def load_train_validation_data(self):
157 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
158 | self._emoji_file_path, normalize_text=True,
159 | split_hashtag=True,
160 | ignore_profiles=False)
161 | print('Training data loading finished...')
162 |
163 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
164 | self._emoji_file_path,
165 | normalize_text=True,
166 | split_hashtag=True,
167 | ignore_profiles=False)
168 | print('Validation data loading finished...')
169 |
170 | if (self._test_file != None):
171 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True,
172 | split_hashtag=True,
173 | ignore_profiles=True)
174 |
175 | def get_maxlen(self):
176 | return max(map(len, (x for _, x in self.train + self.validation)))
177 |
178 | def write_vocab(self):
179 | with open(self._vocab_file_path, 'w') as fw:
180 | for key, value in self._vocab.items():
181 | fw.write(str(key) + '\t' + str(value) + '\n')
182 |
183 | def calculate_label_ratio(self, labels):
184 | return collections.Counter(labels)
185 |
186 |
187 | class test_model(sarcasm_model):
188 | test = None
189 | model = None
190 |
191 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
192 | input_weight_file_path=None):
193 | print('initializing...')
194 | sarcasm_model.__init__(self)
195 |
196 | self._model_file_path = model_file
197 | self._word_file_path = word_file_path
198 | self._split_word_file_path = split_word_path
199 | self._emoji_file_path = emoji_file_path
200 | self._vocab_file_path = vocab_file_path
201 | self._output_file = output_file
202 | self._input_weight_file_path = input_weight_file_path
203 |
204 | print('test_maxlen', self._line_maxlen)
205 |
206 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'):
207 | start = time.time()
208 | self.__load_model(self._model_file_path + model_file, self._model_file_path + weight_file)
209 | end = time.time()
210 | print('model loading time::', (end - start))
211 |
212 | def __load_model(self, model_path, model_weight_path):
213 | self.model = model_from_json(open(model_path).read())
214 | print('model loaded from file...')
215 | self.model.load_weights(model_weight_path)
216 | print('model weights loaded from file...')
217 |
218 | def load_vocab(self):
219 | vocab = defaultdict()
220 | with open(self._vocab_file_path, 'r') as f:
221 | for line in f.readlines():
222 | key, value = line.split('\t')
223 | vocab[key] = value
224 |
225 | return vocab
226 |
227 | def predict(self, test_file, verbose=False):
228 | try:
229 | start = time.time()
230 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
231 | normalize_text=True, split_hashtag=True,
232 | ignore_profiles=False)
233 | end = time.time()
234 | if (verbose == True):
235 | print('test resource loading time::', (end - start))
236 |
237 | self._vocab = self.load_vocab()
238 | print('vocab loaded...')
239 |
240 | start = time.time()
241 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
242 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
243 | end = time.time()
244 | if (verbose == True):
245 | print('test resource preparation time::', (end - start))
246 |
247 | self.__predict_model(tX, self.test)
248 | except Exception as e:
249 | print('Error:', e)
250 | raise
251 |
252 | def __predict_model(self, tX, test):
253 | y = []
254 | y_pred = []
255 |
256 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
257 |
258 | try:
259 | fd = open(self._output_file + '.analysis', 'w')
260 | for i, (label) in enumerate(prediction_probability):
261 | gold_label = test[i][1]
262 | words = test[i][2]
263 | dimensions = test[i][3]
264 | context = test[i][4]
265 | author = test[i][5]
266 |
267 | predicted = numpy.argmax(prediction_probability[i])
268 |
269 | y.append(int(gold_label))
270 | y_pred.append(predicted)
271 |
272 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
273 | + str(gold_label) + '\t'
274 | + str(predicted) + '\t'
275 | + ' '.join(words))
276 |
277 | fd.write('\n')
278 |
279 | print()
280 |
281 | print('accuracy::', metrics.accuracy_score(y, y_pred))
282 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
283 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
284 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
285 | print('f_score::', metrics.classification_report(y, y_pred))
286 | fd.close()
287 | except Exception as e:
288 | print(e)
289 | raise
290 |
291 |
292 | if __name__ == "__main__":
293 | basepath = os.path.abspath(os.path.join(os.getcwd(), '..'))
294 | train_file = basepath + '/resource/train/Train_v1.txt'
295 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
296 | test_file = basepath + '/resource/test/Test_v1.txt'
297 | word_file_path = basepath + '/resource/word_list_freq.txt'
298 | split_word_path = basepath + '/resource/word_split.txt'
299 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
300 |
301 | output_file = basepath + '/resource/text_model/TestResults.txt'
302 | model_file = basepath + '/resource/text_model/weights/'
303 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
304 |
305 | word2vec_path = '/home/aghosh/backups/GoogleNews-vectors-negative300.bin'
306 |
307 | # uncomment for training
308 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
309 | # vocab_file_path, output_file)
310 |
311 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
312 | t.load_trained_model(weight_file='weights.05.hdf5')
313 | t.predict(test_file)
314 |
--------------------------------------------------------------------------------
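The train_model classes in these scripts derive per-class weights from a collections.Counter over the labels and pass the resulting list as class_weight. Keras documents class_weight as a dict mapping class index to weight, so the dict form is the safer one to rely on across versions; a minimal sketch, assuming integer class labels:

    from collections import Counter

    def class_weights(labels):
        # labels: iterable of int class ids, e.g. [0, 0, 1, 0]
        counts = Counter(labels)
        majority = max(counts.values())
        # rarer classes receive proportionally larger weights
        return {cls: majority / count for cls, count in counts.items()}

    # class_weights([0] * 800 + [1] * 200) -> {0: 1.0, 1: 4.0}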
/src/sarcasm_detection_model_CNN_LSTM_DNN_fasttext.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling1D
4 |
5 | sys.path.append('../')
6 |
7 | import collections
8 | import time
9 | import numpy
10 |
11 | numpy.random.seed(1337)
12 | from sklearn import metrics
13 | from keras.models import Sequential, model_from_json
14 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten
15 | from keras.layers.embeddings import Embedding
16 | from keras.layers.recurrent import LSTM
17 | from keras.layers.convolutional import Convolution1D, MaxPooling1D, Convolution2D
18 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
19 | from keras.callbacks import EarlyStopping
20 | from keras.optimizers import Adam
21 | from keras.utils import np_utils
22 | from collections import defaultdict
23 | import src.data_processing.data_handler as dh
24 |
25 |
26 | class sarcasm_model():
27 | _train_file = None
28 | _test_file = None
29 | _tweet_file = None
30 | _output_file = None
31 | _model_file = None
32 | _word_file_path = None
33 | _split_word_file_path = None
34 | _emoji_file_path = None
35 | _vocab_file_path = None
36 | _input_weight_file_path = None
37 | _vocab = None
38 | _line_maxlen = None
39 |
40 | def __init__(self):
41 | self._line_maxlen = 50
42 |
43 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False):
44 | print('Build model...')
45 | model = Sequential()
46 |
47 | model.add(
48 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal'))
49 |
50 | model.add(
51 | Convolution1D(hidden_units, 2, kernel_initializer='he_normal', padding='valid',
52 | activation='sigmoid'))
53 | model.add(MaxPooling1D(pool_size=2))
54 | model.add(Dropout(0.25))
55 |
56 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5,
57 | recurrent_dropout=0.5, unroll=True, return_sequences=True))
58 |
59 | model.add(GlobalAveragePooling1D())
60 | model.add(Dropout(0.5))
61 |
62 | model.add(Dense(2))
63 | model.add(Activation('softmax'))
64 | adam = Adam(lr=0.001)
65 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
66 | print('No of parameter:', model.count_params())
67 |
68 | print(model.summary())
69 | return model
70 |
71 |
72 | class train_model(sarcasm_model):
73 | train = None
74 | validation = None
75 | print("Loading resource...")
76 |
77 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
78 | vocab_file,
79 | output_file,
80 | word2vec_path=None):
81 | sarcasm_model.__init__(self)
82 |
83 | self._train_file = train_file
84 | self._validation_file = validation_file
85 | self._word_file_path = word_file_path
86 | self._split_word_file_path = split_word_path
87 | self._emoji_file_path = emoji_file_path
88 | self._model_file = model_file
89 | self._vocab_file_path = vocab_file
90 | self._output_file = output_file
91 | self._input_weight_file_path = None  # no separate input weight file is used by this trainer
92 |
93 | self.load_train_validation_data()
94 |
95 | print(self._line_maxlen)
96 |
97 | # build vocabulary
98 | # truncates words with min freq=2
99 | self._vocab = dh.build_vocab(self.train, min_freq=2)
100 | if ('unk' not in self._vocab):
101 | self._vocab['unk'] = len(self._vocab.keys()) + 1
102 |
103 | print(len(self._vocab.keys()) + 1)
104 | print('unk::', self._vocab['unk'])
105 |
106 | dh.write_vocab(self._vocab_file_path, self._vocab)
107 |
108 | # prepares input
109 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
110 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
111 |
112 | # prepares input
113 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
114 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
115 |
116 | # embedding dimension
117 | dimension_size = 30
118 |
119 | W = dh.get_fasttext_weight(self._vocab, n=dimension_size,
120 | path=word2vec_path)
121 |
122 | # solving class imbalance
123 | ratio = self.calculate_label_ratio(Y)
124 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
125 | print('class ratio::', ratio)
126 |
127 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
128 |
129 | print('train_X', X.shape)
130 | print('train_Y', Y.shape)
131 | print('validation_X', tX.shape)
132 | print('validation_Y', tY.shape)
133 |
134 | # NOTE: the fasttext matrix W computed above is not passed to _build_network, so the embedding layer here is trained from scratch
135 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=128,
136 | embedding_dimension=dimension_size,
137 | trainable=True)
138 |
139 | open(self._model_file + 'model.json', 'w').write(model.to_json())
140 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
141 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
142 | save_best_only=False)
143 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
144 | lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto',
145 | epsilon=0.0001,
146 | cooldown=0, min_lr=0.000001)
147 |
148 | # training
149 | # model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True,
150 | # callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
151 | model.fit(X, Y, batch_size=32, epochs=100, validation_split=0.1, shuffle=True,
152 | callbacks=[save_best, lr_tuner, early_stopping], class_weight=ratio)
153 |
154 | def load_train_validation_data(self):
155 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
156 | self._emoji_file_path, normalize_text=True,
157 | split_hashtag=True,
158 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
159 | print('Training data loading finished...')
160 |
161 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
162 | self._emoji_file_path,
163 | normalize_text=True,
164 | split_hashtag=False,
165 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
166 | print('Validation data loading finished...')
167 |
168 | def get_maxlen(self):
169 | return max(map(len, (x for _, x in self.train + self.validation)))
170 |
171 | def write_vocab(self):
172 | with open(self._vocab_file_path, 'w') as fw:
173 | for key, value in self._vocab.items():
174 | fw.write(str(key) + '\t' + str(value) + '\n')
175 |
176 | def calculate_label_ratio(self, labels):
177 | return collections.Counter(labels)
178 |
179 |
180 | class test_model(sarcasm_model):
181 | test = None
182 | model = None
183 |
184 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
185 | input_weight_file_path=None):
186 | print('initializing...')
187 | sarcasm_model.__init__(self)
188 |
189 | self._model_file = model_file
190 | self._word_file_path = word_file_path
191 | self._split_word_file_path = split_word_path
192 | self._emoji_file_path = emoji_file_path
193 | self._vocab_file_path = vocab_file_path
194 | self._output_file = output_file
195 | self._input_weight_file_path = input_weight_file_path
196 |
197 | print('test_maxlen', self._line_maxlen)
198 |
199 | def load_trained_model(self, weight_file='model.json.hdf5'):
200 | start = time.time()
201 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
202 | end = time.time()
203 | print('model loading time::', (end - start))
204 |
205 | def __load_model(self, model_path, model_weight_path):
206 | self.model = model_from_json(open(model_path).read())
207 | print('model loaded from file...')
208 | self.model.load_weights(model_weight_path)
209 | print('model weights loaded from file...')
210 |
211 | def load_vocab(self):
212 | vocab = defaultdict()
213 | with open(self._vocab_file_path, 'r') as f:
214 | for line in f.readlines():
215 | key, value = line.split('\t')
216 | vocab[key] = value
217 |
218 | return vocab
219 |
220 | def predict(self, test_file, verbose=False):
221 | try:
222 | start = time.time()
223 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
224 | normalize_text=True, split_hashtag=True,
225 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
226 | end = time.time()
227 | if (verbose == True):
228 | print('test resource loading time::', (end - start))
229 |
230 | self._vocab = self.load_vocab()
231 | print('vocab loaded...')
232 |
233 | start = time.time()
234 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
235 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
236 | end = time.time()
237 | if (verbose == True):
238 | print('test resource preparation time::', (end - start))
239 |
240 | self.__predict_model(tX, self.test)
241 | except Exception as e:
242 | print('Error:', e)
243 |
244 | def __predict_model(self, tX, test):
245 | y = []
246 | y_pred = []
247 |
248 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
249 |
250 | try:
251 | fd = open(self._output_file + '.analysis', 'w')
252 | for i, (label) in enumerate(prediction_probability):
253 | id = test[i][0]
254 | gold_label = test[i][1]
255 | words = test[i][2]
256 | dimensions = test[i][3]
257 | context = test[i][4]
258 | author = test[i][5]
259 |
260 | predicted = numpy.argmax(prediction_probability[i])
261 |
262 | y.append(int(gold_label))
263 | y_pred.append(predicted)
264 |
265 | # fd.write(str(id) + '\t' + str(label[0]) + '\t' + str(label[1]) + '\t'
266 | # + str(gold_label) + '\t'
267 | # + str(predicted) + '\t'
268 | # + ' '.join(words))
269 | fd.write(str(id) + ',' + ','.join([str(l) for l in label]) + '\n')
270 |
271 | print()
272 |
273 | print('accuracy::', metrics.accuracy_score(y, y_pred))
274 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
275 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
276 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
277 | print('f_score::', metrics.classification_report(y, y_pred))
278 | fd.close()
279 | except Exception as e:
280 | print(e)
281 |
282 |
283 | if __name__ == "__main__":
284 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
285 | train_file = basepath + '/resource/train/spooky_train.tsv'
286 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
287 | test_file = basepath + '/resource/test/spooky_test.tsv'
288 | word_file_path = basepath + '/resource/word_list_freq.txt'
289 | split_word_path = basepath + '/resource/word_split.txt'
290 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
291 |
292 | output_file = basepath + '/resource/text_model/TestResults.txt'
293 | model_file = basepath + '/resource/text_model/weights/'
294 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
295 |
296 | # fasttext model path
297 | fasttext_path = '/home/fasttext/en.wiki.bin'
298 |
299 | # training
300 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
301 | vocab_file_path, output_file, fasttext_path)
302 |
303 | # t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
304 | # t.load_trained_model()
305 | # t.predict(test_file)
306 |
--------------------------------------------------------------------------------
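The script above builds the fasttext matrix W but never hands it to _build_network; sarcasm_detection_model_CNN_LSTM_DNN.py above shows the usual way of seeding an Embedding layer with such a matrix. A minimal sketch of that wiring, assuming W has one row per vocabulary index (row 0 reserved for padding):

    from keras.layers.embeddings import Embedding

    def pretrained_embedding(W, maxlen, trainable=False):
        # W: numpy array of shape (vocab_size, embedding_dim)
        vocab_size, dim = W.shape
        return Embedding(vocab_size, dim, input_length=maxlen,
                         weights=[W],          # initialise with the pretrained vectors
                         trainable=trainable)  # True to fine-tune them during training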
/src/sarcasm_detection_model_CNN_LSTM_DNN_simpler.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append('../')
5 |
6 | import collections
7 | import time
8 | import numpy
9 |
10 | numpy.random.seed(1337)
11 | from sklearn import metrics
12 | from keras.models import Sequential, model_from_json
13 | from keras.layers.core import Dropout, Dense, Activation
14 | from keras.layers.embeddings import Embedding
15 | from keras.layers.recurrent import LSTM
16 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
17 | from keras.callbacks import ModelCheckpoint
18 | from keras.callbacks import EarlyStopping
19 | from keras.optimizers import Adam
20 | from keras.utils import np_utils
21 | from collections import defaultdict
22 | import src.data_processing.data_handler as dh
23 |
24 |
25 | class sarcasm_model():
26 | _train_file = None
27 | _test_file = None
28 | _tweet_file = None
29 | _output_file = None
30 | _model_file_path = None
31 | _word_file_path = None
32 | _split_word_file_path = None
33 | _emoji_file_path = None
34 | _vocab_file_path = None
35 | _input_weight_file_path = None
36 | _vocab = None
37 | _line_maxlen = None
38 |
39 | def __init__(self):
40 | self._line_maxlen = 30
41 |
42 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False):
43 | print('Build model...')
44 | model = Sequential()
45 |
46 | model.add(
47 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal'))
48 |
49 | model.add(Convolution1D(hidden_units, 3, kernel_initializer='he_normal', padding='valid', activation='sigmoid',
50 | input_shape=(1, maxlen)))
51 | model.add(MaxPooling1D(pool_size=3))
52 | model.add(Dropout(0.25))
53 |
54 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5))
55 | model.add(Dropout(0.25))
56 |
57 | model.add(Dense(hidden_units, kernel_initializer='he_normal', activation='sigmoid'))
58 | model.add(Dropout(0.25))
59 |
60 | model.add(Dense(2))
61 | model.add(Activation('softmax'))
62 | adam = Adam(lr=0.0001)
63 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
64 | print('No of parameter:', model.count_params())
65 |
66 | print(model.summary())
67 | return model
68 |
69 |
70 | class train_model(sarcasm_model):
71 | train = None
72 | validation = None
73 | print("Loading resource...")
74 |
75 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
76 | vocab_file,
77 | output_file,
78 | input_weight_file_path=None):
79 | sarcasm_model.__init__(self)
80 |
81 | self._train_file = train_file
82 | self._validation_file = validation_file
83 | self._word_file_path = word_file_path
84 | self._split_word_file_path = split_word_path
85 | self._emoji_file_path = emoji_file_path
86 | self._model_file = model_file
87 | self._vocab_file_path = vocab_file
88 | self._output_file = output_file
89 | self._input_weight_file_path = input_weight_file_path
90 |
91 | self.load_train_validation_data()
92 |
93 | print(self._line_maxlen)
94 |
95 | # build vocabulary
96 | # truncates words with min freq=1
97 | self._vocab = dh.build_vocab(self.train, min_freq=1)
98 | if ('unk' not in self._vocab):
99 | self._vocab['unk'] = len(self._vocab.keys()) + 1
100 |
101 | print(len(self._vocab.keys()) + 1)
102 | print('unk::', self._vocab['unk'])
103 |
104 | dh.write_vocab(self._vocab_file_path, self._vocab)
105 |
106 | # prepares input
107 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
108 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
109 |
110 | # prepares input
111 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
112 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
113 |
114 | # embedding dimension
115 | dimension_size = 256
116 |
117 | # solving class imbalance
118 | ratio = self.calculate_label_ratio(Y)
119 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
120 | print('class ratio::', ratio)
121 |
122 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
123 |
124 | print('train_X', X.shape)
125 | print('train_Y', Y.shape)
126 | print('validation_X', tX.shape)
127 | print('validation_Y', tY.shape)
128 |
129 | # trainable true if you want word2vec weights to be updated
130 | # Not applicable in this code
131 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, embedding_dimension=dimension_size,
132 | trainable=True)
133 |
134 | open(self._model_file + 'model.json', 'w').write(model.to_json())
135 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
136 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
137 | save_best_only=False)
138 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
139 |
140 | # training
141 | model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True,
142 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
143 |
144 | def load_train_validation_data(self):
145 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
146 | self._emoji_file_path, normalize_text=True,
147 | split_hashtag=True,
148 | ignore_profiles=False)
149 | print('Training data loading finished...')
150 |
151 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
152 | self._emoji_file_path,
153 | normalize_text=True,
154 | split_hashtag=True,
155 | ignore_profiles=False)
156 | print('Validation data loading finished...')
157 |
158 | def get_maxlen(self):
159 | return max(map(len, (x for _, x in self.train + self.validation)))
160 |
161 | def write_vocab(self):
162 | with open(self._vocab_file_path, 'w') as fw:
163 | for key, value in self._vocab.items():
164 | fw.write(str(key) + '\t' + str(value) + '\n')
165 |
166 | def calculate_label_ratio(self, labels):
167 | return collections.Counter(labels)
168 |
169 |
170 | class test_model(sarcasm_model):
171 | test = None
172 | model = None
173 |
174 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
175 | input_weight_file_path=None):
176 | print('initializing...')
177 | sarcasm_model.__init__(self)
178 |
179 | self._model_file_path = model_file
180 | self._word_file_path = word_file_path
181 | self._split_word_file_path = split_word_path
182 | self._emoji_file_path = emoji_file_path
183 | self._vocab_file_path = vocab_file_path
184 | self._output_file = output_file
185 | self._input_weight_file_path = input_weight_file_path
186 |
187 | print('test_maxlen', self._line_maxlen)
188 |
189 | def load_trained_model(self, model_file='model.json', weight_file='model.json.hdf5'):
190 | start = time.time()
191 | self.__load_model(self._model_file_path + model_file, self._model_file_path + weight_file)
192 | end = time.time()
193 | print('model loading time::', (end - start))
194 |
195 | def __load_model(self, model_path, model_weight_path):
196 | self.model = model_from_json(open(model_path).read())
197 | print('model loaded from file...')
198 | self.model.load_weights(model_weight_path)
199 | print('model weights loaded from file...')
200 |
201 | def load_vocab(self):
202 | vocab = defaultdict()
203 | with open(self._vocab_file_path, 'r') as f:
204 | for line in f.readlines():
205 | key, value = line.split('\t')
206 | vocab[key] = value
207 |
208 | return vocab
209 |
210 | def interactive(self, word_file_path, split_word_path, emoji_file_path):
211 | word_list, emoji_dict, split_word_list, abbreviation_dict = dh.load_resources(word_file_path, split_word_path,
212 | emoji_file_path,
213 | split_hashtag=True)
214 | self._vocab = self.load_vocab()
215 | text = ''
216 | while (text != 'exit'):
217 | text = input('Enter a query::')
218 | data = dh.parsedata(['{}\t{}\t{}'.format('id', -1, text)], word_list, split_word_list, emoji_dict,
219 | abbreviation_dict, normalize_text=True,
220 | split_hashtag=True,
221 | ignore_profiles=False)
222 |
223 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(data, self._vocab)
224 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
225 | print(self.__predict_line(tX))
226 |
227 | def predict_file(self, test_file, verbose=False):
228 | try:
229 | start = time.time()
230 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
231 | normalize_text=True, split_hashtag=True,
232 | ignore_profiles=False)
233 | end = time.time()
234 | if (verbose == True):
235 | print('test resource loading time::', (end - start))
236 |
237 | self._vocab = self.load_vocab()
238 | print('vocab loaded...')
239 |
240 | start = time.time()
241 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
242 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
243 | end = time.time()
244 | if (verbose == True):
245 | print('test resource preparation time::', (end - start))
246 |
247 | self.__predict_model(tX, self.test)
248 | except Exception as e:
249 | print('Error:', e)
250 | raise
251 |
252 | def __predict_line(self, tX):
253 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
254 | predicted = numpy.argmax(prediction_probability[0])
255 | return predicted, prediction_probability
256 |
257 | def __predict_model(self, tX, test):
258 | y = []
259 | y_pred = []
260 |
261 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
262 |
263 | try:
264 | fd = open(self._output_file + '.analysis', 'w')
265 | for i, (label) in enumerate(prediction_probability):
266 | gold_label = test[i][1]
267 | words = test[i][2]
268 | dimensions = test[i][3]
269 | context = test[i][4]
270 | author = test[i][5]
271 |
272 | predicted = numpy.argmax(prediction_probability[i])
273 |
274 | y.append(int(gold_label))
275 | y_pred.append(predicted)
276 |
277 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
278 | + str(gold_label) + '\t'
279 | + str(predicted) + '\t'
280 | + ' '.join(words))
281 |
282 | fd.write('\n')
283 |
284 | print()
285 |
286 | print('accuracy::', metrics.accuracy_score(y, y_pred))
287 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
288 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
289 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
290 | print('f_score::', metrics.classification_report(y, y_pred))
291 | fd.close()
292 | except Exception as e:
293 | print(e)
294 | raise
295 |
296 |
297 | if __name__ == "__main__":
298 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
299 | train_file = basepath + '/resource/train/Train_v1.txt'
300 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
301 | test_file = basepath + '/resource/test/Test_v1.txt'
302 | word_file_path = basepath + '/resource/word_list_freq.txt'
303 | split_word_path = basepath + '/resource/word_split.txt'
304 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
305 |
306 | output_file = basepath + '/resource/text_model/TestResults.txt'
307 | model_file = basepath + '/resource/text_model/weights/'
308 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
309 |
310 | # uncomment for training
311 | # tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
312 | # vocab_file_path, output_file)
313 |
314 | t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
315 | t.load_trained_model()
316 | # t.predict_file(test_file)
317 | t.interactive(word_file_path, split_word_path, emoji_file_path)
318 |
--------------------------------------------------------------------------------
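dh.write_vocab and the write_vocab methods in these scripts store the vocabulary as one word<TAB>index line per entry, while load_vocab reads the index back as a raw string that still carries the trailing newline. A minimal symmetric reader that restores integer indices, assuming that tab-separated format:

    def load_vocab(vocab_file_path):
        vocab = {}
        with open(vocab_file_path, 'r') as f:
            for line in f:
                key, value = line.rstrip('\n').split('\t')
                vocab[key] = int(value)  # index back as an int, newline stripped
        return vocab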
/src/sarcasm_detection_model_CNN_LSTM_DNN_word2vec.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | sys.path.append('../')
5 | import collections
6 | import time
7 | import numpy
8 |
9 | numpy.random.seed(1337)
10 | from sklearn import metrics
11 | from keras.models import Sequential, model_from_json
12 | from keras.layers.core import Dropout, Dense, Activation, Flatten, Reshape
13 | from keras.layers.embeddings import Embedding
14 | from keras.layers.recurrent import LSTM
15 | from keras.layers.convolutional import Convolution1D, MaxPooling1D
16 | from keras.callbacks import ModelCheckpoint
17 | from keras.callbacks import EarlyStopping
18 | from keras.optimizers import Adam
19 | from keras.utils import np_utils
20 | from collections import defaultdict
21 | import src.data_processing.data_handler as dh
22 |
23 |
24 | class sarcasm_model():
25 | _train_file = None
26 | _test_file = None
27 | _tweet_file = None
28 | _output_file = None
29 | _model_file = None
30 | _word_file_path = None
31 | _vocab_file_path = None
32 | _vocab = None
33 | _line_maxlen = None
34 |
35 | def __init__(self):
36 | self._line_maxlen = 30
37 |
38 | def _build_network(self, vocab_size, maxlen, emb_weights=[], hidden_units=256, trainable=False):
39 | print('Build model...')
40 | model = Sequential()
41 |
42 | model.add(Embedding(vocab_size, emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
43 | trainable=trainable))
44 |
45 | # model.add(Reshape((maxlen, emb_weights.shape[1], 1)))
46 |
47 | model.add(Convolution1D(emb_weights.shape[1], 3, kernel_initializer='he_normal', padding='valid',
48 | activation='sigmoid',
49 | input_shape=(1, maxlen)))
50 | # model.add(MaxPooling1D(pool_size=3))
51 |
52 | model.add(Convolution1D(emb_weights.shape[1], 3, kernel_initializer='he_normal', padding='valid',
53 | activation='sigmoid',
54 | input_shape=(1, maxlen - 2)))
55 | # model.add(MaxPooling1D(pool_size=3))
56 |
57 | model.add(Dropout(0.25))
58 |
59 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5,
60 | return_sequences=True))
61 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5))
62 |
63 | model.add(Dense(hidden_units, kernel_initializer='he_normal', activation='sigmoid'))
64 | model.add(Dense(2, activation='softmax'))
65 | adam = Adam(lr=0.0001)
66 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
67 | print('No of parameter:', model.count_params())
68 |
69 | print(model.summary())
70 | return model
71 |
72 |
73 | class train_model(sarcasm_model):
74 | train = None
75 | validation = None
76 | print("Loading resource...")
77 |
78 | def __init__(self, train_file, validation_file, word_file_path, model_file, vocab_file, output_file,
79 | word2vec_path=None, test_file=None):
80 |
81 | sarcasm_model.__init__(self)
82 |
83 | self._train_file = train_file
84 | self._validation_file = validation_file
85 | self._word_file_path = word_file_path
86 | self._model_file = model_file
87 | self._vocab_file_path = vocab_file
88 | self._output_file = output_file
89 | self._test_file = test_file
90 |
91 | self.load_train_validation_test_data()
92 |
93 | print(self._line_maxlen)
94 |
95 | # build vocabulary
96 | if (self._test_file != None):
97 | self._vocab = dh.build_vocab(self.train + self.validation + self.test, min_freq=2)
98 | else:
99 | self._vocab = dh.build_vocab(self.train + self.validation, min_freq=2)
100 |
101 | self._vocab['unk'] = len(self._vocab.keys()) + 1
102 |
103 | print(len(self._vocab.keys()) + 1)
104 | print('unk::', self._vocab['unk'])
105 |
106 | dh.write_vocab(self._vocab_file_path, self._vocab)
107 |
108 | # prepares input
109 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
110 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
111 |
112 | # prepares input
113 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
114 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
115 |
116 | # embedding dimension
117 | W = dh.get_word2vec_weight(self._vocab, n=300,
118 | path=word2vec_path)
119 |
120 | # solving class imbalance
121 | ratio = self.calculate_label_ratio(Y)
122 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
123 | print('class ratio::', ratio)
124 |
125 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
126 |
127 | print('train_X', X.shape)
128 | print('train_Y', Y.shape)
129 | print('validation_X', tX.shape)
130 | print('validation_Y', tY.shape)
131 |
132 | # trainable true if you want word2vec weights to be updated
133 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, trainable=False)
134 |
135 | open(self._model_file + 'model_wv.json', 'w').write(model.to_json())
136 | save_best = ModelCheckpoint(model_file + 'model_wv.json.hdf5', save_best_only=True)
137 | # save_all = ModelCheckpoint(self._model_file + 'weights_wv.{epoch:02d}.hdf5',
138 | # save_best_only=False)
139 | # early_stopping = EarlyStopping(monitor='val_loss', patience=25, verbose=1)
140 |
141 | # training
142 | model.fit(X, Y, batch_size=8, epochs=100, validation_data=(tX, tY), shuffle=True,
143 | callbacks=[save_best], class_weight=ratio)
144 |
145 | def load_train_validation_test_data(self):
146 | self.train = dh.loaddata(self._train_file, self._word_file_path, normalize_text=True,
147 | split_hashtag=True,
148 | ignore_profiles=False)
149 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, normalize_text=True,
150 | split_hashtag=True,
151 | ignore_profiles=False)
152 | if (self._test_file != None):
153 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True,
154 | split_hashtag=True,
155 | ignore_profiles=True)
156 |
157 | def get_maxlen(self):
158 | return max(map(len, (x for _, x in self.train + self.validation)))
159 |
160 | def write_vocab(self):
161 | with open(self._vocab_file_path, 'w') as fw:
162 | for key, value in self._vocab.items():
163 | fw.write(str(key) + '\t' + str(value) + '\n')
164 |
165 | def calculate_label_ratio(self, labels):
166 | return collections.Counter(labels)
167 |
168 |
169 | class test_model(sarcasm_model):
170 | test = None
171 | model = None
172 |
173 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file, input_weight_file_path=None):
174 | print('initializing...')
175 | sarcasm_model.__init__(self)
176 |
177 | self._word_file_path = word_file_path
178 | self._model_file = model_file
179 | self._vocab_file_path = vocab_file_path
180 | self._output_file = output_file
181 | self._input_weight_file_path = input_weight_file_path
182 |
183 | print('test_maxlen', self._line_maxlen)
184 |
185 | def load_trained_model(self, weight_file='model_wv.json.hdf5'):
186 | start = time.time()
187 | self.__load_model(self._model_file + 'model_wv.json', self._model_file + weight_file)
188 | end = time.time()
189 | print('model loading time::', (end - start))
190 |
191 | def __load_model(self, model_path, model_weight_path):
192 | self.model = model_from_json(open(model_path).read())
193 | print('model loaded from file...')
194 | self.model.load_weights(model_weight_path)
195 | print('model weights loaded from file...')
196 |
197 | def load_vocab(self):
198 | vocab = defaultdict()
199 | with open(self._vocab_file_path, 'r') as f:
200 | for line in f.readlines():
201 | key, value = line.split('\t')
202 | vocab[key] = value
203 |
204 | return vocab
205 |
206 | def predict(self, test_file, verbose=False):
207 | try:
208 | start = time.time()
209 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True, split_hashtag=True,
210 | ignore_profiles=True)
211 | end = time.time()
212 | if (verbose == True):
213 | print('test resource loading time::', (end - start))
214 |
215 | self._vocab = self.load_vocab()
216 |
217 | start = time.time()
218 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
219 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
220 | end = time.time()
221 | if (verbose == True):
222 | print('test resource preparation time::', (end - start))
223 |
224 | self.__predict_model(tX, self.test)
225 | except Exception as e:
226 | print('Error:', e)
227 |
228 | def __predict_model(self, tX, test):
229 | y = []
230 | y_pred = []
231 |
232 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
233 |
234 | try:
235 | fd = open(self._output_file + '_wv.analysis', 'w')
236 | for i, (label) in enumerate(prediction_probability):
237 | gold_label = test[i][0]
238 | words = test[i][1]
239 | dimensions = test[i][2]
240 | context = test[i][3]
241 | author = test[i][4]
242 |
243 | predicted = numpy.argmax(prediction_probability[i])
244 |
245 | y.append(int(gold_label))
246 | y_pred.append(predicted)
247 |
248 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
249 | + str(gold_label) + '\t'
250 | + str(predicted) + '\t'
251 | + ' '.join(words))
252 |
253 | fd.write('\n')
254 |
255 | print()
256 |
257 | print('accuracy::', metrics.accuracy_score(y, y_pred))
258 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
259 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
260 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
261 | print('f_score::', metrics.classification_report(y, y_pred))
262 | fd.close()
263 | except Exception as e:
264 | print(e)
265 |
266 |
267 | if __name__ == "__main__":
268 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
269 | train_file = basepath + '/resource/train/Train_v1.txt'
270 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
271 | test_file = basepath + '/resource/test/Test_v1.txt'
272 | word_file_path = basepath + '/resource/word_list.txt'
273 |
274 | output_file = basepath + '/resource/text_model/TestResults.txt'
275 | model_file = basepath + '/resource/text_model/weights/'
276 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
277 |
278 | # word2vec path
279 | word2vec_path = '/home/striker/word2vec/GoogleNews-vectors-negative300.bin'
280 |
281 | tr = train_model(train_file, validation_file, word_file_path, model_file, vocab_file_path, output_file,
282 | word2vec_path=word2vec_path, test_file=test_file)
283 |
284 | t = test_model(word_file_path, model_file, vocab_file_path, output_file)
285 | t.load_trained_model()
286 | t.predict(test_file)
287 |
--------------------------------------------------------------------------------
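dh.get_word2vec_weight is expected to return a matrix with one 300-dimensional row per vocabulary index, filled from the GoogleNews binary referenced in __main__. A minimal sketch of such a loader built on gensim's KeyedVectors; the helper name and the zero-row fallback are assumptions, not the repository's implementation:

    import numpy
    from gensim.models import KeyedVectors

    def build_embedding_matrix(vocab, path, n=300):
        kv = KeyedVectors.load_word2vec_format(path, binary=True)
        W = numpy.zeros((len(vocab) + 1, n))  # row 0 stays zero for padding
        for word, idx in vocab.items():
            if word in kv:
                W[idx] = kv[word]  # copy the pretrained vector
            # out-of-vocabulary words keep the zero row
        return W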
/src/sarcasm_detection_model_attention.py:
--------------------------------------------------------------------------------
1 | # still working
2 | import os
3 | import sys
4 | from keras.layers.pooling import MaxPooling2D, GlobalAveragePooling1D
5 |
6 | sys.path.append('../')
7 |
8 | import collections
9 | import time
10 | import numpy
11 |
12 | numpy.random.seed(1337)
13 | from sklearn import metrics
14 | from keras import initializers, regularizers, constraints, Input
15 | from keras.models import Sequential, model_from_json
16 | from keras.layers.core import Dropout, Dense, Activation, Reshape, Flatten, Layer
17 | from keras.layers.embeddings import Embedding
18 | from keras.layers.recurrent import LSTM
19 | from keras.layers.convolutional import Convolution1D, MaxPooling1D, Convolution2D
20 | from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
21 | from keras.callbacks import EarlyStopping
22 | from keras.optimizers import Adam
23 | from keras.utils import np_utils
24 | from collections import defaultdict
25 | import src.data_processing.data_handler as dh
26 |
27 | from keras import backend as K
28 |
29 |
30 | class Attention(Layer):
31 | def __init__(self,
32 | W_regularizer=None, b_regularizer=None,
33 | W_constraint=None, b_constraint=None,
34 | bias=True, **kwargs):
35 | """
36 | Keras Layer that implements an Attention mechanism for temporal data.
37 | Supports Masking.
38 | Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
39 | # Input shape
40 | 3D tensor with shape: `(samples, steps, features)`.
41 | # Output shape
42 | 2D tensor with shape: `(samples, features)`.
43 | :param kwargs:
44 | Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
45 | The dimensions are inferred based on the output shape of the RNN.
46 | Note: The layer has been tested with Keras 2.0.6
47 | Example:
48 | model.add(LSTM(64, return_sequences=True))
49 | model.add(Attention())
50 | # next add a Dense layer (for classification/regression) or whatever...
51 | """
52 | self.supports_masking = True
53 | self.init = initializers.get('glorot_uniform')
54 |
55 | self.W_regularizer = regularizers.get(W_regularizer)
56 | self.b_regularizer = regularizers.get(b_regularizer)
57 |
58 | self.W_constraint = constraints.get(W_constraint)
59 | self.b_constraint = constraints.get(b_constraint)
60 |
61 | self.bias = bias
62 | super(Attention, self).__init__(**kwargs)
63 |
64 | def build(self, input_shape):
65 | assert len(input_shape) == 3
66 |
67 | self.W = self.add_weight((input_shape[-1],),
68 | initializer=self.init,
69 | name='{}_W'.format(self.name),
70 | regularizer=self.W_regularizer,
71 | constraint=self.W_constraint)
72 | if self.bias:
73 | self.b = self.add_weight((input_shape[1],),
74 | initializer='zero',
75 | name='{}_b'.format(self.name),
76 | regularizer=self.b_regularizer,
77 | constraint=self.b_constraint)
78 | else:
79 | self.b = None
80 |
81 | self.built = True
82 |
83 | def compute_mask(self, input, input_mask=None):
84 | # do not pass the mask to the next layers
85 | return None
86 |
87 | def call(self, x, mask=None):
88 | eij = K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1)
89 |
90 | if self.bias:
91 | eij += self.b
92 |
93 | eij = K.tanh(eij)
94 |
95 | a = K.exp(eij)
96 |
97 | # apply mask after the exp. will be re-normalized next
98 | if mask is not None:
99 | # Cast the mask to floatX to avoid float64 upcasting in theano
100 | a *= K.cast(mask, K.floatx())
101 |
102 | # in some cases especially in the early stages of training the sum may be almost zero
103 | # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
104 | # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
105 | a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
106 |
107 | a = K.expand_dims(a)
108 |
109 | weighted_input = x * a
110 | return K.sum(weighted_input, axis=1)
111 |
112 | def compute_output_shape(self, input_shape):
113 | return (input_shape[0], input_shape[-1])
114 |
115 |
116 | class sarcasm_model():
117 | _train_file = None
118 | _test_file = None
119 | _tweet_file = None
120 | _output_file = None
121 | _model_file = None
122 | _word_file_path = None
123 | _split_word_file_path = None
124 | _emoji_file_path = None
125 | _vocab_file_path = None
126 | _input_weight_file_path = None
127 | _vocab = None
128 | _line_maxlen = None
129 |
130 | def __init__(self):
131 | self._line_maxlen = 50
132 |
133 | def _build_network(self, vocab_size, maxlen, embedding_dimension=256, hidden_units=256, trainable=False):
134 | print('Build model...')
135 | model = Sequential()
136 |
137 | # input = Input(shape=(maxlen,))
138 |
139 | # emb = Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal')(input)
140 |
141 | model.add(
142 | Embedding(vocab_size, embedding_dimension, input_length=maxlen, embeddings_initializer='glorot_normal'))
143 |
144 | model.add(
145 | Convolution1D(hidden_units, 2, kernel_initializer='he_normal', padding='valid',
146 | activation='sigmoid'))
147 | model.add(MaxPooling1D(pool_size=2))
148 | model.add(Dropout(0.25))
149 |
150 | model.add(LSTM(hidden_units, kernel_initializer='he_normal', activation='sigmoid', dropout=0.5,
151 | recurrent_dropout=0.5, unroll=True, return_sequences=True))
152 |
153 | model.add(Attention())
154 |
155 | # model.add(GlobalAveragePooling1D())
156 | # model.add(Dropout(0.5))
157 |
158 | model.add(Dense(2))
159 | model.add(Activation('softmax'))
160 | adam = Adam(lr=0.001)
161 | model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
162 | print('No of parameter:', model.count_params())
163 |
164 | print(model.summary())
165 | return model
166 |
167 |
168 | class train_model(sarcasm_model):
169 | train = None
170 | validation = None
171 | print("Loading resource...")
172 |
173 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
174 | vocab_file,
175 | output_file,
176 | input_weight_file_path=None):
177 | sarcasm_model.__init__(self)
178 |
179 | self._train_file = train_file
180 | self._validation_file = validation_file
181 | self._word_file_path = word_file_path
182 | self._split_word_file_path = split_word_path
183 | self._emoji_file_path = emoji_file_path
184 | self._model_file = model_file
185 | self._vocab_file_path = vocab_file
186 | self._output_file = output_file
187 | self._input_weight_file_path = input_weight_file_path
188 |
189 | self.load_train_validation_data()
190 |
191 | print(self._line_maxlen)
192 |
193 | # build vocabulary
194 | # truncates words with min freq=2
195 | self._vocab = dh.build_vocab(self.train, min_freq=2)
196 | if ('unk' not in self._vocab):
197 | self._vocab['unk'] = len(self._vocab.keys()) + 1
198 |
199 | print(len(self._vocab.keys()) + 1)
200 | print('unk::', self._vocab['unk'])
201 |
202 | dh.write_vocab(self._vocab_file_path, self._vocab)
203 |
204 | # prepares input
205 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab)
206 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
207 |
208 | # prepares input
209 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab)
210 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
211 |
212 | # embedding dimension
213 | dimension_size = 30
214 |
215 | # solving class imbalance
216 | ratio = self.calculate_label_ratio(Y)
217 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
218 | print('class ratio::', ratio)
219 |
220 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
221 |
222 | print('train_X', X.shape)
223 | print('train_Y', Y.shape)
224 | print('validation_X', tX.shape)
225 | print('validation_Y', tY.shape)
226 |
227 | # no pretrained embedding weights are used in this script; the embedding layer is learned from scratch
228 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, hidden_units=128,
229 | embedding_dimension=dimension_size,
230 | trainable=True)
231 |
232 | open(self._model_file + 'model.json', 'w').write(model.to_json())
233 | save_best = ModelCheckpoint(model_file + 'model.json.hdf5', save_best_only=True)
234 | save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}__.hdf5',
235 | save_best_only=False)
236 | early_stopping = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
237 | lr_tuner = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='auto',
238 | epsilon=0.0001,
239 | cooldown=0, min_lr=0.000001)
240 |
241 | # training
242 | model.fit(X, Y, batch_size=8, epochs=10, validation_data=(tX, tY), shuffle=True, verbose=2,
243 | callbacks=[save_best, save_all, early_stopping], class_weight=ratio)
244 | # model.fit(X, Y, batch_size=32, epochs=100, validation_split=0.1, shuffle=True, verbose=1,
245 | # callbacks=[save_best, lr_tuner, early_stopping], class_weight=ratio)
246 |
247 | def load_train_validation_data(self):
248 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
249 | self._emoji_file_path, normalize_text=True,
250 | split_hashtag=True,
251 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
252 | print('Training data loading finished...')
253 |
254 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
255 | self._emoji_file_path,
256 | normalize_text=True,
257 | split_hashtag=False,
258 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
259 | print('Validation data loading finished...')
260 |
261 | def get_maxlen(self):
262 | return max(map(len, (x for _, x in self.train + self.validation)))
263 |
264 | def write_vocab(self):
265 | with open(self._vocab_file_path, 'w') as fw:
266 |             for key, value in self._vocab.items():
267 | fw.write(str(key) + '\t' + str(value) + '\n')
268 |
269 | def calculate_label_ratio(self, labels):
270 | return collections.Counter(labels)
271 |
272 |
273 | class test_model(sarcasm_model):
274 | test = None
275 | model = None
276 |
277 | def __init__(self, model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file,
278 | input_weight_file_path=None):
279 | print('initializing...')
280 | sarcasm_model.__init__(self)
281 |
282 | self._model_file = model_file
283 | self._word_file_path = word_file_path
284 | self._split_word_file_path = split_word_path
285 | self._emoji_file_path = emoji_file_path
286 | self._vocab_file_path = vocab_file_path
287 | self._output_file = output_file
288 | self._input_weight_file_path = input_weight_file_path
289 |
290 | print('test_maxlen', self._line_maxlen)
291 |
292 | def load_trained_model(self, weight_file='model.json.hdf5'):
293 | start = time.time()
294 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
295 | end = time.time()
296 | print('model loading time::', (end - start))
297 |
298 | def __load_model(self, model_path, model_weight_path):
299 | self.model = model_from_json(open(model_path).read())
300 | print('model loaded from file...')
301 | self.model.load_weights(model_weight_path)
302 | print('model weights loaded from file...')
303 |
304 | def load_vocab(self):
305 | vocab = defaultdict()
306 | with open(self._vocab_file_path, 'r') as f:
307 | for line in f.readlines():
308 | key, value = line.split('\t')
309 | vocab[key] = value
310 |
311 | return vocab
312 |
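    |     # loads and vectorizes a held-out file with the saved vocabulary, then scores it with the trained model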
313 | def predict(self, test_file, verbose=False):
314 | try:
315 | start = time.time()
316 | self.test = dh.loaddata(test_file, self._word_file_path, self._split_word_file_path, self._emoji_file_path,
317 | normalize_text=True, split_hashtag=True,
318 | ignore_profiles=False, lowercase=False, n_grams=3, at_character=True)
319 | end = time.time()
320 |             if verbose:
321 | print('test resource loading time::', (end - start))
322 |
323 | self._vocab = self.load_vocab()
324 | print('vocab loaded...')
325 |
326 | start = time.time()
327 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
328 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
329 | end = time.time()
330 |             if verbose:
331 | print('test resource preparation time::', (end - start))
332 |
333 | self.__predict_model(tX, self.test)
334 | except Exception as e:
335 | print('Error:', e)
336 |
337 | def __predict_model(self, tX, test):
338 | y = []
339 | y_pred = []
340 |
341 | prediction_probability = self.model.predict_proba(tX, batch_size=1, verbose=1)
342 |
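    |         # write per-instance class probabilities to the .analysis file and print weighted metrics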
343 | try:
344 | fd = open(self._output_file + '.analysis', 'w')
345 | for i, (label) in enumerate(prediction_probability):
346 | id = test[i][0]
347 | gold_label = test[i][1]
348 | words = test[i][2]
349 | dimensions = test[i][3]
350 | context = test[i][4]
351 | author = test[i][5]
352 |
353 | predicted = numpy.argmax(prediction_probability[i])
354 |
355 | y.append(int(gold_label))
356 | y_pred.append(predicted)
357 |
358 | # fd.write(str(id) + '\t' + str(label[0]) + '\t' + str(label[1]) + '\t'
359 | # + str(gold_label) + '\t'
360 | # + str(predicted) + '\t'
361 | # + ' '.join(words))
362 | fd.write(str(id) + ',' + ','.join([str(l) for l in label]) + '\n')
363 |
364 | print()
365 |
366 | print('accuracy::', metrics.accuracy_score(y, y_pred))
367 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
368 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
369 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
370 |             print('classification_report::', metrics.classification_report(y, y_pred))
371 | fd.close()
372 | except Exception as e:
373 | print(e)
374 |
375 |
376 | if __name__ == "__main__":
377 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
378 | train_file = basepath + '/resource/train/Train_v1.txt'
379 | validation_file = basepath + '/resource/dev/Dev_v1.txt'
380 |     test_file = basepath + '/resource/test/Test_v1.txt'
381 | word_file_path = basepath + '/resource/word_list_freq.txt'
382 | split_word_path = basepath + '/resource/word_split.txt'
383 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
384 |
385 | output_file = basepath + '/resource/text_model/TestResults.txt'
386 | model_file = basepath + '/resource/text_model/weights/'
387 | vocab_file_path = basepath + '/resource/text_model/vocab_list.txt'
388 |
389 |     # training the model
390 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
391 | vocab_file_path, output_file)
392 |
393 | # t = test_model(model_file, word_file_path, split_word_path, emoji_file_path, vocab_file_path, output_file)
394 | # t.load_trained_model()
395 | # t.predict(test_file)
396 |
--------------------------------------------------------------------------------
/src/sarcasm_detection_moods_siamese.py:
--------------------------------------------------------------------------------
1 | # not finalized
2 | import os
3 | import collections
4 | import random
5 | import sys
6 |
7 | sys.path.append('../')
8 |
9 | import time
10 | import numpy
11 |
12 | numpy.random.seed(1337)
13 |
14 | from keras.layers.wrappers import TimeDistributed
15 | from keras import backend as K, regularizers
16 | from sklearn import metrics
17 | from keras.models import model_from_json
18 | from keras.layers.core import Dropout, Dense, Activation, Flatten, Reshape
19 | from keras.layers.embeddings import Embedding
20 | from keras.layers.recurrent import LSTM
21 | from keras.layers.convolutional import Convolution1D
22 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
23 |
24 | from keras.layers.merge import add, concatenate, subtract, multiply
25 | from keras.models import Model
26 | from keras.utils import np_utils
27 | from keras.layers import Input
28 | import src.data_processing.data_handler as dh
29 | from collections import defaultdict
30 |
31 |
32 | class sarcasm_model():
33 | _train_file = None
34 | _gold_data_path = None
35 | _validation_file = None
36 | _tweet_file = None
37 | # test_debug = None
38 | _output_file = None
39 | _model_file = None
40 | _word_file_path = None
41 | _vocab_file_path = None
42 | _input_weight_file_path = None
43 | _vocab = None
44 | _line_maxlen = None
45 |
46 | def __init__(self):
47 | self._train_file = None
48 | self._test_file = None
49 | self._validation_file = None
50 | self._tweet_file = None
51 | self._output_file = None
52 | self._model_file = None
53 | self._word_file_path = None
54 | self._vocab_file_path = None
55 | self._input_weight_file_path = None
56 | self._vocab = None
57 |
58 | self._line_maxlen = 30
59 |
60 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256, trainable=True,
61 | batch_size=1):
62 |
63 | print('Building model...')
64 |
65 | context_input = Input(name='context', batch_shape=(batch_size, maxlen))
66 |
67 | if (len(c_emb_weights) == 0):
68 | c_emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal',
69 | trainable=trainable)(context_input)
70 | else:
71 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights],
72 | trainable=trainable)(context_input)
73 |
74 | c_cnn1 = Convolution1D(int(hidden_units / 2), 5, kernel_initializer='he_normal', bias_initializer='he_normal',
75 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen))(c_emb)
76 | c_cnn2 = Convolution1D(hidden_units, 5, kernel_initializer='he_normal', bias_initializer='he_normal',
77 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen - 2))(
78 | c_cnn1)
79 |
80 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal',
81 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
82 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
83 | recurrent_regularizer=regularizers.l2(0.01),
84 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False)(c_cnn2)
85 |
86 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal',
87 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
88 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
89 | recurrent_regularizer=regularizers.l2(0.01),
90 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False,
91 | go_backwards=True)(c_cnn2)
92 |
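   |         # sum the forward and backward context encodings (c_lstm2 runs with go_backwards=True)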
93 | c_merged = add([c_lstm1, c_lstm2])
94 | c_merged = Dropout(0.25)(c_merged)
95 |
96 | text_input = Input(name='text', batch_shape=(batch_size, maxlen))
97 |
98 | if (len(emb_weights) == 0):
99 | emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal',
100 | trainable=trainable)(text_input)
101 | else:
102 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
103 | trainable=trainable)(text_input)
104 |
105 | t_cnn1 = Convolution1D(int(hidden_units / 2), 5, kernel_initializer='he_normal', bias_initializer='he_normal',
106 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen))(emb)
107 | t_cnn2 = Convolution1D(hidden_units, 5, kernel_initializer='he_normal', bias_initializer='he_normal',
108 | activation='sigmoid', padding='valid', use_bias=True, input_shape=(1, maxlen - 2))(
109 | t_cnn1)
110 |
111 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
112 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
113 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
114 | recurrent_regularizer=regularizers.l2(0.01),
115 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False)(t_cnn2)
116 |
117 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
118 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
119 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
120 | recurrent_regularizer=regularizers.l2(0.01),
121 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False,
122 | go_backwards=True)(t_cnn2)
123 |
124 | t_merged = add([t_lstm1, t_lstm2])
125 | t_merged = Dropout(0.25)(t_merged)
126 |
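    |         # auxiliary 11-dimensional 'awc' input (mood/affect features); multiplied with the reshaped text encoding below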
127 | awc_input = Input(name='awc', batch_shape=(batch_size, 11))
128 |
129 | t_merged = Reshape((-1, 1))(t_merged)
130 |
131 | t_merged = multiply([t_merged, awc_input])
132 |
133 | t_merged = Flatten()(t_merged)
134 |
135 | merged = concatenate([c_merged, t_merged], axis=1)
136 |
137 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged)
138 | dnn_1 = Dropout(0.25)(dnn_1)
139 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1)
140 |
141 | softmax = Activation('softmax')(dnn_2)
142 |
143 | model = Model(inputs=[context_input, text_input, awc_input], outputs=softmax)
144 |
145 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
146 |         print('No. of parameters:', model.count_params())
147 |
148 | print(model.summary())
149 | return model
150 |
151 |
152 | class train_model(sarcasm_model):
153 | train = None
154 | validation = None
155 |
156 | def load_train_validation_test_data(self):
157 | print("Loading resource...")
158 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
159 | self._emoji_file_path, normalize_text=True,
160 | split_hashtag=True,
161 | ignore_profiles=False)
162 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
163 | self._emoji_file_path,
164 | normalize_text=True,
165 | split_hashtag=True,
166 | ignore_profiles=False)
167 |
168 |         if self._test_file is not None:
169 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True,
170 | split_hashtag=True,
171 | ignore_profiles=True)
172 |
173 | def split_train_validation(self, train, ratio=.1):
174 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))])
175 | print(len(test_indices))
176 | train_data = []
177 | validation_data = []
178 | for i, t in enumerate(train):
179 |             if i in test_indices:
180 | validation_data.append(t)
181 | else:
182 | train_data.append(t)
183 | return train_data, validation_data
184 |
185 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
186 | vocab_file,
187 | output_file,
188 | input_weight_file_path=None):
189 | sarcasm_model.__init__(self)
190 |
191 | self._train_file = train_file
192 | self._validation_file = validation_file
193 | self._word_file_path = word_file_path
194 | self._split_word_file_path = split_word_path
195 | self._emoji_file_path = emoji_file_path
196 | self._model_file = model_file
197 | self._vocab_file_path = vocab_file
198 | self._output_file = output_file
199 | self._input_weight_file_path = input_weight_file_path
200 |
201 | self.load_train_validation_test_data()
202 |
203 | batch_size = 32
204 |
205 | print(self._line_maxlen)
206 | self._vocab = dh.build_vocab(self.train, ignore_context=False)
207 | self._vocab['unk'] = len(self._vocab.keys()) + 1
208 |
209 | print(len(self._vocab.keys()) + 1)
210 | print('unk::', self._vocab['unk'])
211 |
212 | dh.write_vocab(self._vocab_file_path, self._vocab)
213 |
214 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None)
215 |
216 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None)
217 |
218 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
219 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
220 | D = dh.pad_sequence_1d(D, maxlen=11)
221 |
222 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
223 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
224 | tD = dh.pad_sequence_1d(tD, maxlen=11)
225 |
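    |         # 300-dimensional word2vec vectors (Google News binary) initialize both embeddings; trainable=False keeps them frozen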
226 | hidden_units = 128
227 | dimension_size = 300
228 |
229 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
230 | path=word2vec_path)
231 |
232 | cW = W
233 |
234 | print('Word2vec obtained....')
235 |
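    |         # weight each class by the inverse of its frequency to offset label imbalance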
236 | ratio = self.calculate_label_ratio(Y)
237 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
238 |
239 | print('ratio', ratio)
240 |
241 | dimension_vocab = numpy.unique(D)
242 | print(len(dimension_vocab))
243 |
244 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
245 |
246 | print('train_X', X.shape)
247 | print('train_C', C.shape)
248 | print('train_D', D.shape)
249 | print('train_Y', Y.shape)
250 |
251 | print('validation_X', tX.shape)
252 | print('validation_C', tC.shape)
253 | print('validation_D', tD.shape)
254 | print('validation_Y', tY.shape)
255 |
256 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW,
257 |                                     hidden_units=hidden_units, trainable=False,
258 | batch_size=batch_size)
259 |
260 | open(self._model_file + 'model.json', 'w').write(model.to_json())
261 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss')
262 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
263 | # save_best_only=False)
264 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
265 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
266 | epsilon=0.0001,
267 | cooldown=0, min_lr=0.000001)
268 |
269 | model.fit([C, X, D], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX, tD], tY), shuffle=True,
270 | callbacks=[save_best, lr_tuner], class_weight=ratio)
271 |
272 | def get_maxlen(self):
273 | return max(map(len, (x for _, x in self.train + self.validation)))
274 |
275 | def write_vocab(self):
276 | with open(self._vocab_file_path, 'w') as fw:
277 |             for key, value in self._vocab.items():
278 | fw.write(str(key) + '\t' + str(value) + '\n')
279 |
280 |     def calculate_label_ratio(self, labels):
281 | return collections.Counter(labels)
282 |
283 |
284 | class test_model(sarcasm_model):
285 | test = None
286 | model = None
287 |
288 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file):
289 | print('initializing...')
290 | sarcasm_model.__init__(self)
291 |
292 | self._word_file_path = word_file_path
293 | self._model_file = model_file
294 | self._vocab_file_path = vocab_file_path
295 | self._output_file = output_file
296 |
297 | # self._line_maxlen = 45
298 | print('test_maxlen', self._line_maxlen)
299 |
300 | def predict_cross_validation(self, tC, tX, tD, test):
301 | self.__predict_model([tC, tX, tD], test)
302 |
303 | def load_trained_model(self, weight_file='model.json.hdf5'):
304 | start = time.time()
305 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
306 | end = time.time()
307 | print('model loading time::', (end - start))
308 |
309 | def __load_model(self, model_path, model_weight_path):
310 | self.model = model_from_json(open(model_path).read())
311 | print('model loaded from file...')
312 | self.model.load_weights(model_weight_path)
313 | print('model weights loaded from file...')
314 |
315 | def load_vocab(self):
316 | vocab = defaultdict()
317 | with open(self._vocab_file_path, 'r') as f:
318 | for line in f.readlines():
319 | key, value = line.split('\t')
320 | vocab[key] = value
321 |
322 | return vocab
323 |
324 | def predict(self, test_file, verbose=False):
325 | start = time.time()
326 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True,
327 | split_hashtag=True,
328 | ignore_profiles=False)
329 | end = time.time()
330 |         if verbose:
331 | print('test resource loading time::', (end - start))
332 |
333 | self._vocab = self.load_vocab()
334 |
335 | start = time.time()
336 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
337 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
338 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
339 | tD = dh.pad_sequence_1d(tD, maxlen=11)
340 |
341 | end = time.time()
342 |         if verbose:
343 | print('test resource preparation time::', (end - start))
344 |
345 | self.__predict_model([tC, tX, tD], self.test)
346 |
347 | def __predict_model(self, tX, test):
348 |         prediction_probability = self.model.predict(tX, batch_size=8, verbose=1)
349 |
350 | y = []
351 | y_pred = []
352 |
353 | fd = open(self._output_file + '.analysis', 'w')
354 | for i, (label) in enumerate(prediction_probability):
355 | gold_label = test[i][0]
356 | words = test[i][1]
357 | dimensions = test[i][2]
358 | context = test[i][3]
359 | author = test[i][4]
360 |
361 | predicted = numpy.argmax(prediction_probability[i])
362 |
363 | y.append(int(gold_label))
364 | y_pred.append(predicted)
365 |
366 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
367 | + str(gold_label) + '\t'
368 | + str(predicted) + '\t'
369 | + ' '.join(words) + '\t'
370 | + str(dimensions) + '\t'
371 | + ' '.join(context))
372 |
373 | fd.write('\n')
374 |
375 | print('accuracy::', metrics.accuracy_score(y, y_pred))
376 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
377 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
378 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
379 |         print('classification_report::', metrics.classification_report(y, y_pred))
380 |
381 | fd.close()
382 |
383 |
384 | if __name__ == "__main__":
385 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
386 | train_file = basepath + '/resource/train/Train_context_moods_v1.txt'
387 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt'
388 | test_file = basepath + '/resource/test/Test_context_AW.txt'
389 | word_file_path = basepath + '/resource/word_list_freq.txt'
390 | split_word_path = basepath + '/resource/word_split.txt'
391 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
392 |
393 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt'
394 | model_file = basepath + '/resource/text_context_awc_model/weights/'
395 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt'
396 |
397 | # word2vec path
398 | word2vec_path = '/home/word2vec/GoogleNews-vectors-negative300.bin'
399 |
400 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
401 | vocab_file_path, output_file)
402 |
403 | # testing the model
404 | # with K.get_session():
405 | # t = test_model(word_file_path, model_file, vocab_file_path, output_file)
406 | # t.load_trained_model()
407 | # t.predict(test_file)
408 |
--------------------------------------------------------------------------------
/src/sarcasm_detection_siamese.py:
--------------------------------------------------------------------------------
1 | # not finalized
2 | import os
3 | import collections
4 | import random
5 | import sys
6 |
7 | sys.path.append('../')
8 |
9 | import time
10 | import numpy
11 |
12 | numpy.random.seed(1337)
13 |
14 | from keras.layers.wrappers import TimeDistributed
15 | from keras import backend as K, regularizers
16 | from sklearn import metrics
17 | from keras.models import model_from_json
18 | from keras.layers.core import Dropout, Dense, Activation, Flatten
19 | from keras.layers.embeddings import Embedding
20 | from keras.layers.recurrent import LSTM
21 | from keras.layers.convolutional import Convolution1D
22 | from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
23 |
24 | from keras.layers.merge import add, concatenate, subtract
25 | from keras.models import Model
26 | from keras.utils import np_utils
27 | from keras.layers import Input
28 | import src.data_processing.data_handler as dh
29 | from collections import defaultdict
30 |
31 |
32 | class sarcasm_model():
33 | _train_file = None
34 | _gold_data_path = None
35 | _validation_file = None
36 | _tweet_file = None
37 | # test_debug = None
38 | _output_file = None
39 | _model_file = None
40 | _word_file_path = None
41 | _vocab_file_path = None
42 | _input_weight_file_path = None
43 | _vocab = None
44 | _line_maxlen = None
45 |
46 | def __init__(self):
47 | self._train_file = None
48 | self._test_file = None
49 | self._validation_file = None
50 | self._tweet_file = None
51 | self._output_file = None
52 | self._model_file = None
53 | self._word_file_path = None
54 | self._vocab_file_path = None
55 | self._input_weight_file_path = None
56 | self._vocab = None
57 |
58 | self._line_maxlen = 30
59 |
60 | def _build_network(self, vocab_size, maxlen, emb_weights=[], c_emb_weights=[], hidden_units=256, trainable=True,
61 | batch_size=1):
62 |
63 | print('Building model...')
64 |
65 | context_input = Input(name='context', batch_shape=(batch_size, maxlen))
66 |
67 | if (len(c_emb_weights) == 0):
68 | c_emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal',
69 | trainable=trainable)(context_input)
70 | else:
71 | c_emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[c_emb_weights],
72 | trainable=trainable)(context_input)
73 |
74 | c_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal',
75 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
76 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
77 | recurrent_regularizer=regularizers.l2(0.01),
78 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False)(c_emb)
79 |
80 | c_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='orthogonal',
81 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
82 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
83 | recurrent_regularizer=regularizers.l2(0.01),
84 | dropout=0.25, recurrent_dropout=.0, unit_forget_bias=False, return_sequences=False,
85 | go_backwards=True)(c_emb)
86 |
87 | c_merged = add([c_lstm1, c_lstm2])
88 | c_merged = Dropout(0.25)(c_merged)
89 |
90 | text_input = Input(name='text', batch_shape=(batch_size, maxlen))
91 |
92 | if (len(emb_weights) == 0):
93 | emb = Embedding(vocab_size, 256, input_length=maxlen, embeddings_initializer='glorot_normal',
94 | trainable=trainable)(text_input)
95 | else:
96 | emb = Embedding(vocab_size, c_emb_weights.shape[1], input_length=maxlen, weights=[emb_weights],
97 | trainable=trainable)(text_input)
98 |
99 | t_lstm1 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
100 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
101 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
102 | recurrent_regularizer=regularizers.l2(0.01),
103 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False)(emb)
104 |
105 | t_lstm2 = LSTM(hidden_units, kernel_initializer='he_normal', recurrent_initializer='he_normal',
106 | bias_initializer='he_normal', activation='sigmoid', recurrent_activation='sigmoid',
107 | kernel_regularizer=regularizers.l2(0.01), activity_regularizer=regularizers.l2(0.01),
108 | recurrent_regularizer=regularizers.l2(0.01),
109 | dropout=0.25, recurrent_dropout=0.25, unit_forget_bias=False, return_sequences=False,
110 | go_backwards=True)(emb)
111 |
112 | t_merged = add([t_lstm1, t_lstm2])
113 | t_merged = Dropout(0.25)(t_merged)
114 |
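    |         # siamese-style combination: the classifier sees the difference between the context and text encodings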
115 | merged = subtract([c_merged, t_merged])
116 |
117 | dnn_1 = Dense(hidden_units, kernel_initializer="he_normal", activation='sigmoid')(merged)
118 | dnn_1 = Dropout(0.25)(dnn_1)
119 | dnn_2 = Dense(2, activation='sigmoid')(dnn_1)
120 |
121 | softmax = Activation('softmax')(dnn_2)
122 |
123 | model = Model(inputs=[context_input, text_input], outputs=softmax)
124 |
125 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
126 |         print('No. of parameters:', model.count_params())
127 |
128 | print(model.summary())
129 | return model
130 |
131 |
132 | class train_model(sarcasm_model):
133 | train = None
134 | validation = None
135 |
136 | def load_train_validation_test_data(self):
137 | print("Loading resource...")
138 | self.train = dh.loaddata(self._train_file, self._word_file_path, self._split_word_file_path,
139 | self._emoji_file_path, normalize_text=True,
140 | split_hashtag=True,
141 | ignore_profiles=False)
142 | self.validation = dh.loaddata(self._validation_file, self._word_file_path, self._split_word_file_path,
143 | self._emoji_file_path,
144 | normalize_text=True,
145 | split_hashtag=True,
146 | ignore_profiles=False)
147 |
148 |         if self._test_file is not None:
149 | self.test = dh.loaddata(self._test_file, self._word_file_path, normalize_text=True,
150 | split_hashtag=True,
151 | ignore_profiles=True)
152 |
153 | def split_train_validation(self, train, ratio=.1):
154 | test_indices = sorted([i for i in random.sample(range(len(train)), int(len(train) * ratio))])
155 | print(len(test_indices))
156 | train_data = []
157 | validation_data = []
158 | for i, t in enumerate(train):
159 |             if i in test_indices:
160 | validation_data.append(t)
161 | else:
162 | train_data.append(t)
163 | return train_data, validation_data
164 |
165 | def __init__(self, train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
166 | vocab_file,
167 | output_file,
168 | input_weight_file_path=None):
169 | sarcasm_model.__init__(self)
170 |
171 | self._train_file = train_file
172 | self._validation_file = validation_file
173 | self._word_file_path = word_file_path
174 | self._split_word_file_path = split_word_path
175 | self._emoji_file_path = emoji_file_path
176 | self._model_file = model_file
177 | self._vocab_file_path = vocab_file
178 | self._output_file = output_file
179 | self._input_weight_file_path = input_weight_file_path
180 |
181 | self.load_train_validation_test_data()
182 |
183 | batch_size = 32
184 |
185 | print(self._line_maxlen)
186 | self._vocab = dh.build_vocab(self.train, ignore_context=False)
187 | self._vocab['unk'] = len(self._vocab.keys()) + 1
188 |
189 | print(len(self._vocab.keys()) + 1)
190 | print('unk::', self._vocab['unk'])
191 |
192 | dh.write_vocab(self._vocab_file_path, self._vocab)
193 |
194 | X, Y, D, C, A = dh.vectorize_word_dimension(self.train, self._vocab, drop_dimension_index=None)
195 |
196 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.validation, self._vocab, drop_dimension_index=None)
197 |
198 | X = dh.pad_sequence_1d(X, maxlen=self._line_maxlen)
199 | C = dh.pad_sequence_1d(C, maxlen=self._line_maxlen)
200 | D = dh.pad_sequence_1d(D, maxlen=11)
201 |
202 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
203 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
204 | tD = dh.pad_sequence_1d(tD, maxlen=11)
205 |
206 | hidden_units = 128
207 | dimension_size = 300
208 |
209 | W = dh.get_word2vec_weight(self._vocab, n=dimension_size,
210 | path=word2vec_path)
211 |
212 | cW = W
213 |
214 | print('Word2vec obtained....')
215 |
216 | ratio = self.calculate_label_ratio(Y)
217 | ratio = [max(ratio.values()) / value for key, value in ratio.items()]
218 |
219 | print('ratio', ratio)
220 |
221 | dimension_vocab = numpy.unique(D)
222 | print(len(dimension_vocab))
223 |
224 | Y, tY = [np_utils.to_categorical(x) for x in (Y, tY)]
225 |
226 | print('train_X', X.shape)
227 | print('train_C', C.shape)
228 | print('train_D', D.shape)
229 | print('train_Y', Y.shape)
230 |
231 | print('validation_X', tX.shape)
232 | print('validation_C', tC.shape)
233 | print('validation_D', tD.shape)
234 | print('validation_Y', tY.shape)
235 |
236 | model = self._build_network(len(self._vocab.keys()) + 1, self._line_maxlen, emb_weights=W, c_emb_weights=cW,
237 | hidden_units=hidden_units, trainable=False,
238 | batch_size=batch_size)
239 |
240 | open(self._model_file + 'model.json', 'w').write(model.to_json())
241 | save_best = ModelCheckpoint(self._model_file + 'model.json.hdf5', save_best_only=True, monitor='val_loss')
242 | # save_all = ModelCheckpoint(self._model_file + 'weights.{epoch:02d}-{val_loss:.2f}.hdf5',
243 | # save_best_only=False)
244 | early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)
245 | lr_tuner = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=10, verbose=1, mode='auto',
246 | epsilon=0.0001,
247 | cooldown=0, min_lr=0.000001)
248 |
249 | model.fit([C, X], Y, batch_size=batch_size, epochs=100, validation_data=([tC, tX], tY), shuffle=True,
250 | callbacks=[save_best, lr_tuner], class_weight=ratio)
251 |
252 | def get_maxlen(self):
253 | return max(map(len, (x for _, x in self.train + self.validation)))
254 |
255 | def write_vocab(self):
256 | with open(self._vocab_file_path, 'w') as fw:
257 |             for key, value in self._vocab.items():
258 | fw.write(str(key) + '\t' + str(value) + '\n')
259 |
260 |     def calculate_label_ratio(self, labels):
261 | return collections.Counter(labels)
262 |
263 |
264 | class test_model(sarcasm_model):
265 | test = None
266 | model = None
267 |
268 | def __init__(self, word_file_path, model_file, vocab_file_path, output_file):
269 | print('initializing...')
270 | sarcasm_model.__init__(self)
271 |
272 | self._word_file_path = word_file_path
273 | self._model_file = model_file
274 | self._vocab_file_path = vocab_file_path
275 | self._output_file = output_file
276 |
277 | # self._line_maxlen = 45
278 | print('test_maxlen', self._line_maxlen)
279 |
280 | def predict_cross_validation(self, tC, tX, tD, test):
281 | self.__predict_model([tC, tX, tD], test)
282 |
283 | def load_trained_model(self, weight_file='model.json.hdf5'):
284 | start = time.time()
285 | self.__load_model(self._model_file + 'model.json', self._model_file + weight_file)
286 | end = time.time()
287 | print('model loading time::', (end - start))
288 |
289 | def __load_model(self, model_path, model_weight_path):
290 | self.model = model_from_json(open(model_path).read())
291 | print('model loaded from file...')
292 | self.model.load_weights(model_weight_path)
293 | print('model weights loaded from file...')
294 |
295 | def load_vocab(self):
296 | vocab = defaultdict()
297 | with open(self._vocab_file_path, 'r') as f:
298 | for line in f.readlines():
299 | key, value = line.split('\t')
300 | vocab[key] = value
301 |
302 | return vocab
303 |
304 | def predict(self, test_file, verbose=False):
305 | start = time.time()
306 | self.test = dh.loaddata(test_file, self._word_file_path, normalize_text=True,
307 | split_hashtag=True,
308 | ignore_profiles=False)
309 | end = time.time()
310 |         if verbose:
311 | print('test resource loading time::', (end - start))
312 |
313 | self._vocab = self.load_vocab()
314 |
315 | start = time.time()
316 | tX, tY, tD, tC, tA = dh.vectorize_word_dimension(self.test, self._vocab)
317 | tX = dh.pad_sequence_1d(tX, maxlen=self._line_maxlen)
318 | tC = dh.pad_sequence_1d(tC, maxlen=self._line_maxlen)
319 | tD = dh.pad_sequence_1d(tD, maxlen=11)
320 |
321 | end = time.time()
322 |         if verbose:
323 | print('test resource preparation time::', (end - start))
324 |
325 | self.__predict_model([tC, tX, tD], self.test)
326 |
327 | def __predict_model(self, tX, test):
328 |         prediction_probability = self.model.predict(tX, batch_size=8, verbose=1)
329 |
330 | y = []
331 | y_pred = []
332 |
333 | fd = open(self._output_file + '.analysis', 'w')
334 | for i, (label) in enumerate(prediction_probability):
335 | gold_label = test[i][0]
336 | words = test[i][1]
337 | dimensions = test[i][2]
338 | context = test[i][3]
339 | author = test[i][4]
340 |
341 | predicted = numpy.argmax(prediction_probability[i])
342 |
343 | y.append(int(gold_label))
344 | y_pred.append(predicted)
345 |
346 | fd.write(str(label[0]) + '\t' + str(label[1]) + '\t'
347 | + str(gold_label) + '\t'
348 | + str(predicted) + '\t'
349 | + ' '.join(words) + '\t'
350 | + str(dimensions) + '\t'
351 | + ' '.join(context))
352 |
353 | fd.write('\n')
354 |
355 | print('accuracy::', metrics.accuracy_score(y, y_pred))
356 | print('precision::', metrics.precision_score(y, y_pred, average='weighted'))
357 | print('recall::', metrics.recall_score(y, y_pred, average='weighted'))
358 | print('f_score::', metrics.f1_score(y, y_pred, average='weighted'))
359 |         print('classification_report::', metrics.classification_report(y, y_pred))
360 |
361 | fd.close()
362 |
363 |
364 | if __name__ == "__main__":
365 | basepath = os.getcwd()[:os.getcwd().rfind('/')]
366 | train_file = basepath + '/resource/train/Train_context_moods_v1.txt'
367 | validation_file = basepath + '/resource/dev/Dev_context_moods.txt'
368 | test_file = basepath + '/resource/test/Test_context_AW.txt'
369 | word_file_path = basepath + '/resource/word_list_freq.txt'
370 | split_word_path = basepath + '/resource/word_split.txt'
371 | emoji_file_path = basepath + '/resource/emoji_unicode_names_final.txt'
372 |
373 | output_file = basepath + '/resource/text_context_awc_model/TestResults.txt'
374 | model_file = basepath + '/resource/text_context_awc_model/weights/'
375 | vocab_file_path = basepath + '/resource/text_context_awc_model/vocab_list.txt'
376 |
377 | # word2vec path
378 | word2vec_path = '/home/word2vec/GoogleNews-vectors-negative300.bin'
379 |
380 | tr = train_model(train_file, validation_file, word_file_path, split_word_path, emoji_file_path, model_file,
381 | vocab_file_path, output_file)
382 |
383 | # testing the model
384 | # with K.get_session():
385 | # t = test_model(word_file_path, model_file, vocab_file_path, output_file)
386 | # t.load_trained_model()
387 | # t.predict(test_file)
388 |
--------------------------------------------------------------------------------