├── .gitignore ├── LICENSE ├── README.md ├── bad_dict.py ├── badwords.list ├── capsule_block.py ├── check_badwords.py ├── config.py ├── conv_cap.py ├── kfoldpostprocess.py ├── net.py ├── postprocess.py ├── preprocess.py ├── rake_parse.py ├── rename_result.py ├── test.py ├── test_k_fold.py ├── train.py ├── train_k_fold.py ├── train_multi.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | __pycache__ 3 | *.pyc 4 | *.txt 5 | *.params 6 | *.csv 7 | *.swp 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Yan Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CapsuleNet solution for the Toxic Comment Classification Challenge 2 | [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) Kaggle script with a naive CapsuleNet in MXNet 3 | ---- 4 | ## Some updates for those who starred this project 5 | 1. Is it working? 6 | 7 | Yes. 8 | 9 | 2. How is the performance? 10 | 11 | My 10-fold model scored 0.9859 on the public leaderboard, but only 0.984x on the private leaderboard. 12 | 13 | 3. Will you update it? 14 | 15 | No. The competition is over, and you'd better not share solutions through GitHub, since that violates the Kaggle rules. 16 | 17 | 4. How to use it? 18 | 19 | * Install the required Python libraries. 20 | * Run `python train_k_fold.py` for training (multi-GPU supported; please refer to MXNet). 21 | * Run `python test_k_fold.py` for testing (single GPU only). 22 | 23 | 5. Last thing to mention: I slightly changed the squash function (see the sketch below), and I don't like either form of it. 24 | 6. Future work 25 | 26 | * Maybe use it in object detection. Working on the MXNet Pikachu example now. 27 | * Implement routing with EM. 28 | ## Thank you for your Stars. 
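For reference, the squash change mentioned in item 5 lives in `capsule_block.py` and `conv_cap.py`. Below is a minimal sketch, assuming MXNet NDArray inputs (the function names are illustrative only), contrasting the standard CapsNet squash, which is kept as a comment in the source, with the simplified unit-length variant this repo actually uses; the original scaling term is commented out there because it can run into numerical trouble when the squared norm is very small.

```python
from mxnet import nd

def squash_original(x, axis):
    # standard CapsNet squash: rescales each capsule vector to a length in (0, 1)
    s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True)
    scale = s_squared_norm / ((1 + s_squared_norm) * nd.sqrt(s_squared_norm + 1e-9))
    return x * scale

def squash_simplified(x, axis):
    # variant used in capsule_block.py / conv_cap.py: plain L2 normalization along `axis`
    s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True)
    return x / nd.sqrt(s_squared_norm + 1e-9)
```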
29 | 30 | ## Reference 31 | [comment_toxic](https://github.com/jcjview/comment_toxic) 32 | -------------------------------------------------------------------------------- /bad_dict.py: -------------------------------------------------------------------------------- 1 | def get_bad_word_dict(): 2 | lines = open('badwords.list').readlines() 3 | lines = [l.lower().strip('\n') for l in lines] 4 | lines = [l.split(',') for l in lines] 5 | bad_dict = {} 6 | for v in lines: 7 | if len(v) == 2: 8 | bad_dict[v[0]] =v[1] 9 | return bad_dict 10 | 11 | if __name__ == '__main__': 12 | print(get_bad_word_dict()) 13 | -------------------------------------------------------------------------------- /badwords.list: -------------------------------------------------------------------------------- 1 | kung fu, kungfu 2 | animalfucker,animal fucker 3 | anuses,anus 4 | a s s,ass 5 | asses,ass 6 | a55,ass 7 | azz,ass 8 | assclowns,ass clowns 9 | asskicked,ass kicked 10 | asswhipe,ass whipe 11 | ass hole,asshole 12 | asswhole,asshole 13 | asswipe,asshole 14 | arsehole,asshole 15 | ahole,asshole 16 | assface,asshole 17 | ashole,asshole 18 | asholes,asshole 19 | asswipes,asshole 20 | azzhole,asshole 21 | butthole,asshole 22 | badassness,bad ass 23 | basterds,bastard 24 | biatch,bitch 25 | bicth,bitch 26 | b*tch,bitch 27 | b!tch,bitch 28 | bitchs,bitch 29 | b i t c h,bitch 30 | b1tch,bitch 31 | bitches,bitch 32 | bitchass,bitch ass 33 | bitchmattythewhite,bitch matty the white 34 | bitchmother,bitch mother 35 | blowjobs,blowjob 36 | blow job,blowjob 37 | bollocks,bollock 38 | boners,boner 39 | boobs,boob 40 | bowels,bowel 41 | boymamas,boy mamas 42 | b u m,bum 43 | vbutt,butt 44 | wikitheclown,clown 45 | c0cks,cock 46 | c0ck,cock 47 | c o c k,cock 48 | cockhead,cock 49 | corpsefucking,corpse fuck 50 | kunt,cunt 51 | cntz,cunt 52 | cnts,cunt 53 | c u n t,cunt 54 | cuntbag,cunt bag 55 | cuntface,cunt face 56 | cuntfranks,cunt franks 57 | cuntliz,cunt liz 58 | dik,dick 59 | d!ck,dick 60 | d*ck,dick 61 | dickbag,dick bag 62 | dickbig,dick big 63 | dickbreath,dick breath 64 | dickbutt,dick butt 65 | dickheaditalic,dick head italic 66 | fagz,fag 67 | f a g,fag 68 | fagit,faggot 69 | faget,faggot 70 | faggit,faggot 71 | failepic,fail epic 72 | fatass,fat ass 73 | f*ck,fuck 74 | fuckk,fuck 75 | fcuk,fuck 76 | fuk,fuck 77 | fukkers,fuck 78 | fking,fuck 79 | ofuck,fuck 80 | fuckwads,fuck 81 | fck ,fuck 82 | fuc ,fuck 83 | fuckiest,fuck 84 | phuc,fuck 85 | phuk,fuck 86 | f uck,fuck 87 | fu ck,fuck 88 | fuking,fuck 89 | fcken,fuck 90 | fcking,fuck 91 | fukk,fuck 92 | f ing,fuck 93 | fuk ,fuck 94 | 'fuck,fuck 95 | fuckingabf,fuck 96 | fuckan,fuck 97 | fuckon,fuck 98 | f**k,fuck 99 | fuckedy,fuck 100 | phuck,fuck 101 | f'uck,fuck 102 | fuked,fuck 103 | fukcing,fuck 104 | fu*k,fuck 105 | f u c k,fuck 106 | fckin,fuck 107 | fuckass,fuck ass 108 | fuckbags,fuck bags 109 | fuckhole,fuck hole 110 | fucksex,fuck sex 111 | fuckstick,fuck stick 112 | fukyou,fuck you 113 | f you,fuck you 114 | fu ,fuck you 115 | f you,fuck you 116 | fuckyourself,fuck yourself 117 | fuker,fucker 118 | fukker,fucker 119 | fuken,fucking 120 | fukkin,fucking 121 | fuckign,fucking 122 | fuckin ,fucking 123 | fukin,fucking 124 | gayboy,gay 125 | gayz,gay 126 | goddamn,god damn 127 | h4x0r,hacker 128 | h e l l,hell 129 | wikihomosexuals,homosexual 130 | i d i o t,idiot 131 | wikipedidiots,idiot 132 | itiot,idiot 133 | itsuck,it suck 134 | wikijews,jew 135 | jpgsuck,jpg suck 136 | knobend,knob end 137 | lesbo,lesbian 138 | l m f a o,lmfao 139 | marcolfuck,marcol 
fuck 140 | masterbate,masturbate 141 | motherfu,mother fuck 142 | mothjer,mother fuck 143 | mothafuckin,mother fuck 144 | mother fukker,motherfucker 145 | mutha fucker,motherfucker 146 | mofo,motherfucker 147 | mother fucker,motherfucker 148 | n i g g e r,nigger 149 | nigga,nigger 150 | niggertard,nigger tard 151 | oldlady,old lady 152 | packi,paki 153 | wikipedophiles,pedophile 154 | p e n i s,penis 155 | pensnsnniensnsn,penis 156 | penus,penis 157 | penistown,penis 158 | pen1s,penis 159 | pneis,penis 160 | penas,penis 161 | p i s s,piss 162 | polack,polak 163 | polac,polak 164 | popsucker,pop sucker 165 | pr0n,porn 166 | pr1c,prick 167 | pusse,pussy 168 | wikiretards,retard 169 | s h i t,s hit 170 | sexsex,sex 171 | shioty,shit 172 | sh1t,shit 173 | sh1ts,shit 174 | shitter,shit 175 | shiot,shit 176 | shitler,shit 177 | sh!t,shit 178 | shoit,shit 179 | shitty,shit 180 | shyt,shit 181 | shity,shit 182 | shitlol,shit 183 | shitush,shit 184 | shitfuck,shit fuck 185 | sluts,slut 186 | slutty,slut 187 | s l u t,slut 188 | s u c k,suck 189 | suckish,suck 190 | suckipedia,suck 191 | suckdickeer,suck dick 192 | sucksfrozen,suck frozen 193 | suckersyou,suck you 194 | sux,sucks 195 | t i t,tit 196 | titties,tit 197 | titty,tit 198 | titt,tit 199 | tits,tit 200 | t u r d,turd 201 | w a n k,wank 202 | wikiwankers,wanker 203 | w a n k e r,wanker 204 | wtf,what the fuck 205 | w h o r e,whore 206 | hore,whore 207 | hoer,whore 208 | wh0re,whore 209 | whored,whore 210 | h0re,whore 211 | hoar,whore 212 | whores,whore 213 | h0r,whore 214 | w t f,wtf 215 | -------------------------------------------------------------------------------- /capsule_block.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | from mxnet import init 3 | from mxnet import nd 4 | from mxnet.gluon import nn 5 | from mxnet import initializer 6 | 7 | 8 | 9 | def squash(x, axis): 10 | s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True) 11 | # if s_squared_norm is really small, we will be in trouble 12 | # so I removed the s_quare terms 13 | # scale = s_squared_norm / ((1 + s_squared_norm) * nd.sqrt(s_squared_norm + 1e-9)) 14 | # return x * scale 15 | scale = nd.sqrt(s_squared_norm + 1e-9) 16 | return x / scale 17 | 18 | 19 | class CapConvBlock(nn.Block): 20 | def __init__(self, num_cap, channels, context, kernel_size=(9,9), padding=(0,0), 21 | strides=(1,1), route_num=3, **kwargs): 22 | super(CapConvBlock, self).__init__(**kwargs) 23 | self.num_cap = num_cap 24 | self.cap = nn.Conv2D(channels=channels*num_cap, kernel_size=kernel_size, 25 | strides=strides, padding=padding) 26 | self.route_num = route_num 27 | 28 | def forward(self, x): 29 | conv_out = nd.expand_dims(self.cap(x), axis=2) 30 | conv_out = conv_out.reshape((0,-1,self.num_cap,0,0)) 31 | conv_out = squash(conv_out, 1) 32 | return conv_out 33 | 34 | class CapFullyBlock(nn.Block): 35 | def __init__(self, num_locations, num_cap, input_units, units, 36 | route_num=3, **kwargs): 37 | super(CapFullyBlock, self).__init__(**kwargs) 38 | self.route_num = route_num 39 | self.num_cap = num_cap 40 | self.units = units 41 | self.num_locations = num_locations 42 | self.w_ij = self.params.get( 43 | 'weight', shape=(input_units, units, self.num_cap, self.num_locations) 44 | ,init=init.Xavier()) 45 | 46 | def forward(self, x): 47 | # reshape x into [batch_size, channel, num_previous_cap] 48 | x_reshape = nd.transpose(x,(0,2,1,3,4)).reshape((0,0,-1)) 49 | return self.Route(x_reshape) 50 | 51 | def Route(self, x): 52 | # b_mat = 
nd.repeat(self.b_mat.data(), repeats=x.shape[0], axis=0)#nd.stop_gradient(nd.repeat(self.b_mat.data(), repeats=x.shape[0], axis=0)) 53 | b_mat = nd.zeros((x.shape[0],1,self.num_cap, self.num_locations), ctx=x.context) 54 | x_expand = nd.expand_dims(nd.expand_dims(x, axis=2),2) 55 | w_expand = nd.repeat(nd.expand_dims(self.w_ij.data(x.context),axis=0), repeats=x.shape[0], axis=0) 56 | u_ = w_expand*x_expand 57 | # u_ = nd.abs(w_expand - x_expand) 58 | u = nd.sum(u_, axis = 1) 59 | u_no_gradient = nd.stop_gradient(u) 60 | for i in range(self.route_num): 61 | c_mat = nd.softmax(b_mat, axis=2) 62 | if i == self.route_num -1: 63 | s = nd.sum(u * c_mat, axis=-1) 64 | else: 65 | s = nd.sum(u_no_gradient * c_mat, axis=-1) 66 | v = squash(s, 1) 67 | v1 = nd.expand_dims(v, axis=-1) 68 | if i != self.route_num - 1: 69 | update_term = nd.sum(u_no_gradient*v1, axis=1, keepdims=True) 70 | b_mat = b_mat + update_term 71 | return v 72 | 73 | 74 | class CapFullyNGBlock(nn.Block): 75 | def __init__(self, num_locations, num_cap, input_units, units, 76 | route_num=3, **kwargs): 77 | super(CapFullyNGBlock, self).__init__(**kwargs) 78 | self.route_num = route_num 79 | self.num_cap = num_cap 80 | self.units = units 81 | self.num_locations = num_locations 82 | self.w_ij = self.params.get( 83 | 'weight', shape=(input_units, units, self.num_cap, self.num_locations) 84 | ,init=init.Xavier()) 85 | 86 | def forward(self, x): 87 | # reshape x into [batch_size, channel, num_previous_cap] 88 | x_reshape = nd.transpose(x,(0,2,1,3,4)).reshape((0,0,-1)) 89 | return self.Route(x_reshape) 90 | 91 | def Route(self, x): 92 | b_mat = nd.zeros((x.shape[0],1,self.num_cap, self.num_locations), ctx=x.context) 93 | x_expand = nd.expand_dims(nd.expand_dims(x, axis=2),2) 94 | w_expand = nd.repeat(nd.expand_dims(self.w_ij.data(x.context),axis=0), repeats=x.shape[0], axis=0) 95 | u_ = w_expand*x_expand 96 | u = nd.sum(u_, axis = 1) 97 | for i in range(self.route_num): 98 | c_mat = nd.softmax(b_mat, axis=2) 99 | s = nd.sum(u * c_mat, axis=-1) 100 | v = squash(s, 1) 101 | v1 = nd.expand_dims(v, axis=-1) 102 | update_term = nd.sum(u * v1, axis=1, keepdims=True) 103 | b_mat = b_mat + update_term 104 | return v 105 | 106 | 107 | class CapFullyEuBlock(nn.Block): 108 | def __init__(self, num_locations, num_cap, input_units, units, 109 | route_num=3, **kwargs): 110 | super(CapFullyEuBlock, self).__init__(**kwargs) 111 | self.route_num = route_num 112 | self.num_cap = num_cap 113 | self.units = units 114 | self.num_locations = num_locations 115 | self.w_ij = self.params.get( 116 | 'weight', shape=(input_units, units, self.num_cap, self.num_locations) 117 | ,init=init.Xavier()) 118 | 119 | def forward(self, x): 120 | # reshape x into [batch_size, channel, num_previous_cap] 121 | # print x.shape 122 | 123 | x_reshape = nd.transpose(x,(0,2,1,3,4)).reshape((0,0,-1)) 124 | return self.Route(x_reshape) 125 | 126 | def Route(self, x): 127 | # print x.context 128 | b_mat = nd.zeros((x.shape[0],1,self.num_cap, self.num_locations), ctx=x.context) 129 | x_expand = nd.expand_dims(nd.expand_dims(x, axis=2),2) 130 | w_expand = nd.repeat(nd.expand_dims(self.w_ij.data(x.context),axis=0), repeats=x.shape[0], axis=0) 131 | u_ = w_expand*x_expand 132 | u = nd.sum(u_, axis = 1) 133 | # u_ = nd.square(w_expand - x_expand) 134 | # u = -nd.sum(u_, axis = 1) 135 | u_no_gradient = nd.stop_gradient(u) 136 | for i in range(self.route_num): 137 | # c_mat = nd.softmax(b_mat, axis=2) 138 | c_mat = nd.sigmoid(b_mat) 139 | if i == self.route_num -1: 140 | s = nd.sum(u * c_mat, 
axis=-1) 141 | else: 142 | s = nd.sum(u_no_gradient * c_mat, axis=-1) 143 | v = squash(s, 1) 144 | if i != self.route_num - 1: 145 | v1 = nd.expand_dims(v, axis=-1) 146 | update_term = nd.sum(u_no_gradient*v1, axis=1, keepdims=True) 147 | b_mat = b_mat + update_term 148 | # b_mat = update_term 149 | # else: 150 | # v = s 151 | return v 152 | 153 | class LengthBlock(nn.Block): 154 | def __init__(self, **kwargs): 155 | super(LengthBlock, self).__init__(**kwargs) 156 | 157 | def forward(self, x): 158 | x = nd.sqrt(nd.sum(nd.square(x), 1)) 159 | return x 160 | 161 | class ActBlock(nn.Block): 162 | def __init__(self, **kwargs): 163 | super(ActBlock, self).__init__(**kwargs) 164 | 165 | def forward(self, x): 166 | x = nd.sigmoid(nd.sum(nd.square(x), 1)) 167 | return x 168 | -------------------------------------------------------------------------------- /check_badwords.py: -------------------------------------------------------------------------------- 1 | from bad_dict import get_bad_word_dict 2 | import re 3 | import os 4 | import numpy as np 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from collections import OrderedDict 8 | 9 | data_path = 'data' 10 | train = 'train.csv' 11 | test = 'test.csv' 12 | train_raw = pd.read_csv(os.path.join(data_path, train)) 13 | raw_value = train_raw['comment_text'].fillna("_na_").values 14 | 15 | 16 | def text_parse(text, remove_stopwords=False, stem_words=False): 17 | wiki_reg=r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 18 | url_reg=r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 19 | ip_reg='\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 20 | WIKI_LINK=' WIKILINKREPLACER ' 21 | URL_LINK=' URLLINKREPLACER ' 22 | IP_LINK=' IPLINKREPLACER ' 23 | #clear link 24 | c = re.findall(wiki_reg, text) 25 | for u in c: 26 | text = text.replace(u, WIKI_LINK) 27 | c = re.findall(url_reg, text) 28 | for u in c: 29 | text = text.replace(u, WIKI_LINK) 30 | c = re.findall(wiki_reg, text) 31 | for u in c: 32 | text = text.replace(u, URL_LINK) 33 | c = re.findall(ip_reg, text) 34 | for u in c: 35 | text = text.replace(u, IP_LINK) 36 | 37 | bad_word_dict = get_bad_word_dict() 38 | # Regex to remove all Non-Alpha Numeric and space 39 | special_character_removal = re.compile(r'[^A-Za-z\d!?*\'_ ]', re.IGNORECASE) 40 | # regex to replace all numerics 41 | replace_numbers = re.compile(r'\b\d+\b', re.IGNORECASE) 42 | text = text.lower().split() 43 | # Optionally, remove stop words 44 | if remove_stopwords: 45 | stops = set(stopwords.words("english")) 46 | text = [w for w in text if not w in stops] 47 | text = " ".join(text) 48 | # Remove Special Characters 49 | text = special_character_removal.sub(' ', text) 50 | found_dict = {k:False for k in bad_word_dict.keys()} 51 | for k,v in bad_word_dict.items(): 52 | if text.find(k) >= 0: 53 | found_dict[k]=True 54 | return found_dict 55 | 56 | bad_word_dict = get_bad_word_dict() 57 | appeared = {k:False for k in bad_word_dict.keys()} 58 | for l in tqdm(raw_value): 59 | status = text_parse(l) 60 | for k, v in status.items(): 61 | if v: 62 | appeared[k]=True 63 | cleaned_dict = {} 64 | for k, v in appeared.items(): 65 | if v: 66 | cleaned_dict[k] = bad_word_dict[k] 67 | cleaned_dict = OrderedDict(sorted(cleaned_dict.items(), key=lambda t: t[1])) 68 | 69 | with open('cleaned_badwords.list', 'w') as f: 70 | for k, v in cleaned_dict.items(): 71 | if k == v: 72 | continue 73 | f.write(k+','+ v +'\n') 74 | -------------------------------------------------------------------------------- /config.py: 
-------------------------------------------------------------------------------- 1 | MAX_LENGTH = 500 2 | MAX_WORDS = 200000 3 | EMBEDDING_DIM = 300 4 | -------------------------------------------------------------------------------- /conv_cap.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | from mxnet import init 3 | from mxnet import nd 4 | from mxnet.gluon import nn 5 | from mxnet import initializer 6 | 7 | def squash(x, axis): 8 | s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True) 9 | # if s_squared_norm is really small, we will be in trouble 10 | # so I removed the s_quare terms 11 | # scale = s_squared_norm / ((1 + s_squared_norm) * nd.sqrt(s_squared_norm + 1e-9)) 12 | # return x * scale 13 | scale = nd.sqrt(s_squared_norm + 1e-9) 14 | return x / scale 15 | 16 | class PrimeConvCap(nn.Block): 17 | def __init__(self, num_cap, num_filter, kernel_size=(3,3), 18 | strides=(1,1), padding=(1,1), **kwargs): 19 | super(PrimeConvCap, self).__init__(**kwargs) 20 | self.num_cap = num_cap 21 | self.cap = nn.Conv2D(channels=(num_cap*num_filter), kernel_size=kernel_size, 22 | padding=padding, strides=strides) 23 | # self.bn = nn.BatchNorm() 24 | 25 | def forward(self, x): 26 | conv_out = nd.expand_dims(self.cap(x), axis=2) 27 | # conv_out = nd.expand_dims(self.bn(self.cap(x)), axis=2) 28 | conv_out = conv_out.reshape((0,self.num_cap,-1,0,0)) 29 | conv_out = squash(conv_out, 2) 30 | # print conv_out.shape 31 | return conv_out 32 | 33 | 34 | class AdvConvCap(nn.Block): 35 | def __init__(self, num_cap, num_filter, 36 | num_cap_in, num_filter_in, 37 | route_num=3, kernel_size=(3,3), 38 | strides=(1,1), padding=(1,1), 39 | **kwargs): 40 | super(AdvConvCap, self).__init__(**kwargs) 41 | self.num_cap = num_cap 42 | self.num_filter = num_filter 43 | self.route_num = route_num 44 | self.num_cap_in = num_cap_in 45 | # num_filter_in * num_cap_in filters divided in num_cap_in groups 46 | # with each group output size as num_cap * num_filter 47 | self.cap = nn.Conv2D(channels=(num_cap * num_filter * num_cap_in), kernel_size=kernel_size, strides=strides, 48 | padding=padding, groups= num_cap_in) 49 | # self.bn = nn.BatchNorm() 50 | 51 | def forward(self, x): 52 | x_reshape = x.reshape((x.shape[0], -1, x.shape[3], x.shape[4])) 53 | cap_out = self.cap(x_reshape) 54 | cap_out = cap_out.reshape((cap_out.shape[0], self.num_cap_in, self.num_cap, 55 | self.num_filter, cap_out.shape[2], cap_out.shape[3])) 56 | return self.route(cap_out) 57 | ''' 58 | cap_out = self.cap(x) 59 | cap_out = cap_out.reshape((x.shape[0],self.num_cap,-1,cap_out.shape[2], cap_out.shape[3])) 60 | cap_out = squash(cap_out, 2) 61 | return cap_out 62 | ''' 63 | 64 | def route(self, u): 65 | b_mat = nd.zeros((u.shape[0], self.num_cap_in, self.num_cap, 1, u.shape[4], u.shape[5]), ctx=u.context) 66 | for i in range(self.route_num): 67 | c_mat = nd.softmax(b_mat, axis=2) 68 | s = nd.sum(u * c_mat, axis=1) 69 | v = squash(s, 2) 70 | if i != self.route_num - 1: 71 | v1 = nd.expand_dims(v, axis=1) 72 | update_term = nd.sum(u*v1, axis=3, keepdims=True) 73 | b_mat = b_mat + update_term 74 | return v 75 | ''' 76 | class AdvFullyCap(nn.Block): 77 | def __init__(self, num_cap, num_filter, 78 | num_cap_in, num_filter_in, 79 | route_num=3, **kwargs): 80 | self.num_cap = num_cap 81 | self.num_filter = num_filter 82 | self.route_num = route_num 83 | self.num_cap_in = num_cap_in 84 | # num_filter_in * num_cap_in filters divided in num_cap_in groups 85 | # with each group output size as num_cap * 
num_filter 86 | self.cap = nn.Conv2D(channels=(num_cap * num_filter * num_cap_in), kernel_size=(1,1), groups= num_cap_in) 87 | 88 | def forward(self, x): 89 | x_reshape = x.reshape((x.shape[0], -1, x.shape[3], x.shape[4])) 90 | cap_out = self.cap(x_reshape) 91 | cap_out = cap_out.reshape((cap_out.shape[0], self.num_cap_in, self.num_cap, 92 | self.num_filter, cap_out.shape[2], cap_out.shape[3])) 93 | return cap_out 94 | ''' 95 | -------------------------------------------------------------------------------- /kfoldpostprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from preprocess import get_label 4 | 5 | kfold=5 6 | for i in range(kfold): 7 | # result_path = 'data/result'+str(i)+'.csv' 8 | result_path = 'result'+str(i)+'.csv' 9 | result = pd.read_csv(result_path) 10 | pred_label = get_label(result) 11 | if i==0: 12 | mean_result = pred_label 13 | else: 14 | # mean_result *= pred_label 15 | mean_result += pred_label 16 | 17 | # mean_result = np.power(mean_result, 1.0/kfold) 18 | mean_result = mean_result / kfold 19 | 20 | labels = ['toxic', 'severe_toxic', 21 | 'obscene', 'threat', 22 | 'insult', 'identity_hate'] 23 | result[labels] = mean_result 24 | result.to_csv('result_kfold.csv', index=False) 25 | 26 | PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4 27 | mean_result **= PROBABILITIES_NORMALIZE_COEFFICIENT 28 | # mean_result =np.log(mean_result) 29 | # mean_result -=0.5 30 | # mean_result =np.exp(mean_result) 31 | result[labels] = mean_result 32 | result.to_csv('postprocessing1.csv', index=False) 33 | -------------------------------------------------------------------------------- /net.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | from mxnet import init 3 | from mxnet import nd 4 | from mxnet.gluon import nn,rnn 5 | from conv_cap import PrimeConvCap, AdvConvCap 6 | from capsule_block import CapFullyBlock, CapFullyEuBlock, CapFullyNGBlock, LengthBlock, ActBlock 7 | import config 8 | 9 | def net_define(): 10 | net = nn.Sequential() 11 | with net.name_scope(): 12 | net.add(nn.Embedding(config.MAX_WORDS, config.EMBEDDING_DIM)) 13 | net.add(rnn.GRU(128,layout='NTC',bidirectional=True, num_layers=2, dropout=0.2)) 14 | net.add(transpose(axes=(0,2,1))) 15 | # net.add(nn.MaxPool2D(pool_size=(config.MAX_LENGTH,1))) 16 | # net.add(nn.Conv2D(128, kernel_size=(101,1), padding=(50,0), groups=128,activation='relu')) 17 | net.add(PrimeConvCap(8,32, kernel_size=(1,1), padding=(0,0))) 18 | # net.add(AdvConvCap(8,32,8,32, kernel_size=(1,1), padding=(0,0))) 19 | net.add(CapFullyBlock(8*(config.MAX_LENGTH)/2, num_cap=12, input_units=32, units=16, route_num=5)) 20 | # net.add(CapFullyBlock(8*(config.MAX_LENGTH-8), num_cap=12, input_units=32, units=16, route_num=5)) 21 | # net.add(CapFullyBlock(8, num_cap=12, input_units=32, units=16, route_num=5)) 22 | net.add(nn.Dropout(0.2)) 23 | # net.add(LengthBlock()) 24 | net.add(nn.Dense(6, activation='sigmoid')) 25 | net.initialize(init=init.Xavier()) 26 | return net 27 | 28 | def net_define_eu(): 29 | net = nn.Sequential() 30 | with net.name_scope(): 31 | net.add(nn.Embedding(config.MAX_WORDS, config.EMBEDDING_DIM)) 32 | net.add(rnn.GRU(128,layout='NTC',bidirectional=True, num_layers=1, dropout=0.2)) 33 | net.add(transpose(axes=(0,2,1))) 34 | net.add(nn.GlobalMaxPool1D()) 35 | ''' 36 | net.add(FeatureBlock1()) 37 | ''' 38 | net.add(extendDim(axes=3)) 39 | net.add(PrimeConvCap(16, 32, kernel_size=(1,1), 
padding=(0,0),strides=(1,1))) 40 | net.add(CapFullyNGBlock(16, num_cap=12, input_units=32, units=16, route_num=3)) 41 | net.add(nn.Dropout(0.2)) 42 | net.add(nn.Dense(6, activation='sigmoid')) 43 | net.initialize(init=init.Xavier()) 44 | return net 45 | 46 | 47 | class extendDim(nn.Block): 48 | def __init__(self, axes, **kwargs): 49 | super(extendDim, self).__init__(**kwargs) 50 | self.axes = axes 51 | 52 | def forward(self, x): 53 | x1 = nd.expand_dims(x, axis=self.axes) 54 | return x1 55 | 56 | class reduceDim(nn.Block): 57 | def __init__(self, **kwargs): 58 | super(reduceDim, self).__init__(**kwargs) 59 | 60 | def forward(self, x): 61 | x1 = x.reshape((x.shape[0], x.shape[1], -1)) 62 | return x1 63 | 64 | 65 | class transpose(nn.Block): 66 | def __init__(self, axes, **kwargs): 67 | super(transpose, self).__init__(**kwargs) 68 | self.axes = axes 69 | 70 | def forward(self, x): 71 | return nd.transpose(x, axes=self.axes)# .reshape((0,0,0,1)) 72 | 73 | class fullyReshape(nn.Block): 74 | def __init__(self, axes, **kwargs): 75 | super(fullyReshape, self).__init__(**kwargs) 76 | self.axes = axes 77 | 78 | def forward(self, x): 79 | return nd.transpose(x, axes=self.axes).reshape((0,0,0,1,1)) 80 | 81 | # hard coding feature Block 82 | class FeatureBlock(nn.Block): 83 | def __init__(self, **kwargs): 84 | super(FeatureBlock, self).__init__(**kwargs) 85 | self.gru = rnn.GRU(128,layout='NTC',bidirectional=True, num_layers=1, dropout=0.2) 86 | self.conv3 = nn.Conv1D(channels=128, kernel_size=5, padding=2, strides=1, activation='relu') 87 | self.conv5 = nn.Conv1D(channels=128, kernel_size=9, padding=4, strides=1, activation='relu') 88 | self.conv7 = nn.Conv1D(channels=128, kernel_size=13, padding=6, strides=1, activation='relu') 89 | self.conv_drop = nn.Dropout(0.2) 90 | 91 | def forward(self, x): 92 | gru_out = self.gru(x) 93 | gru_out_t = nd.transpose(gru_out, axes=(0,2,1)) 94 | 95 | x_t = nd.transpose(x, axes=(0,2,1)) 96 | conv3_out = self.conv3(x_t) 97 | conv5_out = self.conv5(x_t) 98 | conv7_out = self.conv7(x_t) 99 | conv_out = nd.concat(*[conv3_out, conv5_out, conv7_out], dim=1) 100 | conv_out = self.conv_drop(conv_out) 101 | concated_feature = nd.concat(*[gru_out_t, conv_out], dim=1) 102 | return concated_feature 103 | 104 | # hard coding feature1 Block 105 | class FeatureBlock1(nn.Block): 106 | def __init__(self, **kwargs): 107 | super(FeatureBlock1, self).__init__(**kwargs) 108 | self.gru = rnn.GRU(128,layout='NTC',bidirectional=True, num_layers=1, dropout=0.2) 109 | self.conv3 = nn.Conv1D(channels=128, kernel_size=3, padding=1, strides=1, activation='relu') 110 | self.conv5 = nn.Conv1D(channels=128, kernel_size=3, padding=1, strides=1, activation='relu') 111 | self.conv7 = nn.Conv1D(channels=128, kernel_size=3, padding=1, strides=1, activation='relu') 112 | # self.gru_post_max = nn.MaxPool1D(pool_size=2) 113 | # self.gru_post_ave = nn.AvgPool1D(pool_size=2) 114 | self.gru_maxpool = nn.GlobalMaxPool1D() 115 | self.conv_maxpool = nn.GlobalMaxPool1D() 116 | ''' 117 | self.gru_avepool = nn.GlobalAvgPool1D() 118 | self.conv_avepool = nn.GlobalAvgPool1D() 119 | ''' 120 | self.conv_drop = nn.Dropout(0.5) 121 | 122 | def forward(self, x): 123 | x_t = nd.transpose(x, axes=(0,2,1)) 124 | conv3_out = self.conv3(x_t) 125 | conv5_out = self.conv5(conv3_out) + conv3_out 126 | conv7_out = self.conv7(conv5_out) + conv5_out 127 | # conv_out = nd.concat(*[conv3_out, conv5_out, conv7_out], dim=1) 128 | conv_out = self.conv_drop(conv7_out) 129 | conv_max_pooled = self.conv_maxpool(conv_out) 130 | 131 | 
gru_out = self.gru(x) 132 | gru_out_t = nd.transpose(gru_out, axes=(0,2,1)) 133 | # gru_pooled = nd.transpose(gru_out, axes=(0,2,1)) 134 | # gru_maxpooled = self.gru_post_max(gru_out_t) 135 | # return gru_maxpooled 136 | # gru_avepooled = self.gru_post_ave(gru_out_t) 137 | # gru_pooled = nd.concat(*[gru_maxpooled, gru_avepooled], dim=1) 138 | 139 | # gru_pooled = nd.concat(*[gru_maxpooled, gru_avepooled], dim=1) 140 | gru_maxpooled = self.gru_maxpool(gru_out_t) 141 | # gru_avepooled = self.gru_maxpool(gru_out_t) 142 | # gru_pooled = nd.concat(*[gru_maxpooled, gru_avepooled], dim=1) 143 | 144 | # conv_ave_pooled = self.conv_avepool(conv_out) 145 | concated_feature = nd.concat(*[gru_maxpooled, conv_max_pooled], dim=1) 146 | return concated_feature 147 | -------------------------------------------------------------------------------- /postprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from preprocess import get_label 4 | 5 | result_path = 'result.csv' 6 | result = pd.read_csv(result_path) 7 | labels = ['toxic', 'severe_toxic', 8 | 'obscene', 'threat', 9 | 'insult', 'identity_hate'] 10 | mean_result = get_label(result) 11 | PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4 12 | mean_result **= PROBABILITIES_NORMALIZE_COEFFICIENT 13 | result[labels] = mean_result 14 | result.to_csv('postprocessing1.csv', index=False) 15 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | import pandas as pd 5 | import config 6 | from tqdm import tqdm 7 | from multiprocessing import Pool 8 | from keras.preprocessing import text, sequence 9 | from bad_dict import get_bad_word_dict 10 | from rake_parse import rake_parse 11 | 12 | def get_raw_data(path): 13 | data = pd.read_csv(path) 14 | process_data = get_data(data) 15 | data['comment_text'] = process_data 16 | return data 17 | 18 | def get_data(raw_data): 19 | raw_value = raw_data['comment_text'].fillna("_na_").values 20 | pool = Pool() 21 | processed_data = list(tqdm(pool.imap(text_parse, raw_value),total=raw_value.shape[0])) 22 | ''' 23 | with open('debug.txt', 'w') as f: 24 | for l in processed_data: 25 | f.write(l+'\n') 26 | ''' 27 | return processed_data 28 | 29 | def text_parse(text, remove_stopwords=False, stem_words=False): 30 | wiki_reg=r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 31 | url_reg=r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 32 | url_reg2=r'www.[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 33 | ip_reg='\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 34 | WIKI_LINK=' WIKILINKREPLACER ' 35 | URL_LINK=' URLLINKREPLACER ' 36 | IP_LINK=' IPLINKREPLACER ' 37 | #clear link 38 | # replace endline with '. ' 39 | endline = re.compile(r'.?\n', re.IGNORECASE) 40 | text = endline.sub('. 
', text) 41 | 42 | c = re.findall(wiki_reg, text) 43 | for u in c: 44 | text = text.replace(u, WIKI_LINK) 45 | c = re.findall(url_reg, text) 46 | for u in c: 47 | text = text.replace(u, URL_LINK) 48 | c = re.findall(url_reg2, text) 49 | for u in c: 50 | text = text.replace(u, URL_LINK) 51 | c = re.findall(ip_reg, text) 52 | for u in c: 53 | text = text.replace(u, IP_LINK) 54 | 55 | bad_word_dict = get_bad_word_dict() 56 | # Regex to remove all Non-Alpha Numeric and space 57 | special_character_removal = re.compile(r'[^A-Za-z\d!?*\'.,; ]', re.IGNORECASE) 58 | # regex to replace all numerics 59 | replace_numbers = re.compile(r'\b\d+\b', re.IGNORECASE) 60 | text = text.lower().split() 61 | # Optionally, remove stop words 62 | if remove_stopwords: 63 | stops = set(stopwords.words("english")) 64 | text = [w for w in text if not w in stops] 65 | text = " ".join(text) 66 | # Remove Special Characters 67 | text = special_character_removal.sub(' ', text) 68 | for k,v in bad_word_dict.items(): 69 | # bad_reg = re.compile('[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ]'+ re.escape(k) +'[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ]') 70 | bad_reg = re.compile('[\W]?'+ re.escape(k) +'[\W]|[\W]' + re.escape(k) + '[\W]?') 71 | text = bad_reg.sub(' '+ v +' ', text) 72 | ''' 73 | bad_reg = re.compile('[\W]'+ re.escape(k) +'[\W]?') 74 | text = bad_reg.sub(' '+ v, text) 75 | bad_reg = re.compile('[\W]?'+ re.escape(k) +'[\W]') 76 | text = bad_reg.sub(v + ' ', text) 77 | ''' 78 | 79 | # Replace Numbers 80 | text = replace_numbers.sub('NUMBERREPLACER', text) 81 | text =text.split() 82 | text = " ".join(text) 83 | 84 | if stem_words: 85 | text = text.split() 86 | stemmer = SnowballStemmer('english') 87 | stemmed_words = [stemmer.stem(word) for word in text] 88 | text = " ".join(stemmed_words) 89 | # rake parsing 90 | text = rake_parse(text) 91 | return text 92 | 93 | def text_to_wordlist(text, remove_stopwords=False, stem_words=False): 94 | # Clean the text, with the option to remove stopwords and to stem words. 
95 | # Convert words to lower case and split them 96 | wiki_reg=r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 97 | url_reg=r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 98 | ip_reg='\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 99 | WIKI_LINK=' WIKI_LINK ' 100 | URL_LINK=' URL_LINK ' 101 | IP_LINK=' IP_LINK ' 102 | #clear link 103 | c = re.findall(wiki_reg, text) 104 | for u in c: 105 | text = text.replace(u, WIKI_LINK) 106 | c = re.findall(url_reg, text) 107 | for u in c: 108 | text = text.replace(u, WIKI_LINK) 109 | c = re.findall(wiki_reg, text) 110 | for u in c: 111 | text = text.replace(u, URL_LINK) 112 | c = re.findall(ip_reg, text) 113 | 114 | # Regex to remove all Non-Alpha Numeric and space 115 | special_character_removal = re.compile(r'[^A-Za-z\d!?*\' ]', re.IGNORECASE) 116 | # regex to replace all numerics 117 | replace_numbers = re.compile(r'\d+', re.IGNORECASE) 118 | 119 | # text = text.lower().split() 120 | text = text.split() 121 | # Optionally, remove stop words 122 | if remove_stopwords: 123 | stops = set(stopwords.words("english")) 124 | text = [w for w in text if not w in stops] 125 | 126 | text = " ".join(text) 127 | # Remove Special Characters 128 | text = special_character_removal.sub('', text) 129 | # Replace Numbers 130 | text = replace_numbers.sub('NUMBERREPLACER', text) 131 | # Optionally, shorten words to their stems 132 | if stem_words: 133 | text = text.split() 134 | stemmer = SnowballStemmer('english') 135 | stemmed_words = [stemmer.stem(word) for word in text] 136 | text = " ".join(stemmed_words) 137 | # Return a list of words 138 | return (text) 139 | 140 | 141 | def get_label(raw_data): 142 | labels = ['toxic', 'severe_toxic', 143 | 'obscene', 'threat', 144 | 'insult', 'identity_hate'] 145 | return raw_data[labels].values 146 | 147 | def get_id(raw_data): 148 | return raw_data['id'].values 149 | 150 | def process_data(train_data, test_data): 151 | # tokenizer = text.Tokenizer(num_words=config.MAX_WORDS, 152 | # filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n') 153 | tokenizer = text.Tokenizer(num_words=config.MAX_WORDS) 154 | tokenizer.fit_on_texts(train_data+test_data) 155 | train_tokenized = tokenizer.texts_to_sequences(train_data) 156 | test_tokenized = tokenizer.texts_to_sequences(test_data) 157 | train_data = sequence.pad_sequences(train_tokenized, maxlen = config.MAX_LENGTH) 158 | test_data = sequence.pad_sequences(test_tokenized, maxlen = config.MAX_LENGTH) 159 | return train_data, test_data, tokenizer.word_index 160 | 161 | def get_word_embedding(): 162 | data_path = 'data' 163 | # raw_embed = 'crawl-300d-2M.vec' 164 | raw_embed = 'glove.840B.300d.txt' 165 | EMBEDDING_FILE = os.path.join(data_path, raw_embed) 166 | embeddings_index = {} 167 | for line in open(EMBEDDING_FILE, "rb"): 168 | values = line.split() 169 | word = values[0] 170 | coefs = np.asarray(values[1:], dtype='float32') 171 | embeddings_index[word] = coefs 172 | print (len(embeddings_index)) 173 | return embeddings_index 174 | 175 | def get_embed_matrix(embeddings_index, word_index): 176 | nb_words = min(config.MAX_WORDS, len(word_index)) 177 | embedding_matrix = np.empty((nb_words, config.EMBEDDING_DIM)) 178 | # embedding_matrix = np.random.rand(nb_words, config.EMBEDDING_DIM) 179 | for word, i in word_index.items(): 180 | if i >= config.MAX_WORDS: 181 | continue 182 | word_parts = word.split('_') 183 | embedding_vectors = [embeddings_index.get(w) for w in word_parts] 184 | embedding_vectors = np.array([v if v is not None else 
np.random.rand(config.EMBEDDING_DIM) for v in embedding_vectors]) 185 | # embedding_matrix[i] = np.sum(embedding_vectors, axis=0)/np.linalg.norm(np.sum(embedding_vectors, axis=0)) 186 | embedding_matrix[i] = np.sum(embedding_vectors, axis=0)/embedding_vectors.shape[0] 187 | 188 | return embedding_matrix 189 | 190 | def fetch_data(aug=False): 191 | data_path = 'data' 192 | train = 'train.csv' 193 | test = 'test.csv' 194 | train_raw = get_raw_data(os.path.join(data_path, train)) 195 | test_raw = get_raw_data(os.path.join(data_path, test)) 196 | 197 | if aug: 198 | train_de = 'train_de.csv' 199 | train_fr = 'train_fr.csv' 200 | train_es = 'train_es.csv' 201 | train_de_raw = get_raw_data(os.path.join(data_path, train_de)) 202 | train_es_raw = get_raw_data(os.path.join(data_path, train_es)) 203 | train_fr_raw = get_raw_data(os.path.join(data_path, train_fr)) 204 | train_raw = pd.concat([train_raw, train_de_raw, train_es_raw, train_fr_raw]).drop_duplicates('comment_text') 205 | train_data = list(train_raw['comment_text'].fillna("_na_").values) 206 | test_data = list(test_raw['comment_text'].fillna("_na_").values) 207 | train_label = get_label(train_raw) 208 | # print train_raw 209 | # train_de_data = get_data(train_de_raw) 210 | # train_de_label = get_label(train_de_raw) 211 | #train_es_data = get_data(train_es_raw) 212 | # train_es_label = get_label(train_es_raw) 213 | # train_fr_data = get_data(train_fr_raw) 214 | # train_fr_label = get_label(train_fr_raw) 215 | # train_data = train_data + train_de_data + train_fr_data + train_es_data 216 | # train_label = np.vstack((train_label, train_de_label, train_fr_label, train_es_label)) 217 | 218 | train_data, test_data, word_index = process_data(train_data, test_data) 219 | return train_data, train_label, word_index 220 | 221 | def fetch_test_data(aug=False): 222 | data_path = 'data' 223 | train = 'train.csv' 224 | test = 'test.csv' 225 | train_raw = get_raw_data(os.path.join(data_path, train)) 226 | test_raw = get_raw_data(os.path.join(data_path, test)) 227 | if aug: 228 | train_de = 'train_de.csv' 229 | train_fr = 'train_fr.csv' 230 | train_es = 'train_es.csv' 231 | train_de_raw = get_raw_data(os.path.join(data_path, train_de)) 232 | train_es_raw = get_raw_data(os.path.join(data_path, train_es)) 233 | train_fr_raw = get_raw_data(os.path.join(data_path, train_fr)) 234 | train_raw = pd.concat([train_raw, train_de_raw, train_es_raw, train_fr_raw]).drop_duplicates('comment_text') 235 | train_data = list(train_raw['comment_text'].fillna("_na_").values) 236 | test_data = list(test_raw['comment_text'].fillna("_na_").values) 237 | train_data, test_data, word_index = process_data(train_data, test_data) 238 | test_id = get_id(test_raw) 239 | return test_data, test_id 240 | 241 | if __name__ == '__main__': 242 | # embedding_dict = get_word_embedding() 243 | # data, label, word_index = fetch_data() 244 | # print(np.sum(label, axis=0).astype(float) / label.shape[0]) 245 | # em = get_embed_matrix(embedding_dict, word_index) 246 | # print(em.shape) 247 | # reverse_idx = {v:k for k,v in word_index.items()} 248 | # reverse_idx[0] = 'NOTHING' 249 | # for i in range(100): 250 | # print [reverse_idx[v] for v in data[i] if v!=0] 251 | 252 | data_path = 'data' 253 | train = 'train.csv' 254 | test = 'test.csv' 255 | train_raw = pd.read_csv(os.path.join(data_path, train)) 256 | raw_value = train_raw['comment_text'].fillna("_na_").values 257 | # processed_data = [] 258 | # for i, v in enumerate(raw_value): 259 | # text_parse(v) 260 | a = raw_value[8306] 261 | word_index = 
{k:i+1 for i,k in enumerate(text_parse(a))} 262 | embedding_dict = get_word_embedding() 263 | em = get_embed_matrix(embedding_dict, word_index) 264 | 265 | ''' 266 | r = Rake() 267 | r.extract_keywords_from_text(text_parse(a)) 268 | print r.get_ranked_phrases() 269 | ''' 270 | -------------------------------------------------------------------------------- /rake_parse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from rake_nltk import Rake 5 | from bad_dict import get_bad_word_dict 6 | 7 | def rake_parse(line): 8 | r = Rake() 9 | r.extract_keywords_from_text(line) 10 | word_combines = r.get_ranked_phrases() 11 | word_combines = [k for k in word_combines if len(k.split()) > 1] 12 | # filter out bad word combines 13 | bad_word_dict = get_bad_word_dict() 14 | word_replacer = {} 15 | for k in word_combines: 16 | if any(map(lambda x : k.find(x) >= 0, bad_word_dict.values())): 17 | continue 18 | word_replacer[k] = '_'.join(k.split()) 19 | 20 | for k,v in word_replacer.items(): 21 | line = line.replace(k,v) 22 | return line 23 | 24 | if __name__ == '__main__': 25 | from preprocess import text_parse 26 | data_path = 'data' 27 | train = 'train.csv' 28 | test = 'test.csv' 29 | train_raw = pd.read_csv(os.path.join(data_path, train)) 30 | raw_value = train_raw['comment_text'].fillna("_na_").values 31 | a = raw_value[100] 32 | print a 33 | a = text_parse(a) 34 | print a 35 | 36 | -------------------------------------------------------------------------------- /rename_result.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | kfold=10 3 | rename = 0 4 | for i in range(kfold): 5 | # result_path = 'data/result'+str(i)+'.csv' 6 | result_path = 'result'+str(i)+'.csv' 7 | result_new_path = 'result'+str(rename) + str(i)+'.csv' 8 | shutil.move(result_path, result_new_path) 9 | 10 | 11 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | import numpy as np 6 | from preprocess import fetch_test_data 7 | from mxnet.gluon import Trainer 8 | from mxnet.gluon.data import DataLoader,Dataset 9 | from mxnet.io import NDArrayIter 10 | from mxnet.ndarray import array 11 | from mxnet import nd 12 | from net import net_define, net_define_eu 13 | import config 14 | 15 | if __name__ == "__main__": 16 | # setting the hyper parameters 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--batch_size', default=128, type=int) 19 | parser.add_argument('--epochs', default=100, type=int) 20 | parser.add_argument('--gpu', default=0, type=int) 21 | args = parser.parse_args() 22 | 23 | # ctx = mx.cpu()# gpu(7) 24 | ctx = mx.gpu(args.gpu) 25 | net = net_define_eu() 26 | # net = net_define() 27 | net.collect_params().reset_ctx(ctx) 28 | net.load_params('net0.params', ctx) 29 | 30 | test_data, test_id = fetch_test_data() 31 | data_iter = NDArrayIter(data= test_data, batch_size=args.batch_size, shuffle=False) 32 | with open('result.csv','w') as f: 33 | f.write('id,toxic,severe_toxic,obscene,threat,insult,identity_hate\n') 34 | for i, d in enumerate(data_iter): 35 | print(i) 36 | output=net(d.data[0].as_in_context(ctx)).asnumpy() 37 | for j in range(args.batch_size): 38 | if i*args.batch_size + j < test_id.shape[0]: 39 | str_out = ','.join([str(test_id[i*args.batch_size+j])] + 
[str(v) for v in output[j]])+'\n' 40 | f.write(str_out) 41 | 42 | 43 | -------------------------------------------------------------------------------- /test_k_fold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | import numpy as np 6 | from preprocess import fetch_test_data 7 | from mxnet.gluon import Trainer 8 | from mxnet.gluon.data import DataLoader,Dataset 9 | from mxnet.io import NDArrayIter 10 | from mxnet.ndarray import array 11 | from mxnet import nd 12 | from net import net_define, net_define_eu 13 | import config 14 | 15 | if __name__ == "__main__": 16 | # setting the hyper parameters 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--batch_size', default=128, type=int) 19 | parser.add_argument('--kfold', default=5, type=int) 20 | parser.add_argument('--gpu', default=0, type=int) 21 | 22 | args = parser.parse_args() 23 | 24 | # ctx = mx.cpu()# gpu(7) 25 | 26 | test_data, test_id = fetch_test_data() 27 | data_iter = NDArrayIter(data= test_data, batch_size=args.batch_size, shuffle=False) 28 | for i in range(args.kfold): 29 | print(i) 30 | ctx = mx.gpu(args.gpu) 31 | net = net_define_eu() 32 | net.collect_params().reset_ctx(ctx) 33 | net.load_params('net'+str(i)+'.params', ctx) 34 | data_iter.reset() 35 | with open('result'+str(i)+'.csv','w') as f: 36 | f.write('id,toxic,severe_toxic,obscene,threat,insult,identity_hate\n') 37 | for i, d in enumerate(data_iter): 38 | output=net(d.data[0].as_in_context(ctx)).asnumpy() 39 | for j in range(args.batch_size): 40 | if i*args.batch_size + j < test_id.shape[0]: 41 | str_out = ','.join([str(test_id[i*args.batch_size+j])] + [str(v) for v in output[j]])+'\n' 42 | f.write(str_out) 43 | 44 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | import numpy as np 6 | from preprocess import fetch_data, get_word_embedding, get_embed_matrix 7 | from mxnet.gluon import Trainer 8 | from mxnet.gluon.data import DataLoader,Dataset 9 | from mxnet.io import NDArrayIter 10 | from mxnet.ndarray import array 11 | from mxnet import nd 12 | from net import net_define, net_define_eu 13 | import utils 14 | import config 15 | 16 | def CapLoss(y_pred, y_true): 17 | L = y_true * nd.square(nd.maximum(0., 0.9 - y_pred)) + \ 18 | 0.5 * (1 - y_true) * nd.square(nd.maximum(0., y_pred - 0.1)) 19 | return nd.mean(nd.sum(L, 1)) 20 | 21 | def EntropyLoss(y_pred, y_true): 22 | L = - y_true*nd.log2(y_pred) - (1-y_true) * nd.log2(1-y_pred) 23 | return nd.mean(L) 24 | 25 | def EntropyLoss1(y_pred, y_true): 26 | train_pos_ratio = array([ 0.09584448, 0.00999555, 0.05294822, 0.00299553, 0.04936361, 0.00880486], ctx=y_pred.context, dtype=np.float32)*10 27 | train_neg_ratio = (1.0-train_pos_ratio)*10 28 | L = - y_true*nd.log2(y_pred) * train_neg_ratio - (1-y_true) * nd.log2(1-y_pred) * train_pos_ratio 29 | return nd.mean(L) 30 | 31 | if __name__ == "__main__": 32 | # setting the hyper parameters 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--batch_size', default=128, type=int) 35 | parser.add_argument('--epochs', default=2, type=int) 36 | parser.add_argument('--gpu', default=0, type=int) 37 | args = parser.parse_args() 38 | ctx = mx.gpu(args.gpu) 39 | net = net_define_eu() 40 | 41 | train_data, train_label, word_index = fetch_data() 42 | 
embedding_dict = get_word_embedding() 43 | # print len(word_index) 44 | em = get_embed_matrix(embedding_dict, word_index) 45 | net.collect_params().reset_ctx(ctx) 46 | em = array(em, ctx=mx.cpu()) 47 | net.collect_params()['sequential0_embedding0_weight'].set_data(em) 48 | net.collect_params()['sequential0_embedding0_weight'].grad_req = 'null' 49 | 50 | print_batches = 100 51 | shuffle_idx = np.random.permutation(train_data.shape[0]) 52 | train_data = train_data[shuffle_idx] 53 | train_label = train_label[shuffle_idx] 54 | 55 | data_iter = NDArrayIter(data= train_data[:-5000], label=train_label[:-5000], batch_size=args.batch_size, shuffle=True) 56 | val_data_iter = NDArrayIter(data= train_data[-5000:], label=train_label[-5000:], batch_size=args.batch_size, shuffle=False) 57 | trainer = Trainer(net.collect_params(),'adam', {'learning_rate': 0.001}) 58 | # trainer = Trainer(net.collect_params(),'RMSProp', {'learning_rate': 0.001}) 59 | utils.train(data_iter, val_data_iter, net, EntropyLoss, 60 | trainer, ctx, num_epochs=args.epochs, print_batches=print_batches) 61 | -------------------------------------------------------------------------------- /train_k_fold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | from mxnet import init 6 | import numpy as np 7 | from preprocess import fetch_data, get_word_embedding, get_embed_matrix 8 | from mxnet.gluon import Trainer 9 | from mxnet.gluon.data import DataLoader,Dataset 10 | from mxnet.io import NDArrayIter 11 | from mxnet.ndarray import array 12 | from mxnet import nd 13 | from net import net_define, net_define_eu 14 | from sklearn.model_selection import KFold, StratifiedKFold 15 | import utils 16 | import config 17 | 18 | def CapLoss(y_pred, y_true): 19 | L = y_true * nd.square(nd.maximum(0., 0.9 - y_pred)) + \ 20 | 0.5 * (1 - y_true) * nd.square(nd.maximum(0., y_pred - 0.1)) 21 | return nd.mean(nd.sum(L, 1)) 22 | 23 | def EntropyLoss(y_pred, y_true, train_pos_ratio=None): 24 | L = - y_true*(1-y_pred)**2*nd.log2(y_pred) - (1-y_true) * nd.log2(1-y_pred)*y_pred**2 25 | return nd.mean(L) 26 | 27 | def EntropyLoss1(y_pred, y_true, train_pos_ratio): 28 | scale = 10 29 | train_pos_ratio = array(train_pos_ratio, ctx=y_pred.context, dtype=np.float32) * scale 30 | train_neg_ratio = (scale - train_pos_ratio) 31 | L = - y_true*nd.log2(y_pred) * train_neg_ratio - (1-y_true) * nd.log2(1-y_pred)*train_pos_ratio 32 | return nd.mean(L) 33 | 34 | if __name__ == "__main__": 35 | # setting the hyper parameters 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--batch_size', default=256, type=int) 38 | parser.add_argument('--epochs', default=3, type=int) 39 | parser.add_argument('--gpu', default=0, type=int) 40 | parser.add_argument('--kfold', default=10, type=int) 41 | parser.add_argument('--print_batches', default=100, type=int) 42 | args = parser.parse_args() 43 | 44 | train_data, train_label, word_index = fetch_data() 45 | embedding_dict = get_word_embedding() 46 | em = get_embed_matrix(embedding_dict, word_index) 47 | em = array(em, ctx=mx.cpu()) 48 | kf_label = np.ones(train_label.shape) 49 | for i in range(train_label.shape[1]): 50 | kf_label[:,i] = 2**i 51 | kf_label = np.sum(kf_label, axis=1) 52 | 53 | ctx = [mx.gpu(0)] 54 | net = net_define_eu() 55 | 56 | kf = StratifiedKFold(n_splits=args.kfold, shuffle=True) 57 | for i, (inTr, inTe) in enumerate(kf.split(train_data, kf_label)): 58 | print('fold: ', i) 59 | 
net.collect_params().initialize(init=init.Xavier(), force_reinit=True) 60 | xtr = train_data[inTr] 61 | xte = train_data[inTe] 62 | ytr = train_label[inTr] 63 | yte = train_label[inTe] 64 | pos_tr_ratio = np.sum(ytr, axis=0)/float(ytr.shape[0]) 65 | pos_tr_ratio = np.ones(pos_tr_ratio.shape)*0.5 66 | data_iter = NDArrayIter(data= xtr, label=ytr, batch_size=args.batch_size, shuffle=True) 67 | val_data_iter = NDArrayIter(data= xte, label=yte, batch_size=args.batch_size, shuffle=False) 68 | 69 | # print net.collect_params() 70 | net.collect_params().reset_ctx(ctx) 71 | net.collect_params()['sequential0_embedding0_weight'].set_data(em) 72 | net.collect_params()['sequential0_embedding0_weight'].grad_req = 'null' 73 | # net.collect_params()['sequential'+str(i)+ '_embedding0_weight'].set_data(em) 74 | # net.collect_params()['sequential'+str(i)+ '_embedding0_weight'].grad_req = 'null' 75 | trainer = Trainer(net.collect_params(),'adam', {'learning_rate': 0.001}) 76 | # trainer = Trainer(net.collect_params(),'RMSProp', {'learning_rate': 0.01,'clip_weights' : 1}) 77 | utils.train_multi(data_iter, val_data_iter, i, net, EntropyLoss1, 78 | trainer, ctx, num_epochs=args.epochs, print_batches=args.print_batches, pos_tr_ratio=pos_tr_ratio) 79 | -------------------------------------------------------------------------------- /train_multi.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | import numpy as np 6 | from preprocess import fetch_data, get_word_embedding, get_embed_matrix 7 | from mxnet.gluon import Trainer 8 | from mxnet.gluon.data import DataLoader,Dataset 9 | from mxnet.io import NDArrayIter 10 | from mxnet.ndarray import array 11 | from mxnet import nd 12 | from net import net_define, net_define_eu 13 | import utils 14 | import config 15 | 16 | def CapLoss(y_pred, y_true): 17 | L = y_true * nd.square(nd.maximum(0., 0.9 - y_pred)) + \ 18 | 0.5 * (1 - y_true) * nd.square(nd.maximum(0., y_pred - 0.1)) 19 | return nd.mean(nd.sum(L, 1)) 20 | 21 | def EntropyLoss(y_pred, y_true): 22 | L = - y_true*nd.log2(y_pred) - (1-y_true) * nd.log2(1-y_pred) 23 | return nd.mean(L) 24 | 25 | def EntropyLoss1(y_pred, y_true): 26 | train_pos_ratio = array([ 0.09584448, 0.00999555, 0.05294822, 0.00299553, 0.04936361, 0.00880486], ctx=y_pred.context, dtype=np.float32)*10 27 | train_neg_ratio = (1.0-train_pos_ratio)*10 28 | L = - y_true*nd.log2(y_pred) * train_neg_ratio - (1-y_true) * nd.log2(1-y_pred) * train_pos_ratio 29 | return nd.mean(L) 30 | 31 | if __name__ == "__main__": 32 | # setting the hyper parameters 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--batch_size', default=128, type=int) 35 | parser.add_argument('--epochs', default=2, type=int) 36 | parser.add_argument('--gpu', default=0, type=int) 37 | args = parser.parse_args() 38 | ''' 39 | train_data = np.random.randint(0, high=config.MAX_WORDS, size=(10000, config.MAX_LENGTH)) 40 | train_label = np.random.randint(0, high=2, size=(10000, 6)) 41 | ''' 42 | ctx = [mx.gpu(2), mx.gpu(3), mx.gpu(4), mx.gpu(5)] 43 | net = net_define_eu() 44 | # net.initialize(mx.init.Xavier(),ctx=ctx) 45 | 46 | train_data, train_label, word_index = fetch_data(True) 47 | embedding_dict = get_word_embedding() 48 | em = get_embed_matrix(embedding_dict, word_index) 49 | # print 'copy array' 50 | em = array(em, ctx=mx.cpu()) 51 | # print 'copy array done' 52 | net.collect_params()['sequential0_embedding0_weight'].set_data(em) 53 | 
net.collect_params().reset_ctx(ctx) 54 | print net.collect_params() 55 | 56 | print_batches = 1000 57 | shuffle_idx = np.random.permutation(train_data.shape[0]) 58 | train_data = train_data[shuffle_idx] 59 | train_label = train_label[shuffle_idx] 60 | 61 | # print em.shape 62 | data_iter = NDArrayIter(data= train_data[:-10000], label=train_label[:-10000], batch_size=args.batch_size, shuffle=True) 63 | val_data_iter = NDArrayIter(data= train_data[-10000:], label=train_label[-10000:], batch_size=args.batch_size, shuffle=False) 64 | trainer = Trainer(net.collect_params(),'adam', {'learning_rate': 0.001}) 65 | # trainer = Trainer(net.collect_params(),'RMSProp', {'learning_rate': 0.001}) 66 | # utils.train(data_iter, val_data_iter, net, EntropyLoss, 67 | # trainer, ctx, num_epochs=args.epochs, print_batches=print_batches) 68 | utils.train_multi(data_iter, val_data_iter, net, EntropyLoss, 69 | trainer, ctx, num_epochs=args.epochs, print_batches=print_batches) 70 | net.save_params('net.params') 71 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from mxnet import gluon 2 | from mxnet import autograd 3 | from mxnet import nd 4 | from mxnet import image 5 | from sklearn.metrics import roc_auc_score, confusion_matrix 6 | import mxnet as mx 7 | import numpy as np 8 | import time 9 | 10 | def try_gpu(): 11 | """If GPU is available, return mx.gpu(0); else return mx.cpu()""" 12 | try: 13 | ctx = mx.gpu() 14 | _ = nd.zeros((1,), ctx=ctx) 15 | except: 16 | ctx = mx.cpu() 17 | return ctx 18 | 19 | def accuracy(output, label): 20 | L = -label*np.log2(output) - (1-label) * np.log2(1-output) 21 | return np.mean(L) 22 | 23 | def _get_batch(batch, ctx): 24 | """return data and label on ctx""" 25 | data = batch.data[0] 26 | label = batch.label[0] 27 | # data, label = gluon.utils.split_and_load(batch, ctx) 28 | return data.as_in_context(ctx), label.as_in_context(ctx) 29 | 30 | def _get_batch_multi(batch, ctx, Train=True): 31 | # naive random shuffle 32 | if Train: 33 | npdata = batch.data[0].asnumpy() 34 | np_roll_data = np.roll(npdata, axis=1, shift=np.random.randint(npdata.shape[1])) 35 | nd_data = nd.array(np_roll_data ) 36 | data = gluon.utils.split_and_load(nd_data, ctx) 37 | else: 38 | data = gluon.utils.split_and_load(batch.data[0], ctx) 39 | label = gluon.utils.split_and_load(batch.label[0], ctx) 40 | return data, label 41 | 42 | def evaluate_accuracy(data_iterator, net, ctx=mx.gpu()): 43 | acc = 0. 
44 | for i, batch in enumerate(data_iterator): 45 | data, label = _get_batch(batch, ctx) 46 | output = net(data) 47 | acc += accuracy(output, label) 48 | return acc / (i+1) 49 | 50 | def evaluate_accuracy_multi(data_iterator, net, ctx): 51 | data_iterator.reset() 52 | acc = 0 53 | dummy_label = np.zeros((0,6)) 54 | dummy_pred = np.zeros((0,6)) 55 | t1 = time.time() 56 | for i, batch in enumerate(data_iterator): 57 | data, label = _get_batch_multi(batch, ctx, False) 58 | # acc += np.mean([accuracy(net(X), Y) for X, Y in zip(data, label)]) 59 | # acc += np.mean([roc_auc_score(Y.asnumpy(), net(X).asnumpy()) for X, Y in zip(data, label)]) 60 | output = np.vstack((net(X).asnumpy() for X in data)) 61 | labels = np.vstack((Y.asnumpy() for Y in label)) 62 | dummy_label = np.vstack((dummy_label, labels)) 63 | dummy_pred = np.vstack((dummy_pred, output)) 64 | # return acc / (i+1) 65 | # print dummy_label.shape, dummy_pred.shape 66 | dummy_pred_label = dummy_pred > 0.5 67 | for i in range(dummy_label.shape[1]): 68 | print i, confusion_matrix(dummy_label[:,i], dummy_pred_label[:,i]) 69 | 70 | return roc_auc_score(dummy_label, dummy_pred), accuracy(dummy_pred, dummy_label), time.time() - t1 71 | 72 | 73 | def train(train_data, test_data, net, loss, trainer, 74 | ctx, num_epochs, print_batches=None): 75 | """Train a network""" 76 | min_loss = 100000 77 | for epoch in range(num_epochs): 78 | train_loss = 0. 79 | train_acc = 0. 80 | n = 0 81 | for i, batch in enumerate(train_data): 82 | data, label = _get_batch(batch, ctx) 83 | with autograd.record(): 84 | output = net(data) 85 | L = loss(output, label) 86 | L.backward() 87 | trainer.step(data.shape[0], ignore_stale_grad=True) 88 | train_loss += nd.mean(L).asscalar() 89 | train_acc += accuracy(output, label) 90 | n = i + 1 91 | if print_batches and n % print_batches == 0: 92 | test_acc = evaluate_accuracy(test_data, net, ctx) 93 | test_data.reset() 94 | print("Batch %d. Loss: %f, Train acc %f, Test Loss %f" % ( 95 | n, train_loss/n, train_acc/n, test_acc)) 96 | if test_acc < min_loss: 97 | min_loss = test_acc 98 | net.save_params('net.params') 99 | test_acc = evaluate_accuracy(test_data, net, ctx) 100 | train_data.reset() 101 | test_data.reset() 102 | print("Epoch %d. Loss: %f, Train acc %f, Test Loss %f" % ( 103 | epoch, train_loss/n, train_acc/n, test_acc)) 104 | if test_acc < min_loss: 105 | min_loss = test_acc 106 | net.save_params('net.params') 107 | 108 | def train_multi(train_data, test_data, iteration, net, loss, trainer, 109 | ctx, num_epochs, print_batches=None, pos_tr_ratio=None): 110 | """Train a network""" 111 | min_loss = 0 112 | for epoch in range(num_epochs): 113 | train_loss = 0. 114 | train_acc = 0. 115 | n = 0 116 | for i, batch in enumerate(train_data): 117 | data, label = _get_batch_multi(batch, ctx) 118 | with autograd.record(): 119 | losses = [loss(net(X), Y, pos_tr_ratio) for X, Y in zip(data, label)] 120 | for l in losses: 121 | l.backward() 122 | trainer.step(batch.data[0].shape[0], ignore_stale_grad=True) 123 | train_loss += np.mean([nd.mean(l).asscalar() for l in losses]) 124 | # train_acc += accuracy(output, label) 125 | n = i + 1 126 | if print_batches and n % print_batches == 0: 127 | test_acc, test_loss, eval_time = evaluate_accuracy_multi(test_data, net, ctx) 128 | print("Batch %d. 
Loss: %f, Test roc_auc: %f, test_loss: %f , eval time: %f" % ( 129 | n, train_loss/n, test_acc, test_loss, eval_time)) 130 | if test_acc > min_loss: 131 | min_loss = test_acc 132 | net.save_params('net'+str(iteration)+'.params') 133 | 134 | train_data.reset() 135 | test_acc, test_loss, eval_time = evaluate_accuracy_multi(test_data, net, ctx) 136 | print("Epoch %d. Loss: %f, roc_auc: %f, test_loss: %f , eval time: %f" % ( 137 | epoch, train_loss/n, test_acc, test_loss, eval_time)) 138 | if test_acc > min_loss: 139 | min_loss = test_acc 140 | net.save_params('net'+str(iteration)+'.params') 141 | 142 | --------------------------------------------------------------------------------