├── .gitignore ├── LICENSE ├── README.md ├── bad_dict.py ├── badwords.list ├── capsule_block.py ├── check_badwords.py ├── config.py ├── conv_cap.py ├── kfoldpostprocess.py ├── net.py ├── postprocess.py ├── preprocess.py ├── rake_parse.py ├── rename_result.py ├── test.py ├── test_k_fold.py ├── train.py ├── train_k_fold.py ├── train_multi.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | __pycache__ 3 | *.pyc 4 | *.txt 5 | *.params 6 | *.csv 7 | *.swp 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Yan Li 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CapsuleNet solution for the Toxic Comment Classification Challenge 2 | [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) Kaggle script with a naive CapsuleNet in MXNet 3 | ---- 4 | ## Some updates for those who starred this project 5 | 1. Is it working? 6 | 7 | Yes. 8 | 9 | 2. How is the performance? 10 | 11 | My 10-fold model scored 0.9859 on the public leaderboard, but only 0.984x on the private leaderboard. 12 | 13 | 3. Will you update it? 14 | 15 | No. The competition is over, and you'd better not share solutions through GitHub, since that violates the Kaggle rules. 16 | 17 | 4. How to use it? 18 | 19 | * Install the required Python libraries. 20 | * Run `python train_k_fold.py` for training (multi-GPU supported; please refer to MXNet). 21 | * Run `python test_k_fold.py` for testing (single GPU only). 22 | 23 | 5. Last thing to mention: I slightly changed the squash function (see the sketch below), and I don't like either form of it. 24 | 6. Future work 25 | 26 | * Maybe use it in object detection. Working on the MXNet Pikachu example now. 27 | * Implement routing with EM. 28 | ## Thank you for your Stars. 
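For reference, the squash change mentioned in item 5 lives in `capsule_block.py` and `conv_cap.py`. Below is a minimal sketch, assuming MXNet NDArray inputs (the function names are illustrative only), contrasting the standard CapsNet squash, which is kept as a comment in the source, with the simplified unit-length variant this repo actually uses; the original scaling term is commented out there because it can run into numerical trouble when the squared norm is very small.

```python
from mxnet import nd

def squash_original(x, axis):
    # standard CapsNet squash: rescales each capsule vector to a length in (0, 1)
    s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True)
    scale = s_squared_norm / ((1 + s_squared_norm) * nd.sqrt(s_squared_norm + 1e-9))
    return x * scale

def squash_simplified(x, axis):
    # variant used in capsule_block.py / conv_cap.py: plain L2 normalization along `axis`
    s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True)
    return x / nd.sqrt(s_squared_norm + 1e-9)
```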
29 | 30 | ## Reference 31 | [comment_toxic](https://github.com/jcjview/comment_toxic) 32 | -------------------------------------------------------------------------------- /bad_dict.py: -------------------------------------------------------------------------------- 1 | def get_bad_word_dict(): 2 | lines = open('badwords.list').readlines() 3 | lines = [l.lower().strip('\n') for l in lines] 4 | lines = [l.split(',') for l in lines] 5 | bad_dict = {} 6 | for v in lines: 7 | if len(v) == 2: 8 | bad_dict[v[0]] =v[1] 9 | return bad_dict 10 | 11 | if __name__ == '__main__': 12 | print(get_bad_word_dict()) 13 | -------------------------------------------------------------------------------- /badwords.list: -------------------------------------------------------------------------------- 1 | kung fu, kungfu 2 | animalfucker,animal fucker 3 | anuses,anus 4 | a s s,ass 5 | asses,ass 6 | a55,ass 7 | azz,ass 8 | assclowns,ass clowns 9 | asskicked,ass kicked 10 | asswhipe,ass whipe 11 | ass hole,asshole 12 | asswhole,asshole 13 | asswipe,asshole 14 | arsehole,asshole 15 | ahole,asshole 16 | assface,asshole 17 | ashole,asshole 18 | asholes,asshole 19 | asswipes,asshole 20 | azzhole,asshole 21 | butthole,asshole 22 | badassness,bad ass 23 | basterds,bastard 24 | biatch,bitch 25 | bicth,bitch 26 | b*tch,bitch 27 | b!tch,bitch 28 | bitchs,bitch 29 | b i t c h,bitch 30 | b1tch,bitch 31 | bitches,bitch 32 | bitchass,bitch ass 33 | bitchmattythewhite,bitch matty the white 34 | bitchmother,bitch mother 35 | blowjobs,blowjob 36 | blow job,blowjob 37 | bollocks,bollock 38 | boners,boner 39 | boobs,boob 40 | bowels,bowel 41 | boymamas,boy mamas 42 | b u m,bum 43 | vbutt,butt 44 | wikitheclown,clown 45 | c0cks,cock 46 | c0ck,cock 47 | c o c k,cock 48 | cockhead,cock 49 | corpsefucking,corpse fuck 50 | kunt,cunt 51 | cntz,cunt 52 | cnts,cunt 53 | c u n t,cunt 54 | cuntbag,cunt bag 55 | cuntface,cunt face 56 | cuntfranks,cunt franks 57 | cuntliz,cunt liz 58 | dik,dick 59 | d!ck,dick 60 | d*ck,dick 61 | dickbag,dick bag 62 | dickbig,dick big 63 | dickbreath,dick breath 64 | dickbutt,dick butt 65 | dickheaditalic,dick head italic 66 | fagz,fag 67 | f a g,fag 68 | fagit,faggot 69 | faget,faggot 70 | faggit,faggot 71 | failepic,fail epic 72 | fatass,fat ass 73 | f*ck,fuck 74 | fuckk,fuck 75 | fcuk,fuck 76 | fuk,fuck 77 | fukkers,fuck 78 | fking,fuck 79 | ofuck,fuck 80 | fuckwads,fuck 81 | fck ,fuck 82 | fuc ,fuck 83 | fuckiest,fuck 84 | phuc,fuck 85 | phuk,fuck 86 | f uck,fuck 87 | fu ck,fuck 88 | fuking,fuck 89 | fcken,fuck 90 | fcking,fuck 91 | fukk,fuck 92 | f ing,fuck 93 | fuk ,fuck 94 | 'fuck,fuck 95 | fuckingabf,fuck 96 | fuckan,fuck 97 | fuckon,fuck 98 | f**k,fuck 99 | fuckedy,fuck 100 | phuck,fuck 101 | f'uck,fuck 102 | fuked,fuck 103 | fukcing,fuck 104 | fu*k,fuck 105 | f u c k,fuck 106 | fckin,fuck 107 | fuckass,fuck ass 108 | fuckbags,fuck bags 109 | fuckhole,fuck hole 110 | fucksex,fuck sex 111 | fuckstick,fuck stick 112 | fukyou,fuck you 113 | f you,fuck you 114 | fu ,fuck you 115 | f you,fuck you 116 | fuckyourself,fuck yourself 117 | fuker,fucker 118 | fukker,fucker 119 | fuken,fucking 120 | fukkin,fucking 121 | fuckign,fucking 122 | fuckin ,fucking 123 | fukin,fucking 124 | gayboy,gay 125 | gayz,gay 126 | goddamn,god damn 127 | h4x0r,hacker 128 | h e l l,hell 129 | wikihomosexuals,homosexual 130 | i d i o t,idiot 131 | wikipedidiots,idiot 132 | itiot,idiot 133 | itsuck,it suck 134 | wikijews,jew 135 | jpgsuck,jpg suck 136 | knobend,knob end 137 | lesbo,lesbian 138 | l m f a o,lmfao 139 | marcolfuck,marcol 
fuck 140 | masterbate,masturbate 141 | motherfu,mother fuck 142 | mothjer,mother fuck 143 | mothafuckin,mother fuck 144 | mother fukker,motherfucker 145 | mutha fucker,motherfucker 146 | mofo,motherfucker 147 | mother fucker,motherfucker 148 | n i g g e r,nigger 149 | nigga,nigger 150 | niggertard,nigger tard 151 | oldlady,old lady 152 | packi,paki 153 | wikipedophiles,pedophile 154 | p e n i s,penis 155 | pensnsnniensnsn,penis 156 | penus,penis 157 | penistown,penis 158 | pen1s,penis 159 | pneis,penis 160 | penas,penis 161 | p i s s,piss 162 | polack,polak 163 | polac,polak 164 | popsucker,pop sucker 165 | pr0n,porn 166 | pr1c,prick 167 | pusse,pussy 168 | wikiretards,retard 169 | s h i t,s hit 170 | sexsex,sex 171 | shioty,shit 172 | sh1t,shit 173 | sh1ts,shit 174 | shitter,shit 175 | shiot,shit 176 | shitler,shit 177 | sh!t,shit 178 | shoit,shit 179 | shitty,shit 180 | shyt,shit 181 | shity,shit 182 | shitlol,shit 183 | shitush,shit 184 | shitfuck,shit fuck 185 | sluts,slut 186 | slutty,slut 187 | s l u t,slut 188 | s u c k,suck 189 | suckish,suck 190 | suckipedia,suck 191 | suckdickeer,suck dick 192 | sucksfrozen,suck frozen 193 | suckersyou,suck you 194 | sux,sucks 195 | t i t,tit 196 | titties,tit 197 | titty,tit 198 | titt,tit 199 | tits,tit 200 | t u r d,turd 201 | w a n k,wank 202 | wikiwankers,wanker 203 | w a n k e r,wanker 204 | wtf,what the fuck 205 | w h o r e,whore 206 | hore,whore 207 | hoer,whore 208 | wh0re,whore 209 | whored,whore 210 | h0re,whore 211 | hoar,whore 212 | whores,whore 213 | h0r,whore 214 | w t f,wtf 215 | -------------------------------------------------------------------------------- /capsule_block.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | from mxnet import init 3 | from mxnet import nd 4 | from mxnet.gluon import nn 5 | from mxnet import initializer 6 | 7 | 8 | 9 | def squash(x, axis): 10 | s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True) 11 | # if s_squared_norm is really small, we will be in trouble 12 | # so I removed the s_quare terms 13 | # scale = s_squared_norm / ((1 + s_squared_norm) * nd.sqrt(s_squared_norm + 1e-9)) 14 | # return x * scale 15 | scale = nd.sqrt(s_squared_norm + 1e-9) 16 | return x / scale 17 | 18 | 19 | class CapConvBlock(nn.Block): 20 | def __init__(self, num_cap, channels, context, kernel_size=(9,9), padding=(0,0), 21 | strides=(1,1), route_num=3, **kwargs): 22 | super(CapConvBlock, self).__init__(**kwargs) 23 | self.num_cap = num_cap 24 | self.cap = nn.Conv2D(channels=channels*num_cap, kernel_size=kernel_size, 25 | strides=strides, padding=padding) 26 | self.route_num = route_num 27 | 28 | def forward(self, x): 29 | conv_out = nd.expand_dims(self.cap(x), axis=2) 30 | conv_out = conv_out.reshape((0,-1,self.num_cap,0,0)) 31 | conv_out = squash(conv_out, 1) 32 | return conv_out 33 | 34 | class CapFullyBlock(nn.Block): 35 | def __init__(self, num_locations, num_cap, input_units, units, 36 | route_num=3, **kwargs): 37 | super(CapFullyBlock, self).__init__(**kwargs) 38 | self.route_num = route_num 39 | self.num_cap = num_cap 40 | self.units = units 41 | self.num_locations = num_locations 42 | self.w_ij = self.params.get( 43 | 'weight', shape=(input_units, units, self.num_cap, self.num_locations) 44 | ,init=init.Xavier()) 45 | 46 | def forward(self, x): 47 | # reshape x into [batch_size, channel, num_previous_cap] 48 | x_reshape = nd.transpose(x,(0,2,1,3,4)).reshape((0,0,-1)) 49 | return self.Route(x_reshape) 50 | 51 | def Route(self, x): 52 | # b_mat = 
nd.repeat(self.b_mat.data(), repeats=x.shape[0], axis=0)#nd.stop_gradient(nd.repeat(self.b_mat.data(), repeats=x.shape[0], axis=0)) 53 | b_mat = nd.zeros((x.shape[0],1,self.num_cap, self.num_locations), ctx=x.context) 54 | x_expand = nd.expand_dims(nd.expand_dims(x, axis=2),2) 55 | w_expand = nd.repeat(nd.expand_dims(self.w_ij.data(x.context),axis=0), repeats=x.shape[0], axis=0) 56 | u_ = w_expand*x_expand 57 | # u_ = nd.abs(w_expand - x_expand) 58 | u = nd.sum(u_, axis = 1) 59 | u_no_gradient = nd.stop_gradient(u) 60 | for i in range(self.route_num): 61 | c_mat = nd.softmax(b_mat, axis=2) 62 | if i == self.route_num -1: 63 | s = nd.sum(u * c_mat, axis=-1) 64 | else: 65 | s = nd.sum(u_no_gradient * c_mat, axis=-1) 66 | v = squash(s, 1) 67 | v1 = nd.expand_dims(v, axis=-1) 68 | if i != self.route_num - 1: 69 | update_term = nd.sum(u_no_gradient*v1, axis=1, keepdims=True) 70 | b_mat = b_mat + update_term 71 | return v 72 | 73 | 74 | class CapFullyNGBlock(nn.Block): 75 | def __init__(self, num_locations, num_cap, input_units, units, 76 | route_num=3, **kwargs): 77 | super(CapFullyNGBlock, self).__init__(**kwargs) 78 | self.route_num = route_num 79 | self.num_cap = num_cap 80 | self.units = units 81 | self.num_locations = num_locations 82 | self.w_ij = self.params.get( 83 | 'weight', shape=(input_units, units, self.num_cap, self.num_locations) 84 | ,init=init.Xavier()) 85 | 86 | def forward(self, x): 87 | # reshape x into [batch_size, channel, num_previous_cap] 88 | x_reshape = nd.transpose(x,(0,2,1,3,4)).reshape((0,0,-1)) 89 | return self.Route(x_reshape) 90 | 91 | def Route(self, x): 92 | b_mat = nd.zeros((x.shape[0],1,self.num_cap, self.num_locations), ctx=x.context) 93 | x_expand = nd.expand_dims(nd.expand_dims(x, axis=2),2) 94 | w_expand = nd.repeat(nd.expand_dims(self.w_ij.data(x.context),axis=0), repeats=x.shape[0], axis=0) 95 | u_ = w_expand*x_expand 96 | u = nd.sum(u_, axis = 1) 97 | for i in range(self.route_num): 98 | c_mat = nd.softmax(b_mat, axis=2) 99 | s = nd.sum(u * c_mat, axis=-1) 100 | v = squash(s, 1) 101 | v1 = nd.expand_dims(v, axis=-1) 102 | update_term = nd.sum(u * v1, axis=1, keepdims=True) 103 | b_mat = b_mat + update_term 104 | return v 105 | 106 | 107 | class CapFullyEuBlock(nn.Block): 108 | def __init__(self, num_locations, num_cap, input_units, units, 109 | route_num=3, **kwargs): 110 | super(CapFullyEuBlock, self).__init__(**kwargs) 111 | self.route_num = route_num 112 | self.num_cap = num_cap 113 | self.units = units 114 | self.num_locations = num_locations 115 | self.w_ij = self.params.get( 116 | 'weight', shape=(input_units, units, self.num_cap, self.num_locations) 117 | ,init=init.Xavier()) 118 | 119 | def forward(self, x): 120 | # reshape x into [batch_size, channel, num_previous_cap] 121 | # print x.shape 122 | 123 | x_reshape = nd.transpose(x,(0,2,1,3,4)).reshape((0,0,-1)) 124 | return self.Route(x_reshape) 125 | 126 | def Route(self, x): 127 | # print x.context 128 | b_mat = nd.zeros((x.shape[0],1,self.num_cap, self.num_locations), ctx=x.context) 129 | x_expand = nd.expand_dims(nd.expand_dims(x, axis=2),2) 130 | w_expand = nd.repeat(nd.expand_dims(self.w_ij.data(x.context),axis=0), repeats=x.shape[0], axis=0) 131 | u_ = w_expand*x_expand 132 | u = nd.sum(u_, axis = 1) 133 | # u_ = nd.square(w_expand - x_expand) 134 | # u = -nd.sum(u_, axis = 1) 135 | u_no_gradient = nd.stop_gradient(u) 136 | for i in range(self.route_num): 137 | # c_mat = nd.softmax(b_mat, axis=2) 138 | c_mat = nd.sigmoid(b_mat) 139 | if i == self.route_num -1: 140 | s = nd.sum(u * c_mat, 
axis=-1) 141 | else: 142 | s = nd.sum(u_no_gradient * c_mat, axis=-1) 143 | v = squash(s, 1) 144 | if i != self.route_num - 1: 145 | v1 = nd.expand_dims(v, axis=-1) 146 | update_term = nd.sum(u_no_gradient*v1, axis=1, keepdims=True) 147 | b_mat = b_mat + update_term 148 | # b_mat = update_term 149 | # else: 150 | # v = s 151 | return v 152 | 153 | class LengthBlock(nn.Block): 154 | def __init__(self, **kwargs): 155 | super(LengthBlock, self).__init__(**kwargs) 156 | 157 | def forward(self, x): 158 | x = nd.sqrt(nd.sum(nd.square(x), 1)) 159 | return x 160 | 161 | class ActBlock(nn.Block): 162 | def __init__(self, **kwargs): 163 | super(ActBlock, self).__init__(**kwargs) 164 | 165 | def forward(self, x): 166 | x = nd.sigmoid(nd.sum(nd.square(x), 1)) 167 | return x 168 | -------------------------------------------------------------------------------- /check_badwords.py: -------------------------------------------------------------------------------- 1 | from bad_dict import get_bad_word_dict 2 | import re 3 | import os 4 | import numpy as np 5 | from tqdm import tqdm 6 | import pandas as pd 7 | from collections import OrderedDict 8 | 9 | data_path = 'data' 10 | train = 'train.csv' 11 | test = 'test.csv' 12 | train_raw = pd.read_csv(os.path.join(data_path, train)) 13 | raw_value = train_raw['comment_text'].fillna("_na_").values 14 | 15 | 16 | def text_parse(text, remove_stopwords=False, stem_words=False): 17 | wiki_reg=r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 18 | url_reg=r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 19 | ip_reg='\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 20 | WIKI_LINK=' WIKILINKREPLACER ' 21 | URL_LINK=' URLLINKREPLACER ' 22 | IP_LINK=' IPLINKREPLACER ' 23 | #clear link 24 | c = re.findall(wiki_reg, text) 25 | for u in c: 26 | text = text.replace(u, WIKI_LINK) 27 | c = re.findall(url_reg, text) 28 | for u in c: 29 | text = text.replace(u, WIKI_LINK) 30 | c = re.findall(wiki_reg, text) 31 | for u in c: 32 | text = text.replace(u, URL_LINK) 33 | c = re.findall(ip_reg, text) 34 | for u in c: 35 | text = text.replace(u, IP_LINK) 36 | 37 | bad_word_dict = get_bad_word_dict() 38 | # Regex to remove all Non-Alpha Numeric and space 39 | special_character_removal = re.compile(r'[^A-Za-z\d!?*\'_ ]', re.IGNORECASE) 40 | # regex to replace all numerics 41 | replace_numbers = re.compile(r'\b\d+\b', re.IGNORECASE) 42 | text = text.lower().split() 43 | # Optionally, remove stop words 44 | if remove_stopwords: 45 | stops = set(stopwords.words("english")) 46 | text = [w for w in text if not w in stops] 47 | text = " ".join(text) 48 | # Remove Special Characters 49 | text = special_character_removal.sub(' ', text) 50 | found_dict = {k:False for k in bad_word_dict.keys()} 51 | for k,v in bad_word_dict.items(): 52 | if text.find(k) >= 0: 53 | found_dict[k]=True 54 | return found_dict 55 | 56 | bad_word_dict = get_bad_word_dict() 57 | appeared = {k:False for k in bad_word_dict.keys()} 58 | for l in tqdm(raw_value): 59 | status = text_parse(l) 60 | for k, v in status.items(): 61 | if v: 62 | appeared[k]=True 63 | cleaned_dict = {} 64 | for k, v in appeared.items(): 65 | if v: 66 | cleaned_dict[k] = bad_word_dict[k] 67 | cleaned_dict = OrderedDict(sorted(cleaned_dict.items(), key=lambda t: t[1])) 68 | 69 | with open('cleaned_badwords.list', 'w') as f: 70 | for k, v in cleaned_dict.items(): 71 | if k == v: 72 | continue 73 | f.write(k+','+ v +'\n') 74 | -------------------------------------------------------------------------------- /config.py: 
-------------------------------------------------------------------------------- 1 | MAX_LENGTH = 500 2 | MAX_WORDS = 200000 3 | EMBEDDING_DIM = 300 4 | -------------------------------------------------------------------------------- /conv_cap.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | from mxnet import init 3 | from mxnet import nd 4 | from mxnet.gluon import nn 5 | from mxnet import initializer 6 | 7 | def squash(x, axis): 8 | s_squared_norm = nd.sum(nd.square(x), axis, keepdims=True) 9 | # if s_squared_norm is really small, we will be in trouble 10 | # so I removed the s_quare terms 11 | # scale = s_squared_norm / ((1 + s_squared_norm) * nd.sqrt(s_squared_norm + 1e-9)) 12 | # return x * scale 13 | scale = nd.sqrt(s_squared_norm + 1e-9) 14 | return x / scale 15 | 16 | class PrimeConvCap(nn.Block): 17 | def __init__(self, num_cap, num_filter, kernel_size=(3,3), 18 | strides=(1,1), padding=(1,1), **kwargs): 19 | super(PrimeConvCap, self).__init__(**kwargs) 20 | self.num_cap = num_cap 21 | self.cap = nn.Conv2D(channels=(num_cap*num_filter), kernel_size=kernel_size, 22 | padding=padding, strides=strides) 23 | # self.bn = nn.BatchNorm() 24 | 25 | def forward(self, x): 26 | conv_out = nd.expand_dims(self.cap(x), axis=2) 27 | # conv_out = nd.expand_dims(self.bn(self.cap(x)), axis=2) 28 | conv_out = conv_out.reshape((0,self.num_cap,-1,0,0)) 29 | conv_out = squash(conv_out, 2) 30 | # print conv_out.shape 31 | return conv_out 32 | 33 | 34 | class AdvConvCap(nn.Block): 35 | def __init__(self, num_cap, num_filter, 36 | num_cap_in, num_filter_in, 37 | route_num=3, kernel_size=(3,3), 38 | strides=(1,1), padding=(1,1), 39 | **kwargs): 40 | super(AdvConvCap, self).__init__(**kwargs) 41 | self.num_cap = num_cap 42 | self.num_filter = num_filter 43 | self.route_num = route_num 44 | self.num_cap_in = num_cap_in 45 | # num_filter_in * num_cap_in filters divided in num_cap_in groups 46 | # with each group output size as num_cap * num_filter 47 | self.cap = nn.Conv2D(channels=(num_cap * num_filter * num_cap_in), kernel_size=kernel_size, strides=strides, 48 | padding=padding, groups= num_cap_in) 49 | # self.bn = nn.BatchNorm() 50 | 51 | def forward(self, x): 52 | x_reshape = x.reshape((x.shape[0], -1, x.shape[3], x.shape[4])) 53 | cap_out = self.cap(x_reshape) 54 | cap_out = cap_out.reshape((cap_out.shape[0], self.num_cap_in, self.num_cap, 55 | self.num_filter, cap_out.shape[2], cap_out.shape[3])) 56 | return self.route(cap_out) 57 | ''' 58 | cap_out = self.cap(x) 59 | cap_out = cap_out.reshape((x.shape[0],self.num_cap,-1,cap_out.shape[2], cap_out.shape[3])) 60 | cap_out = squash(cap_out, 2) 61 | return cap_out 62 | ''' 63 | 64 | def route(self, u): 65 | b_mat = nd.zeros((u.shape[0], self.num_cap_in, self.num_cap, 1, u.shape[4], u.shape[5]), ctx=u.context) 66 | for i in range(self.route_num): 67 | c_mat = nd.softmax(b_mat, axis=2) 68 | s = nd.sum(u * c_mat, axis=1) 69 | v = squash(s, 2) 70 | if i != self.route_num - 1: 71 | v1 = nd.expand_dims(v, axis=1) 72 | update_term = nd.sum(u*v1, axis=3, keepdims=True) 73 | b_mat = b_mat + update_term 74 | return v 75 | ''' 76 | class AdvFullyCap(nn.Block): 77 | def __init__(self, num_cap, num_filter, 78 | num_cap_in, num_filter_in, 79 | route_num=3, **kwargs): 80 | self.num_cap = num_cap 81 | self.num_filter = num_filter 82 | self.route_num = route_num 83 | self.num_cap_in = num_cap_in 84 | # num_filter_in * num_cap_in filters divided in num_cap_in groups 85 | # with each group output size as num_cap * 
num_filter 86 | self.cap = nn.Conv2D(channels=(num_cap * num_filter * num_cap_in), kernel_size=(1,1), groups= num_cap_in) 87 | 88 | def forward(self, x): 89 | x_reshape = x.reshape((x.shape[0], -1, x.shape[3], x.shape[4])) 90 | cap_out = self.cap(x_reshape) 91 | cap_out = cap_out.reshape((cap_out.shape[0], self.num_cap_in, self.num_cap, 92 | self.num_filter, cap_out.shape[2], cap_out.shape[3])) 93 | return cap_out 94 | ''' 95 | -------------------------------------------------------------------------------- /kfoldpostprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from preprocess import get_label 4 | 5 | kfold=5 6 | for i in range(kfold): 7 | # result_path = 'data/result'+str(i)+'.csv' 8 | result_path = 'result'+str(i)+'.csv' 9 | result = pd.read_csv(result_path) 10 | pred_label = get_label(result) 11 | if i==0: 12 | mean_result = pred_label 13 | else: 14 | # mean_result *= pred_label 15 | mean_result += pred_label 16 | 17 | # mean_result = np.power(mean_result, 1.0/kfold) 18 | mean_result = mean_result / kfold 19 | 20 | labels = ['toxic', 'severe_toxic', 21 | 'obscene', 'threat', 22 | 'insult', 'identity_hate'] 23 | result[labels] = mean_result 24 | result.to_csv('result_kfold.csv', index=False) 25 | 26 | PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4 27 | mean_result **= PROBABILITIES_NORMALIZE_COEFFICIENT 28 | # mean_result =np.log(mean_result) 29 | # mean_result -=0.5 30 | # mean_result =np.exp(mean_result) 31 | result[labels] = mean_result 32 | result.to_csv('postprocessing1.csv', index=False) 33 | -------------------------------------------------------------------------------- /net.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 2 | from mxnet import init 3 | from mxnet import nd 4 | from mxnet.gluon import nn,rnn 5 | from conv_cap import PrimeConvCap, AdvConvCap 6 | from capsule_block import CapFullyBlock, CapFullyEuBlock, CapFullyNGBlock, LengthBlock, ActBlock 7 | import config 8 | 9 | def net_define(): 10 | net = nn.Sequential() 11 | with net.name_scope(): 12 | net.add(nn.Embedding(config.MAX_WORDS, config.EMBEDDING_DIM)) 13 | net.add(rnn.GRU(128,layout='NTC',bidirectional=True, num_layers=2, dropout=0.2)) 14 | net.add(transpose(axes=(0,2,1))) 15 | # net.add(nn.MaxPool2D(pool_size=(config.MAX_LENGTH,1))) 16 | # net.add(nn.Conv2D(128, kernel_size=(101,1), padding=(50,0), groups=128,activation='relu')) 17 | net.add(PrimeConvCap(8,32, kernel_size=(1,1), padding=(0,0))) 18 | # net.add(AdvConvCap(8,32,8,32, kernel_size=(1,1), padding=(0,0))) 19 | net.add(CapFullyBlock(8*(config.MAX_LENGTH)/2, num_cap=12, input_units=32, units=16, route_num=5)) 20 | # net.add(CapFullyBlock(8*(config.MAX_LENGTH-8), num_cap=12, input_units=32, units=16, route_num=5)) 21 | # net.add(CapFullyBlock(8, num_cap=12, input_units=32, units=16, route_num=5)) 22 | net.add(nn.Dropout(0.2)) 23 | # net.add(LengthBlock()) 24 | net.add(nn.Dense(6, activation='sigmoid')) 25 | net.initialize(init=init.Xavier()) 26 | return net 27 | 28 | def net_define_eu(): 29 | net = nn.Sequential() 30 | with net.name_scope(): 31 | net.add(nn.Embedding(config.MAX_WORDS, config.EMBEDDING_DIM)) 32 | net.add(rnn.GRU(128,layout='NTC',bidirectional=True, num_layers=1, dropout=0.2)) 33 | net.add(transpose(axes=(0,2,1))) 34 | net.add(nn.GlobalMaxPool1D()) 35 | ''' 36 | net.add(FeatureBlock1()) 37 | ''' 38 | net.add(extendDim(axes=3)) 39 | net.add(PrimeConvCap(16, 32, kernel_size=(1,1), 
padding=(0,0),strides=(1,1))) 40 | net.add(CapFullyNGBlock(16, num_cap=12, input_units=32, units=16, route_num=3)) 41 | net.add(nn.Dropout(0.2)) 42 | net.add(nn.Dense(6, activation='sigmoid')) 43 | net.initialize(init=init.Xavier()) 44 | return net 45 | 46 | 47 | class extendDim(nn.Block): 48 | def __init__(self, axes, **kwargs): 49 | super(extendDim, self).__init__(**kwargs) 50 | self.axes = axes 51 | 52 | def forward(self, x): 53 | x1 = nd.expand_dims(x, axis=self.axes) 54 | return x1 55 | 56 | class reduceDim(nn.Block): 57 | def __init__(self, **kwargs): 58 | super(reduceDim, self).__init__(**kwargs) 59 | 60 | def forward(self, x): 61 | x1 = x.reshape((x.shape[0], x.shape[1], -1)) 62 | return x1 63 | 64 | 65 | class transpose(nn.Block): 66 | def __init__(self, axes, **kwargs): 67 | super(transpose, self).__init__(**kwargs) 68 | self.axes = axes 69 | 70 | def forward(self, x): 71 | return nd.transpose(x, axes=self.axes)# .reshape((0,0,0,1)) 72 | 73 | class fullyReshape(nn.Block): 74 | def __init__(self, axes, **kwargs): 75 | super(fullyReshape, self).__init__(**kwargs) 76 | self.axes = axes 77 | 78 | def forward(self, x): 79 | return nd.transpose(x, axes=self.axes).reshape((0,0,0,1,1)) 80 | 81 | # hard coding feature Block 82 | class FeatureBlock(nn.Block): 83 | def __init__(self, **kwargs): 84 | super(FeatureBlock, self).__init__(**kwargs) 85 | self.gru = rnn.GRU(128,layout='NTC',bidirectional=True, num_layers=1, dropout=0.2) 86 | self.conv3 = nn.Conv1D(channels=128, kernel_size=5, padding=2, strides=1, activation='relu') 87 | self.conv5 = nn.Conv1D(channels=128, kernel_size=9, padding=4, strides=1, activation='relu') 88 | self.conv7 = nn.Conv1D(channels=128, kernel_size=13, padding=6, strides=1, activation='relu') 89 | self.conv_drop = nn.Dropout(0.2) 90 | 91 | def forward(self, x): 92 | gru_out = self.gru(x) 93 | gru_out_t = nd.transpose(gru_out, axes=(0,2,1)) 94 | 95 | x_t = nd.transpose(x, axes=(0,2,1)) 96 | conv3_out = self.conv3(x_t) 97 | conv5_out = self.conv5(x_t) 98 | conv7_out = self.conv7(x_t) 99 | conv_out = nd.concat(*[conv3_out, conv5_out, conv7_out], dim=1) 100 | conv_out = self.conv_drop(conv_out) 101 | concated_feature = nd.concat(*[gru_out_t, conv_out], dim=1) 102 | return concated_feature 103 | 104 | # hard coding feature1 Block 105 | class FeatureBlock1(nn.Block): 106 | def __init__(self, **kwargs): 107 | super(FeatureBlock1, self).__init__(**kwargs) 108 | self.gru = rnn.GRU(128,layout='NTC',bidirectional=True, num_layers=1, dropout=0.2) 109 | self.conv3 = nn.Conv1D(channels=128, kernel_size=3, padding=1, strides=1, activation='relu') 110 | self.conv5 = nn.Conv1D(channels=128, kernel_size=3, padding=1, strides=1, activation='relu') 111 | self.conv7 = nn.Conv1D(channels=128, kernel_size=3, padding=1, strides=1, activation='relu') 112 | # self.gru_post_max = nn.MaxPool1D(pool_size=2) 113 | # self.gru_post_ave = nn.AvgPool1D(pool_size=2) 114 | self.gru_maxpool = nn.GlobalMaxPool1D() 115 | self.conv_maxpool = nn.GlobalMaxPool1D() 116 | ''' 117 | self.gru_avepool = nn.GlobalAvgPool1D() 118 | self.conv_avepool = nn.GlobalAvgPool1D() 119 | ''' 120 | self.conv_drop = nn.Dropout(0.5) 121 | 122 | def forward(self, x): 123 | x_t = nd.transpose(x, axes=(0,2,1)) 124 | conv3_out = self.conv3(x_t) 125 | conv5_out = self.conv5(conv3_out) + conv3_out 126 | conv7_out = self.conv7(conv5_out) + conv5_out 127 | # conv_out = nd.concat(*[conv3_out, conv5_out, conv7_out], dim=1) 128 | conv_out = self.conv_drop(conv7_out) 129 | conv_max_pooled = self.conv_maxpool(conv_out) 130 | 131 | 
gru_out = self.gru(x) 132 | gru_out_t = nd.transpose(gru_out, axes=(0,2,1)) 133 | # gru_pooled = nd.transpose(gru_out, axes=(0,2,1)) 134 | # gru_maxpooled = self.gru_post_max(gru_out_t) 135 | # return gru_maxpooled 136 | # gru_avepooled = self.gru_post_ave(gru_out_t) 137 | # gru_pooled = nd.concat(*[gru_maxpooled, gru_avepooled], dim=1) 138 | 139 | # gru_pooled = nd.concat(*[gru_maxpooled, gru_avepooled], dim=1) 140 | gru_maxpooled = self.gru_maxpool(gru_out_t) 141 | # gru_avepooled = self.gru_maxpool(gru_out_t) 142 | # gru_pooled = nd.concat(*[gru_maxpooled, gru_avepooled], dim=1) 143 | 144 | # conv_ave_pooled = self.conv_avepool(conv_out) 145 | concated_feature = nd.concat(*[gru_maxpooled, conv_max_pooled], dim=1) 146 | return concated_feature 147 | -------------------------------------------------------------------------------- /postprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from preprocess import get_label 4 | 5 | result_path = 'result.csv' 6 | result = pd.read_csv(result_path) 7 | labels = ['toxic', 'severe_toxic', 8 | 'obscene', 'threat', 9 | 'insult', 'identity_hate'] 10 | mean_result = get_label(result) 11 | PROBABILITIES_NORMALIZE_COEFFICIENT = 1.4 12 | mean_result **= PROBABILITIES_NORMALIZE_COEFFICIENT 13 | result[labels] = mean_result 14 | result.to_csv('postprocessing1.csv', index=False) 15 | -------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | import pandas as pd 5 | import config 6 | from tqdm import tqdm 7 | from multiprocessing import Pool 8 | from keras.preprocessing import text, sequence 9 | from bad_dict import get_bad_word_dict 10 | from rake_parse import rake_parse 11 | 12 | def get_raw_data(path): 13 | data = pd.read_csv(path) 14 | process_data = get_data(data) 15 | data['comment_text'] = process_data 16 | return data 17 | 18 | def get_data(raw_data): 19 | raw_value = raw_data['comment_text'].fillna("_na_").values 20 | pool = Pool() 21 | processed_data = list(tqdm(pool.imap(text_parse, raw_value),total=raw_value.shape[0])) 22 | ''' 23 | with open('debug.txt', 'w') as f: 24 | for l in processed_data: 25 | f.write(l+'\n') 26 | ''' 27 | return processed_data 28 | 29 | def text_parse(text, remove_stopwords=False, stem_words=False): 30 | wiki_reg=r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 31 | url_reg=r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 32 | url_reg2=r'www.[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 33 | ip_reg='\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 34 | WIKI_LINK=' WIKILINKREPLACER ' 35 | URL_LINK=' URLLINKREPLACER ' 36 | IP_LINK=' IPLINKREPLACER ' 37 | #clear link 38 | # replace endline with '. ' 39 | endline = re.compile(r'.?\n', re.IGNORECASE) 40 | text = endline.sub('. 
', text) 41 | 42 | c = re.findall(wiki_reg, text) 43 | for u in c: 44 | text = text.replace(u, WIKI_LINK) 45 | c = re.findall(url_reg, text) 46 | for u in c: 47 | text = text.replace(u, URL_LINK) 48 | c = re.findall(url_reg2, text) 49 | for u in c: 50 | text = text.replace(u, URL_LINK) 51 | c = re.findall(ip_reg, text) 52 | for u in c: 53 | text = text.replace(u, IP_LINK) 54 | 55 | bad_word_dict = get_bad_word_dict() 56 | # Regex to remove all Non-Alpha Numeric and space 57 | special_character_removal = re.compile(r'[^A-Za-z\d!?*\'.,; ]', re.IGNORECASE) 58 | # regex to replace all numerics 59 | replace_numbers = re.compile(r'\b\d+\b', re.IGNORECASE) 60 | text = text.lower().split() 61 | # Optionally, remove stop words 62 | if remove_stopwords: 63 | stops = set(stopwords.words("english")) 64 | text = [w for w in text if not w in stops] 65 | text = " ".join(text) 66 | # Remove Special Characters 67 | text = special_character_removal.sub(' ', text) 68 | for k,v in bad_word_dict.items(): 69 | # bad_reg = re.compile('[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ]'+ re.escape(k) +'[!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n ]') 70 | bad_reg = re.compile('[\W]?'+ re.escape(k) +'[\W]|[\W]' + re.escape(k) + '[\W]?') 71 | text = bad_reg.sub(' '+ v +' ', text) 72 | ''' 73 | bad_reg = re.compile('[\W]'+ re.escape(k) +'[\W]?') 74 | text = bad_reg.sub(' '+ v, text) 75 | bad_reg = re.compile('[\W]?'+ re.escape(k) +'[\W]') 76 | text = bad_reg.sub(v + ' ', text) 77 | ''' 78 | 79 | # Replace Numbers 80 | text = replace_numbers.sub('NUMBERREPLACER', text) 81 | text =text.split() 82 | text = " ".join(text) 83 | 84 | if stem_words: 85 | text = text.split() 86 | stemmer = SnowballStemmer('english') 87 | stemmed_words = [stemmer.stem(word) for word in text] 88 | text = " ".join(stemmed_words) 89 | # rake parsing 90 | text = rake_parse(text) 91 | return text 92 | 93 | def text_to_wordlist(text, remove_stopwords=False, stem_words=False): 94 | # Clean the text, with the option to remove stopwords and to stem words. 
95 | # Convert words to lower case and split them 96 | wiki_reg=r'https?://en.wikipedia.org/[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 97 | url_reg=r'https?://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]' 98 | ip_reg='\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}' 99 | WIKI_LINK=' WIKI_LINK ' 100 | URL_LINK=' URL_LINK ' 101 | IP_LINK=' IP_LINK ' 102 | #clear link 103 | c = re.findall(wiki_reg, text) 104 | for u in c: 105 | text = text.replace(u, WIKI_LINK) 106 | c = re.findall(url_reg, text) 107 | for u in c: 108 | text = text.replace(u, WIKI_LINK) 109 | c = re.findall(wiki_reg, text) 110 | for u in c: 111 | text = text.replace(u, URL_LINK) 112 | c = re.findall(ip_reg, text) 113 | 114 | # Regex to remove all Non-Alpha Numeric and space 115 | special_character_removal = re.compile(r'[^A-Za-z\d!?*\' ]', re.IGNORECASE) 116 | # regex to replace all numerics 117 | replace_numbers = re.compile(r'\d+', re.IGNORECASE) 118 | 119 | # text = text.lower().split() 120 | text = text.split() 121 | # Optionally, remove stop words 122 | if remove_stopwords: 123 | stops = set(stopwords.words("english")) 124 | text = [w for w in text if not w in stops] 125 | 126 | text = " ".join(text) 127 | # Remove Special Characters 128 | text = special_character_removal.sub('', text) 129 | # Replace Numbers 130 | text = replace_numbers.sub('NUMBERREPLACER', text) 131 | # Optionally, shorten words to their stems 132 | if stem_words: 133 | text = text.split() 134 | stemmer = SnowballStemmer('english') 135 | stemmed_words = [stemmer.stem(word) for word in text] 136 | text = " ".join(stemmed_words) 137 | # Return a list of words 138 | return (text) 139 | 140 | 141 | def get_label(raw_data): 142 | labels = ['toxic', 'severe_toxic', 143 | 'obscene', 'threat', 144 | 'insult', 'identity_hate'] 145 | return raw_data[labels].values 146 | 147 | def get_id(raw_data): 148 | return raw_data['id'].values 149 | 150 | def process_data(train_data, test_data): 151 | # tokenizer = text.Tokenizer(num_words=config.MAX_WORDS, 152 | # filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n') 153 | tokenizer = text.Tokenizer(num_words=config.MAX_WORDS) 154 | tokenizer.fit_on_texts(train_data+test_data) 155 | train_tokenized = tokenizer.texts_to_sequences(train_data) 156 | test_tokenized = tokenizer.texts_to_sequences(test_data) 157 | train_data = sequence.pad_sequences(train_tokenized, maxlen = config.MAX_LENGTH) 158 | test_data = sequence.pad_sequences(test_tokenized, maxlen = config.MAX_LENGTH) 159 | return train_data, test_data, tokenizer.word_index 160 | 161 | def get_word_embedding(): 162 | data_path = 'data' 163 | # raw_embed = 'crawl-300d-2M.vec' 164 | raw_embed = 'glove.840B.300d.txt' 165 | EMBEDDING_FILE = os.path.join(data_path, raw_embed) 166 | embeddings_index = {} 167 | for line in open(EMBEDDING_FILE, "rb"): 168 | values = line.split() 169 | word = values[0] 170 | coefs = np.asarray(values[1:], dtype='float32') 171 | embeddings_index[word] = coefs 172 | print (len(embeddings_index)) 173 | return embeddings_index 174 | 175 | def get_embed_matrix(embeddings_index, word_index): 176 | nb_words = min(config.MAX_WORDS, len(word_index)) 177 | embedding_matrix = np.empty((nb_words, config.EMBEDDING_DIM)) 178 | # embedding_matrix = np.random.rand(nb_words, config.EMBEDDING_DIM) 179 | for word, i in word_index.items(): 180 | if i >= config.MAX_WORDS: 181 | continue 182 | word_parts = word.split('_') 183 | embedding_vectors = [embeddings_index.get(w) for w in word_parts] 184 | embedding_vectors = np.array([v if v is not None else 
np.random.rand(config.EMBEDDING_DIM) for v in embedding_vectors]) 185 | # embedding_matrix[i] = np.sum(embedding_vectors, axis=0)/np.linalg.norm(np.sum(embedding_vectors, axis=0)) 186 | embedding_matrix[i] = np.sum(embedding_vectors, axis=0)/embedding_vectors.shape[0] 187 | 188 | return embedding_matrix 189 | 190 | def fetch_data(aug=False): 191 | data_path = 'data' 192 | train = 'train.csv' 193 | test = 'test.csv' 194 | train_raw = get_raw_data(os.path.join(data_path, train)) 195 | test_raw = get_raw_data(os.path.join(data_path, test)) 196 | 197 | if aug: 198 | train_de = 'train_de.csv' 199 | train_fr = 'train_fr.csv' 200 | train_es = 'train_es.csv' 201 | train_de_raw = get_raw_data(os.path.join(data_path, train_de)) 202 | train_es_raw = get_raw_data(os.path.join(data_path, train_es)) 203 | train_fr_raw = get_raw_data(os.path.join(data_path, train_fr)) 204 | train_raw = pd.concat([train_raw, train_de_raw, train_es_raw, train_fr_raw]).drop_duplicates('comment_text') 205 | train_data = list(train_raw['comment_text'].fillna("_na_").values) 206 | test_data = list(test_raw['comment_text'].fillna("_na_").values) 207 | train_label = get_label(train_raw) 208 | # print train_raw 209 | # train_de_data = get_data(train_de_raw) 210 | # train_de_label = get_label(train_de_raw) 211 | #train_es_data = get_data(train_es_raw) 212 | # train_es_label = get_label(train_es_raw) 213 | # train_fr_data = get_data(train_fr_raw) 214 | # train_fr_label = get_label(train_fr_raw) 215 | # train_data = train_data + train_de_data + train_fr_data + train_es_data 216 | # train_label = np.vstack((train_label, train_de_label, train_fr_label, train_es_label)) 217 | 218 | train_data, test_data, word_index = process_data(train_data, test_data) 219 | return train_data, train_label, word_index 220 | 221 | def fetch_test_data(aug=False): 222 | data_path = 'data' 223 | train = 'train.csv' 224 | test = 'test.csv' 225 | train_raw = get_raw_data(os.path.join(data_path, train)) 226 | test_raw = get_raw_data(os.path.join(data_path, test)) 227 | if aug: 228 | train_de = 'train_de.csv' 229 | train_fr = 'train_fr.csv' 230 | train_es = 'train_es.csv' 231 | train_de_raw = get_raw_data(os.path.join(data_path, train_de)) 232 | train_es_raw = get_raw_data(os.path.join(data_path, train_es)) 233 | train_fr_raw = get_raw_data(os.path.join(data_path, train_fr)) 234 | train_raw = pd.concat([train_raw, train_de_raw, train_es_raw, train_fr_raw]).drop_duplicates('comment_text') 235 | train_data = list(train_raw['comment_text'].fillna("_na_").values) 236 | test_data = list(test_raw['comment_text'].fillna("_na_").values) 237 | train_data, test_data, word_index = process_data(train_data, test_data) 238 | test_id = get_id(test_raw) 239 | return test_data, test_id 240 | 241 | if __name__ == '__main__': 242 | # embedding_dict = get_word_embedding() 243 | # data, label, word_index = fetch_data() 244 | # print(np.sum(label, axis=0).astype(float) / label.shape[0]) 245 | # em = get_embed_matrix(embedding_dict, word_index) 246 | # print(em.shape) 247 | # reverse_idx = {v:k for k,v in word_index.items()} 248 | # reverse_idx[0] = 'NOTHING' 249 | # for i in range(100): 250 | # print [reverse_idx[v] for v in data[i] if v!=0] 251 | 252 | data_path = 'data' 253 | train = 'train.csv' 254 | test = 'test.csv' 255 | train_raw = pd.read_csv(os.path.join(data_path, train)) 256 | raw_value = train_raw['comment_text'].fillna("_na_").values 257 | # processed_data = [] 258 | # for i, v in enumerate(raw_value): 259 | # text_parse(v) 260 | a = raw_value[8306] 261 | word_index = 
{k:i+1 for i,k in enumerate(text_parse(a))} 262 | embedding_dict = get_word_embedding() 263 | em = get_embed_matrix(embedding_dict, word_index) 264 | 265 | ''' 266 | r = Rake() 267 | r.extract_keywords_from_text(text_parse(a)) 268 | print r.get_ranked_phrases() 269 | ''' 270 | -------------------------------------------------------------------------------- /rake_parse.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | from rake_nltk import Rake 5 | from bad_dict import get_bad_word_dict 6 | 7 | def rake_parse(line): 8 | r = Rake() 9 | r.extract_keywords_from_text(line) 10 | word_combines = r.get_ranked_phrases() 11 | word_combines = [k for k in word_combines if len(k.split()) > 1] 12 | # filter out bad word combines 13 | bad_word_dict = get_bad_word_dict() 14 | word_replacer = {} 15 | for k in word_combines: 16 | if any(map(lambda x : k.find(x) >= 0, bad_word_dict.values())): 17 | continue 18 | word_replacer[k] = '_'.join(k.split()) 19 | 20 | for k,v in word_replacer.items(): 21 | line = line.replace(k,v) 22 | return line 23 | 24 | if __name__ == '__main__': 25 | from preprocess import text_parse 26 | data_path = 'data' 27 | train = 'train.csv' 28 | test = 'test.csv' 29 | train_raw = pd.read_csv(os.path.join(data_path, train)) 30 | raw_value = train_raw['comment_text'].fillna("_na_").values 31 | a = raw_value[100] 32 | print a 33 | a = text_parse(a) 34 | print a 35 | 36 | -------------------------------------------------------------------------------- /rename_result.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | kfold=10 3 | rename = 0 4 | for i in range(kfold): 5 | # result_path = 'data/result'+str(i)+'.csv' 6 | result_path = 'result'+str(i)+'.csv' 7 | result_new_path = 'result'+str(rename) + str(i)+'.csv' 8 | shutil.move(result_path, result_new_path) 9 | 10 | 11 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | import numpy as np 6 | from preprocess import fetch_test_data 7 | from mxnet.gluon import Trainer 8 | from mxnet.gluon.data import DataLoader,Dataset 9 | from mxnet.io import NDArrayIter 10 | from mxnet.ndarray import array 11 | from mxnet import nd 12 | from net import net_define, net_define_eu 13 | import config 14 | 15 | if __name__ == "__main__": 16 | # setting the hyper parameters 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--batch_size', default=128, type=int) 19 | parser.add_argument('--epochs', default=100, type=int) 20 | parser.add_argument('--gpu', default=0, type=int) 21 | args = parser.parse_args() 22 | 23 | # ctx = mx.cpu()# gpu(7) 24 | ctx = mx.gpu(args.gpu) 25 | net = net_define_eu() 26 | # net = net_define() 27 | net.collect_params().reset_ctx(ctx) 28 | net.load_params('net0.params', ctx) 29 | 30 | test_data, test_id = fetch_test_data() 31 | data_iter = NDArrayIter(data= test_data, batch_size=args.batch_size, shuffle=False) 32 | with open('result.csv','w') as f: 33 | f.write('id,toxic,severe_toxic,obscene,threat,insult,identity_hate\n') 34 | for i, d in enumerate(data_iter): 35 | print(i) 36 | output=net(d.data[0].as_in_context(ctx)).asnumpy() 37 | for j in range(args.batch_size): 38 | if i*args.batch_size + j < test_id.shape[0]: 39 | str_out = ','.join([str(test_id[i*args.batch_size+j])] + 
[str(v) for v in output[j]])+'\n' 40 | f.write(str_out) 41 | 42 | 43 | -------------------------------------------------------------------------------- /test_k_fold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | import numpy as np 6 | from preprocess import fetch_test_data 7 | from mxnet.gluon import Trainer 8 | from mxnet.gluon.data import DataLoader,Dataset 9 | from mxnet.io import NDArrayIter 10 | from mxnet.ndarray import array 11 | from mxnet import nd 12 | from net import net_define, net_define_eu 13 | import config 14 | 15 | if __name__ == "__main__": 16 | # setting the hyper parameters 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--batch_size', default=128, type=int) 19 | parser.add_argument('--kfold', default=5, type=int) 20 | parser.add_argument('--gpu', default=0, type=int) 21 | 22 | args = parser.parse_args() 23 | 24 | # ctx = mx.cpu()# gpu(7) 25 | 26 | test_data, test_id = fetch_test_data() 27 | data_iter = NDArrayIter(data= test_data, batch_size=args.batch_size, shuffle=False) 28 | for i in range(args.kfold): 29 | print(i) 30 | ctx = mx.gpu(args.gpu) 31 | net = net_define_eu() 32 | net.collect_params().reset_ctx(ctx) 33 | net.load_params('net'+str(i)+'.params', ctx) 34 | data_iter.reset() 35 | with open('result'+str(i)+'.csv','w') as f: 36 | f.write('id,toxic,severe_toxic,obscene,threat,insult,identity_hate\n') 37 | for i, d in enumerate(data_iter): 38 | output=net(d.data[0].as_in_context(ctx)).asnumpy() 39 | for j in range(args.batch_size): 40 | if i*args.batch_size + j < test_id.shape[0]: 41 | str_out = ','.join([str(test_id[i*args.batch_size+j])] + [str(v) for v in output[j]])+'\n' 42 | f.write(str_out) 43 | 44 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | import numpy as np 6 | from preprocess import fetch_data, get_word_embedding, get_embed_matrix 7 | from mxnet.gluon import Trainer 8 | from mxnet.gluon.data import DataLoader,Dataset 9 | from mxnet.io import NDArrayIter 10 | from mxnet.ndarray import array 11 | from mxnet import nd 12 | from net import net_define, net_define_eu 13 | import utils 14 | import config 15 | 16 | def CapLoss(y_pred, y_true): 17 | L = y_true * nd.square(nd.maximum(0., 0.9 - y_pred)) + \ 18 | 0.5 * (1 - y_true) * nd.square(nd.maximum(0., y_pred - 0.1)) 19 | return nd.mean(nd.sum(L, 1)) 20 | 21 | def EntropyLoss(y_pred, y_true): 22 | L = - y_true*nd.log2(y_pred) - (1-y_true) * nd.log2(1-y_pred) 23 | return nd.mean(L) 24 | 25 | def EntropyLoss1(y_pred, y_true): 26 | train_pos_ratio = array([ 0.09584448, 0.00999555, 0.05294822, 0.00299553, 0.04936361, 0.00880486], ctx=y_pred.context, dtype=np.float32)*10 27 | train_neg_ratio = (1.0-train_pos_ratio)*10 28 | L = - y_true*nd.log2(y_pred) * train_neg_ratio - (1-y_true) * nd.log2(1-y_pred) * train_pos_ratio 29 | return nd.mean(L) 30 | 31 | if __name__ == "__main__": 32 | # setting the hyper parameters 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--batch_size', default=128, type=int) 35 | parser.add_argument('--epochs', default=2, type=int) 36 | parser.add_argument('--gpu', default=0, type=int) 37 | args = parser.parse_args() 38 | ctx = mx.gpu(args.gpu) 39 | net = net_define_eu() 40 | 41 | train_data, train_label, word_index = fetch_data() 42 | 
embedding_dict = get_word_embedding() 43 | # print len(word_index) 44 | em = get_embed_matrix(embedding_dict, word_index) 45 | net.collect_params().reset_ctx(ctx) 46 | em = array(em, ctx=mx.cpu()) 47 | net.collect_params()['sequential0_embedding0_weight'].set_data(em) 48 | net.collect_params()['sequential0_embedding0_weight'].grad_req = 'null' 49 | 50 | print_batches = 100 51 | shuffle_idx = np.random.permutation(train_data.shape[0]) 52 | train_data = train_data[shuffle_idx] 53 | train_label = train_label[shuffle_idx] 54 | 55 | data_iter = NDArrayIter(data= train_data[:-5000], label=train_label[:-5000], batch_size=args.batch_size, shuffle=True) 56 | val_data_iter = NDArrayIter(data= train_data[-5000:], label=train_label[-5000:], batch_size=args.batch_size, shuffle=False) 57 | trainer = Trainer(net.collect_params(),'adam', {'learning_rate': 0.001}) 58 | # trainer = Trainer(net.collect_params(),'RMSProp', {'learning_rate': 0.001}) 59 | utils.train(data_iter, val_data_iter, net, EntropyLoss, 60 | trainer, ctx, num_epochs=args.epochs, print_batches=print_batches) 61 | -------------------------------------------------------------------------------- /train_k_fold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | from mxnet import init 6 | import numpy as np 7 | from preprocess import fetch_data, get_word_embedding, get_embed_matrix 8 | from mxnet.gluon import Trainer 9 | from mxnet.gluon.data import DataLoader,Dataset 10 | from mxnet.io import NDArrayIter 11 | from mxnet.ndarray import array 12 | from mxnet import nd 13 | from net import net_define, net_define_eu 14 | from sklearn.model_selection import KFold, StratifiedKFold 15 | import utils 16 | import config 17 | 18 | def CapLoss(y_pred, y_true): 19 | L = y_true * nd.square(nd.maximum(0., 0.9 - y_pred)) + \ 20 | 0.5 * (1 - y_true) * nd.square(nd.maximum(0., y_pred - 0.1)) 21 | return nd.mean(nd.sum(L, 1)) 22 | 23 | def EntropyLoss(y_pred, y_true, train_pos_ratio=None): 24 | L = - y_true*(1-y_pred)**2*nd.log2(y_pred) - (1-y_true) * nd.log2(1-y_pred)*y_pred**2 25 | return nd.mean(L) 26 | 27 | def EntropyLoss1(y_pred, y_true, train_pos_ratio): 28 | scale = 10 29 | train_pos_ratio = array(train_pos_ratio, ctx=y_pred.context, dtype=np.float32) * scale 30 | train_neg_ratio = (scale - train_pos_ratio) 31 | L = - y_true*nd.log2(y_pred) * train_neg_ratio - (1-y_true) * nd.log2(1-y_pred)*train_pos_ratio 32 | return nd.mean(L) 33 | 34 | if __name__ == "__main__": 35 | # setting the hyper parameters 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--batch_size', default=256, type=int) 38 | parser.add_argument('--epochs', default=3, type=int) 39 | parser.add_argument('--gpu', default=0, type=int) 40 | parser.add_argument('--kfold', default=10, type=int) 41 | parser.add_argument('--print_batches', default=100, type=int) 42 | args = parser.parse_args() 43 | 44 | train_data, train_label, word_index = fetch_data() 45 | embedding_dict = get_word_embedding() 46 | em = get_embed_matrix(embedding_dict, word_index) 47 | em = array(em, ctx=mx.cpu()) 48 | kf_label = np.ones(train_label.shape) 49 | for i in range(train_label.shape[1]): 50 | kf_label[:,i] = 2**i 51 | kf_label = np.sum(kf_label, axis=1) 52 | 53 | ctx = [mx.gpu(0)] 54 | net = net_define_eu() 55 | 56 | kf = StratifiedKFold(n_splits=args.kfold, shuffle=True) 57 | for i, (inTr, inTe) in enumerate(kf.split(train_data, kf_label)): 58 | print('fold: ', i) 59 | 
net.collect_params().initialize(init=init.Xavier(), force_reinit=True) 60 | xtr = train_data[inTr] 61 | xte = train_data[inTe] 62 | ytr = train_label[inTr] 63 | yte = train_label[inTe] 64 | pos_tr_ratio = np.sum(ytr, axis=0)/float(ytr.shape[0]) 65 | pos_tr_ratio = np.ones(pos_tr_ratio.shape)*0.5 66 | data_iter = NDArrayIter(data= xtr, label=ytr, batch_size=args.batch_size, shuffle=True) 67 | val_data_iter = NDArrayIter(data= xte, label=yte, batch_size=args.batch_size, shuffle=False) 68 | 69 | # print net.collect_params() 70 | net.collect_params().reset_ctx(ctx) 71 | net.collect_params()['sequential0_embedding0_weight'].set_data(em) 72 | net.collect_params()['sequential0_embedding0_weight'].grad_req = 'null' 73 | # net.collect_params()['sequential'+str(i)+ '_embedding0_weight'].set_data(em) 74 | # net.collect_params()['sequential'+str(i)+ '_embedding0_weight'].grad_req = 'null' 75 | trainer = Trainer(net.collect_params(),'adam', {'learning_rate': 0.001}) 76 | # trainer = Trainer(net.collect_params(),'RMSProp', {'learning_rate': 0.01,'clip_weights' : 1}) 77 | utils.train_multi(data_iter, val_data_iter, i, net, EntropyLoss1, 78 | trainer, ctx, num_epochs=args.epochs, print_batches=args.print_batches, pos_tr_ratio=pos_tr_ratio) 79 | -------------------------------------------------------------------------------- /train_multi.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import argparse 4 | import mxnet as mx 5 | import numpy as np 6 | from preprocess import fetch_data, get_word_embedding, get_embed_matrix 7 | from mxnet.gluon import Trainer 8 | from mxnet.gluon.data import DataLoader,Dataset 9 | from mxnet.io import NDArrayIter 10 | from mxnet.ndarray import array 11 | from mxnet import nd 12 | from net import net_define, net_define_eu 13 | import utils 14 | import config 15 | 16 | def CapLoss(y_pred, y_true): 17 | L = y_true * nd.square(nd.maximum(0., 0.9 - y_pred)) + \ 18 | 0.5 * (1 - y_true) * nd.square(nd.maximum(0., y_pred - 0.1)) 19 | return nd.mean(nd.sum(L, 1)) 20 | 21 | def EntropyLoss(y_pred, y_true): 22 | L = - y_true*nd.log2(y_pred) - (1-y_true) * nd.log2(1-y_pred) 23 | return nd.mean(L) 24 | 25 | def EntropyLoss1(y_pred, y_true): 26 | train_pos_ratio = array([ 0.09584448, 0.00999555, 0.05294822, 0.00299553, 0.04936361, 0.00880486], ctx=y_pred.context, dtype=np.float32)*10 27 | train_neg_ratio = (1.0-train_pos_ratio)*10 28 | L = - y_true*nd.log2(y_pred) * train_neg_ratio - (1-y_true) * nd.log2(1-y_pred) * train_pos_ratio 29 | return nd.mean(L) 30 | 31 | if __name__ == "__main__": 32 | # setting the hyper parameters 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--batch_size', default=128, type=int) 35 | parser.add_argument('--epochs', default=2, type=int) 36 | parser.add_argument('--gpu', default=0, type=int) 37 | args = parser.parse_args() 38 | ''' 39 | train_data = np.random.randint(0, high=config.MAX_WORDS, size=(10000, config.MAX_LENGTH)) 40 | train_label = np.random.randint(0, high=2, size=(10000, 6)) 41 | ''' 42 | ctx = [mx.gpu(2), mx.gpu(3), mx.gpu(4), mx.gpu(5)] 43 | net = net_define_eu() 44 | # net.initialize(mx.init.Xavier(),ctx=ctx) 45 | 46 | train_data, train_label, word_index = fetch_data(True) 47 | embedding_dict = get_word_embedding() 48 | em = get_embed_matrix(embedding_dict, word_index) 49 | # print 'copy array' 50 | em = array(em, ctx=mx.cpu()) 51 | # print 'copy array done' 52 | net.collect_params()['sequential0_embedding0_weight'].set_data(em) 53 | 
net.collect_params().reset_ctx(ctx) 54 | print net.collect_params() 55 | 56 | print_batches = 1000 57 | shuffle_idx = np.random.permutation(train_data.shape[0]) 58 | train_data = train_data[shuffle_idx] 59 | train_label = train_label[shuffle_idx] 60 | 61 | # print em.shape 62 | data_iter = NDArrayIter(data= train_data[:-10000], label=train_label[:-10000], batch_size=args.batch_size, shuffle=True) 63 | val_data_iter = NDArrayIter(data= train_data[-10000:], label=train_label[-10000:], batch_size=args.batch_size, shuffle=False) 64 | trainer = Trainer(net.collect_params(),'adam', {'learning_rate': 0.001}) 65 | # trainer = Trainer(net.collect_params(),'RMSProp', {'learning_rate': 0.001}) 66 | # utils.train(data_iter, val_data_iter, net, EntropyLoss, 67 | # trainer, ctx, num_epochs=args.epochs, print_batches=print_batches) 68 | utils.train_multi(data_iter, val_data_iter, net, EntropyLoss, 69 | trainer, ctx, num_epochs=args.epochs, print_batches=print_batches) 70 | net.save_params('net.params') 71 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from mxnet import gluon 2 | from mxnet import autograd 3 | from mxnet import nd 4 | from mxnet import image 5 | from sklearn.metrics import roc_auc_score, confusion_matrix 6 | import mxnet as mx 7 | import numpy as np 8 | import time 9 | 10 | def try_gpu(): 11 | """If GPU is available, return mx.gpu(0); else return mx.cpu()""" 12 | try: 13 | ctx = mx.gpu() 14 | _ = nd.zeros((1,), ctx=ctx) 15 | except: 16 | ctx = mx.cpu() 17 | return ctx 18 | 19 | def accuracy(output, label): 20 | L = -label*np.log2(output) - (1-label) * np.log2(1-output) 21 | return np.mean(L) 22 | 23 | def _get_batch(batch, ctx): 24 | """return data and label on ctx""" 25 | data = batch.data[0] 26 | label = batch.label[0] 27 | # data, label = gluon.utils.split_and_load(batch, ctx) 28 | return data.as_in_context(ctx), label.as_in_context(ctx) 29 | 30 | def _get_batch_multi(batch, ctx, Train=True): 31 | # naive random shuffle 32 | if Train: 33 | npdata = batch.data[0].asnumpy() 34 | np_roll_data = np.roll(npdata, axis=1, shift=np.random.randint(npdata.shape[1])) 35 | nd_data = nd.array(np_roll_data ) 36 | data = gluon.utils.split_and_load(nd_data, ctx) 37 | else: 38 | data = gluon.utils.split_and_load(batch.data[0], ctx) 39 | label = gluon.utils.split_and_load(batch.label[0], ctx) 40 | return data, label 41 | 42 | def evaluate_accuracy(data_iterator, net, ctx=mx.gpu()): 43 | acc = 0. 
44 | for i, batch in enumerate(data_iterator): 45 | data, label = _get_batch(batch, ctx) 46 | output = net(data) 47 | acc += accuracy(output, label) 48 | return acc / (i+1) 49 | 50 | def evaluate_accuracy_multi(data_iterator, net, ctx): 51 | data_iterator.reset() 52 | acc = 0 53 | dummy_label = np.zeros((0,6)) 54 | dummy_pred = np.zeros((0,6)) 55 | t1 = time.time() 56 | for i, batch in enumerate(data_iterator): 57 | data, label = _get_batch_multi(batch, ctx, False) 58 | # acc += np.mean([accuracy(net(X), Y) for X, Y in zip(data, label)]) 59 | # acc += np.mean([roc_auc_score(Y.asnumpy(), net(X).asnumpy()) for X, Y in zip(data, label)]) 60 | output = np.vstack((net(X).asnumpy() for X in data)) 61 | labels = np.vstack((Y.asnumpy() for Y in label)) 62 | dummy_label = np.vstack((dummy_label, labels)) 63 | dummy_pred = np.vstack((dummy_pred, output)) 64 | # return acc / (i+1) 65 | # print dummy_label.shape, dummy_pred.shape 66 | dummy_pred_label = dummy_pred > 0.5 67 | for i in range(dummy_label.shape[1]): 68 | print i, confusion_matrix(dummy_label[:,i], dummy_pred_label[:,i]) 69 | 70 | return roc_auc_score(dummy_label, dummy_pred), accuracy(dummy_pred, dummy_label), time.time() - t1 71 | 72 | 73 | def train(train_data, test_data, net, loss, trainer, 74 | ctx, num_epochs, print_batches=None): 75 | """Train a network""" 76 | min_loss = 100000 77 | for epoch in range(num_epochs): 78 | train_loss = 0. 79 | train_acc = 0. 80 | n = 0 81 | for i, batch in enumerate(train_data): 82 | data, label = _get_batch(batch, ctx) 83 | with autograd.record(): 84 | output = net(data) 85 | L = loss(output, label) 86 | L.backward() 87 | trainer.step(data.shape[0], ignore_stale_grad=True) 88 | train_loss += nd.mean(L).asscalar() 89 | train_acc += accuracy(output, label) 90 | n = i + 1 91 | if print_batches and n % print_batches == 0: 92 | test_acc = evaluate_accuracy(test_data, net, ctx) 93 | test_data.reset() 94 | print("Batch %d. Loss: %f, Train acc %f, Test Loss %f" % ( 95 | n, train_loss/n, train_acc/n, test_acc)) 96 | if test_acc < min_loss: 97 | min_loss = test_acc 98 | net.save_params('net.params') 99 | test_acc = evaluate_accuracy(test_data, net, ctx) 100 | train_data.reset() 101 | test_data.reset() 102 | print("Epoch %d. Loss: %f, Train acc %f, Test Loss %f" % ( 103 | epoch, train_loss/n, train_acc/n, test_acc)) 104 | if test_acc < min_loss: 105 | min_loss = test_acc 106 | net.save_params('net.params') 107 | 108 | def train_multi(train_data, test_data, iteration, net, loss, trainer, 109 | ctx, num_epochs, print_batches=None, pos_tr_ratio=None): 110 | """Train a network""" 111 | min_loss = 0 112 | for epoch in range(num_epochs): 113 | train_loss = 0. 114 | train_acc = 0. 115 | n = 0 116 | for i, batch in enumerate(train_data): 117 | data, label = _get_batch_multi(batch, ctx) 118 | with autograd.record(): 119 | losses = [loss(net(X), Y, pos_tr_ratio) for X, Y in zip(data, label)] 120 | for l in losses: 121 | l.backward() 122 | trainer.step(batch.data[0].shape[0], ignore_stale_grad=True) 123 | train_loss += np.mean([nd.mean(l).asscalar() for l in losses]) 124 | # train_acc += accuracy(output, label) 125 | n = i + 1 126 | if print_batches and n % print_batches == 0: 127 | test_acc, test_loss, eval_time = evaluate_accuracy_multi(test_data, net, ctx) 128 | print("Batch %d. 
Loss: %f, Test roc_auc: %f, test_loss: %f , eval time: %f" % ( 129 | n, train_loss/n, test_acc, test_loss, eval_time)) 130 | if test_acc > min_loss: 131 | min_loss = test_acc 132 | net.save_params('net'+str(iteration)+'.params') 133 | 134 | train_data.reset() 135 | test_acc, test_loss, eval_time = evaluate_accuracy_multi(test_data, net, ctx) 136 | print("Epoch %d. Loss: %f, roc_auc: %f, test_loss: %f , eval time: %f" % ( 137 | epoch, train_loss/n, test_acc, test_loss, eval_time)) 138 | if test_acc > min_loss: 139 | min_loss = test_acc 140 | net.save_params('net'+str(iteration)+'.params') 141 | 142 | --------------------------------------------------------------------------------