├── .gitignore ├── .travis.yml ├── MANIFEST.in ├── README.md ├── examples ├── test.py └── utils.py ├── requirements.txt ├── setup.py ├── tests ├── __init__.py └── simple_test.py └── text_classification ├── __init__.py ├── base.py ├── layers ├── __init__.py └── attention.py └── models ├── __init__.py ├── text_birnn.py ├── text_clstm1.py ├── text_clstm2.py ├── text_cnn.py ├── text_fasttext.py ├── text_han.py ├── text_mlp.py ├── text_rcnn.py ├── text_rnn.py └── text_textcnn.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .DS_Store 3 | bak/* 4 | clean.sh 5 | examples/data/* 6 | examples/model/* 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | script: 4 | - coverage run setup.py test 5 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include text_classification * -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Classification 2 | 3 | [![Build Status](https://travis-ci.org/qiangsiwei/text_classification.svg?branch=master)](https://travis-ci.org/qiangsiwei/text_classification) 4 | 5 | keras implementation of text classification algorithms 6 | 7 | Models: 8 | ===== 9 | 10 | 1. MLP 11 | 12 | 2. CNN 13 | 14 | 3. RNN 15 | 16 | 4. BiRNN 17 | 18 | 5. RCNN 19 | 20 | 6. HAN 21 | 22 | 7. CLSTM (series) 23 | 24 | 8. CLSTM (parallel) 25 | 26 | 9. TextCNN 27 | 28 | 10. 
FastText 29 | 30 | Install: 31 | ===== 32 | 33 | ```python 34 | python setup.py install 35 | ``` 36 | 37 | Usage: 38 | ===== 39 | 40 | ```python 41 | from text_classification import * 42 | 43 | clf = TextClassifierFastText() 44 | clf.fit(x,y,epochs=epochs,validation_split=validation_split) 45 | clf.predict(x) 46 | ``` 47 | 48 | Limitations: 49 | ---- 50 | 51 | 1. sentence length is limited by 'maxlen'; words beyond that limit are truncated 52 | 53 | 2. out-of-vocabulary words at prediction time are ignored by the Keras Tokenizer 54 | -------------------------------------------------------------------------------- /examples/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, sys, time, warnings 4 | warnings.filterwarnings('ignore') 5 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 6 | sys.path.append('..') 7 | from utils import get_data 8 | from functools import wraps 9 | from text_classification import * 10 | from text_classification.base import TextClassifierBase 11 | 12 | def timethis(func): 13 | @wraps(func) 14 | def wrapper(*args, **kwargs): 15 | print args[0].__name__ 16 | start = time.time() 17 | value = func(*args, **kwargs) 18 | print 'running time:', time.time()-start 19 | return value 20 | return wrapper 21 | 22 | @timethis 23 | def test_model_train(Classifier, x, y, epochs=10, validation_split=0): 24 | assert issubclass(Classifier,TextClassifierBase) 25 | clf = Classifier() 26 | clf.fit(x,y,epochs=epochs,validation_split=validation_split) 27 | return clf 28 | 29 | @timethis 30 | def test_model_dump(Classifier, x, y, epochs=10, dirn='model'): 31 | clf = Classifier() 32 | clf.fit(x,y,epochs=epochs) 33 | clf.dump_model(dirn) 34 | 35 | @timethis 36 | def test_model_load(Classifier, x, y, epochs=10, dirn='model'): 37 | clf = Classifier() 38 | clf.load_model(dirn) 39 | print clf.predict(x) 40 | 41 | if __name__ == '__main__': 42 | validation_split = 0.3 43 | # Classifier = TextClassifierMLP 44 | # 
def get_data(data_dir='data', n_classes=3):
    """Yield ``(line, label)`` pairs from the per-class text files.

    Class ``i`` is read from ``<data_dir>/<i>.txt`` for every ``i`` in
    ``range(n_classes)``. Each line of that file is yielded as UTF-8
    decoded text (line terminator kept) together with the integer ``i``.

    Args:
        data_dir: directory holding the ``<label>.txt`` files.
        n_classes: number of label files to read, starting at ``0.txt``.

    Raises:
        IOError: if one of the expected files cannot be opened.
    """
    # Function-scope imports keep this block usable without touching the
    # module's import list.
    import io
    import os

    for label in range(n_classes):
        path = os.path.join(data_dir, '{}.txt'.format(label))
        # io.open decodes while reading on both Python 2 and 3 (a Python 3
        # ``str`` has no ``decode``). The ``with`` block closes the file
        # even when the consumer stops iterating early, whereas fileinput's
        # module-level state would stay active and break the next call.
        with io.open(path, encoding='utf-8') as handle:
            for line in handle:
                yield line, label
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | 5 | def get_tests(): 6 | from simple_test import SimpleTestCase 7 | suite = unittest.TestLoader().loadTestsFromTestCase(SimpleTestCase) 8 | return unittest.TestSuite([suite]) 9 | -------------------------------------------------------------------------------- /tests/simple_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | 5 | class SimpleTestCase(unittest.TestCase): 6 | 7 | def test_import(self): 8 | import text_classification 9 | return True 10 | 11 | if __name__ == "__main__": 12 | suite = unittest.TestSuite() 13 | test_cases = ['test_import'] 14 | for test_case in test_cases: 15 | suite.addTest(SimpleTestCase(test_case)) 16 | unittest.TextTestRunner(verbosity=2).run(suite) 17 | -------------------------------------------------------------------------------- /text_classification/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from models import TextClassifierMLP 4 | from models import TextClassifierRNN 5 | from models import TextClassifierCNN 6 | from models import TextClassifierHAN 7 | from models import TextClassifierRCNN 8 | from models import TextClassifierBiRNN 9 | from models import TextClassifierCLSTM1 10 | from models import TextClassifierCLSTM2 11 | from models import TextClassifierTextCNN 12 | from models import TextClassifierFastText 13 | -------------------------------------------------------------------------------- /text_classification/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os, pickle, numpy as np 4 | from abc import abstractmethod 5 | from 
class DataCache(object):
    """Dictionary-like store whose keys are namespaced by the current mode.

    Every entry is stored under ``'<mode>_<name>'``, so the same name
    written under two different modes maps to two independent entries.
    """

    def __init__(self):
        self.mode = ''
        self.data = {}

    def __format_name__(self, name):
        """Return the internal storage key for *name* under the current mode."""
        return '{0}_{1}'.format(self.mode, name)

    def __setitem__(self, name, value):
        self.data[self.__format_name__(name)] = value

    def __getitem__(self, name):
        # A missing entry yields None rather than raising KeyError.
        return self.data.get(self.__format_name__(name))

    def __contains__(self, name):
        return self.__format_name__(name) in self.data

    def set_mode(self, mode):
        """Switch the namespace used by subsequent reads and writes."""
        self.mode = mode


class TextClassifierBase(object):
    """Shared fit / predict / persistence plumbing for the text classifiers.

    Subclasses supply ``init_model`` and thin ``fit`` / ``predict`` wrappers
    that name which prepared input(s) in ``self.data`` feed the model. They
    may override ``proc_fit_data`` / ``proc_pred_data`` to derive extra
    inputs.
    """

    def __init__(self, tokenizer=None, maxlen=50):
        """Create the caches and fall back to a default Keras ``Tokenizer``.

        Args:
            tokenizer: tokenizer object to use; a default ``Tokenizer`` is
                built when this is falsy.
            maxlen: length that ``x_sqs`` is padded / truncated to.
        """
        self.data = DataCache()   # per-call derived inputs (x_ids, x_sqs, ...)
        self.para = DataCache()   # settings pickled by dump_model
        self.model = None
        self.para['maxlen'] = maxlen
        self.para['tokenizer'] = tokenizer
        if not self.para['tokenizer']:
            self.para['tokenizer'] = Tokenizer(
                filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')
        # Passed to model_from_json so models containing AttLayer can be rebuilt.
        self.custom_layers = {'AttLayer': AttLayer}

    def __proc__x(self, x):
        """Derive the three standard input encodings of the texts *x*."""
        tokenizer = self.para['tokenizer']
        # Token-id sequences, one per text.
        self.data['x_ids'] = np.array(tokenizer.texts_to_sequences(x))
        # The same sequences padded / truncated to ``maxlen``.
        self.data['x_sqs'] = pad_sequences(self.data['x_ids'], maxlen=self.para['maxlen'])
        # Binary bag-of-words matrix.
        self.data['x_mtx'] = tokenizer.sequences_to_matrix(self.data['x_ids'], mode='binary')

    def init_fit_data(self, x, y, test_size=0):
        """Fit the tokenizer on *x* and prepare inputs and one-hot targets."""
        self.para['tokenizer'].fit_on_texts(x)
        self.vocab = self.para['tokenizer'].word_index
        self.__proc__x(x)
        self.data['y'] = to_categorical(y)
        self.labels = self.data['y'].shape[-1]

    def proc_fit_data(self, **kwargs):
        """Hook for subclasses: derive extra training inputs from ``fit`` options."""
        pass

    def init_pred_data(self, x):
        """Prepare inputs for *x* using the already fitted tokenizer."""
        self.vocab = self.para['tokenizer'].word_index
        self.__proc__x(x)

    def proc_pred_data(self, **kwargs):
        """Hook for subclasses: derive extra prediction inputs."""
        pass

    @abstractmethod
    def init_model(self):
        """Build, compile and return the Keras model."""
        pass

    def __get_x__(self, name):
        """Return the prepared input stored under *name* in ``self.data``."""
        assert isinstance(name, str)
        assert name in self.data
        return self.data[name]

    def __collect_inputs__(self, inputs):
        """Resolve one input name, or a sequence of names, to the data."""
        if isinstance(inputs, str):
            return self.__get_x__(inputs)
        # A real list, not ``map``: on Python 3 ``map`` returns a one-shot
        # iterator, which is not a valid multi-input argument.
        return [self.__get_x__(name) for name in inputs]

    def __fit__(self, x, y, inputs, **kwargs):
        """Prepare data, build the model and train it.

        Args:
            x: iterable of texts.
            y: class labels, one per text.
            inputs: name (or list of names) of the ``self.data`` entries
                fed to the model.
            **kwargs: ``epochs`` (default 10) and ``validation_split``
                (default 0) are passed to ``model.fit``; the full set of
                options is forwarded to ``proc_fit_data``.
        """
        self.mode = 'fit'
        self.init_fit_data(x, y)
        # Forward the options themselves. Passing ``kwargs=kwargs`` hid them
        # under a single 'kwargs' key, so hooks never saw e.g. ``gram_n``.
        self.proc_fit_data(**kwargs)
        self.model = self.init_model()
        self.model.fit(
            x=self.__collect_inputs__(inputs),
            y=self.data['y'],
            epochs=kwargs.get('epochs', 10),
            validation_split=kwargs.get('validation_split', 0))

    @abstractmethod
    def fit(self, x, y, **kwargs):
        """Train on texts *x* with labels *y*."""
        pass

    def __predict__(self, x, inputs, **kwargs):
        """Return the arg-max class index for each text in *x*."""
        self.mode = 'predict'
        assert self.model is not None
        self.init_pred_data(x)
        self.proc_pred_data(**kwargs)
        y = self.model.predict(self.__collect_inputs__(inputs))
        return y.argmax(axis=1)

    @abstractmethod
    def predict(self, x, **kwargs):
        """Predict class indices for texts *x*."""
        pass

    def __get_filenames__(self, dirn):
        """Return the (settings pickle, model JSON, weights) paths under *dirn*."""
        return (os.path.join(dirn, 'tokenizer.pkl'),
                os.path.join(dirn, 'model.json'),
                os.path.join(dirn, 'param.h5'))

    def dump_model(self, dirn):
        """Write settings, architecture and weights into directory *dirn*."""
        # Check first so a missing model does not leave an empty directory.
        assert self.model is not None  # more strict
        if not os.path.isdir(dirn):
            # makedirs also creates missing parent directories.
            os.makedirs(dirn)
        ft, fm, fp = self.__get_filenames__(dirn)
        with open(ft, 'wb') as out:
            pickle.dump(self.para, out)
        with open(fm, 'w') as out:
            out.write(self.model.to_json())
        self.model.save_weights(fp, overwrite=True)

    def load_model(self, dirn):
        """Restore settings, architecture and weights written by ``dump_model``."""
        assert os.path.isdir(dirn)
        ft, fm, fp = self.__get_filenames__(dirn)
        assert all(os.path.isfile(fn) for fn in (ft, fm, fp))
        # NOTE(security): unpickling runs arbitrary code from the file, so
        # only load model directories from a trusted source.
        with open(ft, 'rb') as src:
            self.para = pickle.load(src)
        with open(fm) as src:
            self.model = model_from_json(src.read(), self.custom_layers)
        self.model.load_weights(fp)
name='{}_u'.format(self.name),regularizer=self.kernel_regularizer,constraint=self.kernel_constraint) 26 | self.built = True 27 | def compute_mask(self, input, input_mask=None): 28 | return None 29 | def call(self, x, mask=None): 30 | ait = K.exp(K.tanh(K.squeeze(K.dot(x,self.W),-1)+self.b)*self.u) 31 | if mask is not None: mask = K.cast(mask,K.floatx()); ait = mask*ait 32 | ait /= K.cast(K.sum(ait,axis=1,keepdims=True)+K.epsilon(),K.floatx()) 33 | ait = K.expand_dims(ait) 34 | return K.sum(x*ait,axis=1) 35 | def compute_output_shape(self, input_shape): 36 | return (input_shape[0],input_shape[-1]) 37 | 38 | if __name__ == '__main__': 39 | pass 40 | -------------------------------------------------------------------------------- /text_classification/models/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from text_mlp import TextClassifierMLP 4 | from text_rnn import TextClassifierRNN 5 | from text_cnn import TextClassifierCNN 6 | from text_han import TextClassifierHAN 7 | from text_rcnn import TextClassifierRCNN 8 | from text_birnn import TextClassifierBiRNN 9 | from text_clstm1 import TextClassifierCLSTM1 10 | from text_clstm2 import TextClassifierCLSTM2 11 | from text_textcnn import TextClassifierTextCNN 12 | from text_fasttext import TextClassifierFastText 13 | -------------------------------------------------------------------------------- /text_classification/models/text_birnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from ..base import TextClassifierBase 4 | from keras.models import Sequential 5 | from keras.layers import * 6 | 7 | class TextClassifierBiRNN(TextClassifierBase): 8 | 9 | def __init__(self): 10 | super(TextClassifierBiRNN,self).__init__() 11 | 12 | def init_model(self): 13 | model = Sequential() 14 | model.add(Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen'])) 15 | 
class TextClassifierCLSTM1(TextClassifierBase):
    """Serial CNN -> GRU classifier fed with the padded id sequences ``x_sqs``."""

    def __init__(self):
        super(TextClassifierCLSTM1, self).__init__()

    def init_model(self):
        """Build and compile the embedding -> conv -> pool -> 2x GRU -> softmax stack."""
        vocab_size = len(self.vocab) + 1
        seq_len = self.para['maxlen']
        # Layers are created in the order they are stacked.
        stack = [
            Embedding(vocab_size, 256, input_length=seq_len),
            Convolution1D(256, 3, padding='same', strides=1),
            Activation('relu'),
            MaxPool1D(pool_size=2),
            GRU(256, dropout=0.2, recurrent_dropout=0.1, return_sequences=True),
            GRU(256, dropout=0.2, recurrent_dropout=0.1),
            Dense(self.labels, activation='softmax'),
        ]
        net = Sequential()
        for layer in stack:
            net.add(layer)
        net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return net

    def fit(self, x, y, **kwargs):
        """Train on texts ``x`` / labels ``y`` using the ``x_sqs`` input."""
        self.__fit__(x, y, 'x_sqs', **kwargs)

    def predict(self, x, **kwargs):
        """Return predicted class indices for texts ``x``."""
        return self.__predict__(x, 'x_sqs', **kwargs)
class TextClassifierCLSTM2(TextClassifierBase):
    """Parallel CNN + bidirectional-GRU classifier over a shared embedding."""

    def __init__(self):
        super(TextClassifierCLSTM2, self).__init__()

    def init_model(self):
        """Build and compile the two-branch model; both branches read the same embedding."""
        seq_len = self.para['maxlen']
        tokens = Input(shape=(seq_len,), dtype='float64')
        embedded = Embedding(len(self.vocab) + 1, 256, input_length=seq_len)(tokens)

        # Convolutional branch: conv -> pool -> flatten -> dense.
        conv_branch = Convolution1D(256, 3, padding='same', strides=1, activation='relu')(embedded)
        conv_branch = MaxPool1D(pool_size=4)(conv_branch)
        conv_branch = Flatten()(conv_branch)
        conv_branch = Dense(256)(conv_branch)

        # Recurrent branch: bidirectional GRU -> dense.
        recur_branch = Bidirectional(GRU(256, dropout=0.2, recurrent_dropout=0.1))(embedded)
        recur_branch = Dense(256)(recur_branch)

        joined = concatenate([conv_branch, recur_branch], axis=-1)
        probs = Dense(self.labels, activation='softmax')(joined)

        net = Model(inputs=tokens, outputs=probs)
        net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return net

    def fit(self, x, y, **kwargs):
        """Train on texts ``x`` / labels ``y`` using the ``x_sqs`` input."""
        self.__fit__(x, y, 'x_sqs', **kwargs)

    def predict(self, x, **kwargs):
        """Return predicted class indices for texts ``x``."""
        return self.__predict__(x, 'x_sqs', **kwargs)
class TextClassifierFastText(TextClassifierBase):
    """FastText-style classifier: averaged embeddings of tokens plus n-grams.

    Each n-gram seen while fitting gets its own index above the word
    vocabulary. The matching indices are appended to every token sequence
    before padding to ``fasttext_maxlen``.
    """

    def __init__(self):
        super(TextClassifierFastText, self).__init__()

    def init_model(self):
        """Build and compile embedding -> global average pooling -> softmax."""
        model = Sequential()
        model.add(Embedding(self.para['fasttext_dim'], 256,
                            input_length=self.para['fasttext_maxlen']))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(self.labels, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def __add_ngrams__(self, seqs):
        """Yield each sequence extended with the indices of its known n-grams.

        For every order from 2 up to ``gram_n``, each n-gram of the original
        tokens that exists in ``token_to_idx`` contributes its index, in
        order of (n-gram size, start position).
        """
        gram_n = self.para['gram_n']
        token_to_idx = self.para['token_to_idx']
        for inputs in seqs:
            # list() rather than a slice: when every text has the same
            # length ``x_ids`` is a 2-D array and its rows have no append().
            tokens = list(inputs)
            extended = list(tokens)
            for size in range(2, gram_n + 1):
                # Slice by the current order. Slicing by ``gram_n`` here
                # re-added only the highest-order n-grams and skipped the
                # lower orders whenever gram_n > 2.
                for start in range(len(tokens) - size + 1):
                    ngram = tuple(tokens[start:start + size])
                    if ngram in token_to_idx:
                        extended.append(token_to_idx[ngram])
            yield extended

    def __proc_data__(self):
        """Store the padded, n-gram-augmented sequences as ``fasttext_x_sqs``."""
        augmented = list(self.__add_ngrams__(self.data['x_ids']))
        self.data['fasttext_x_sqs'] = pad_sequences(
            augmented, maxlen=self.para['fasttext_maxlen'])

    def proc_fit_data(self, **kwargs):
        """Build the n-gram index from the training data and prepare inputs.

        Recognised options:
            gram_n: highest n-gram order to use (minimum and default 2).
            maxlen: padded length of the augmented sequences (defaults to
                twice the base ``maxlen``).
        """
        # Accept both call conventions: options passed directly, or wrapped
        # under a single 'kwargs' key.
        wrapped = kwargs.get('kwargs')
        options = wrapped if isinstance(wrapped, dict) else kwargs

        self.para['gram_n'] = max(options.get('gram_n') or 2, 2)
        self.para['fasttext_maxlen'] = options.get('maxlen') or 2 * self.para['maxlen']

        # NOTE(review): n-grams are collected from the padded ``x_sqs`` but
        # later matched against the unpadded ``x_ids``. Confirm whether the
        # padding n-grams are intended to take up embedding rows.
        ngrams = set()
        for inputs in self.data['x_sqs']:
            for size in range(2, self.para['gram_n'] + 1):
                ngrams.update(zip(*[inputs[i:] for i in range(size)]))

        # N-gram indices start above the word indices.
        start_idx = len(self.vocab) + 2
        self.para['token_to_idx'] = {
            token: idx + start_idx for idx, token in enumerate(ngrams)}
        # One past the largest assigned index.
        self.para['fasttext_dim'] = start_idx + len(ngrams)
        self.__proc_data__()

    def proc_pred_data(self, **kwargs):
        """Augment prediction inputs with the n-gram index stored in ``self.para``."""
        self.__proc_data__()

    def fit(self, x, y, **kwargs):
        """Train on texts ``x`` / labels ``y`` using the ``fasttext_x_sqs`` input."""
        self.__fit__(x, y, 'fasttext_x_sqs', **kwargs)

    def predict(self, x, **kwargs):
        """Return predicted class indices for texts ``x``."""
        return self.__predict__(x, 'fasttext_x_sqs', **kwargs)

    def dump_model(self, dirn):
        """Persist the model; delegates to the base implementation."""
        super(TextClassifierFastText, self).dump_model(dirn)

    def load_model(self, dirn):
        """Restore the model; delegates to the base implementation."""
        super(TextClassifierFastText, self).load_model(dirn)
class TextClassifierMLP(TextClassifierBase):
    """Single-hidden-layer perceptron fed with the binary bag-of-words matrix ``x_mtx``."""

    def __init__(self):
        super(TextClassifierMLP, self).__init__()

    def init_model(self):
        """Build and compile dense(512, relu) -> dropout -> softmax."""
        n_features = len(self.vocab) + 1
        net = Sequential()
        for layer in (
                Dense(512, input_shape=(n_features,), activation='relu'),
                Dropout(0.5),
                Dense(self.labels, activation='softmax')):
            net.add(layer)
        net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return net

    def fit(self, x, y, **kwargs):
        """Train on texts ``x`` / labels ``y`` using the ``x_mtx`` input."""
        self.__fit__(x, y, 'x_mtx', **kwargs)

    def predict(self, x, **kwargs):
        """Return predicted class indices for texts ``x``."""
        return self.__predict__(x, 'x_mtx', **kwargs)
class TextClassifierRCNN(TextClassifierBase):
    """Recurrent-CNN classifier with three inputs: document, left context, right context."""

    # Names of the prepared inputs, in the order the model expects them.
    _INPUT_NAMES = ['x_sqs', 'x_lids', 'x_rids']

    def __init__(self):
        super(TextClassifierRCNN, self).__init__()

    def init_model(self):
        """Build and compile the three-input model sharing one embedding layer."""
        doc_in = Input(shape=(None,), dtype='int32')
        left_in = Input(shape=(None,), dtype='int32')
        right_in = Input(shape=(None,), dtype='int32')

        shared_embedding = Embedding(len(self.vocab) + 1, 256, input_length=self.para['maxlen'])
        doc_vec = shared_embedding(doc_in)
        left_vec = shared_embedding(left_in)
        right_vec = shared_embedding(right_in)

        # Forward LSTM over the left context, backward LSTM over the right.
        forward = LSTM(256, return_sequences=True)(left_vec)
        backward = LSTM(256, return_sequences=True, go_backwards=True)(right_vec)

        merged = concatenate([forward, doc_vec, backward], axis=2)
        projected = TimeDistributed(Dense(128, activation='tanh'))(merged)
        # Max over the time axis; ``K`` is resolved from the module namespace.
        pooled = Lambda(lambda x:K.max(x,axis=1), output_shape=(128,))(projected)
        probs = Dense(self.labels, activation='softmax')(pooled)

        net = Model(inputs=[doc_in, left_in, right_in], outputs=probs)
        net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return net

    def __proc_data__(self):
        """Derive the shifted left / right context sequences from ``x_ids``."""
        boundary = [len(self.vocab)]
        sequences = self.data['x_ids'].tolist()
        # Left context: boundary id prepended, last token dropped.
        left_ctx = [boundary + seq[:-1] for seq in sequences]
        # Right context: first token dropped, boundary id appended.
        right_ctx = [seq[1:] + boundary for seq in sequences]
        self.data['x_lids'] = pad_sequences(left_ctx, maxlen=self.para['maxlen'])
        self.data['x_rids'] = pad_sequences(right_ctx, maxlen=self.para['maxlen'])

    def proc_fit_data(self, **kwargs):
        """Prepare the context inputs for training."""
        self.__proc_data__()

    def proc_pred_data(self, **kwargs):
        """Prepare the context inputs for prediction."""
        self.__proc_data__()

    def fit(self, x, y, **kwargs):
        """Train on texts ``x`` / labels ``y`` using all three inputs."""
        self.__fit__(x, y, inputs=list(self._INPUT_NAMES), **kwargs)

    def predict(self, x, **kwargs):
        """Return predicted class indices for texts ``x``."""
        return self.__predict__(x, list(self._INPUT_NAMES), **kwargs)
Convolution1D(256,3,padding='same',strides=1,activation='relu')(embed) 17 | cnn1 = MaxPool1D(pool_size=4)(cnn1) 18 | cnn2 = Convolution1D(256,4,padding='same',strides=1,activation='relu')(embed) 19 | cnn2 = MaxPool1D(pool_size=4)(cnn2) 20 | cnn3 = Convolution1D(256,5,padding='same',strides=1,activation='relu')(embed) 21 | cnn3 = MaxPool1D(pool_size=4)(cnn3) 22 | cnn = concatenate([cnn1,cnn2,cnn3],axis=-1) 23 | flat = Flatten()(cnn) 24 | drop = Dropout(0.2)(flat) 25 | main_output = Dense(self.labels,activation='softmax')(drop) 26 | model = Model(inputs=main_input,outputs = main_output) 27 | model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy']) 28 | return model 29 | 30 | def fit(self, x, y, **kwargs): 31 | self.__fit__(x,y,'x_sqs',**kwargs) 32 | 33 | def predict(self, x, **kwargs): 34 | return self.__predict__(x,'x_sqs',**kwargs) 35 | 36 | if __name__ == '__main__': 37 | pass 38 | --------------------------------------------------------------------------------