├── .gitignore
├── .travis.yml
├── MANIFEST.in
├── README.md
├── examples
    ├── test.py
    └── utils.py
├── requirements.txt
├── setup.py
├── tests
    ├── __init__.py
    └── simple_test.py
└── text_classification
    ├── __init__.py
    ├── base.py
    ├── layers
        ├── __init__.py
        └── attention.py
    └── models
        ├── __init__.py
        ├── text_birnn.py
        ├── text_clstm1.py
        ├── text_clstm2.py
        ├── text_cnn.py
        ├── text_fasttext.py
        ├── text_han.py
        ├── text_mlp.py
        ├── text_rcnn.py
        ├── text_rnn.py
        └── text_textcnn.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .DS_Store
3 | bak/*
4 | clean.sh
5 | examples/data/*
6 | examples/model/*
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | 
3 | script:
4 |   - coverage run setup.py test
5 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include text_classification *


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Text Classification
 2 | 
 3 | [![Build Status](https://travis-ci.org/qiangsiwei/text_classification.svg?branch=master)](https://travis-ci.org/qiangsiwei/text_classification)
 4 | 
 5 | keras implementation of text classification algorithms
 6 | 
 7 | Models:
 8 | =====
 9 | 
10 | 1. MLP
11 | 
12 | 2. CNN
13 | 
14 | 3. RNN
15 | 
16 | 4. BiRNN
17 | 
18 | 5. RCNN
19 | 
20 | 6. HAN
21 | 
22 | 7. CLSTM (series)
23 | 
24 | 8. CLSTM (parallel)
25 | 
26 | 9. TextCNN
27 | 
28 | 10. FastText
29 | 
30 | Install:
31 | =====
32 | 
33 | ```python
34 | python setup.py install
35 | ```
36 | 
37 | Usage:
38 | =====
39 | 
40 | ```python
41 | from text_classification import *
42 | 
43 | clf = TextClassifierFastText()
44 | clf.fit(x,y,epochs=epochs,validation_split=validation_split)
45 | clf.predict(x)
46 | ```
47 | 
48 | Limitations:
49 | ----
50 | 
1. Sentence length is limited by 'maxlen'; words beyond that limit are truncated.
52 | 
2. Out-of-vocabulary words encountered at prediction time are ignored by the Keras Tokenizer.
54 | 


--------------------------------------------------------------------------------
/examples/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import os, sys, time, warnings
 4 | warnings.filterwarnings('ignore')
 5 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 6 | sys.path.append('..')
 7 | from utils import get_data
 8 | from functools import wraps
 9 | from text_classification import *
10 | from text_classification.base import TextClassifierBase
11 | 
def timethis(func):
	"""Decorate *func* so every call prints a label and its wall-clock duration.

	The label is the ``__name__`` of the first positional argument (the
	classifier class in this script). When there is no positional argument,
	or it has no ``__name__``, the name of the decorated function is printed
	instead. The wrapped function's return value is passed through unchanged.
	"""
	@wraps(func)
	def wrapper(*args, **kwargs):
		# Fall back to the function's own name so keyword-only calls work.
		label = getattr(args[0],'__name__',func.__name__) if args else func.__name__
		# Single-argument print(...) produces the same output on Python 2 and 3.
		print(label)
		start = time.time()
		value = func(*args, **kwargs)
		print('running time: {0}'.format(time.time()-start))
		return value
	return wrapper
21 | 
@timethis
def test_model_train(Classifier, x, y, epochs=10, validation_split=0):
	"""Instantiate *Classifier*, fit it on ``(x, y)`` and return the fitted object.

	*Classifier* must be a subclass of ``TextClassifierBase``; *epochs* and
	*validation_split* are handed to its ``fit`` method.
	"""
	assert issubclass(Classifier,TextClassifierBase)
	fit_options = dict(epochs=epochs,validation_split=validation_split)
	model = Classifier()
	model.fit(x,y,**fit_options)
	return model
28 | 
@timethis
def test_model_dump(Classifier, x, y, epochs=10, dirn='model'):
	"""Fit a fresh *Classifier* on ``(x, y)`` and save it into directory *dirn*."""
	model = Classifier()
	model.fit(x,y,epochs=epochs)
	# Persist the fitted model so it can be restored later from *dirn*.
	model.dump_model(dirn)
34 | 
@timethis
def test_model_load(Classifier, x, y, epochs=10, dirn='model'):
	"""Restore a classifier saved under *dirn* and print its predictions for *x*.

	*y* and *epochs* are not used; they are accepted so the signature matches
	the other helpers in this script.
	"""
	clf = Classifier()
	clf.load_model(dirn)
	# Parenthesised single-argument form is valid as a Python 2 statement
	# and as a Python 3 function call, with identical output.
	print(clf.predict(x))
40 | 
if __name__ == '__main__':
	# Load the corpus once: x receives the texts, y the labels from get_data().
	x,y = zip(*get_data())

	# Pick the model to exercise by leaving exactly one assignment active.
	# Classifier = TextClassifierMLP
	# Classifier = TextClassifierRNN
	# Classifier = TextClassifierCNN
	# Classifier = TextClassifierHAN
	# Classifier = TextClassifierRCNN
	# Classifier = TextClassifierBiRNN
	# Classifier = TextClassifierCLSTM1
	# Classifier = TextClassifierCLSTM2
	# Classifier = TextClassifierTextCNN
	Classifier = TextClassifierFastText

	# Only consumed by the disabled training-only run below.
	validation_split = 0.3
	# test_model_train(Classifier,x,y,1,validation_split)

	# Train and save first, then reload from disk and predict.
	test_model_dump(Classifier,x,y)
	test_model_load(Classifier,x,y)
58 | 


--------------------------------------------------------------------------------
/examples/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import fileinput
 4 | 
 5 | def get_data():
 6 | 	for i in range(3):
 7 | 		for l in fileinput.input('data/{}.txt'.format(i)):
 8 | 			yield l.decode('utf-8'), i
 9 | 
10 | if __name__ == '__main__':
11 | 	pass
12 | 	


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Keras==2.0.8
2 | tensorflow==1.4.1
3 | coverage==4.5
4 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from setuptools import setup
 3 | 
 4 | base_dir = os.path.dirname(os.path.abspath(__file__))
 5 | 
 6 | setup(name = 'text_classification',
 7 |     version = '0.1',
 8 |     description = 'keras implementation of text classification algorithms',
 9 |     author = 'Qiang Siwei',
10 |     author_email = 'qiangsiwei@outlook.com',
11 |     url = '',
12 |     packages = ['text_classification',\
13 |                 'text_classification.layers',\
14 |                 'text_classification.models'],
15 |     long_description = open(os.path.join(base_dir,'README.md')).read(),
16 |     test_suite = 'tests.get_tests',
17 | ) 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import unittest
4 | 
def get_tests():
    """Build the suite that setup.py names in its ``test_suite`` option.

    Returns:
        A ``unittest.TestSuite`` wrapping every test of ``SimpleTestCase``.
    """
    # Explicit relative import: the implicit form (``from simple_test ...``)
    # only resolves on Python 2, while this works on Python 2 and 3.
    from .simple_test import SimpleTestCase
    suite = unittest.TestLoader().loadTestsFromTestCase(SimpleTestCase)
    return unittest.TestSuite([suite])
9 | 


--------------------------------------------------------------------------------
/tests/simple_test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import unittest
 4 | 
 5 | class SimpleTestCase(unittest.TestCase):
 6 | 
 7 | 	def test_import(self):
 8 | 		import text_classification
 9 | 		return True
10 | 
if __name__ == "__main__":
	# Run the listed test methods directly, with verbose per-test output.
	names = ['test_import']
	suite = unittest.TestSuite()
	suite.addTests(SimpleTestCase(name) for name in names)
	unittest.TextTestRunner(verbosity=2).run(suite)
17 | 


--------------------------------------------------------------------------------
/text_classification/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from models import TextClassifierMLP
 4 | from models import TextClassifierRNN
 5 | from models import TextClassifierCNN
 6 | from models import TextClassifierHAN
 7 | from models import TextClassifierRCNN
 8 | from models import TextClassifierBiRNN
 9 | from models import TextClassifierCLSTM1
10 | from models import TextClassifierCLSTM2
11 | from models import TextClassifierTextCNN
12 | from models import TextClassifierFastText
13 | 


--------------------------------------------------------------------------------
/text_classification/base.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import os, pickle, numpy as np
  4 | from abc import abstractmethod
  5 | from keras.models import model_from_json
  6 | from keras.utils.np_utils import to_categorical
  7 | from keras.preprocessing.text import Tokenizer
  8 | from keras.preprocessing.sequence import pad_sequences
  9 | from layers import AttLayer
 10 | 
 11 | class DataCache(object):
 12 | 	def __init__(self):
 13 | 		self.mode = ''
 14 | 		self.data = {}
 15 | 
 16 | 	def __format_name__(self,name):
 17 | 		return '{0}_{1}'.format(self.mode,name)
 18 | 
 19 | 	def __setitem__(self,name,value):
 20 | 		name = self.__format_name__(name)
 21 | 		self.data[name] = value
 22 | 
 23 | 	def __getitem__(self,name):
 24 | 		name = self.__format_name__(name)
 25 | 		return self.data.get(name)
 26 | 
 27 | 	def __contains__(self,name):
 28 | 		name = self.__format_name__(name)
 29 | 		return name in self.data
 30 | 
 31 | 	def set_mode(self,mode):
 32 | 		self.mode = mode
 33 | 
 34 | class TextClassifierBase(object):
 35 | 	def __init__(self,tokenizer=None,maxlen=50):
 36 | 		self.data = DataCache()
 37 | 		self.para = DataCache()
 38 | 		self.model = None
 39 | 		self.para['maxlen'] = maxlen
 40 | 		self.para['tokenizer'] = tokenizer
 41 | 		if not self.para['tokenizer']:
 42 | 			self.para['tokenizer'] = Tokenizer(\
 43 | 				filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=' ')
 44 | 		self.custom_layers = {'AttLayer':AttLayer}
 45 | 
 46 | 	def __proc__x(self, x):
 47 | 		self.data['x_ids'] = \
 48 | 			np.array(self.para['tokenizer'].texts_to_sequences(x))
 49 | 		self.data['x_sqs'] = \
 50 | 			pad_sequences(self.data['x_ids'],maxlen=self.para['maxlen'])
 51 | 		self.data['x_mtx'] = \
 52 | 			self.para['tokenizer'].sequences_to_matrix(self.data['x_ids'],mode='binary')
 53 | 
 54 | 	def init_fit_data(self, x, y, test_size=0):
 55 | 		self.para['tokenizer'].fit_on_texts(x)
 56 | 		self.vocab = self.para['tokenizer'].word_index
 57 | 		self.__proc__x(x)
 58 | 		self.data['y'] = to_categorical(y)
 59 | 		self.labels = self.data['y'].shape[-1]
 60 | 
 61 | 	def proc_fit_data(self, **kwargs):
 62 | 		pass
 63 | 
 64 | 	def init_pred_data(self, x):
 65 | 		self.vocab = self.para['tokenizer'].word_index
 66 | 		self.__proc__x(x)
 67 | 
 68 | 	def proc_pred_data(self, **kwargs):
 69 | 		pass
 70 | 
 71 | 	@abstractmethod
 72 | 	def init_model(self):
 73 | 		pass
 74 | 
 75 | 	def __get_x__(self,name):
 76 | 		assert isinstance(name,str)
 77 | 		assert name in self.data
 78 | 		return self.data[name]
 79 | 
 80 | 	def __fit__(self, x, y, inputs, **kwargs):
 81 | 		self.mode = 'fit'
 82 | 		self.init_fit_data(x,y)
 83 | 		self.proc_fit_data(kwargs=kwargs)
 84 | 		self.model = self.init_model()
 85 | 		x = self.__get_x__(inputs) if isinstance(inputs,str) else\
 86 | 			map(self.__get_x__,inputs)
 87 | 		self.model.fit(
 88 | 			x = x,
 89 | 			y = self.data['y'],
 90 | 			epochs=kwargs.get('epochs',10),
 91 | 			validation_split=kwargs.get('validation_split',0))
 92 | 
 93 | 	@abstractmethod
 94 | 	def fit(self, x, y, **kwargs):
 95 | 		pass
 96 | 
 97 | 	def __predict__(self, x, inputs, **kwargs):
 98 | 		self.mode = 'predict'
 99 | 		assert self.model != None
100 | 		self.init_pred_data(x)
101 | 		self.proc_pred_data(kwargs=kwargs)
102 | 		x = self.__get_x__(inputs) if isinstance(inputs,str) else\
103 | 			map(self.__get_x__,inputs)
104 | 		y = self.model.predict(x)
105 | 		return y.argmax(axis=1)
106 | 
107 | 	@abstractmethod
108 | 	def predict(self, x, **kwargs):
109 | 		pass
110 | 
111 | 	def __get_filenames__(self, dirn):
112 | 		return os.path.join(dirn,'tokenizer.pkl'),\
113 | 			   os.path.join(dirn,'model.json'),\
114 | 			   os.path.join(dirn,'param.h5')
115 | 
116 | 	def dump_model(self, dirn):
117 | 		if not os.path.isdir(dirn):
118 | 			os.mkdir(dirn)
119 | 		assert self.model != None # more strict
120 | 		ft, fm, fp = self.__get_filenames__(dirn)
121 | 		with open(ft,'wb') as out:
122 | 			pickle.dump(self.para,out)
123 | 		with open(fm,'w') as out: 
124 | 			out.write(self.model.to_json())
125 | 		self.model.save_weights(fp,overwrite=True)
126 | 
127 | 	def load_model(self, dirn):
128 | 		assert os.path.isdir(dirn)
129 | 		ft, fm, fp = self.__get_filenames__(dirn)
130 | 		assert all(os.path.isfile(fn) for fn in (ft,fm,fp))
131 | 		self.para = pickle.load(open(ft,'rb'))
132 | 		self.model = model_from_json(open(fm).read(),self.custom_layers)
133 | 		self.model.load_weights(fp)
134 | 
135 | if __name__ == '__main__':
136 | 	pass
137 | 


--------------------------------------------------------------------------------
/text_classification/layers/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from attention import AttLayer
4 | 


--------------------------------------------------------------------------------
/text_classification/layers/attention.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from keras import backend as K
 4 | from keras.engine.topology import Layer
 5 | from keras import initializers, constraints, regularizers
 6 | 
 7 | class AttLayer(Layer):
 8 | 	def __init__(self,init='glorot_uniform',kernel_regularizer=None,bias_regularizer=None,\
 9 | 		kernel_constraint=None,bias_constraint=None,**kwargs):
10 | 		self.supports_masking = True
11 | 		self.init = initializers.get(init)
12 | 		self.kernel_initializer = initializers.get(init)
13 | 		self.kernel_regularizer = regularizers.get(kernel_regularizer)
14 | 		self.kernel_constraint = constraints.get(kernel_constraint)
15 | 		self.bias_regularizer = regularizers.get(bias_regularizer)
16 | 		self.bias_constraint = constraints.get(bias_constraint)
17 | 		super(AttLayer, self).__init__(** kwargs)
18 | 	def build(self, input_shape):
19 | 		assert len(input_shape)==3
20 | 		self.W = self.add_weight((input_shape[-1],1),initializer=self.kernel_initializer,\
21 | 			name='{}_W'.format(self.name),regularizer=self.kernel_regularizer,constraint=self.kernel_constraint)
22 | 		self.b = self.add_weight((input_shape[1],),initializer='zero',\
23 | 			name='{}_b'.format(self.name),regularizer=self.bias_regularizer,constraint=self.bias_constraint)
24 | 		self.u = self.add_weight((input_shape[1],),initializer=self.kernel_initializer,\
25 | 			name='{}_u'.format(self.name),regularizer=self.kernel_regularizer,constraint=self.kernel_constraint)
26 | 		self.built = True
27 | 	def compute_mask(self, input, input_mask=None):
28 | 		return None
29 | 	def call(self, x, mask=None):
30 | 		ait = K.exp(K.tanh(K.squeeze(K.dot(x,self.W),-1)+self.b)*self.u)
31 | 		if mask is not None: mask = K.cast(mask,K.floatx()); ait = mask*ait
32 | 		ait /= K.cast(K.sum(ait,axis=1,keepdims=True)+K.epsilon(),K.floatx())
33 | 		ait = K.expand_dims(ait)
34 | 		return K.sum(x*ait,axis=1)
35 | 	def compute_output_shape(self, input_shape):
36 | 		return (input_shape[0],input_shape[-1])
37 | 
38 | if __name__ == '__main__':
39 | 	pass
40 | 


--------------------------------------------------------------------------------
/text_classification/models/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from text_mlp import TextClassifierMLP
 4 | from text_rnn import TextClassifierRNN
 5 | from text_cnn import TextClassifierCNN
 6 | from text_han import TextClassifierHAN
 7 | from text_rcnn import TextClassifierRCNN
 8 | from text_birnn import TextClassifierBiRNN
 9 | from text_clstm1 import TextClassifierCLSTM1
10 | from text_clstm2 import TextClassifierCLSTM2
11 | from text_textcnn import TextClassifierTextCNN
12 | from text_fasttext import TextClassifierFastText
13 | 


--------------------------------------------------------------------------------
/text_classification/models/text_birnn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Sequential
 5 | from keras.layers import *
 6 | 
 7 | class TextClassifierBiRNN(TextClassifierBase):
 8 | 
 9 | 	def __init__(self):
10 | 		super(TextClassifierBiRNN,self).__init__()
11 | 
12 | 	def init_model(self):
13 | 		model = Sequential()
14 | 		model.add(Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen']))
15 | 		model.add(Bidirectional(LSTM(256,dropout=0.2,recurrent_dropout=0.1,return_sequences=True)))
16 | 		model.add(Bidirectional(LSTM(256,dropout=0.2,recurrent_dropout=0.1)))
17 | 		model.add(Dense(self.labels,activation='softmax'))
18 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
19 | 		return model
20 | 
21 | 	def fit(self, x, y, **kwargs):
22 | 		self.__fit__(x,y,'x_sqs',**kwargs)
23 | 
24 | 	def predict(self, x, **kwargs):
25 | 		return self.__predict__(x,'x_sqs',**kwargs)
26 | 
27 | if __name__ == '__main__':
28 | 	pass
29 | 


--------------------------------------------------------------------------------
/text_classification/models/text_clstm1.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Sequential
 5 | from keras.layers import *
 6 | 
 7 | class TextClassifierCLSTM1(TextClassifierBase):
 8 | 
 9 | 	def __init__(self):
10 | 		super(TextClassifierCLSTM1,self).__init__()
11 | 
12 | 	def init_model(self):
13 | 		model = Sequential()
14 | 		model.add(Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen']))
15 | 		model.add(Convolution1D(256,3,padding='same',strides=1))
16 | 		model.add(Activation('relu'))
17 | 		model.add(MaxPool1D(pool_size=2))
18 | 		model.add(GRU(256,dropout=0.2,recurrent_dropout=0.1,return_sequences=True))
19 | 		model.add(GRU(256,dropout=0.2,recurrent_dropout=0.1))
20 | 		model.add(Dense(self.labels,activation='softmax'))
21 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
22 | 		return model
23 | 
24 | 	def fit(self, x, y, **kwargs):
25 | 		self.__fit__(x,y,'x_sqs',**kwargs)
26 | 
27 | 	def predict(self, x, **kwargs):
28 | 		return self.__predict__(x,'x_sqs',**kwargs)
29 | 
30 | if __name__ == '__main__':
31 | 	pass
32 | 


--------------------------------------------------------------------------------
/text_classification/models/text_clstm2.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Model
 5 | from keras.layers.merge import concatenate
 6 | from keras.layers import *
 7 | 
 8 | class TextClassifierCLSTM2(TextClassifierBase):
 9 | 
10 | 	def __init__(self):
11 | 		super(TextClassifierCLSTM2,self).__init__()
12 | 
13 | 	def init_model(self):
14 | 		main_input = Input(shape=(self.para['maxlen'],),dtype='float64')
15 | 		embed = Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen'])(main_input)
16 | 		cnn = Convolution1D(256,3,padding='same',strides=1,activation='relu')(embed)
17 | 		cnn = MaxPool1D(pool_size=4)(cnn)
18 | 		cnn = Flatten()(cnn)
19 | 		cnn = Dense(256)(cnn)
20 | 		rnn = Bidirectional(GRU(256,dropout=0.2,recurrent_dropout=0.1))(embed)
21 | 		rnn = Dense(256)(rnn)
22 | 		con = concatenate([cnn,rnn],axis=-1)
23 | 		main_output = Dense(self.labels,activation='softmax')(con)
24 | 		model = Model(inputs=main_input,outputs=main_output)
25 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
26 | 		return model
27 | 
28 | 	def fit(self, x, y, **kwargs):
29 | 		self.__fit__(x,y,'x_sqs',**kwargs)
30 | 
31 | 	def predict(self, x, **kwargs):
32 | 		return self.__predict__(x,'x_sqs',**kwargs)
33 | 
34 | if __name__ == '__main__':
35 | 	pass
36 | 


--------------------------------------------------------------------------------
/text_classification/models/text_cnn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Sequential
 5 | from keras.layers import *
 6 | 
 7 | class TextClassifierCNN(TextClassifierBase):
 8 | 
 9 | 	def __init__(self):
10 | 		super(TextClassifierCNN,self).__init__()
11 | 
12 | 	def init_model(self):
13 | 		model = Sequential()
14 | 		model.add(Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen']))
15 | 		model.add(Convolution1D(256,3,padding='same'))
16 | 		model.add(MaxPool1D(3,3,padding='same'))
17 | 		model.add(Convolution1D(128,3,padding='same'))
18 | 		model.add(MaxPool1D(3,3,padding='same'))
19 | 		model.add(Convolution1D(64,3,padding='same'))
20 | 		model.add(Flatten())
21 | 		model.add(Dropout(0.1))
22 | 		model.add(BatchNormalization())
23 | 		model.add(Dense(256,activation='relu'))
24 | 		model.add(Dropout(0.1))
25 | 		model.add(Dense(self.labels,activation='softmax'))
26 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
27 | 		return model
28 | 
29 | 	def fit(self, x, y, **kwargs):
30 | 		self.__fit__(x,y,'x_sqs',**kwargs)
31 | 
32 | 	def predict(self, x, **kwargs):
33 | 		return self.__predict__(x,'x_sqs',**kwargs)
34 | 
35 | if __name__ == '__main__':
36 | 	pass
37 | 


--------------------------------------------------------------------------------
/text_classification/models/text_fasttext.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Sequential
 5 | from keras.layers import *
 6 | from keras.preprocessing.sequence import pad_sequences
 7 | import numpy as np
 8 | 
 9 | class TextClassifierFastText(TextClassifierBase):
10 | 
11 | 	def __init__(self):
12 | 		super(TextClassifierFastText,self).__init__()
13 | 
14 | 	def init_model(self):
15 | 		model = Sequential()
16 | 		model.add(Embedding(self.para['fasttext_dim'],256,input_length=self.para['fasttext_maxlen']))
17 | 		model.add(GlobalAveragePooling1D())
18 | 		model.add(Dense(self.labels,activation='softmax'))
19 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
20 | 		return model
21 | 
22 | 	def __add_ngrams__(self, seqs):
23 | 		gram_n = self.para['gram_n']
24 | 		token_to_idx = self.para['token_to_idx']
25 | 		for inputs in seqs:
26 | 			nlist = inputs[:]
27 | 			for i in range(len(nlist)-gram_n+1):
28 | 				for ngram_value in range(2,gram_n+1):
29 | 					ngram = tuple(nlist[i:i+gram_n])
30 | 					if ngram in token_to_idx:
31 | 						nlist.append(token_to_idx[ngram])
32 | 			yield nlist
33 | 
34 | 	def __proc_data__(self):
35 | 		self.data['fasttext_x_sqs'] = pad_sequences(\
36 | 			list(self.__add_ngrams__(self.data['x_ids'])),\
37 | 			maxlen=self.para['fasttext_maxlen'])
38 | 
39 | 	def proc_fit_data(self, **kwargs):
40 | 		self.para['gram_n'] = max(kwargs.get('gram_n') or 2,2)
41 | 		self.para['fasttext_maxlen'] = kwargs.get('maxlen') or 2*self.para['maxlen']
42 | 		def create_ngrams(inputs, gram_n):
43 | 			return set(zip(*[inputs[i:] for i in range(gram_n)]))
44 | 		ngrams = set()
45 | 		for inputs in self.data['x_sqs']:
46 | 			for i in range(2,self.para['gram_n']+1):
47 | 				ngrams.update(create_ngrams(inputs,gram_n=i))
48 | 		start_idx = len(self.vocab)+2
49 | 		token_to_idx = {token:idx+start_idx for idx,token in enumerate(ngrams)}
50 | 		idx_to_token = {idx:token for token,idx in token_to_idx.iteritems()}
51 | 		self.para['token_to_idx'] = token_to_idx
52 | 		self.para['fasttext_dim'] = np.max(list(idx_to_token.keys()))+1
53 | 		self.__proc_data__()
54 | 
55 | 	def proc_pred_data(self, **kwargs):
56 | 		self.__proc_data__()
57 | 
58 | 	def fit(self, x, y, **kwargs):
59 | 		self.__fit__(x,y,'fasttext_x_sqs',**kwargs)
60 | 
61 | 	def predict(self, x, **kwargs):
62 | 		return self.__predict__(x,'fasttext_x_sqs',**kwargs)
63 | 
64 | 	def dump_model(self, dirn):
65 | 		super(TextClassifierFastText,self).dump_model(dirn)
66 | 
67 | 	def load_model(self, dirn):
68 | 		super(TextClassifierFastText,self).load_model(dirn)
69 | 
70 | if __name__ == '__main__':
71 | 	pass
72 | 


--------------------------------------------------------------------------------
/text_classification/models/text_han.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from ..layers import AttLayer
 5 | from keras.models import Model
 6 | from keras.layers import *
 7 | 
 8 | class TextClassifierHAN(TextClassifierBase):
 9 | 
10 | 	def __init__(self):
11 | 		super(TextClassifierHAN,self).__init__()
12 | 
13 | 	def init_model(self):
14 | 		inputs = Input(shape=(self.para['maxlen'],),dtype='float64')
15 | 		embed = Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen'])(inputs)
16 | 		gru = Bidirectional(GRU(128,dropout=0.2,recurrent_dropout=0.1,return_sequences=True))(embed)
17 | 		attention = AttLayer()(gru)
18 | 		output = Dense(self.labels,activation='softmax')(attention)
19 | 		model = Model(inputs,output)
20 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
21 | 		return model
22 | 
23 | 	def fit(self, x, y, **kwargs):
24 | 		self.__fit__(x,y,'x_sqs',**kwargs)
25 | 
26 | 	def predict(self, x, **kwargs):
27 | 		return self.__predict__(x,'x_sqs',**kwargs)
28 | 
29 | if __name__ == '__main__':
30 | 	pass
31 | 


--------------------------------------------------------------------------------
/text_classification/models/text_mlp.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Sequential
 5 | from keras.layers import *
 6 | 
 7 | class TextClassifierMLP(TextClassifierBase):
 8 | 
 9 | 	def __init__(self):
10 | 		super(TextClassifierMLP,self).__init__()
11 | 
12 | 	def init_model(self):
13 | 		model = Sequential()
14 | 		model.add(Dense(512,input_shape=(len(self.vocab)+1,),activation='relu'))
15 | 		model.add(Dropout(0.5))
16 | 		model.add(Dense(self.labels,activation='softmax'))
17 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
18 | 		return model
19 | 
20 | 	def fit(self, x, y, **kwargs):
21 | 		self.__fit__(x,y,'x_mtx',**kwargs)
22 | 
23 | 	def predict(self, x, **kwargs):
24 | 		return self.__predict__(x,'x_mtx',**kwargs)
25 | 
26 | if __name__ == '__main__':
27 | 	pass
28 | 


--------------------------------------------------------------------------------
/text_classification/models/text_rcnn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Model
 5 | from keras.layers.merge import concatenate
 6 | from keras.layers import *
 7 | from keras.preprocessing.sequence import pad_sequences
 8 | 
 9 | class TextClassifierRCNN(TextClassifierBase):
10 | 
11 | 	def __init__(self):
12 | 		super(TextClassifierRCNN,self).__init__()
13 | 
14 | 	def init_model(self):
15 | 		doc = Input(shape=(None,),dtype='int32')
16 | 		lc = Input(shape=(None,),dtype='int32')
17 | 		rc = Input(shape=(None,),dtype='int32')
18 | 		embedder = Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen'])
19 | 		demb, lemb, remb = embedder(doc), embedder(lc), embedder(rc)
20 | 		fward = LSTM(256,return_sequences=True)(lemb)
21 | 		bward = LSTM(256,return_sequences=True,go_backwards=True)(remb)
22 | 		together = concatenate([fward,demb,bward],axis=2)
23 | 		semantic = TimeDistributed(Dense(128,activation='tanh'))(together)
24 | 		pool_rnn = Lambda(lambda x:K.max(x,axis=1),output_shape=(128,))(semantic)
25 | 		output = Dense(self.labels,activation='softmax')(pool_rnn)
26 | 		model = Model(inputs=[doc,lc,rc],outputs=output)
27 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
28 | 		return model
29 | 
30 | 	def __proc_data__(self):
31 | 		x_lids = [[len(self.vocab)]+x[:-1]for x in self.data['x_ids'].tolist()]
32 | 		x_rids = [x[1:]+[len(self.vocab)] for x in self.data['x_ids'].tolist()]
33 | 		self.data['x_lids'] = pad_sequences(x_lids,maxlen=self.para['maxlen'])
34 | 		self.data['x_rids'] = pad_sequences(x_rids,maxlen=self.para['maxlen'])
35 | 
36 | 	def proc_fit_data(self, **kwargs):
37 | 		self.__proc_data__()
38 | 
39 | 	def proc_pred_data(self, **kwargs):
40 | 		self.__proc_data__()
41 | 
42 | 	def fit(self, x, y, **kwargs):
43 | 		self.__fit__(x,y,inputs=['x_sqs','x_lids','x_rids'],**kwargs)
44 | 
45 | 	def predict(self, x, **kwargs):
46 | 		return self.__predict__(x,['x_sqs','x_lids','x_rids'],**kwargs)
47 | 
48 | if __name__ == '__main__':
49 | 	pass
50 | 


--------------------------------------------------------------------------------
/text_classification/models/text_rnn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Sequential
 5 | from keras.layers import *
 6 | 
 7 | class TextClassifierRNN(TextClassifierBase):
 8 | 
 9 | 	def __init__(self):
10 | 		super(TextClassifierRNN,self).__init__()
11 | 
12 | 	def init_model(self):
13 | 		model = Sequential()
14 | 		model.add(Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen']))
15 | 		model.add(LSTM(256,dropout=0.2,recurrent_dropout=0.1,return_sequences=True))
16 | 		model.add(LSTM(256,dropout=0.2,recurrent_dropout=0.1))
17 | 		model.add(Dense(self.labels,activation='softmax'))
18 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
19 | 		return model
20 | 
21 | 	def fit(self, x, y, **kwargs):
22 | 		self.__fit__(x,y,'x_sqs',**kwargs)
23 | 
24 | 	def predict(self, x, **kwargs):
25 | 		return self.__predict__(x,'x_sqs',**kwargs)
26 | 
27 | if __name__ == '__main__':
28 | 	pass
29 | 


--------------------------------------------------------------------------------
/text_classification/models/text_textcnn.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from ..base import TextClassifierBase
 4 | from keras.models import Model
 5 | from keras.layers.merge import concatenate
 6 | from keras.layers import *
 7 | 
 8 | class TextClassifierTextCNN(TextClassifierBase):
 9 | 
10 | 	def __init__(self):
11 | 		super(TextClassifierTextCNN,self).__init__()
12 | 
13 | 	def init_model(self):
14 | 		main_input = Input(shape=(self.para['maxlen'],),dtype='float64')
15 | 		embed = Embedding(len(self.vocab)+1,256,input_length=self.para['maxlen'])(main_input)
16 | 		cnn1 = Convolution1D(256,3,padding='same',strides=1,activation='relu')(embed)
17 | 		cnn1 = MaxPool1D(pool_size=4)(cnn1)
18 | 		cnn2 = Convolution1D(256,4,padding='same',strides=1,activation='relu')(embed)
19 | 		cnn2 = MaxPool1D(pool_size=4)(cnn2)
20 | 		cnn3 = Convolution1D(256,5,padding='same',strides=1,activation='relu')(embed)
21 | 		cnn3 = MaxPool1D(pool_size=4)(cnn3)
22 | 		cnn = concatenate([cnn1,cnn2,cnn3],axis=-1)
23 | 		flat = Flatten()(cnn)
24 | 		drop = Dropout(0.2)(flat)
25 | 		main_output = Dense(self.labels,activation='softmax')(drop)
26 | 		model = Model(inputs=main_input,outputs = main_output)
27 | 		model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
28 | 		return model
29 | 
30 | 	def fit(self, x, y, **kwargs):
31 | 		self.__fit__(x,y,'x_sqs',**kwargs)
32 | 
33 | 	def predict(self, x, **kwargs):
34 | 		return self.__predict__(x,'x_sqs',**kwargs)
35 | 
36 | if __name__ == '__main__':
37 | 	pass
38 | 


--------------------------------------------------------------------------------