├── .gitignore ├── LICENSE ├── config.py ├── load_data.py ├── model.py ├── models ├── attention.py ├── char_cnn.py ├── cnn.py ├── fast.py ├── text_att_bi_gru.py ├── text_att_bi_lstm.py ├── text_bi_gru.py ├── text_bi_lstm.py ├── text_gru.py └── text_lstm.py ├── predict.py ├── readme.md ├── requirements.txt ├── run.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | *.png 141 | logs 142 | make_train_data.py 143 | .vscode 144 | data/ 145 | saves 146 | .idea 147 | 148 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 gezimonkey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
-------------------------------------------------------------------------------- /config.py: --------------------------------------------------------------------------------
1 | MODEL_PATH='saves/model.h5'
2 | TOK_PATH='saves/tokenizer.pickle'
3 | LOG_PATH='./logs/fit/'
4 | EMBEDDING_PATH = '../../dataset/glove/glove.6B.300d.txt'
5 | TRAIN_PATH='data/train.csv'
6 | PREDICT_PATH='data/train.csv'
7 | PREDICT_LEVEL=0.5
8 | # TRAIN_PATH='data/pre_data.csv'
9 | # PREDICT_PATH='data/pre_data.csv'
10 | 
11 | 
12 | 
13 | # CLASSES = ['政治、法律', '社会科学总论', '文化、教育、体育', '语言、文字', '医药、卫生', '计算机科学',
14 | # '环境科学、安全科学', '历史、地理', '数理科学和化学', '工业技术', '综合、工具书', '文学', '经济、商业',
15 | # '航空、航天', '建筑科学', '生物科学', '哲学、宗教', '天文学、地球科学', '生活服务技术', '交通运输', '农业科学', '艺术', '军事、战争', '自然科学总论']
16 | CLASSES=['Computer Science','Physics','Mathematics','Statistics','Quantitative Biology','Quantitative Finance']
17 | 
18 | MAX_SEQUENCE_LEN=256
19 | MAX_WORDS_LEN=20000
20 | EMBED_SIZE=300
21 | 
22 | CNN='CNN_MODEL'
23 | CHAR_CNN='CHAR_CNN'
24 | 
25 | FAST='FAST'
26 | 
27 | TEXT_LSTM='TEXT_LSTM'
28 | TEXT_BI_LSTM='TEXT_BI_LSTM'
29 | TEXT_ATT_BI_LSTM='TEXT_ATT_BI_LSTM'
30 | 
31 | TEXT_GRU='TEXT_GRU'
32 | TEXT_BI_GRU='TEXT_BI_GRU'
33 | TEXT_ATT_BI_GRU='TEXT_ATT_BI_GRU'
34 | 
35 | 
-------------------------------------------------------------------------------- /load_data.py: --------------------------------------------------------------------------------
1 | from config import TOK_PATH, CLASSES, TRAIN_PATH
2 | import pickle
3 | from tensorflow.keras.preprocessing.sequence import pad_sequences
4 | from tqdm import tqdm
5 | import string
6 | import enchant
7 | import os
8 | import re
9 | import pandas as pd
10 | # from imblearn.over_sampling import SMOTE
11 | import matplotlib.pyplot as plt
12 | 
13 | 
14 | def get_features(text_series, maxlen):
15 |     with open(TOK_PATH, 'rb') as handle:
16 |         tokenizer = pickle.load(handle)
17 |     sequences = tokenizer.texts_to_sequences(text_series)
18 |     return pad_sequences(sequences, maxlen=maxlen)
19 | 
20 | 
21 | def clean(abstracts_tmp, labels_tmp):
22 |     f = open('data/stopwords_en.txt', 'r', encoding='utf-8')
23 |     stopwords = [words.replace('\n', '') for words in f.readlines()]
24 |     f.close()
25 |     reg = re.compile(r'<[^>]+>', re.S)
26 |     max_seq_len = 0
27 |     all_words = []
28 |     dic_en = enchant.Dict("en_US")
29 |     t = str.maketrans({key: None for key in string.punctuation})
30 |     abstracts = []
31 |     for i in tqdm(range(len(abstracts_tmp)), desc='CLEANING...'):
32 |         line = abstracts_tmp[i]
33 |         desc = reg.sub('', line)
34 |         desc = re.sub(r'\W+', ' ', desc)
35 |         desc = desc.translate(t).lower()
36 |         desc = desc.split(' ')
37 |         x = ''
38 |         for word in desc:
39 |             if x.find(word) == -1 and word not in stopwords and word != '' and word != ' ' and re.search(r'\d', word) is None and dic_en.check(word):
40 |                 if word not in all_words:
41 |                     all_words.append(word)
42 |                 x += word+' '
43 |         x = x.strip()
44 |         if len(x) < 4:
45 |             labels_tmp.drop([i], inplace=True)
46 |             continue
47 |         if len(x.split(' ')) > max_seq_len:
48 |             max_seq_len = len(x.split(' '))
49 |         abstracts.append(x)
50 |     print('*'*20+'DATA DETAIL'+'*'*20)
51 |     print('MAX SEQ LEN:{}'.format(max_seq_len))
52 |     print('ALL WORDS:{}'.format(len(all_words)))
53 |     return abstracts, labels_tmp
54 | 
55 | 
56 | def load_data(path):
57 |     if os.path.basename(path).find('.csv') != -1:
58 |         datas = pd.read_csv(path)
59 |         labels_tmp = datas[[class_name for class_name in CLASSES]]
60 |         abstracts_tmp = datas.ABSTRACT.tolist()
61 |     elif os.path.basename(path).find('.txt') != -1:
62 |         f = open(path, 'r', encoding='utf-8')
63 |         lines = f.readlines()
64 |         f.close()
65 |         abstracts_tmp = []
66 |         labels_tmp = []
67 |         for line in tqdm(lines, desc='LOAD DATA'):
68 |             items = str(line).split('\t')
69 |             label = str(items[0]).split('|')
70 |             word = re.split(r'\W+', str(items[1]))
71 |             labels_tmp.append(label)
72 |             abstracts_tmp.append(' '.join(word))  # join tokens back into one string so clean() receives text, as in the CSV branch
73 |     abstracts, labels = clean(abstracts_tmp, labels_tmp)
74 |     # Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclass targets. Multilabel and multioutput targets are not supported.
75 |     # smo = SMOTE(random_state=42)
76 |     # abstracts, labels = smo.fit_sample(abstracts, labels)
77 |     print('DATA LEN:{}'.format(len(abstracts)))
78 |     print(labels.sum(axis=0))
79 |     return abstracts, labels
80 | 
81 | 
82 | if __name__ == '__main__':
83 |     load_data(TRAIN_PATH)
84 | 
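The commented-out SMOTE block above (and the TODO in readme.md) points at a real limitation: imbalanced-learn does not accept multi-label targets. A naive alternative is to randomly duplicate rows that carry rare labels before tokenization. The helper below is an editorial sketch rather than part of the repository — oversample_rare_labels and min_count are made-up names — and it assumes the (abstracts, labels) pair returned by load_data(), i.e. a list of cleaned strings and a 0/1 DataFrame with one column per class.

import pandas as pd

def oversample_rare_labels(abstracts, labels, min_count=1000, seed=42):
    # Duplicate random positive rows of each under-represented class until it has ~min_count positives.
    df = labels.reset_index(drop=True).copy()
    df['abstract'] = list(abstracts)
    for class_name in labels.columns:
        positives = df[df[class_name] == 1]
        deficit = min_count - len(positives)
        if deficit > 0 and len(positives) > 0:
            df = pd.concat([df, positives.sample(deficit, replace=True, random_state=seed)])
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)  # shuffle the duplicated rows in
    return df['abstract'].tolist(), df.drop(columns=['abstract'])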
-------------------------------------------------------------------------------- /model.py: --------------------------------------------------------------------------------
1 | from tensorflow.keras.layers import Embedding
2 | from config import *
3 | from models.char_cnn import *
4 | from models.cnn import *
5 | from models.fast import *
6 | from models.text_att_bi_gru import *
7 | from models.text_att_bi_lstm import *
8 | from models.text_bi_gru import *
9 | from models.text_bi_lstm import *
10 | from models.text_gru import *
11 | from models.text_lstm import *
12 | 
13 | 
14 | def create_embedding(embedding_matrix):
15 |     return Embedding(input_dim=MAX_WORDS_LEN,
16 |                      output_dim=EMBED_SIZE,
17 |                      weights=[embedding_matrix],
18 |                      input_length=MAX_SEQUENCE_LEN,
19 |                      trainable=False)
20 | 
21 | def get_model(model_type, embedding_matrix):
22 |     class_len = len(CLASSES)
23 |     model_class = globals()[model_type](create_embedding(embedding_matrix), class_len, MAX_SEQUENCE_LEN)
24 |     model = model_class.build()
25 |     # optimizer: adam; loss: binary cross-entropy (one sigmoid output per label); metric: categorical accuracy over the predicted probabilities
26 |     model.compile(
27 |         optimizer='adam',
28 |         loss='binary_crossentropy',
29 |         metrics=['categorical_accuracy'])
30 |     return model
-------------------------------------------------------------------------------- /models/attention.py: --------------------------------------------------------------------------------
1 | from tensorflow.keras.layers import Dense, Lambda, dot, Activation, concatenate
2 | from tensorflow.keras.layers import Layer
3 | 
4 | 
5 | class Attention(Layer):
6 | 
7 |     def __init__(self, **kwargs):
8 |         super().__init__(**kwargs)
9 | 
10 |     def __call__(self, hidden_states):
11 |         """
12 |         Many-to-one attention mechanism for Keras.
13 |         @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim).
14 |         @return: 2D tensor with shape (batch_size, 128)
15 |         @author: felixhao28.
16 | """ 17 | hidden_size = int(hidden_states.shape[2]) 18 | # Inside dense layer 19 | # hidden_states dot W => score_first_part 20 | # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size) 21 | # W is the trainable weight matrix of attention Luong's multiplicative style score 22 | score_first_part = Dense( 23 | hidden_size, use_bias=False, name='attention_score_vec')(hidden_states) 24 | # score_first_part dot last_hidden_state => attention_weights 25 | # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps) 26 | h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), 27 | name='last_hidden_state')(hidden_states) 28 | score = dot([score_first_part, h_t], [2, 1], name='attention_score') 29 | attention_weights = Activation( 30 | 'softmax', name='attention_weight')(score) 31 | # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size) 32 | context_vector = dot([hidden_states, attention_weights], [ 33 | 1, 1], name='context_vector') 34 | pre_activation = concatenate( 35 | [context_vector, h_t], name='attention_output') 36 | attention_vector = Dense( 37 | 128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation) 38 | return attention_vector 39 | -------------------------------------------------------------------------------- /models/char_cnn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, Convolution1D, GlobalMaxPooling1D, concatenate, Dense, AlphaDropout, Concatenate 2 | from tensorflow.keras.models import Model 3 | 4 | 5 | class CHAR_CNN: 6 | def __init__(self, embedding_layer, class_len, max_sequence_len): 7 | self.embedding_layer = embedding_layer 8 | self.class_len = class_len 9 | self.max_sequence_len = max_sequence_len 10 | 11 | def build(self): 12 | conv_layers = [[256, 10], [256, 7], [256, 5], [256, 3]] 13 | fully_connected_layers = [1024, 1024] 14 | input = Input(shape=(self.max_sequence_len,), 15 | dtype='int32', name='input') 16 | embedded_sequence = self.embedding_layer(input) 17 | 18 | convolution_output = [] 19 | for num_filters, filter_width in conv_layers: 20 | conv = Convolution1D(filters=num_filters, 21 | kernel_size=filter_width, 22 | activation='tanh', 23 | name='Conv1D_{}_{}'.format(num_filters, filter_width))(embedded_sequence) 24 | pool = GlobalMaxPooling1D(name='MaxPoolingOverTime_{}_{}'.format( 25 | num_filters, filter_width))(conv) 26 | convolution_output.append(pool) 27 | x = Concatenate()(convolution_output) 28 | for fl in fully_connected_layers: 29 | x = Dense(fl, activation='selu', 30 | kernel_initializer='lecun_normal')(x) 31 | x = AlphaDropout(0.5)(x) 32 | 33 | output = Dense(self.class_len, activation='sigmoid')(x) 34 | model = Model(inputs=input, outputs=output) 35 | return model 36 | -------------------------------------------------------------------------------- /models/cnn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Concatenate, Dropout, Flatten, Dense, concatenate 2 | from tensorflow.keras.models import Model 3 | 4 | 5 | class CNN_MODEL: 6 | def __init__(self, embedding_layer, class_len, max_sequence_len): 7 | self.embedding_layer = embedding_layer 8 | self.class_len = class_len 9 | self.max_sequence_len = max_sequence_len 10 | 11 | def build(self): 12 | filter_sizes = [1, 2, 3, 4, 5] 13 | input = 
Input(shape=(self.max_sequence_len,), 14 | dtype='int32', name='input') 15 | embedded_sequence = self.embedding_layer(input) 16 | 17 | conv_layers = [] 18 | for fsz in filter_sizes: 19 | conv1 = Conv1D( 20 | 256, 21 | fsz, 22 | kernel_initializer='lecun_uniform', 23 | activation='tanh', 24 | )(embedded_sequence) 25 | pool_size = self.max_sequence_len - fsz + 1 26 | pooling = MaxPooling1D(pool_size=pool_size)(conv1) 27 | conv_layers.append(pooling) 28 | merged = Concatenate()(conv_layers) 29 | dropout = Dropout(0.5)(merged) 30 | flattened = Flatten()(dropout) 31 | 32 | output = Dense(self.class_len, activation='sigmoid')(flattened) 33 | model = Model(inputs=input, outputs=output) 34 | return model 35 | 36 | def build_base(self): 37 | filter_sizes = [3, 4] 38 | convs = [] 39 | input = Input(shape=(self.max_sequence_len,), 40 | dtype='int32', name='input') 41 | embedded_sequence = self.embedding_layer(input) 42 | 43 | for fsz in filter_sizes: 44 | conv = Conv1D(512, kernel_size=fsz, activation='relu')( 45 | embedded_sequence) 46 | pool = MaxPooling1D(2)(conv) 47 | convs.append(pool) 48 | merge1 = concatenate(convs, axis=1) 49 | dropout = Dropout(0.5)(merge1) 50 | conv1 = Conv1D(256, 4, activation='relu')(dropout) 51 | pool1 = MaxPooling1D(5)(conv1) 52 | flat = Flatten()(pool1) 53 | dense = Dense(512, activation='relu')(flat) 54 | 55 | output = Dense(self.class_len, activation='sigmoid', name='pred')(dense) 56 | model = Model(inputs=input, outputs=output) 57 | return model 58 | -------------------------------------------------------------------------------- /models/fast.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, Concatenate, Dense, Dropout, GlobalMaxPooling1D, GlobalAveragePooling1D 2 | from tensorflow.keras.models import Model, Sequential 3 | 4 | 5 | class FAST: 6 | def __init__(self, embedding_layer, class_len, max_sequence_len): 7 | self.embedding_layer = embedding_layer 8 | self.class_len = class_len 9 | self.max_sequence_len = max_sequence_len 10 | 11 | def build_base(self): 12 | model = Sequential() 13 | model.add(self.embedding_layer) 14 | model.add(GlobalAveragePooling1D()) 15 | model.add(Dense(self.class_len, activation='sigmoid')) 16 | return model 17 | 18 | def build(self): 19 | input = Input(shape=(self.max_sequence_len,), 20 | dtype='int32', name='input') 21 | embedded_sequence = self.embedding_layer(input) 22 | 23 | pool_max = GlobalMaxPooling1D()(embedded_sequence) 24 | pool_ave = GlobalAveragePooling1D()(embedded_sequence) 25 | x = Concatenate()([pool_ave, pool_max]) 26 | x = Dense(128, activation="tanh")(x) 27 | x = Dropout(0.2)(x) 28 | 29 | output = Dense(self.class_len, activation='sigmoid')(x) 30 | model = Model(inputs=input, outputs=output) 31 | return model 32 | -------------------------------------------------------------------------------- /models/text_att_bi_gru.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input,GRU,Dense,Bidirectional 2 | from tensorflow.keras.models import Model 3 | from models.attention import Attention 4 | class TEXT_ATT_BI_GRU: 5 | def __init__(self, embedding_layer, class_len, max_sequence_len): 6 | self.embedding_layer = embedding_layer 7 | self.class_len = class_len 8 | self.max_sequence_len = max_sequence_len 9 | 10 | def build(self): 11 | input = Input(shape=(self.max_sequence_len,)) 12 | embedding_layer = self.embedding_layer(input) 13 | 14 | bi = Bidirectional(GRU(128, 
return_sequences=True))(embedding_layer) 15 | att = Attention()(bi) 16 | 17 | output = Dense(self.class_len, activation='sigmoid')(att) 18 | model = Model(inputs=input, outputs=output) 19 | return model -------------------------------------------------------------------------------- /models/text_att_bi_lstm.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input,LSTM,Dense,Bidirectional 2 | from tensorflow.keras.models import Model 3 | from models.attention import Attention 4 | class TEXT_ATT_BI_LSTM: 5 | def __init__(self, embedding_layer, class_len, max_sequence_len): 6 | self.embedding_layer = embedding_layer 7 | self.class_len = class_len 8 | self.max_sequence_len = max_sequence_len 9 | 10 | def build(self): 11 | input = Input(shape=(self.max_sequence_len,)) 12 | embedding_layer = self.embedding_layer(input) 13 | 14 | bi = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer) 15 | att = Attention()(bi) 16 | 17 | output = Dense(self.class_len, activation='sigmoid')(att) 18 | model = Model(inputs=input, outputs=output) 19 | return model -------------------------------------------------------------------------------- /models/text_bi_gru.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input,GRU,Dense,Bidirectional 2 | from tensorflow.keras.models import Model 3 | class TEXT_BI_GRU: 4 | def __init__(self, embedding_layer, class_len, max_sequence_len): 5 | self.embedding_layer = embedding_layer 6 | self.class_len = class_len 7 | self.max_sequence_len = max_sequence_len 8 | 9 | def build(self): 10 | input = Input(shape=(self.max_sequence_len,)) 11 | embedding_layer = self.embedding_layer(input) 12 | 13 | x = Bidirectional(GRU(128))(embedding_layer) 14 | 15 | output = Dense(self.class_len, activation='sigmoid')(x) 16 | model = Model(inputs=input, outputs=output) 17 | return model -------------------------------------------------------------------------------- /models/text_bi_lstm.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input,LSTM,Dense,Bidirectional 2 | from tensorflow.keras.models import Model 3 | class TEXT_BI_LSTM: 4 | def __init__(self, embedding_layer, class_len, max_sequence_len): 5 | self.embedding_layer = embedding_layer 6 | self.class_len = class_len 7 | self.max_sequence_len = max_sequence_len 8 | 9 | def build(self): 10 | input = Input(shape=(self.max_sequence_len,)) 11 | embedding_layer = self.embedding_layer(input) 12 | 13 | x = Bidirectional(LSTM(128))(embedding_layer) 14 | 15 | output = Dense(self.class_len, activation='sigmoid')(x) 16 | model = Model(inputs=input, outputs=output) 17 | return model -------------------------------------------------------------------------------- /models/text_gru.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, GRU, Dense,BatchNormalization,Dropout 2 | from tensorflow.keras.models import Model 3 | 4 | 5 | class TEXT_GRU: 6 | def __init__(self, embedding_layer, class_len, max_sequence_len): 7 | self.embedding_layer = embedding_layer 8 | self.class_len = class_len 9 | self.max_sequence_len = max_sequence_len 10 | 11 | def build(self): 12 | input = Input(shape=(self.max_sequence_len,)) 13 | embedding_layer = self.embedding_layer(input) 14 | gru = GRU( 15 | 256, 16 | kernel_initializer="glorot_uniform", 17 | 
            recurrent_initializer='normal',
18 |             activation='relu',
19 |         )(embedding_layer)
20 | 
21 |         batch_normalization = BatchNormalization()(gru)
22 |         dropout = Dropout(0.1)(batch_normalization)
23 |         output = Dense(self.class_len, activation='sigmoid')(dropout)
24 | 
25 |         model = Model(inputs=input, outputs=output)
26 |         return model
27 | 
28 |     def build_base(self):
29 |         input = Input(shape=(self.max_sequence_len,))
30 |         embedding_layer = self.embedding_layer(input)
31 | 
32 |         LSTM_Layer_1 = GRU(128)(embedding_layer)
33 | 
34 |         output = Dense(self.class_len, activation='sigmoid')(LSTM_Layer_1)
35 |         model = Model(inputs=input, outputs=output)
36 |         return model
37 | 
-------------------------------------------------------------------------------- /models/text_lstm.py: --------------------------------------------------------------------------------
1 | from tensorflow.keras.layers import Input, LSTM, Dense
2 | from tensorflow.keras.models import Model
3 | class TEXT_LSTM:
4 |     def __init__(self, embedding_layer, class_len, max_sequence_len):
5 |         self.embedding_layer = embedding_layer
6 |         self.class_len = class_len
7 |         self.max_sequence_len = max_sequence_len
8 | 
9 |     def build(self):
10 |         input = Input(shape=(self.max_sequence_len,))
11 |         embedding_layer = self.embedding_layer(input)
12 | 
13 |         LSTM_Layer_1 = LSTM(128)(embedding_layer)
14 | 
15 |         output = Dense(self.class_len, activation='sigmoid')(LSTM_Layer_1)
16 |         model = Model(inputs=input, outputs=output)
17 |         return model
-------------------------------------------------------------------------------- /predict.py: --------------------------------------------------------------------------------
1 | from config import *
2 | import tensorflow as tf
3 | from tqdm import tqdm
4 | import numpy as np
5 | import re
6 | import os
7 | from load_data import *
8 | 
9 | 
10 | def predict(model_type=''):
11 |     x, y = load_data(PREDICT_PATH)
12 |     model = tf.keras.models.load_model(MODEL_PATH)
13 |     predict_results = {}
14 |     for i in tqdm(range(len(x)), desc='PREDICTING...'):
15 |         seg = x[i]
16 |         result = model.predict(get_features([seg], MAX_SEQUENCE_LEN)).mean(0)
17 |         id_top = list(list(np.where(result > PREDICT_LEVEL))[0])
18 |         id_max = list(result).index(max(result))
19 |         if id_max not in id_top:
20 |             id_top.append(id_max)
21 |         for id in id_top:
22 |             predict_results[id] = predict_results[id] + \
23 |                 1 if id in predict_results else 1
24 |     print('分类结果({})'.format(model_type.upper()))
25 |     print('| 分类(Class) | 数量(Quantity) |')
26 |     print('| :----------------- | :--- |')
27 |     for i in range(len(CLASSES)):
28 |         value = predict_results[i] if i in predict_results else 0
29 |         print('| {} | {} |'.format(str(CLASSES[i]), str(value)))
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     predict(TEXT_GRU)
34 | 
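predict() above reloads the padded features and calls model.predict() once per abstract inside the loop, which is slow on large files. A batched variant in the same spirit — an editorial sketch, not part of the repository; predict_batched is a made-up name, while get_features, MAX_SEQUENCE_LEN, PREDICT_LEVEL and CLASSES are the repo's own helpers and constants — applies the same threshold-plus-argmax rule to one vectorised call:

import numpy as np

def predict_batched(model, abstracts):
    probs = model.predict(get_features(abstracts, MAX_SEQUENCE_LEN), batch_size=256)
    counts = {i: 0 for i in range(len(CLASSES))}
    for row in probs:
        ids = set(np.where(row > PREDICT_LEVEL)[0].tolist())
        ids.add(int(row.argmax()))  # always count at least the top-scoring class
        for class_id in ids:
            counts[class_id] += 1
    return counts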
-------------------------------------------------------------------------------- /readme.md: --------------------------------------------------------------------------------
1 | # Keras下的多标签分类(Keras Multi-Label Text Classification)
2 | 
3 | ## 简介(Introduction)
4 | 
5 | * 本项目利用KERAS结合不同的分类器并使用Glove词向量,对英文文本(中文需修改load_data.py增加分词功能并更换词向量)进行多标签分类。
6 | * 项目起源是因为以前做过美亚图书分类转换中图法分类,当年是通过爬取美亚的图书信息和分类,然后人工制定规则映射,时间久了积攒了些数据,就想着拿KERAS实现一下AI智能分类,毕竟还有很多图书信息不是来自美亚,没有对应的分类。
7 | * 共有9种分类器可供选择,大部分模型只做到了baseline,部分借鉴了[Magpie](https://github.com/inspirehep/magpie)和[Keras-TextClassification](https://github.com/yongzhuo/Keras-TextClassification)。
8 | * 数据质量及标签数量对分类结果有非常大的影响,数据太过稀疏及标签过多会导致结果惨不忍睹(3分类ACC可达92%,6分类ACC76%,24分类则只有41%)。
9 | 
10 | An English translation of the introduction above:
11 | * This project uses Keras with GloVe word vectors and a choice of classifiers to do multi-label classification of English text (for Chinese text, modify load_data.py to add word segmentation and swap in Chinese word vectors).
12 | * The project grew out of earlier work mapping Amazon book categories to CLC (Chinese Library Classification): book information and categories were crawled from Amazon.com and then mapped with hand-written rules. Over time enough data accumulated to try an automatic Keras classifier, since a lot of book information does not come from Amazon and therefore has no category assigned.
13 | * There are 9 classifiers to choose from; most models only reach a baseline and partly draw on [Magpie](https://github.com/inspirehep/magpie) and [Keras-TextClassification](https://github.com/yongzhuo/Keras-TextClassification).
14 | * Data quality and the number of labels have a very large impact on the results. Overly sparse data and too many labels lead to poor scores (about 92% ACC with 3 classes, 76% with 6 classes, and only 41% with 24 classes).
15 | 
16 | ## 依赖(Requirements)
17 | 
18 |     tqdm==4.49.0
19 |     numpy==1.18.5
20 |     pandas==1.1.2
21 |     tensorflow==2.3.0
22 |     matplotlib==3.3.2
23 |     pyenchant==3.1.1
24 |     scikit_learn==0.23.2
25 | 
26 | ## 使用方法(Guide)
27 | 
28 | 1.配置config.py中的参数(configure the parameters in config.py)
29 | 2.修改train.py的模型类型后运行(set the model type in train.py, then run it)
30 | 3.修改predict.py的模型类型后运行(set the model type in predict.py, then run it)
31 | 
32 | 也可以直接运行 run.py(you can also just run run.py directly)
33 | 
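As a concrete example of steps 2 and 3 (a hypothetical edit, not prescribed by the project — any model constant from config.py such as TEXT_BI_LSTM works), the `__main__` blocks are the only lines that need to change:

    # train.py
    if __name__ == '__main__':
        train(TEXT_BI_LSTM)

    # predict.py
    if __name__ == '__main__':
        predict(TEXT_BI_LSTM)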
34 | ## 数据集(Dataset)
35 | 
36 | 数据集来自[Kaggle](https://www.kaggle.com/vetrirah/janatahack-independence-day-2020-ml-hackathon)
37 | Dataset from [Kaggle](https://www.kaggle.com/vetrirah/janatahack-independence-day-2020-ml-hackathon)
38 | 
39 | 清洗后的数据概况(Dataset details after cleaning)
40 | MAX SEQ LEN:778
41 | ALL WORDS:20430
42 | DATA LEN:20971
43 | | 分类(Class) | 数量(Quantity) |
44 | | :------------------- | :------------- |
45 | | Computer Science | 8594 |
46 | | Physics | 6013 |
47 | | Mathematics | 5618 |
48 | | Statistics | 5206 |
49 | | Quantitative Biology | 586 |
50 | | Quantitative Finance | 249 |
51 | 
52 | ## 分类器及成绩(Results)
53 | 
54 | | 分类器(Classifier) | val_categorical_accuracy | epochs |
55 | | :----------------- | :----------------------- | :----- |
56 | | CNN | 0.7607 | 18 |
57 | | FAST | 0.7417 | 13 |
58 | | **CHAR_CNN** | **0.7655** | **10** |
59 | | TEXT_ATT_BI_GRU | 0.7493 | 5 |
60 | | TEXT_ATT_BI_LSTM | 0.7583 | 15 |
61 | | TEXT_BI_GRU | 0.7564 | 5 |
62 | | **TEXT_BI_LSTM** | **0.7583** | **14** |
63 | | TEXT_GRU | 0.7626 | 19 |
64 | | TEXT_LSTM | 0.7498 | 4 |
65 | 
66 | 虽然CHAR_CNN的正确率看起来高那么一点点,但实际上TEXT_BI_LSTM的结果更接近真实数据。
67 | Although CHAR_CNN's accuracy looks slightly higher, TEXT_BI_LSTM's predictions are closer to the real label distribution.
68 | 
69 | ### 分类结果(TEXT_BI_LSTM)
70 | 
71 | | 分类(Class) | 数量(Quantity) |
72 | | :------------------- | :------------- |
73 | | Computer Science | 8526 |
74 | | Physics | 5603 |
75 | | Mathematics | 5464 |
76 | | Statistics | 5640 |
77 | | Quantitative Biology | 305 |
78 | | Quantitative Finance | 229 |
79 | 
80 | ## 参考(Reference)
81 | 
82 | [Magpie](https://github.com/inspirehep/magpie)
83 | 
84 | [Keras-TextClassification](https://github.com/yongzhuo/Keras-TextClassification)
85 | 
86 | ## TODO
87 | 
88 | 1.针对多标签的稀疏数据做SMOTE平滑处理。(SMOTE-style oversampling for sparse multi-label data)
89 | 2.找到更大,更多标签的数据集。(Find larger datasets with more labels.)
90 | 3.模型优化(Model optimization)
91 | 
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | tqdm==4.49.0
2 | numpy==1.18.5
3 | pandas==1.1.2
4 | tensorflow==2.3.0
5 | matplotlib==3.3.2
6 | pyenchant==3.1.1
7 | scikit_learn==0.23.2
8 | 
-------------------------------------------------------------------------------- /run.py: --------------------------------------------------------------------------------
1 | from train import *
2 | from predict import *
3 | from config import *
4 | 
5 | 
6 | def train_and_predict():
7 |     result = []
8 |     models = [CNN, FAST, CHAR_CNN, TEXT_ATT_BI_GRU, TEXT_ATT_BI_LSTM, TEXT_BI_GRU, TEXT_BI_LSTM, TEXT_GRU, TEXT_LSTM]
9 |     # models=[CNN,FAST]
10 |     for model in models:
11 |         model_type, best_score, best_epoch = train(model)
12 |         result.append([model_type, best_score, best_epoch])
13 |         predict(model)
14 |     print('| 分类器(Classifier) | val_categorical_accuracy | epochs |')
15 |     print('| :----------------- | :----------------------- | :----- |')
16 |     for r in result:
17 |         print('| {} | {} | {} |'.format(r[0], r[1], r[2]))
18 | 
19 | 
20 | if __name__ == '__main__':
21 |     train_and_predict()
22 | 
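run.py above builds and trains all nine models in one process. One small optional addition (an editorial sketch, not in the original run.py) is to clear the Keras session at the end of each iteration so the previous model's graph and weights are released before the next model is built:

import tensorflow as tf

for model in models:
    model_type, best_score, best_epoch = train(model)
    result.append([model_type, best_score, best_epoch])
    predict(model)
    tf.keras.backend.clear_session()  # free the previous model's graph/weights before the next run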
-------------------------------------------------------------------------------- /train.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import pickle
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | import tensorflow as tf
8 | from sklearn.model_selection import train_test_split
9 | from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint,
10 |                                         ReduceLROnPlateau)
11 | from tensorflow.keras.preprocessing.sequence import pad_sequences
12 | from tensorflow.keras.preprocessing.text import Tokenizer
13 | from tqdm import tqdm
14 | import numpy as np
15 | from model import *
16 | from config import *
17 | from load_data import load_data
18 | 
19 | 
20 | def build_matrix(embeddings_index, word_index):
21 |     embedding_matrix = np.zeros((MAX_WORDS_LEN, EMBED_SIZE))
22 |     for word, i in tqdm(word_index.items(), desc='BUILD EMBEDDING'):
23 |         if i >= MAX_WORDS_LEN:
24 |             continue
25 |         try:
26 |             # vector for this word
27 |             embedding_vector = embeddings_index[word]
28 |         except:
29 |             # fall back to the GloVe vector of the token "unknown" when the word is missing
30 |             embedding_vector = embeddings_index["unknown"]
31 |         if embedding_vector is not None:
32 |             # keep row i of embedding_matrix aligned with index i in word_index
33 |             embedding_matrix[i] = embedding_vector
34 |     return embedding_matrix
35 | 
36 | 
37 | def get_coefs(word, *arr):
38 |     return word, np.asarray(arr, dtype='float32')
39 | 
40 | 
41 | def train(model_type=''):
42 |     abstract, labels = load_data(TRAIN_PATH)
43 |     # word vectors: fit the tokenizer and build padded index sequences
44 |     tokenizer = Tokenizer(num_words=MAX_WORDS_LEN, lower=True)
45 |     tokenizer.fit_on_texts(abstract)
46 |     sequences = tokenizer.texts_to_sequences(abstract)
47 |     data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LEN)
48 | 
49 |     with open(TOK_PATH, 'wb') as handle:
50 |         pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
51 | 
52 |     # GloVe embeddings
53 |     embeddings_index = dict(get_coefs(*o.split(" "))
54 |                             for o in open(EMBEDDING_PATH))
55 |     glove_embedding_matrix = build_matrix(
56 |         embeddings_index, tokenizer.word_index)
57 | 
58 |     x_train, x_validation, y_train, y_validation = train_test_split(
59 |         data.tolist(), labels.values.tolist(), test_size=0.1, random_state=123)
60 | 
61 |     # build the model
62 |     model = get_model(model_type, glove_embedding_matrix)
63 | 
64 |     log_dir = LOG_PATH + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
65 |     tensorboard_callback = tf.keras.callbacks.TensorBoard(
66 |         log_dir=log_dir, histogram_freq=1)
67 | 
68 |     # ReduceLROnPlateau: lower the learning rate when the monitored metric stops improving; EarlyStopping: stop after `patience` epochs without improvement; ModelCheckpoint: keep only the best model
69 |     callbacks = [
70 |         ReduceLROnPlateau(monitor='categorical_accuracy'),
71 |         EarlyStopping(patience=30, monitor='val_categorical_accuracy'),
72 |         ModelCheckpoint(filepath=MODEL_PATH, save_best_only=True),
73 |         tensorboard_callback
74 |     ]
75 | 
76 |     history = model.fit(x_train, y_train,
77 |                         epochs=100,
78 |                         batch_size=1024,
79 |                         verbose=1,
80 |                         validation_data=(x_validation, y_validation),
81 |                         callbacks=callbacks)
82 |     val_cat_acc = history.history['val_categorical_accuracy']
83 |     best_score = max(val_cat_acc)
84 |     best_epoch = val_cat_acc.index(best_score)
85 |     return str(model_type).upper(), '%.4f' % best_score, best_epoch
86 | 
87 | 
88 | if __name__ == '__main__':
89 |     train(TEXT_GRU)
90 | 
--------------------------------------------------------------------------------
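A closing editorial note (not part of the repository): val_categorical_accuracy in the results table says little about rare labels such as Quantitative Finance. Since scikit-learn is already in requirements.txt, a per-label report can be printed on the held-out split produced inside train(); report_per_label is a made-up name, while PREDICT_LEVEL and CLASSES come from config.py:

import numpy as np
from sklearn.metrics import classification_report

def report_per_label(model, x_validation, y_validation):
    probs = model.predict(np.array(x_validation))
    preds = (probs > PREDICT_LEVEL).astype(int)  # same threshold used in predict.py
    print(classification_report(np.array(y_validation), preds,
                                target_names=CLASSES, zero_division=0))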