├── .gitignore ├── LICENSE ├── config.py ├── load_data.py ├── model.py ├── models ├── attention.py ├── char_cnn.py ├── cnn.py ├── fast.py ├── text_att_bi_gru.py ├── text_att_bi_lstm.py ├── text_bi_gru.py ├── text_bi_lstm.py ├── text_gru.py └── text_lstm.py ├── predict.py ├── readme.md ├── requirements.txt ├── run.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | *.png 141 | logs 142 | make_train_data.py 143 | .vscode 144 | data/ 145 | saves 146 | .idea 147 | 148 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 gezimonkey 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
-------------------------------------------------------------------------------- /config.py: --------------------------------------------------------------------------------
1 | MODEL_PATH='saves/model.h5'
2 | TOK_PATH='saves/tokenizer.pickle'
3 | LOG_PATH='./logs/fit/'
4 | EMBEDDING_PATH = '../../dataset/glove/glove.6B.300d.txt'
5 | TRAIN_PATH='data/train.csv'
6 | PREDICT_PATH='data/train.csv'
7 | PREDICT_LEVEL=0.5
8 | # TRAIN_PATH='data/pre_data.csv'
9 | # PREDICT_PATH='data/pre_data.csv'
10 | 
11 | 
12 | 
13 | # CLASSES = ['政治、法律', '社会科学总论', '文化、教育、体育', '语言、文字', '医药、卫生', '计算机科学',
14 | # '环境科学、安全科学', '历史、地理', '数理科学和化学', '工业技术', '综合、工具书', '文学', '经济、商业',
15 | # '航空、航天', '建筑科学', '生物科学', '哲学、宗教', '天文学、地球科学', '生活服务技术', '交通运输', '农业科学', '艺术', '军事、战争', '自然科学总论']
16 | CLASSES=['Computer Science','Physics','Mathematics','Statistics','Quantitative Biology','Quantitative Finance']
17 | 
18 | MAX_SEQUENCE_LEN=256
19 | MAX_WORDS_LEN=20000
20 | EMBED_SIZE=300
21 | 
22 | CNN='CNN_MODEL'
23 | CHAR_CNN='CHAR_CNN'
24 | 
25 | FAST='FAST'
26 | 
27 | TEXT_LSTM='TEXT_LSTM'
28 | TEXT_BI_LSTM='TEXT_BI_LSTM'
29 | TEXT_ATT_BI_LSTM='TEXT_ATT_BI_LSTM'
30 | 
31 | TEXT_GRU='TEXT_GRU'
32 | TEXT_BI_GRU='TEXT_BI_GRU'
33 | TEXT_ATT_BI_GRU='TEXT_ATT_BI_GRU'
34 | 
35 | 
-------------------------------------------------------------------------------- /load_data.py: --------------------------------------------------------------------------------
1 | from config import TOK_PATH, CLASSES, TRAIN_PATH
2 | import pickle
3 | from tensorflow.keras.preprocessing.sequence import pad_sequences
4 | from tqdm import tqdm
5 | import string
6 | import enchant
7 | import os
8 | import re
9 | import pandas as pd
10 | # from imblearn.over_sampling import SMOTE
11 | import matplotlib.pyplot as plt
12 | 
13 | 
14 | def get_features(text_series, maxlen):
15 |     with open(TOK_PATH, 'rb') as handle:
16 |         tokenizer = pickle.load(handle)
17 |     sequences = tokenizer.texts_to_sequences(text_series)
18 |     return pad_sequences(sequences, maxlen=maxlen)
19 | 
20 | 
21 | def clean(abstracts_tmp, labels_tmp):
22 |     f = open('data/stopwords_en.txt', 'r', encoding='utf-8')
23 |     stopwords = [words.replace('\n', '') for words in f.readlines()]
24 |     f.close()
25 |     reg = re.compile(r'<[^>]+>', re.S)
26 |     max_seq_len = 0
27 |     all_words = []
28 |     dic_en = enchant.Dict("en_US")
29 |     t = str.maketrans({key: None for key in string.punctuation})
30 |     abstracts = []
31 |     for i in tqdm(range(len(abstracts_tmp)), desc='CLEANING...'):
32 |         line = abstracts_tmp[i]
33 |         desc = reg.sub('', line)
34 |         desc = re.sub(r'\W+', ' ', desc)
35 |         desc = desc.translate(t).lower()
36 |         desc = desc.split(' ')
37 |         x = ''
38 |         for word in desc:
39 |             if x.find(word) == -1 and word not in stopwords and word != '' and word != ' ' and re.search(r'\d', word) is None and dic_en.check(word):
40 |                 if word not in all_words:
41 |                     all_words.append(word)
42 |                 x += word+' '
43 |         x = x.strip()
44 |         if len(x) < 4:
45 |             labels_tmp.drop([i], inplace=True)
46 |             continue
47 |         if len(x.split(' ')) > max_seq_len:
48 |             max_seq_len = len(x.split(' '))
49 |         abstracts.append(x)
50 |     print('*'*20+'DATA DETAIL'+'*'*20)
51 |     print('MAX SEQ LEN:{}'.format(max_seq_len))
52 |     print('ALL WORDS:{}'.format(len(all_words)))
53 |     return abstracts, labels_tmp
54 | 
55 | 
56 | def load_data(path):
57 |     if os.path.basename(path).find('.csv') != -1:
58 |         datas = pd.read_csv(path)
59 |         labels_tmp = datas[[class_name for class_name in CLASSES]]
60 |         abstracts_tmp = datas.ABSTRACT.tolist()
61 |     elif os.path.basename(path).find('.txt') != -1:
62 |         f = open(path, 'r', encoding='utf-8')
63 |         lines = f.readlines()
64 |         f.close()
65 |         abstracts_tmp = []
66 |         labels_tmp = []
67 |         for line in tqdm(lines, desc='LOAD DATA'):
68 |             items = str(line).split('\t')
69 |             label = str(items[0]).split('|')
70 |             word = re.split(r'\W+', str(items[1]))
71 |             labels_tmp.append(label)
72 |             abstracts_tmp.append(' '.join(word))  # join tokens back into one string so clean() receives text, as in the CSV branch
73 |     abstracts, labels = clean(abstracts_tmp, labels_tmp)
74 |     # Imbalanced-learn currently supports binary, multiclass and binarized encoded multiclass targets. Multilabel and multioutput targets are not supported.
75 |     # smo = SMOTE(random_state=42)
76 |     # abstracts, labels = smo.fit_sample(abstracts, labels)
77 |     print('DATA LEN:{}'.format(len(abstracts)))
78 |     print(labels.sum(axis=0))
79 |     return abstracts, labels
80 | 
81 | 
82 | if __name__ == '__main__':
83 |     load_data(TRAIN_PATH)
84 | 
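The commented-out SMOTE block above (and the TODO in readme.md) points at a real limitation: imbalanced-learn does not accept multi-label targets. A naive alternative is to randomly duplicate rows that carry rare labels before tokenization. The helper below is an editorial sketch rather than part of the repository — oversample_rare_labels and min_count are made-up names — and it assumes the (abstracts, labels) pair returned by load_data(), i.e. a list of cleaned strings and a 0/1 DataFrame with one column per class.

import pandas as pd

def oversample_rare_labels(abstracts, labels, min_count=1000, seed=42):
    # Duplicate random positive rows of each under-represented class until it has ~min_count positives.
    df = labels.reset_index(drop=True).copy()
    df['abstract'] = list(abstracts)
    for class_name in labels.columns:
        positives = df[df[class_name] == 1]
        deficit = min_count - len(positives)
        if deficit > 0 and len(positives) > 0:
            df = pd.concat([df, positives.sample(deficit, replace=True, random_state=seed)])
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)  # shuffle the duplicated rows in
    return df['abstract'].tolist(), df.drop(columns=['abstract'])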
-------------------------------------------------------------------------------- /model.py: --------------------------------------------------------------------------------
1 | from tensorflow.keras.layers import Embedding
2 | from config import *
3 | from models.char_cnn import *
4 | from models.cnn import *
5 | from models.fast import *
6 | from models.text_att_bi_gru import *
7 | from models.text_att_bi_lstm import *
8 | from models.text_bi_gru import *
9 | from models.text_bi_lstm import *
10 | from models.text_gru import *
11 | from models.text_lstm import *
12 | 
13 | 
14 | def create_embedding(embedding_matrix):
15 |     return Embedding(input_dim=MAX_WORDS_LEN,
16 |                      output_dim=EMBED_SIZE,
17 |                      weights=[embedding_matrix],
18 |                      input_length=MAX_SEQUENCE_LEN,
19 |                      trainable=False)
20 | 
21 | def get_model(model_type, embedding_matrix):
22 |     class_len = len(CLASSES)
23 |     model_class = globals()[model_type](create_embedding(embedding_matrix), class_len, MAX_SEQUENCE_LEN)
24 |     model = model_class.build()
25 |     # optimizer: adam; loss: binary cross-entropy (one sigmoid output per label); metric: categorical accuracy over the predicted probabilities
26 |     model.compile(
27 |         optimizer='adam',
28 |         loss='binary_crossentropy',
29 |         metrics=['categorical_accuracy'])
30 |     return model
-------------------------------------------------------------------------------- /models/attention.py: --------------------------------------------------------------------------------
1 | from tensorflow.keras.layers import Dense, Lambda, dot, Activation, concatenate
2 | from tensorflow.keras.layers import Layer
3 | 
4 | 
5 | class Attention(Layer):
6 | 
7 |     def __init__(self, **kwargs):
8 |         super().__init__(**kwargs)
9 | 
10 |     def __call__(self, hidden_states):
11 |         """
12 |         Many-to-one attention mechanism for Keras.
13 |         @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim).
14 |         @return: 2D tensor with shape (batch_size, 128)
15 |         @author: felixhao28.
16 | """ 17 | hidden_size = int(hidden_states.shape[2]) 18 | # Inside dense layer 19 | # hidden_states dot W => score_first_part 20 | # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size) 21 | # W is the trainable weight matrix of attention Luong's multiplicative style score 22 | score_first_part = Dense( 23 | hidden_size, use_bias=False, name='attention_score_vec')(hidden_states) 24 | # score_first_part dot last_hidden_state => attention_weights 25 | # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps) 26 | h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), 27 | name='last_hidden_state')(hidden_states) 28 | score = dot([score_first_part, h_t], [2, 1], name='attention_score') 29 | attention_weights = Activation( 30 | 'softmax', name='attention_weight')(score) 31 | # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size) 32 | context_vector = dot([hidden_states, attention_weights], [ 33 | 1, 1], name='context_vector') 34 | pre_activation = concatenate( 35 | [context_vector, h_t], name='attention_output') 36 | attention_vector = Dense( 37 | 128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation) 38 | return attention_vector 39 | -------------------------------------------------------------------------------- /models/char_cnn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, Convolution1D, GlobalMaxPooling1D, concatenate, Dense, AlphaDropout, Concatenate 2 | from tensorflow.keras.models import Model 3 | 4 | 5 | class CHAR_CNN: 6 | def __init__(self, embedding_layer, class_len, max_sequence_len): 7 | self.embedding_layer = embedding_layer 8 | self.class_len = class_len 9 | self.max_sequence_len = max_sequence_len 10 | 11 | def build(self): 12 | conv_layers = [[256, 10], [256, 7], [256, 5], [256, 3]] 13 | fully_connected_layers = [1024, 1024] 14 | input = Input(shape=(self.max_sequence_len,), 15 | dtype='int32', name='input') 16 | embedded_sequence = self.embedding_layer(input) 17 | 18 | convolution_output = [] 19 | for num_filters, filter_width in conv_layers: 20 | conv = Convolution1D(filters=num_filters, 21 | kernel_size=filter_width, 22 | activation='tanh', 23 | name='Conv1D_{}_{}'.format(num_filters, filter_width))(embedded_sequence) 24 | pool = GlobalMaxPooling1D(name='MaxPoolingOverTime_{}_{}'.format( 25 | num_filters, filter_width))(conv) 26 | convolution_output.append(pool) 27 | x = Concatenate()(convolution_output) 28 | for fl in fully_connected_layers: 29 | x = Dense(fl, activation='selu', 30 | kernel_initializer='lecun_normal')(x) 31 | x = AlphaDropout(0.5)(x) 32 | 33 | output = Dense(self.class_len, activation='sigmoid')(x) 34 | model = Model(inputs=input, outputs=output) 35 | return model 36 | -------------------------------------------------------------------------------- /models/cnn.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Concatenate, Dropout, Flatten, Dense, concatenate 2 | from tensorflow.keras.models import Model 3 | 4 | 5 | class CNN_MODEL: 6 | def __init__(self, embedding_layer, class_len, max_sequence_len): 7 | self.embedding_layer = embedding_layer 8 | self.class_len = class_len 9 | self.max_sequence_len = max_sequence_len 10 | 11 | def build(self): 12 | filter_sizes = [1, 2, 3, 4, 5] 13 | input = 
Input(shape=(self.max_sequence_len,), 14 | dtype='int32', name='input') 15 | embedded_sequence = self.embedding_layer(input) 16 | 17 | conv_layers = [] 18 | for fsz in filter_sizes: 19 | conv1 = Conv1D( 20 | 256, 21 | fsz, 22 | kernel_initializer='lecun_uniform', 23 | activation='tanh', 24 | )(embedded_sequence) 25 | pool_size = self.max_sequence_len - fsz + 1 26 | pooling = MaxPooling1D(pool_size=pool_size)(conv1) 27 | conv_layers.append(pooling) 28 | merged = Concatenate()(conv_layers) 29 | dropout = Dropout(0.5)(merged) 30 | flattened = Flatten()(dropout) 31 | 32 | output = Dense(self.class_len, activation='sigmoid')(flattened) 33 | model = Model(inputs=input, outputs=output) 34 | return model 35 | 36 | def build_base(self): 37 | filter_sizes = [3, 4] 38 | convs = [] 39 | input = Input(shape=(self.max_sequence_len,), 40 | dtype='int32', name='input') 41 | embedded_sequence = self.embedding_layer(input) 42 | 43 | for fsz in filter_sizes: 44 | conv = Conv1D(512, kernel_size=fsz, activation='relu')( 45 | embedded_sequence) 46 | pool = MaxPooling1D(2)(conv) 47 | convs.append(pool) 48 | merge1 = concatenate(convs, axis=1) 49 | dropout = Dropout(0.5)(merge1) 50 | conv1 = Conv1D(256, 4, activation='relu')(dropout) 51 | pool1 = MaxPooling1D(5)(conv1) 52 | flat = Flatten()(pool1) 53 | dense = Dense(512, activation='relu')(flat) 54 | 55 | output = Dense(self.class_len, activation='sigmoid', name='pred')(dense) 56 | model = Model(inputs=input, outputs=output) 57 | return model 58 | -------------------------------------------------------------------------------- /models/fast.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, Concatenate, Dense, Dropout, GlobalMaxPooling1D, GlobalAveragePooling1D 2 | from tensorflow.keras.models import Model, Sequential 3 | 4 | 5 | class FAST: 6 | def __init__(self, embedding_layer, class_len, max_sequence_len): 7 | self.embedding_layer = embedding_layer 8 | self.class_len = class_len 9 | self.max_sequence_len = max_sequence_len 10 | 11 | def build_base(self): 12 | model = Sequential() 13 | model.add(self.embedding_layer) 14 | model.add(GlobalAveragePooling1D()) 15 | model.add(Dense(self.class_len, activation='sigmoid')) 16 | return model 17 | 18 | def build(self): 19 | input = Input(shape=(self.max_sequence_len,), 20 | dtype='int32', name='input') 21 | embedded_sequence = self.embedding_layer(input) 22 | 23 | pool_max = GlobalMaxPooling1D()(embedded_sequence) 24 | pool_ave = GlobalAveragePooling1D()(embedded_sequence) 25 | x = Concatenate()([pool_ave, pool_max]) 26 | x = Dense(128, activation="tanh")(x) 27 | x = Dropout(0.2)(x) 28 | 29 | output = Dense(self.class_len, activation='sigmoid')(x) 30 | model = Model(inputs=input, outputs=output) 31 | return model 32 | -------------------------------------------------------------------------------- /models/text_att_bi_gru.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input,GRU,Dense,Bidirectional 2 | from tensorflow.keras.models import Model 3 | from models.attention import Attention 4 | class TEXT_ATT_BI_GRU: 5 | def __init__(self, embedding_layer, class_len, max_sequence_len): 6 | self.embedding_layer = embedding_layer 7 | self.class_len = class_len 8 | self.max_sequence_len = max_sequence_len 9 | 10 | def build(self): 11 | input = Input(shape=(self.max_sequence_len,)) 12 | embedding_layer = self.embedding_layer(input) 13 | 14 | bi = Bidirectional(GRU(128, 
return_sequences=True))(embedding_layer) 15 | att = Attention()(bi) 16 | 17 | output = Dense(self.class_len, activation='sigmoid')(att) 18 | model = Model(inputs=input, outputs=output) 19 | return model -------------------------------------------------------------------------------- /models/text_att_bi_lstm.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input,LSTM,Dense,Bidirectional 2 | from tensorflow.keras.models import Model 3 | from models.attention import Attention 4 | class TEXT_ATT_BI_LSTM: 5 | def __init__(self, embedding_layer, class_len, max_sequence_len): 6 | self.embedding_layer = embedding_layer 7 | self.class_len = class_len 8 | self.max_sequence_len = max_sequence_len 9 | 10 | def build(self): 11 | input = Input(shape=(self.max_sequence_len,)) 12 | embedding_layer = self.embedding_layer(input) 13 | 14 | bi = Bidirectional(LSTM(128, return_sequences=True))(embedding_layer) 15 | att = Attention()(bi) 16 | 17 | output = Dense(self.class_len, activation='sigmoid')(att) 18 | model = Model(inputs=input, outputs=output) 19 | return model -------------------------------------------------------------------------------- /models/text_bi_gru.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input,GRU,Dense,Bidirectional 2 | from tensorflow.keras.models import Model 3 | class TEXT_BI_GRU: 4 | def __init__(self, embedding_layer, class_len, max_sequence_len): 5 | self.embedding_layer = embedding_layer 6 | self.class_len = class_len 7 | self.max_sequence_len = max_sequence_len 8 | 9 | def build(self): 10 | input = Input(shape=(self.max_sequence_len,)) 11 | embedding_layer = self.embedding_layer(input) 12 | 13 | x = Bidirectional(GRU(128))(embedding_layer) 14 | 15 | output = Dense(self.class_len, activation='sigmoid')(x) 16 | model = Model(inputs=input, outputs=output) 17 | return model -------------------------------------------------------------------------------- /models/text_bi_lstm.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input,LSTM,Dense,Bidirectional 2 | from tensorflow.keras.models import Model 3 | class TEXT_BI_LSTM: 4 | def __init__(self, embedding_layer, class_len, max_sequence_len): 5 | self.embedding_layer = embedding_layer 6 | self.class_len = class_len 7 | self.max_sequence_len = max_sequence_len 8 | 9 | def build(self): 10 | input = Input(shape=(self.max_sequence_len,)) 11 | embedding_layer = self.embedding_layer(input) 12 | 13 | x = Bidirectional(LSTM(128))(embedding_layer) 14 | 15 | output = Dense(self.class_len, activation='sigmoid')(x) 16 | model = Model(inputs=input, outputs=output) 17 | return model -------------------------------------------------------------------------------- /models/text_gru.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.layers import Input, GRU, Dense,BatchNormalization,Dropout 2 | from tensorflow.keras.models import Model 3 | 4 | 5 | class TEXT_GRU: 6 | def __init__(self, embedding_layer, class_len, max_sequence_len): 7 | self.embedding_layer = embedding_layer 8 | self.class_len = class_len 9 | self.max_sequence_len = max_sequence_len 10 | 11 | def build(self): 12 | input = Input(shape=(self.max_sequence_len,)) 13 | embedding_layer = self.embedding_layer(input) 14 | gru = GRU( 15 | 256, 16 | kernel_initializer="glorot_uniform", 17 | 
            recurrent_initializer='normal',
18 |             activation='relu',
19 |         )(embedding_layer)
20 | 
21 |         batch_normalization = BatchNormalization()(gru)
22 |         dropout = Dropout(0.1)(batch_normalization)
23 |         output = Dense(self.class_len, activation='sigmoid')(dropout)
24 | 
25 |         model = Model(inputs=input, outputs=output)
26 |         return model
27 | 
28 |     def build_base(self):
29 |         input = Input(shape=(self.max_sequence_len,))
30 |         embedding_layer = self.embedding_layer(input)
31 | 
32 |         LSTM_Layer_1 = GRU(128)(embedding_layer)
33 | 
34 |         output = Dense(self.class_len, activation='sigmoid')(LSTM_Layer_1)
35 |         model = Model(inputs=input, outputs=output)
36 |         return model
37 | 
-------------------------------------------------------------------------------- /models/text_lstm.py: --------------------------------------------------------------------------------
1 | from tensorflow.keras.layers import Input, LSTM, Dense
2 | from tensorflow.keras.models import Model
3 | class TEXT_LSTM:
4 |     def __init__(self, embedding_layer, class_len, max_sequence_len):
5 |         self.embedding_layer = embedding_layer
6 |         self.class_len = class_len
7 |         self.max_sequence_len = max_sequence_len
8 | 
9 |     def build(self):
10 |         input = Input(shape=(self.max_sequence_len,))
11 |         embedding_layer = self.embedding_layer(input)
12 | 
13 |         LSTM_Layer_1 = LSTM(128)(embedding_layer)
14 | 
15 |         output = Dense(self.class_len, activation='sigmoid')(LSTM_Layer_1)
16 |         model = Model(inputs=input, outputs=output)
17 |         return model
-------------------------------------------------------------------------------- /predict.py: --------------------------------------------------------------------------------
1 | from config import *
2 | import tensorflow as tf
3 | from tqdm import tqdm
4 | import numpy as np
5 | import re
6 | import os
7 | from load_data import *
8 | 
9 | 
10 | def predict(model_type=''):
11 |     x, y = load_data(PREDICT_PATH)
12 |     model = tf.keras.models.load_model(MODEL_PATH)
13 |     predict_results = {}
14 |     for i in tqdm(range(len(x)), desc='PREDICTING...'):
15 |         seg = x[i]
16 |         result = model.predict(get_features([seg], MAX_SEQUENCE_LEN)).mean(0)
17 |         id_top = list(list(np.where(result > PREDICT_LEVEL))[0])
18 |         id_max = list(result).index(max(result))
19 |         if id_max not in id_top:
20 |             id_top.append(id_max)
21 |         for id in id_top:
22 |             predict_results[id] = predict_results[id] + \
23 |                 1 if id in predict_results else 1
24 |     print('分类结果({})'.format(model_type.upper()))
25 |     print('| 分类(Class) | 数量(Quantity) |')
26 |     print('| :----------------- | :--- |')
27 |     for i in range(len(CLASSES)):
28 |         value = predict_results[i] if i in predict_results else 0
29 |         print('| {} | {} |'.format(str(CLASSES[i]), str(value)))
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     predict(TEXT_GRU)
34 | 
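predict() above reloads the padded features and calls model.predict() once per abstract inside the loop, which is slow on large files. A batched variant in the same spirit — an editorial sketch, not part of the repository; predict_batched is a made-up name, while get_features, MAX_SEQUENCE_LEN, PREDICT_LEVEL and CLASSES are the repo's own helpers and constants — applies the same threshold-plus-argmax rule to one vectorised call:

import numpy as np

def predict_batched(model, abstracts):
    probs = model.predict(get_features(abstracts, MAX_SEQUENCE_LEN), batch_size=256)
    counts = {i: 0 for i in range(len(CLASSES))}
    for row in probs:
        ids = set(np.where(row > PREDICT_LEVEL)[0].tolist())
        ids.add(int(row.argmax()))  # always count at least the top-scoring class
        for class_id in ids:
            counts[class_id] += 1
    return counts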
-------------------------------------------------------------------------------- /readme.md: --------------------------------------------------------------------------------
1 | # Keras下的多标签分类(Keras Multi-Label Text Classification)
2 | 
3 | ## 简介(Introduction)
4 | 
5 | * 本项目利用KERAS结合不同的分类器并使用Glove词向量,对英文文本(中文需修改load_data.py增加分词功能并更换词向量)进行多标签分类。
6 | * 项目起源是因为以前做过美亚图书分类转换中图法分类,当年是通过爬取美亚的图书信息和分类,然后人工制定规则映射,时间久了积攒了些数据,就想着拿KERAS实现一下AI智能分类,毕竟还有很多图书信息不是来自美亚,没有对应的分类。
7 | * 共有9种分类器可供选择,大部分模型只做到了baseline,部分借鉴了[Magpie](https://github.com/inspirehep/magpie)和[Keras-TextClassification](https://github.com/yongzhuo/Keras-TextClassification)。
8 | * 数据质量及标签数量对分类结果有非常大的影响,数据太过稀疏及标签过多会导致结果惨不忍睹(3分类ACC可达92%,6分类ACC76%,24分类则只有41%)。
9 | 
10 | An English translation of the introduction above:
11 | * This project uses Keras with GloVe word vectors and a choice of classifiers to do multi-label classification of English text (for Chinese text, modify load_data.py to add word segmentation and swap in Chinese word vectors).
12 | * The project grew out of earlier work mapping Amazon book categories to CLC (Chinese Library Classification): book information and categories were crawled from Amazon.com and then mapped with hand-written rules. Over time enough data accumulated to try an automatic Keras classifier, since a lot of book information does not come from Amazon and therefore has no category assigned.
13 | * There are 9 classifiers to choose from; most models only reach a baseline and partly draw on [Magpie](https://github.com/inspirehep/magpie) and [Keras-TextClassification](https://github.com/yongzhuo/Keras-TextClassification).
14 | * Data quality and the number of labels have a very large impact on the results. Overly sparse data and too many labels lead to poor scores (about 92% ACC with 3 classes, 76% with 6 classes, and only 41% with 24 classes).
15 | 
16 | ## 依赖(Requirements)
17 | 
18 |     tqdm==4.49.0
19 |     numpy==1.18.5
20 |     pandas==1.1.2
21 |     tensorflow==2.3.0
22 |     matplotlib==3.3.2
23 |     pyenchant==3.1.1
24 |     scikit_learn==0.23.2
25 | 
26 | ## 使用方法(Guide)
27 | 
28 | 1.配置config.py中的参数(configure the parameters in config.py)
29 | 2.修改train.py的模型类型后运行(set the model type in train.py, then run it)
30 | 3.修改predict.py的模型类型后运行(set the model type in predict.py, then run it)
31 | 
32 | 也可以直接运行 run.py(you can also just run run.py directly)
33 | 
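As a concrete example of steps 2 and 3 (a hypothetical edit, not prescribed by the project — any model constant from config.py such as TEXT_BI_LSTM works), the `__main__` blocks are the only lines that need to change:

    # train.py
    if __name__ == '__main__':
        train(TEXT_BI_LSTM)

    # predict.py
    if __name__ == '__main__':
        predict(TEXT_BI_LSTM)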
34 | ## 数据集(Dataset)
35 | 
36 | 数据集来自[Kaggle](https://www.kaggle.com/vetrirah/janatahack-independence-day-2020-ml-hackathon)
37 | Dataset from [Kaggle](https://www.kaggle.com/vetrirah/janatahack-independence-day-2020-ml-hackathon)
38 | 
39 | 清洗后的数据概况(Dataset details after cleaning)
40 | MAX SEQ LEN:778
41 | ALL WORDS:20430
42 | DATA LEN:20971
43 | | 分类(Class) | 数量(Quantity) |
44 | | :------------------- | :------------- |
45 | | Computer Science | 8594 |
46 | | Physics | 6013 |
47 | | Mathematics | 5618 |
48 | | Statistics | 5206 |
49 | | Quantitative Biology | 586 |
50 | | Quantitative Finance | 249 |
51 | 
52 | ## 分类器及成绩(Results)
53 | 
54 | | 分类器(Classifier) | val_categorical_accuracy | epochs |
55 | | :----------------- | :----------------------- | :----- |
56 | | CNN | 0.7607 | 18 |
57 | | FAST | 0.7417 | 13 |
58 | | **CHAR_CNN** | **0.7655** | **10** |
59 | | TEXT_ATT_BI_GRU | 0.7493 | 5 |
60 | | TEXT_ATT_BI_LSTM | 0.7583 | 15 |
61 | | TEXT_BI_GRU | 0.7564 | 5 |
62 | | **TEXT_BI_LSTM** | **0.7583** | **14** |
63 | | TEXT_GRU | 0.7626 | 19 |
64 | | TEXT_LSTM | 0.7498 | 4 |
65 | 
66 | 虽然CHAR_CNN的正确率看起来高那么一点点,但实际上TEXT_BI_LSTM的结果更接近真实数据。
67 | Although CHAR_CNN's accuracy looks slightly higher, TEXT_BI_LSTM's predictions are closer to the real label distribution.
68 | 
69 | ### 分类结果(TEXT_BI_LSTM)
70 | 
71 | | 分类(Class) | 数量(Quantity) |
72 | | :------------------- | :------------- |
73 | | Computer Science | 8526 |
74 | | Physics | 5603 |
75 | | Mathematics | 5464 |
76 | | Statistics | 5640 |
77 | | Quantitative Biology | 305 |
78 | | Quantitative Finance | 229 |
79 | 
80 | ## 参考(Reference)
81 | 
82 | [Magpie](https://github.com/inspirehep/magpie)
83 | 
84 | [Keras-TextClassification](https://github.com/yongzhuo/Keras-TextClassification)
85 | 
86 | ## TODO
87 | 
88 | 1.针对多标签的稀疏数据做SMOTE平滑处理。(SMOTE-style oversampling for sparse multi-label data)
89 | 2.找到更大,更多标签的数据集。(Find larger datasets with more labels.)
90 | 3.模型优化(Model optimization)
91 | 
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
1 | tqdm==4.49.0
2 | numpy==1.18.5
3 | pandas==1.1.2
4 | tensorflow==2.3.0
5 | matplotlib==3.3.2
6 | pyenchant==3.1.1
7 | scikit_learn==0.23.2
8 | 
-------------------------------------------------------------------------------- /run.py: --------------------------------------------------------------------------------
1 | from train import *
2 | from predict import *
3 | from config import *
4 | 
5 | 
6 | def train_and_predict():
7 |     result = []
8 |     models = [CNN, FAST, CHAR_CNN, TEXT_ATT_BI_GRU, TEXT_ATT_BI_LSTM, TEXT_BI_GRU, TEXT_BI_LSTM, TEXT_GRU, TEXT_LSTM]
9 |     # models=[CNN,FAST]
10 |     for model in models:
11 |         model_type, best_score, best_epoch = train(model)
12 |         result.append([model_type, best_score, best_epoch])
13 |         predict(model)
14 |     print('| 分类器(Classifier) | val_categorical_accuracy | epochs |')
15 |     print('| :----------------- | :----------------------- | :----- |')
16 |     for r in result:
17 |         print('| {} | {} | {} |'.format(r[0], r[1], r[2]))
18 | 
19 | 
20 | if __name__ == '__main__':
21 |     train_and_predict()
22 | 
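run.py above builds and trains all nine models in one process. One small optional addition (an editorial sketch, not in the original run.py) is to clear the Keras session at the end of each iteration so the previous model's graph and weights are released before the next model is built:

import tensorflow as tf

for model in models:
    model_type, best_score, best_epoch = train(model)
    result.append([model_type, best_score, best_epoch])
    predict(model)
    tf.keras.backend.clear_session()  # free the previous model's graph/weights before the next run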
-------------------------------------------------------------------------------- /train.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import datetime
3 | import pickle
4 | 
5 | import numpy as np
6 | import pandas as pd
7 | import tensorflow as tf
8 | from sklearn.model_selection import train_test_split
9 | from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint,
10 |                                         ReduceLROnPlateau)
11 | from tensorflow.keras.preprocessing.sequence import pad_sequences
12 | from tensorflow.keras.preprocessing.text import Tokenizer
13 | from tqdm import tqdm
14 | import numpy as np
15 | from model import *
16 | from config import *
17 | from load_data import load_data
18 | 
19 | 
20 | def build_matrix(embeddings_index, word_index):
21 |     embedding_matrix = np.zeros((MAX_WORDS_LEN, EMBED_SIZE))
22 |     for word, i in tqdm(word_index.items(), desc='BUILD EMBEDDING'):
23 |         if i >= MAX_WORDS_LEN:
24 |             continue
25 |         try:
26 |             # vector for this word
27 |             embedding_vector = embeddings_index[word]
28 |         except:
29 |             # fall back to the GloVe vector of the token "unknown" when the word is missing
30 |             embedding_vector = embeddings_index["unknown"]
31 |         if embedding_vector is not None:
32 |             # keep row i of embedding_matrix aligned with index i in word_index
33 |             embedding_matrix[i] = embedding_vector
34 |     return embedding_matrix
35 | 
36 | 
37 | def get_coefs(word, *arr):
38 |     return word, np.asarray(arr, dtype='float32')
39 | 
40 | 
41 | def train(model_type=''):
42 |     abstract, labels = load_data(TRAIN_PATH)
43 |     # word vectors: fit the tokenizer and build padded index sequences
44 |     tokenizer = Tokenizer(num_words=MAX_WORDS_LEN, lower=True)
45 |     tokenizer.fit_on_texts(abstract)
46 |     sequences = tokenizer.texts_to_sequences(abstract)
47 |     data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LEN)
48 | 
49 |     with open(TOK_PATH, 'wb') as handle:
50 |         pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
51 | 
52 |     # GloVe embeddings
53 |     embeddings_index = dict(get_coefs(*o.split(" "))
54 |                             for o in open(EMBEDDING_PATH))
55 |     glove_embedding_matrix = build_matrix(
56 |         embeddings_index, tokenizer.word_index)
57 | 
58 |     x_train, x_validation, y_train, y_validation = train_test_split(
59 |         data.tolist(), labels.values.tolist(), test_size=0.1, random_state=123)
60 | 
61 |     # build the model
62 |     model = get_model(model_type, glove_embedding_matrix)
63 | 
64 |     log_dir = LOG_PATH + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
65 |     tensorboard_callback = tf.keras.callbacks.TensorBoard(
66 |         log_dir=log_dir, histogram_freq=1)
67 | 
68 |     # ReduceLROnPlateau: lower the learning rate when the monitored metric stops improving; EarlyStopping: stop after `patience` epochs without improvement; ModelCheckpoint: keep only the best model
69 |     callbacks = [
70 |         ReduceLROnPlateau(monitor='categorical_accuracy'),
71 |         EarlyStopping(patience=30, monitor='val_categorical_accuracy'),
72 |         ModelCheckpoint(filepath=MODEL_PATH, save_best_only=True),
73 |         tensorboard_callback
74 |     ]
75 | 
76 |     history = model.fit(x_train, y_train,
77 |                         epochs=100,
78 |                         batch_size=1024,
79 |                         verbose=1,
80 |                         validation_data=(x_validation, y_validation),
81 |                         callbacks=callbacks)
82 |     val_cat_acc = history.history['val_categorical_accuracy']
83 |     best_score = max(val_cat_acc)
84 |     best_epoch = val_cat_acc.index(best_score)
85 |     return str(model_type).upper(), '%.4f' % best_score, best_epoch
86 | 
87 | 
88 | if __name__ == '__main__':
89 |     train(TEXT_GRU)
90 | 
--------------------------------------------------------------------------------
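A closing editorial note (not part of the repository): val_categorical_accuracy in the results table says little about rare labels such as Quantitative Finance. Since scikit-learn is already in requirements.txt, a per-label report can be printed on the held-out split produced inside train(); report_per_label is a made-up name, while PREDICT_LEVEL and CLASSES come from config.py:

import numpy as np
from sklearn.metrics import classification_report

def report_per_label(model, x_validation, y_validation):
    probs = model.predict(np.array(x_validation))
    preds = (probs > PREDICT_LEVEL).astype(int)  # same threshold used in predict.py
    print(classification_report(np.array(y_validation), preds,
                                target_names=CLASSES, zero_division=0))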