├── README.md ├── app.py ├── backward.py ├── data ├── README.md └── vocab_train.p ├── demo.txt ├── forward.py ├── generated.py ├── images ├── 0_bs_image.jpg ├── 0_image.jpg ├── 1_bs_image.jpg ├── 1_image.jpg ├── 2_image.jpg ├── 3_image.jpg ├── 4_image.jpg ├── 5_image.jpg └── 6_image.jpg ├── models └── README.md └── src ├── analyze_data.py ├── app.py ├── config.py ├── data_generator.py ├── hp_search.py ├── template.py └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # - 2 | 《人工智能实践》课程第五组 3 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | sys.path.insert(0, 'src') 4 | # import the necessary packages 5 | import os 6 | os.environ['TF_CPP_MIN_LOG_LEVEL']='3' 7 | import pickle 8 | import random 9 | 10 | import cv2 as cv 11 | import keras.backend as K 12 | import numpy as np 13 | from keras.preprocessing import sequence 14 | #从config文件中引入一些参数 包括token最大长度 测试文件夹长度 最优的模型参数 15 | from config import max_token_length, test_a_image_folder, best_model 16 | from forward import build_model 17 | from generated import test_gen 18 | 19 | #使用训练好的模型对图片进行测试 20 | def beam_search_predictions(model, image_name, word2idx, idx2word, encoding_test, beam_index=3): 21 | start = [word2idx[""]] 22 | start_word = [[start, 0.0]] 23 | 24 | while len(start_word[0][0]) < max_token_length: 25 | temp = [] 26 | for s in start_word: 27 | #对序列进行填充的预处理,在其后添0,使其序列统一大小为max_token_length 28 | par_caps = sequence.pad_sequences([s[0]], maxlen=max_token_length, padding='post') 29 | #每次取一个图片进行测试 30 | e = encoding_test[image_name] 31 | #使用模型对该图片进行测试 32 | preds = model.predict([np.array([e]), np.array(par_caps)]) 33 | #从预测的结果中取前beam_index个 34 | word_preds = np.argsort(preds[0])[-beam_index:] 35 | 36 | # Getting the top (n) predictions and creating a 37 | # new list so as to put them via the model again 38 | # 创建一个新的list结构 将预测出的词和词的概率以组对的形式存储 39 | for w in word_preds: 40 | next_cap, prob = s[0][:], s[1] 41 | next_cap.append(w) 42 | prob += preds[0][w] 43 | temp.append([next_cap, prob]) 44 | #将处理好的预测值赋值回start word 45 | start_word = temp 46 | # Sorting according to the probabilities 47 | # 根据概率排序 48 | start_word = sorted(start_word, reverse=False, key=lambda l: l[1]) 49 | # Getting the top words 50 | #获得最有可能正确的词 51 | start_word = start_word[-beam_index:] 52 | 53 | start_word = start_word[-1][0] 54 | #根据id取出单词 55 | intermediate_caption = [idx2word[i] for i in start_word] 56 | 57 | final_caption = [] 58 | #组合成句 59 | for i in intermediate_caption: 60 | if i != '': 61 | final_caption.append(i) 62 | else: 63 | break 64 | 65 | final_caption = ''.join(final_caption[1:]) 66 | return final_caption 67 | 68 | 69 | if __name__ == '__main__': 70 | #图片的channel为3 71 | channel = 3 72 | #设置模型权重的地址 73 | model_weights_path = os.path.join('models', best_model) 74 | print('模型加载中...') 75 | #创建模型 76 | model = build_model() 77 | #加载模型权重 78 | model.load_weights(model_weights_path) 79 | print('模型加载完毕...') 80 | 81 | #print(model.summary()) 82 | #加载语料库 83 | vocab = pickle.load(open('data/vocab_train.p', 'rb')) 84 | #将word转化为数字 方便输入网络 进行预测 85 | idx2word = sorted(vocab) 86 | word2idx = dict(zip(idx2word, range(len(vocab)))) 87 | print('语料库加载完毕...') 88 | 89 | test_gen() 90 | 91 | #加载测试图片 92 | encoding_test = pickle.load(open('data/encoded_test_a_images.p', 'rb')) 93 | #随机取测试图片 94 | names = [f for f in encoding_test.keys()] 95 | samples = names 96 | sentences = [] 97 | 98 | for i in range(len(samples)): 99 | image_name = samples[i] 100 | 101 | image_input = np.zeros((1, 2048)) 102 | image_input[0] = encoding_test[image_name] 103 | #获取图片的名称 104 | filename = os.path.join(test_a_image_folder, image_name) 105 | # print('Start processing image: {}'.format(filename)) 106 | #设置不同的预测参数,并放到beam_search_predictions中进行预测 107 | print('描述的图片为:',image_name) 108 | 109 | candidate1=beam_search_predictions(model, image_name, word2idx, idx2word, encoding_test, beam_index=1) 110 | print('Beam Search, k=1:',candidate1) 111 | sentences.append(candidate1) 112 | 113 | candidate2=beam_search_predictions(model, image_name, word2idx, idx2word, encoding_test, beam_index=2) 114 | print('Beam Search, k=2:',candidate2) 115 | sentences.append(candidate2) 116 | 117 | candidate3=beam_search_predictions(model, image_name, word2idx, idx2word, encoding_test, beam_index=3) 118 | print('Beam Search, k=3:',candidate3) 119 | sentences.append(candidate3) 120 | 121 | candidate4=beam_search_predictions(model, image_name, word2idx, idx2word, encoding_test, beam_index=5) 122 | print('Beam Search, k=5:',candidate4) 123 | sentences.append(candidate4) 124 | 125 | candidate5=beam_search_predictions(model, image_name, word2idx, idx2word, encoding_test, beam_index=7) 126 | print('Beam Search, k=7:',candidate5) 127 | sentences.append(candidate5) 128 | 129 | candidate6=beam_search_predictions(model, image_name, word2idx, idx2word, encoding_test, beam_index=9) 130 | print('Beam Search, k=9:',candidate6) 131 | sentences.append(candidate6) 132 | 133 | #读取图片 134 | img = cv.imread(filename) 135 | #resise到固定大小 256*256 136 | #img = cv.resize(img, (256, 256), cv.INTER_CUBIC) 137 | if not os.path.exists('images'): 138 | os.makedirs('images') 139 | #将修改后的图片重新写回 140 | cv.imwrite('images/{}_bs_image.jpg'.format(i), img) 141 | 142 | #将预测产生的描述信息输出到demo.txt文件中 143 | with open('demo.txt', 'w') as file: 144 | file.write('\n'.join(sentences)) 145 | 146 | K.clear_session() -------------------------------------------------------------------------------- /backward.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | sys.path.insert(0, 'src') 4 | import argparse 5 | 6 | import keras 7 | import tensorflow as tf 8 | from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau 9 | from keras.utils import multi_gpu_model 10 | 11 | from config import patience, epochs, num_train_samples, num_valid_samples, batch_size 12 | from data_generator import train_gen, valid_gen 13 | from forward import build_model 14 | from utils import get_available_gpus, get_available_cpus 15 | 16 | #主函数 17 | if __name__ == '__main__': 18 | # Parse arguments 19 | #创建一个ArgumentParser实例 20 | ap = argparse.ArgumentParser() 21 | #添加参数-p 22 | ap.add_argument("-p", "--pretrained", help="path to save pretrained model files") 23 | # 把parser中设置的所有"add_argument"给返回到ap子类实例当中 24 | #vars() 函数返回对象object的属性和属性值的字典对象 25 | args = vars(ap.parse_args()) 26 | #获取当前路径 27 | pretrained_path = args["pretrained"] 28 | #模型checkpoint路径 29 | checkpoint_models_path = 'models/' 30 | 31 | # Callbacks 32 | #回调函数 33 | #使用TensorBoard可视化训练曲线 34 | tensor_board = keras.callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=True) 35 | #模型的命名规则 36 | model_names = checkpoint_models_path + 'model.{epoch:02d}-{val_loss:.4f}.hdf5' 37 | #使用ModelCheckpoint保存训练模型,monitor需要监视的值 38 | #verbose进度条信息展示模式,save_best_only:当设置为True时,将只保存在验证集上性能最好的模型 39 | model_checkpoint = ModelCheckpoint(model_names, monitor='val_loss', verbose=1, save_best_only=True) 40 | #使用early_stop防止过拟合,monitor: 被监测的数据。patience:没有进步的训练轮数,在这之后训练就会被停止。 41 | early_stop = EarlyStopping('val_loss', patience=patience) 42 | #使用函数调整学习率,当评价指标不在提升时,减少学习率 43 | #factor:每次减少学习率的因子,lr = lr*factor 44 | #当patience个epoch过去而模型性能不提升时,学习率减少的动作会被触发 45 | reduce_lr = ReduceLROnPlateau('val_loss', factor=0.1, patience=int(patience / 5), verbose=1) 46 | 47 | #子类MyCbk,创建一个自定义的回调函数 48 | #用来组建新的回调函数的抽象基类 49 | class MyCbk(keras.callbacks.Callback): 50 | def __init__(self, model): 51 | keras.callbacks.Callback.__init__(self) 52 | #定义要保存的模型 53 | self.model_to_save = model 54 | 55 | #定义保存模型函数,传入epoch和logs,并命名模型 56 | #在每轮结束时被调用。 57 | def on_epoch_end(self, epoch, logs=None): 58 | fmt = checkpoint_models_path + 'model.%02d-%.4f.hdf5' 59 | self.model_to_save.save(fmt % (epoch, logs['val_loss'])) 60 | 61 | 62 | # Load our model, added support for Multi-GPUs 63 | #调用build_model()函数创建模型 64 | #检测是否可以用GPU训练 65 | num_gpu = len(get_available_gpus()) 66 | if num_gpu >= 2: 67 | #使用CPU创建模型 68 | with tf.device("/cpu:0"): 69 | model = build_model() 70 | #如果存在预训练模型 71 | if pretrained_path is not None: 72 | #从HDF5文件中加载权重到当前模型中,by_name=True,只有名字匹配的层才会载入权重 73 | model.load_weights(pretrained_path, by_name=True) 74 | #使用多GPU并行训练 75 | new_model = multi_gpu_model(model, gpus=num_gpu) 76 | # rewrite the callback: saving through the original model and not the multi-gpu model. 77 | #保存模型 78 | model_checkpoint = MyCbk(model) 79 | else: 80 | #创建模型 81 | new_model = build_model() 82 | #如果存在预训练模型 83 | if pretrained_path is not None: 84 | #从HDF5文件中加载权重到当前模型中 85 | new_model.load_weights(pretrained_path) 86 | 87 | #使用Adam优化器,学习率为0.00005 88 | adam = keras.optimizers.Adam(lr=5e-5) 89 | #自定义损失函数,使用adam优化器,损失函数为categorical_crossentropy多分类对数损失 90 | #metrics评价函数,在训练和测试期间的模型评估标准。 91 | new_model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy']) 92 | 93 | #summary():打印出模型概况 94 | print(new_model.summary()) 95 | 96 | # Final callbacks 97 | #回调函数是一个函数的合集,会在训练的阶段中所使用。 98 | #你可以使用回调函数来查看训练模型的内在状态和统计。 99 | #ModelCheckpoint在每个训练期之后保存模型。 100 | #EarlyStopping当被监测的数量不再提升,则停止训练。 101 | #ReduceLROnPlateau当标准评估已经停止时,降低学习速率。 102 | callbacks = [tensor_board, model_checkpoint, early_stop, reduce_lr] 103 | 104 | # Start Fine-tuning 105 | #使用fit_generator进行训练 106 | #batch_size: 整数或 None。每次梯度更新的样本数。如果未指定,默认为 32 107 | #epochs: 整数。训练模型迭代轮次。一个轮次是在整个 x 和 y 上的一轮迭代。 108 | #verbose:1 = 进度条 109 | #callbacks:一系列可以在训练时使用的回调函数。 110 | #validation_data:用来评估损失,以及在每轮结束时的任何模型度量指标。 模型将不会在这个数据上进行训练。 111 | #steps_per_epoch: 整数或 None。 在声明一个轮次完成并开始下一个轮次之前的总步数(样品批次)。 112 | #validation_steps: 只有在指定了 steps_per_epoch时才有用。停止前要验证的总步数(批次样本)。 113 | new_model.fit_generator(train_gen(), 114 | #steps_per_epoch=num_train_samples // batch_size, 115 | steps_per_epoch=250, 116 | validation_data=valid_gen(), 117 | #validation_steps=num_valid_samples // batch_size, 118 | validation_steps=250, 119 | #epochs=epochs, 120 | epochs=10, 121 | verbose=1, 122 | callbacks=callbacks, 123 | #use_multiprocessing=True 124 | #workers=get_available_cpus() // 2 125 | ) 126 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | 数据集地址:https://challenger.ai/dataset/caption -------------------------------------------------------------------------------- /data/vocab_train.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/data/vocab_train.p -------------------------------------------------------------------------------- /demo.txt: -------------------------------------------------------------------------------- 1 | 一群穿着短袖的人在广场上打篮球 2 | 广场上有一群穿着短袖的人在打篮球 3 | 道路上有一群穿着各异的人在打篮球 4 | 宽敞的广场上有一群人在打篮球 5 | 宽敞的广场上有一群人在打篮球 6 | 宽敞的广场上有一群人在打篮球 7 | 一个穿着裙子的女人站在海边的沙滩上 8 | 海边的沙滩上站着一个穿着裙子的女人 9 | 海边的沙滩上站着一个穿着裙子的女人 10 | 海边的沙滩上站着一个穿着裙子的女人 11 | 海边的沙滩上站着一个穿着裙子的女人 12 | 海边的沙滩上站着一个披着长发的女人 -------------------------------------------------------------------------------- /forward.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | sys.path.insert(0, 'src') 4 | #引用keras抽象后端引擎 5 | import keras.backend as K 6 | #引入tensorflow 7 | import tensorflow as tf 8 | #从keras中引入输入层、全连接层、LSTM层、Concatenate层、嵌入层、输入重复机制、TimeDistributed层、Dropout层 9 | from keras.layers import Input, Dense, LSTM, Concatenate, Embedding, RepeatVector, TimeDistributed, Dropout 10 | #引入模型层 11 | from keras.models import Model 12 | #引入模型结构绘制机制 13 | from keras.utils import plot_model 14 | 15 | from config import max_token_length 16 | from config import vocab_size, embedding_size 17 | 18 | 19 | def build_model(): 20 | # 输入文本信息 21 | text_input = Input(shape=(max_token_length,), dtype='int32') 22 | # 将文本信息转化为词向量 23 | x = Embedding(input_dim=vocab_size, output_dim=embedding_size)(text_input) 24 | # 将转换后的词向量输入LSTM网络 25 | x = LSTM(256, return_sequences=True)(x) 26 | # 使用包装器TimeDistributed包装Dense,以产生针对各个时间步信号的独立全连接 27 | text_embedding = TimeDistributed(Dense(embedding_size))(x) 28 | 29 | # 输入image embedding 30 | image_input = Input(shape=(2048,)) 31 | x = Dense(embedding_size, activation='relu', name='image_embedding')(image_input) 32 | # the image I is only input once 每一张图片被输入一次 33 | image_embedding = RepeatVector(1)(x) 34 | 35 | # language model 36 | # 将image embedding和 text embedding合并到一起 37 | x = [image_embedding, text_embedding] 38 | #将image embedding与text embedding按第一维度拼接起来 39 | x = Concatenate(axis=1)(x) 40 | #以0.1的概率丢弃信息 41 | x = Dropout(0.1)(x) 42 | #输入LSTM层 43 | x = LSTM(1024, return_sequences=True, name='language_lstm_1')(x) 44 | #以0.2的概率丢弃信息 45 | x = Dropout(0.2)(x) 46 | #输入LSTM层 47 | x = LSTM(1024, name='language_lstm_2')(x) 48 | #以0.4的概率丢弃信息 49 | x = Dropout(0.4)(x) 50 | #经过全连接层后输出 51 | output = Dense(vocab_size, activation='softmax', name='output')(x) 52 | #定义输入 53 | inputs = [image_input, text_input] 54 | #传入模型 55 | model = Model(inputs=inputs, outputs=output) 56 | return model 57 | 58 | 59 | if __name__ == '__main__': 60 | #使用cpu运行模型 61 | with tf.device("/cpu:0"): 62 | model = build_model() 63 | #打印模型信息 64 | print(model.summary()) 65 | #打印模型结构 66 | plot_model(model, to_file='model.svg', show_layer_names=True, show_shapes=True) 67 | 68 | K.clear_session() -------------------------------------------------------------------------------- /generated.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | sys.path.insert(0, 'src') 4 | import json 5 | import pickle 6 | import zipfile 7 | import os 8 | os.environ['TF_CPP_MIN_LOG_LEVEL']='3' 9 | 10 | import jieba 11 | import keras 12 | import numpy as np 13 | from keras.applications.resnet50 import ResNet50 14 | from keras.preprocessing.image import (load_img, img_to_array) 15 | from tqdm import tqdm 16 | 17 | from config import img_rows, img_cols 18 | from config import start_word, stop_word, unknown_word 19 | from config import train_annotations_filename 20 | from config import train_folder, valid_folder, test_a_folder, test_b_folder 21 | from config import train_image_folder, valid_image_folder, test_a_image_folder, test_b_image_folder 22 | from config import valid_annotations_filename 23 | 24 | #调用Keras中的ResNet50模型,加载在ImageNet ILSVRC比赛中已经训练好的权重 25 | #include_top表示是否包含模型顶部的全连接层,如果不包含,则可以利用这些参数来做一些定制的事情 26 | image_model = ResNet50(include_top=False, weights='imagenet', pooling='avg') 27 | 28 | #确定是否存在文件夹 29 | def ensure_folder(folder): 30 | #如果不存在文件夹,创建文件夹 31 | if not os.path.exists(folder): 32 | os.makedirs(folder) 33 | 34 | #解压文件 35 | def extract(folder): 36 | #folder.zip 37 | filename = '{}.zip'.format(folder) 38 | #输出解压名称并执行解压操作 39 | print('Extracting {}...'.format(filename)) 40 | with zipfile.ZipFile(filename, 'r') as zip_ref: 41 | zip_ref.extractall('data') 42 | 43 | #将图像文件编码 44 | def encode_images(usage): 45 | encoding = {} 46 | #编码训练集 47 | if usage == 'train': 48 | image_folder = train_image_folder 49 | #编码验证集 50 | elif usage == 'valid': 51 | image_folder = valid_image_folder 52 | #编码测试集a 53 | elif usage == 'test_a': 54 | image_folder = test_a_image_folder 55 | #编码测试集b 56 | else: # usage == 'test_b': 57 | image_folder = test_b_image_folder 58 | #batch_size为256 59 | batch_size = 256 60 | #names储存文件夹中所有的jpg文件名称 61 | names = [f for f in os.listdir(image_folder) if f.endswith('.jpg')] 62 | #计算一共多少批次,ceil为向上取整 63 | num_batches = int(np.ceil(len(names) / float(batch_size))) 64 | 65 | #输出编码过程 66 | print('ResNet50提取特征中...') 67 | #对每个batche进行处理,使用tqdm库显示处理进度 68 | for idx in range(num_batches): 69 | #该批次开始的位置 70 | i = idx * batch_size 71 | #该批次的长度,会出现最后一个批次不够batchsize的情况 72 | length = min(batch_size, (len(names) - i)) 73 | #使用empty创建一个多维数组 74 | image_input = np.empty((length, img_rows, img_cols, 3)) 75 | #对于每一张图片 76 | for i_batch in range(length): 77 | #提取图片名称 78 | image_name = names[i + i_batch] 79 | #提取路径名称 80 | filename = os.path.join(image_folder, image_name) 81 | #keras读取图片,并且将图片调整为224*224 82 | img = load_img(filename, target_size=(img_rows, img_cols)) 83 | #将图片转为矩阵 84 | img_array = img_to_array(img) 85 | #使用keras内置的preprocess_input进行图片预处理,默认使用caffe模式去均值中心化 86 | img_array = keras.applications.resnet50.preprocess_input(img_array) 87 | #将处理后的图片保存到image_input中 88 | image_input[i_batch] = img_array 89 | 90 | #使用ResNet50网络进行预测,预测结果保存到preds中 91 | preds = image_model.predict(image_input) 92 | 93 | #对于每一张图片 94 | for i_batch in range(length): 95 | #提取图片名称 96 | image_name = names[i + i_batch] 97 | #把预测结果保存到encoding中 98 | encoding[image_name] = preds[i_batch] 99 | 100 | #用相应的类别命名 101 | filename = 'data/encoded_{}_images.p'.format(usage) 102 | #使用python的pickle模块把数据进行序列化,把encoing保存到filename中 103 | with open(filename, 'wb') as encoded_pickle: 104 | pickle.dump(encoding, encoded_pickle) 105 | print('ResNet50提取特征完毕...') 106 | 107 | #处理数据集的标注部分,生成训练集的词库 108 | def build_train_vocab(): 109 | #提取训练集标注文件的路径 110 | #data/ai_challenger_caption_train_20170902/caption_train_annotations_20170902.json 111 | annotations_path = os.path.join(train_folder, train_annotations_filename) 112 | 113 | #读取json格式的标注文件 114 | with open(annotations_path, 'r') as f: 115 | annotations = json.load(f) 116 | 117 | #输出处理进度 118 | print('building {} train vocab') 119 | #创建一个无序不重复元素集 120 | vocab = set() 121 | #使用tqdm输出进度 122 | for a in tqdm(annotations): 123 | #提取annotations每一行的caption注释 124 | caption = a['caption'] 125 | #对于每一个caption 126 | for c in caption: 127 | #使用jieba进行分词 128 | seg_list = jieba.cut(c) 129 | #把每个词加入到vocab中 130 | for word in seg_list: 131 | vocab.add(word) 132 | #在vocab中加入 133 | vocab.add(start_word) 134 | vocab.add(stop_word) 135 | vocab.add(unknown_word) 136 | 137 | #将vocab写入vocab_train.p 138 | filename = 'data/vocab_train.p' 139 | with open(filename, 'wb') as encoded_pickle: 140 | pickle.dump(vocab, encoded_pickle) 141 | 142 | #创建samples 143 | def build_samples(usage): 144 | #如果进行训练 145 | if usage == 'train': 146 | #路径为train_folder 147 | annotations_path = os.path.join(train_folder, train_annotations_filename) 148 | else: 149 | #否则路径为valid_folder 150 | annotations_path = os.path.join(valid_folder, valid_annotations_filename) 151 | with open(annotations_path, 'r') as f: 152 | #同时加载json文件 153 | annotations = json.load(f) 154 | 155 | #将vocab文件反序列化提取词汇 156 | vocab = pickle.load(open('data/vocab_train.p', 'rb')) 157 | #index to word 对vocab进行排序 158 | idx2word = sorted(vocab) 159 | #word to index zip函数将idx2word与序号索引打包为元祖,用dict函数将映射关系构造为字典,词:索引 160 | word2idx = dict(zip(idx2word, range(len(vocab)))) 161 | 162 | #输出进度信息 163 | print('building {} samples'.format(usage)) 164 | #列表samples 165 | samples = [] 166 | #对于每一项annotation 167 | for a in tqdm(annotations): 168 | #提取image_id 169 | image_id = a['image_id'] 170 | #提取caption 171 | caption = a['caption'] 172 | #对于每一项caption 173 | for c in caption: 174 | #使用jieba进行分词 175 | seg_list = jieba.cut(c) 176 | #列表inpit 177 | input = [] 178 | #last_word标签设置为start 179 | last_word = start_word 180 | #使用enumerate函数列出下标和数据 181 | for j, word in enumerate(seg_list): 182 | #如果词库中没有word 183 | if word not in vocab: 184 | #word修改为UNK 185 | word = unknown_word 186 | #input添加序号 187 | input.append(word2idx[last_word]) 188 | #samples添加id,input,output 189 | samples.append({'image_id': image_id, 'input': list(input), 'output': word2idx[word]}) 190 | #last_word设置成word 191 | last_word = word 192 | #input添加last_word 193 | input.append(word2idx[last_word]) 194 | #samples添加id,input,stop_word 195 | samples.append({'image_id': image_id, 'input': list(input), 'output': word2idx[stop_word]}) 196 | 197 | #打包samples信息 198 | filename = 'data/samples_{}.p'.format(usage) 199 | with open(filename, 'wb') as f: 200 | pickle.dump(samples, f) 201 | 202 | 203 | #主函数 204 | if __name__ == '__main__': 205 | # parameters 206 | # 确定是否存在data 207 | ensure_folder('data') 208 | 209 | #解压文件 210 | # if not os.path.isdir(train_image_folder): 211 | #extract(train_folder) 212 | 213 | #解压文件 214 | # if not os.path.isdir(valid_image_folder): 215 | #extract(valid_folder) 216 | 217 | #解压文件 218 | # if not os.path.isdir(test_a_image_folder): 219 | #extract(test_a_folder) 220 | 221 | #解压文件 222 | # if not os.path.isdir(test_b_image_folder): 223 | #extract(test_b_folder) 224 | 225 | #编码train 226 | if not os.path.isfile('data/encoded_train_images.p'): 227 | encode_images('train') 228 | 229 | #编码valid 230 | if not os.path.isfile('data/encoded_valid_images.p'): 231 | encode_images('valid') 232 | 233 | #编码test_a 234 | if not os.path.isfile('data/encoded_test_a_images.p'): 235 | encode_images('test_a') 236 | 237 | #编码test_b 238 | if not os.path.isfile('data/encoded_test_b_images.p'): 239 | encode_images('test_b') 240 | 241 | #生成词库 242 | if not os.path.isfile('data/vocab_train.p'): 243 | build_train_vocab() 244 | 245 | #生成train的图片与标注数据 246 | if not os.path.isfile('data/samples_train.p'): 247 | build_samples('train') 248 | 249 | #生成valid的图片与标注数据 250 | if not os.path.isfile('data/samples_valid.p'): 251 | build_samples('valid') 252 | 253 | def test_gen(): 254 | encode_images('test_a') 255 | -------------------------------------------------------------------------------- /images/0_bs_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/0_bs_image.jpg -------------------------------------------------------------------------------- /images/0_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/0_image.jpg -------------------------------------------------------------------------------- /images/1_bs_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/1_bs_image.jpg -------------------------------------------------------------------------------- /images/1_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/1_image.jpg -------------------------------------------------------------------------------- /images/2_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/2_image.jpg -------------------------------------------------------------------------------- /images/3_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/3_image.jpg -------------------------------------------------------------------------------- /images/4_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/4_image.jpg -------------------------------------------------------------------------------- /images/5_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/5_image.jpg -------------------------------------------------------------------------------- /images/6_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Zhao-Dongyu/Image-Chinese-description/b46e61f559ea3903b6ac79c1ff01f1fcdd035166/images/6_image.jpg -------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 1 | 断点续训 2 | python train.py -p models/model.10-5.6588.hdf5 -------------------------------------------------------------------------------- /src/analyze_data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import os 4 | 5 | import jieba#引用中文分词组件 6 | from tqdm import tqdm#终端进度条工具 7 | 8 | from config import train_folder, train_annotations_filename 9 | 10 | if __name__ == '__main__': 11 | print('Calculating the maximum length among all the captions')#计算所有描述中的最大长度 12 | annotations_path = os.path.join(train_folder, train_annotations_filename)#训练集标注文件路径 13 | 14 | with open(annotations_path, 'r') as f:#只读打开文件 15 | samples = json.load(f)#读取json信息 16 | 17 | max_len = 0 18 | for sample in tqdm(samples): 19 | caption = sample['caption']#读取caption键对应值 20 | for c in caption: 21 | seg_list = jieba.cut(c, cut_all=True)#将字符串分词 22 | length = sum(1 for item in seg_list)#计算该句描述中的分词数 23 | if length > max_len: 24 | max_len = length 25 | print('max_len: ' + str(max_len))#输出所有描述中的最大长度 26 | -------------------------------------------------------------------------------- /src/app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | sys.path.insert(0, 'src') 4 | # 引入必要的包 5 | import os 6 | import pickle 7 | import random 8 | 9 | import cv2 as cv 10 | import keras.backend as K 11 | 12 | from config import test_a_image_folder, img_rows, img_cols, best_model 13 | from forward import build_model 14 | from generated import test_gen 15 | 16 | if __name__ == '__main__': 17 | channel = 3 18 | #读取权重文件 19 | model_weights_path = os.path.join('models', best_model) 20 | #创建模型 21 | model = build_model() 22 | #加载权重 23 | model.load_weights(model_weights_path) 24 | #加载语料库 25 | vocab = pickle.load(open('data/vocab_train.p', 'rb')) 26 | 27 | #进行处理 便于模型读取文字数据和预测 28 | idx2word = sorted(vocab) 29 | word2idx = dict(zip(idx2word, range(len(vocab)))) 30 | 31 | print(model.summary()) 32 | 33 | test_gen() 34 | 35 | #读取图片 36 | encoded_test_a = pickle.load(open('data/encoded_test_a_images.p', 'rb')) 37 | 38 | names = [f for f in encoded_test_a.keys()] 39 | 40 | samples = names 41 | 42 | sentences = [] 43 | for i in range(len(samples)): 44 | #依次取图片 45 | image_name = samples[i] 46 | filename = os.path.join(test_a_image_folder, image_name) 47 | # # print('Start processing image: {}'.format(filename)) 48 | # image_input = np.zeros((1, 2048)) 49 | # image_input[0] = encoded_test_a[image_name] 50 | # 51 | # start_words = [start_word] 52 | # while True: 53 | # text_input = [word2idx[i] for i in start_words] 54 | # text_input = sequence.pad_sequences([text_input], maxlen=max_token_length, padding='post') 55 | # preds = model.predict([image_input, text_input]) 56 | # # print('output.shape: ' + str(output.shape)) 57 | # word_pred = idx2word[np.argmax(preds[0])] 58 | # start_words.append(word_pred) 59 | # if word_pred == stop_word or len(start_word) > max_token_length: 60 | # break 61 | #使用beam_search机制进行预测 62 | from beam_search import beam_search_predictions 63 | 64 | candidate = beam_search_predictions(model, image_name, word2idx, idx2word, encoded_test_a, 65 | beam_index=20) 66 | #打印结果 67 | print(candidate) 68 | sentences.append(candidate) 69 | #读取图片 并调整其大小 70 | img = cv.imread(filename) 71 | #img = cv.resize(img, (img_rows, img_cols), cv.INTER_CUBIC) 72 | if not os.path.exists('images'): 73 | os.makedirs('images') 74 | cv.imwrite('images/{}_image.jpg'.format(i), img) 75 | #将预测产生的描述信息输出到demo.txt文件中 76 | with open('demo.txt', 'w') as file: 77 | file.write('\n'.join(sentences)) 78 | 79 | K.clear_session() -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | img_rows, img_cols, img_size = 224, 224, 2243#图像行数,列数,大小 5 | channel = 3#通道数 6 | batch_size = 256#batch大小 7 | epochs = 10000#迭代次数 8 | patience = 50#容忍度 9 | num_train_samples = 14883151#训练样本数 10 | num_valid_samples = 2102270#验证样本数 11 | embedding_size = 128#嵌入层数目 12 | vocab_size = 17628#词汇数目 13 | max_token_length = 40#分词后token的最大长度 14 | num_image_features = 2048#图像特征数 15 | hidden_size = 512# 隐藏层中单元数目 16 | 17 | train_folder = 'data/ai_challenger_caption_train_20170902'#训练集路径 18 | valid_folder = 'data/ai_challenger_caption_validation_20170910'#验证集路径 19 | test_a_folder = 'data/ai_challenger_caption_test_a_20180103'#测试集a路径 20 | test_b_folder = 'data/ai_challenger_caption_test_b_20180103'#测试集b路径 21 | train_image_folder = os.path.join(train_folder, 'caption_train_images_20170902')#训练集图像路径 22 | valid_image_folder = os.path.join(valid_folder, 'caption_validation_images_20170910')#验证集图像路径 23 | test_a_image_folder = os.path.join(test_a_folder, 'caption_test_a_images_20180103')#测试集a图像路径 24 | test_b_image_folder = os.path.join(test_b_folder, 'caption_test_b_images_20180103')#测试集b图像路径 25 | train_annotations_filename = 'caption_train_annotations_20170902.json'#训练集标注路径 26 | valid_annotations_filename = 'caption_validation_annotations_20170910.json'#验证集标注路径 27 | test_a_annotations_filename = 'caption_test_a_annotations_20180103.json'#测试集a标注路径 28 | test_b_annotations_filename = 'caption_test_b_annotations_20180103.json'#测试集b标注路径 29 | 30 | start_word = ''#开始的词 31 | stop_word = ''#结束的词 32 | unknown_word = ''#未知词 33 | 34 | best_model = 'model.04-1.3820.hdf5' 35 | beam_size = 20#光束大小 36 | -------------------------------------------------------------------------------- /src/data_generator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pickle 3 | from tqdm import tqdm 4 | import keras 5 | import numpy as np 6 | from keras.preprocessing import sequence 7 | from keras.utils import Sequence 8 | 9 | from config import batch_size, max_token_length, vocab_size 10 | 11 | 12 | class DataGenSequence(Sequence): 13 | def __init__(self, usage): 14 | self.usage = usage 15 | 16 | vocab = pickle.load(open('data/vocab_train.p', 'rb')) 17 | #反序列化词汇表,将文件中的数据解析为一个python对象 18 | self.idx2word = sorted(vocab)#对vocab排序 19 | self.word2idx = dict(zip(self.idx2word, range(len(vocab)))) 20 | #将排序好的列表和从0开始的序号列表打包成元组,并对应生成字典 21 | 22 | filename = 'data/encoded_{}_images.p'.format(usage)#用usage替换占位符{}中的内容 23 | self.image_encoding = pickle.load(open(filename, 'rb'))#反序列化图像编码 24 | 25 | if usage == 'train': 26 | samples_path = 'data/samples_train.p'#训练集路径 27 | else: 28 | samples_path = 'data/samples_valid.p'#验证集路径 29 | 30 | samples = pickle.load(open(samples_path, 'rb')) 31 | self.samples = samples 32 | np.random.shuffle(self.samples)#打乱 33 | 34 | def __len__(self): 35 | return int(np.ceil(len(self.samples) / float(batch_size))) 36 | #返回大于等于samples长度/bactch size的最小整数 37 | 38 | def __getitem__(self, idx): 39 | i = idx * batch_size 40 | 41 | length = min(batch_size, (len(self.samples) - i)) 42 | #batch size和sample长度-i中的小值作为length 43 | batch_image_input = np.empty((length, 2048), dtype=np.float32) 44 | #生成一个随机元素的矩阵,行数为length,列数为2048,数据类型为32位浮点数 45 | batch_y = np.empty((length, vocab_size), dtype=np.int32) 46 | #生成一个随机元素的矩阵,行数为length,列数为vocab_size,数据类型为32位浮点数 47 | text_input = [] 48 | 49 | for i_batch in tqdm(range(length)): 50 | sample = self.samples[i + i_batch] 51 | image_id = sample['image_id']#读取image_id键对应值 52 | image_input = np.array(self.image_encoding[image_id])#生成对应图像二维数组 53 | text_input.append(sample['input'])#向text_input列表添加sample中input键对应值 54 | batch_image_input[i_batch] = image_input 55 | batch_y[i_batch] = keras.utils.to_categorical(sample['output'], vocab_size) 56 | #将output整型标签转为onehot,vocab_size为标签类别总数 57 | 58 | batch_text_input = sequence.pad_sequences(text_input, maxlen=max_token_length, padding='post')#将text_input序列转化为经过填充的等长的新序列,max_token_length为序列的最大长度,当需要补0时在序列的结尾补 59 | return [batch_image_input, batch_text_input], batch_y 60 | 61 | def on_epoch_end(self): 62 | np.random.shuffle(self.samples)#打乱顺序 63 | 64 | 65 | def train_gen(): 66 | return DataGenSequence('train')#生成训练集序列 67 | 68 | 69 | def valid_gen(): 70 | return DataGenSequence('valid')#生成验证集序列 71 | -------------------------------------------------------------------------------- /src/hp_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import print_function 3 | 4 | import os 5 | from math import log 6 | 7 | import keras 8 | from hyperas import optim#Hyperas可以在Keras中自动选择超参数 9 | from hyperas.distributions import loguniform 10 | from hyperas.distributions import uniform 11 | from hyperopt import Trials, STATUS_OK, tpe 12 | from keras.layers import Input, LSTM, Concatenate, Embedding, RepeatVector, TimeDistributed 13 | from keras.layers.core import Dense, Dropout 14 | from keras.models import Model 15 | 16 | from config import batch_size, num_train_samples, num_valid_samples, max_token_length, vocab_size, embedding_size, \ 17 | best_model 18 | from data_generator import DataGenSequence 19 | 20 | 21 | def data(): 22 | return DataGenSequence('train'), DataGenSequence('valid')#返回训练集序列和验证集序列 23 | 24 | 25 | def create_model(): 26 | # word embedding 27 | text_input = Input(shape=(max_token_length,), dtype='int32')#输入行数为max_token_length的矩阵,数据类型是32位整数 28 | x = Embedding(input_dim=vocab_size, output_dim=embedding_size)(text_input)#embedding层 29 | x = LSTM(256, return_sequences=True)(x)#输入LSTM模型,256个神经元 30 | text_embedding = TimeDistributed(Dense(embedding_size))(x)#使用包装器TimeDistributed包装Dense,以产生针对各个时间步信号的独立全连接 31 | 32 | # image embedding 33 | image_input = Input(shape=(2048,))#输入行数为2048的图形数据矩阵 34 | #全连接层dense,激活函数:relu 35 | x = Dense(embedding_size, activation='relu', name='image_embedding')(image_input) 36 | # the image I is only input once RepeatVector层将输入只重复1次 37 | image_embedding = RepeatVector(1)(x) 38 | 39 | # language model 40 | x = [image_embedding, text_embedding] 41 | #按列合并 42 | x = Concatenate(axis=1)(x) 43 | #以一定概率丢弃神经元,uniform随机生成0-1内一个实数 44 | x = Dropout({{uniform(0, 1)}})(x) 45 | #输入LSTM模型,1024个神经元 46 | x = LSTM(1024, return_sequences=True, name='language_lstm_1')(x) 47 | x = Dropout({{uniform(0, 1)}})(x) 48 | x = LSTM(1024, name='language_lstm_2')(x) 49 | x = Dropout({{uniform(0, 1)}})(x) 50 | # 全连接层dense,激活函数:softmax 51 | output = Dense(vocab_size, activation='softmax', name='output')(x) 52 | 53 | inputs = [image_input, text_input] 54 | model = Model(inputs=inputs, outputs=output) 55 | model_weights_path = os.path.join('models', best_model) 56 | #加载权值 57 | model.load_weights(model_weights_path) 58 | #Adam 优化器 59 | adam = keras.optimizers.Adam(lr={{loguniform(log(1e-6), log(1e-3))}}) 60 | #将优化器传递给 model.compile(),损失函数为多分类的对数损失函数 61 | model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=adam) 62 | #逐个生成数据的batch并进行训练 63 | model.fit_generator( 64 | DataGenSequence('train'), 65 | steps_per_epoch=num_train_samples / batch_size // 10, 66 | validation_data=DataGenSequence('valid'), 67 | validation_steps=num_valid_samples / batch_size // 10) 68 | 69 | score, acc = model.evaluate_generator(DataGenSequence('valid'), verbose=0)#使用一个生成器作为数据源,来评估模型 70 | print('Test accuracy:', acc) 71 | return {'loss': -acc, 'status': STATUS_OK, 'model': model} 72 | 73 | 74 | if __name__ == '__main__': 75 | best_run, best_model = optim.minimize(model=create_model, 76 | data=data, 77 | algo=tpe.suggest,#algo参数指定搜索算法,tpe表示 tree of Parzen estimators 78 | max_evals=10,#执行的最大评估次数max_evals 79 | trials=Trials())#在每个时间步存储信息 80 | 81 | print("Evalutation of best performing model:")#最佳表现模型评估 82 | print(best_model.evaluate_generator(DataGenSequence('valid')))#使用一个生成器作为数据源,来评估模型 83 | print("Best performing model chosen hyper-parameters:")#表现最佳的模型选择的超参数: 84 | print(best_run) 85 | -------------------------------------------------------------------------------- /src/template.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | if __name__ == '__main__': 4 | with open('demo.txt', 'r', encoding="utf-8") as file: 5 | demo = file.readlines()#按行读demo 6 | 7 | with open('beam.txt', 'r', encoding="utf-8") as file: 8 | beam = file.readlines()#按行读beam 9 | 10 | with open('README.template', 'r', encoding="utf-8") as file: 11 | template = file.readlines()#按行读template 12 | 13 | template = ''.join(template)#将template以空格连接 14 | 15 | for i in range(20): 16 | template = template.replace('[{}]'.format(i), demo[i].strip())#把template中的所有[]内的{}占位符用demo中去除首尾空格后的对应行替换 17 | 18 | for i in range(0, 10): 19 | beam_data = [line.strip() for line in beam[i * 4:(i + 1) * 4]]#beam中去掉首尾空格每四行作为一个列表存到beam_data 20 | beam_text = '
'.join(beam_data)#将beam_data以换行符连接存到bean_text 21 | template = template.replace('({})'.format(i), beam_text)#把template中的所有()内的{}占位符用beam_text替换 22 | 23 | with open('README.md', 'w', encoding="utf-8") as file: 24 | file.write(template)#写回template 25 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import multiprocessing 3 | 4 | import cv2 as cv 5 | import tensorflow as tf 6 | from tensorflow.python.client import device_lib 7 | 8 | 9 | # 获取GPU的数量 10 | def get_available_gpus(): 11 | local_device_protos = device_lib.list_local_devices() 12 | return [x.name for x in local_device_protos if x.device_type == 'GPU'] 13 | 14 | 15 | # 获取CPU的数量 16 | def get_available_cpus(): 17 | return multiprocessing.cpu_count() 18 | 19 | # 在图片上写文字用的方法 将字符写到图片上输出 20 | def draw_str(dst, target, s): 21 | x, y = target 22 | cv.putText(dst, s, (x + 1, y + 1), cv.FONT_HERSHEY_PLAIN, 1.0, (0, 0, 0), thickness=2, lineType=cv.LINE_AA) 23 | cv.putText(dst, s, (x, y), cv.FONT_HERSHEY_PLAIN, 1.0, (255, 255, 255), lineType=cv.LINE_AA) 24 | 25 | # 使用交叉熵损失函数作为分类损失函数 26 | def sparse_loss(y_true, y_pred): 27 | return tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred) --------------------------------------------------------------------------------