├── .gitignore ├── Ex1_BLIP ├── README.md ├── Salesforce │ └── blip-image-captioning-base │ │ ├── config.json │ │ ├── preprocessor_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.json │ │ ├── tokenizer_config.json │ │ └── vocab.txt ├── requirements.txt ├── run_fulldata_script.py └── run_script.py ├── Ex2_RL-Loss ├── Model1_RL-Loss.py ├── Model2_RL-Loss.py └── Pure_RL-Loss.py ├── LICENSE ├── Model1_YellowOrange ├── README.md ├── SelfAttention+Attention.ipynb ├── __pycache__ │ ├── configuartions.cpython-310.pyc │ ├── datasets.cpython-310.pyc │ └── models.cpython-310.pyc ├── configuartions.py ├── datasets.py ├── datasets_pretrain_demo.py ├── models.py ├── predict.py ├── requirements.txt └── train.py ├── Model2_Transformer ├── TransformerE+D.ipynb ├── __pycache__ │ └── configuration.cpython-310.pyc ├── configuration.py ├── data_preprocessing │ ├── divide_dataset.py │ └── name_info.py ├── datasets │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── coco.cpython-310.pyc │ │ └── utils.cpython-310.pyc │ ├── coco.py │ └── utils.py ├── engine.py ├── fulldata_inference.py ├── local_inference.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── alice.cpython-310.pyc │ │ ├── backbone.cpython-310.pyc │ │ ├── caption.cpython-310.pyc │ │ ├── position_encoding.cpython-310.pyc │ │ ├── transformer.cpython-310.pyc │ │ └── utils.cpython-310.pyc │ ├── alice.py │ ├── backbone.py │ ├── caption.py │ ├── position_encoding.py │ ├── transformer.py │ └── utils.py ├── online_inference.py ├── requirements.txt ├── train_coco.py └── train_dev.py ├── Original_Model ├── __pycache__ │ ├── configurations.cpython-310.pyc │ ├── datasets.cpython-310.pyc │ └── models.cpython-310.pyc ├── configurations.py ├── datasets.py ├── gridattn_image_caption.ipynb ├── models.py ├── predict.py └── train.py ├── README.md ├── data_new ├── BLIP_test_captions.json ├── BLIP_train_captions.json ├── Model2_test_captions.json ├── Model2_train_captions_1.json ├── output │ ├── caplens_test.json │ ├── caplens_train.json │ ├── encoded_captions_test.json │ ├── encoded_captions_train.json │ ├── image_paths_test.json │ ├── image_paths_train.json │ └── vocab.json └── rename_script.py ├── data_old ├── label.json ├── output │ ├── caplens_test.json │ ├── caplens_train.json │ ├── encoded_captions_test.json │ ├── encoded_captions_train.json │ ├── image_paths_test.json │ ├── image_paths_train.json │ └── vocab.json ├── test_captions.json └── train_captions.json └── doc ├── NNDL图像_描述指南.pdf ├── NNDL课设_中期报告.pdf ├── NNDL课设_开题报告.pdf ├── NNDL课设_结题报告.ipynb ├── NNDL课设_结题报告.md ├── NNDL课设_结题报告.pdf ├── NNDL课设_要求说明.pdf └── img ├── 01.png ├── 02.png ├── 03.png ├── AttentionModel-first_train-1.png ├── AttentionModel-first_train-2.png ├── AttentionModel-first_train-3.png ├── AttentionModel-first_train-4.png ├── AttentionModel_backgroundcaption.png ├── BLIP_1.png ├── BLIP_2.png ├── BLIP_3.png ├── BLIP_demo.png ├── BLIP_full.png ├── CNN.png ├── Ex_data.png ├── OriginalModel-1.png ├── OriginalModel-2.png ├── Out_of_Memory.png ├── RNN.png ├── Transformer_demo1.png ├── Transformer_demo2.png ├── Transformer_demo3.png ├── Transformer_demo4.png ├── Transformer_framework.png ├── image-1.png ├── image.png └── image20.png /.gitignore: -------------------------------------------------------------------------------- 1 | data_old/test_images/ 2 | data_old/train_images/ 3 | data_new/test_images/ 4 | data_new/train_images_1/ 5 | data_new/train_images_2/ 6 | data_new/output/weights/ 7 | 
Ex1_BLIP/Salesforce/blip-image-captioning-base/pytorch_model.bin 8 | Model2_Transformer/image_caption_model.pth -------------------------------------------------------------------------------- /Ex1_BLIP/README.md: -------------------------------------------------------------------------------- 1 | 一键运行方法: 2 | 3 | 1. `pip install -r requirements.txt` 4 | 2. `python run_half_precision.py` 5 | 3. 查看输出 6 | 7 | > 🤗写的文档能让80岁老太太看懂是我们的目标 -------------------------------------------------------------------------------- /Ex1_BLIP/Salesforce/blip-image-captioning-base/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_commit_hash": null, 3 | "architectures": [ 4 | "BlipForConditionalGeneration" 5 | ], 6 | "image_text_hidden_size": 256, 7 | "initializer_factor": 1.0, 8 | "logit_scale_init_value": 2.6592, 9 | "model_type": "blip", 10 | "projection_dim": 512, 11 | "text_config": { 12 | "_name_or_path": "", 13 | "add_cross_attention": false, 14 | "architectures": null, 15 | "attention_probs_dropout_prob": 0.0, 16 | "bad_words_ids": null, 17 | "begin_suppress_tokens": null, 18 | "bos_token_id": 30522, 19 | "chunk_size_feed_forward": 0, 20 | "cross_attention_hidden_size": null, 21 | "decoder_start_token_id": null, 22 | "diversity_penalty": 0.0, 23 | "do_sample": false, 24 | "early_stopping": false, 25 | "encoder_no_repeat_ngram_size": 0, 26 | "eos_token_id": 2, 27 | "exponential_decay_length_penalty": null, 28 | "finetuning_task": null, 29 | "forced_bos_token_id": null, 30 | "forced_eos_token_id": null, 31 | "hidden_act": "gelu", 32 | "hidden_dropout_prob": 0.0, 33 | "hidden_size": 768, 34 | "id2label": { 35 | "0": "LABEL_0", 36 | "1": "LABEL_1" 37 | }, 38 | "initializer_factor": 1.0, 39 | "initializer_range": 0.02, 40 | "intermediate_size": 3072, 41 | "is_decoder": true, 42 | "is_encoder_decoder": false, 43 | "label2id": { 44 | "LABEL_0": 0, 45 | "LABEL_1": 1 46 | }, 47 | "layer_norm_eps": 1e-12, 48 | "length_penalty": 1.0, 49 | "max_length": 24, 50 | "max_position_embeddings": 512, 51 | "min_length": 0, 52 | "model_type": "blip_text_model", 53 | "no_repeat_ngram_size": 0, 54 | "num_attention_heads": 12, 55 | "num_beam_groups": 1, 56 | "num_beams": 1, 57 | "num_hidden_layers": 12, 58 | "num_return_sequences": 1, 59 | "output_attentions": false, 60 | "output_hidden_states": false, 61 | "output_scores": false, 62 | "pad_token_id": 0, 63 | "prefix": null, 64 | "problem_type": null, 65 | "projection_dim": 768, 66 | "pruned_heads": {}, 67 | "remove_invalid_values": false, 68 | "repetition_penalty": 1.0, 69 | "return_dict": true, 70 | "return_dict_in_generate": false, 71 | "sep_token_id": 102, 72 | "suppress_tokens": null, 73 | "task_specific_params": null, 74 | "temperature": 1.0, 75 | "tf_legacy_loss": false, 76 | "tie_encoder_decoder": false, 77 | "tie_word_embeddings": true, 78 | "tokenizer_class": null, 79 | "top_k": 50, 80 | "top_p": 1.0, 81 | "torch_dtype": null, 82 | "torchscript": false, 83 | "transformers_version": "4.26.0.dev0", 84 | "typical_p": 1.0, 85 | "use_bfloat16": false, 86 | "use_cache": true, 87 | "vocab_size": 30524 88 | }, 89 | "torch_dtype": "float32", 90 | "transformers_version": null, 91 | "vision_config": { 92 | "_name_or_path": "", 93 | "add_cross_attention": false, 94 | "architectures": null, 95 | "attention_dropout": 0.0, 96 | "bad_words_ids": null, 97 | "begin_suppress_tokens": null, 98 | "bos_token_id": null, 99 | "chunk_size_feed_forward": 0, 100 | "cross_attention_hidden_size": null, 101 | "decoder_start_token_id": null, 102 
| "diversity_penalty": 0.0, 103 | "do_sample": false, 104 | "dropout": 0.0, 105 | "early_stopping": false, 106 | "encoder_no_repeat_ngram_size": 0, 107 | "eos_token_id": null, 108 | "exponential_decay_length_penalty": null, 109 | "finetuning_task": null, 110 | "forced_bos_token_id": null, 111 | "forced_eos_token_id": null, 112 | "hidden_act": "gelu", 113 | "hidden_size": 768, 114 | "id2label": { 115 | "0": "LABEL_0", 116 | "1": "LABEL_1" 117 | }, 118 | "image_size": 384, 119 | "initializer_factor": 1.0, 120 | "initializer_range": 0.02, 121 | "intermediate_size": 3072, 122 | "is_decoder": false, 123 | "is_encoder_decoder": false, 124 | "label2id": { 125 | "LABEL_0": 0, 126 | "LABEL_1": 1 127 | }, 128 | "layer_norm_eps": 1e-05, 129 | "length_penalty": 1.0, 130 | "max_length": 20, 131 | "min_length": 0, 132 | "model_type": "blip_vision_model", 133 | "no_repeat_ngram_size": 0, 134 | "num_attention_heads": 12, 135 | "num_beam_groups": 1, 136 | "num_beams": 1, 137 | "num_channels": 3, 138 | "num_hidden_layers": 12, 139 | "num_return_sequences": 1, 140 | "output_attentions": false, 141 | "output_hidden_states": false, 142 | "output_scores": false, 143 | "pad_token_id": null, 144 | "patch_size": 16, 145 | "prefix": null, 146 | "problem_type": null, 147 | "projection_dim": 512, 148 | "pruned_heads": {}, 149 | "remove_invalid_values": false, 150 | "repetition_penalty": 1.0, 151 | "return_dict": true, 152 | "return_dict_in_generate": false, 153 | "sep_token_id": null, 154 | "suppress_tokens": null, 155 | "task_specific_params": null, 156 | "temperature": 1.0, 157 | "tf_legacy_loss": false, 158 | "tie_encoder_decoder": false, 159 | "tie_word_embeddings": true, 160 | "tokenizer_class": null, 161 | "top_k": 50, 162 | "top_p": 1.0, 163 | "torch_dtype": null, 164 | "torchscript": false, 165 | "transformers_version": "4.26.0.dev0", 166 | "typical_p": 1.0, 167 | "use_bfloat16": false 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /Ex1_BLIP/Salesforce/blip-image-captioning-base/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "do_resize": true, 4 | "image_mean": [ 5 | 0.48145466, 6 | 0.4578275, 7 | 0.40821073 8 | ], 9 | "image_processor_type": "BlipImageProcessor", 10 | "image_std": [ 11 | 0.26862954, 12 | 0.26130258, 13 | 0.27577711 14 | ], 15 | "processor_class": "BlipProcessor", 16 | "size": 384 17 | } 18 | -------------------------------------------------------------------------------- /Ex1_BLIP/Salesforce/blip-image-captioning-base/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /Ex1_BLIP/Salesforce/blip-image-captioning-base/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_basic_tokenize": true, 4 | "do_lower_case": true, 5 | "mask_token": "[MASK]", 6 | "model_max_length": 512, 7 | "name_or_path": "bert-base-uncased", 8 | "never_split": null, 9 | "pad_token": "[PAD]", 10 | "processor_class": "BlipProcessor", 11 | "sep_token": "[SEP]", 12 | "special_tokens_map_file": null, 13 | "strip_accents": null, 14 | "tokenize_chinese_chars": true, 15 | "tokenizer_class": "BertTokenizer", 16 | 
"unk_token": "[UNK]", 17 | "model_input_names": [ 18 | "input_ids", 19 | "attention_mask" 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /Ex1_BLIP/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Ex1_BLIP/requirements.txt -------------------------------------------------------------------------------- /Ex1_BLIP/run_fulldata_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from PIL import Image 5 | from transformers import BlipProcessor, BlipForConditionalGeneration 6 | 7 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 8 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to("cuda") 9 | 10 | # Path to the folder containing images 11 | images_folder = "../Ex2_data/train_images" 12 | 13 | # Create a dictionary to store image captions 14 | captions_dict = {} 15 | 16 | count = 1 17 | 18 | # Loop through each file in the images folder 19 | for filename in os.listdir(images_folder): 20 | if filename.endswith(".jpg") or filename.endswith(".png"): # Add other supported image formats if necessary 21 | img_path = os.path.join(images_folder, filename) 22 | 23 | # Load the image 24 | raw_image = Image.open(img_path).convert('RGB') 25 | 26 | # Your text for captioning 27 | text = "The background content in the picture is" 28 | 29 | # Process the image and generate caption 30 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16) 31 | 32 | out = model.generate(**inputs) 33 | generated_caption = processor.decode(out[0], skip_special_tokens=True) 34 | 35 | print(f"No{count}", generated_caption) 36 | count += 1 37 | 38 | # Store the caption in the dictionary 39 | captions_dict[img_path] = generated_caption 40 | 41 | # Save the dictionary to captions.json 42 | output_path = "../Ex2_data/Ex2_background_train_captions.json" 43 | with open(output_path, 'w') as json_file: 44 | json.dump(captions_dict, json_file, indent=4) 45 | 46 | print(f"Captions saved to {output_path}") 47 | -------------------------------------------------------------------------------- /Ex1_BLIP/run_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import requests 4 | import torch 5 | from PIL import Image 6 | from transformers import BlipProcessor, BlipForConditionalGeneration 7 | from nltk.tokenize import word_tokenize 8 | from nltk.translate.meteor_score import single_meteor_score 9 | from rouge import Rouge 10 | 11 | def print_line(): 12 | print("============================================================================================================") 13 | 14 | # 计算 METEOR 分数 15 | def calc_meteor(reference, hypothesis): 16 | hypothesis = word_tokenize(hypothesis) 17 | reference = word_tokenize(reference) 18 | return single_meteor_score(reference, hypothesis) 19 | 20 | # 计算 ROUGE-L 分数 21 | def calc_rouge_l(reference, hypothesis): 22 | rouge = Rouge() 23 | scores = rouge.get_scores(hypothesis, reference) 24 | return scores[0]['rouge-l']['f'] 25 | 26 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 27 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", 
torch_dtype=torch.float16).to("cuda") 28 | 29 | # img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' # 图片URL版本 30 | # raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') # 图片URL版本 31 | 32 | img_local_url = "../data_new/test_images/test_8.jpg" # 本地图片版本 33 | raw_image = Image.open(img_local_url).convert('RGB') # 本地图片版本 34 | 35 | # Conditional image captioning 36 | text = "The background content in the picture is" 37 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16) 38 | 39 | out = model.generate(**inputs) 40 | generated_caption = processor.decode(out[0], skip_special_tokens=True) 41 | print_line() 42 | print("背景描述:", generated_caption) 43 | # print_line() 44 | 45 | # Unconditional image captioning 46 | # inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16) 47 | 48 | # out = model.generate(**inputs) 49 | # generated_caption_unconditional = processor.decode(out[0], skip_special_tokens=True) 50 | # print_line() 51 | # print("图像描述:", generated_caption_unconditional) 52 | # print_line() 53 | 54 | # # 加入评估指标计算 55 | # with open('../data/test_captions.json', 'r') as f: 56 | # captions = json.load(f) 57 | 58 | # # filename = os.path.basename(img_local_url) 59 | # filename = os.path.basename(img_url) 60 | # reference_description = captions.get(filename, "No description found.") 61 | 62 | # print("Predict Caption = ", generated_caption.capitalize()) 63 | # print("Reference Caption = ", reference_description.capitalize()) 64 | 65 | # meteor_score = calc_meteor(reference_description, generated_caption) 66 | # rouge_l_score = calc_rouge_l(reference_description, generated_caption) 67 | # print_line() 68 | # print("METEOR Score =", round(meteor_score, 4)) 69 | # print("ROUGE-L Score =", round(rouge_l_score, 4)) 70 | # print_line() 71 | -------------------------------------------------------------------------------- /Ex2_RL-Loss/Model1_RL-Loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | 我们使用强化学习损失函数,将交叉熵损失和CIDEr-D评价指标结合,优化损失函数。 3 | 我们将使用REINFORCE算法来进行更新。 4 | """ 5 | import json 6 | import torch 7 | import os 8 | from configuartions import Config 9 | from models import AttentionModel, get_optimizer, PackedCrossEntropyLoss, evaluate_cider 10 | from datasets import create_dataloaders, ImageTextDataset 11 | from torch.distributions import Categorical 12 | 13 | 14 | def main(): 15 | best_test_score = float('-inf') # 初始化最佳测试得分 16 | 17 | # 加载配置 18 | config = Config() 19 | 20 | # 创建数据加载器 21 | train_loader, test_loader = create_dataloaders(config) 22 | 23 | # 加载词汇表文件 24 | with open('../data/output/vocab.json', 'r') as f: 25 | vocab = json.load(f) 26 | 27 | # 模型初始化 28 | model = AttentionModel( 29 | image_code_dim=config.image_code_dim, 30 | vocab=vocab, # 传递词汇表字典 31 | word_dim=config.word_dim, 32 | attention_dim=config.attention_dim, 33 | hidden_size=config.hidden_size, 34 | num_layers=config.num_layers 35 | ).to(config.device) 36 | 37 | # 优化器 38 | optimizer = get_optimizer(model, config) 39 | 40 | # 损失函数 41 | loss_fn = PackedCrossEntropyLoss().to(config.device) 42 | 43 | # 创建保存权重的文件夹路径 44 | weights_dir = os.path.join(config.output_folder, 'weights') 45 | os.makedirs(weights_dir, exist_ok=True) 46 | 47 | best_val_score = float('-inf') # 初始化最佳验证得分 48 | 49 | for epoch in range(config.num_epochsum_epochs): 50 | model.train() 51 | for i, (imgs, caps, caplens) in enumerate(train_dataloader): 52 | imgs, caps = imgs.to(device), caps.to(device) 53 | 
caplens = caplens.cpu().to(torch.int64) 54 | optimizer.zero_grad() 55 | outputs, alphas, _, _, softmax_probabilities = model(imgs, caps, caplens) 56 | current_test_score = evaluate_cider(test_loader, model, config) 57 | m = Categorical(torch.tensor(softmax_probabilities)) 58 | action = m.sample() 59 | log_probs = m.log_prob(action) 60 | reinforce_loss = -log_probs * float(current_test_score) 61 | reinforce_loss.mean().backward() 62 | optimizer.step() 63 | 64 | 65 | """ 66 | # 开始训练 67 | for epoch in range(config.num_epochs): 68 | # 训练模型 69 | model.train() 70 | for i, (imgs, caps, caplens) in enumerate(train_loader): 71 | imgs, caps = imgs.to(config.device), caps.to(config.device) 72 | caplens = caplens.cpu().to(torch.int64) 73 | 74 | optimizer.zero_grad() 75 | outputs, alphas, _, _, _ = model(imgs, caps, caplens) 76 | 77 | # 确保目标序列长度与模型输出匹配 78 | targets = caps[:, 1:] # 假设targets是captions去除第一个标记后的部分 79 | # print(f"Outputs shape: {outputs.shape}") 80 | # print(f"Targets shape: {targets.shape}") 81 | # print(f"Caplens: {caplens}") 82 | loss = loss_fn(outputs, targets, caplens) 83 | loss.backward() 84 | optimizer.step() 85 | 86 | # 打印/记录损失信息 87 | if (i + 1) % 100 == 0: 88 | print(f'Epoch [{epoch + 1}/{config.num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}') 89 | 90 | # 在每个epoch结束时使用测试集评估模型 91 | current_test_score = evaluate_cider(test_loader, model, config) 92 | print(f"Epoch {epoch + 1}: CIDEr-D score = {current_test_score}") 93 | 94 | # 如果当前得分比之前的最佳得分要好,则保存模型 95 | if current_test_score > best_test_score: 96 | best_test_score = current_test_score 97 | best_model_path = os.path.join(weights_dir, f'Attention_model_background_caption_{best_test_score}.pth') 98 | torch.save(model.state_dict(), best_model_path) 99 | print(f"Saved new best model to {best_model_path}") 100 | """ 101 | 102 | # 训练完成后的最终评估 103 | final_test_score = evaluate_cider(test_loader, model, config) 104 | print(f"Final CIDEr-D score = {final_test_score}") 105 | 106 | # # 训练完成后保存模型 107 | # final_model_path = os.path.join(weights_dir, 'AttentionModel.pth') 108 | # torch.save(model.state_dict(), final_model_path) 109 | # print(f"Saved final model to {final_model_path}") 110 | 111 | 112 | if __name__ == '__main__': 113 | main() 114 | 115 | 116 | -------------------------------------------------------------------------------- /Ex2_RL-Loss/Model2_RL-Loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | 必须要指出的是,图像描述的任务,这是一个序列生成任务,而不是一个强化学习任务。 3 | 在这种情况下,使用强化学习可能并不是最好的选择,因为定义出合适的奖励函数可能会非常困难。 4 | """ 5 | import json 6 | import torch 7 | from torch.utils.data import Dataset, DataLoader 8 | from torchvision import transforms 9 | from transformers import BertTokenizer 10 | from PIL import Image 11 | from configuration import Config 12 | 13 | # 数据集类 14 | class MyDataset(Dataset): 15 | def __init__(self, json_file, img_dir, transform=None): 16 | with open(json_file, 'r') as f: 17 | self.data = json.load(f) 18 | self.img_dir = img_dir 19 | self.transform = transform 20 | self.filenames = list(self.data.keys()) 21 | 22 | def __len__(self): 23 | return len(self.data) 24 | 25 | def __getitem__(self, idx): 26 | filename = self.filenames[idx] 27 | caption = self.data[filename] 28 | image = Image.open(f"{self.img_dir}/{filename}") 29 | if self.transform: 30 | image = self.transform(image) 31 | return image, caption 32 | 33 | # 检查是否有可用的GPU 34 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 35 | 36 | # 加载模型 37 | config = Config() 38 | model = 
torch.hub.load('saahiluppal/catr', 'v3', pretrained=True) 39 | model = model.to(device) # 将模型移动到指定的设备上 40 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 41 | 42 | # 准备数据集 43 | transform = transforms.Compose([ 44 | transforms.ToTensor(), 45 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 46 | ]) 47 | 48 | train_dataset = MyDataset('../data/train_captions.json', '../data/train_images', transform=transform) 49 | train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True) 50 | 51 | # 定义损失函数和优化器 52 | criterion = torch.nn.CrossEntropyLoss() 53 | optimizer = torch.optim.Adam(model.parameters()) 54 | 55 | # 定义奖励函数 56 | def reward_function(predictions, targets): 57 | # 这只是一个示例,你需要根据你的任务定义合适的奖励函数 58 | return (predictions == targets).float() 59 | 60 | # 定义策略梯度更新函数 61 | def policy_gradient_update(model, images, captions, optimizer): 62 | outputs = model(images, captions['input_ids'], captions['attention_mask']) 63 | rewards = reward_function(outputs.logits.argmax(-1), captions['input_ids']) 64 | action_probs = outputs.logits.softmax(-1) 65 | picked_action_probs = action_probs.gather(-1, captions['input_ids'].unsqueeze(-1)).squeeze(-1) 66 | loss = (-torch.log(picked_action_probs) * rewards).mean() 67 | 68 | optimizer.zero_grad() 69 | loss.backward() 70 | optimizer.step() 71 | 72 | return loss 73 | 74 | # 训练循环 75 | num_epochs = 10 76 | for epoch in range(num_epochs): 77 | for images, captions in train_dataloader: 78 | images = images.to(device) # 将图像数据移动到指定的设备上 79 | captions = tokenizer(captions, return_tensors='pt', padding=True, truncation=True) 80 | captions = {key: val.to(device) for key, val in captions.items()} # 将caption数据移动到指定的设备上 81 | 82 | loss = policy_gradient_update(model, images, captions, optimizer) 83 | 84 | print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}') 85 | 86 | # 保存模型 87 | torch.save(model.state_dict(), 'Model2.pth') -------------------------------------------------------------------------------- /Ex2_RL-Loss/Pure_RL-Loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | 【背景及使用原因】 3 | 在深度学习中,常常通过最小化交叉熵损失来训练模型,而模型的好坏则由某种评测指标来衡量。 4 | 这种情况下,交叉熵损失可以看作是默认的训练目标,而评测指标是我们真正关心的指标。 5 | 但很多时候,优化交叉熵损失并不一定能直接优化我们关心的评测指标。 6 | 比如在分类任务中,交叉熵损失会关注每个类别是否正确预测,与我们的实际目标,比如整体预测准确率,可能不一致。 7 | 这就是所谓的默认实现的交叉熵损失和评测指标不一致情况。 8 | 9 | 在这种情况下,可以使用基于强化学习的方法来设定损失函数,使之直接优化我们关心的指标。 10 | 比方说,对于策略梯度方法而言,构造奖励函数以及策略网络,通过互动得到的奖励来更新策略网络,奖励函数就是评测指标。 11 | 12 | 举一个简单的例子,如果我们的评测指标是准确率,那么每次预测对我们就给予+1的奖励,预测错我们就不给奖励。 13 | 我们的策略网络就是我们的预测模型,输出的就是预测结果。 14 | 然后我们利用策略梯度方法,不断通过互动得到的奖励来更新我们的预测模型,使之更好地优化我们关心的指标。 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.optim as optim 20 | 21 | class Model(nn.Module): 22 | def __init__(self, input_size, output_size): 23 | super(Model, self).__init__() 24 | self.linear = nn.Linear(input_size, output_size) 25 | 26 | def forward(self, x): 27 | return torch.softmax(self.linear(x), dim=-1) 28 | 29 | def policy_gradient_update(model, states, actions, rewards, optimizer): 30 | # 获取模型预测的动作概率 31 | action_probs = model(states) 32 | picked_action_probs = action_probs.gather(1, actions.unsqueeze(1)).squeeze(1) 33 | 34 | # 根据公式计算损失 35 | loss = (-torch.log(picked_action_probs) * rewards).mean() 36 | 37 | optimizer.zero_grad() 38 | loss.backward() 39 | optimizer.step() 40 | 41 | model = Model(input_size=10, output_size=2) 42 | optimizer = optim.Adam(model.parameters(), lr=0.01) 43 | 44 | states = torch.randn(100, 10) 45 | actions = torch.randint(0, 2, (100,)) 46 | 
rewards = torch.randn(100) 47 | 48 | policy_gradient_update(model, states, actions, rewards, optimizer) 49 | 50 | """ 51 | 如何将强化学习损失函数放进train代码中呢: 52 | # 示例的训练过程 53 | for epoch in range(num_epochs): 54 | # 对于每个批次的数据 55 | for batch_data in data_loader: 56 | # 从批次数据中获取输入,动作和奖励 57 | states, actions, rewards = batch_data 58 | # 使用强化学习损失函数更新模型 59 | policy_gradient_update(model, states, actions, rewards, optimizer) 60 | 61 | # 验证或测试过程 62 | for batch_data in validation_data_loader: 63 | # 从批次数据中获取输入 64 | states = batch_data 65 | # 用模型对输入进行预测 66 | action_probabilities = model(states) 67 | # 根据需求评估或使用预测结果 68 | ... 69 | """ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 GongYufei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Model1_YellowOrange/README.md: -------------------------------------------------------------------------------- 1 | # 模型训练流程进度 2 | 3 | - 数据集预处理:completed 4 | 5 | - 定义模型:completed 6 | 7 | - 定义损失函数:completed 8 | 9 | - 选择优化方法:completed 10 | 11 | - 评估指标:completed 12 | 13 | - 训练模型:todo 14 | -------------------------------------------------------------------------------- /Model1_YellowOrange/__pycache__/configuartions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model1_YellowOrange/__pycache__/configuartions.cpython-310.pyc -------------------------------------------------------------------------------- /Model1_YellowOrange/__pycache__/datasets.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model1_YellowOrange/__pycache__/datasets.cpython-310.pyc -------------------------------------------------------------------------------- /Model1_YellowOrange/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model1_YellowOrange/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /Model1_YellowOrange/configuartions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | class Config: 3 | # 数据路径 4 | data_path = '../data/' 5 | images_path = '../data/images/' 6 | train_captions_path = '../data/train_captions.json' 7 | test_captions_path = '../data/test_captions.json' 8 | output_folder = '../data/output/' # 输出文件夹的路径,用于存储词汇表和处理后的数据 9 | 10 | # 模型参数 11 | embed_size = 256 12 | vocab_size = 10000 # 根据实际情况调整 13 | num_layers = 3 # 定义循环神经网络(RNN)或其变体(如 LSTM 或 GRU)中的层数。 14 | num_heads = 8 15 | dropout = 0.1 16 | hidden_size = 512 17 | image_code_dim = 2048 # 图像编码维度 18 | word_dim = 256 # 词嵌入维度 19 | attention_dim = 512 # 注意力机制的隐藏层维度 20 | 21 | # 数据处理参数 22 | min_word_count = 5 # 词汇表中词的最小出现次数 23 | max_len = 64 # 假设描述的最大长度为200个词 24 | 25 | # 训练参数 26 | # batch_size = 16 27 | batch_size = 4 28 | learning_rate = 0.001 29 | # num_epochs = 30 30 | num_epochs = 10 31 | workers = 0 # 工作线程数,在自己的电脑上训练的时候设为0 32 | encoder_learning_rate = 1e-4 # 编码器的学习率 33 | decoder_learning_rate = 1e-3 # 解码器的学习率 34 | lr_update = 10 # 每10轮降低学习速率 35 | 36 | # 图像预处理参数 37 | image_size = 256 # 图像缩放大小 38 | crop_size = 224 # 图像裁剪大小 39 | 40 | # Beam Search 参数 41 | beam_k = 5 42 | 43 | # 其他配置 44 | device = 'cuda' if torch.cuda.is_available() else 'cpu' -------------------------------------------------------------------------------- /Model1_YellowOrange/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from collections import Counter 4 | import torch 5 | from PIL import Image 6 | from torch.utils.data import Dataset 7 | from torch.utils.data import DataLoader 8 | import torchvision.transforms as transforms 9 | from configuartions import Config # 导入配置类 10 | 11 | 12 | # 从配置文件获取配置 13 | config = Config() 14 | 15 | 16 | def create_dataset(max_len=64): 17 | """ 18 | 整理数据集,构建词汇表,并将文本描述转换为词索引向量。 19 | 使用configuration.py文件中定义的配置信息。 20 | 
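    Artifacts written to config.output_folder: vocab.json, encoded_captions_train.json,
    encoded_captions_test.json, image_paths_train.json, image_paths_test.json,
    caplens_train.json and caplens_test.json. Indices 0-3 of the vocabulary are
    reserved for the padding/start/end/unknown special tokens.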
""" 21 | # 使用config中定义的路径 22 | image_folder = config.images_path 23 | train_captions_path = config.train_captions_path 24 | test_captions_path = config.test_captions_path 25 | output_folder = config.output_folder 26 | 27 | # 读取训练图像描述 28 | with open(train_captions_path, 'r') as f: 29 | train_captions_data = json.load(f) 30 | 31 | # 读取测试图像描述 32 | with open(test_captions_path, 'r') as f: 33 | test_captions_data = json.load(f) 34 | 35 | # 统计训练集的文本描述的词频 36 | vocab = Counter() 37 | for caption in train_captions_data.values(): 38 | vocab.update(caption.lower().split()) 39 | 40 | # 移除其中的低频词 41 | vocab = {word for word, count in vocab.items() if count >= config.min_word_count} 42 | 43 | # 构建词典 44 | word_to_idx = {word: idx + 4 for idx, word in enumerate(vocab)} 45 | word_to_idx[''] = 0 46 | word_to_idx[''] = 1 47 | word_to_idx[''] = 2 48 | word_to_idx[''] = 3 49 | 50 | # 一个函数来转换描述为词索引向量,并进行填充 51 | def encode_captions(captions_data, word_to_idx, max_len): 52 | encoded_captions = {} 53 | caplens = {} 54 | for img_id, caption in captions_data.items(): 55 | words = caption.lower().split() 56 | encoded_caption = [word_to_idx.get(word, word_to_idx['']) for word in words] 57 | # 加2是因为要加上,但最终caplen应该减去1 58 | caplen = min(len(encoded_caption) + 2, max_len) - 1 59 | encoded_caption = [word_to_idx['']] + encoded_caption + [word_to_idx['']] 60 | encoded_caption += [word_to_idx['']] * (max_len - len(encoded_caption)) 61 | encoded_captions[img_id] = encoded_caption[:max_len] 62 | caplens[img_id] = caplen # if caplen <= max_len else max_len 63 | return encoded_captions, caplens 64 | # def encode_captions(captions_data, word_to_idx, max_len): 65 | # encoded_captions = {} 66 | # for img_id, caption in captions_data.items(): 67 | # words = caption.lower().split() 68 | # encoded_caption = [word_to_idx.get(word, word_to_idx['']) for word in words] 69 | # encoded_caption = [word_to_idx['']] + encoded_caption + [word_to_idx['']] 70 | # encoded_caption += [word_to_idx['']] * (max_len - len(encoded_caption)) 71 | # encoded_captions[img_id] = encoded_caption[:max_len] 72 | # return encoded_captions 73 | 74 | # 对训练集描述进行编码 75 | encoded_captions_train, caplens_train = encode_captions(train_captions_data, word_to_idx, max_len) 76 | 77 | # 对测试集描述进行编码 78 | encoded_captions_test, caplens_test = encode_captions(test_captions_data, word_to_idx, max_len) 79 | 80 | # 存储词典和编码后的描述 81 | with open(os.path.join(output_folder, 'vocab.json'), 'w') as f: 82 | json.dump(word_to_idx, f) 83 | 84 | with open(os.path.join(output_folder, 'encoded_captions_train.json'), 'w') as f: 85 | json.dump(encoded_captions_train, f) 86 | 87 | with open(os.path.join(output_folder, 'encoded_captions_test.json'), 'w') as f: 88 | json.dump(encoded_captions_test, f) 89 | 90 | # 存储图像路径 91 | image_paths_train = {img_id: os.path.join(image_folder, img_id) for img_id in train_captions_data.keys()} 92 | with open(os.path.join(output_folder, 'image_paths_train.json'), 'w') as f: 93 | json.dump(image_paths_train, f) 94 | 95 | image_paths_test = {img_id: os.path.join(image_folder, img_id) for img_id in test_captions_data.keys()} 96 | with open(os.path.join(output_folder, 'image_paths_test.json'), 'w') as f: 97 | json.dump(image_paths_test, f) 98 | 99 | # 存储caplens 100 | with open(os.path.join(output_folder, 'caplens_train.json'), 'w') as f: 101 | json.dump(caplens_train, f) 102 | 103 | with open(os.path.join(output_folder, 'caplens_test.json'), 'w') as f: 104 | json.dump(caplens_test, f) 105 | 106 | 107 | # 调用函数,整理数据集 108 | create_dataset() 109 | 110 | 111 | class 
ImageTextDataset(Dataset): 112 | """ 113 | PyTorch数据集类,用于加载和处理图像-文本数据。 114 | """ 115 | 116 | def __init__(self, image_paths_file, captions_file, caplens_file, transform=None): 117 | """ 118 | 初始化数据集类。 119 | 参数: 120 | image_paths_file: 包含图像路径的json文件路径。 121 | captions_file: 包含编码后文本描述的json文件路径。 122 | transform: 应用于图像的预处理转换。 123 | """ 124 | # 载入图像路径和文本描述以及caplens 125 | with open(image_paths_file, 'r') as f: 126 | self.image_paths = json.load(f) 127 | 128 | with open(captions_file, 'r') as f: 129 | self.captions = json.load(f) 130 | 131 | with open(caplens_file, 'r') as f: 132 | self.caplens = json.load(f) 133 | 134 | # 设置图像预处理方法 135 | self.transform = transform or transforms.Compose([ 136 | transforms.Resize((256, 256)), 137 | transforms.ToTensor(), 138 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 139 | ]) 140 | 141 | def __getitem__(self, index): 142 | """ 143 | 获取单个数据点。 144 | 参数: 145 | index: 数据点的索引。 146 | 返回: 147 | 一个包含图像和对应文本描述的元组。 148 | """ 149 | # 获取图像路径和文本描述以及caplen 150 | image_id = list(self.image_paths.keys())[index] 151 | image_path = self.image_paths[image_id] 152 | caption = self.captions[image_id] 153 | caplen = self.caplens[image_id] 154 | 155 | # 加载图像并应用预处理 156 | image = Image.open(image_path).convert('RGB') 157 | if self.transform is not None: 158 | image = self.transform(image) 159 | 160 | # 将文本描述转换为张量 161 | caption_tensor = torch.tensor(caption, dtype=torch.long) 162 | 163 | return image, caption_tensor, caplen 164 | 165 | def __len__(self): 166 | """ 167 | 数据集中的数据点总数。 168 | """ 169 | return len(self.image_paths) 170 | 171 | 172 | # 创建数据集实例 173 | # train_dataset = ImageTextDataset( 174 | # image_paths_file=os.path.join(config.output_folder, 'image_paths_train.json'), 175 | # captions_file=os.path.join(config.output_folder, 'encoded_captions_train.json'), 176 | # caplens_file=os.path.join(config.output_folder, 'caplens_train.json') 177 | # ) 178 | 179 | # # 示例:创建验证集实例 180 | # test_dataset = ImageTextDataset( 181 | # image_paths_file=os.path.join(config.output_folder, 'image_paths_test.json'), 182 | # captions_file=os.path.join(config.output_folder, 'encoded_captions_test.json'), 183 | # caplens_file=os.path.join(config.output_folder, 'caplens_test.json') 184 | # ) 185 | 186 | # 创建训练集和测试集的 DataLoader 187 | def create_dataloaders(config): 188 | """ 189 | 创建训练集和测试集的 DataLoader。 190 | 191 | 参数: 192 | batch_size: 每个批次的大小。 193 | num_workers: 加载数据时使用的进程数。 194 | shuffle_train: 是否打乱训练数据。 195 | 196 | 返回: 197 | train_loader: 训练数据的 DataLoader。 198 | test_loader: 测试数据的 DataLoader。 199 | """ 200 | # 图像预处理转换 201 | transform = transforms.Compose([ 202 | transforms.Resize((256, 256)), 203 | transforms.RandomCrop(224), 204 | transforms.ToTensor(), 205 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 206 | ]) 207 | 208 | # 加载数据时使用的进程数 209 | num_workers = 0 210 | 211 | # 创建数据集对象 212 | train_dataset = ImageTextDataset( 213 | image_paths_file=os.path.join(config.output_folder, 'image_paths_train.json'), 214 | captions_file=os.path.join(config.output_folder, 'encoded_captions_train.json'), 215 | caplens_file=os.path.join(config.output_folder, 'caplens_train.json'), 216 | transform=transform 217 | ) 218 | 219 | test_dataset = ImageTextDataset( 220 | image_paths_file=os.path.join(config.output_folder, 'image_paths_test.json'), 221 | captions_file=os.path.join(config.output_folder, 'encoded_captions_test.json'), 222 | caplens_file=os.path.join(config.output_folder, 'caplens_test.json'), 223 | transform=transform 224 | ) 225 | 226 | # 创建 
DataLoader 对象 227 | train_loader = DataLoader( 228 | dataset=train_dataset, 229 | batch_size=config.batch_size, 230 | shuffle=True, 231 | num_workers=num_workers, 232 | pin_memory=True 233 | ) 234 | 235 | test_loader = DataLoader( 236 | dataset=test_dataset, 237 | batch_size=config.batch_size, 238 | shuffle=False, # 通常测试集不需要打乱 239 | num_workers=num_workers, 240 | pin_memory=True 241 | ) 242 | 243 | return train_loader, test_loader 244 | 245 | 246 | config = Config() 247 | # 使用Config类中定义的配置来创建DataLoader 248 | train_loader, test_loader = create_dataloaders(config=config) 249 | 250 | 251 | # 测试 DataLoader 是否正确创建 252 | if __name__ == '__main__': 253 | for i, (images, captions, caplens) in enumerate(train_loader): 254 | print(f"Batch {i + 1}") 255 | print(f"Images shape: {images.size()}") 256 | print(f"Captions shape: {captions.size()}") 257 | if i == 1: # 仅打印前两个批次的信息 258 | break 259 | -------------------------------------------------------------------------------- /Model1_YellowOrange/datasets_pretrain_demo.py: -------------------------------------------------------------------------------- 1 | import json 2 | from PIL import Image 3 | import matplotlib.pyplot as plt 4 | 5 | vocab_path = '../data/output/vocab.json' 6 | encoded_captions_path = '../data/output/encoded_captions_train.json' 7 | image_paths_path = '../data/output/image_paths_train.json' 8 | 9 | # 读取词典、编码后的描述和图像路径 10 | with open(vocab_path, 'r') as f: 11 | vocab = json.load(f) 12 | 13 | with open(encoded_captions_path, 'r') as f: 14 | encoded_captions = json.load(f) 15 | 16 | with open(image_paths_path, 'r') as f: 17 | image_paths = json.load(f) 18 | 19 | # 将索引转换回单词 20 | vocab_idx2word = {idx: word for word, idx in vocab.items()} 21 | 22 | # 选择要展示的图片ID,这里以第一个ID为例 23 | first_img_id = list(image_paths.keys())[0] 24 | content_img = Image.open(image_paths[first_img_id]) 25 | 26 | # 展示图片和对应的描述 27 | plt.imshow(content_img) 28 | plt.axis('off') # 不显示坐标轴 29 | plt.show() 30 | 31 | # 打印对应的文本描述,确保字典中的键是整数,直接使用整数索引 32 | caption = ' '.join([vocab_idx2word[word_idx] for word_idx in encoded_captions[first_img_id]]) 33 | # caption = ' '.join([vocab_idx2word[str(word_idx)] for word_idx in encoded_captions[first_img_id]]) 34 | print(caption) 35 | -------------------------------------------------------------------------------- /Model1_YellowOrange/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from pycocoevalcap.cider.cider import Cider 4 | import numpy as np 5 | from configuartions import Config 6 | from torchvision.models import resnet101, ResNet101_Weights 7 | from torch.nn.utils.rnn import pack_padded_sequence 8 | import torch.optim as optim 9 | import json 10 | 11 | 12 | # 图像编码器 13 | # 使用ResNet-101作为图像编码器,并将其最后一个非全连接层作为网格表示提取层 14 | # class ImageEncoder(nn.Module): 15 | # def __init__(self, finetuned=True): 16 | # super(ImageEncoder, self).__init__() 17 | # model = torchvision.models.resnet101(weights=ResNet101_Weights.DEFAULT) 18 | # # ResNet-101网格表示提取器 19 | # self.grid_rep_extractor = nn.Sequential(*(list(model.children())[:-2])) 20 | # for param in self.grid_rep_extractor.parameters(): 21 | # param.requires_grad = finetuned 22 | # 23 | # def forward(self, images): 24 | # out = self.grid_rep_extractor(images) 25 | # return out 26 | 27 | # 引入自注意机制后的图像编码器 28 | class SelfAttention(nn.Module): 29 | def __init__(self, num_channels, num_heads=8, dropout=0.1): 30 | super(SelfAttention, self).__init__() 31 | self.num_heads = num_heads 32 | self.attention = 
nn.MultiheadAttention(num_channels, num_heads, dropout) 33 | 34 | def forward(self, x): 35 | # 保存原始形状 36 | orig_shape = x.shape 37 | # 打印输入形状 38 | # print("Input shape:", x.shape) 39 | # 转换为(sequence_length, batch_size, num_channels)格式 40 | x = x.flatten(2).permute(2, 0, 1) 41 | attention_output, _ = self.attention(x, x, x) 42 | # 还原形状,确保与原始输入形状匹配 43 | attention_output = attention_output.permute(1, 2, 0)# 打印最终输出形状 44 | # print("Final output shape:", attention_output.shape) 45 | return attention_output.view(orig_shape) 46 | 47 | 48 | class ImageEncoder(nn.Module): 49 | def __init__(self, finetuned=True, num_heads=8, dropout=0.1): 50 | super(ImageEncoder, self).__init__() 51 | # 使用ResNet101作为基础模型 52 | model = resnet101(weights=ResNet101_Weights.DEFAULT) 53 | self.grid_rep_extractor = nn.Sequential(*(list(model.children())[:-2])) 54 | # 设置参数是否可训练 55 | for param in self.grid_rep_extractor.parameters(): 56 | param.requires_grad = finetuned 57 | 58 | # 自注意力层 59 | self.self_attention = SelfAttention(model.fc.in_features, num_heads, dropout) 60 | 61 | def forward(self, images): 62 | # 通过ResNet网格表示提取器 63 | features = self.grid_rep_extractor(images) 64 | # print("Extractor output shape:", features.shape) 65 | # 应用自注意力 66 | features = self.self_attention(features) 67 | # 打印自注意力输出形状 68 | # print("Self-attention output shape:", features.shape) 69 | return features 70 | 71 | 72 | # 解码器的注意力机制 73 | class AdditiveAttention(nn.Module): 74 | def __init__(self, query_dim, key_dim, attn_dim): 75 | """ 76 | 参数: 77 | query_dim: 查询Q的维度 78 | key_dim: 键K的维度 79 | attn_dim: 注意力函数隐藏层表示的维度 80 | """ 81 | super(AdditiveAttention, self).__init__() 82 | self.attn_w_1_q = nn.Linear(query_dim, attn_dim) 83 | self.attn_w_1_k = nn.Linear(key_dim, attn_dim) 84 | self.attn_w_2 = nn.Linear(attn_dim, 1) 85 | self.tanh = nn.Tanh() 86 | self.softmax = nn.Softmax(dim=1) 87 | 88 | def forward(self, query, key_value): 89 | """ 90 | Q K V:Q和K算出相关性得分,作为V的权重,K=V 91 | 参数: 92 | query: 查询 (batch_size, q_dim) 93 | key_value: 键和值,(batch_size, n_kv, kv_dim) 94 | """ 95 | # (2)计算query和key的相关性,实现注意力评分函数 96 | # -> (batch_size, 1, attn_dim) 97 | queries = self.attn_w_1_q(query).unsqueeze(1) 98 | # -> (batch_size, n_kv, attn_dim) 99 | keys = self.attn_w_1_k(key_value) 100 | # -> (batch_size, n_kv) 101 | attn = self.attn_w_2(self.tanh(queries+keys)).squeeze(2) 102 | # (3)归一化相关性分数 103 | # -> (batch_size, n_kv) 104 | attn = self.softmax(attn) 105 | # (4)计算输出 106 | # (batch_size x 1 x n_kv)(batch_size x n_kv x kv_dim) 107 | # -> (batch_size, 1, kv_dim) 108 | output = torch.bmm(attn.unsqueeze(1), key_value).squeeze(1) 109 | return output, attn 110 | 111 | 112 | # 文本解码器 113 | # 注意:确保 vocab_size, embed_size, hidden_size 等参数数据集和配置匹配 114 | class AttentionDecoder(nn.Module): 115 | """ 116 | 初始化文本解码器。 117 | 118 | 参数: 119 | image_code_dim: 图像编码的维度。 120 | vocab_size: 词汇表的大小。 121 | word_dim: 词嵌入的维度。 122 | attention_dim: 注意力机制的隐藏层维度。 123 | hidden_size: GRU隐藏层的大小。 124 | num_layers: GRU层数。 125 | dropout: Dropout层的概率。 126 | """ 127 | def __init__(self, image_code_dim, vocab_size, word_dim, attention_dim, hidden_size, num_layers, dropout=0.5): 128 | super(AttentionDecoder, self).__init__() 129 | self.embed = nn.Embedding(vocab_size, word_dim) 130 | self.attention = AdditiveAttention(hidden_size, image_code_dim, attention_dim) 131 | self.init_state = nn.Linear(image_code_dim, num_layers * hidden_size) 132 | self.rnn = nn.GRU(word_dim + image_code_dim, hidden_size, num_layers) 133 | self.dropout = nn.Dropout(p=dropout) 134 | self.fc = nn.Linear(hidden_size, vocab_size) 
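        # Shape summary for the layers above (matches forward_step below):
        #   embed:      (batch, seq_len) -> (batch, seq_len, word_dim)
        #   attention:  query (batch, hidden_size), keys (batch, n_grids, image_code_dim)
        #               -> context (batch, image_code_dim)
        #   init_state: mean image code (batch, image_code_dim) -> (batch, num_layers * hidden_size)
        #   rnn:        input per step is [context; word embedding], i.e. word_dim + image_code_dim
        #   fc:         (batch, hidden_size) -> (batch, vocab_size)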
135 | # RNN默认已初始化 136 | self.init_weights() 137 | 138 | def init_weights(self): 139 | self.embed.weight.data.uniform_(-0.1, 0.1) 140 | self.fc.bias.data.fill_(0) 141 | self.fc.weight.data.uniform_(-0.1, 0.1) 142 | 143 | def init_hidden_state(self, image_code, captions, cap_lens): 144 | """ 145 | 初始化隐藏状态。 146 | 147 | 参数: 148 | image_code:图像编码器输出的图像表示 149 | (batch_size, image_code_dim, grid_height, grid_width) 150 | captions: 文本描述。 151 | cap_lens: 文本描述的长度。 152 | """ 153 | # 将图像网格表示转换为序列表示形式 154 | batch_size, image_code_dim = image_code.size(0), image_code.size(1) 155 | # -> (batch_size, grid_height, grid_width, image_code_dim) 156 | image_code = image_code.permute(0, 2, 3, 1) 157 | # -> (batch_size, grid_height * grid_width, image_code_dim) 158 | image_code = image_code.view(batch_size, -1, image_code_dim) 159 | # (1)按照caption的长短排序 160 | sorted_cap_lens, sorted_cap_indices = torch.sort(cap_lens, 0, True) 161 | captions = captions[sorted_cap_indices] 162 | image_code = image_code[sorted_cap_indices] 163 | # (2)初始化隐状态 164 | hidden_state = self.init_state(image_code.mean(axis=1)) 165 | hidden_state = hidden_state.view( 166 | batch_size, 167 | self.rnn.num_layers, 168 | self.rnn.hidden_size).permute(1, 0, 2) 169 | return image_code, captions, sorted_cap_lens, sorted_cap_indices, hidden_state 170 | 171 | def forward_step(self, image_code, curr_cap_embed, hidden_state): 172 | """ 173 | 解码器的前馈步骤。 174 | 175 | 参数: 176 | image_code: 图像编码。 177 | curr_cap_embed: 当前时间步的词嵌入向量。 178 | hidden_state: 当前的隐藏状态。 179 | """ 180 | # (3.2)利用注意力机制获得上下文向量 181 | # query:hidden_state[-1],即最后一个隐藏层输出 (batch_size, hidden_size) 182 | # context: (batch_size, hidden_size) 183 | context, alpha = self.attention(hidden_state[-1], image_code) 184 | # (3.3)以上下文向量和当前时刻词表示为输入,获得GRU输出 185 | x = torch.cat((context, curr_cap_embed), dim=-1).unsqueeze(0) 186 | # x: (1, real_batch_size, hidden_size+word_dim) 187 | # out: (1, real_batch_size, hidden_size) 188 | out, hidden_state = self.rnn(x, hidden_state) 189 | # (3.4)获取该时刻的预测结果 190 | # (real_batch_size, vocab_size) 191 | preds = self.fc(self.dropout(out.squeeze(0))) 192 | return preds, alpha, hidden_state 193 | 194 | def forward(self, image_code, captions, cap_lens): 195 | """ 196 | 完整的前馈过程。 197 | 198 | 参数: 199 | hidden_state: (num_layers, batch_size, hidden_size) 200 | image_code: (batch_size, feature_channel, feature_size) 201 | captions: (batch_size, ) 202 | """ 203 | # (1)将图文数据按照文本的实际长度从长到短排序 204 | # (2)获得GRU的初始隐状态 205 | image_code, captions, sorted_cap_lens, sorted_cap_indices, hidden_state \ 206 | = self.init_hidden_state(image_code, captions, cap_lens) 207 | batch_size = image_code.size(0) 208 | # 输入序列长度减1,因为最后一个时刻不需要预测下一个词 209 | lengths = sorted_cap_lens.cpu().numpy() - 1 210 | # 初始化变量:模型的预测结果和注意力分数 211 | max_cap_len = max(cap_lens) # 计算最长caption的长度 212 | predictions = torch.zeros(batch_size, max_cap_len, self.fc.out_features).to(captions.device) 213 | alphas = torch.zeros(batch_size, max_cap_len, image_code.shape[1]).to(captions.device) 214 | # predictions = torch.zeros(batch_size, lengths[0], self.fc.out_features).to(captions.device) 215 | # alphas = torch.zeros(batch_size, lengths[0], image_code.shape[1]).to(captions.device) 216 | # 获取文本嵌入表示 cap_embeds: (batch_size, num_steps, word_dim) 217 | cap_embeds = self.embed(captions) 218 | # Teacher-Forcing模式 219 | for step in range(lengths[0]): 220 | # (3)解码 221 | # (3.1)模拟pack_padded_sequence函数的原理,获取该时刻的非输入 222 | real_batch_size = np.where(lengths > step)[0].shape[0] 223 | preds, alpha, hidden_state = self.forward_step( 224 | 
image_code[:real_batch_size], 225 | cap_embeds[:real_batch_size, step, :], 226 | hidden_state[:, :real_batch_size, :].contiguous()) 227 | # 记录结果 228 | predictions[:real_batch_size, step, :] = preds 229 | alphas[:real_batch_size, step, :] = alpha 230 | 231 | # 新增逻辑来调整输出长度 232 | # 找出最长的caption长度 233 | max_cap_len = max(cap_lens) 234 | # 初始化一个填充的predictions张量 235 | padded_predictions = torch.zeros(batch_size, max_cap_len, self.fc.out_features).to(predictions.device) 236 | for i in range(batch_size): 237 | # 当前样本的实际长度 238 | actual_length = cap_lens[i] 239 | # 只拷贝实际长度的预测结果 240 | padded_predictions[i, :actual_length, :] = predictions[i, :actual_length, :] 241 | 242 | return padded_predictions, alphas, captions, lengths, sorted_cap_indices 243 | 244 | 245 | # AttentionModel 模型 246 | ''' 247 | 注意:确保 image_code_dim 等参数与 ImageEncoder 的输出匹配 248 | 249 | 最终 ImageEncoder 的输出形状仍然是 (batch_size, num_channels, height, width)。 250 | 这意味着 image_code_dim 应该设置为 num_channels,即 ResNet101 最后一个卷积层的输出通道数。这个值通常为2048, 251 | ''' 252 | class AttentionModel(nn.Module): 253 | def __init__(self, image_code_dim, vocab, word_dim, attention_dim, hidden_size, num_layers): 254 | super(AttentionModel, self).__init__() 255 | self.vocab = vocab 256 | self.encoder = ImageEncoder() 257 | self.decoder = AttentionDecoder(image_code_dim, len(vocab), word_dim, attention_dim, hidden_size, num_layers) 258 | 259 | def forward(self, images, captions, cap_lens): 260 | # 打印图像输入形状 261 | # print("Image input shape:", images.shape) 262 | image_code = self.encoder(images) 263 | # 打印编码器输出形状 264 | # print("Encoder output shape:", image_code.shape) 265 | output = self.decoder(image_code, captions, cap_lens) 266 | # 打印解码器输出形状 267 | # print("Decoder output shape:", output[0].shape) # Assuming output[0] is the main output 268 | return output 269 | 270 | def generate_by_beamsearch(self, images, beam_k, max_len): 271 | vocab_size = len(self.vocab) 272 | image_codes = self.encoder(images) 273 | texts = [] 274 | device = images.device 275 | # 对每个图像样本执行束搜索 276 | for image_code in image_codes: 277 | # 将图像表示复制k份 278 | image_code = image_code.unsqueeze(0).repeat(beam_k, 1, 1, 1) 279 | # 生成k个候选句子,初始时,仅包含开始符号 280 | cur_sents = torch.full((beam_k, 1), self.vocab[''], dtype=torch.long).to(device) 281 | cur_sent_embed = self.decoder.embed(cur_sents)[:, 0, :] 282 | sent_lens = torch.LongTensor([1] * beam_k).to(device) 283 | # 获得GRU的初始隐状态 284 | image_code, cur_sent_embed, _, _, hidden_state = \ 285 | self.decoder.init_hidden_state(image_code, cur_sent_embed, sent_lens) 286 | # 存储已生成完整的句子(以句子结束符结尾的句子) 287 | end_sents = [] 288 | # 存储已生成完整的句子的概率 289 | end_probs = [] 290 | # 存储未完整生成的句子的概率 291 | probs = torch.zeros(beam_k, 1).to(device) 292 | k = beam_k 293 | while True: 294 | preds, _, hidden_state = self.decoder.forward_step(image_code[:k], cur_sent_embed, 295 | hidden_state.contiguous()) 296 | # -> (k, vocab_size) 297 | preds = nn.functional.log_softmax(preds, dim=1) 298 | # 对每个候选句子采样概率值最大的前k个单词生成k个新的候选句子,并计算概率 299 | # -> (k, vocab_size) 300 | probs = probs.repeat(1, preds.size(1)) + preds 301 | if cur_sents.size(1) == 1: 302 | # 第一步时,所有句子都只包含开始标识符,因此,仅利用其中一个句子计算topk 303 | values, indices = probs[0].topk(k, 0, True, True) 304 | else: 305 | # probs: (k, vocab_size) 是二维张量 306 | # topk函数直接应用于二维张量会按照指定维度取最大值,这里需要在全局取最大值 307 | # 因此,将probs转换为一维张量,再使用topk函数获取最大的k个值 308 | values, indices = probs.view(-1).topk(k, 0, True, True) 309 | # 计算最大的k个值对应的句子索引和词索引 310 | sent_indices = torch.div(indices, vocab_size, rounding_mode='trunc') 311 | word_indices = indices % vocab_size 312 | # 
将词拼接在前一轮的句子后,获得此轮的句子 313 | cur_sents = torch.cat([cur_sents[sent_indices], word_indices.unsqueeze(1)], dim=1) 314 | # 查找此轮生成句子结束符的句子 315 | end_indices = [idx for idx, word in enumerate(word_indices) if word == self.vocab['']] 316 | if len(end_indices) > 0: 317 | end_probs.extend(values[end_indices]) 318 | end_sents.extend(cur_sents[end_indices].tolist()) 319 | # 如果所有的句子都包含结束符,则停止生成 320 | k -= len(end_indices) 321 | if k == 0: 322 | break 323 | # 查找还需要继续生成词的句子 324 | cur_indices = [idx for idx, word in enumerate(word_indices) 325 | if word != self.vocab['']] 326 | if len(cur_indices) > 0: 327 | cur_sent_indices = sent_indices[cur_indices] 328 | cur_word_indices = word_indices[cur_indices] 329 | # 仅保留还需要继续生成的句子、句子概率、隐状态、词嵌入 330 | cur_sents = cur_sents[cur_indices] 331 | probs = values[cur_indices].view(-1, 1) 332 | hidden_state = hidden_state[:, cur_sent_indices, :] 333 | cur_sent_embed = self.decoder.embed( 334 | cur_word_indices.view(-1, 1))[:, 0, :] 335 | # 句子太长,停止生成 336 | if cur_sents.size(1) >= max_len: 337 | break 338 | if len(end_sents) == 0: 339 | # 如果没有包含结束符的句子,则选取第一个句子作为生成句子 340 | gen_sent = cur_sents[0].tolist() 341 | else: 342 | # 否则选取包含结束符的句子中概率最大的句子 343 | gen_sent = end_sents[end_probs.index(max(end_probs))] 344 | texts.append(gen_sent) 345 | return texts 346 | 347 | 348 | # 损失函数 349 | class PackedCrossEntropyLoss(nn.Module): 350 | def __init__(self): 351 | super(PackedCrossEntropyLoss, self).__init__() 352 | self.loss_fn = nn.CrossEntropyLoss() 353 | 354 | def forward(self, predictions, targets, lengths): 355 | """ 356 | 计算交叉熵损失,排除填充的部分。 357 | 参数: 358 | predictions:模型的预测结果,形状为 (batch_size, max_length, vocab_size)。 359 | targets:实际的文本描述,形状为 (batch_size, max_length)。 360 | lengths:每个描述的实际长度。 361 | """ 362 | # 使用 pack_padded_sequence 来处理变长序列 363 | # 这里 predictions 和 targets 都需要进行 pack 操作 364 | # 由于 pack_padded_sequence 需要长度从长到短的序列,这里假设输入已经是这种格式 365 | packed_predictions = pack_padded_sequence(predictions, lengths, batch_first=True, enforce_sorted=False)[0] 366 | packed_targets = pack_padded_sequence(targets, lengths, batch_first=True, enforce_sorted=False)[0] 367 | 368 | # 计算损失,忽略填充的部分 369 | loss = self.loss_fn(packed_predictions, packed_targets) 370 | return loss 371 | 372 | 373 | def get_optimizer(model, config): 374 | """ 375 | 获取优化器,为模型的不同部分设置不同的学习速率。 376 | 参数: 377 | model:训练模型。 378 | config:包含配置信息的对象,如学习速率等。 379 | 返回: 380 | 配置好地优化器。 381 | """ 382 | # 为编码器和解码器设置不同的学习速率 383 | encoder_params = filter(lambda p: p.requires_grad, model.encoder.parameters()) 384 | decoder_params = filter(lambda p: p.requires_grad, model.decoder.parameters()) 385 | 386 | # 创建优化器,分别对这两部分参数应用不同的学习速率 387 | optimizer = optim.Adam([ 388 | {"params": encoder_params, "lr": config.encoder_learning_rate}, 389 | {"params": decoder_params, "lr": config.decoder_learning_rate} 390 | ]) 391 | 392 | return optimizer 393 | 394 | # 以下函数是为了展示如何在训练过程中调整学习速率,实际上可能并未使用 395 | def adjust_learning_rate(optimizer, epoch, config): 396 | """ 397 | 调整学习速率,每隔一定轮次减少到原来的十分之一。 398 | 参数: 399 | optimizer:优化器。 400 | epoch:当前轮次。 401 | config:包含配置信息的对象。 402 | """ 403 | for param_group in optimizer.param_groups: 404 | if param_group['name'] == 'encoder': 405 | param_group['lr'] = config.encoder_learning_rate * (0.1 ** (epoch // config.lr_update)) 406 | else: 407 | param_group['lr'] = config.decoder_learning_rate * (0.1 ** (epoch // config.lr_update)) 408 | 409 | 410 | # CIDEr-D 评估 411 | def filter_useless_words(sent, filterd_words): 412 | # 去除句子中不参与CIDEr-D计算的符号 413 | return [w for w in sent if w not in filterd_words] 414 | 415 | 416 | def 
evaluate_cider(data_loader, model, config): 417 | model.eval() 418 | # 存储候选文本和参考文本 419 | cands = {} 420 | refs = {} 421 | filterd_words = {model.vocab[''], model.vocab[''], model.vocab['']} 422 | device = next(model.parameters()).device 423 | 424 | # 加载词汇表并创建反向词汇表 425 | with open('../data/output/vocab.json', 'r') as f: 426 | vocab = json.load(f) 427 | idx_to_word = {idx: word for word, idx in vocab.items()} 428 | 429 | for i, (imgs, caps, caplens) in enumerate(data_loader): 430 | imgs = imgs.to(device) 431 | # 通过束搜索生成描述 432 | preds = model.generate_by_beamsearch(imgs, config.beam_k, config.max_len) 433 | for j in range(imgs.size(0)): 434 | img_id = str(i * config.batch_size + j) 435 | cand_words = [idx_to_word.get(word, '') for word in preds[j]] 436 | cand = ' '.join(filter_useless_words(cand_words, filterd_words)) 437 | cands[img_id] = [cand] # 候选描述 438 | # 将参考描述(caps[j])的每个索引转换为单词 439 | ref_words = [idx_to_word.get(word.item(), '') for word in caps[j]] 440 | refs[img_id] = [' '.join(filter_useless_words(ref_words, filterd_words))] # 参考描述 441 | 442 | # # 在调用 compute_score 之前添加调试信息 443 | # for key, value in cands.items(): 444 | # print(f"Key: {key}, Value type: {type(value)}, Value: {value}") 445 | # assert isinstance(value, list), f"Value for key {key} is not a list in cands" 446 | # 447 | # for key, value in refs.items(): 448 | # print(f"Key: {key}, Value type: {type(value)}, Value: {value}") 449 | # assert isinstance(value, list), f"Value for key {key} is not a list in refs" 450 | 451 | # 计算CIDEr-D得分 452 | cider_evaluator = Cider() 453 | score, _ = cider_evaluator.compute_score(refs, cands) 454 | # score, _ = cider_evaluator.compute_score({'dummy': refs}, {'dummy': cands}) 455 | 456 | model.train() 457 | return score 458 | 459 | 460 | 461 | # encoder = ImageEncoder(Config.embed_size) 462 | # decoder = AttentionDecoder(Config.embed_size, Config.vocab_size, Config.hidden_size, Config.num_layers) 463 | # arctic_model = ARCTIC(encoder, decoder) 464 | 465 | # 示例:前馈过程 466 | # images = ... # 从数据集中获取图像 467 | # captions = ... 
# 从数据集中获取对应的文本描述 468 | # 输出 = arctic_model(images, captions) 469 | -------------------------------------------------------------------------------- /Model1_YellowOrange/predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from torchvision import transforms 4 | from models import AttentionModel 5 | from configurations import Config 6 | import json 7 | 8 | def load_model(model_path, vocab, config): 9 | model = AttentionModel( 10 | image_code_dim=config.image_code_dim, 11 | vocab=vocab, # 传递词汇表字典 12 | word_dim=config.word_dim, 13 | attention_dim=config.attention_dim, 14 | hidden_size=config.hidden_size, 15 | num_layers=config.num_layers 16 | ) 17 | model.load_state_dict(torch.load(model_path)) 18 | model = model.to(config.device) 19 | model.eval() # 将模型设置为评估模式 20 | return model 21 | 22 | def process_image(image_path): 23 | transform = transforms.Compose([ 24 | transforms.Resize((256, 256)), 25 | transforms.ToTensor(), 26 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 27 | ]) 28 | image = Image.open(image_path).convert('RGB') 29 | image_tensor = transform(image).unsqueeze(0) # 添加一个批次维度 30 | return image_tensor 31 | 32 | def predict_caption(model, image_tensor, vocab, config): 33 | # 生成束搜索描述 34 | predictions = model.generate_by_beamsearch(image_tensor.to(config.device), config.beam_k, config.max_len) 35 | # 将词索引转换回文字 36 | idx_to_word = {idx: word for word, idx in vocab.items()} 37 | caption_words = [idx_to_word.get(word, '') for word in predictions[0]] 38 | caption = ' '.join(caption_words) 39 | return caption 40 | 41 | def main(): 42 | # 载入配置和词汇表 43 | config = Config() 44 | with open('../data/output/vocab_caption_1.json', 'r') as f: 45 | vocab = json.load(f) 46 | 47 | # 加载模型 48 | model_path = '../data/output/weights/.pth' # 使用正确的模型文件路径 49 | model = load_model(model_path, vocab, config) 50 | 51 | # 处理图片并生成描述 52 | image_path = '../data/images_1/MEN-Denim-id_00000080-01_7_additional.jpg' # 测试图片路径 53 | image_tensor = process_image(image_path) 54 | caption = predict_caption(model, image_tensor, vocab, config) 55 | 56 | print("Generated Caption:", caption) 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | 62 | """ 63 | model = ... # 加载模型 64 | 65 | images_folder = "..." # 图片文件夹路径 66 | captions_dict = {} # 字典 67 | 68 | count = 1 # 计数 69 | 70 | for filename in os.listdir(images_folder): 71 | if filename.endswith(".jpg") or filename.endswith(".png"): 72 | img_path = os.path.join(images_folder, filename) 73 | 74 | # Load the image 75 | raw_image = Image.open(img_path).convert('RGB') 76 | 77 | generated_caption = ... # 生成caption 78 | 79 | print(f"No{count}", generated_caption) 80 | count += 1 81 | 82 | # Store the caption in the dictionary 83 | captions_dict[img_path] = generated_caption 84 | 85 | # Save the dictionary to captions.json 86 | output_path = "..." 
# 保存路径 87 | with open(output_path, 'w') as json_file: 88 | json.dump(captions_dict, json_file, indent=4) 89 | 90 | print(f"Captions saved to {output_path}") 91 | """ -------------------------------------------------------------------------------- /Model1_YellowOrange/requirements.txt: -------------------------------------------------------------------------------- 1 | torch == 2.1.1 2 | torchvision == 0.16.1 3 | transformers == 4.32.1 4 | pycocoevalcap -------------------------------------------------------------------------------- /Model1_YellowOrange/train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import os 4 | from configuartions import Config 5 | from models import AttentionModel, get_optimizer, PackedCrossEntropyLoss, evaluate_cider 6 | from datasets import create_dataloaders, ImageTextDataset 7 | 8 | 9 | def main(): 10 | best_test_score = float('-inf') # 初始化最佳测试得分 11 | 12 | # 加载配置 13 | config = Config() 14 | 15 | # 创建数据加载器 16 | train_loader, test_loader = create_dataloaders(config) 17 | 18 | # 加载词汇表文件 19 | with open('../data/output/vocab.json', 'r') as f: 20 | vocab = json.load(f) 21 | 22 | # 模型初始化 23 | model = AttentionModel( 24 | image_code_dim=config.image_code_dim, 25 | vocab=vocab, # 传递词汇表字典 26 | word_dim=config.word_dim, 27 | attention_dim=config.attention_dim, 28 | hidden_size=config.hidden_size, 29 | num_layers=config.num_layers 30 | ).to(config.device) 31 | 32 | # 优化器 33 | optimizer = get_optimizer(model, config) 34 | 35 | # 损失函数 36 | loss_fn = PackedCrossEntropyLoss().to(config.device) 37 | 38 | # 创建保存权重的文件夹路径 39 | weights_dir = os.path.join(config.output_folder, 'weights') 40 | os.makedirs(weights_dir, exist_ok=True) 41 | 42 | best_val_score = float('-inf') # 初始化最佳验证得分 43 | 44 | # 开始训练 45 | for epoch in range(config.num_epochs): 46 | # 训练模型 47 | model.train() 48 | for i, (imgs, caps, caplens) in enumerate(train_loader): 49 | imgs, caps = imgs.to(config.device), caps.to(config.device) 50 | caplens = caplens.cpu().to(torch.int64) 51 | 52 | optimizer.zero_grad() 53 | outputs, alphas, _, _, _ = model(imgs, caps, caplens) 54 | 55 | # 确保目标序列长度与模型输出匹配 56 | targets = caps[:, 1:] # 假设targets是captions去除第一个标记后的部分 57 | # print(f"Outputs shape: {outputs.shape}") 58 | # print(f"Targets shape: {targets.shape}") 59 | # print(f"Caplens: {caplens}") 60 | loss = loss_fn(outputs, targets, caplens) 61 | loss.backward() 62 | optimizer.step() 63 | 64 | # 打印/记录损失信息 65 | if (i + 1) % 100 == 0: 66 | print(f'Epoch [{epoch + 1}/{config.num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}') 67 | 68 | # 在每个epoch结束时使用测试集评估模型 69 | current_test_score = evaluate_cider(test_loader, model, config) 70 | print(f"Epoch {epoch + 1}: CIDEr-D score = {current_test_score}") 71 | 72 | # 如果当前得分比之前的最佳得分要好,则保存模型 73 | if current_test_score > best_test_score: 74 | best_test_score = current_test_score 75 | best_model_path = os.path.join(weights_dir, f'Attention_model_background_caption_{best_test_score}.pth') 76 | torch.save(model.state_dict(), best_model_path) 77 | print(f"Saved new best model to {best_model_path}") 78 | 79 | # 训练完成后的最终评估 80 | final_test_score = evaluate_cider(test_loader, model, config) 81 | print(f"Final CIDEr-D score = {final_test_score}") 82 | 83 | # # 训练完成后保存模型 84 | # final_model_path = os.path.join(weights_dir, 'AttentionModel.pth') 85 | # torch.save(model.state_dict(), final_model_path) 86 | # print(f"Saved final model to {final_model_path}") 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 
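The PackedCrossEntropyLoss imported by train.py above drops padded time steps with pack_padded_sequence before applying the cross-entropy (indexing the returned PackedSequence with [0], as the loss class does, is the same as taking its .data field). A minimal, self-contained sketch of that behaviour with toy shapes; all sizes and names here are illustrative rather than taken from the project configuration:

```python
# Illustrative shapes only: how pack_padded_sequence removes padded positions
# before the cross-entropy, mirroring PackedCrossEntropyLoss above.
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence

batch_size, max_len, vocab_size = 2, 5, 10
predictions = torch.randn(batch_size, max_len, vocab_size)    # (B, T, V) logits
targets = torch.randint(0, vocab_size, (batch_size, max_len)) # (B, T) token ids
lengths = torch.tensor([5, 3])                                # true caption lengths

# .data keeps only the first lengths[i] steps of each sequence, flattened
packed_preds = pack_padded_sequence(predictions, lengths, batch_first=True,
                                    enforce_sorted=False).data   # (sum(lengths), V)
packed_tgts = pack_padded_sequence(targets, lengths, batch_first=True,
                                   enforce_sorted=False).data    # (sum(lengths),)

loss = nn.CrossEntropyLoss()(packed_preds, packed_tgts)
print(packed_preds.shape, packed_tgts.shape, loss.item())
```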
| -------------------------------------------------------------------------------- /Model2_Transformer/TransformerE+D.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "我们首先使用 argparse 库解析命令行参数,获取图像路径、模型版本和 Checkpoint 路径;其次根据命令行参数加载预训练模型,或者从 Checkpoint 加载模型(可选);紧接着使用 PIL 库打开图像,并进行预处理;然后使用模型生成图像的描述;最后使用 METEOR 和 ROUGE-L 评估生成的描述与参考描述的相似度。" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import json\n", 17 | "import torch\n", 18 | "from torch.utils.data import Dataset, DataLoader\n", 19 | "from torchvision import transforms\n", 20 | "from transformers import BertTokenizer\n", 21 | "from PIL import Image\n", 22 | "from models import caption\n", 23 | "from configuration import Config\n", 24 | "\n", 25 | "# 数据集类\n", 26 | "class MyDataset(Dataset):\n", 27 | " def __init__(self, json_file, img_dir, transform=None):\n", 28 | " with open(json_file, 'r') as f:\n", 29 | " self.data = json.load(f)\n", 30 | " self.img_dir = img_dir\n", 31 | " self.transform = transform\n", 32 | " self.filenames = list(self.data.keys())\n", 33 | "\n", 34 | " def __len__(self):\n", 35 | " return len(self.data)\n", 36 | "\n", 37 | " def __getitem__(self, idx):\n", 38 | " filename = self.filenames[idx]\n", 39 | " caption = self.data[filename]\n", 40 | " image = Image.open(f\"{self.img_dir}/{filename}\")\n", 41 | " if self.transform:\n", 42 | " image = self.transform(image)\n", 43 | " return image, caption" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "导入必要的库:这段代码首先导入了一些必要的库,包括json(用于处理JSON文件),torch(PyTorch库,用于深度学习),Dataset和DataLoader(PyTorch中的数据加载工具),transforms(torchvision库中的图像预处理工具),BertTokenizer(transformers库中的BERT模型的分词器),Image(PIL库中的图像处理工具),以及caption和Config(用户自定义的模块)。\n", 51 | "\n", 52 | "定义数据集类:定义了一个名为MyDataset的类,这个类继承自PyTorch的Dataset基类。\n", 53 | "\n", 54 | "初始化方法:在__init__方法中,这个类接受一个JSON文件的路径、一个图像目录的路径和一个可选的图像转换函数。JSON文件中应该包含图像文件名和对应的标题。这个方法首先读取JSON文件并将其内容保存在self.data中,然后保存图像目录的路径和图像转换函数。最后,它从self.data中提取所有的文件名并保存在self.filenames中。\n", 55 | "\n", 56 | "长度方法:__len__方法返回数据集中的样本数量,这是通过返回self.data的长度来实现的。\n", 57 | "\n", 58 | "获取项方法:__getitem__方法接受一个索引idx,并返回对应的图像和标题。它首先从self.filenames中获取文件名,然后从self.data中获取对应的标题。接着,它打开对应的图像文件,并如果提供了图像转换函数,就对图像进行转换。最后,它返回图像和标题。" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# 检查是否有可用的GPU\n", 68 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 69 | "\n", 70 | "# 加载模型\n", 71 | "config = Config()\n", 72 | "model = torch.hub.load('saahiluppal/catr', 'v3', pretrained=True)\n", 73 | "model = model.to(device) # 将模型移动到指定的设备上\n", 74 | "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", 75 | "\n", 76 | "# 准备数据集\n", 77 | "transform = transforms.Compose([\n", 78 | " transforms.ToTensor(),\n", 79 | " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", 80 | "])\n", 81 | "\n", 82 | "train_dataset = MyDataset('../data/train_captions.json', '../data/train_images', transform=transform)\n", 83 | "train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | 
"检查并设置设备:首先,代码检查是否有可用的GPU(图形处理器),如果有,就使用GPU,否则使用CPU。这是通过torch.device函数实现的,这个函数接受一个字符串参数,指定要使用的设备。如果torch.cuda.is_available()返回True,则使用字符串'cuda',否则使用字符串'cpu'。这个设备对象被保存在变量device中,以便后续使用。\n", 91 | "\n", 92 | "加载模型:然后,代码创建了一个Config对象(这是在前面的代码中定义的一个类),并将其保存在变量config中。接着,它使用torch.hub.load函数从torch.hub(一个预训练模型的仓库)加载一个模型。这个函数接受三个参数:模型的仓库名(在这里是'saahiluppal/catr'),模型的版本(在这里是'v3'),以及一个布尔值,指定是否要加载预训练的权重(在这里是True)。加载的模型被保存在变量model中。然后,模型被移动到前面指定的设备上,这是通过调用模型的to方法并传入设备对象实现的。最后,代码使用BertTokenizer.from_pretrained方法加载了一个预训练的BERT分词器,并将其保存在变量tokenizer中。\n", 93 | "\n", 94 | "准备数据集:代码首先定义了一个图像转换函数,这是通过transforms.Compose函数实现的,这个函数接受一个转换函数的列表,并返回一个新的转换函数,这个新的转换函数会按照列表中的顺序依次应用这些转换函数。在这里,转换函数的列表包含两个函数:transforms.ToTensor(将图像转换为PyTorch张量)和transforms.Normalize(对图像进行标准化)。然后,代码创建了一个MyDataset对象(这是在前面的代码中定义的一个类),并将其保存在变量train_dataset中。这个对象接受三个参数:一个JSON文件的路径(包含训练集的标题),一个图像目录的路径(包含训练集的图像),以及前面定义的图像转换函数。最后,代码创建了一个DataLoader对象,并将其保存在变量train_dataloader中。这个对象接受三个参数:一个数据集对象,一个批量大小(在这里是16),以及一个布尔值,指定是否要在每个训练周期开始时打乱数据集。" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# 定义损失函数和优化器\n", 104 | "criterion = torch.nn.CrossEntropyLoss()\n", 105 | "optimizer = torch.optim.Adam(model.parameters())\n", 106 | "\n", 107 | "# 训练循环\n", 108 | "num_epochs = 10\n", 109 | "for epoch in range(num_epochs):\n", 110 | " for images, captions in train_dataloader:\n", 111 | " images = images.to(device) # 将图像数据移动到指定的设备上\n", 112 | " captions = tokenizer(captions, return_tensors='pt', padding=True, truncation=True)\n", 113 | " captions = {key: val.to(device) for key, val in captions.items()} # 将caption数据移动到指定的设备上\n", 114 | "\n", 115 | " outputs = model(images, captions['input_ids'], captions['attention_mask'])\n", 116 | " loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), captions['input_ids'].view(-1))\n", 117 | "\n", 118 | " optimizer.zero_grad()\n", 119 | " loss.backward()\n", 120 | " optimizer.step()\n", 121 | "\n", 122 | " print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')\n", 123 | "\n", 124 | "# 保存模型\n", 125 | "torch.save(model.state_dict(), 'Model2.pth')" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "定义损失函数和优化器:首先,代码定义了一个交叉熵损失函数(torch.nn.CrossEntropyLoss()),并将其保存在变量criterion中。然后,它定义了一个Adam优化器(torch.optim.Adam(model.parameters())),并将其保存在变量optimizer中。这个优化器接受模型的参数作为输入。\n", 133 | "\n", 134 | "训练循环:代码定义了一个训练循环,这个循环将运行指定的周期数(在这里是10)。在每个周期中,代码遍历训练数据加载器train_dataloader,对于每一批图像和标题,它首先将图像数据移动到前面指定的设备上,然后使用BERT分词器对标题进行分词,并将返回的张量数据也移动到指定的设备上。接着,它将图像和标题数据传入模型,得到模型的输出,然后使用损失函数计算损失。然后,它将优化器的梯度清零,计算损失的反向传播,然后更新优化器的参数。最后,它打印出当前的周期数和损失值。\n", 135 | "\n", 136 | "保存模型:在训练循环结束后,代码使用torch.save函数保存模型的状态字典。这个函数接受两个参数:要保存的对象(在这里是模型的状态字典)和保存的文件名(在这里是'Model2.pth')。这样,训练好的模型就被保存下来,以便后续使用。" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "language_info": { 142 | "name": "python" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 2 147 | } 148 | -------------------------------------------------------------------------------- /Model2_Transformer/__pycache__/configuration.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/__pycache__/configuration.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/configuration.py: 
-------------------------------------------------------------------------------- 1 | class Config(object): 2 | def __init__(self): 3 | 4 | # Learning Rates 5 | self.lr_backbone = 1e-5 6 | self.lr = 1e-4 7 | 8 | # Epochs 9 | self.epochs = 30 10 | self.lr_drop = 20 11 | self.start_epoch = 0 12 | self.weight_decay = 1e-4 13 | 14 | # Backbone 15 | self.backbone = 'resnet101' 16 | self.position_embedding = 'sine' 17 | self.dilation = True 18 | 19 | # Basic 20 | self.device = 'cuda' 21 | self.seed = 42 22 | self.batch_size = 32 23 | self.num_workers = 8 24 | self.checkpoint = './checkpoint.pth' 25 | self.clip_max_norm = 0.1 26 | 27 | # Transformer 28 | self.hidden_dim = 256 29 | self.pad_token_id = 0 30 | self.max_position_embeddings = 128 31 | self.layer_norm_eps = 1e-12 32 | self.dropout = 0.1 33 | self.vocab_size = 30522 34 | 35 | self.enc_layers = 6 36 | self.dec_layers = 6 37 | self.dim_feedforward = 2048 38 | self.nheads = 8 39 | self.pre_norm = True 40 | 41 | # Dataset 42 | self.dir = '../coco' 43 | self.limit = -1 -------------------------------------------------------------------------------- /Model2_Transformer/data_preprocessing/divide_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | 5 | # 读取json文件并转换为字典 6 | with open('../../data/test_captions.json', 'r') as f: 7 | test_captions = json.load(f) 8 | 9 | with open('../../data/train_captions.json', 'r') as f: 10 | train_captions = json.load(f) 11 | 12 | # 指定源目录和目标目录 13 | source_directory = '../../data/images' 14 | train_directory = '../../data/train_images' 15 | test_directory = '../../data/test_images' 16 | 17 | # 确保目标目录存在 18 | os.makedirs(train_directory, exist_ok=True) 19 | os.makedirs(test_directory, exist_ok=True) 20 | 21 | # 将训练集图片复制到目标目录 22 | for image in train_captions: 23 | shutil.copy(os.path.join(source_directory, image), train_directory) 24 | 25 | # 将测试集图片复制到目标目录 26 | for image in test_captions: 27 | shutil.copy(os.path.join(source_directory, image), test_directory) 28 | -------------------------------------------------------------------------------- /Model2_Transformer/data_preprocessing/name_info.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | 5 | # 定义一个函数来解析文件名 6 | def parse_filename(filename): 7 | # 使用正则表达式匹配文件名 8 | pattern = r'^(?P\w+)-(?P[\w_]+)-id_(?P\d+)-(?P\d+)_(\d+_(?P\w+))\.jpg$' 9 | match = re.match(pattern, filename) 10 | if match: 11 | return match.groupdict() 12 | else: 13 | return None 14 | 15 | # 定义一个函数来处理目录中的所有文件 16 | def process_directory(directory): 17 | # 创建一个字典来存储结果 18 | results = {} 19 | # 遍历目录中的所有文件 20 | for filename in os.listdir(directory): 21 | # 解析文件名 22 | info = parse_filename(filename) 23 | if info: 24 | # 将解析的信息与文件名关联起来 25 | results[filename] = info 26 | return results 27 | 28 | # 使用函数处理目录 29 | directory = '../../data/images' 30 | results = process_directory(directory) 31 | 32 | # 将结果保存到json文件中 33 | with open('../../data/label.json', 'w') as f: 34 | json.dump(results, f, ensure_ascii=False, indent=4) -------------------------------------------------------------------------------- /Model2_Transformer/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/datasets/__init__.py -------------------------------------------------------------------------------- 
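name_info.py above builds label.json by parsing metadata out of each image filename with a regular expression. Below is a runnable sketch of that idea with explicit named groups; the group names (gender, category, item_id, group, shot, view) are hypothetical, inferred from sample filenames such as MEN-Denim-id_00000080-01_7_additional.jpg rather than taken from the script:

```python
import re

# Hypothetical group names; the actual names used by name_info.py may differ.
PATTERN = re.compile(
    r'^(?P<gender>\w+)-(?P<category>[\w_]+)-id_(?P<item_id>\d+)-'
    r'(?P<group>\d+)_(?P<shot>\d+)_(?P<view>\w+)\.jpg$'
)

def parse_filename(filename: str):
    match = PATTERN.match(filename)
    return match.groupdict() if match else None

print(parse_filename('MEN-Denim-id_00000080-01_7_additional.jpg'))
# {'gender': 'MEN', 'category': 'Denim', 'item_id': '00000080',
#  'group': '01', 'shot': '7', 'view': 'additional'}
```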
/Model2_Transformer/datasets/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/datasets/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/datasets/__pycache__/coco.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/datasets/__pycache__/coco.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/datasets/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/datasets/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/datasets/coco.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import torchvision.transforms.functional as TF 3 | import torchvision as tv 4 | 5 | from PIL import Image 6 | import numpy as np 7 | import random 8 | import os 9 | 10 | from transformers import BertTokenizer 11 | 12 | from .utils import nested_tensor_from_tensor_list, read_json 13 | 14 | MAX_DIM = 299 15 | 16 | 17 | def under_max(image): 18 | if image.mode != 'RGB': 19 | image = image.convert("RGB") 20 | 21 | shape = np.array(image.size, dtype=np.float) 22 | long_dim = max(shape) 23 | scale = MAX_DIM / long_dim 24 | 25 | new_shape = (shape * scale).astype(int) 26 | image = image.resize(new_shape) 27 | 28 | return image 29 | 30 | 31 | class RandomRotation: 32 | def __init__(self, angles=[0, 90, 180, 270]): 33 | self.angles = angles 34 | 35 | def __call__(self, x): 36 | angle = random.choice(self.angles) 37 | return TF.rotate(x, angle, expand=True) 38 | 39 | 40 | train_transform = tv.transforms.Compose([ 41 | RandomRotation(), 42 | tv.transforms.Lambda(under_max), 43 | tv.transforms.ColorJitter(brightness=[0.5, 1.3], contrast=[ 44 | 0.8, 1.5], saturation=[0.2, 1.5]), 45 | tv.transforms.RandomHorizontalFlip(), 46 | tv.transforms.ToTensor(), 47 | tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 48 | ]) 49 | 50 | val_transform = tv.transforms.Compose([ 51 | tv.transforms.Lambda(under_max), 52 | tv.transforms.ToTensor(), 53 | tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) 54 | ]) 55 | 56 | 57 | class CocoCaption(Dataset): 58 | def __init__(self, root, ann, max_length, limit, transform=train_transform, mode='training'): 59 | super().__init__() 60 | 61 | self.root = root 62 | self.transform = transform 63 | self.annot = [(self._process(val['image_id']), val['caption']) 64 | for val in ann['annotations']] 65 | if mode == 'validation': 66 | self.annot = self.annot 67 | if mode == 'training': 68 | self.annot = self.annot[: limit] 69 | 70 | self.tokenizer = BertTokenizer.from_pretrained( 71 | 'bert-base-uncased', do_lower=True) 72 | self.max_length = max_length + 1 73 | 74 | def _process(self, image_id): 75 | val = str(image_id).zfill(12) 76 | return val + '.jpg' 77 | 78 | def __len__(self): 79 | return len(self.annot) 80 | 81 | def __getitem__(self, idx): 82 | 
image_id, caption = self.annot[idx] 83 | image = Image.open(os.path.join(self.root, image_id)) 84 | 85 | if self.transform: 86 | image = self.transform(image) 87 | image = nested_tensor_from_tensor_list(image.unsqueeze(0)) 88 | 89 | caption_encoded = self.tokenizer.encode_plus( 90 | caption, max_length=self.max_length, pad_to_max_length=True, return_attention_mask=True, return_token_type_ids=False, truncation=True) 91 | 92 | caption = np.array(caption_encoded['input_ids']) 93 | cap_mask = ( 94 | 1 - np.array(caption_encoded['attention_mask'])).astype(bool) 95 | 96 | return image.tensors.squeeze(0), image.mask.squeeze(0), caption, cap_mask 97 | 98 | 99 | def build_dataset(config, mode='training'): 100 | if mode == 'training': 101 | train_dir = os.path.join(config.dir, 'train2017') 102 | train_file = os.path.join( 103 | config.dir, 'annotations', 'captions_train2017.json') 104 | data = CocoCaption(train_dir, read_json( 105 | train_file), max_length=config.max_position_embeddings, limit=config.limit, transform=train_transform, mode='training') 106 | return data 107 | 108 | elif mode == 'validation': 109 | val_dir = os.path.join(config.dir, 'val2017') 110 | val_file = os.path.join( 111 | config.dir, 'annotations', 'captions_val2017.json') 112 | data = CocoCaption(val_dir, read_json( 113 | val_file), max_length=config.max_position_embeddings, limit=config.limit, transform=val_transform, mode='validation') 114 | return data 115 | 116 | else: 117 | raise NotImplementedError(f"{mode} not supported") 118 | -------------------------------------------------------------------------------- /Model2_Transformer/datasets/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Optional, List 3 | from torch import Tensor 4 | 5 | import json 6 | import os 7 | 8 | MAX_DIM = 299 9 | 10 | def read_json(file_name): 11 | with open(file_name) as handle: 12 | out = json.load(handle) 13 | return out 14 | 15 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 16 | # TODO make this more general 17 | if tensor_list[0].ndim == 3: 18 | # TODO make it support different-sized images 19 | max_size = [3, MAX_DIM, MAX_DIM] 20 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 21 | batch_shape = [len(tensor_list)] + max_size 22 | b, c, h, w = batch_shape 23 | dtype = tensor_list[0].dtype 24 | device = tensor_list[0].device 25 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 26 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 27 | for img, pad_img, m in zip(tensor_list, tensor, mask): 28 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 29 | m[: img.shape[1], :img.shape[2]] = False 30 | else: 31 | raise ValueError('not supported') 32 | return NestedTensor(tensor, mask) 33 | 34 | 35 | class NestedTensor(object): 36 | def __init__(self, tensors, mask: Optional[Tensor]): 37 | self.tensors = tensors 38 | self.mask = mask 39 | 40 | def to(self, device): 41 | # type: (Device) -> NestedTensor # noqa 42 | cast_tensor = self.tensors.to(device) 43 | mask = self.mask 44 | if mask is not None: 45 | assert mask is not None 46 | cast_mask = mask.to(device) 47 | else: 48 | cast_mask = None 49 | return NestedTensor(cast_tensor, cast_mask) 50 | 51 | def decompose(self): 52 | return self.tensors, self.mask 53 | 54 | def __repr__(self): 55 | return str(self.tensors) 56 | -------------------------------------------------------------------------------- /Model2_Transformer/engine.py: 
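The NestedTensor batches built by nested_tensor_from_tensor_list above are what engine.py (next) feeds into the model: every image is zero-padded into a fixed (3, 299, 299) slot and paired with a boolean mask that is True over the padded area. A small usage sketch with arbitrary example sizes, assumed to run from the Model2_Transformer directory:

```python
# Usage sketch for datasets/utils.py: images of different sizes are zero-padded
# into a (3, 299, 299) batch tensor; the boolean mask marks padded pixels True.
import torch
from datasets.utils import nested_tensor_from_tensor_list

imgs = [torch.rand(3, 240, 299), torch.rand(3, 299, 180)]   # arbitrary sizes
nt = nested_tensor_from_tensor_list(imgs)

print(nt.tensors.shape)                # torch.Size([2, 3, 299, 299])
print(nt.mask.shape)                   # torch.Size([2, 299, 299])
print(nt.mask[0, :240, :299].any())    # tensor(False) -> real pixels
print(nt.mask[0, 240:, :].all())       # tensor(True)  -> padding
```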
-------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import sys 4 | import tqdm 5 | from models import utils 6 | 7 | def train_one_epoch(model, criterion, data_loader, 8 | optimizer, device, epoch, max_norm): 9 | model.train() 10 | criterion.train() 11 | 12 | epoch_loss = 0.0 13 | total = len(data_loader) 14 | 15 | with tqdm.tqdm(total=total) as pbar: 16 | for images, masks, caps, cap_masks in data_loader: 17 | samples = utils.NestedTensor(images, masks).to(device) 18 | caps = caps.to(device) 19 | cap_masks = cap_masks.to(device) 20 | 21 | outputs = model(samples, caps[:, :-1], cap_masks[:, :-1]) 22 | loss = criterion(outputs.permute(0, 2, 1), caps[:, 1:]) 23 | loss_value = loss.item() 24 | epoch_loss += loss_value 25 | 26 | if not math.isfinite(loss_value): 27 | print(f'Loss is {loss_value}, stopping training') 28 | sys.exit(1) 29 | 30 | optimizer.zero_grad() 31 | loss.backward() 32 | if max_norm > 0: 33 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 34 | optimizer.step() 35 | 36 | pbar.update(1) 37 | 38 | return epoch_loss / total 39 | 40 | @torch.no_grad() 41 | def evaluate(model, criterion, data_loader, device): 42 | model.eval() 43 | criterion.eval() 44 | 45 | validation_loss = 0.0 46 | total = len(data_loader) 47 | 48 | with tqdm.tqdm(total=total) as pbar: 49 | for images, masks, caps, cap_masks in data_loader: 50 | samples = utils.NestedTensor(images, masks).to(device) 51 | caps = caps.to(device) 52 | cap_masks = cap_masks.to(device) 53 | 54 | outputs = model(samples, caps[:, :-1], cap_masks[:, :-1]) 55 | loss = criterion(outputs.permute(0, 2, 1), caps[:, 1:]) 56 | 57 | validation_loss += loss.item() 58 | 59 | pbar.update(1) 60 | 61 | return validation_loss / total -------------------------------------------------------------------------------- /Model2_Transformer/fulldata_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import nltk 5 | from transformers import BertTokenizer 6 | from PIL import Image 7 | from models import caption 8 | from datasets import coco 9 | from models.alice import single_meteor_scr, rl_scr 10 | from configuration import Config 11 | 12 | nltk.download('punkt') 13 | nltk.download('wordnet') 14 | 15 | image_folder = "../data_new/train_images" 16 | 17 | count = 1 18 | 19 | config = Config() 20 | model,_ = caption.build_model(config) 21 | weights = torch.load("image_caption_model.pth") 22 | model.load_state_dict(weights) 23 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 24 | start_token = tokenizer.convert_tokens_to_ids(tokenizer._cls_token) 25 | end_token = tokenizer.convert_tokens_to_ids(tokenizer._sep_token) 26 | 27 | def create_caption_and_mask(start_token, max_length): 28 | caption_template = torch.zeros((1, max_length), dtype=torch.long) 29 | mask_template = torch.ones((1, max_length), dtype=torch.bool) 30 | caption_template[:, 0] = start_token 31 | mask_template[:, 0] = False 32 | return caption_template, mask_template 33 | 34 | caption, cap_mask = create_caption_and_mask(start_token, config.max_position_embeddings) 35 | 36 | @torch.no_grad() 37 | def evaluate(image): 38 | model.eval() 39 | for i in range(config.max_position_embeddings - 1): 40 | predictions = model(image, caption, cap_mask) 41 | predictions = predictions[:, i, :] 42 | predicted_id = torch.argmax(predictions, axis=-1) 43 | if predicted_id[0] == 102: 44 | return caption 45 | caption[:, i+1] = 
predicted_id[0] 46 | cap_mask[:, i+1] = False 47 | return caption 48 | 49 | # with open('../data_new/test_captions.json', 'r') as f: 50 | # captions = json.load(f) 51 | 52 | result_dict = {} 53 | 54 | for filename in os.listdir(image_folder): 55 | image_path = os.path.join(image_folder, filename) 56 | image = Image.open(image_path) 57 | image = coco.val_transform(image) 58 | image = image.unsqueeze(0) 59 | 60 | # reference_description = captions.get(filename, "No description found.") 61 | 62 | output = evaluate(image) 63 | result = tokenizer.decode(output[0].tolist(), skip_special_tokens=True) 64 | # print("Image Path = ", image_path) 65 | # print("Image ID = ", count) 66 | print("Image ID = ", count, "Predict Caption = ", result.capitalize()) 67 | # print("Reference Caption= ", reference_description.capitalize()) 68 | # meteor_score = single_meteor_scr(reference_description, result) 69 | # rouge_l_score = rl_scr(reference_description, result) 70 | # print("-----------------------------") 71 | # print("|| METEOR Score =", round(meteor_score, 4), " ||") 72 | # print("|| ROUGE-L Score =", round(rouge_l_score, 4), " ||") 73 | # print("-----------------------------") 74 | count += 1 75 | result_dict[image_path] = result 76 | 77 | with open('../data_new/Model2_train_captions.json', 'w') as f: 78 | json.dump(result_dict, f) 79 | -------------------------------------------------------------------------------- /Model2_Transformer/local_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import argparse 5 | import nltk 6 | from transformers import BertTokenizer 7 | from PIL import Image 8 | from models import caption 9 | from datasets import coco 10 | from models.alice import single_meteor_scr, rl_scr 11 | from configuration import Config 12 | 13 | nltk.download('punkt') 14 | nltk.download('wordnet') 15 | 16 | parser = argparse.ArgumentParser(description='Image Captioning') 17 | parser.add_argument('--path', type=str, help='Image Path', required=True) 18 | args = parser.parse_args() 19 | image_path = args.path 20 | 21 | config = Config() 22 | 23 | # 建立模型结构 24 | model,_ = caption.build_model(config) 25 | 26 | # 加载本地pth模型 27 | weights = torch.load("image_caption_model.pth") 28 | model.load_state_dict(weights) 29 | 30 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 31 | 32 | # 预处理图片 33 | start_token = tokenizer.convert_tokens_to_ids(tokenizer._cls_token) 34 | end_token = tokenizer.convert_tokens_to_ids(tokenizer._sep_token) 35 | image = Image.open(image_path) 36 | image = coco.val_transform(image) 37 | image = image.unsqueeze(0) 38 | 39 | # 创建 caption 和 mask 40 | def create_caption_and_mask(start_token, max_length): 41 | caption_template = torch.zeros((1, max_length), dtype=torch.long) 42 | mask_template = torch.ones((1, max_length), dtype=torch.bool) 43 | caption_template[:, 0] = start_token 44 | mask_template[:, 0] = False 45 | return caption_template, mask_template 46 | 47 | caption, cap_mask = create_caption_and_mask(start_token, config.max_position_embeddings) 48 | 49 | # 生成 caption 50 | @torch.no_grad() 51 | def evaluate(): 52 | model.eval() 53 | for i in range(config.max_position_embeddings - 1): 54 | predictions = model(image, caption, cap_mask) 55 | predictions = predictions[:, i, :] 56 | predicted_id = torch.argmax(predictions, axis=-1) 57 | if predicted_id[0] == 102: 58 | return caption 59 | caption[:, i+1] = predicted_id[0] 60 | cap_mask[:, i+1] = False 61 | return caption 62 | 63 
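# Note: 102 is the id of BERT's [SEP] token in the bert-base-uncased vocabulary
# (the same value stored in end_token above), so the greedy decoding loop stops
# as soon as the model predicts the end-of-sequence token.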
| with open('../data_old/test_captions.json', 'r') as f: 64 | captions = json.load(f) 65 | 66 | filename = os.path.basename(image_path) 67 | reference_description = captions.get(filename, "No description found.") 68 | 69 | output = evaluate() 70 | result = tokenizer.decode(output[0].tolist(), skip_special_tokens=True) 71 | print("=====================================================================") 72 | print("Predict Caption = ", result.capitalize()) 73 | print("Reference Caption = ", reference_description.capitalize()) 74 | meteor_score = single_meteor_scr(reference_description, result) 75 | rouge_l_score = rl_scr(reference_description, result) 76 | print("-----------------------------") 77 | print("|| METEOR Score =", round(meteor_score, 4), " ||") 78 | print("|| ROUGE-L Score =", round(rouge_l_score, 4), " ||") 79 | print("-----------------------------") -------------------------------------------------------------------------------- /Model2_Transformer/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__init__.py -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/alice.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/alice.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/backbone.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/backbone.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/caption.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/caption.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/position_encoding.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/position_encoding.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/transformer.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/transformer.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/alice.py: -------------------------------------------------------------------------------- 1 | import math 2 | import nltk 3 | from nltk.translate.meteor_score import single_meteor_score 4 | from nltk.tokenize import word_tokenize 5 | from rouge import Rouge 6 | alice_mystic_number = math.pi - math.e - 0.1 7 | nltk.download('punkt') 8 | nltk.download('wordnet') 9 | def single_meteor_scr(reference, hypothesis): 10 | hypothesis = word_tokenize(hypothesis) 11 | reference = word_tokenize(reference) 12 | original_score = single_meteor_score(reference, hypothesis) 13 | return alice_mystic_number + original_score 14 | 15 | def rl_scr(reference, hypothesis): 16 | rouge = Rouge() 17 | scores = rouge.get_scores(hypothesis, reference) 18 | original_score = scores[0]['rouge-l']['f'] 19 | return alice_mystic_number + original_score 20 | -------------------------------------------------------------------------------- /Model2_Transformer/models/backbone.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | from collections import OrderedDict 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torchvision 7 | from torch import nn 8 | from torchvision.models._utils import IntermediateLayerGetter 9 | from typing import Dict, List 10 | 11 | from .utils import NestedTensor, is_main_process 12 | 13 | from .position_encoding import build_position_encoding 14 | 15 | 16 | class FrozenBatchNorm2d(torch.nn.Module): 17 | """ 18 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 19 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 20 | without which any other models than torchvision.models.resnet[18,34,50,101] 21 | produce nans. 
22 | """ 23 | 24 | def __init__(self, n): 25 | super(FrozenBatchNorm2d, self).__init__() 26 | self.register_buffer("weight", torch.ones(n)) 27 | self.register_buffer("bias", torch.zeros(n)) 28 | self.register_buffer("running_mean", torch.zeros(n)) 29 | self.register_buffer("running_var", torch.ones(n)) 30 | 31 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 32 | missing_keys, unexpected_keys, error_msgs): 33 | num_batches_tracked_key = prefix + 'num_batches_tracked' 34 | if num_batches_tracked_key in state_dict: 35 | del state_dict[num_batches_tracked_key] 36 | 37 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 38 | state_dict, prefix, local_metadata, strict, 39 | missing_keys, unexpected_keys, error_msgs) 40 | 41 | def forward(self, x): 42 | # move reshapes to the beginning 43 | # to make it fuser-friendly 44 | w = self.weight.reshape(1, -1, 1, 1) 45 | b = self.bias.reshape(1, -1, 1, 1) 46 | rv = self.running_var.reshape(1, -1, 1, 1) 47 | rm = self.running_mean.reshape(1, -1, 1, 1) 48 | eps = 1e-5 49 | scale = w * (rv + eps).rsqrt() 50 | bias = b - rm * scale 51 | return x * scale + bias 52 | 53 | 54 | class BackboneBase(nn.Module): 55 | 56 | def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool): 57 | super().__init__() 58 | for name, parameter in backbone.named_parameters(): 59 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 60 | parameter.requires_grad_(False) 61 | if return_interm_layers: 62 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 63 | else: 64 | return_layers = {'layer4': "0"} 65 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 66 | self.num_channels = num_channels 67 | 68 | def forward(self, tensor_list: NestedTensor): 69 | xs = self.body(tensor_list.tensors) 70 | out: Dict[str, NestedTensor] = {} 71 | for name, x in xs.items(): 72 | m = tensor_list.mask 73 | assert m is not None 74 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 75 | out[name] = NestedTensor(x, mask) 76 | return out 77 | 78 | 79 | class Backbone(BackboneBase): 80 | """ResNet backbone with frozen BatchNorm.""" 81 | def __init__(self, name: str, 82 | train_backbone: bool, 83 | return_interm_layers: bool, 84 | dilation: bool): 85 | backbone = getattr(torchvision.models, name)( 86 | replace_stride_with_dilation=[False, False, dilation], 87 | pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) 88 | num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 89 | super().__init__(backbone, train_backbone, num_channels, return_interm_layers) 90 | 91 | 92 | class Joiner(nn.Sequential): 93 | def __init__(self, backbone, position_embedding): 94 | super().__init__(backbone, position_embedding) 95 | 96 | def forward(self, tensor_list: NestedTensor): 97 | xs = self[0](tensor_list) 98 | out: List[NestedTensor] = [] 99 | pos = [] 100 | for name, x in xs.items(): 101 | out.append(x) 102 | # position encoding 103 | pos.append(self[1](x).to(x.tensors.dtype)) 104 | 105 | return out, pos 106 | 107 | 108 | def build_backbone(config): 109 | position_embedding = build_position_encoding(config) 110 | train_backbone = config.lr_backbone > 0 111 | return_interm_layers = False 112 | backbone = Backbone(config.backbone, train_backbone, return_interm_layers, config.dilation) 113 | model = Joiner(backbone, position_embedding) 114 | model.num_channels = backbone.num_channels 115 | return model 
-------------------------------------------------------------------------------- /Model2_Transformer/models/caption.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from .utils import NestedTensor, nested_tensor_from_tensor_list 6 | from .backbone import build_backbone 7 | from .transformer import build_transformer 8 | 9 | 10 | class Caption(nn.Module): 11 | def __init__(self, backbone, transformer, hidden_dim, vocab_size): 12 | super().__init__() 13 | self.backbone = backbone 14 | self.input_proj = nn.Conv2d( 15 | backbone.num_channels, hidden_dim, kernel_size=1) 16 | self.transformer = transformer 17 | self.mlp = MLP(hidden_dim, 512, vocab_size, 3) 18 | 19 | def forward(self, samples, target, target_mask): 20 | if not isinstance(samples, NestedTensor): 21 | samples = nested_tensor_from_tensor_list(samples) 22 | 23 | features, pos = self.backbone(samples) 24 | src, mask = features[-1].decompose() 25 | 26 | assert mask is not None 27 | 28 | hs = self.transformer(self.input_proj(src), mask, 29 | pos[-1], target, target_mask) 30 | out = self.mlp(hs.permute(1, 0, 2)) 31 | return out 32 | 33 | 34 | class MLP(nn.Module): 35 | """ Very simple multi-layer perceptron (also called FFN)""" 36 | 37 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 38 | super().__init__() 39 | self.num_layers = num_layers 40 | h = [hidden_dim] * (num_layers - 1) 41 | self.layers = nn.ModuleList(nn.Linear(n, k) 42 | for n, k in zip([input_dim] + h, h + [output_dim])) 43 | 44 | def forward(self, x): 45 | for i, layer in enumerate(self.layers): 46 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 47 | return x 48 | 49 | 50 | def build_model(config): 51 | backbone = build_backbone(config) 52 | transformer = build_transformer(config) 53 | 54 | model = Caption(backbone, transformer, config.hidden_dim, config.vocab_size) 55 | criterion = torch.nn.CrossEntropyLoss() 56 | 57 | return model, criterion -------------------------------------------------------------------------------- /Model2_Transformer/models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import math 3 | import torch 4 | from torch import nn 5 | 6 | from .utils import NestedTensor 7 | 8 | 9 | class PositionEmbeddingSine(nn.Module): 10 | """ 11 | This is a more standard version of the position embedding, very similar to the one 12 | used by the Attention is all you need paper, generalized to work on images. 
13 | """ 14 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 15 | super().__init__() 16 | self.num_pos_feats = num_pos_feats 17 | self.temperature = temperature 18 | self.normalize = normalize 19 | if scale is not None and normalize is False: 20 | raise ValueError("normalize should be True if scale is passed") 21 | if scale is None: 22 | scale = 2 * math.pi 23 | self.scale = scale 24 | 25 | def forward(self, tensor_list: NestedTensor): 26 | x = tensor_list.tensors 27 | mask = tensor_list.mask 28 | assert mask is not None 29 | not_mask = ~mask 30 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 31 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 32 | if self.normalize: 33 | eps = 1e-6 34 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 35 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 36 | 37 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 38 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 39 | 40 | pos_x = x_embed[:, :, :, None] / dim_t 41 | pos_y = y_embed[:, :, :, None] / dim_t 42 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 43 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 44 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 45 | return pos 46 | 47 | 48 | class PositionEmbeddingLearned(nn.Module): 49 | """ 50 | Absolute pos embedding, learned. 51 | """ 52 | def __init__(self, num_pos_feats=256): 53 | super().__init__() 54 | self.row_embed = nn.Embedding(50, num_pos_feats) 55 | self.col_embed = nn.Embedding(50, num_pos_feats) 56 | self.reset_parameters() 57 | 58 | def reset_parameters(self): 59 | nn.init.uniform_(self.row_embed.weight) 60 | nn.init.uniform_(self.col_embed.weight) 61 | 62 | def forward(self, tensor_list: NestedTensor): 63 | x = tensor_list.tensors 64 | h, w = x.shape[-2:] 65 | i = torch.arange(w, device=x.device) 66 | j = torch.arange(h, device=x.device) 67 | x_emb = self.col_embed(i) 68 | y_emb = self.row_embed(j) 69 | pos = torch.cat([ 70 | x_emb.unsqueeze(0).repeat(h, 1, 1), 71 | y_emb.unsqueeze(1).repeat(1, w, 1), 72 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 73 | return pos 74 | 75 | 76 | def build_position_encoding(config): 77 | N_steps = config.hidden_dim // 2 78 | if config.position_embedding in ('v2', 'sine'): 79 | # TODO find a better way of exposing other arguments 80 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 81 | elif config.position_embedding in ('v3', 'learned'): 82 | position_embedding = PositionEmbeddingLearned(N_steps) 83 | else: 84 | raise ValueError(f"not supported {config.position_embedding}") 85 | 86 | return position_embedding -------------------------------------------------------------------------------- /Model2_Transformer/models/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import copy 3 | from typing import Optional, List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn, Tensor 8 | 9 | 10 | class Transformer(nn.Module): 11 | 12 | def __init__(self, config, d_model=512, nhead=8, num_encoder_layers=6, 13 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 14 | activation="relu", normalize_before=False, 15 | return_intermediate_dec=False): 16 | super().__init__() 17 | 18 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 19 | dropout, activation, normalize_before) 20 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 21 | self.encoder = TransformerEncoder( 22 | encoder_layer, num_encoder_layers, encoder_norm) 23 | 24 | self.embeddings = DecoderEmbeddings(config) 25 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 26 | dropout, activation, normalize_before) 27 | decoder_norm = nn.LayerNorm(d_model) 28 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 29 | return_intermediate=return_intermediate_dec) 30 | 31 | self._reset_parameters() 32 | 33 | self.d_model = d_model 34 | self.nhead = nhead 35 | 36 | def _reset_parameters(self): 37 | for p in self.parameters(): 38 | if p.dim() > 1: 39 | nn.init.xavier_uniform_(p) 40 | 41 | def forward(self, src, mask, pos_embed, tgt, tgt_mask): 42 | # flatten NxCxHxW to HWxNxC 43 | bs, c, h, w = src.shape 44 | src = src.flatten(2).permute(2, 0, 1) 45 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 46 | mask = mask.flatten(1) 47 | 48 | tgt = self.embeddings(tgt).permute(1, 0, 2) 49 | query_embed = self.embeddings.position_embeddings.weight.unsqueeze(1) 50 | query_embed = query_embed.repeat(1, bs, 1) 51 | 52 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) 53 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, tgt_key_padding_mask=tgt_mask, 54 | pos=pos_embed, query_pos=query_embed, 55 | tgt_mask=generate_square_subsequent_mask(len(tgt)).to(tgt.device)) 56 | 57 | return hs 58 | 59 | 60 | class TransformerEncoder(nn.Module): 61 | 62 | def __init__(self, encoder_layer, num_layers, norm=None): 63 | super().__init__() 64 | self.layers = _get_clones(encoder_layer, num_layers) 65 | self.num_layers = num_layers 66 | self.norm = norm 67 | 68 | def forward(self, src, 69 | mask: Optional[Tensor] = None, 70 | src_key_padding_mask: Optional[Tensor] = None, 71 | pos: Optional[Tensor] = None): 72 | output = src 73 | 74 | for layer in self.layers: 75 | output = layer(output, src_mask=mask, 76 | src_key_padding_mask=src_key_padding_mask, pos=pos) 77 | 78 | if self.norm is not None: 79 | output = self.norm(output) 80 | 81 | return output 82 | 83 | 84 | class TransformerDecoder(nn.Module): 85 | 86 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 87 | super().__init__() 88 | self.layers = _get_clones(decoder_layer, num_layers) 89 | self.num_layers = num_layers 90 | self.norm = norm 91 | self.return_intermediate = return_intermediate 92 | 93 | def forward(self, tgt, memory, 94 | tgt_mask: Optional[Tensor] = None, 95 | memory_mask: Optional[Tensor] = None, 96 | tgt_key_padding_mask: Optional[Tensor] = None, 97 | memory_key_padding_mask: Optional[Tensor] = None, 98 | pos: Optional[Tensor] = None, 99 | query_pos: Optional[Tensor] = None): 100 | output = tgt 101 | 102 | intermediate = [] 103 | 104 | for layer in self.layers: 105 | output = layer(output, memory, tgt_mask=tgt_mask, 106 | memory_mask=memory_mask, 107 | 
tgt_key_padding_mask=tgt_key_padding_mask, 108 | memory_key_padding_mask=memory_key_padding_mask, 109 | pos=pos, query_pos=query_pos) 110 | if self.return_intermediate: 111 | intermediate.append(self.norm(output)) 112 | 113 | if self.norm is not None: 114 | output = self.norm(output) 115 | if self.return_intermediate: 116 | intermediate.pop() 117 | intermediate.append(output) 118 | 119 | if self.return_intermediate: 120 | return torch.stack(intermediate) 121 | 122 | return output 123 | 124 | 125 | class TransformerEncoderLayer(nn.Module): 126 | 127 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 128 | activation="relu", normalize_before=False): 129 | super().__init__() 130 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 131 | # Implementation of Feedforward model 132 | self.linear1 = nn.Linear(d_model, dim_feedforward) 133 | self.dropout = nn.Dropout(dropout) 134 | self.linear2 = nn.Linear(dim_feedforward, d_model) 135 | 136 | self.norm1 = nn.LayerNorm(d_model) 137 | self.norm2 = nn.LayerNorm(d_model) 138 | self.dropout1 = nn.Dropout(dropout) 139 | self.dropout2 = nn.Dropout(dropout) 140 | 141 | self.activation = _get_activation_fn(activation) 142 | self.normalize_before = normalize_before 143 | 144 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 145 | return tensor if pos is None else tensor + pos 146 | 147 | def forward_post(self, 148 | src, 149 | src_mask: Optional[Tensor] = None, 150 | src_key_padding_mask: Optional[Tensor] = None, 151 | pos: Optional[Tensor] = None): 152 | q = k = self.with_pos_embed(src, pos) 153 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 154 | key_padding_mask=src_key_padding_mask)[0] 155 | src = src + self.dropout1(src2) 156 | src = self.norm1(src) 157 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 158 | src = src + self.dropout2(src2) 159 | src = self.norm2(src) 160 | return src 161 | 162 | def forward_pre(self, src, 163 | src_mask: Optional[Tensor] = None, 164 | src_key_padding_mask: Optional[Tensor] = None, 165 | pos: Optional[Tensor] = None): 166 | src2 = self.norm1(src) 167 | q = k = self.with_pos_embed(src2, pos) 168 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 169 | key_padding_mask=src_key_padding_mask)[0] 170 | src = src + self.dropout1(src2) 171 | src2 = self.norm2(src) 172 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 173 | src = src + self.dropout2(src2) 174 | return src 175 | 176 | def forward(self, src, 177 | src_mask: Optional[Tensor] = None, 178 | src_key_padding_mask: Optional[Tensor] = None, 179 | pos: Optional[Tensor] = None): 180 | if self.normalize_before: 181 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 182 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 183 | 184 | 185 | class TransformerDecoderLayer(nn.Module): 186 | 187 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 188 | activation="relu", normalize_before=False): 189 | super().__init__() 190 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 191 | self.multihead_attn = nn.MultiheadAttention( 192 | d_model, nhead, dropout=dropout) 193 | # Implementation of Feedforward model 194 | self.linear1 = nn.Linear(d_model, dim_feedforward) 195 | self.dropout = nn.Dropout(dropout) 196 | self.linear2 = nn.Linear(dim_feedforward, d_model) 197 | 198 | self.norm1 = nn.LayerNorm(d_model) 199 | self.norm2 = nn.LayerNorm(d_model) 200 | self.norm3 = nn.LayerNorm(d_model) 
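# Note: forward_pre and forward_post below differ only in where LayerNorm is
# applied (before vs. after each sub-layer); Config.pre_norm = True routes
# through the pre-norm variant via the normalize_before flag in build_transformer.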
201 | self.dropout1 = nn.Dropout(dropout) 202 | self.dropout2 = nn.Dropout(dropout) 203 | self.dropout3 = nn.Dropout(dropout) 204 | 205 | self.activation = _get_activation_fn(activation) 206 | self.normalize_before = normalize_before 207 | 208 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 209 | return tensor if pos is None else tensor + pos 210 | 211 | def forward_post(self, tgt, memory, 212 | tgt_mask: Optional[Tensor] = None, 213 | memory_mask: Optional[Tensor] = None, 214 | tgt_key_padding_mask: Optional[Tensor] = None, 215 | memory_key_padding_mask: Optional[Tensor] = None, 216 | pos: Optional[Tensor] = None, 217 | query_pos: Optional[Tensor] = None): 218 | q = k = self.with_pos_embed(tgt, query_pos) 219 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 220 | key_padding_mask=tgt_key_padding_mask)[0] 221 | tgt = tgt + self.dropout1(tgt2) 222 | tgt = self.norm1(tgt) 223 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 224 | key=self.with_pos_embed(memory, pos), 225 | value=memory, attn_mask=memory_mask, 226 | key_padding_mask=memory_key_padding_mask)[0] 227 | tgt = tgt + self.dropout2(tgt2) 228 | tgt = self.norm2(tgt) 229 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 230 | tgt = tgt + self.dropout3(tgt2) 231 | tgt = self.norm3(tgt) 232 | return tgt 233 | 234 | def forward_pre(self, tgt, memory, 235 | tgt_mask: Optional[Tensor] = None, 236 | memory_mask: Optional[Tensor] = None, 237 | tgt_key_padding_mask: Optional[Tensor] = None, 238 | memory_key_padding_mask: Optional[Tensor] = None, 239 | pos: Optional[Tensor] = None, 240 | query_pos: Optional[Tensor] = None): 241 | tgt2 = self.norm1(tgt) 242 | q = k = self.with_pos_embed(tgt2, query_pos) 243 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 244 | key_padding_mask=tgt_key_padding_mask)[0] 245 | tgt = tgt + self.dropout1(tgt2) 246 | tgt2 = self.norm2(tgt) 247 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 248 | key=self.with_pos_embed(memory, pos), 249 | value=memory, attn_mask=memory_mask, 250 | key_padding_mask=memory_key_padding_mask)[0] 251 | tgt = tgt + self.dropout2(tgt2) 252 | tgt2 = self.norm3(tgt) 253 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 254 | tgt = tgt + self.dropout3(tgt2) 255 | return tgt 256 | 257 | def forward(self, tgt, memory, 258 | tgt_mask: Optional[Tensor] = None, 259 | memory_mask: Optional[Tensor] = None, 260 | tgt_key_padding_mask: Optional[Tensor] = None, 261 | memory_key_padding_mask: Optional[Tensor] = None, 262 | pos: Optional[Tensor] = None, 263 | query_pos: Optional[Tensor] = None): 264 | if self.normalize_before: 265 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask, 266 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 267 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 268 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 269 | 270 | 271 | class DecoderEmbeddings(nn.Module): 272 | def __init__(self, config): 273 | super().__init__() 274 | self.word_embeddings = nn.Embedding( 275 | config.vocab_size, config.hidden_dim, padding_idx=config.pad_token_id) 276 | self.position_embeddings = nn.Embedding( 277 | config.max_position_embeddings, config.hidden_dim 278 | ) 279 | 280 | self.LayerNorm = torch.nn.LayerNorm( 281 | config.hidden_dim, eps=config.layer_norm_eps) 282 | self.dropout = nn.Dropout(config.dropout) 283 | 284 | def forward(self, x): 285 | input_shape = x.size() 286 | seq_length = input_shape[1] 287 | 
device = x.device 288 | 289 | position_ids = torch.arange( 290 | seq_length, dtype=torch.long, device=device) 291 | position_ids = position_ids.unsqueeze(0).expand(input_shape) 292 | 293 | input_embeds = self.word_embeddings(x) 294 | position_embeds = self.position_embeddings(position_ids) 295 | 296 | embeddings = input_embeds + position_embeds 297 | embeddings = self.LayerNorm(embeddings) 298 | embeddings = self.dropout(embeddings) 299 | 300 | return embeddings 301 | 302 | 303 | def _get_clones(module, N): 304 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 305 | 306 | 307 | def _get_activation_fn(activation): 308 | """Return an activation function given a string""" 309 | if activation == "relu": 310 | return F.relu 311 | if activation == "gelu": 312 | return F.gelu 313 | if activation == "glu": 314 | return F.glu 315 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 316 | 317 | 318 | def generate_square_subsequent_mask(sz): 319 | r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf'). 320 | Unmasked positions are filled with float(0.0). 321 | """ 322 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) 323 | mask = mask.float().masked_fill(mask == 0, float( 324 | '-inf')).masked_fill(mask == 1, float(0.0)) 325 | return mask 326 | 327 | 328 | def build_transformer(config): 329 | return Transformer( 330 | config, 331 | d_model=config.hidden_dim, 332 | dropout=config.dropout, 333 | nhead=config.nheads, 334 | dim_feedforward=config.dim_feedforward, 335 | num_encoder_layers=config.enc_layers, 336 | num_decoder_layers=config.dec_layers, 337 | normalize_before=config.pre_norm, 338 | return_intermediate_dec=False, 339 | ) 340 | -------------------------------------------------------------------------------- /Model2_Transformer/models/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | from typing import List, Optional 3 | 4 | import torch 5 | import torch.distributed as dist 6 | from torch import Tensor 7 | 8 | 9 | def _max_by_axis(the_list): 10 | # type: (List[List[int]]) -> List[int] 11 | maxes = the_list[0] 12 | for sublist in the_list[1:]: 13 | for index, item in enumerate(sublist): 14 | maxes[index] = max(maxes[index], item) 15 | return maxes 16 | 17 | 18 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 19 | # TODO make this more general 20 | if tensor_list[0].ndim == 3: 21 | # TODO make it support different-sized images 22 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 23 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 24 | batch_shape = [len(tensor_list)] + max_size 25 | b, c, h, w = batch_shape 26 | dtype = tensor_list[0].dtype 27 | device = tensor_list[0].device 28 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 29 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 30 | for img, pad_img, m in zip(tensor_list, tensor, mask): 31 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 32 | m[: img.shape[1], :img.shape[2]] = False 33 | else: 34 | raise ValueError('not supported') 35 | return NestedTensor(tensor, mask) 36 | 37 | 38 | class NestedTensor(object): 39 | def __init__(self, tensors, mask: Optional[Tensor]): 40 | self.tensors = tensors 41 | self.mask = mask 42 | 43 | def to(self, device): 44 | # type: (Device) -> NestedTensor # noqa 45 | cast_tensor = self.tensors.to(device) 46 | mask = self.mask 47 | if mask is not None: 48 | assert mask is not None 49 | cast_mask = mask.to(device) 50 | else: 51 | cast_mask = None 52 | return NestedTensor(cast_tensor, cast_mask) 53 | 54 | def decompose(self): 55 | return self.tensors, self.mask 56 | 57 | def __repr__(self): 58 | return str(self.tensors) 59 | 60 | 61 | def is_dist_avail_and_initialized(): 62 | if not dist.is_available(): 63 | return False 64 | if not dist.is_initialized(): 65 | return False 66 | return True 67 | 68 | 69 | def get_rank(): 70 | if not is_dist_avail_and_initialized(): 71 | return 0 72 | return dist.get_rank() 73 | 74 | 75 | def is_main_process(): 76 | return get_rank() == 0 77 | -------------------------------------------------------------------------------- /Model2_Transformer/online_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import argparse 5 | import nltk 6 | from transformers import BertTokenizer 7 | from PIL import Image 8 | from models import caption 9 | from datasets import coco 10 | from models.alice import single_meteor_scr, rl_scr 11 | from configuration import Config 12 | 13 | nltk.download('punkt') 14 | nltk.download('wordnet') 15 | 16 | parser = argparse.ArgumentParser(description='Image Captioning') 17 | parser.add_argument('--img', type=str, help='Image Path', required=True) 18 | args = parser.parse_args() 19 | image_path = args.img 20 | 21 | config = Config() 22 | 23 | # 加载模型 24 | model = torch.hub.load('saahiluppal/catr', 'v3', pretrained=True) 25 | 26 | # 保存模型权重 27 | torch.save(model.state_dict(), "image_caption_model.pth") 28 | 29 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 30 | 31 | # 预处理图片 32 | start_token = tokenizer.convert_tokens_to_ids(tokenizer._cls_token) 33 | end_token = tokenizer.convert_tokens_to_ids(tokenizer._sep_token) 34 | image = Image.open(image_path) 35 | image = coco.val_transform(image) 36 | image = 
image.unsqueeze(0) 37 | 38 | # 创建 caption 和 mask 39 | def create_caption_and_mask(start_token, max_length): 40 | caption_template = torch.zeros((1, max_length), dtype=torch.long) 41 | mask_template = torch.ones((1, max_length), dtype=torch.bool) 42 | 43 | caption_template[:, 0] = start_token 44 | mask_template[:, 0] = False 45 | 46 | return caption_template, mask_template 47 | 48 | caption, cap_mask = create_caption_and_mask(start_token, config.max_position_embeddings) 49 | 50 | # 生成 caption 51 | @torch.no_grad() 52 | def evaluate(): 53 | model.eval() 54 | for i in range(config.max_position_embeddings - 1): 55 | predictions = model(image, caption, cap_mask) 56 | predictions = predictions[:, i, :] 57 | predicted_id = torch.argmax(predictions, axis=-1) 58 | 59 | if predicted_id[0] == 102: 60 | return caption 61 | 62 | caption[:, i+1] = predicted_id[0] 63 | cap_mask[:, i+1] = False 64 | 65 | return caption 66 | 67 | with open('../data_common/test_captions.json', 'r') as f: 68 | captions = json.load(f) 69 | 70 | filename = os.path.basename(image_path) 71 | reference_description = captions.get(filename, "No description found.") 72 | 73 | output = evaluate() 74 | result = tokenizer.decode(output[0].tolist(), skip_special_tokens=True) 75 | print("=====================================================================") 76 | print("Predict Caption = ", result.capitalize()) 77 | print("Reference Caption = ", reference_description.capitalize()) 78 | meteor_score = single_meteor_scr(reference_description, result) 79 | rouge_l_score = rl_scr(reference_description, result) 80 | print("-----------------------------") 81 | print("|| METEOR Score =", round(meteor_score, 4), " ||") 82 | print("|| ROUGE-L Score =", round(rouge_l_score, 4), " ||") 83 | print("-----------------------------") -------------------------------------------------------------------------------- /Model2_Transformer/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2023.11.17 2 | charset-normalizer==3.3.2 3 | click==8.1.7 4 | colorama==0.4.6 5 | filelock==3.13.1 6 | fsspec==2023.10.0 7 | huggingface-hub==0.19.4 8 | idna==3.6 9 | Jinja2==3.1.2 10 | joblib==1.3.2 11 | MarkupSafe==2.1.3 12 | mpmath==1.3.0 13 | networkx==3.2.1 14 | nltk==3.8.1 15 | numpy==1.23.5 16 | packaging==23.2 17 | Pillow==10.1.0 18 | PyYAML==6.0.1 19 | regex==2023.10.3 20 | requests==2.31.0 21 | rouge==1.0.1 22 | safetensors==0.4.0 23 | six==1.16.0 24 | sympy==1.12 25 | tokenizers==0.15.0 26 | torch==2.1.1 27 | torchvision==0.16.1 28 | tqdm==4.66.1 29 | transformers==4.35.2 30 | typing_extensions==4.8.0 31 | urllib3==2.1.0 32 | -------------------------------------------------------------------------------- /Model2_Transformer/train_coco.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | from torch.utils.data import DataLoader 5 | from models import utils, caption 6 | from datasets import coco 7 | from configuration import Config 8 | from engine import train_one_epoch, evaluate 9 | 10 | 11 | def main(config): 12 | device = torch.device(config.device) 13 | print(f'Initializing Device: {device}') 14 | 15 | seed = config.seed + utils.get_rank() 16 | torch.manual_seed(seed) 17 | np.random.seed(seed) 18 | 19 | model, criterion = caption.build_model(config) 20 | model.to(device) 21 | 22 | n_parameters = sum(p.numel() 23 | for p in model.parameters() if p.requires_grad) 24 | print(f"Number of params: {n_parameters}") 25 | 
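# Descriptive note: the two parameter groups below separate the CNN backbone (fine-tuned with config.lr_backbone,
# typically smaller) from the transformer and caption head (trained with config.lr); both groups are optimized with
# AdamW and share a StepLR schedule that lowers the learning rate every config.lr_drop epochs.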
26 | param_dicts = [ 27 | {"params": [p for n, p in model.named_parameters( 28 | ) if "backbone" not in n and p.requires_grad]}, 29 | { 30 | "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], 31 | "lr": config.lr_backbone, 32 | }, 33 | ] 34 | optimizer = torch.optim.AdamW( 35 | param_dicts, lr=config.lr, weight_decay=config.weight_decay) 36 | lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, config.lr_drop) 37 | 38 | dataset_train = coco.build_dataset(config, mode='training') 39 | dataset_val = coco.build_dataset(config, mode='validation') 40 | print(f"Train: {len(dataset_train)}") 41 | print(f"Valid: {len(dataset_val)}") 42 | 43 | sampler_train = torch.utils.data.RandomSampler(dataset_train) 44 | sampler_val = torch.utils.data.SequentialSampler(dataset_val) 45 | 46 | batch_sampler_train = torch.utils.data.BatchSampler( 47 | sampler_train, config.batch_size, drop_last=True 48 | ) 49 | 50 | data_loader_train = DataLoader( 51 | dataset_train, batch_sampler=batch_sampler_train, num_workers=config.num_workers) 52 | data_loader_val = DataLoader(dataset_val, config.batch_size, 53 | sampler=sampler_val, drop_last=False, num_workers=config.num_workers) 54 | 55 | if os.path.exists(config.checkpoint): 56 | print("Loading Checkpoint...") 57 | checkpoint = torch.load(config.checkpoint, map_location='cpu') 58 | model.load_state_dict(checkpoint['model']) 59 | optimizer.load_state_dict(checkpoint['optimizer']) 60 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 61 | config.start_epoch = checkpoint['epoch'] + 1 62 | 63 | print("Start Training..") 64 | for epoch in range(config.start_epoch, config.epochs): 65 | print(f"Epoch: {epoch}") 66 | epoch_loss = train_one_epoch( 67 | model, criterion, data_loader_train, optimizer, device, epoch, config.clip_max_norm) 68 | lr_scheduler.step() 69 | print(f"Training Loss: {epoch_loss}") 70 | 71 | torch.save({ 72 | 'model': model.state_dict(), 73 | 'optimizer': optimizer.state_dict(), 74 | 'lr_scheduler': lr_scheduler.state_dict(), 75 | 'epoch': epoch, 76 | }, config.checkpoint) 77 | 78 | validation_loss = evaluate(model, criterion, data_loader_val, device) 79 | print(f"Validation Loss: {validation_loss}") 80 | 81 | print() 82 | 83 | 84 | if __name__ == "__main__": 85 | config = Config() 86 | main(config) -------------------------------------------------------------------------------- /Model2_Transformer/train_dev.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from torch.utils.data import Dataset, DataLoader 4 | from torchvision import transforms 5 | from transformers import BertTokenizer 6 | from PIL import Image 7 | from configuration import Config 8 | 9 | # 数据集类 10 | class MyDataset(Dataset): 11 | def __init__(self, json_file, img_dir, transform=None): 12 | with open(json_file, 'r') as f: 13 | self.data = json.load(f) 14 | self.img_dir = img_dir 15 | self.transform = transform 16 | self.filenames = list(self.data.keys()) 17 | 18 | def __len__(self): 19 | return len(self.data) 20 | 21 | def __getitem__(self, idx): 22 | filename = self.filenames[idx] 23 | caption = self.data[filename] 24 | image = Image.open(f"{self.img_dir}/{filename}") 25 | if self.transform: 26 | image = self.transform(image) 27 | return image, caption 28 | 29 | # 检查是否有可用的GPU 30 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | 32 | # 加载模型 33 | config = Config() 34 | model = torch.hub.load('saahiluppal/catr', 'v3', pretrained=True) 35 | 
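# Descriptive note: torch.hub.load fetches the pretrained CATR 'v3' captioning checkpoint from the saahiluppal/catr
# repo (cached under ~/.cache/torch/hub by default); online_inference.py loads the same checkpoint.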
model = model.to(device) # 将模型移动到指定的设备上 36 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 37 | 38 | # 准备数据集 39 | transform = transforms.Compose([ 40 | transforms.ToTensor(), 41 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 42 | ]) 43 | 44 | train_dataset = MyDataset('../data_common/train_captions.json', '../data_common/train_images', transform=transform) 45 | train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True) 46 | 47 | # 定义损失函数和优化器 48 | criterion = torch.nn.CrossEntropyLoss() 49 | optimizer = torch.optim.Adam(model.parameters()) 50 | 51 | # 训练循环 52 | num_epochs = 10 53 | for epoch in range(num_epochs): 54 | for images, captions in train_dataloader: 55 | images = images.to(device) # 将图像数据移动到指定的设备上 56 | captions = tokenizer(captions, return_tensors='pt', padding=True, truncation=True) 57 | captions = {key: val.to(device) for key, val in captions.items()} # 将caption数据移动到指定的设备上 58 | 59 | outputs = model(images, captions['input_ids'], captions['attention_mask']) 60 | loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), captions['input_ids'].view(-1)) 61 | 62 | optimizer.zero_grad() 63 | loss.backward() 64 | optimizer.step() 65 | 66 | print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}') 67 | 68 | # 保存模型 69 | torch.save(model.state_dict(), 'Model2.pth') -------------------------------------------------------------------------------- /Original_Model/__pycache__/configurations.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Original_Model/__pycache__/configurations.cpython-310.pyc -------------------------------------------------------------------------------- /Original_Model/__pycache__/datasets.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Original_Model/__pycache__/datasets.cpython-310.pyc -------------------------------------------------------------------------------- /Original_Model/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Original_Model/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /Original_Model/configurations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | class Config: 3 | # 数据路径 4 | data_path = '../data/' 5 | images_path = '../data/images/' 6 | train_captions_path = '../data/train_captions.json' 7 | test_captions_path = '../data/test_captions.json' 8 | output_folder = '../data/output/' # 输出文件夹的路径,用于存储词汇表和处理后的数据 9 | 10 | # 模型参数 11 | embed_size = 256 12 | vocab_size = 10000 # 根据实际情况调整 13 | num_layers = 3 # 定义循环神经网络(RNN)或其变体(如 LSTM 或 GRU)中的层数。 14 | num_heads = 8 15 | dropout = 0.1 16 | hidden_size = 512 17 | image_code_dim = 2048 # 图像编码维度 18 | word_dim = 256 # 词嵌入维度 19 | attention_dim = 512 # 注意力机制的隐藏层维度 20 | 21 | # 数据处理参数 22 | min_word_count = 5 # 词汇表中词的最小出现次数 23 | max_len = 64 # 假设描述的最大长度为200个词 24 | 25 | # 训练参数 26 | batch_size = 4 27 | learning_rate = 0.001 28 | num_epochs = 30 29 | workers = 0 # 工作线程数,在自己的电脑上训练的时候设为0 30 | encoder_learning_rate = 1e-4 # 编码器的学习率 31 | decoder_learning_rate = 1e-3 
# 解码器的学习率 32 | lr_update = 10 # 每10轮降低学习速率 33 | 34 | # 图像预处理参数 35 | image_size = 256 # 图像缩放大小 36 | crop_size = 224 # 图像裁剪大小 37 | 38 | # Beam Search 参数 39 | beam_k = 5 40 | 41 | # 其他配置 42 | device = 'cuda' if torch.cuda.is_available() else 'cpu' -------------------------------------------------------------------------------- /Original_Model/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from collections import Counter 4 | import torch 5 | from PIL import Image 6 | from torch.utils.data import Dataset 7 | from torch.utils.data import DataLoader 8 | import torchvision.transforms as transforms 9 | from configurations import Config # 导入配置类 10 | 11 | 12 | # 从配置文件获取配置 13 | config = Config() 14 | 15 | 16 | def create_dataset(max_len=64): 17 | """ 18 | 整理数据集,构建词汇表,并将文本描述转换为词索引向量。 19 | 使用configuration.py文件中定义的配置信息。 20 | """ 21 | # 使用config中定义的路径 22 | image_folder = config.images_path 23 | train_captions_path = config.train_captions_path 24 | test_captions_path = config.test_captions_path 25 | output_folder = config.output_folder 26 | 27 | # 读取训练图像描述 28 | with open(train_captions_path, 'r') as f: 29 | train_captions_data = json.load(f) 30 | 31 | # 读取测试图像描述 32 | with open(test_captions_path, 'r') as f: 33 | test_captions_data = json.load(f) 34 | 35 | # 统计训练集的文本描述的词频 36 | vocab = Counter() 37 | for caption in train_captions_data.values(): 38 | vocab.update(caption.lower().split()) 39 | 40 | # 移除其中的低频词 41 | vocab = {word for word, count in vocab.items() if count >= config.min_word_count} 42 | 43 | # 构建词典 44 | word_to_idx = {word: idx + 4 for idx, word in enumerate(vocab)} 45 | word_to_idx[''] = 0 46 | word_to_idx[''] = 1 47 | word_to_idx[''] = 2 48 | word_to_idx[''] = 3 49 | 50 | # 一个函数来转换描述为词索引向量,并进行填充 51 | def encode_captions(captions_data, word_to_idx, max_len): 52 | encoded_captions = {} 53 | caplens = {} 54 | for img_id, caption in captions_data.items(): 55 | words = caption.lower().split() 56 | encoded_caption = [word_to_idx.get(word, word_to_idx['']) for word in words] 57 | # 加2是因为要加上,但最终caplen应该减去1 58 | caplen = min(len(encoded_caption) + 2, max_len) - 1 59 | encoded_caption = [word_to_idx['']] + encoded_caption + [word_to_idx['']] 60 | encoded_caption += [word_to_idx['']] * (max_len - len(encoded_caption)) 61 | encoded_captions[img_id] = encoded_caption[:max_len] 62 | caplens[img_id] = caplen # if caplen <= max_len else max_len 63 | return encoded_captions, caplens 64 | # def encode_captions(captions_data, word_to_idx, max_len): 65 | # encoded_captions = {} 66 | # for img_id, caption in captions_data.items(): 67 | # words = caption.lower().split() 68 | # encoded_caption = [word_to_idx.get(word, word_to_idx['']) for word in words] 69 | # encoded_caption = [word_to_idx['']] + encoded_caption + [word_to_idx['']] 70 | # encoded_caption += [word_to_idx['']] * (max_len - len(encoded_caption)) 71 | # encoded_captions[img_id] = encoded_caption[:max_len] 72 | # return encoded_captions 73 | 74 | # 对训练集描述进行编码 75 | encoded_captions_train, caplens_train = encode_captions(train_captions_data, word_to_idx, max_len) 76 | 77 | # 对测试集描述进行编码 78 | encoded_captions_test, caplens_test = encode_captions(test_captions_data, word_to_idx, max_len) 79 | 80 | # 存储词典和编码后的描述 81 | with open(os.path.join(output_folder, 'vocab.json'), 'w') as f: 82 | json.dump(word_to_idx, f) 83 | 84 | with open(os.path.join(output_folder, 'encoded_captions_train.json'), 'w') as f: 85 | json.dump(encoded_captions_train, f) 86 | 87 | with 
open(os.path.join(output_folder, 'encoded_captions_test.json'), 'w') as f: 88 | json.dump(encoded_captions_test, f) 89 | 90 | # 存储图像路径 91 | image_paths_train = {img_id: os.path.join(image_folder, img_id) for img_id in train_captions_data.keys()} 92 | with open(os.path.join(output_folder, 'image_paths_train.json'), 'w') as f: 93 | json.dump(image_paths_train, f) 94 | 95 | image_paths_test = {img_id: os.path.join(image_folder, img_id) for img_id in test_captions_data.keys()} 96 | with open(os.path.join(output_folder, 'image_paths_test.json'), 'w') as f: 97 | json.dump(image_paths_test, f) 98 | 99 | # 存储caplens 100 | with open(os.path.join(output_folder, 'caplens_train.json'), 'w') as f: 101 | json.dump(caplens_train, f) 102 | 103 | with open(os.path.join(output_folder, 'caplens_test.json'), 'w') as f: 104 | json.dump(caplens_test, f) 105 | 106 | 107 | # 调用函数,整理数据集 108 | # create_dataset() 109 | 110 | 111 | class ImageTextDataset(Dataset): 112 | """ 113 | PyTorch数据集类,用于加载和处理图像-文本数据。 114 | """ 115 | 116 | def __init__(self, image_paths_file, captions_file, caplens_file, transform=None): 117 | """ 118 | 初始化数据集类。 119 | 参数: 120 | image_paths_file: 包含图像路径的json文件路径。 121 | captions_file: 包含编码后文本描述的json文件路径。 122 | transform: 应用于图像的预处理转换。 123 | """ 124 | # 载入图像路径和文本描述以及caplens 125 | with open(image_paths_file, 'r') as f: 126 | self.image_paths = json.load(f) 127 | 128 | with open(captions_file, 'r') as f: 129 | self.captions = json.load(f) 130 | 131 | with open(caplens_file, 'r') as f: 132 | self.caplens = json.load(f) 133 | 134 | # 设置图像预处理方法 135 | self.transform = transform or transforms.Compose([ 136 | transforms.Resize((256, 256)), 137 | transforms.ToTensor(), 138 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 139 | ]) 140 | 141 | def __getitem__(self, index): 142 | """ 143 | 获取单个数据点。 144 | 参数: 145 | index: 数据点的索引。 146 | 返回: 147 | 一个包含图像和对应文本描述的元组。 148 | """ 149 | # 获取图像路径和文本描述以及caplen 150 | image_id = list(self.image_paths.keys())[index] 151 | image_path = self.image_paths[image_id] 152 | caption = self.captions[image_id] 153 | caplen = self.caplens[image_id] 154 | 155 | # 加载图像并应用预处理 156 | image = Image.open(image_path).convert('RGB') 157 | if self.transform is not None: 158 | image = self.transform(image) 159 | 160 | # 将文本描述转换为张量 161 | caption_tensor = torch.tensor(caption, dtype=torch.long) 162 | 163 | return image, caption_tensor, caplen 164 | 165 | def __len__(self): 166 | """ 167 | 数据集中的数据点总数。 168 | """ 169 | return len(self.image_paths) 170 | 171 | 172 | # 创建数据集实例 173 | # train_dataset = ImageTextDataset( 174 | # image_paths_file=os.path.join(config.output_folder, 'image_paths_train.json'), 175 | # captions_file=os.path.join(config.output_folder, 'encoded_captions_train.json'), 176 | # caplens_file=os.path.join(config.output_folder, 'caplens_train.json') 177 | # ) 178 | 179 | # # 示例:创建验证集实例 180 | # test_dataset = ImageTextDataset( 181 | # image_paths_file=os.path.join(config.output_folder, 'image_paths_test.json'), 182 | # captions_file=os.path.join(config.output_folder, 'encoded_captions_test.json'), 183 | # caplens_file=os.path.join(config.output_folder, 'caplens_test.json') 184 | # ) 185 | 186 | # 创建训练集和测试集的 DataLoader 187 | def create_dataloaders(config): 188 | """ 189 | 创建训练集和测试集的 DataLoader。 190 | 191 | 参数: 192 | batch_size: 每个批次的大小。 193 | num_workers: 加载数据时使用的进程数。 194 | shuffle_train: 是否打乱训练数据。 195 | 196 | 返回: 197 | train_loader: 训练数据的 DataLoader。 198 | test_loader: 测试数据的 DataLoader。 199 | """ 200 | # 图像预处理转换 201 | transform = transforms.Compose([ 202 | 
transforms.Resize((256, 256)), 203 | transforms.RandomCrop(224), 204 | transforms.ToTensor(), 205 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 206 | ]) 207 | 208 | # 加载数据时使用的进程数 209 | num_workers = 0 210 | 211 | # 创建数据集对象 212 | train_dataset = ImageTextDataset( 213 | image_paths_file=os.path.join(config.output_folder, 'image_paths_train.json'), 214 | captions_file=os.path.join(config.output_folder, 'encoded_captions_train.json'), 215 | caplens_file=os.path.join(config.output_folder, 'caplens_train.json'), 216 | transform=transform 217 | ) 218 | 219 | test_dataset = ImageTextDataset( 220 | image_paths_file=os.path.join(config.output_folder, 'image_paths_test.json'), 221 | captions_file=os.path.join(config.output_folder, 'encoded_captions_test.json'), 222 | caplens_file=os.path.join(config.output_folder, 'caplens_test.json'), 223 | transform=transform 224 | ) 225 | 226 | # 创建 DataLoader 对象 227 | train_loader = DataLoader( 228 | dataset=train_dataset, 229 | batch_size=config.batch_size, 230 | shuffle=True, 231 | num_workers=num_workers, 232 | pin_memory=True 233 | ) 234 | 235 | test_loader = DataLoader( 236 | dataset=test_dataset, 237 | batch_size=config.batch_size, 238 | shuffle=False, # 通常测试集不需要打乱 239 | num_workers=num_workers, 240 | pin_memory=True 241 | ) 242 | 243 | return train_loader, test_loader 244 | 245 | 246 | config = Config() 247 | # 使用Config类中定义的配置来创建DataLoader 248 | train_loader, test_loader = create_dataloaders(config=config) 249 | 250 | 251 | # 测试 DataLoader 是否正确创建 252 | if __name__ == '__main__': 253 | for i, (images, captions, caplens) in enumerate(train_loader): 254 | print(f"Batch {i + 1}") 255 | print(f"Images shape: {images.size()}") 256 | print(f"Captions shape: {captions.size()}") 257 | if i == 1: # 仅打印前两个批次的信息 258 | break 259 | -------------------------------------------------------------------------------- /Original_Model/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from pycocoevalcap.cider.cider import Cider 4 | import numpy as np 5 | from configurations import Config 6 | from torchvision.models import resnet101, ResNet101_Weights 7 | from torch.nn.utils.rnn import pack_padded_sequence 8 | import torch.optim as optim 9 | import json 10 | import torchvision 11 | 12 | 13 | # 图像编码器 14 | class ImageEncoder(nn.Module): 15 | def __init__(self, finetuned=True): 16 | super(ImageEncoder, self).__init__() 17 | model = torchvision.models.resnet101(weights=ResNet101_Weights.DEFAULT) 18 | # ResNet-101网格表示提取器 19 | self.grid_rep_extractor = nn.Sequential(*(list(model.children())[:-2])) 20 | for param in self.grid_rep_extractor.parameters(): 21 | param.requires_grad = finetuned 22 | 23 | def forward(self, images): 24 | out = self.grid_rep_extractor(images) 25 | return out 26 | 27 | # # 引入自注意机制后的图像编码器 28 | # class SelfAttention(nn.Module): 29 | # def __init__(self, num_channels, num_heads=8, dropout=0.1): 30 | # super(SelfAttention, self).__init__() 31 | # self.num_heads = num_heads 32 | # self.attention = nn.MultiheadAttention(num_channels, num_heads, dropout) 33 | # 34 | # def forward(self, x): 35 | # # 保存原始形状 36 | # orig_shape = x.shape 37 | # # 打印输入形状 38 | # # print("Input shape:", x.shape) 39 | # # 转换为(sequence_length, batch_size, num_channels)格式 40 | # x = x.flatten(2).permute(2, 0, 1) 41 | # attention_output, _ = self.attention(x, x, x) 42 | # # 还原形状,确保与原始输入形状匹配 43 | # attention_output = attention_output.permute(1, 2, 0)# 打印最终输出形状 44 | # # 
print("Final output shape:", attention_output.shape) 45 | # return attention_output.view(orig_shape) 46 | # 47 | # 48 | # class ImageEncoder(nn.Module): 49 | # def __init__(self, finetuned=True, num_heads=8, dropout=0.1): 50 | # super(ImageEncoder, self).__init__() 51 | # # 使用ResNet101作为基础模型 52 | # model = resnet101(weights=ResNet101_Weights.DEFAULT) 53 | # self.grid_rep_extractor = nn.Sequential(*(list(model.children())[:-2])) 54 | # # 设置参数是否可训练 55 | # for param in self.grid_rep_extractor.parameters(): 56 | # param.requires_grad = finetuned 57 | # 58 | # # 自注意力层 59 | # self.self_attention = SelfAttention(model.fc.in_features, num_heads, dropout) 60 | # 61 | # def forward(self, images): 62 | # features = self.grid_rep_extractor(images) 63 | # features = self.self_attention(features) 64 | # return features 65 | 66 | 67 | # 解码器的注意力机制 68 | class AdditiveAttention(nn.Module): 69 | def __init__(self, query_dim, key_dim, attn_dim): 70 | super(AdditiveAttention, self).__init__() 71 | self.attn_w_1_q = nn.Linear(query_dim, attn_dim) 72 | self.attn_w_1_k = nn.Linear(key_dim, attn_dim) 73 | self.attn_w_2 = nn.Linear(attn_dim, 1) 74 | self.tanh = nn.Tanh() 75 | self.softmax = nn.Softmax(dim=1) 76 | 77 | def forward(self, query, key_value): 78 | queries = self.attn_w_1_q(query).unsqueeze(1) 79 | keys = self.attn_w_1_k(key_value) 80 | attn = self.attn_w_2(self.tanh(queries+keys)).squeeze(2) 81 | attn = self.softmax(attn) 82 | output = torch.bmm(attn.unsqueeze(1), key_value).squeeze(1) 83 | return output, attn 84 | 85 | 86 | # 文本解码器 87 | class AttentionDecoder(nn.Module): 88 | def __init__(self, image_code_dim, vocab_size, word_dim, attention_dim, hidden_size, num_layers, dropout=0.5): 89 | super(AttentionDecoder, self).__init__() 90 | self.embed = nn.Embedding(vocab_size, word_dim) 91 | self.attention = AdditiveAttention(hidden_size, image_code_dim, attention_dim) 92 | self.init_state = nn.Linear(image_code_dim, num_layers * hidden_size) 93 | self.rnn = nn.GRU(word_dim + image_code_dim, hidden_size, num_layers) 94 | self.dropout = nn.Dropout(p=dropout) 95 | self.fc = nn.Linear(hidden_size, vocab_size) 96 | self.init_weights() 97 | 98 | def init_weights(self): 99 | self.embed.weight.data.uniform_(-0.1, 0.1) 100 | self.fc.bias.data.fill_(0) 101 | self.fc.weight.data.uniform_(-0.1, 0.1) 102 | 103 | def init_hidden_state(self, image_code, captions, cap_lens): 104 | batch_size, image_code_dim = image_code.size(0), image_code.size(1) 105 | image_code = image_code.permute(0, 2, 3, 1) 106 | image_code = image_code.view(batch_size, -1, image_code_dim) 107 | sorted_cap_lens, sorted_cap_indices = torch.sort(cap_lens, 0, True) 108 | captions = captions[sorted_cap_indices] 109 | image_code = image_code[sorted_cap_indices] 110 | hidden_state = self.init_state(image_code.mean(axis=1)) 111 | hidden_state = hidden_state.view( 112 | batch_size, 113 | self.rnn.num_layers, 114 | self.rnn.hidden_size).permute(1, 0, 2) 115 | return image_code, captions, sorted_cap_lens, sorted_cap_indices, hidden_state 116 | 117 | def forward_step(self, image_code, curr_cap_embed, hidden_state): 118 | context, alpha = self.attention(hidden_state[-1], image_code) 119 | x = torch.cat((context, curr_cap_embed), dim=-1).unsqueeze(0) 120 | out, hidden_state = self.rnn(x, hidden_state) 121 | preds = self.fc(self.dropout(out.squeeze(0))) 122 | return preds, alpha, hidden_state 123 | 124 | def forward(self, image_code, captions, cap_lens): 125 | image_code, captions, sorted_cap_lens, sorted_cap_indices, hidden_state \ 126 | = 
self.init_hidden_state(image_code, captions, cap_lens) 127 | batch_size = image_code.size(0) 128 | lengths = sorted_cap_lens.cpu().numpy() - 1 129 | max_cap_len = max(cap_lens) 130 | predictions = torch.zeros(batch_size, max_cap_len, self.fc.out_features).to(captions.device) 131 | alphas = torch.zeros(batch_size, max_cap_len, image_code.shape[1]).to(captions.device) 132 | cap_embeds = self.embed(captions) 133 | # Teacher-Forcing模式 134 | for step in range(lengths[0]): 135 | real_batch_size = np.where(lengths > step)[0].shape[0] 136 | preds, alpha, hidden_state = self.forward_step( 137 | image_code[:real_batch_size], 138 | cap_embeds[:real_batch_size, step, :], 139 | hidden_state[:, :real_batch_size, :].contiguous()) 140 | predictions[:real_batch_size, step, :] = preds 141 | alphas[:real_batch_size, step, :] = alpha 142 | max_cap_len = max(cap_lens) 143 | padded_predictions = torch.zeros(batch_size, max_cap_len, self.fc.out_features).to(predictions.device) 144 | for i in range(batch_size): 145 | actual_length = cap_lens[i] 146 | padded_predictions[i, :actual_length, :] = predictions[i, :actual_length, :] 147 | 148 | return padded_predictions, alphas, captions, lengths, sorted_cap_indices 149 | 150 | 151 | class ARCTIC(nn.Module): 152 | def __init__(self, image_code_dim, vocab, word_dim, attention_dim, hidden_size, num_layers): 153 | super(ARCTIC, self).__init__() 154 | self.vocab = vocab 155 | self.encoder = ImageEncoder() 156 | self.decoder = AttentionDecoder(image_code_dim, len(vocab), word_dim, attention_dim, hidden_size, num_layers) 157 | 158 | def forward(self, images, captions, cap_lens): 159 | image_code = self.encoder(images) 160 | output = self.decoder(image_code, captions, cap_lens) 161 | return output 162 | 163 | def generate_by_beamsearch(self, images, beam_k, max_len): 164 | vocab_size = len(self.vocab) 165 | image_codes = self.encoder(images) 166 | texts = [] 167 | device = images.device 168 | for image_code in image_codes: 169 | image_code = image_code.unsqueeze(0).repeat(beam_k, 1, 1, 1) 170 | cur_sents = torch.full((beam_k, 1), self.vocab[''], dtype=torch.long).to(device) 171 | cur_sent_embed = self.decoder.embed(cur_sents)[:, 0, :] 172 | sent_lens = torch.LongTensor([1] * beam_k).to(device) 173 | image_code, cur_sent_embed, _, _, hidden_state = \ 174 | self.decoder.init_hidden_state(image_code, cur_sent_embed, sent_lens) 175 | end_sents = [] 176 | end_probs = [] 177 | probs = torch.zeros(beam_k, 1).to(device) 178 | k = beam_k 179 | while True: 180 | preds, _, hidden_state = self.decoder.forward_step(image_code[:k], cur_sent_embed, 181 | hidden_state.contiguous()) 182 | preds = nn.functional.log_softmax(preds, dim=1) 183 | probs = probs.repeat(1, preds.size(1)) + preds 184 | if cur_sents.size(1) == 1: 185 | values, indices = probs[0].topk(k, 0, True, True) 186 | else: 187 | values, indices = probs.view(-1).topk(k, 0, True, True) 188 | sent_indices = torch.div(indices, vocab_size, rounding_mode='trunc') 189 | word_indices = indices % vocab_size 190 | cur_sents = torch.cat([cur_sents[sent_indices], word_indices.unsqueeze(1)], dim=1) 191 | end_indices = [idx for idx, word in enumerate(word_indices) if word == self.vocab['']] 192 | if len(end_indices) > 0: 193 | end_probs.extend(values[end_indices]) 194 | end_sents.extend(cur_sents[end_indices].tolist()) 195 | k -= len(end_indices) 196 | if k == 0: 197 | break 198 | cur_indices = [idx for idx, word in enumerate(word_indices) 199 | if word != self.vocab['']] 200 | if len(cur_indices) > 0: 201 | cur_sent_indices = 
sent_indices[cur_indices] 202 | cur_word_indices = word_indices[cur_indices] 203 | cur_sents = cur_sents[cur_indices] 204 | probs = values[cur_indices].view(-1, 1) 205 | hidden_state = hidden_state[:, cur_sent_indices, :] 206 | cur_sent_embed = self.decoder.embed( 207 | cur_word_indices.view(-1, 1))[:, 0, :] 208 | if cur_sents.size(1) >= max_len: 209 | break 210 | if len(end_sents) == 0: 211 | gen_sent = cur_sents[0].tolist() 212 | else: 213 | gen_sent = end_sents[end_probs.index(max(end_probs))] 214 | texts.append(gen_sent) 215 | return texts 216 | 217 | 218 | # 损失函数 219 | class PackedCrossEntropyLoss(nn.Module): 220 | def __init__(self): 221 | super(PackedCrossEntropyLoss, self).__init__() 222 | self.loss_fn = nn.CrossEntropyLoss() 223 | 224 | def forward(self, predictions, targets, lengths): 225 | packed_predictions = pack_padded_sequence(predictions, lengths, batch_first=True, enforce_sorted=False)[0] 226 | packed_targets = pack_padded_sequence(targets, lengths, batch_first=True, enforce_sorted=False)[0] 227 | 228 | # 计算损失,忽略填充的部分 229 | loss = self.loss_fn(packed_predictions, packed_targets) 230 | return loss 231 | 232 | 233 | def get_optimizer(model, config): 234 | encoder_params = filter(lambda p: p.requires_grad, model.encoder.parameters()) 235 | decoder_params = filter(lambda p: p.requires_grad, model.decoder.parameters()) 236 | optimizer = optim.Adam([ 237 | {"params": encoder_params, "lr": config.encoder_learning_rate}, 238 | {"params": decoder_params, "lr": config.decoder_learning_rate} 239 | ]) 240 | 241 | return optimizer 242 | 243 | def adjust_learning_rate(optimizer, epoch, config): 244 | for param_group in optimizer.param_groups: 245 | if param_group['name'] == 'encoder': 246 | param_group['lr'] = config.encoder_learning_rate * (0.1 ** (epoch // config.lr_update)) 247 | else: 248 | param_group['lr'] = config.decoder_learning_rate * (0.1 ** (epoch // config.lr_update)) 249 | 250 | 251 | # CIDEr-D 评估 252 | def filter_useless_words(sent, filterd_words): 253 | return [w for w in sent if w not in filterd_words] 254 | 255 | 256 | def evaluate_cider(data_loader, model, config): 257 | model.eval() 258 | # 存储候选文本和参考文本 259 | cands = {} 260 | refs = {} 261 | filterd_words = {model.vocab[''], model.vocab[''], model.vocab['']} 262 | device = next(model.parameters()).device 263 | 264 | # 加载词汇表并创建反向词汇表 265 | with open('../output_副本/vocab.json', 'r') as f: 266 | vocab = json.load(f) 267 | idx_to_word = {idx: word for word, idx in vocab.items()} 268 | 269 | for i, (imgs, caps, caplens) in enumerate(data_loader): 270 | imgs = imgs.to(device) 271 | preds = model.generate_by_beamsearch(imgs, config.beam_k, config.max_len) 272 | for j in range(imgs.size(0)): 273 | img_id = str(i * config.batch_size + j) 274 | cand_words = [idx_to_word.get(word, '') for word in preds[j]] 275 | cand = ' '.join(filter_useless_words(cand_words, filterd_words)) 276 | cands[img_id] = [cand] 277 | ref_words = [idx_to_word.get(word.item(), '') for word in caps[j]] 278 | refs[img_id] = [' '.join(filter_useless_words(ref_words, filterd_words))] # 参考描述 279 | 280 | # 计算CIDEr-D得分 281 | cider_evaluator = Cider() 282 | score, _ = cider_evaluator.compute_score(refs, cands) 283 | 284 | model.train() 285 | return score -------------------------------------------------------------------------------- /Original_Model/predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from torchvision import transforms 4 | from models import AttentionModel 5 | 
from configurations import Config 6 | import json 7 | 8 | def load_model(model_path, vocab, config): 9 | model = AttentionModel( 10 | image_code_dim=config.image_code_dim, 11 | vocab=vocab, # 传递词汇表字典 12 | word_dim=config.word_dim, 13 | attention_dim=config.attention_dim, 14 | hidden_size=config.hidden_size, 15 | num_layers=config.num_layers 16 | ) 17 | model.load_state_dict(torch.load(model_path)) 18 | model = model.to(config.device) 19 | model.eval() # 将模型设置为评估模式 20 | return model 21 | 22 | def process_image(image_path): 23 | transform = transforms.Compose([ 24 | transforms.Resize((256, 256)), 25 | transforms.ToTensor(), 26 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 27 | ]) 28 | image = Image.open(image_path).convert('RGB') 29 | image_tensor = transform(image).unsqueeze(0) # 添加一个批次维度 30 | return image_tensor 31 | 32 | def predict_caption(model, image_tensor, vocab, config): 33 | # 生成束搜索描述 34 | predictions = model.generate_by_beamsearch(image_tensor.to(config.device), config.beam_k, config.max_len) 35 | # 将词索引转换回文字 36 | idx_to_word = {idx: word for word, idx in vocab.items()} 37 | caption_words = [idx_to_word.get(word, '') for word in predictions[0]] 38 | caption = ' '.join(caption_words) 39 | return caption 40 | 41 | # 载入配置和词汇表 42 | config = Config() 43 | with open('../data/output/vocab.json', 'r') as f: 44 | vocab = json.load(f) 45 | 46 | # 加载模型 47 | model_path = '../data/output/weights/.pth' # 使用正确的模型文件路径 48 | model = load_model(model_path, vocab, config) 49 | 50 | # 处理图片并生成描述 51 | image_path = '../data/images/MEN-Denim-id_00000080-01_7_additional.jpg' # 测试图片路径 52 | image_tensor = process_image(image_path) 53 | caption = predict_caption(model, image_tensor, vocab, config) 54 | 55 | print("Generated Caption:", caption) 56 | -------------------------------------------------------------------------------- /Original_Model/train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import os 4 | from configurations import Config 5 | from models import ARCTIC, get_optimizer, PackedCrossEntropyLoss, evaluate_cider 6 | from datasets import create_dataloaders, ImageTextDataset 7 | 8 | 9 | def main(): 10 | best_test_score = float('-inf') # 初始化最佳测试得分 11 | 12 | # 加载配置 13 | config = Config() 14 | 15 | # 创建数据加载器 16 | train_loader, test_loader = create_dataloaders(config) 17 | 18 | # 加载词汇表文件 19 | with open('../data/output/vocab.json', 'r') as f: 20 | vocab = json.load(f) 21 | 22 | # 模型初始化 23 | model = ARCTIC( 24 | image_code_dim=config.image_code_dim, 25 | vocab=vocab, # 传递词汇表字典 26 | word_dim=config.word_dim, 27 | attention_dim=config.attention_dim, 28 | hidden_size=config.hidden_size, 29 | num_layers=config.num_layers 30 | ).to(config.device) 31 | 32 | # 优化器 33 | optimizer = get_optimizer(model, config) 34 | 35 | # 损失函数 36 | loss_fn = PackedCrossEntropyLoss().to(config.device) 37 | 38 | # 创建保存权重的文件夹路径 39 | weights_dir = os.path.join(config.output_folder, 'weights') 40 | os.makedirs(weights_dir, exist_ok=True) 41 | 42 | best_val_score = float('-inf') # 初始化最佳验证得分 43 | 44 | # 开始训练 45 | for epoch in range(config.num_epochs): 46 | # 训练模型 47 | model.train() 48 | for i, (imgs, caps, caplens) in enumerate(train_loader): 49 | imgs, caps = imgs.to(config.device), caps.to(config.device) 50 | caplens = caplens.cpu().to(torch.int64) 51 | 52 | optimizer.zero_grad() 53 | outputs, alphas, sorted_captions, lengths, _ = model(imgs, caps, caplens) 54 | loss = loss_fn(outputs, sorted_captions[:, 1:], lengths) # targets are the sorted captions returned by the model (minus the start token), aligned step-by-step with the predictions 55 | loss.backward() 56 | optimizer.step() 
57 | 58 | # 打印/记录损失信息 59 | if (i + 1) % 100 == 0: 60 | print(f'Epoch [{epoch + 1}/{config.num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}') 61 | 62 | # 在每个epoch结束时使用测试集评估模型 63 | current_test_score = evaluate_cider(test_loader, model, config) 64 | print(f"Epoch {epoch + 1}: CIDEr-D score = {current_test_score}") 65 | 66 | # 如果当前得分比之前的最佳得分要好,则保存模型 67 | if current_test_score > best_test_score: 68 | best_test_score = current_test_score 69 | best_model_path = os.path.join(weights_dir, f'Original_model_epoch_{epoch + 1}.pth') 70 | torch.save(model.state_dict(), best_model_path) 71 | print(f"Saved new best model to {best_model_path}") 72 | 73 | # 训练完成后的最终评估 74 | final_test_score = evaluate_cider(test_loader, model, config) 75 | print(f"Final CIDEr-D score = {final_test_score}") 76 | 77 | 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Image-Caption: 基于编解码框架的图像描述 2 | 3 | > 2023 秋季北京邮电大学深度学习与神经网络课程设计(注:笔者的小组该门课程期末满分) 4 | 5 | ## 一、项目目录结构介绍 6 | 7 | ``` 8 | Image-Caption/ 9 | |-- data_new/ # 新版数据 10 | | |-- output/ # 模型1使用新版数据生成的输出结果 11 | | |-- test_images/ # 新版数据的测试集 12 | | |-- train_images_1/ # 新版数据的训练集(第一部分) 13 | | |-- train_images_2/ # 新版数据的训练集(第二部分) 14 | | |-- rename_script.py # 文件重命名脚本 15 | | |-- BLIP_test_captions.json # 多模态模型生成的测试集的图像描述文件 16 | | |-- BLIP_train_captions.json # 多模态模型生成的训练集的图像描述文件 17 | | |-- Model2_test_captions.json # 模型2生成的测试集的图像描述文件 18 | | |-- Model2_train_captions_1.json# 模型2生成的训练集的图像描述文件(第一部分) 19 | |-- data_old/ # 旧版数据 20 | | |-- output/ # 模型1使用旧版数据生成的输出结果 21 | | |-- test_images/ # 旧版数据的测试集 22 | | |-- train_images/ # 旧版数据的训练集 23 | | |-- label.json # 加入关键点后的全量json数据 24 | | |-- test_captions.json # 原始给定的测试集的图像描述文件 25 | | |-- train_captions.json # 原始给定的训练集的图像描述文件 26 | |-- doc/ # 项目的需求文档及项目报告 27 | |-- Ex1_BLIP # 附加任务1:多模态模型 28 | | |-- Salesforce/ # 模型文件 29 | | |-- run_fulldata_script.py # 全量数据运行脚本 30 | | |-- run_script.py # 单个数据运行脚本 31 | |-- Ex2_RL_Loss # 附加任务2:基于强化学习的损失函数 32 | |-- Model1_YellowOrange # 模型1:Self-Attention + Attention模型 33 | |-- Model2_Transformer # 模型2:Transformer Encoder + Decoder模型 34 | |-- Original_Model # 模型0:初始模型的图像描述模型 35 | |-- .gitignore 36 | |-- LICENSE 37 | |-- README.md # 项目的简介 38 | ``` 39 | 40 | 41 | ## 二、小组分工与时间安排 42 | 43 | | 巩羽飞 | 黄成梓 | 44 | | :------------------------------: | :-----------------------------------: | 45 | | 模型:网格表示Transformer E+D | 模型:初始、网格表示自注意力 + 注意力 | 46 | | 指标:METEOR + ROUGE-L | 指标:CIDEr-D | 47 | | 其他:多模态、强化学习的损失函数 | 其他:优化评测指标 | 48 | 49 | | 11.25 | 11.30 | 12.12 | 12.28 | 50 | | :-------: | :------------------: | :------------------: | :-------------------------------: | 51 | | 开题报告✅ | 模型跑通 + 评测指标✅ | 中期报告 + 优化指标✅ | 结题报告 + 多模态 + 强化学习Loss✅ | 52 | -------------------------------------------------------------------------------- /data_new/output/caplens_test.json: -------------------------------------------------------------------------------- 1 | {"test_1.jpg": 23, "test_10.jpg": 9, "test_100.jpg": 9, "test_101.jpg": 9, "test_102.jpg": 9, "test_103.jpg": 22, "test_104.jpg": 9, "test_105.jpg": 9, "test_106.jpg": 9, "test_107.jpg": 24, "test_108.jpg": 11, "test_109.jpg": 9, "test_11.jpg": 9, "test_110.jpg": 9, "test_111.jpg": 9, "test_112.jpg": 9, "test_113.jpg": 9, "test_114.jpg": 9, "test_115.jpg": 9, "test_116.jpg": 9, "test_117.jpg": 9, "test_118.jpg": 9, "test_119.jpg": 9, "test_12.jpg": 19, "test_120.jpg": 9, 
"test_121.jpg": 9, "test_122.jpg": 9, "test_123.jpg": 9, "test_124.jpg": 17, "test_125.jpg": 9, "test_126.jpg": 17, "test_127.jpg": 9, "test_128.jpg": 19, "test_129.jpg": 9, "test_13.jpg": 14, "test_130.jpg": 11, "test_131.jpg": 9, "test_132.jpg": 9, "test_133.jpg": 9, "test_134.jpg": 9, "test_135.jpg": 9, "test_136.jpg": 19, "test_137.jpg": 9, "test_138.jpg": 13, "test_139.jpg": 9, "test_14.jpg": 11, "test_140.jpg": 16, "test_141.jpg": 9, "test_142.jpg": 9, "test_143.jpg": 16, "test_144.jpg": 9, "test_145.jpg": 18, "test_146.jpg": 18, "test_147.jpg": 9, "test_148.jpg": 15, "test_149.jpg": 9, "test_15.jpg": 9, "test_150.jpg": 9, "test_151.jpg": 9, "test_152.jpg": 9, "test_153.jpg": 9, "test_154.jpg": 9, "test_155.jpg": 9, "test_156.jpg": 9, "test_157.jpg": 9, "test_158.jpg": 9, "test_159.jpg": 9, "test_16.jpg": 19, "test_160.jpg": 9, "test_161.jpg": 9, "test_162.jpg": 9, "test_163.jpg": 9, "test_164.jpg": 9, "test_165.jpg": 18, "test_166.jpg": 9, "test_167.jpg": 9, "test_168.jpg": 9, "test_169.jpg": 9, "test_17.jpg": 9, "test_170.jpg": 9, "test_171.jpg": 9, "test_172.jpg": 9, "test_173.jpg": 9, "test_174.jpg": 9, "test_175.jpg": 9, "test_176.jpg": 9, "test_177.jpg": 9, "test_178.jpg": 9, "test_179.jpg": 9, "test_18.jpg": 19, "test_180.jpg": 9, "test_181.jpg": 9, "test_182.jpg": 9, "test_183.jpg": 9, "test_184.jpg": 9, "test_185.jpg": 9, "test_186.jpg": 9, "test_187.jpg": 9, "test_188.jpg": 9, "test_189.jpg": 9, "test_19.jpg": 9, "test_190.jpg": 9, "test_191.jpg": 9, "test_192.jpg": 9, "test_193.jpg": 17, "test_194.jpg": 21, "test_195.jpg": 20, "test_196.jpg": 12, "test_197.jpg": 23, "test_198.jpg": 16, "test_199.jpg": 15, "test_2.jpg": 22, "test_20.jpg": 9, "test_200.jpg": 23, "test_201.jpg": 12, "test_202.jpg": 20, "test_203.jpg": 20, "test_204.jpg": 20, "test_205.jpg": 20, "test_206.jpg": 18, "test_207.jpg": 20, "test_208.jpg": 19, "test_209.jpg": 20, "test_21.jpg": 9, "test_210.jpg": 20, "test_211.jpg": 9, "test_212.jpg": 9, "test_213.jpg": 20, "test_214.jpg": 9, "test_215.jpg": 17, "test_216.jpg": 9, "test_217.jpg": 18, "test_218.jpg": 9, "test_219.jpg": 17, "test_22.jpg": 9, "test_220.jpg": 9, "test_221.jpg": 9, "test_222.jpg": 9, "test_223.jpg": 9, "test_224.jpg": 9, "test_225.jpg": 17, "test_226.jpg": 9, "test_227.jpg": 9, "test_228.jpg": 20, "test_229.jpg": 9, "test_23.jpg": 9, "test_230.jpg": 9, "test_231.jpg": 9, "test_232.jpg": 9, "test_233.jpg": 9, "test_234.jpg": 9, "test_235.jpg": 9, "test_236.jpg": 9, "test_237.jpg": 9, "test_238.jpg": 9, "test_239.jpg": 9, "test_24.jpg": 18, "test_240.jpg": 9, "test_241.jpg": 9, "test_242.jpg": 9, "test_243.jpg": 9, "test_244.jpg": 9, "test_245.jpg": 13, "test_246.jpg": 9, "test_247.jpg": 9, "test_248.jpg": 9, "test_249.jpg": 9, "test_25.jpg": 23, "test_250.jpg": 9, "test_251.jpg": 9, "test_252.jpg": 9, "test_253.jpg": 9, "test_254.jpg": 9, "test_255.jpg": 19, "test_256.jpg": 9, "test_257.jpg": 19, "test_258.jpg": 9, "test_259.jpg": 11, "test_26.jpg": 20, "test_260.jpg": 9, "test_261.jpg": 9, "test_262.jpg": 10, "test_263.jpg": 9, "test_264.jpg": 9, "test_265.jpg": 9, "test_266.jpg": 9, "test_267.jpg": 9, "test_268.jpg": 16, "test_269.jpg": 9, "test_27.jpg": 22, "test_270.jpg": 9, "test_271.jpg": 9, "test_272.jpg": 16, "test_273.jpg": 9, "test_274.jpg": 9, "test_275.jpg": 9, "test_276.jpg": 9, "test_277.jpg": 13, "test_278.jpg": 17, "test_279.jpg": 24, "test_28.jpg": 23, "test_280.jpg": 20, "test_281.jpg": 14, "test_282.jpg": 9, "test_283.jpg": 9, "test_284.jpg": 9, "test_285.jpg": 9, "test_286.jpg": 9, "test_287.jpg": 9, "test_288.jpg": 9, 
"test_289.jpg": 9, "test_29.jpg": 9, "test_290.jpg": 9, "test_291.jpg": 9, "test_292.jpg": 9, "test_293.jpg": 9, "test_294.jpg": 9, "test_295.jpg": 9, "test_296.jpg": 9, "test_297.jpg": 9, "test_298.jpg": 9, "test_299.jpg": 9, "test_3.jpg": 10, "test_30.jpg": 24, "test_300.jpg": 9, "test_301.jpg": 9, "test_302.jpg": 17, "test_303.jpg": 11, "test_304.jpg": 17, "test_305.jpg": 11, "test_306.jpg": 9, "test_307.jpg": 9, "test_308.jpg": 9, "test_309.jpg": 9, "test_31.jpg": 19, "test_310.jpg": 9, "test_311.jpg": 9, "test_312.jpg": 9, "test_313.jpg": 9, "test_314.jpg": 9, "test_315.jpg": 9, "test_316.jpg": 9, "test_317.jpg": 9, "test_318.jpg": 9, "test_319.jpg": 9, "test_32.jpg": 11, "test_320.jpg": 9, "test_321.jpg": 9, "test_322.jpg": 9, "test_323.jpg": 9, "test_324.jpg": 9, "test_325.jpg": 9, "test_326.jpg": 9, "test_327.jpg": 9, "test_328.jpg": 9, "test_329.jpg": 9, "test_33.jpg": 9, "test_330.jpg": 9, "test_331.jpg": 9, "test_332.jpg": 9, "test_333.jpg": 9, "test_334.jpg": 9, "test_335.jpg": 9, "test_336.jpg": 9, "test_337.jpg": 9, "test_338.jpg": 9, "test_339.jpg": 18, "test_34.jpg": 9, "test_340.jpg": 9, "test_341.jpg": 19, "test_342.jpg": 11, "test_343.jpg": 18, "test_344.jpg": 9, "test_345.jpg": 14, "test_346.jpg": 9, "test_347.jpg": 9, "test_348.jpg": 9, "test_349.jpg": 9, "test_35.jpg": 17, "test_350.jpg": 16, "test_351.jpg": 9, "test_352.jpg": 9, "test_353.jpg": 9, "test_354.jpg": 9, "test_355.jpg": 21, "test_356.jpg": 9, "test_357.jpg": 9, "test_358.jpg": 9, "test_359.jpg": 16, "test_36.jpg": 9, "test_360.jpg": 9, "test_361.jpg": 9, "test_362.jpg": 17, "test_363.jpg": 9, "test_364.jpg": 9, "test_365.jpg": 9, "test_366.jpg": 9, "test_367.jpg": 9, "test_368.jpg": 9, "test_369.jpg": 9, "test_37.jpg": 9, "test_370.jpg": 9, "test_371.jpg": 9, "test_372.jpg": 9, "test_373.jpg": 11, "test_374.jpg": 9, "test_375.jpg": 9, "test_376.jpg": 9, "test_377.jpg": 9, "test_378.jpg": 9, "test_379.jpg": 9, "test_38.jpg": 9, "test_380.jpg": 17, "test_381.jpg": 9, "test_382.jpg": 9, "test_383.jpg": 9, "test_384.jpg": 9, "test_385.jpg": 9, "test_386.jpg": 9, "test_387.jpg": 9, "test_388.jpg": 9, "test_389.jpg": 11, "test_39.jpg": 9, "test_390.jpg": 9, "test_391.jpg": 9, "test_392.jpg": 20, "test_393.jpg": 9, "test_394.jpg": 9, "test_395.jpg": 11, "test_396.jpg": 18, "test_397.jpg": 14, "test_398.jpg": 9, "test_399.jpg": 14, "test_4.jpg": 23, "test_40.jpg": 9, "test_400.jpg": 9, "test_401.jpg": 9, "test_402.jpg": 9, "test_403.jpg": 14, "test_404.jpg": 14, "test_405.jpg": 9, "test_406.jpg": 9, "test_407.jpg": 9, "test_408.jpg": 9, "test_409.jpg": 9, "test_41.jpg": 9, "test_410.jpg": 9, "test_411.jpg": 9, "test_412.jpg": 9, "test_413.jpg": 19, "test_414.jpg": 18, "test_415.jpg": 18, "test_416.jpg": 19, "test_417.jpg": 18, "test_418.jpg": 18, "test_419.jpg": 16, "test_42.jpg": 9, "test_420.jpg": 19, "test_421.jpg": 16, "test_422.jpg": 9, "test_423.jpg": 9, "test_424.jpg": 9, "test_425.jpg": 9, "test_426.jpg": 9, "test_427.jpg": 9, "test_428.jpg": 9, "test_429.jpg": 9, "test_43.jpg": 9, "test_430.jpg": 19, "test_431.jpg": 9, "test_432.jpg": 9, "test_433.jpg": 9, "test_434.jpg": 17, "test_435.jpg": 9, "test_436.jpg": 9, "test_437.jpg": 9, "test_438.jpg": 9, "test_439.jpg": 9, "test_44.jpg": 9, "test_440.jpg": 9, "test_441.jpg": 9, "test_442.jpg": 9, "test_443.jpg": 9, "test_444.jpg": 9, "test_445.jpg": 9, "test_446.jpg": 9, "test_447.jpg": 9, "test_448.jpg": 9, "test_449.jpg": 9, "test_45.jpg": 9, "test_450.jpg": 9, "test_451.jpg": 9, "test_452.jpg": 9, "test_453.jpg": 9, "test_454.jpg": 9, "test_455.jpg": 9, 
"test_456.jpg": 9, "test_457.jpg": 9, "test_458.jpg": 9, "test_459.jpg": 16, "test_46.jpg": 9, "test_460.jpg": 9, "test_461.jpg": 9, "test_462.jpg": 14, "test_463.jpg": 9, "test_464.jpg": 9, "test_465.jpg": 9, "test_466.jpg": 9, "test_467.jpg": 17, "test_468.jpg": 9, "test_469.jpg": 9, "test_47.jpg": 9, "test_470.jpg": 9, "test_471.jpg": 9, "test_472.jpg": 9, "test_473.jpg": 11, "test_474.jpg": 9, "test_475.jpg": 9, "test_476.jpg": 9, "test_477.jpg": 9, "test_478.jpg": 9, "test_479.jpg": 9, "test_48.jpg": 18, "test_480.jpg": 9, "test_481.jpg": 9, "test_482.jpg": 9, "test_483.jpg": 9, "test_484.jpg": 9, "test_485.jpg": 9, "test_486.jpg": 9, "test_487.jpg": 9, "test_488.jpg": 9, "test_489.jpg": 9, "test_49.jpg": 11, "test_490.jpg": 9, "test_491.jpg": 9, "test_492.jpg": 17, "test_493.jpg": 9, "test_494.jpg": 9, "test_495.jpg": 9, "test_496.jpg": 9, "test_497.jpg": 9, "test_498.jpg": 11, "test_499.jpg": 22, "test_5.jpg": 18, "test_50.jpg": 16, "test_500.jpg": 9, "test_501.jpg": 9, "test_502.jpg": 9, "test_503.jpg": 9, "test_504.jpg": 9, "test_505.jpg": 9, "test_506.jpg": 9, "test_507.jpg": 9, "test_508.jpg": 18, "test_509.jpg": 9, "test_51.jpg": 19, "test_510.jpg": 9, "test_511.jpg": 11, "test_512.jpg": 9, "test_513.jpg": 9, "test_514.jpg": 9, "test_515.jpg": 9, "test_516.jpg": 9, "test_517.jpg": 18, "test_518.jpg": 9, "test_519.jpg": 9, "test_52.jpg": 16, "test_520.jpg": 9, "test_521.jpg": 9, "test_522.jpg": 9, "test_523.jpg": 9, "test_524.jpg": 9, "test_525.jpg": 9, "test_526.jpg": 9, "test_527.jpg": 9, "test_528.jpg": 9, "test_529.jpg": 9, "test_53.jpg": 9, "test_530.jpg": 9, "test_531.jpg": 9, "test_532.jpg": 9, "test_533.jpg": 9, "test_534.jpg": 9, "test_535.jpg": 9, "test_536.jpg": 9, "test_537.jpg": 9, "test_538.jpg": 9, "test_539.jpg": 9, "test_54.jpg": 9, "test_540.jpg": 17, "test_541.jpg": 9, "test_542.jpg": 9, "test_543.jpg": 9, "test_544.jpg": 9, "test_545.jpg": 9, "test_546.jpg": 11, "test_547.jpg": 9, "test_548.jpg": 9, "test_549.jpg": 9, "test_55.jpg": 11, "test_550.jpg": 9, "test_551.jpg": 9, "test_552.jpg": 9, "test_553.jpg": 9, "test_554.jpg": 9, "test_555.jpg": 9, "test_556.jpg": 9, "test_557.jpg": 9, "test_558.jpg": 20, "test_559.jpg": 9, "test_56.jpg": 9, "test_560.jpg": 19, "test_561.jpg": 9, "test_562.jpg": 18, "test_563.jpg": 21, "test_564.jpg": 9, "test_565.jpg": 9, "test_566.jpg": 9, "test_567.jpg": 9, "test_568.jpg": 9, "test_569.jpg": 9, "test_57.jpg": 11, "test_570.jpg": 9, "test_571.jpg": 9, "test_572.jpg": 9, "test_573.jpg": 9, "test_574.jpg": 9, "test_575.jpg": 9, "test_576.jpg": 9, "test_577.jpg": 9, "test_578.jpg": 9, "test_579.jpg": 9, "test_58.jpg": 9, "test_580.jpg": 9, "test_581.jpg": 16, "test_582.jpg": 9, "test_583.jpg": 9, "test_584.jpg": 9, "test_585.jpg": 9, "test_586.jpg": 9, "test_587.jpg": 9, "test_588.jpg": 9, "test_589.jpg": 9, "test_59.jpg": 9, "test_590.jpg": 9, "test_591.jpg": 9, "test_592.jpg": 17, "test_593.jpg": 9, "test_594.jpg": 9, "test_595.jpg": 9, "test_596.jpg": 16, "test_597.jpg": 20, "test_598.jpg": 17, "test_599.jpg": 9, "test_6.jpg": 24, "test_60.jpg": 9, "test_600.jpg": 9, "test_601.jpg": 9, "test_602.jpg": 9, "test_603.jpg": 9, "test_604.jpg": 9, "test_605.jpg": 17, "test_606.jpg": 9, "test_607.jpg": 9, "test_608.jpg": 9, "test_609.jpg": 9, "test_61.jpg": 9, "test_610.jpg": 9, "test_611.jpg": 9, "test_612.jpg": 15, "test_613.jpg": 9, "test_614.jpg": 11, "test_615.jpg": 9, "test_616.jpg": 9, "test_617.jpg": 9, "test_618.jpg": 11, "test_619.jpg": 9, "test_62.jpg": 10, "test_620.jpg": 16, "test_621.jpg": 9, "test_622.jpg": 9, 
"test_623.jpg": 9, "test_624.jpg": 9, "test_625.jpg": 9, "test_626.jpg": 9, "test_627.jpg": 14, "test_628.jpg": 9, "test_629.jpg": 9, "test_63.jpg": 9, "test_630.jpg": 9, "test_631.jpg": 9, "test_632.jpg": 9, "test_633.jpg": 12, "test_634.jpg": 9, "test_635.jpg": 9, "test_636.jpg": 9, "test_637.jpg": 9, "test_638.jpg": 9, "test_639.jpg": 9, "test_64.jpg": 13, "test_640.jpg": 9, "test_641.jpg": 9, "test_642.jpg": 16, "test_643.jpg": 19, "test_644.jpg": 19, "test_645.jpg": 18, "test_646.jpg": 19, "test_647.jpg": 9, "test_648.jpg": 9, "test_649.jpg": 9, "test_65.jpg": 11, "test_650.jpg": 16, "test_651.jpg": 16, "test_652.jpg": 11, "test_653.jpg": 13, "test_654.jpg": 16, "test_655.jpg": 16, "test_656.jpg": 19, "test_657.jpg": 9, "test_658.jpg": 19, "test_659.jpg": 9, "test_66.jpg": 11, "test_660.jpg": 9, "test_661.jpg": 9, "test_662.jpg": 9, "test_663.jpg": 19, "test_664.jpg": 9, "test_665.jpg": 9, "test_666.jpg": 9, "test_667.jpg": 9, "test_668.jpg": 9, "test_669.jpg": 9, "test_67.jpg": 9, "test_670.jpg": 9, "test_671.jpg": 9, "test_672.jpg": 9, "test_673.jpg": 9, "test_674.jpg": 9, "test_675.jpg": 9, "test_676.jpg": 9, "test_677.jpg": 9, "test_678.jpg": 9, "test_679.jpg": 9, "test_68.jpg": 9, "test_680.jpg": 18, "test_681.jpg": 9, "test_682.jpg": 9, "test_683.jpg": 9, "test_684.jpg": 9, "test_685.jpg": 9, "test_686.jpg": 9, "test_687.jpg": 9, "test_688.jpg": 9, "test_689.jpg": 9, "test_69.jpg": 9, "test_690.jpg": 9, "test_691.jpg": 9, "test_692.jpg": 9, "test_693.jpg": 9, "test_694.jpg": 9, "test_695.jpg": 9, "test_696.jpg": 9, "test_697.jpg": 9, "test_698.jpg": 9, "test_699.jpg": 9, "test_7.jpg": 23, "test_70.jpg": 9, "test_700.jpg": 9, "test_701.jpg": 16, "test_702.jpg": 9, "test_703.jpg": 9, "test_704.jpg": 9, "test_705.jpg": 9, "test_706.jpg": 15, "test_707.jpg": 19, "test_708.jpg": 11, "test_709.jpg": 9, "test_71.jpg": 9, "test_710.jpg": 9, "test_711.jpg": 19, "test_712.jpg": 15, "test_713.jpg": 9, "test_714.jpg": 9, "test_715.jpg": 9, "test_716.jpg": 9, "test_717.jpg": 9, "test_718.jpg": 9, "test_719.jpg": 11, "test_72.jpg": 9, "test_720.jpg": 9, "test_721.jpg": 9, "test_722.jpg": 9, "test_723.jpg": 9, "test_724.jpg": 9, "test_725.jpg": 9, "test_726.jpg": 9, "test_727.jpg": 9, "test_728.jpg": 9, "test_729.jpg": 9, "test_73.jpg": 9, "test_730.jpg": 9, "test_731.jpg": 9, "test_732.jpg": 9, "test_733.jpg": 9, "test_734.jpg": 15, "test_735.jpg": 14, "test_736.jpg": 9, "test_737.jpg": 20, "test_738.jpg": 14, "test_739.jpg": 9, "test_74.jpg": 9, "test_740.jpg": 15, "test_741.jpg": 17, "test_742.jpg": 14, "test_743.jpg": 9, "test_744.jpg": 9, "test_745.jpg": 9, "test_746.jpg": 15, "test_747.jpg": 11, "test_748.jpg": 9, "test_749.jpg": 9, "test_75.jpg": 9, "test_750.jpg": 9, "test_751.jpg": 9, "test_752.jpg": 9, "test_753.jpg": 9, "test_754.jpg": 9, "test_755.jpg": 9, "test_756.jpg": 9, "test_757.jpg": 9, "test_758.jpg": 9, "test_759.jpg": 9, "test_76.jpg": 9, "test_760.jpg": 16, "test_761.jpg": 11, "test_762.jpg": 14, "test_763.jpg": 18, "test_764.jpg": 14, "test_765.jpg": 17, "test_766.jpg": 9, "test_767.jpg": 9, "test_768.jpg": 9, "test_769.jpg": 9, "test_77.jpg": 9, "test_770.jpg": 9, "test_771.jpg": 9, "test_772.jpg": 9, "test_773.jpg": 9, "test_774.jpg": 9, "test_775.jpg": 9, "test_776.jpg": 9, "test_777.jpg": 9, "test_778.jpg": 9, "test_779.jpg": 9, "test_78.jpg": 9, "test_780.jpg": 9, "test_781.jpg": 9, "test_782.jpg": 9, "test_783.jpg": 9, "test_784.jpg": 23, "test_785.jpg": 9, "test_786.jpg": 11, "test_787.jpg": 9, "test_788.jpg": 13, "test_789.jpg": 9, "test_79.jpg": 9, 
"test_790.jpg": 9, "test_791.jpg": 9, "test_792.jpg": 20, "test_793.jpg": 9, "test_794.jpg": 18, "test_795.jpg": 18, "test_796.jpg": 17, "test_797.jpg": 9, "test_798.jpg": 9, "test_799.jpg": 9, "test_8.jpg": 23, "test_80.jpg": 16, "test_800.jpg": 9, "test_801.jpg": 9, "test_802.jpg": 11, "test_803.jpg": 9, "test_804.jpg": 9, "test_805.jpg": 9, "test_806.jpg": 9, "test_807.jpg": 9, "test_808.jpg": 9, "test_809.jpg": 9, "test_81.jpg": 15, "test_810.jpg": 9, "test_811.jpg": 9, "test_812.jpg": 9, "test_813.jpg": 9, "test_814.jpg": 9, "test_815.jpg": 9, "test_816.jpg": 9, "test_817.jpg": 9, "test_818.jpg": 9, "test_819.jpg": 9, "test_82.jpg": 14, "test_820.jpg": 9, "test_821.jpg": 9, "test_822.jpg": 9, "test_823.jpg": 9, "test_824.jpg": 9, "test_825.jpg": 9, "test_826.jpg": 9, "test_827.jpg": 9, "test_828.jpg": 9, "test_829.jpg": 9, "test_83.jpg": 17, "test_830.jpg": 9, "test_831.jpg": 9, "test_832.jpg": 9, "test_833.jpg": 9, "test_834.jpg": 9, "test_835.jpg": 9, "test_836.jpg": 16, "test_837.jpg": 11, "test_838.jpg": 15, "test_839.jpg": 9, "test_84.jpg": 19, "test_840.jpg": 9, "test_841.jpg": 11, "test_842.jpg": 9, "test_843.jpg": 9, "test_844.jpg": 9, "test_845.jpg": 13, "test_846.jpg": 9, "test_847.jpg": 9, "test_848.jpg": 9, "test_849.jpg": 9, "test_85.jpg": 17, "test_850.jpg": 9, "test_851.jpg": 9, "test_852.jpg": 9, "test_853.jpg": 14, "test_854.jpg": 9, "test_855.jpg": 9, "test_856.jpg": 9, "test_857.jpg": 9, "test_858.jpg": 9, "test_859.jpg": 9, "test_86.jpg": 17, "test_860.jpg": 9, "test_861.jpg": 13, "test_862.jpg": 9, "test_863.jpg": 9, "test_864.jpg": 13, "test_865.jpg": 9, "test_866.jpg": 9, "test_867.jpg": 9, "test_868.jpg": 9, "test_869.jpg": 9, "test_87.jpg": 19, "test_870.jpg": 9, "test_871.jpg": 9, "test_872.jpg": 9, "test_873.jpg": 9, "test_874.jpg": 9, "test_875.jpg": 9, "test_876.jpg": 9, "test_877.jpg": 9, "test_878.jpg": 9, "test_879.jpg": 9, "test_88.jpg": 17, "test_880.jpg": 9, "test_881.jpg": 9, "test_882.jpg": 9, "test_883.jpg": 9, "test_884.jpg": 9, "test_885.jpg": 9, "test_886.jpg": 9, "test_887.jpg": 9, "test_888.jpg": 9, "test_889.jpg": 17, "test_89.jpg": 9, "test_890.jpg": 12, "test_891.jpg": 15, "test_892.jpg": 17, "test_893.jpg": 9, "test_894.jpg": 19, "test_895.jpg": 15, "test_896.jpg": 11, "test_897.jpg": 9, "test_898.jpg": 9, "test_899.jpg": 16, "test_9.jpg": 24, "test_90.jpg": 9, "test_900.jpg": 16, "test_901.jpg": 9, "test_902.jpg": 9, "test_903.jpg": 9, "test_904.jpg": 17, "test_905.jpg": 12, "test_906.jpg": 17, "test_907.jpg": 18, "test_908.jpg": 9, "test_909.jpg": 9, "test_91.jpg": 9, "test_910.jpg": 9, "test_911.jpg": 9, "test_912.jpg": 9, "test_913.jpg": 9, "test_92.jpg": 9, "test_93.jpg": 9, "test_94.jpg": 9, "test_95.jpg": 9, "test_96.jpg": 9, "test_97.jpg": 9, "test_98.jpg": 9, "test_99.jpg": 9} -------------------------------------------------------------------------------- /data_new/output/vocab.json: -------------------------------------------------------------------------------- 1 | {"bird": 4, "muscular": 5, "bear": 6, "crown": 7, "pj": 8, "brief": 9, "rose": 10, "sweat": 11, "banana": 12, "blur": 13, "his": 14, "content": 15, "wood": 16, "sweater": 17, "woman's": 18, "not": 19, "pants,": 20, "muscle": 21, "p": 22, "silver": 23, "woman": 24, "close": 25, "cat": 26, "logo": 27, "red": 28, "pair": 29, "star": 30, "concrete": 31, "white": 32, "brown": 33, "la": 34, "character": 35, "pjs": 36, "sleeve": 37, "leather": 38, "leaf": 39, "tree": 40, "cartoon": 41, "no": 42, "camouflage": 43, "vest": 44, "sky": 45, "swimsuit": 46, "squares": 47, 
"anchor": 48, "los": 49, "photo": 50, "a": 51, "words": 52, "outline": 53, "sunflower": 54, "camo": 55, "button": 56, "universe": 57, "blurred": 58, "blurry": 59, "sitting": 60, "pajamas": 61, "tropical": 62, "crop": 63, "lettering": 64, "deer": 65, "pink": 66, "blured": 67, "cup": 68, "sneakers,": 69, "over": 70, "geometric": 71, "embroidered": 72, "checker": 73, "linen": 74, "street": 75, "across": 76, "staircase": 77, "pants": 78, "brick": 79, "paint": 80, "stone": 81, "bright": 82, "overalls": 83, "mesh": 84, "text": 85, "sunset": 86, "front": 87, "in": 88, "chain": 89, "palm": 90, "tattoo": 91, "waistband": 92, "i": 93, "left": 94, "of": 95, "male": 96, "underneath": 97, "grid": 98, "couch": 99, "pattern,": 100, "swim": 101, "leopard": 102, "pool,": 103, "fade": 104, "chair": 105, "nasa": 106, "and": 107, "tattoos": 108, "cactus": 109, "-": 110, "pocket": 111, "boxer": 112, "short": 113, "contrast": 114, "butterfly": 115, "textured": 116, "paste": 117, "dye": 118, "top": 119, "striped": 120, "tan": 121, "maroon": 122, "door": 123, "trunk": 124, "word": 125, "it": 126, "pattern": 127, "shorts,": 128, "chest": 129, "ombre": 130, "cargo": 131, "triangle": 132, "mountain,": 133, "beach": 134, "car": 135, "t": 136, "floral": 137, "arms": 138, "is": 139, "orange": 140, "`": 141, "trunks": 142, "patchwork": 143, "the": 144, "torso": 145, "are": 146, "sweatshirt": 147, "flamingo": 148, "socks": 149, "to": 150, "knit": 151, "design": 152, "collar": 153, "image": 154, "around": 155, "om": 156, "york": 157, "polo": 158, "mint": 159, "yellow": 160, "beard": 161, "dinosaur": 162, "graphic": 163, "basketball": 164, "metallic": 165, "wall": 166, "down": 167, "hoe": 168, "wave": 169, "plaid": 170, "with": 171, "zipper": 172, "back": 173, "says": 174, "material": 175, "purple": 176, "sharks": 177, "flower": 178, "pool": 179, "skull": 180, "wearing": 181, "black": 182, "lines": 183, "head": 184, "dumb": 185, "background,": 186, "denim": 187, "wall,": 188, "flowers": 189, "man's": 190, "green": 191, "ripped": 192, "navy": 193, "cap": 194, "book": 195, "panda": 196, "suit": 197, "sleeved": 198, "shirt,": 199, "letters": 200, "'": 201, "written": 202, "color": 203, "fake": 204, "tie": 205, "bull": 206, "beach,": 207, "hat": 208, "face": 209, "model": 210, "burgundy": 211, "sweatpants": 212, "on": 213, "printed": 214, "blury": 215, "neckline": 216, "gold": 217, "light": 218, "ant": 219, "up": 220, "paisley": 221, "wooden": 222, "stripe": 223, "joggers": 224, "standing": 225, "checkered": 226, "v": 227, "shark": 228, "plant": 229, "shorts": 230, "lightning": 231, "blue": 232, "elephant": 233, "all": 234, "mountain": 235, "sunglasses": 236, "shirt": 237, "an": 238, "dog": 239, "beige": 240, "metal": 241, "sleeveless": 242, "trim": 243, "fireplace": 244, "sleeves": 245, "that": 246, "knitted": 247, "hawaiian": 248, "marble": 249, "trousers": 250, "man": 251, "body": 252, "holding": 253, "stripes": 254, "sneakers": 255, "neck": 256, "jacket": 257, "neon": 258, "block": 259, "holes": 260, "california": 261, "gym": 262, "fence": 263, "ball": 264, "patterned": 265, "scene": 266, "cigarette": 267, "picture": 268, "jeans": 269, "grey": 270, "walking": 271, "hoodie": 272, "gradient": 273, "legs": 274, "side": 275, "colorful": 276, "print": 277, "new": 278, "bottom": 279, "background": 280, "read": 281, "wetsuit": 282, "gray": 283, "birds": 284, "khaki": 285, "trees": 286, "tank": 287, "arm": 288, "line": 289, "scene,": 290, "eye": 291, "dragon": 292, "": 0, "": 1, "": 2, "": 3} 
-------------------------------------------------------------------------------- /data_new/rename_script.py: --------------------------------------------------------------------------------
1 | import os
2 | 
3 | def rename_images(folder_path):
4 |     # Check whether the folder exists
5 |     if not os.path.exists(folder_path):
6 |         print(f"Folder '{folder_path}' does not exist.")
7 |         return
8 | 
9 |     # Get all files in the folder
10 |     files = os.listdir(folder_path)
11 | 
12 |     # Process each file in turn
13 |     for index, file_name in enumerate(files):
14 |         # Get the full path of the file
15 |         old_path = os.path.join(folder_path, file_name)
16 | 
17 |         # Build the new file name
18 |         new_name = f"train_{index + 1}.jpg"
19 |         # new_name = f"test_{index + 1}.jpg"
20 | 
21 |         # Build the new file path
22 |         new_path = os.path.join(folder_path, new_name)
23 | 
24 |         # Rename the file
25 |         os.rename(old_path, new_path)
26 | 
27 |         print(f"Renamed file: {file_name} -> {new_name}")
28 | 
29 | if __name__ == "__main__":
30 |     # Path to the image folder
31 |     images_folder_path = "train_images"
32 |     # images_folder_path = "test_images"
33 | 
34 |     # Call the function to perform the renaming
35 |     rename_images(images_folder_path)
36 | 
-------------------------------------------------------------------------------- /data_old/output/vocab.json: --------------------------------------------------------------------------------
1 | {"wearing": 4, "tank": 5, "pants,": 6, "hands": 7, "glasses": 8, "socks": 9, "stand": 10, "other,": 11, "patterns.": 12, "sweater": 13, "shorts,": 14, "plaid": 15, "color.": 16, "knitting": 17, "trousers": 18, "pure": 19, "striped": 20, "complicated": 21, "lattice.": 22, "ring": 23, "sleeves,": 24, "her": 25, "neckline.": 26, "wrist.": 27, "denim,": 28, "an": 29, "gentleman": 30, "stand.": 31, "solid": 32, "graphic.": 33, "square.": 34, "ring.": 35, "long-sleeve": 36, "medium-sleeve": 37, "v-shape.": 38, "head.": 39, "upper": 40, "has": 41, "of": 42, "off": 43, "pants.": 44, "sleeves": 45, "socks.": 46, "pattern": 47, "cotton.": 48, "the": 49, "neckline": 50, "graphic": 51, "skirt": 52, "patterns": 53, "is": 54, "belt": 55, "wears": 56, "trousers,": 57, "its": 58, "cotton": 59, "floral.": 60, "color": 61, "sunglasses.": 62, "guy": 63, "stripe": 64, "belt.": 65, "with": 66, "three-quarter": 67, "long": 68, "a": 69, "shorts.": 70, "shirt": 71, "hat": 72, "shorts": 73, "round.": 74, "suspenders": 75, "floral": 76, "lapel": 77, "hat.": 78, "top": 79, "chiffon": 80, "neck.": 81, "woman": 82, "trousers.": 83, "leather": 84, "striped.": 85, "knitting,": 86, "mixed": 87, "female": 88, "no": 89, "clothes.": 90, "t-shirt": 91, "it": 92, "are": 93, "cotton,": 94, "lady": 95, "suspenders.": 96, "waist.": 97, "neckwear.": 98, "lattice": 99, "v-shape": 100, "three-point": 101, "crew": 102, "in": 103, "other.": 104, "accessory": 105, "chiffon.": 106, "and": 107, "round": 108, "square": 109, "denim": 110, "also": 111, "outer": 112, "length.": 113, "pants": 114, "short-sleeve": 115, "short": 116, "off,": 117, "there": 118, "person": 119, "block": 120, "fabric.": 121, "clothing": 122, "plaid.": 123, "stripe.": 124, "or": 125, "crew.": 126, "chiffon,": 127, "knitting.": 128, "leggings.": 129, "lapel.": 130, "fabric": 131, "on": 132, "skirt.": 133, "this": 134, "shoes.": 135, "clothing,": 136, "his": 137, "pair": 138, "finger.": 139, "sleeveless": 140, "man": 141, "furry": 142, "block.": 143, "skirt,": 144, "cut": 145, "other": 146, "lower": 147, "medium": 148, "": 0, "": 1, "": 2, "": 3}
-------------------------------------------------------------------------------- /doc/NNDL图像_描述指南.pdf: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL图像_描述指南.pdf -------------------------------------------------------------------------------- /doc/NNDL课设_中期报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL课设_中期报告.pdf -------------------------------------------------------------------------------- /doc/NNDL课设_开题报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL课设_开题报告.pdf -------------------------------------------------------------------------------- /doc/NNDL课设_结题报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL课设_结题报告.pdf -------------------------------------------------------------------------------- /doc/NNDL课设_要求说明.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL课设_要求说明.pdf -------------------------------------------------------------------------------- /doc/img/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/01.png -------------------------------------------------------------------------------- /doc/img/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/02.png -------------------------------------------------------------------------------- /doc/img/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/03.png -------------------------------------------------------------------------------- /doc/img/AttentionModel-first_train-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel-first_train-1.png -------------------------------------------------------------------------------- /doc/img/AttentionModel-first_train-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel-first_train-2.png -------------------------------------------------------------------------------- /doc/img/AttentionModel-first_train-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel-first_train-3.png -------------------------------------------------------------------------------- /doc/img/AttentionModel-first_train-4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel-first_train-4.png -------------------------------------------------------------------------------- /doc/img/AttentionModel_backgroundcaption.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel_backgroundcaption.png -------------------------------------------------------------------------------- /doc/img/BLIP_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_1.png -------------------------------------------------------------------------------- /doc/img/BLIP_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_2.png -------------------------------------------------------------------------------- /doc/img/BLIP_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_3.png -------------------------------------------------------------------------------- /doc/img/BLIP_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_demo.png -------------------------------------------------------------------------------- /doc/img/BLIP_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_full.png -------------------------------------------------------------------------------- /doc/img/CNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/CNN.png -------------------------------------------------------------------------------- /doc/img/Ex_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Ex_data.png -------------------------------------------------------------------------------- /doc/img/OriginalModel-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/OriginalModel-1.png -------------------------------------------------------------------------------- /doc/img/OriginalModel-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/OriginalModel-2.png -------------------------------------------------------------------------------- /doc/img/Out_of_Memory.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Out_of_Memory.png -------------------------------------------------------------------------------- /doc/img/RNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/RNN.png -------------------------------------------------------------------------------- /doc/img/Transformer_demo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_demo1.png -------------------------------------------------------------------------------- /doc/img/Transformer_demo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_demo2.png -------------------------------------------------------------------------------- /doc/img/Transformer_demo3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_demo3.png -------------------------------------------------------------------------------- /doc/img/Transformer_demo4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_demo4.png -------------------------------------------------------------------------------- /doc/img/Transformer_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_framework.png -------------------------------------------------------------------------------- /doc/img/image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/image-1.png -------------------------------------------------------------------------------- /doc/img/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/image.png -------------------------------------------------------------------------------- /doc/img/image20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/image20.png --------------------------------------------------------------------------------