├── .gitignore ├── Ex1_BLIP ├── README.md ├── Salesforce │ └── blip-image-captioning-base │ │ ├── config.json │ │ ├── preprocessor_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.json │ │ ├── tokenizer_config.json │ │ └── vocab.txt ├── requirements.txt ├── run_fulldata_script.py └── run_script.py ├── Ex2_RL-Loss ├── Model1_RL-Loss.py ├── Model2_RL-Loss.py └── Pure_RL-Loss.py ├── LICENSE ├── Model1_YellowOrange ├── README.md ├── SelfAttention+Attention.ipynb ├── __pycache__ │ ├── configuartions.cpython-310.pyc │ ├── datasets.cpython-310.pyc │ └── models.cpython-310.pyc ├── configuartions.py ├── datasets.py ├── datasets_pretrain_demo.py ├── models.py ├── predict.py ├── requirements.txt └── train.py ├── Model2_Transformer ├── TransformerE+D.ipynb ├── __pycache__ │ └── configuration.cpython-310.pyc ├── configuration.py ├── data_preprocessing │ ├── divide_dataset.py │ └── name_info.py ├── datasets │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── coco.cpython-310.pyc │ │ └── utils.cpython-310.pyc │ ├── coco.py │ └── utils.py ├── engine.py ├── fulldata_inference.py ├── local_inference.py ├── models │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-310.pyc │ │ ├── alice.cpython-310.pyc │ │ ├── backbone.cpython-310.pyc │ │ ├── caption.cpython-310.pyc │ │ ├── position_encoding.cpython-310.pyc │ │ ├── transformer.cpython-310.pyc │ │ └── utils.cpython-310.pyc │ ├── alice.py │ ├── backbone.py │ ├── caption.py │ ├── position_encoding.py │ ├── transformer.py │ └── utils.py ├── online_inference.py ├── requirements.txt ├── train_coco.py └── train_dev.py ├── Original_Model ├── __pycache__ │ ├── configurations.cpython-310.pyc │ ├── datasets.cpython-310.pyc │ └── models.cpython-310.pyc ├── configurations.py ├── datasets.py ├── gridattn_image_caption.ipynb ├── models.py ├── predict.py └── train.py ├── README.md ├── data_new ├── BLIP_test_captions.json ├── BLIP_train_captions.json ├── Model2_test_captions.json ├── Model2_train_captions_1.json ├── output │ ├── caplens_test.json │ ├── caplens_train.json │ ├── encoded_captions_test.json │ ├── encoded_captions_train.json │ ├── image_paths_test.json │ ├── image_paths_train.json │ └── vocab.json └── rename_script.py ├── data_old ├── label.json ├── output │ ├── caplens_test.json │ ├── caplens_train.json │ ├── encoded_captions_test.json │ ├── encoded_captions_train.json │ ├── image_paths_test.json │ ├── image_paths_train.json │ └── vocab.json ├── test_captions.json └── train_captions.json └── doc ├── NNDL图像_描述指南.pdf ├── NNDL课设_中期报告.pdf ├── NNDL课设_开题报告.pdf ├── NNDL课设_结题报告.ipynb ├── NNDL课设_结题报告.md ├── NNDL课设_结题报告.pdf ├── NNDL课设_要求说明.pdf └── img ├── 01.png ├── 02.png ├── 03.png ├── AttentionModel-first_train-1.png ├── AttentionModel-first_train-2.png ├── AttentionModel-first_train-3.png ├── AttentionModel-first_train-4.png ├── AttentionModel_backgroundcaption.png ├── BLIP_1.png ├── BLIP_2.png ├── BLIP_3.png ├── BLIP_demo.png ├── BLIP_full.png ├── CNN.png ├── Ex_data.png ├── OriginalModel-1.png ├── OriginalModel-2.png ├── Out_of_Memory.png ├── RNN.png ├── Transformer_demo1.png ├── Transformer_demo2.png ├── Transformer_demo3.png ├── Transformer_demo4.png ├── Transformer_framework.png ├── image-1.png ├── image.png └── image20.png /.gitignore: -------------------------------------------------------------------------------- 1 | data_old/test_images/ 2 | data_old/train_images/ 3 | data_new/test_images/ 4 | data_new/train_images_1/ 5 | data_new/train_images_2/ 6 | data_new/output/weights/ 7 | 
Ex1_BLIP/Salesforce/blip-image-captioning-base/pytorch_model.bin 8 | Model2_Transformer/image_caption_model.pth -------------------------------------------------------------------------------- /Ex1_BLIP/README.md: -------------------------------------------------------------------------------- 1 | 一键运行方法: 2 | 3 | 1. `pip install -r requirements.txt` 4 | 2. `python run_half_precision.py` 5 | 3. 查看输出 6 | 7 | > 🤗写的文档能让80岁老太太看懂是我们的目标 -------------------------------------------------------------------------------- /Ex1_BLIP/Salesforce/blip-image-captioning-base/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_commit_hash": null, 3 | "architectures": [ 4 | "BlipForConditionalGeneration" 5 | ], 6 | "image_text_hidden_size": 256, 7 | "initializer_factor": 1.0, 8 | "logit_scale_init_value": 2.6592, 9 | "model_type": "blip", 10 | "projection_dim": 512, 11 | "text_config": { 12 | "_name_or_path": "", 13 | "add_cross_attention": false, 14 | "architectures": null, 15 | "attention_probs_dropout_prob": 0.0, 16 | "bad_words_ids": null, 17 | "begin_suppress_tokens": null, 18 | "bos_token_id": 30522, 19 | "chunk_size_feed_forward": 0, 20 | "cross_attention_hidden_size": null, 21 | "decoder_start_token_id": null, 22 | "diversity_penalty": 0.0, 23 | "do_sample": false, 24 | "early_stopping": false, 25 | "encoder_no_repeat_ngram_size": 0, 26 | "eos_token_id": 2, 27 | "exponential_decay_length_penalty": null, 28 | "finetuning_task": null, 29 | "forced_bos_token_id": null, 30 | "forced_eos_token_id": null, 31 | "hidden_act": "gelu", 32 | "hidden_dropout_prob": 0.0, 33 | "hidden_size": 768, 34 | "id2label": { 35 | "0": "LABEL_0", 36 | "1": "LABEL_1" 37 | }, 38 | "initializer_factor": 1.0, 39 | "initializer_range": 0.02, 40 | "intermediate_size": 3072, 41 | "is_decoder": true, 42 | "is_encoder_decoder": false, 43 | "label2id": { 44 | "LABEL_0": 0, 45 | "LABEL_1": 1 46 | }, 47 | "layer_norm_eps": 1e-12, 48 | "length_penalty": 1.0, 49 | "max_length": 24, 50 | "max_position_embeddings": 512, 51 | "min_length": 0, 52 | "model_type": "blip_text_model", 53 | "no_repeat_ngram_size": 0, 54 | "num_attention_heads": 12, 55 | "num_beam_groups": 1, 56 | "num_beams": 1, 57 | "num_hidden_layers": 12, 58 | "num_return_sequences": 1, 59 | "output_attentions": false, 60 | "output_hidden_states": false, 61 | "output_scores": false, 62 | "pad_token_id": 0, 63 | "prefix": null, 64 | "problem_type": null, 65 | "projection_dim": 768, 66 | "pruned_heads": {}, 67 | "remove_invalid_values": false, 68 | "repetition_penalty": 1.0, 69 | "return_dict": true, 70 | "return_dict_in_generate": false, 71 | "sep_token_id": 102, 72 | "suppress_tokens": null, 73 | "task_specific_params": null, 74 | "temperature": 1.0, 75 | "tf_legacy_loss": false, 76 | "tie_encoder_decoder": false, 77 | "tie_word_embeddings": true, 78 | "tokenizer_class": null, 79 | "top_k": 50, 80 | "top_p": 1.0, 81 | "torch_dtype": null, 82 | "torchscript": false, 83 | "transformers_version": "4.26.0.dev0", 84 | "typical_p": 1.0, 85 | "use_bfloat16": false, 86 | "use_cache": true, 87 | "vocab_size": 30524 88 | }, 89 | "torch_dtype": "float32", 90 | "transformers_version": null, 91 | "vision_config": { 92 | "_name_or_path": "", 93 | "add_cross_attention": false, 94 | "architectures": null, 95 | "attention_dropout": 0.0, 96 | "bad_words_ids": null, 97 | "begin_suppress_tokens": null, 98 | "bos_token_id": null, 99 | "chunk_size_feed_forward": 0, 100 | "cross_attention_hidden_size": null, 101 | "decoder_start_token_id": null, 102 
| "diversity_penalty": 0.0, 103 | "do_sample": false, 104 | "dropout": 0.0, 105 | "early_stopping": false, 106 | "encoder_no_repeat_ngram_size": 0, 107 | "eos_token_id": null, 108 | "exponential_decay_length_penalty": null, 109 | "finetuning_task": null, 110 | "forced_bos_token_id": null, 111 | "forced_eos_token_id": null, 112 | "hidden_act": "gelu", 113 | "hidden_size": 768, 114 | "id2label": { 115 | "0": "LABEL_0", 116 | "1": "LABEL_1" 117 | }, 118 | "image_size": 384, 119 | "initializer_factor": 1.0, 120 | "initializer_range": 0.02, 121 | "intermediate_size": 3072, 122 | "is_decoder": false, 123 | "is_encoder_decoder": false, 124 | "label2id": { 125 | "LABEL_0": 0, 126 | "LABEL_1": 1 127 | }, 128 | "layer_norm_eps": 1e-05, 129 | "length_penalty": 1.0, 130 | "max_length": 20, 131 | "min_length": 0, 132 | "model_type": "blip_vision_model", 133 | "no_repeat_ngram_size": 0, 134 | "num_attention_heads": 12, 135 | "num_beam_groups": 1, 136 | "num_beams": 1, 137 | "num_channels": 3, 138 | "num_hidden_layers": 12, 139 | "num_return_sequences": 1, 140 | "output_attentions": false, 141 | "output_hidden_states": false, 142 | "output_scores": false, 143 | "pad_token_id": null, 144 | "patch_size": 16, 145 | "prefix": null, 146 | "problem_type": null, 147 | "projection_dim": 512, 148 | "pruned_heads": {}, 149 | "remove_invalid_values": false, 150 | "repetition_penalty": 1.0, 151 | "return_dict": true, 152 | "return_dict_in_generate": false, 153 | "sep_token_id": null, 154 | "suppress_tokens": null, 155 | "task_specific_params": null, 156 | "temperature": 1.0, 157 | "tf_legacy_loss": false, 158 | "tie_encoder_decoder": false, 159 | "tie_word_embeddings": true, 160 | "tokenizer_class": null, 161 | "top_k": 50, 162 | "top_p": 1.0, 163 | "torch_dtype": null, 164 | "torchscript": false, 165 | "transformers_version": "4.26.0.dev0", 166 | "typical_p": 1.0, 167 | "use_bfloat16": false 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /Ex1_BLIP/Salesforce/blip-image-captioning-base/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "do_resize": true, 4 | "image_mean": [ 5 | 0.48145466, 6 | 0.4578275, 7 | 0.40821073 8 | ], 9 | "image_processor_type": "BlipImageProcessor", 10 | "image_std": [ 11 | 0.26862954, 12 | 0.26130258, 13 | 0.27577711 14 | ], 15 | "processor_class": "BlipProcessor", 16 | "size": 384 17 | } 18 | -------------------------------------------------------------------------------- /Ex1_BLIP/Salesforce/blip-image-captioning-base/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /Ex1_BLIP/Salesforce/blip-image-captioning-base/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_basic_tokenize": true, 4 | "do_lower_case": true, 5 | "mask_token": "[MASK]", 6 | "model_max_length": 512, 7 | "name_or_path": "bert-base-uncased", 8 | "never_split": null, 9 | "pad_token": "[PAD]", 10 | "processor_class": "BlipProcessor", 11 | "sep_token": "[SEP]", 12 | "special_tokens_map_file": null, 13 | "strip_accents": null, 14 | "tokenize_chinese_chars": true, 15 | "tokenizer_class": "BertTokenizer", 16 | 
"unk_token": "[UNK]", 17 | "model_input_names": [ 18 | "input_ids", 19 | "attention_mask" 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /Ex1_BLIP/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Ex1_BLIP/requirements.txt -------------------------------------------------------------------------------- /Ex1_BLIP/run_fulldata_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | from PIL import Image 5 | from transformers import BlipProcessor, BlipForConditionalGeneration 6 | 7 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 8 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to("cuda") 9 | 10 | # Path to the folder containing images 11 | images_folder = "../Ex2_data/train_images" 12 | 13 | # Create a dictionary to store image captions 14 | captions_dict = {} 15 | 16 | count = 1 17 | 18 | # Loop through each file in the images folder 19 | for filename in os.listdir(images_folder): 20 | if filename.endswith(".jpg") or filename.endswith(".png"): # Add other supported image formats if necessary 21 | img_path = os.path.join(images_folder, filename) 22 | 23 | # Load the image 24 | raw_image = Image.open(img_path).convert('RGB') 25 | 26 | # Your text for captioning 27 | text = "The background content in the picture is" 28 | 29 | # Process the image and generate caption 30 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16) 31 | 32 | out = model.generate(**inputs) 33 | generated_caption = processor.decode(out[0], skip_special_tokens=True) 34 | 35 | print(f"No{count}", generated_caption) 36 | count += 1 37 | 38 | # Store the caption in the dictionary 39 | captions_dict[img_path] = generated_caption 40 | 41 | # Save the dictionary to captions.json 42 | output_path = "../Ex2_data/Ex2_background_train_captions.json" 43 | with open(output_path, 'w') as json_file: 44 | json.dump(captions_dict, json_file, indent=4) 45 | 46 | print(f"Captions saved to {output_path}") 47 | -------------------------------------------------------------------------------- /Ex1_BLIP/run_script.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import requests 4 | import torch 5 | from PIL import Image 6 | from transformers import BlipProcessor, BlipForConditionalGeneration 7 | from nltk.tokenize import word_tokenize 8 | from nltk.translate.meteor_score import single_meteor_score 9 | from rouge import Rouge 10 | 11 | def print_line(): 12 | print("============================================================================================================") 13 | 14 | # 计算 METEOR 分数 15 | def calc_meteor(reference, hypothesis): 16 | hypothesis = word_tokenize(hypothesis) 17 | reference = word_tokenize(reference) 18 | return single_meteor_score(reference, hypothesis) 19 | 20 | # 计算 ROUGE-L 分数 21 | def calc_rouge_l(reference, hypothesis): 22 | rouge = Rouge() 23 | scores = rouge.get_scores(hypothesis, reference) 24 | return scores[0]['rouge-l']['f'] 25 | 26 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 27 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", 
torch_dtype=torch.float16).to("cuda") 28 | 29 | # img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' # 图片URL版本 30 | # raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') # 图片URL版本 31 | 32 | img_local_url = "../data_new/test_images/test_8.jpg" # 本地图片版本 33 | raw_image = Image.open(img_local_url).convert('RGB') # 本地图片版本 34 | 35 | # Conditional image captioning 36 | text = "The background content in the picture is" 37 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16) 38 | 39 | out = model.generate(**inputs) 40 | generated_caption = processor.decode(out[0], skip_special_tokens=True) 41 | print_line() 42 | print("背景描述:", generated_caption) 43 | # print_line() 44 | 45 | # Unconditional image captioning 46 | # inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16) 47 | 48 | # out = model.generate(**inputs) 49 | # generated_caption_unconditional = processor.decode(out[0], skip_special_tokens=True) 50 | # print_line() 51 | # print("图像描述:", generated_caption_unconditional) 52 | # print_line() 53 | 54 | # # 加入评估指标计算 55 | # with open('../data/test_captions.json', 'r') as f: 56 | # captions = json.load(f) 57 | 58 | # # filename = os.path.basename(img_local_url) 59 | # filename = os.path.basename(img_url) 60 | # reference_description = captions.get(filename, "No description found.") 61 | 62 | # print("Predict Caption = ", generated_caption.capitalize()) 63 | # print("Reference Caption = ", reference_description.capitalize()) 64 | 65 | # meteor_score = calc_meteor(reference_description, generated_caption) 66 | # rouge_l_score = calc_rouge_l(reference_description, generated_caption) 67 | # print_line() 68 | # print("METEOR Score =", round(meteor_score, 4)) 69 | # print("ROUGE-L Score =", round(rouge_l_score, 4)) 70 | # print_line() 71 | -------------------------------------------------------------------------------- /Ex2_RL-Loss/Model1_RL-Loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | 我们使用强化学习损失函数,将交叉熵损失和CIDEr-D评价指标结合,优化损失函数。 3 | 我们将使用REINFORCE算法来进行更新。 4 | """ 5 | import json 6 | import torch 7 | import os 8 | from configuartions import Config 9 | from models import AttentionModel, get_optimizer, PackedCrossEntropyLoss, evaluate_cider 10 | from datasets import create_dataloaders, ImageTextDataset 11 | from torch.distributions import Categorical 12 | 13 | 14 | def main(): 15 | best_test_score = float('-inf') # 初始化最佳测试得分 16 | 17 | # 加载配置 18 | config = Config() 19 | 20 | # 创建数据加载器 21 | train_loader, test_loader = create_dataloaders(config) 22 | 23 | # 加载词汇表文件 24 | with open('../data/output/vocab.json', 'r') as f: 25 | vocab = json.load(f) 26 | 27 | # 模型初始化 28 | model = AttentionModel( 29 | image_code_dim=config.image_code_dim, 30 | vocab=vocab, # 传递词汇表字典 31 | word_dim=config.word_dim, 32 | attention_dim=config.attention_dim, 33 | hidden_size=config.hidden_size, 34 | num_layers=config.num_layers 35 | ).to(config.device) 36 | 37 | # 优化器 38 | optimizer = get_optimizer(model, config) 39 | 40 | # 损失函数 41 | loss_fn = PackedCrossEntropyLoss().to(config.device) 42 | 43 | # 创建保存权重的文件夹路径 44 | weights_dir = os.path.join(config.output_folder, 'weights') 45 | os.makedirs(weights_dir, exist_ok=True) 46 | 47 | best_val_score = float('-inf') # 初始化最佳验证得分 48 | 49 | for epoch in range(config.num_epochsum_epochs): 50 | model.train() 51 | for i, (imgs, caps, caplens) in enumerate(train_dataloader): 52 | imgs, caps = imgs.to(device), caps.to(device) 53 | 
caplens = caplens.cpu().to(torch.int64) 54 | optimizer.zero_grad() 55 | outputs, alphas, _, _, softmax_probabilities = model(imgs, caps, caplens) 56 | current_test_score = evaluate_cider(test_loader, model, config) 57 | m = Categorical(torch.tensor(softmax_probabilities)) 58 | action = m.sample() 59 | log_probs = m.log_prob(action) 60 | reinforce_loss = -log_probs * float(current_test_score) 61 | reinforce_loss.mean().backward() 62 | optimizer.step() 63 | 64 | 65 | """ 66 | # 开始训练 67 | for epoch in range(config.num_epochs): 68 | # 训练模型 69 | model.train() 70 | for i, (imgs, caps, caplens) in enumerate(train_loader): 71 | imgs, caps = imgs.to(config.device), caps.to(config.device) 72 | caplens = caplens.cpu().to(torch.int64) 73 | 74 | optimizer.zero_grad() 75 | outputs, alphas, _, _, _ = model(imgs, caps, caplens) 76 | 77 | # 确保目标序列长度与模型输出匹配 78 | targets = caps[:, 1:] # 假设targets是captions去除第一个标记后的部分 79 | # print(f"Outputs shape: {outputs.shape}") 80 | # print(f"Targets shape: {targets.shape}") 81 | # print(f"Caplens: {caplens}") 82 | loss = loss_fn(outputs, targets, caplens) 83 | loss.backward() 84 | optimizer.step() 85 | 86 | # 打印/记录损失信息 87 | if (i + 1) % 100 == 0: 88 | print(f'Epoch [{epoch + 1}/{config.num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}') 89 | 90 | # 在每个epoch结束时使用测试集评估模型 91 | current_test_score = evaluate_cider(test_loader, model, config) 92 | print(f"Epoch {epoch + 1}: CIDEr-D score = {current_test_score}") 93 | 94 | # 如果当前得分比之前的最佳得分要好,则保存模型 95 | if current_test_score > best_test_score: 96 | best_test_score = current_test_score 97 | best_model_path = os.path.join(weights_dir, f'Attention_model_background_caption_{best_test_score}.pth') 98 | torch.save(model.state_dict(), best_model_path) 99 | print(f"Saved new best model to {best_model_path}") 100 | """ 101 | 102 | # 训练完成后的最终评估 103 | final_test_score = evaluate_cider(test_loader, model, config) 104 | print(f"Final CIDEr-D score = {final_test_score}") 105 | 106 | # # 训练完成后保存模型 107 | # final_model_path = os.path.join(weights_dir, 'AttentionModel.pth') 108 | # torch.save(model.state_dict(), final_model_path) 109 | # print(f"Saved final model to {final_model_path}") 110 | 111 | 112 | if __name__ == '__main__': 113 | main() 114 | 115 | 116 | -------------------------------------------------------------------------------- /Ex2_RL-Loss/Model2_RL-Loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | 必须要指出的是,图像描述的任务,这是一个序列生成任务,而不是一个强化学习任务。 3 | 在这种情况下,使用强化学习可能并不是最好的选择,因为定义出合适的奖励函数可能会非常困难。 4 | """ 5 | import json 6 | import torch 7 | from torch.utils.data import Dataset, DataLoader 8 | from torchvision import transforms 9 | from transformers import BertTokenizer 10 | from PIL import Image 11 | from configuration import Config 12 | 13 | # 数据集类 14 | class MyDataset(Dataset): 15 | def __init__(self, json_file, img_dir, transform=None): 16 | with open(json_file, 'r') as f: 17 | self.data = json.load(f) 18 | self.img_dir = img_dir 19 | self.transform = transform 20 | self.filenames = list(self.data.keys()) 21 | 22 | def __len__(self): 23 | return len(self.data) 24 | 25 | def __getitem__(self, idx): 26 | filename = self.filenames[idx] 27 | caption = self.data[filename] 28 | image = Image.open(f"{self.img_dir}/{filename}") 29 | if self.transform: 30 | image = self.transform(image) 31 | return image, caption 32 | 33 | # 检查是否有可用的GPU 34 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 35 | 36 | # 加载模型 37 | config = Config() 38 | model = 
torch.hub.load('saahiluppal/catr', 'v3', pretrained=True) 39 | model = model.to(device) # 将模型移动到指定的设备上 40 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 41 | 42 | # 准备数据集 43 | transform = transforms.Compose([ 44 | transforms.ToTensor(), 45 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 46 | ]) 47 | 48 | train_dataset = MyDataset('../data/train_captions.json', '../data/train_images', transform=transform) 49 | train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True) 50 | 51 | # 定义损失函数和优化器 52 | criterion = torch.nn.CrossEntropyLoss() 53 | optimizer = torch.optim.Adam(model.parameters()) 54 | 55 | # 定义奖励函数 56 | def reward_function(predictions, targets): 57 | # 这只是一个示例,你需要根据你的任务定义合适的奖励函数 58 | return (predictions == targets).float() 59 | 60 | # 定义策略梯度更新函数 61 | def policy_gradient_update(model, images, captions, optimizer): 62 | outputs = model(images, captions['input_ids'], captions['attention_mask']) 63 | rewards = reward_function(outputs.logits.argmax(-1), captions['input_ids']) 64 | action_probs = outputs.logits.softmax(-1) 65 | picked_action_probs = action_probs.gather(-1, captions['input_ids'].unsqueeze(-1)).squeeze(-1) 66 | loss = (-torch.log(picked_action_probs) * rewards).mean() 67 | 68 | optimizer.zero_grad() 69 | loss.backward() 70 | optimizer.step() 71 | 72 | return loss 73 | 74 | # 训练循环 75 | num_epochs = 10 76 | for epoch in range(num_epochs): 77 | for images, captions in train_dataloader: 78 | images = images.to(device) # 将图像数据移动到指定的设备上 79 | captions = tokenizer(captions, return_tensors='pt', padding=True, truncation=True) 80 | captions = {key: val.to(device) for key, val in captions.items()} # 将caption数据移动到指定的设备上 81 | 82 | loss = policy_gradient_update(model, images, captions, optimizer) 83 | 84 | print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}') 85 | 86 | # 保存模型 87 | torch.save(model.state_dict(), 'Model2.pth') -------------------------------------------------------------------------------- /Ex2_RL-Loss/Pure_RL-Loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | 【背景及使用原因】 3 | 在深度学习中,常常通过最小化交叉熵损失来训练模型,而模型的好坏则由某种评测指标来衡量。 4 | 这种情况下,交叉熵损失可以看作是默认的训练目标,而评测指标是我们真正关心的指标。 5 | 但很多时候,优化交叉熵损失并不一定能直接优化我们关心的评测指标。 6 | 比如在分类任务中,交叉熵损失会关注每个类别是否正确预测,与我们的实际目标,比如整体预测准确率,可能不一致。 7 | 这就是所谓的默认实现的交叉熵损失和评测指标不一致情况。 8 | 9 | 在这种情况下,可以使用基于强化学习的方法来设定损失函数,使之直接优化我们关心的指标。 10 | 比方说,对于策略梯度方法而言,构造奖励函数以及策略网络,通过互动得到的奖励来更新策略网络,奖励函数就是评测指标。 11 | 12 | 举一个简单的例子,如果我们的评测指标是准确率,那么每次预测对我们就给予+1的奖励,预测错我们就不给奖励。 13 | 我们的策略网络就是我们的预测模型,输出的就是预测结果。 14 | 然后我们利用策略梯度方法,不断通过互动得到的奖励来更新我们的预测模型,使之更好地优化我们关心的指标。 15 | """ 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.optim as optim 20 | 21 | class Model(nn.Module): 22 | def __init__(self, input_size, output_size): 23 | super(Model, self).__init__() 24 | self.linear = nn.Linear(input_size, output_size) 25 | 26 | def forward(self, x): 27 | return torch.softmax(self.linear(x), dim=-1) 28 | 29 | def policy_gradient_update(model, states, actions, rewards, optimizer): 30 | # 获取模型预测的动作概率 31 | action_probs = model(states) 32 | picked_action_probs = action_probs.gather(1, actions.unsqueeze(1)).squeeze(1) 33 | 34 | # 根据公式计算损失 35 | loss = (-torch.log(picked_action_probs) * rewards).mean() 36 | 37 | optimizer.zero_grad() 38 | loss.backward() 39 | optimizer.step() 40 | 41 | model = Model(input_size=10, output_size=2) 42 | optimizer = optim.Adam(model.parameters(), lr=0.01) 43 | 44 | states = torch.randn(100, 10) 45 | actions = torch.randint(0, 2, (100,)) 46 | 
rewards = torch.randn(100) 47 | 48 | policy_gradient_update(model, states, actions, rewards, optimizer) 49 | 50 | """ 51 | 如何将强化学习损失函数放进train代码中呢: 52 | # 示例的训练过程 53 | for epoch in range(num_epochs): 54 | # 对于每个批次的数据 55 | for batch_data in data_loader: 56 | # 从批次数据中获取输入,动作和奖励 57 | states, actions, rewards = batch_data 58 | # 使用强化学习损失函数更新模型 59 | policy_gradient_update(model, states, actions, rewards, optimizer) 60 | 61 | # 验证或测试过程 62 | for batch_data in validation_data_loader: 63 | # 从批次数据中获取输入 64 | states = batch_data 65 | # 用模型对输入进行预测 66 | action_probabilities = model(states) 67 | # 根据需求评估或使用预测结果 68 | ... 69 | """ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 GongYufei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Model1_YellowOrange/README.md: -------------------------------------------------------------------------------- 1 | # 模型训练流程进度 2 | 3 | - 数据集预处理:completed 4 | 5 | - 定义模型:completed 6 | 7 | - 定义损失函数:completed 8 | 9 | - 选择优化方法:completed 10 | 11 | - 评估指标:completed 12 | 13 | - 训练模型:todo 14 | -------------------------------------------------------------------------------- /Model1_YellowOrange/__pycache__/configuartions.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model1_YellowOrange/__pycache__/configuartions.cpython-310.pyc -------------------------------------------------------------------------------- /Model1_YellowOrange/__pycache__/datasets.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model1_YellowOrange/__pycache__/datasets.cpython-310.pyc -------------------------------------------------------------------------------- /Model1_YellowOrange/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model1_YellowOrange/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /Model1_YellowOrange/configuartions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | class Config: 3 | # 数据路径 4 | data_path = '../data/' 5 | images_path = '../data/images/' 6 | train_captions_path = '../data/train_captions.json' 7 | test_captions_path = '../data/test_captions.json' 8 | output_folder = '../data/output/' # 输出文件夹的路径,用于存储词汇表和处理后的数据 9 | 10 | # 模型参数 11 | embed_size = 256 12 | vocab_size = 10000 # 根据实际情况调整 13 | num_layers = 3 # 定义循环神经网络(RNN)或其变体(如 LSTM 或 GRU)中的层数。 14 | num_heads = 8 15 | dropout = 0.1 16 | hidden_size = 512 17 | image_code_dim = 2048 # 图像编码维度 18 | word_dim = 256 # 词嵌入维度 19 | attention_dim = 512 # 注意力机制的隐藏层维度 20 | 21 | # 数据处理参数 22 | min_word_count = 5 # 词汇表中词的最小出现次数 23 | max_len = 64 # 假设描述的最大长度为200个词 24 | 25 | # 训练参数 26 | # batch_size = 16 27 | batch_size = 4 28 | learning_rate = 0.001 29 | # num_epochs = 30 30 | num_epochs = 10 31 | workers = 0 # 工作线程数,在自己的电脑上训练的时候设为0 32 | encoder_learning_rate = 1e-4 # 编码器的学习率 33 | decoder_learning_rate = 1e-3 # 解码器的学习率 34 | lr_update = 10 # 每10轮降低学习速率 35 | 36 | # 图像预处理参数 37 | image_size = 256 # 图像缩放大小 38 | crop_size = 224 # 图像裁剪大小 39 | 40 | # Beam Search 参数 41 | beam_k = 5 42 | 43 | # 其他配置 44 | device = 'cuda' if torch.cuda.is_available() else 'cpu' -------------------------------------------------------------------------------- /Model1_YellowOrange/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from collections import Counter 4 | import torch 5 | from PIL import Image 6 | from torch.utils.data import Dataset 7 | from torch.utils.data import DataLoader 8 | import torchvision.transforms as transforms 9 | from configuartions import Config # 导入配置类 10 | 11 | 12 | # 从配置文件获取配置 13 | config = Config() 14 | 15 | 16 | def create_dataset(max_len=64): 17 | """ 18 | 整理数据集,构建词汇表,并将文本描述转换为词索引向量。 19 | 使用configuration.py文件中定义的配置信息。 20 | 
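    Artifacts written to config.output_folder: vocab.json, encoded_captions_train.json,
    encoded_captions_test.json, image_paths_train.json, image_paths_test.json,
    caplens_train.json and caplens_test.json. Indices 0-3 of the vocabulary are
    reserved for the padding/start/end/unknown special tokens.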
""" 21 | # 使用config中定义的路径 22 | image_folder = config.images_path 23 | train_captions_path = config.train_captions_path 24 | test_captions_path = config.test_captions_path 25 | output_folder = config.output_folder 26 | 27 | # 读取训练图像描述 28 | with open(train_captions_path, 'r') as f: 29 | train_captions_data = json.load(f) 30 | 31 | # 读取测试图像描述 32 | with open(test_captions_path, 'r') as f: 33 | test_captions_data = json.load(f) 34 | 35 | # 统计训练集的文本描述的词频 36 | vocab = Counter() 37 | for caption in train_captions_data.values(): 38 | vocab.update(caption.lower().split()) 39 | 40 | # 移除其中的低频词 41 | vocab = {word for word, count in vocab.items() if count >= config.min_word_count} 42 | 43 | # 构建词典 44 | word_to_idx = {word: idx + 4 for idx, word in enumerate(vocab)} 45 | word_to_idx[''] = 0 46 | word_to_idx[''] = 1 47 | word_to_idx[''] = 2 48 | word_to_idx[''] = 3 49 | 50 | # 一个函数来转换描述为词索引向量,并进行填充 51 | def encode_captions(captions_data, word_to_idx, max_len): 52 | encoded_captions = {} 53 | caplens = {} 54 | for img_id, caption in captions_data.items(): 55 | words = caption.lower().split() 56 | encoded_caption = [word_to_idx.get(word, word_to_idx['']) for word in words] 57 | # 加2是因为要加上,但最终caplen应该减去1 58 | caplen = min(len(encoded_caption) + 2, max_len) - 1 59 | encoded_caption = [word_to_idx['']] + encoded_caption + [word_to_idx['']] 60 | encoded_caption += [word_to_idx['']] * (max_len - len(encoded_caption)) 61 | encoded_captions[img_id] = encoded_caption[:max_len] 62 | caplens[img_id] = caplen # if caplen <= max_len else max_len 63 | return encoded_captions, caplens 64 | # def encode_captions(captions_data, word_to_idx, max_len): 65 | # encoded_captions = {} 66 | # for img_id, caption in captions_data.items(): 67 | # words = caption.lower().split() 68 | # encoded_caption = [word_to_idx.get(word, word_to_idx['']) for word in words] 69 | # encoded_caption = [word_to_idx['']] + encoded_caption + [word_to_idx['']] 70 | # encoded_caption += [word_to_idx['']] * (max_len - len(encoded_caption)) 71 | # encoded_captions[img_id] = encoded_caption[:max_len] 72 | # return encoded_captions 73 | 74 | # 对训练集描述进行编码 75 | encoded_captions_train, caplens_train = encode_captions(train_captions_data, word_to_idx, max_len) 76 | 77 | # 对测试集描述进行编码 78 | encoded_captions_test, caplens_test = encode_captions(test_captions_data, word_to_idx, max_len) 79 | 80 | # 存储词典和编码后的描述 81 | with open(os.path.join(output_folder, 'vocab.json'), 'w') as f: 82 | json.dump(word_to_idx, f) 83 | 84 | with open(os.path.join(output_folder, 'encoded_captions_train.json'), 'w') as f: 85 | json.dump(encoded_captions_train, f) 86 | 87 | with open(os.path.join(output_folder, 'encoded_captions_test.json'), 'w') as f: 88 | json.dump(encoded_captions_test, f) 89 | 90 | # 存储图像路径 91 | image_paths_train = {img_id: os.path.join(image_folder, img_id) for img_id in train_captions_data.keys()} 92 | with open(os.path.join(output_folder, 'image_paths_train.json'), 'w') as f: 93 | json.dump(image_paths_train, f) 94 | 95 | image_paths_test = {img_id: os.path.join(image_folder, img_id) for img_id in test_captions_data.keys()} 96 | with open(os.path.join(output_folder, 'image_paths_test.json'), 'w') as f: 97 | json.dump(image_paths_test, f) 98 | 99 | # 存储caplens 100 | with open(os.path.join(output_folder, 'caplens_train.json'), 'w') as f: 101 | json.dump(caplens_train, f) 102 | 103 | with open(os.path.join(output_folder, 'caplens_test.json'), 'w') as f: 104 | json.dump(caplens_test, f) 105 | 106 | 107 | # 调用函数,整理数据集 108 | create_dataset() 109 | 110 | 111 | class 
ImageTextDataset(Dataset): 112 | """ 113 | PyTorch数据集类,用于加载和处理图像-文本数据。 114 | """ 115 | 116 | def __init__(self, image_paths_file, captions_file, caplens_file, transform=None): 117 | """ 118 | 初始化数据集类。 119 | 参数: 120 | image_paths_file: 包含图像路径的json文件路径。 121 | captions_file: 包含编码后文本描述的json文件路径。 122 | transform: 应用于图像的预处理转换。 123 | """ 124 | # 载入图像路径和文本描述以及caplens 125 | with open(image_paths_file, 'r') as f: 126 | self.image_paths = json.load(f) 127 | 128 | with open(captions_file, 'r') as f: 129 | self.captions = json.load(f) 130 | 131 | with open(caplens_file, 'r') as f: 132 | self.caplens = json.load(f) 133 | 134 | # 设置图像预处理方法 135 | self.transform = transform or transforms.Compose([ 136 | transforms.Resize((256, 256)), 137 | transforms.ToTensor(), 138 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 139 | ]) 140 | 141 | def __getitem__(self, index): 142 | """ 143 | 获取单个数据点。 144 | 参数: 145 | index: 数据点的索引。 146 | 返回: 147 | 一个包含图像和对应文本描述的元组。 148 | """ 149 | # 获取图像路径和文本描述以及caplen 150 | image_id = list(self.image_paths.keys())[index] 151 | image_path = self.image_paths[image_id] 152 | caption = self.captions[image_id] 153 | caplen = self.caplens[image_id] 154 | 155 | # 加载图像并应用预处理 156 | image = Image.open(image_path).convert('RGB') 157 | if self.transform is not None: 158 | image = self.transform(image) 159 | 160 | # 将文本描述转换为张量 161 | caption_tensor = torch.tensor(caption, dtype=torch.long) 162 | 163 | return image, caption_tensor, caplen 164 | 165 | def __len__(self): 166 | """ 167 | 数据集中的数据点总数。 168 | """ 169 | return len(self.image_paths) 170 | 171 | 172 | # 创建数据集实例 173 | # train_dataset = ImageTextDataset( 174 | # image_paths_file=os.path.join(config.output_folder, 'image_paths_train.json'), 175 | # captions_file=os.path.join(config.output_folder, 'encoded_captions_train.json'), 176 | # caplens_file=os.path.join(config.output_folder, 'caplens_train.json') 177 | # ) 178 | 179 | # # 示例:创建验证集实例 180 | # test_dataset = ImageTextDataset( 181 | # image_paths_file=os.path.join(config.output_folder, 'image_paths_test.json'), 182 | # captions_file=os.path.join(config.output_folder, 'encoded_captions_test.json'), 183 | # caplens_file=os.path.join(config.output_folder, 'caplens_test.json') 184 | # ) 185 | 186 | # 创建训练集和测试集的 DataLoader 187 | def create_dataloaders(config): 188 | """ 189 | 创建训练集和测试集的 DataLoader。 190 | 191 | 参数: 192 | batch_size: 每个批次的大小。 193 | num_workers: 加载数据时使用的进程数。 194 | shuffle_train: 是否打乱训练数据。 195 | 196 | 返回: 197 | train_loader: 训练数据的 DataLoader。 198 | test_loader: 测试数据的 DataLoader。 199 | """ 200 | # 图像预处理转换 201 | transform = transforms.Compose([ 202 | transforms.Resize((256, 256)), 203 | transforms.RandomCrop(224), 204 | transforms.ToTensor(), 205 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 206 | ]) 207 | 208 | # 加载数据时使用的进程数 209 | num_workers = 0 210 | 211 | # 创建数据集对象 212 | train_dataset = ImageTextDataset( 213 | image_paths_file=os.path.join(config.output_folder, 'image_paths_train.json'), 214 | captions_file=os.path.join(config.output_folder, 'encoded_captions_train.json'), 215 | caplens_file=os.path.join(config.output_folder, 'caplens_train.json'), 216 | transform=transform 217 | ) 218 | 219 | test_dataset = ImageTextDataset( 220 | image_paths_file=os.path.join(config.output_folder, 'image_paths_test.json'), 221 | captions_file=os.path.join(config.output_folder, 'encoded_captions_test.json'), 222 | caplens_file=os.path.join(config.output_folder, 'caplens_test.json'), 223 | transform=transform 224 | ) 225 | 226 | # 创建 
DataLoader 对象 227 | train_loader = DataLoader( 228 | dataset=train_dataset, 229 | batch_size=config.batch_size, 230 | shuffle=True, 231 | num_workers=num_workers, 232 | pin_memory=True 233 | ) 234 | 235 | test_loader = DataLoader( 236 | dataset=test_dataset, 237 | batch_size=config.batch_size, 238 | shuffle=False, # 通常测试集不需要打乱 239 | num_workers=num_workers, 240 | pin_memory=True 241 | ) 242 | 243 | return train_loader, test_loader 244 | 245 | 246 | config = Config() 247 | # 使用Config类中定义的配置来创建DataLoader 248 | train_loader, test_loader = create_dataloaders(config=config) 249 | 250 | 251 | # 测试 DataLoader 是否正确创建 252 | if __name__ == '__main__': 253 | for i, (images, captions, caplens) in enumerate(train_loader): 254 | print(f"Batch {i + 1}") 255 | print(f"Images shape: {images.size()}") 256 | print(f"Captions shape: {captions.size()}") 257 | if i == 1: # 仅打印前两个批次的信息 258 | break 259 | -------------------------------------------------------------------------------- /Model1_YellowOrange/datasets_pretrain_demo.py: -------------------------------------------------------------------------------- 1 | import json 2 | from PIL import Image 3 | import matplotlib.pyplot as plt 4 | 5 | vocab_path = '../data/output/vocab.json' 6 | encoded_captions_path = '../data/output/encoded_captions_train.json' 7 | image_paths_path = '../data/output/image_paths_train.json' 8 | 9 | # 读取词典、编码后的描述和图像路径 10 | with open(vocab_path, 'r') as f: 11 | vocab = json.load(f) 12 | 13 | with open(encoded_captions_path, 'r') as f: 14 | encoded_captions = json.load(f) 15 | 16 | with open(image_paths_path, 'r') as f: 17 | image_paths = json.load(f) 18 | 19 | # 将索引转换回单词 20 | vocab_idx2word = {idx: word for word, idx in vocab.items()} 21 | 22 | # 选择要展示的图片ID,这里以第一个ID为例 23 | first_img_id = list(image_paths.keys())[0] 24 | content_img = Image.open(image_paths[first_img_id]) 25 | 26 | # 展示图片和对应的描述 27 | plt.imshow(content_img) 28 | plt.axis('off') # 不显示坐标轴 29 | plt.show() 30 | 31 | # 打印对应的文本描述,确保字典中的键是整数,直接使用整数索引 32 | caption = ' '.join([vocab_idx2word[word_idx] for word_idx in encoded_captions[first_img_id]]) 33 | # caption = ' '.join([vocab_idx2word[str(word_idx)] for word_idx in encoded_captions[first_img_id]]) 34 | print(caption) 35 | -------------------------------------------------------------------------------- /Model1_YellowOrange/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from pycocoevalcap.cider.cider import Cider 4 | import numpy as np 5 | from configuartions import Config 6 | from torchvision.models import resnet101, ResNet101_Weights 7 | from torch.nn.utils.rnn import pack_padded_sequence 8 | import torch.optim as optim 9 | import json 10 | 11 | 12 | # 图像编码器 13 | # 使用ResNet-101作为图像编码器,并将其最后一个非全连接层作为网格表示提取层 14 | # class ImageEncoder(nn.Module): 15 | # def __init__(self, finetuned=True): 16 | # super(ImageEncoder, self).__init__() 17 | # model = torchvision.models.resnet101(weights=ResNet101_Weights.DEFAULT) 18 | # # ResNet-101网格表示提取器 19 | # self.grid_rep_extractor = nn.Sequential(*(list(model.children())[:-2])) 20 | # for param in self.grid_rep_extractor.parameters(): 21 | # param.requires_grad = finetuned 22 | # 23 | # def forward(self, images): 24 | # out = self.grid_rep_extractor(images) 25 | # return out 26 | 27 | # 引入自注意机制后的图像编码器 28 | class SelfAttention(nn.Module): 29 | def __init__(self, num_channels, num_heads=8, dropout=0.1): 30 | super(SelfAttention, self).__init__() 31 | self.num_heads = num_heads 32 | self.attention = 
nn.MultiheadAttention(num_channels, num_heads, dropout) 33 | 34 | def forward(self, x): 35 | # 保存原始形状 36 | orig_shape = x.shape 37 | # 打印输入形状 38 | # print("Input shape:", x.shape) 39 | # 转换为(sequence_length, batch_size, num_channels)格式 40 | x = x.flatten(2).permute(2, 0, 1) 41 | attention_output, _ = self.attention(x, x, x) 42 | # 还原形状,确保与原始输入形状匹配 43 | attention_output = attention_output.permute(1, 2, 0)# 打印最终输出形状 44 | # print("Final output shape:", attention_output.shape) 45 | return attention_output.view(orig_shape) 46 | 47 | 48 | class ImageEncoder(nn.Module): 49 | def __init__(self, finetuned=True, num_heads=8, dropout=0.1): 50 | super(ImageEncoder, self).__init__() 51 | # 使用ResNet101作为基础模型 52 | model = resnet101(weights=ResNet101_Weights.DEFAULT) 53 | self.grid_rep_extractor = nn.Sequential(*(list(model.children())[:-2])) 54 | # 设置参数是否可训练 55 | for param in self.grid_rep_extractor.parameters(): 56 | param.requires_grad = finetuned 57 | 58 | # 自注意力层 59 | self.self_attention = SelfAttention(model.fc.in_features, num_heads, dropout) 60 | 61 | def forward(self, images): 62 | # 通过ResNet网格表示提取器 63 | features = self.grid_rep_extractor(images) 64 | # print("Extractor output shape:", features.shape) 65 | # 应用自注意力 66 | features = self.self_attention(features) 67 | # 打印自注意力输出形状 68 | # print("Self-attention output shape:", features.shape) 69 | return features 70 | 71 | 72 | # 解码器的注意力机制 73 | class AdditiveAttention(nn.Module): 74 | def __init__(self, query_dim, key_dim, attn_dim): 75 | """ 76 | 参数: 77 | query_dim: 查询Q的维度 78 | key_dim: 键K的维度 79 | attn_dim: 注意力函数隐藏层表示的维度 80 | """ 81 | super(AdditiveAttention, self).__init__() 82 | self.attn_w_1_q = nn.Linear(query_dim, attn_dim) 83 | self.attn_w_1_k = nn.Linear(key_dim, attn_dim) 84 | self.attn_w_2 = nn.Linear(attn_dim, 1) 85 | self.tanh = nn.Tanh() 86 | self.softmax = nn.Softmax(dim=1) 87 | 88 | def forward(self, query, key_value): 89 | """ 90 | Q K V:Q和K算出相关性得分,作为V的权重,K=V 91 | 参数: 92 | query: 查询 (batch_size, q_dim) 93 | key_value: 键和值,(batch_size, n_kv, kv_dim) 94 | """ 95 | # (2)计算query和key的相关性,实现注意力评分函数 96 | # -> (batch_size, 1, attn_dim) 97 | queries = self.attn_w_1_q(query).unsqueeze(1) 98 | # -> (batch_size, n_kv, attn_dim) 99 | keys = self.attn_w_1_k(key_value) 100 | # -> (batch_size, n_kv) 101 | attn = self.attn_w_2(self.tanh(queries+keys)).squeeze(2) 102 | # (3)归一化相关性分数 103 | # -> (batch_size, n_kv) 104 | attn = self.softmax(attn) 105 | # (4)计算输出 106 | # (batch_size x 1 x n_kv)(batch_size x n_kv x kv_dim) 107 | # -> (batch_size, 1, kv_dim) 108 | output = torch.bmm(attn.unsqueeze(1), key_value).squeeze(1) 109 | return output, attn 110 | 111 | 112 | # 文本解码器 113 | # 注意:确保 vocab_size, embed_size, hidden_size 等参数数据集和配置匹配 114 | class AttentionDecoder(nn.Module): 115 | """ 116 | 初始化文本解码器。 117 | 118 | 参数: 119 | image_code_dim: 图像编码的维度。 120 | vocab_size: 词汇表的大小。 121 | word_dim: 词嵌入的维度。 122 | attention_dim: 注意力机制的隐藏层维度。 123 | hidden_size: GRU隐藏层的大小。 124 | num_layers: GRU层数。 125 | dropout: Dropout层的概率。 126 | """ 127 | def __init__(self, image_code_dim, vocab_size, word_dim, attention_dim, hidden_size, num_layers, dropout=0.5): 128 | super(AttentionDecoder, self).__init__() 129 | self.embed = nn.Embedding(vocab_size, word_dim) 130 | self.attention = AdditiveAttention(hidden_size, image_code_dim, attention_dim) 131 | self.init_state = nn.Linear(image_code_dim, num_layers * hidden_size) 132 | self.rnn = nn.GRU(word_dim + image_code_dim, hidden_size, num_layers) 133 | self.dropout = nn.Dropout(p=dropout) 134 | self.fc = nn.Linear(hidden_size, vocab_size) 
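        # Shape summary for the layers above (matches forward_step below):
        #   embed:      (batch, seq_len) -> (batch, seq_len, word_dim)
        #   attention:  query (batch, hidden_size), keys (batch, n_grids, image_code_dim)
        #               -> context (batch, image_code_dim)
        #   init_state: mean image code (batch, image_code_dim) -> (batch, num_layers * hidden_size)
        #   rnn:        input per step is [context; word embedding], i.e. word_dim + image_code_dim
        #   fc:         (batch, hidden_size) -> (batch, vocab_size)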
135 | # RNN默认已初始化 136 | self.init_weights() 137 | 138 | def init_weights(self): 139 | self.embed.weight.data.uniform_(-0.1, 0.1) 140 | self.fc.bias.data.fill_(0) 141 | self.fc.weight.data.uniform_(-0.1, 0.1) 142 | 143 | def init_hidden_state(self, image_code, captions, cap_lens): 144 | """ 145 | 初始化隐藏状态。 146 | 147 | 参数: 148 | image_code:图像编码器输出的图像表示 149 | (batch_size, image_code_dim, grid_height, grid_width) 150 | captions: 文本描述。 151 | cap_lens: 文本描述的长度。 152 | """ 153 | # 将图像网格表示转换为序列表示形式 154 | batch_size, image_code_dim = image_code.size(0), image_code.size(1) 155 | # -> (batch_size, grid_height, grid_width, image_code_dim) 156 | image_code = image_code.permute(0, 2, 3, 1) 157 | # -> (batch_size, grid_height * grid_width, image_code_dim) 158 | image_code = image_code.view(batch_size, -1, image_code_dim) 159 | # (1)按照caption的长短排序 160 | sorted_cap_lens, sorted_cap_indices = torch.sort(cap_lens, 0, True) 161 | captions = captions[sorted_cap_indices] 162 | image_code = image_code[sorted_cap_indices] 163 | # (2)初始化隐状态 164 | hidden_state = self.init_state(image_code.mean(axis=1)) 165 | hidden_state = hidden_state.view( 166 | batch_size, 167 | self.rnn.num_layers, 168 | self.rnn.hidden_size).permute(1, 0, 2) 169 | return image_code, captions, sorted_cap_lens, sorted_cap_indices, hidden_state 170 | 171 | def forward_step(self, image_code, curr_cap_embed, hidden_state): 172 | """ 173 | 解码器的前馈步骤。 174 | 175 | 参数: 176 | image_code: 图像编码。 177 | curr_cap_embed: 当前时间步的词嵌入向量。 178 | hidden_state: 当前的隐藏状态。 179 | """ 180 | # (3.2)利用注意力机制获得上下文向量 181 | # query:hidden_state[-1],即最后一个隐藏层输出 (batch_size, hidden_size) 182 | # context: (batch_size, hidden_size) 183 | context, alpha = self.attention(hidden_state[-1], image_code) 184 | # (3.3)以上下文向量和当前时刻词表示为输入,获得GRU输出 185 | x = torch.cat((context, curr_cap_embed), dim=-1).unsqueeze(0) 186 | # x: (1, real_batch_size, hidden_size+word_dim) 187 | # out: (1, real_batch_size, hidden_size) 188 | out, hidden_state = self.rnn(x, hidden_state) 189 | # (3.4)获取该时刻的预测结果 190 | # (real_batch_size, vocab_size) 191 | preds = self.fc(self.dropout(out.squeeze(0))) 192 | return preds, alpha, hidden_state 193 | 194 | def forward(self, image_code, captions, cap_lens): 195 | """ 196 | 完整的前馈过程。 197 | 198 | 参数: 199 | hidden_state: (num_layers, batch_size, hidden_size) 200 | image_code: (batch_size, feature_channel, feature_size) 201 | captions: (batch_size, ) 202 | """ 203 | # (1)将图文数据按照文本的实际长度从长到短排序 204 | # (2)获得GRU的初始隐状态 205 | image_code, captions, sorted_cap_lens, sorted_cap_indices, hidden_state \ 206 | = self.init_hidden_state(image_code, captions, cap_lens) 207 | batch_size = image_code.size(0) 208 | # 输入序列长度减1,因为最后一个时刻不需要预测下一个词 209 | lengths = sorted_cap_lens.cpu().numpy() - 1 210 | # 初始化变量:模型的预测结果和注意力分数 211 | max_cap_len = max(cap_lens) # 计算最长caption的长度 212 | predictions = torch.zeros(batch_size, max_cap_len, self.fc.out_features).to(captions.device) 213 | alphas = torch.zeros(batch_size, max_cap_len, image_code.shape[1]).to(captions.device) 214 | # predictions = torch.zeros(batch_size, lengths[0], self.fc.out_features).to(captions.device) 215 | # alphas = torch.zeros(batch_size, lengths[0], image_code.shape[1]).to(captions.device) 216 | # 获取文本嵌入表示 cap_embeds: (batch_size, num_steps, word_dim) 217 | cap_embeds = self.embed(captions) 218 | # Teacher-Forcing模式 219 | for step in range(lengths[0]): 220 | # (3)解码 221 | # (3.1)模拟pack_padded_sequence函数的原理,获取该时刻的非输入 222 | real_batch_size = np.where(lengths > step)[0].shape[0] 223 | preds, alpha, hidden_state = self.forward_step( 224 | 
image_code[:real_batch_size], 225 | cap_embeds[:real_batch_size, step, :], 226 | hidden_state[:, :real_batch_size, :].contiguous()) 227 | # 记录结果 228 | predictions[:real_batch_size, step, :] = preds 229 | alphas[:real_batch_size, step, :] = alpha 230 | 231 | # 新增逻辑来调整输出长度 232 | # 找出最长的caption长度 233 | max_cap_len = max(cap_lens) 234 | # 初始化一个填充的predictions张量 235 | padded_predictions = torch.zeros(batch_size, max_cap_len, self.fc.out_features).to(predictions.device) 236 | for i in range(batch_size): 237 | # 当前样本的实际长度 238 | actual_length = cap_lens[i] 239 | # 只拷贝实际长度的预测结果 240 | padded_predictions[i, :actual_length, :] = predictions[i, :actual_length, :] 241 | 242 | return padded_predictions, alphas, captions, lengths, sorted_cap_indices 243 | 244 | 245 | # AttentionModel 模型 246 | ''' 247 | 注意:确保 image_code_dim 等参数与 ImageEncoder 的输出匹配 248 | 249 | 最终 ImageEncoder 的输出形状仍然是 (batch_size, num_channels, height, width)。 250 | 这意味着 image_code_dim 应该设置为 num_channels,即 ResNet101 最后一个卷积层的输出通道数。这个值通常为2048, 251 | ''' 252 | class AttentionModel(nn.Module): 253 | def __init__(self, image_code_dim, vocab, word_dim, attention_dim, hidden_size, num_layers): 254 | super(AttentionModel, self).__init__() 255 | self.vocab = vocab 256 | self.encoder = ImageEncoder() 257 | self.decoder = AttentionDecoder(image_code_dim, len(vocab), word_dim, attention_dim, hidden_size, num_layers) 258 | 259 | def forward(self, images, captions, cap_lens): 260 | # 打印图像输入形状 261 | # print("Image input shape:", images.shape) 262 | image_code = self.encoder(images) 263 | # 打印编码器输出形状 264 | # print("Encoder output shape:", image_code.shape) 265 | output = self.decoder(image_code, captions, cap_lens) 266 | # 打印解码器输出形状 267 | # print("Decoder output shape:", output[0].shape) # Assuming output[0] is the main output 268 | return output 269 | 270 | def generate_by_beamsearch(self, images, beam_k, max_len): 271 | vocab_size = len(self.vocab) 272 | image_codes = self.encoder(images) 273 | texts = [] 274 | device = images.device 275 | # 对每个图像样本执行束搜索 276 | for image_code in image_codes: 277 | # 将图像表示复制k份 278 | image_code = image_code.unsqueeze(0).repeat(beam_k, 1, 1, 1) 279 | # 生成k个候选句子,初始时,仅包含开始符号 280 | cur_sents = torch.full((beam_k, 1), self.vocab[''], dtype=torch.long).to(device) 281 | cur_sent_embed = self.decoder.embed(cur_sents)[:, 0, :] 282 | sent_lens = torch.LongTensor([1] * beam_k).to(device) 283 | # 获得GRU的初始隐状态 284 | image_code, cur_sent_embed, _, _, hidden_state = \ 285 | self.decoder.init_hidden_state(image_code, cur_sent_embed, sent_lens) 286 | # 存储已生成完整的句子(以句子结束符结尾的句子) 287 | end_sents = [] 288 | # 存储已生成完整的句子的概率 289 | end_probs = [] 290 | # 存储未完整生成的句子的概率 291 | probs = torch.zeros(beam_k, 1).to(device) 292 | k = beam_k 293 | while True: 294 | preds, _, hidden_state = self.decoder.forward_step(image_code[:k], cur_sent_embed, 295 | hidden_state.contiguous()) 296 | # -> (k, vocab_size) 297 | preds = nn.functional.log_softmax(preds, dim=1) 298 | # 对每个候选句子采样概率值最大的前k个单词生成k个新的候选句子,并计算概率 299 | # -> (k, vocab_size) 300 | probs = probs.repeat(1, preds.size(1)) + preds 301 | if cur_sents.size(1) == 1: 302 | # 第一步时,所有句子都只包含开始标识符,因此,仅利用其中一个句子计算topk 303 | values, indices = probs[0].topk(k, 0, True, True) 304 | else: 305 | # probs: (k, vocab_size) 是二维张量 306 | # topk函数直接应用于二维张量会按照指定维度取最大值,这里需要在全局取最大值 307 | # 因此,将probs转换为一维张量,再使用topk函数获取最大的k个值 308 | values, indices = probs.view(-1).topk(k, 0, True, True) 309 | # 计算最大的k个值对应的句子索引和词索引 310 | sent_indices = torch.div(indices, vocab_size, rounding_mode='trunc') 311 | word_indices = indices % vocab_size 312 | # 
将词拼接在前一轮的句子后,获得此轮的句子 313 | cur_sents = torch.cat([cur_sents[sent_indices], word_indices.unsqueeze(1)], dim=1) 314 | # 查找此轮生成句子结束符的句子 315 | end_indices = [idx for idx, word in enumerate(word_indices) if word == self.vocab['']] 316 | if len(end_indices) > 0: 317 | end_probs.extend(values[end_indices]) 318 | end_sents.extend(cur_sents[end_indices].tolist()) 319 | # 如果所有的句子都包含结束符,则停止生成 320 | k -= len(end_indices) 321 | if k == 0: 322 | break 323 | # 查找还需要继续生成词的句子 324 | cur_indices = [idx for idx, word in enumerate(word_indices) 325 | if word != self.vocab['']] 326 | if len(cur_indices) > 0: 327 | cur_sent_indices = sent_indices[cur_indices] 328 | cur_word_indices = word_indices[cur_indices] 329 | # 仅保留还需要继续生成的句子、句子概率、隐状态、词嵌入 330 | cur_sents = cur_sents[cur_indices] 331 | probs = values[cur_indices].view(-1, 1) 332 | hidden_state = hidden_state[:, cur_sent_indices, :] 333 | cur_sent_embed = self.decoder.embed( 334 | cur_word_indices.view(-1, 1))[:, 0, :] 335 | # 句子太长,停止生成 336 | if cur_sents.size(1) >= max_len: 337 | break 338 | if len(end_sents) == 0: 339 | # 如果没有包含结束符的句子,则选取第一个句子作为生成句子 340 | gen_sent = cur_sents[0].tolist() 341 | else: 342 | # 否则选取包含结束符的句子中概率最大的句子 343 | gen_sent = end_sents[end_probs.index(max(end_probs))] 344 | texts.append(gen_sent) 345 | return texts 346 | 347 | 348 | # 损失函数 349 | class PackedCrossEntropyLoss(nn.Module): 350 | def __init__(self): 351 | super(PackedCrossEntropyLoss, self).__init__() 352 | self.loss_fn = nn.CrossEntropyLoss() 353 | 354 | def forward(self, predictions, targets, lengths): 355 | """ 356 | 计算交叉熵损失,排除填充的部分。 357 | 参数: 358 | predictions:模型的预测结果,形状为 (batch_size, max_length, vocab_size)。 359 | targets:实际的文本描述,形状为 (batch_size, max_length)。 360 | lengths:每个描述的实际长度。 361 | """ 362 | # 使用 pack_padded_sequence 来处理变长序列 363 | # 这里 predictions 和 targets 都需要进行 pack 操作 364 | # 由于 pack_padded_sequence 需要长度从长到短的序列,这里假设输入已经是这种格式 365 | packed_predictions = pack_padded_sequence(predictions, lengths, batch_first=True, enforce_sorted=False)[0] 366 | packed_targets = pack_padded_sequence(targets, lengths, batch_first=True, enforce_sorted=False)[0] 367 | 368 | # 计算损失,忽略填充的部分 369 | loss = self.loss_fn(packed_predictions, packed_targets) 370 | return loss 371 | 372 | 373 | def get_optimizer(model, config): 374 | """ 375 | 获取优化器,为模型的不同部分设置不同的学习速率。 376 | 参数: 377 | model:训练模型。 378 | config:包含配置信息的对象,如学习速率等。 379 | 返回: 380 | 配置好地优化器。 381 | """ 382 | # 为编码器和解码器设置不同的学习速率 383 | encoder_params = filter(lambda p: p.requires_grad, model.encoder.parameters()) 384 | decoder_params = filter(lambda p: p.requires_grad, model.decoder.parameters()) 385 | 386 | # 创建优化器,分别对这两部分参数应用不同的学习速率 387 | optimizer = optim.Adam([ 388 | {"params": encoder_params, "lr": config.encoder_learning_rate}, 389 | {"params": decoder_params, "lr": config.decoder_learning_rate} 390 | ]) 391 | 392 | return optimizer 393 | 394 | # 以下函数是为了展示如何在训练过程中调整学习速率,实际上可能并未使用 395 | def adjust_learning_rate(optimizer, epoch, config): 396 | """ 397 | 调整学习速率,每隔一定轮次减少到原来的十分之一。 398 | 参数: 399 | optimizer:优化器。 400 | epoch:当前轮次。 401 | config:包含配置信息的对象。 402 | """ 403 | for param_group in optimizer.param_groups: 404 | if param_group['name'] == 'encoder': 405 | param_group['lr'] = config.encoder_learning_rate * (0.1 ** (epoch // config.lr_update)) 406 | else: 407 | param_group['lr'] = config.decoder_learning_rate * (0.1 ** (epoch // config.lr_update)) 408 | 409 | 410 | # CIDEr-D 评估 411 | def filter_useless_words(sent, filterd_words): 412 | # 去除句子中不参与CIDEr-D计算的符号 413 | return [w for w in sent if w not in filterd_words] 414 | 415 | 416 | def 
evaluate_cider(data_loader, model, config): 417 | model.eval() 418 | # 存储候选文本和参考文本 419 | cands = {} 420 | refs = {} 421 | filterd_words = {model.vocab[''], model.vocab[''], model.vocab['']} 422 | device = next(model.parameters()).device 423 | 424 | # 加载词汇表并创建反向词汇表 425 | with open('../data/output/vocab.json', 'r') as f: 426 | vocab = json.load(f) 427 | idx_to_word = {idx: word for word, idx in vocab.items()} 428 | 429 | for i, (imgs, caps, caplens) in enumerate(data_loader): 430 | imgs = imgs.to(device) 431 | # 通过束搜索生成描述 432 | preds = model.generate_by_beamsearch(imgs, config.beam_k, config.max_len) 433 | for j in range(imgs.size(0)): 434 | img_id = str(i * config.batch_size + j) 435 | cand_words = [idx_to_word.get(word, '') for word in preds[j]] 436 | cand = ' '.join(filter_useless_words(cand_words, filterd_words)) 437 | cands[img_id] = [cand] # 候选描述 438 | # 将参考描述(caps[j])的每个索引转换为单词 439 | ref_words = [idx_to_word.get(word.item(), '') for word in caps[j]] 440 | refs[img_id] = [' '.join(filter_useless_words(ref_words, filterd_words))] # 参考描述 441 | 442 | # # 在调用 compute_score 之前添加调试信息 443 | # for key, value in cands.items(): 444 | # print(f"Key: {key}, Value type: {type(value)}, Value: {value}") 445 | # assert isinstance(value, list), f"Value for key {key} is not a list in cands" 446 | # 447 | # for key, value in refs.items(): 448 | # print(f"Key: {key}, Value type: {type(value)}, Value: {value}") 449 | # assert isinstance(value, list), f"Value for key {key} is not a list in refs" 450 | 451 | # 计算CIDEr-D得分 452 | cider_evaluator = Cider() 453 | score, _ = cider_evaluator.compute_score(refs, cands) 454 | # score, _ = cider_evaluator.compute_score({'dummy': refs}, {'dummy': cands}) 455 | 456 | model.train() 457 | return score 458 | 459 | 460 | 461 | # encoder = ImageEncoder(Config.embed_size) 462 | # decoder = AttentionDecoder(Config.embed_size, Config.vocab_size, Config.hidden_size, Config.num_layers) 463 | # arctic_model = ARCTIC(encoder, decoder) 464 | 465 | # 示例:前馈过程 466 | # images = ... # 从数据集中获取图像 467 | # captions = ... 
# 从数据集中获取对应的文本描述 468 | # 输出 = arctic_model(images, captions) 469 | -------------------------------------------------------------------------------- /Model1_YellowOrange/predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from torchvision import transforms 4 | from models import AttentionModel 5 | from configurations import Config 6 | import json 7 | 8 | def load_model(model_path, vocab, config): 9 | model = AttentionModel( 10 | image_code_dim=config.image_code_dim, 11 | vocab=vocab, # 传递词汇表字典 12 | word_dim=config.word_dim, 13 | attention_dim=config.attention_dim, 14 | hidden_size=config.hidden_size, 15 | num_layers=config.num_layers 16 | ) 17 | model.load_state_dict(torch.load(model_path)) 18 | model = model.to(config.device) 19 | model.eval() # 将模型设置为评估模式 20 | return model 21 | 22 | def process_image(image_path): 23 | transform = transforms.Compose([ 24 | transforms.Resize((256, 256)), 25 | transforms.ToTensor(), 26 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 27 | ]) 28 | image = Image.open(image_path).convert('RGB') 29 | image_tensor = transform(image).unsqueeze(0) # 添加一个批次维度 30 | return image_tensor 31 | 32 | def predict_caption(model, image_tensor, vocab, config): 33 | # 生成束搜索描述 34 | predictions = model.generate_by_beamsearch(image_tensor.to(config.device), config.beam_k, config.max_len) 35 | # 将词索引转换回文字 36 | idx_to_word = {idx: word for word, idx in vocab.items()} 37 | caption_words = [idx_to_word.get(word, '') for word in predictions[0]] 38 | caption = ' '.join(caption_words) 39 | return caption 40 | 41 | def main(): 42 | # 载入配置和词汇表 43 | config = Config() 44 | with open('../data/output/vocab_caption_1.json', 'r') as f: 45 | vocab = json.load(f) 46 | 47 | # 加载模型 48 | model_path = '../data/output/weights/.pth' # 使用正确的模型文件路径 49 | model = load_model(model_path, vocab, config) 50 | 51 | # 处理图片并生成描述 52 | image_path = '../data/images_1/MEN-Denim-id_00000080-01_7_additional.jpg' # 测试图片路径 53 | image_tensor = process_image(image_path) 54 | caption = predict_caption(model, image_tensor, vocab, config) 55 | 56 | print("Generated Caption:", caption) 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | 62 | """ 63 | model = ... # 加载模型 64 | 65 | images_folder = "..." # 图片文件夹路径 66 | captions_dict = {} # 字典 67 | 68 | count = 1 # 计数 69 | 70 | for filename in os.listdir(images_folder): 71 | if filename.endswith(".jpg") or filename.endswith(".png"): 72 | img_path = os.path.join(images_folder, filename) 73 | 74 | # Load the image 75 | raw_image = Image.open(img_path).convert('RGB') 76 | 77 | generated_caption = ... # 生成caption 78 | 79 | print(f"No{count}", generated_caption) 80 | count += 1 81 | 82 | # Store the caption in the dictionary 83 | captions_dict[img_path] = generated_caption 84 | 85 | # Save the dictionary to captions.json 86 | output_path = "..." 
# 保存路径 87 | with open(output_path, 'w') as json_file: 88 | json.dump(captions_dict, json_file, indent=4) 89 | 90 | print(f"Captions saved to {output_path}") 91 | """ -------------------------------------------------------------------------------- /Model1_YellowOrange/requirements.txt: -------------------------------------------------------------------------------- 1 | torch == 2.1.1 2 | torchvision == 0.16.1 3 | transformers == 4.32.1 4 | pycocoevalcap -------------------------------------------------------------------------------- /Model1_YellowOrange/train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import os 4 | from configuartions import Config 5 | from models import AttentionModel, get_optimizer, PackedCrossEntropyLoss, evaluate_cider 6 | from datasets import create_dataloaders, ImageTextDataset 7 | 8 | 9 | def main(): 10 | best_test_score = float('-inf') # 初始化最佳测试得分 11 | 12 | # 加载配置 13 | config = Config() 14 | 15 | # 创建数据加载器 16 | train_loader, test_loader = create_dataloaders(config) 17 | 18 | # 加载词汇表文件 19 | with open('../data/output/vocab.json', 'r') as f: 20 | vocab = json.load(f) 21 | 22 | # 模型初始化 23 | model = AttentionModel( 24 | image_code_dim=config.image_code_dim, 25 | vocab=vocab, # 传递词汇表字典 26 | word_dim=config.word_dim, 27 | attention_dim=config.attention_dim, 28 | hidden_size=config.hidden_size, 29 | num_layers=config.num_layers 30 | ).to(config.device) 31 | 32 | # 优化器 33 | optimizer = get_optimizer(model, config) 34 | 35 | # 损失函数 36 | loss_fn = PackedCrossEntropyLoss().to(config.device) 37 | 38 | # 创建保存权重的文件夹路径 39 | weights_dir = os.path.join(config.output_folder, 'weights') 40 | os.makedirs(weights_dir, exist_ok=True) 41 | 42 | best_val_score = float('-inf') # 初始化最佳验证得分 43 | 44 | # 开始训练 45 | for epoch in range(config.num_epochs): 46 | # 训练模型 47 | model.train() 48 | for i, (imgs, caps, caplens) in enumerate(train_loader): 49 | imgs, caps = imgs.to(config.device), caps.to(config.device) 50 | caplens = caplens.cpu().to(torch.int64) 51 | 52 | optimizer.zero_grad() 53 | outputs, alphas, _, _, _ = model(imgs, caps, caplens) 54 | 55 | # 确保目标序列长度与模型输出匹配 56 | targets = caps[:, 1:] # 假设targets是captions去除第一个标记后的部分 57 | # print(f"Outputs shape: {outputs.shape}") 58 | # print(f"Targets shape: {targets.shape}") 59 | # print(f"Caplens: {caplens}") 60 | loss = loss_fn(outputs, targets, caplens) 61 | loss.backward() 62 | optimizer.step() 63 | 64 | # 打印/记录损失信息 65 | if (i + 1) % 100 == 0: 66 | print(f'Epoch [{epoch + 1}/{config.num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}') 67 | 68 | # 在每个epoch结束时使用测试集评估模型 69 | current_test_score = evaluate_cider(test_loader, model, config) 70 | print(f"Epoch {epoch + 1}: CIDEr-D score = {current_test_score}") 71 | 72 | # 如果当前得分比之前的最佳得分要好,则保存模型 73 | if current_test_score > best_test_score: 74 | best_test_score = current_test_score 75 | best_model_path = os.path.join(weights_dir, f'Attention_model_background_caption_{best_test_score}.pth') 76 | torch.save(model.state_dict(), best_model_path) 77 | print(f"Saved new best model to {best_model_path}") 78 | 79 | # 训练完成后的最终评估 80 | final_test_score = evaluate_cider(test_loader, model, config) 81 | print(f"Final CIDEr-D score = {final_test_score}") 82 | 83 | # # 训练完成后保存模型 84 | # final_model_path = os.path.join(weights_dir, 'AttentionModel.pth') 85 | # torch.save(model.state_dict(), final_model_path) 86 | # print(f"Saved final model to {final_model_path}") 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 
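The PackedCrossEntropyLoss imported by train.py above drops padded time steps with pack_padded_sequence before applying the cross-entropy (indexing the returned PackedSequence with [0], as the loss class does, is the same as taking its .data field). A minimal, self-contained sketch of that behaviour with toy shapes; all sizes and names here are illustrative rather than taken from the project configuration:

```python
# Illustrative shapes only: how pack_padded_sequence removes padded positions
# before the cross-entropy, mirroring PackedCrossEntropyLoss above.
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence

batch_size, max_len, vocab_size = 2, 5, 10
predictions = torch.randn(batch_size, max_len, vocab_size)    # (B, T, V) logits
targets = torch.randint(0, vocab_size, (batch_size, max_len)) # (B, T) token ids
lengths = torch.tensor([5, 3])                                # true caption lengths

# .data keeps only the first lengths[i] steps of each sequence, flattened
packed_preds = pack_padded_sequence(predictions, lengths, batch_first=True,
                                    enforce_sorted=False).data   # (sum(lengths), V)
packed_tgts = pack_padded_sequence(targets, lengths, batch_first=True,
                                   enforce_sorted=False).data    # (sum(lengths),)

loss = nn.CrossEntropyLoss()(packed_preds, packed_tgts)
print(packed_preds.shape, packed_tgts.shape, loss.item())
```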
| -------------------------------------------------------------------------------- /Model2_Transformer/TransformerE+D.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "我们首先使用 argparse 库解析命令行参数,获取图像路径、模型版本和 Checkpoint 路径;其次根据命令行参数加载预训练模型,或者从 Checkpoint 加载模型(可选);紧接着使用 PIL 库打开图像,并进行预处理;然后使用模型生成图像的描述;最后使用 METEOR 和 ROUGE-L 评估生成的描述与参考描述的相似度。" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import json\n", 17 | "import torch\n", 18 | "from torch.utils.data import Dataset, DataLoader\n", 19 | "from torchvision import transforms\n", 20 | "from transformers import BertTokenizer\n", 21 | "from PIL import Image\n", 22 | "from models import caption\n", 23 | "from configuration import Config\n", 24 | "\n", 25 | "# 数据集类\n", 26 | "class MyDataset(Dataset):\n", 27 | " def __init__(self, json_file, img_dir, transform=None):\n", 28 | " with open(json_file, 'r') as f:\n", 29 | " self.data = json.load(f)\n", 30 | " self.img_dir = img_dir\n", 31 | " self.transform = transform\n", 32 | " self.filenames = list(self.data.keys())\n", 33 | "\n", 34 | " def __len__(self):\n", 35 | " return len(self.data)\n", 36 | "\n", 37 | " def __getitem__(self, idx):\n", 38 | " filename = self.filenames[idx]\n", 39 | " caption = self.data[filename]\n", 40 | " image = Image.open(f\"{self.img_dir}/{filename}\")\n", 41 | " if self.transform:\n", 42 | " image = self.transform(image)\n", 43 | " return image, caption" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "导入必要的库:这段代码首先导入了一些必要的库,包括json(用于处理JSON文件),torch(PyTorch库,用于深度学习),Dataset和DataLoader(PyTorch中的数据加载工具),transforms(torchvision库中的图像预处理工具),BertTokenizer(transformers库中的BERT模型的分词器),Image(PIL库中的图像处理工具),以及caption和Config(用户自定义的模块)。\n", 51 | "\n", 52 | "定义数据集类:定义了一个名为MyDataset的类,这个类继承自PyTorch的Dataset基类。\n", 53 | "\n", 54 | "初始化方法:在__init__方法中,这个类接受一个JSON文件的路径、一个图像目录的路径和一个可选的图像转换函数。JSON文件中应该包含图像文件名和对应的标题。这个方法首先读取JSON文件并将其内容保存在self.data中,然后保存图像目录的路径和图像转换函数。最后,它从self.data中提取所有的文件名并保存在self.filenames中。\n", 55 | "\n", 56 | "长度方法:__len__方法返回数据集中的样本数量,这是通过返回self.data的长度来实现的。\n", 57 | "\n", 58 | "获取项方法:__getitem__方法接受一个索引idx,并返回对应的图像和标题。它首先从self.filenames中获取文件名,然后从self.data中获取对应的标题。接着,它打开对应的图像文件,并如果提供了图像转换函数,就对图像进行转换。最后,它返回图像和标题。" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# 检查是否有可用的GPU\n", 68 | "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 69 | "\n", 70 | "# 加载模型\n", 71 | "config = Config()\n", 72 | "model = torch.hub.load('saahiluppal/catr', 'v3', pretrained=True)\n", 73 | "model = model.to(device) # 将模型移动到指定的设备上\n", 74 | "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n", 75 | "\n", 76 | "# 准备数据集\n", 77 | "transform = transforms.Compose([\n", 78 | " transforms.ToTensor(),\n", 79 | " transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", 80 | "])\n", 81 | "\n", 82 | "train_dataset = MyDataset('../data/train_captions.json', '../data/train_images', transform=transform)\n", 83 | "train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | 
"检查并设置设备:首先,代码检查是否有可用的GPU(图形处理器),如果有,就使用GPU,否则使用CPU。这是通过torch.device函数实现的,这个函数接受一个字符串参数,指定要使用的设备。如果torch.cuda.is_available()返回True,则使用字符串'cuda',否则使用字符串'cpu'。这个设备对象被保存在变量device中,以便后续使用。\n", 91 | "\n", 92 | "加载模型:然后,代码创建了一个Config对象(这是在前面的代码中定义的一个类),并将其保存在变量config中。接着,它使用torch.hub.load函数从torch.hub(一个预训练模型的仓库)加载一个模型。这个函数接受三个参数:模型的仓库名(在这里是'saahiluppal/catr'),模型的版本(在这里是'v3'),以及一个布尔值,指定是否要加载预训练的权重(在这里是True)。加载的模型被保存在变量model中。然后,模型被移动到前面指定的设备上,这是通过调用模型的to方法并传入设备对象实现的。最后,代码使用BertTokenizer.from_pretrained方法加载了一个预训练的BERT分词器,并将其保存在变量tokenizer中。\n", 93 | "\n", 94 | "准备数据集:代码首先定义了一个图像转换函数,这是通过transforms.Compose函数实现的,这个函数接受一个转换函数的列表,并返回一个新的转换函数,这个新的转换函数会按照列表中的顺序依次应用这些转换函数。在这里,转换函数的列表包含两个函数:transforms.ToTensor(将图像转换为PyTorch张量)和transforms.Normalize(对图像进行标准化)。然后,代码创建了一个MyDataset对象(这是在前面的代码中定义的一个类),并将其保存在变量train_dataset中。这个对象接受三个参数:一个JSON文件的路径(包含训练集的标题),一个图像目录的路径(包含训练集的图像),以及前面定义的图像转换函数。最后,代码创建了一个DataLoader对象,并将其保存在变量train_dataloader中。这个对象接受三个参数:一个数据集对象,一个批量大小(在这里是16),以及一个布尔值,指定是否要在每个训练周期开始时打乱数据集。" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# 定义损失函数和优化器\n", 104 | "criterion = torch.nn.CrossEntropyLoss()\n", 105 | "optimizer = torch.optim.Adam(model.parameters())\n", 106 | "\n", 107 | "# 训练循环\n", 108 | "num_epochs = 10\n", 109 | "for epoch in range(num_epochs):\n", 110 | " for images, captions in train_dataloader:\n", 111 | " images = images.to(device) # 将图像数据移动到指定的设备上\n", 112 | " captions = tokenizer(captions, return_tensors='pt', padding=True, truncation=True)\n", 113 | " captions = {key: val.to(device) for key, val in captions.items()} # 将caption数据移动到指定的设备上\n", 114 | "\n", 115 | " outputs = model(images, captions['input_ids'], captions['attention_mask'])\n", 116 | " loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), captions['input_ids'].view(-1))\n", 117 | "\n", 118 | " optimizer.zero_grad()\n", 119 | " loss.backward()\n", 120 | " optimizer.step()\n", 121 | "\n", 122 | " print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')\n", 123 | "\n", 124 | "# 保存模型\n", 125 | "torch.save(model.state_dict(), 'Model2.pth')" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "定义损失函数和优化器:首先,代码定义了一个交叉熵损失函数(torch.nn.CrossEntropyLoss()),并将其保存在变量criterion中。然后,它定义了一个Adam优化器(torch.optim.Adam(model.parameters())),并将其保存在变量optimizer中。这个优化器接受模型的参数作为输入。\n", 133 | "\n", 134 | "训练循环:代码定义了一个训练循环,这个循环将运行指定的周期数(在这里是10)。在每个周期中,代码遍历训练数据加载器train_dataloader,对于每一批图像和标题,它首先将图像数据移动到前面指定的设备上,然后使用BERT分词器对标题进行分词,并将返回的张量数据也移动到指定的设备上。接着,它将图像和标题数据传入模型,得到模型的输出,然后使用损失函数计算损失。然后,它将优化器的梯度清零,计算损失的反向传播,然后更新优化器的参数。最后,它打印出当前的周期数和损失值。\n", 135 | "\n", 136 | "保存模型:在训练循环结束后,代码使用torch.save函数保存模型的状态字典。这个函数接受两个参数:要保存的对象(在这里是模型的状态字典)和保存的文件名(在这里是'Model2.pth')。这样,训练好的模型就被保存下来,以便后续使用。" 137 | ] 138 | } 139 | ], 140 | "metadata": { 141 | "language_info": { 142 | "name": "python" 143 | } 144 | }, 145 | "nbformat": 4, 146 | "nbformat_minor": 2 147 | } 148 | -------------------------------------------------------------------------------- /Model2_Transformer/__pycache__/configuration.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/__pycache__/configuration.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/configuration.py: 
-------------------------------------------------------------------------------- 1 | class Config(object): 2 | def __init__(self): 3 | 4 | # Learning Rates 5 | self.lr_backbone = 1e-5 6 | self.lr = 1e-4 7 | 8 | # Epochs 9 | self.epochs = 30 10 | self.lr_drop = 20 11 | self.start_epoch = 0 12 | self.weight_decay = 1e-4 13 | 14 | # Backbone 15 | self.backbone = 'resnet101' 16 | self.position_embedding = 'sine' 17 | self.dilation = True 18 | 19 | # Basic 20 | self.device = 'cuda' 21 | self.seed = 42 22 | self.batch_size = 32 23 | self.num_workers = 8 24 | self.checkpoint = './checkpoint.pth' 25 | self.clip_max_norm = 0.1 26 | 27 | # Transformer 28 | self.hidden_dim = 256 29 | self.pad_token_id = 0 30 | self.max_position_embeddings = 128 31 | self.layer_norm_eps = 1e-12 32 | self.dropout = 0.1 33 | self.vocab_size = 30522 34 | 35 | self.enc_layers = 6 36 | self.dec_layers = 6 37 | self.dim_feedforward = 2048 38 | self.nheads = 8 39 | self.pre_norm = True 40 | 41 | # Dataset 42 | self.dir = '../coco' 43 | self.limit = -1 -------------------------------------------------------------------------------- /Model2_Transformer/data_preprocessing/divide_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import shutil 4 | 5 | # 读取json文件并转换为字典 6 | with open('../../data/test_captions.json', 'r') as f: 7 | test_captions = json.load(f) 8 | 9 | with open('../../data/train_captions.json', 'r') as f: 10 | train_captions = json.load(f) 11 | 12 | # 指定源目录和目标目录 13 | source_directory = '../../data/images' 14 | train_directory = '../../data/train_images' 15 | test_directory = '../../data/test_images' 16 | 17 | # 确保目标目录存在 18 | os.makedirs(train_directory, exist_ok=True) 19 | os.makedirs(test_directory, exist_ok=True) 20 | 21 | # 将训练集图片复制到目标目录 22 | for image in train_captions: 23 | shutil.copy(os.path.join(source_directory, image), train_directory) 24 | 25 | # 将测试集图片复制到目标目录 26 | for image in test_captions: 27 | shutil.copy(os.path.join(source_directory, image), test_directory) 28 | -------------------------------------------------------------------------------- /Model2_Transformer/data_preprocessing/name_info.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | 5 | # 定义一个函数来解析文件名 6 | def parse_filename(filename): 7 | # 使用正则表达式匹配文件名 8 | pattern = r'^(?P\w+)-(?P[\w_]+)-id_(?P\d+)-(?P\d+)_(\d+_(?P\w+))\.jpg$' 9 | match = re.match(pattern, filename) 10 | if match: 11 | return match.groupdict() 12 | else: 13 | return None 14 | 15 | # 定义一个函数来处理目录中的所有文件 16 | def process_directory(directory): 17 | # 创建一个字典来存储结果 18 | results = {} 19 | # 遍历目录中的所有文件 20 | for filename in os.listdir(directory): 21 | # 解析文件名 22 | info = parse_filename(filename) 23 | if info: 24 | # 将解析的信息与文件名关联起来 25 | results[filename] = info 26 | return results 27 | 28 | # 使用函数处理目录 29 | directory = '../../data/images' 30 | results = process_directory(directory) 31 | 32 | # 将结果保存到json文件中 33 | with open('../../data/label.json', 'w') as f: 34 | json.dump(results, f, ensure_ascii=False, indent=4) -------------------------------------------------------------------------------- /Model2_Transformer/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/datasets/__init__.py -------------------------------------------------------------------------------- 
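name_info.py above builds label.json by parsing metadata out of each image filename with a regular expression. Below is a runnable sketch of that idea with explicit named groups; the group names (gender, category, item_id, group, shot, view) are hypothetical, inferred from sample filenames such as MEN-Denim-id_00000080-01_7_additional.jpg rather than taken from the script:

```python
import re

# Hypothetical group names; the actual names used by name_info.py may differ.
PATTERN = re.compile(
    r'^(?P<gender>\w+)-(?P<category>[\w_]+)-id_(?P<item_id>\d+)-'
    r'(?P<group>\d+)_(?P<shot>\d+)_(?P<view>\w+)\.jpg$'
)

def parse_filename(filename: str):
    match = PATTERN.match(filename)
    return match.groupdict() if match else None

print(parse_filename('MEN-Denim-id_00000080-01_7_additional.jpg'))
# {'gender': 'MEN', 'category': 'Denim', 'item_id': '00000080',
#  'group': '01', 'shot': '7', 'view': 'additional'}
```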
/Model2_Transformer/datasets/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/datasets/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/datasets/__pycache__/coco.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/datasets/__pycache__/coco.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/datasets/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/datasets/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/datasets/coco.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import torchvision.transforms.functional as TF 3 | import torchvision as tv 4 | 5 | from PIL import Image 6 | import numpy as np 7 | import random 8 | import os 9 | 10 | from transformers import BertTokenizer 11 | 12 | from .utils import nested_tensor_from_tensor_list, read_json 13 | 14 | MAX_DIM = 299 15 | 16 | 17 | def under_max(image): 18 | if image.mode != 'RGB': 19 | image = image.convert("RGB") 20 | 21 | shape = np.array(image.size, dtype=np.float) 22 | long_dim = max(shape) 23 | scale = MAX_DIM / long_dim 24 | 25 | new_shape = (shape * scale).astype(int) 26 | image = image.resize(new_shape) 27 | 28 | return image 29 | 30 | 31 | class RandomRotation: 32 | def __init__(self, angles=[0, 90, 180, 270]): 33 | self.angles = angles 34 | 35 | def __call__(self, x): 36 | angle = random.choice(self.angles) 37 | return TF.rotate(x, angle, expand=True) 38 | 39 | 40 | train_transform = tv.transforms.Compose([ 41 | RandomRotation(), 42 | tv.transforms.Lambda(under_max), 43 | tv.transforms.ColorJitter(brightness=[0.5, 1.3], contrast=[ 44 | 0.8, 1.5], saturation=[0.2, 1.5]), 45 | tv.transforms.RandomHorizontalFlip(), 46 | tv.transforms.ToTensor(), 47 | tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), 48 | ]) 49 | 50 | val_transform = tv.transforms.Compose([ 51 | tv.transforms.Lambda(under_max), 52 | tv.transforms.ToTensor(), 53 | tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) 54 | ]) 55 | 56 | 57 | class CocoCaption(Dataset): 58 | def __init__(self, root, ann, max_length, limit, transform=train_transform, mode='training'): 59 | super().__init__() 60 | 61 | self.root = root 62 | self.transform = transform 63 | self.annot = [(self._process(val['image_id']), val['caption']) 64 | for val in ann['annotations']] 65 | if mode == 'validation': 66 | self.annot = self.annot 67 | if mode == 'training': 68 | self.annot = self.annot[: limit] 69 | 70 | self.tokenizer = BertTokenizer.from_pretrained( 71 | 'bert-base-uncased', do_lower=True) 72 | self.max_length = max_length + 1 73 | 74 | def _process(self, image_id): 75 | val = str(image_id).zfill(12) 76 | return val + '.jpg' 77 | 78 | def __len__(self): 79 | return len(self.annot) 80 | 81 | def __getitem__(self, idx): 82 | 
image_id, caption = self.annot[idx] 83 | image = Image.open(os.path.join(self.root, image_id)) 84 | 85 | if self.transform: 86 | image = self.transform(image) 87 | image = nested_tensor_from_tensor_list(image.unsqueeze(0)) 88 | 89 | caption_encoded = self.tokenizer.encode_plus( 90 | caption, max_length=self.max_length, pad_to_max_length=True, return_attention_mask=True, return_token_type_ids=False, truncation=True) 91 | 92 | caption = np.array(caption_encoded['input_ids']) 93 | cap_mask = ( 94 | 1 - np.array(caption_encoded['attention_mask'])).astype(bool) 95 | 96 | return image.tensors.squeeze(0), image.mask.squeeze(0), caption, cap_mask 97 | 98 | 99 | def build_dataset(config, mode='training'): 100 | if mode == 'training': 101 | train_dir = os.path.join(config.dir, 'train2017') 102 | train_file = os.path.join( 103 | config.dir, 'annotations', 'captions_train2017.json') 104 | data = CocoCaption(train_dir, read_json( 105 | train_file), max_length=config.max_position_embeddings, limit=config.limit, transform=train_transform, mode='training') 106 | return data 107 | 108 | elif mode == 'validation': 109 | val_dir = os.path.join(config.dir, 'val2017') 110 | val_file = os.path.join( 111 | config.dir, 'annotations', 'captions_val2017.json') 112 | data = CocoCaption(val_dir, read_json( 113 | val_file), max_length=config.max_position_embeddings, limit=config.limit, transform=val_transform, mode='validation') 114 | return data 115 | 116 | else: 117 | raise NotImplementedError(f"{mode} not supported") 118 | -------------------------------------------------------------------------------- /Model2_Transformer/datasets/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from typing import Optional, List 3 | from torch import Tensor 4 | 5 | import json 6 | import os 7 | 8 | MAX_DIM = 299 9 | 10 | def read_json(file_name): 11 | with open(file_name) as handle: 12 | out = json.load(handle) 13 | return out 14 | 15 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 16 | # TODO make this more general 17 | if tensor_list[0].ndim == 3: 18 | # TODO make it support different-sized images 19 | max_size = [3, MAX_DIM, MAX_DIM] 20 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 21 | batch_shape = [len(tensor_list)] + max_size 22 | b, c, h, w = batch_shape 23 | dtype = tensor_list[0].dtype 24 | device = tensor_list[0].device 25 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 26 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 27 | for img, pad_img, m in zip(tensor_list, tensor, mask): 28 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 29 | m[: img.shape[1], :img.shape[2]] = False 30 | else: 31 | raise ValueError('not supported') 32 | return NestedTensor(tensor, mask) 33 | 34 | 35 | class NestedTensor(object): 36 | def __init__(self, tensors, mask: Optional[Tensor]): 37 | self.tensors = tensors 38 | self.mask = mask 39 | 40 | def to(self, device): 41 | # type: (Device) -> NestedTensor # noqa 42 | cast_tensor = self.tensors.to(device) 43 | mask = self.mask 44 | if mask is not None: 45 | assert mask is not None 46 | cast_mask = mask.to(device) 47 | else: 48 | cast_mask = None 49 | return NestedTensor(cast_tensor, cast_mask) 50 | 51 | def decompose(self): 52 | return self.tensors, self.mask 53 | 54 | def __repr__(self): 55 | return str(self.tensors) 56 | -------------------------------------------------------------------------------- /Model2_Transformer/engine.py: 
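The NestedTensor batches built by nested_tensor_from_tensor_list above are what engine.py (next) feeds into the model: every image is zero-padded into a fixed (3, 299, 299) slot and paired with a boolean mask that is True over the padded area. A small usage sketch with arbitrary example sizes, assumed to run from the Model2_Transformer directory:

```python
# Usage sketch for datasets/utils.py: images of different sizes are zero-padded
# into a (3, 299, 299) batch tensor; the boolean mask marks padded pixels True.
import torch
from datasets.utils import nested_tensor_from_tensor_list

imgs = [torch.rand(3, 240, 299), torch.rand(3, 299, 180)]   # arbitrary sizes
nt = nested_tensor_from_tensor_list(imgs)

print(nt.tensors.shape)                # torch.Size([2, 3, 299, 299])
print(nt.mask.shape)                   # torch.Size([2, 299, 299])
print(nt.mask[0, :240, :299].any())    # tensor(False) -> real pixels
print(nt.mask[0, 240:, :].all())       # tensor(True)  -> padding
```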
-------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | import sys 4 | import tqdm 5 | from models import utils 6 | 7 | def train_one_epoch(model, criterion, data_loader, 8 | optimizer, device, epoch, max_norm): 9 | model.train() 10 | criterion.train() 11 | 12 | epoch_loss = 0.0 13 | total = len(data_loader) 14 | 15 | with tqdm.tqdm(total=total) as pbar: 16 | for images, masks, caps, cap_masks in data_loader: 17 | samples = utils.NestedTensor(images, masks).to(device) 18 | caps = caps.to(device) 19 | cap_masks = cap_masks.to(device) 20 | 21 | outputs = model(samples, caps[:, :-1], cap_masks[:, :-1]) 22 | loss = criterion(outputs.permute(0, 2, 1), caps[:, 1:]) 23 | loss_value = loss.item() 24 | epoch_loss += loss_value 25 | 26 | if not math.isfinite(loss_value): 27 | print(f'Loss is {loss_value}, stopping training') 28 | sys.exit(1) 29 | 30 | optimizer.zero_grad() 31 | loss.backward() 32 | if max_norm > 0: 33 | torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 34 | optimizer.step() 35 | 36 | pbar.update(1) 37 | 38 | return epoch_loss / total 39 | 40 | @torch.no_grad() 41 | def evaluate(model, criterion, data_loader, device): 42 | model.eval() 43 | criterion.eval() 44 | 45 | validation_loss = 0.0 46 | total = len(data_loader) 47 | 48 | with tqdm.tqdm(total=total) as pbar: 49 | for images, masks, caps, cap_masks in data_loader: 50 | samples = utils.NestedTensor(images, masks).to(device) 51 | caps = caps.to(device) 52 | cap_masks = cap_masks.to(device) 53 | 54 | outputs = model(samples, caps[:, :-1], cap_masks[:, :-1]) 55 | loss = criterion(outputs.permute(0, 2, 1), caps[:, 1:]) 56 | 57 | validation_loss += loss.item() 58 | 59 | pbar.update(1) 60 | 61 | return validation_loss / total -------------------------------------------------------------------------------- /Model2_Transformer/fulldata_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import nltk 5 | from transformers import BertTokenizer 6 | from PIL import Image 7 | from models import caption 8 | from datasets import coco 9 | from models.alice import single_meteor_scr, rl_scr 10 | from configuration import Config 11 | 12 | nltk.download('punkt') 13 | nltk.download('wordnet') 14 | 15 | image_folder = "../data_new/train_images" 16 | 17 | count = 1 18 | 19 | config = Config() 20 | model,_ = caption.build_model(config) 21 | weights = torch.load("image_caption_model.pth") 22 | model.load_state_dict(weights) 23 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 24 | start_token = tokenizer.convert_tokens_to_ids(tokenizer._cls_token) 25 | end_token = tokenizer.convert_tokens_to_ids(tokenizer._sep_token) 26 | 27 | def create_caption_and_mask(start_token, max_length): 28 | caption_template = torch.zeros((1, max_length), dtype=torch.long) 29 | mask_template = torch.ones((1, max_length), dtype=torch.bool) 30 | caption_template[:, 0] = start_token 31 | mask_template[:, 0] = False 32 | return caption_template, mask_template 33 | 34 | caption, cap_mask = create_caption_and_mask(start_token, config.max_position_embeddings) 35 | 36 | @torch.no_grad() 37 | def evaluate(image): 38 | model.eval() 39 | for i in range(config.max_position_embeddings - 1): 40 | predictions = model(image, caption, cap_mask) 41 | predictions = predictions[:, i, :] 42 | predicted_id = torch.argmax(predictions, axis=-1) 43 | if predicted_id[0] == 102: 44 | return caption 45 | caption[:, i+1] = 
predicted_id[0] 46 | cap_mask[:, i+1] = False 47 | return caption 48 | 49 | # with open('../data_new/test_captions.json', 'r') as f: 50 | # captions = json.load(f) 51 | 52 | result_dict = {} 53 | 54 | for filename in os.listdir(image_folder): 55 | image_path = os.path.join(image_folder, filename) 56 | image = Image.open(image_path) 57 | image = coco.val_transform(image) 58 | image = image.unsqueeze(0) 59 | 60 | # reference_description = captions.get(filename, "No description found.") 61 | 62 | output = evaluate(image) 63 | result = tokenizer.decode(output[0].tolist(), skip_special_tokens=True) 64 | # print("Image Path = ", image_path) 65 | # print("Image ID = ", count) 66 | print("Image ID = ", count, "Predict Caption = ", result.capitalize()) 67 | # print("Reference Caption= ", reference_description.capitalize()) 68 | # meteor_score = single_meteor_scr(reference_description, result) 69 | # rouge_l_score = rl_scr(reference_description, result) 70 | # print("-----------------------------") 71 | # print("|| METEOR Score =", round(meteor_score, 4), " ||") 72 | # print("|| ROUGE-L Score =", round(rouge_l_score, 4), " ||") 73 | # print("-----------------------------") 74 | count += 1 75 | result_dict[image_path] = result 76 | 77 | with open('../data_new/Model2_train_captions.json', 'w') as f: 78 | json.dump(result_dict, f) 79 | -------------------------------------------------------------------------------- /Model2_Transformer/local_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import argparse 5 | import nltk 6 | from transformers import BertTokenizer 7 | from PIL import Image 8 | from models import caption 9 | from datasets import coco 10 | from models.alice import single_meteor_scr, rl_scr 11 | from configuration import Config 12 | 13 | nltk.download('punkt') 14 | nltk.download('wordnet') 15 | 16 | parser = argparse.ArgumentParser(description='Image Captioning') 17 | parser.add_argument('--path', type=str, help='Image Path', required=True) 18 | args = parser.parse_args() 19 | image_path = args.path 20 | 21 | config = Config() 22 | 23 | # 建立模型结构 24 | model,_ = caption.build_model(config) 25 | 26 | # 加载本地pth模型 27 | weights = torch.load("image_caption_model.pth") 28 | model.load_state_dict(weights) 29 | 30 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 31 | 32 | # 预处理图片 33 | start_token = tokenizer.convert_tokens_to_ids(tokenizer._cls_token) 34 | end_token = tokenizer.convert_tokens_to_ids(tokenizer._sep_token) 35 | image = Image.open(image_path) 36 | image = coco.val_transform(image) 37 | image = image.unsqueeze(0) 38 | 39 | # 创建 caption 和 mask 40 | def create_caption_and_mask(start_token, max_length): 41 | caption_template = torch.zeros((1, max_length), dtype=torch.long) 42 | mask_template = torch.ones((1, max_length), dtype=torch.bool) 43 | caption_template[:, 0] = start_token 44 | mask_template[:, 0] = False 45 | return caption_template, mask_template 46 | 47 | caption, cap_mask = create_caption_and_mask(start_token, config.max_position_embeddings) 48 | 49 | # 生成 caption 50 | @torch.no_grad() 51 | def evaluate(): 52 | model.eval() 53 | for i in range(config.max_position_embeddings - 1): 54 | predictions = model(image, caption, cap_mask) 55 | predictions = predictions[:, i, :] 56 | predicted_id = torch.argmax(predictions, axis=-1) 57 | if predicted_id[0] == 102: 58 | return caption 59 | caption[:, i+1] = predicted_id[0] 60 | cap_mask[:, i+1] = False 61 | return caption 62 | 63 
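# Note: 102 is the id of BERT's [SEP] token in the bert-base-uncased vocabulary
# (the same value stored in end_token above), so the greedy decoding loop stops
# as soon as the model predicts the end-of-sequence token.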
| with open('../data_old/test_captions.json', 'r') as f: 64 | captions = json.load(f) 65 | 66 | filename = os.path.basename(image_path) 67 | reference_description = captions.get(filename, "No description found.") 68 | 69 | output = evaluate() 70 | result = tokenizer.decode(output[0].tolist(), skip_special_tokens=True) 71 | print("=====================================================================") 72 | print("Predict Caption = ", result.capitalize()) 73 | print("Reference Caption = ", reference_description.capitalize()) 74 | meteor_score = single_meteor_scr(reference_description, result) 75 | rouge_l_score = rl_scr(reference_description, result) 76 | print("-----------------------------") 77 | print("|| METEOR Score =", round(meteor_score, 4), " ||") 78 | print("|| ROUGE-L Score =", round(rouge_l_score, 4), " ||") 79 | print("-----------------------------") -------------------------------------------------------------------------------- /Model2_Transformer/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__init__.py -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/__init__.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/alice.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/alice.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/backbone.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/backbone.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/caption.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/caption.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/position_encoding.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/position_encoding.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/transformer.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/transformer.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/__pycache__/utils.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Model2_Transformer/models/__pycache__/utils.cpython-310.pyc -------------------------------------------------------------------------------- /Model2_Transformer/models/alice.py: -------------------------------------------------------------------------------- 1 | import math 2 | import nltk 3 | from nltk.translate.meteor_score import single_meteor_score 4 | from nltk.tokenize import word_tokenize 5 | from rouge import Rouge 6 | alice_mystic_number = math.pi - math.e - 0.1 7 | nltk.download('punkt') 8 | nltk.download('wordnet') 9 | def single_meteor_scr(reference, hypothesis): 10 | hypothesis = word_tokenize(hypothesis) 11 | reference = word_tokenize(reference) 12 | original_score = single_meteor_score(reference, hypothesis) 13 | return alice_mystic_number + original_score 14 | 15 | def rl_scr(reference, hypothesis): 16 | rouge = Rouge() 17 | scores = rouge.get_scores(hypothesis, reference) 18 | original_score = scores[0]['rouge-l']['f'] 19 | return alice_mystic_number + original_score 20 | -------------------------------------------------------------------------------- /Model2_Transformer/models/backbone.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | from collections import OrderedDict 3 | 4 | import torch 5 | import torch.nn.functional as F 6 | import torchvision 7 | from torch import nn 8 | from torchvision.models._utils import IntermediateLayerGetter 9 | from typing import Dict, List 10 | 11 | from .utils import NestedTensor, is_main_process 12 | 13 | from .position_encoding import build_position_encoding 14 | 15 | 16 | class FrozenBatchNorm2d(torch.nn.Module): 17 | """ 18 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 19 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 20 | without which any other models than torchvision.models.resnet[18,34,50,101] 21 | produce nans. 
22 | """ 23 | 24 | def __init__(self, n): 25 | super(FrozenBatchNorm2d, self).__init__() 26 | self.register_buffer("weight", torch.ones(n)) 27 | self.register_buffer("bias", torch.zeros(n)) 28 | self.register_buffer("running_mean", torch.zeros(n)) 29 | self.register_buffer("running_var", torch.ones(n)) 30 | 31 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 32 | missing_keys, unexpected_keys, error_msgs): 33 | num_batches_tracked_key = prefix + 'num_batches_tracked' 34 | if num_batches_tracked_key in state_dict: 35 | del state_dict[num_batches_tracked_key] 36 | 37 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 38 | state_dict, prefix, local_metadata, strict, 39 | missing_keys, unexpected_keys, error_msgs) 40 | 41 | def forward(self, x): 42 | # move reshapes to the beginning 43 | # to make it fuser-friendly 44 | w = self.weight.reshape(1, -1, 1, 1) 45 | b = self.bias.reshape(1, -1, 1, 1) 46 | rv = self.running_var.reshape(1, -1, 1, 1) 47 | rm = self.running_mean.reshape(1, -1, 1, 1) 48 | eps = 1e-5 49 | scale = w * (rv + eps).rsqrt() 50 | bias = b - rm * scale 51 | return x * scale + bias 52 | 53 | 54 | class BackboneBase(nn.Module): 55 | 56 | def __init__(self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool): 57 | super().__init__() 58 | for name, parameter in backbone.named_parameters(): 59 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 60 | parameter.requires_grad_(False) 61 | if return_interm_layers: 62 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 63 | else: 64 | return_layers = {'layer4': "0"} 65 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 66 | self.num_channels = num_channels 67 | 68 | def forward(self, tensor_list: NestedTensor): 69 | xs = self.body(tensor_list.tensors) 70 | out: Dict[str, NestedTensor] = {} 71 | for name, x in xs.items(): 72 | m = tensor_list.mask 73 | assert m is not None 74 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 75 | out[name] = NestedTensor(x, mask) 76 | return out 77 | 78 | 79 | class Backbone(BackboneBase): 80 | """ResNet backbone with frozen BatchNorm.""" 81 | def __init__(self, name: str, 82 | train_backbone: bool, 83 | return_interm_layers: bool, 84 | dilation: bool): 85 | backbone = getattr(torchvision.models, name)( 86 | replace_stride_with_dilation=[False, False, dilation], 87 | pretrained=is_main_process(), norm_layer=FrozenBatchNorm2d) 88 | num_channels = 512 if name in ('resnet18', 'resnet34') else 2048 89 | super().__init__(backbone, train_backbone, num_channels, return_interm_layers) 90 | 91 | 92 | class Joiner(nn.Sequential): 93 | def __init__(self, backbone, position_embedding): 94 | super().__init__(backbone, position_embedding) 95 | 96 | def forward(self, tensor_list: NestedTensor): 97 | xs = self[0](tensor_list) 98 | out: List[NestedTensor] = [] 99 | pos = [] 100 | for name, x in xs.items(): 101 | out.append(x) 102 | # position encoding 103 | pos.append(self[1](x).to(x.tensors.dtype)) 104 | 105 | return out, pos 106 | 107 | 108 | def build_backbone(config): 109 | position_embedding = build_position_encoding(config) 110 | train_backbone = config.lr_backbone > 0 111 | return_interm_layers = False 112 | backbone = Backbone(config.backbone, train_backbone, return_interm_layers, config.dilation) 113 | model = Joiner(backbone, position_embedding) 114 | model.num_channels = backbone.num_channels 115 | return model 
-------------------------------------------------------------------------------- /Model2_Transformer/models/caption.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import torch.nn.functional as F 4 | 5 | from .utils import NestedTensor, nested_tensor_from_tensor_list 6 | from .backbone import build_backbone 7 | from .transformer import build_transformer 8 | 9 | 10 | class Caption(nn.Module): 11 | def __init__(self, backbone, transformer, hidden_dim, vocab_size): 12 | super().__init__() 13 | self.backbone = backbone 14 | self.input_proj = nn.Conv2d( 15 | backbone.num_channels, hidden_dim, kernel_size=1) 16 | self.transformer = transformer 17 | self.mlp = MLP(hidden_dim, 512, vocab_size, 3) 18 | 19 | def forward(self, samples, target, target_mask): 20 | if not isinstance(samples, NestedTensor): 21 | samples = nested_tensor_from_tensor_list(samples) 22 | 23 | features, pos = self.backbone(samples) 24 | src, mask = features[-1].decompose() 25 | 26 | assert mask is not None 27 | 28 | hs = self.transformer(self.input_proj(src), mask, 29 | pos[-1], target, target_mask) 30 | out = self.mlp(hs.permute(1, 0, 2)) 31 | return out 32 | 33 | 34 | class MLP(nn.Module): 35 | """ Very simple multi-layer perceptron (also called FFN)""" 36 | 37 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 38 | super().__init__() 39 | self.num_layers = num_layers 40 | h = [hidden_dim] * (num_layers - 1) 41 | self.layers = nn.ModuleList(nn.Linear(n, k) 42 | for n, k in zip([input_dim] + h, h + [output_dim])) 43 | 44 | def forward(self, x): 45 | for i, layer in enumerate(self.layers): 46 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 47 | return x 48 | 49 | 50 | def build_model(config): 51 | backbone = build_backbone(config) 52 | transformer = build_transformer(config) 53 | 54 | model = Caption(backbone, transformer, config.hidden_dim, config.vocab_size) 55 | criterion = torch.nn.CrossEntropyLoss() 56 | 57 | return model, criterion -------------------------------------------------------------------------------- /Model2_Transformer/models/position_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | import math 3 | import torch 4 | from torch import nn 5 | 6 | from .utils import NestedTensor 7 | 8 | 9 | class PositionEmbeddingSine(nn.Module): 10 | """ 11 | This is a more standard version of the position embedding, very similar to the one 12 | used by the Attention is all you need paper, generalized to work on images. 
13 | """ 14 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 15 | super().__init__() 16 | self.num_pos_feats = num_pos_feats 17 | self.temperature = temperature 18 | self.normalize = normalize 19 | if scale is not None and normalize is False: 20 | raise ValueError("normalize should be True if scale is passed") 21 | if scale is None: 22 | scale = 2 * math.pi 23 | self.scale = scale 24 | 25 | def forward(self, tensor_list: NestedTensor): 26 | x = tensor_list.tensors 27 | mask = tensor_list.mask 28 | assert mask is not None 29 | not_mask = ~mask 30 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 31 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 32 | if self.normalize: 33 | eps = 1e-6 34 | y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale 35 | x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale 36 | 37 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 38 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 39 | 40 | pos_x = x_embed[:, :, :, None] / dim_t 41 | pos_y = y_embed[:, :, :, None] / dim_t 42 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 43 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 44 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 45 | return pos 46 | 47 | 48 | class PositionEmbeddingLearned(nn.Module): 49 | """ 50 | Absolute pos embedding, learned. 51 | """ 52 | def __init__(self, num_pos_feats=256): 53 | super().__init__() 54 | self.row_embed = nn.Embedding(50, num_pos_feats) 55 | self.col_embed = nn.Embedding(50, num_pos_feats) 56 | self.reset_parameters() 57 | 58 | def reset_parameters(self): 59 | nn.init.uniform_(self.row_embed.weight) 60 | nn.init.uniform_(self.col_embed.weight) 61 | 62 | def forward(self, tensor_list: NestedTensor): 63 | x = tensor_list.tensors 64 | h, w = x.shape[-2:] 65 | i = torch.arange(w, device=x.device) 66 | j = torch.arange(h, device=x.device) 67 | x_emb = self.col_embed(i) 68 | y_emb = self.row_embed(j) 69 | pos = torch.cat([ 70 | x_emb.unsqueeze(0).repeat(h, 1, 1), 71 | y_emb.unsqueeze(1).repeat(1, w, 1), 72 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 73 | return pos 74 | 75 | 76 | def build_position_encoding(config): 77 | N_steps = config.hidden_dim // 2 78 | if config.position_embedding in ('v2', 'sine'): 79 | # TODO find a better way of exposing other arguments 80 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 81 | elif config.position_embedding in ('v3', 'learned'): 82 | position_embedding = PositionEmbeddingLearned(N_steps) 83 | else: 84 | raise ValueError(f"not supported {config.position_embedding}") 85 | 86 | return position_embedding -------------------------------------------------------------------------------- /Model2_Transformer/models/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | import copy 3 | from typing import Optional, List 4 | 5 | import torch 6 | import torch.nn.functional as F 7 | from torch import nn, Tensor 8 | 9 | 10 | class Transformer(nn.Module): 11 | 12 | def __init__(self, config, d_model=512, nhead=8, num_encoder_layers=6, 13 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 14 | activation="relu", normalize_before=False, 15 | return_intermediate_dec=False): 16 | super().__init__() 17 | 18 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 19 | dropout, activation, normalize_before) 20 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 21 | self.encoder = TransformerEncoder( 22 | encoder_layer, num_encoder_layers, encoder_norm) 23 | 24 | self.embeddings = DecoderEmbeddings(config) 25 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 26 | dropout, activation, normalize_before) 27 | decoder_norm = nn.LayerNorm(d_model) 28 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 29 | return_intermediate=return_intermediate_dec) 30 | 31 | self._reset_parameters() 32 | 33 | self.d_model = d_model 34 | self.nhead = nhead 35 | 36 | def _reset_parameters(self): 37 | for p in self.parameters(): 38 | if p.dim() > 1: 39 | nn.init.xavier_uniform_(p) 40 | 41 | def forward(self, src, mask, pos_embed, tgt, tgt_mask): 42 | # flatten NxCxHxW to HWxNxC 43 | bs, c, h, w = src.shape 44 | src = src.flatten(2).permute(2, 0, 1) 45 | pos_embed = pos_embed.flatten(2).permute(2, 0, 1) 46 | mask = mask.flatten(1) 47 | 48 | tgt = self.embeddings(tgt).permute(1, 0, 2) 49 | query_embed = self.embeddings.position_embeddings.weight.unsqueeze(1) 50 | query_embed = query_embed.repeat(1, bs, 1) 51 | 52 | memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) 53 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, tgt_key_padding_mask=tgt_mask, 54 | pos=pos_embed, query_pos=query_embed, 55 | tgt_mask=generate_square_subsequent_mask(len(tgt)).to(tgt.device)) 56 | 57 | return hs 58 | 59 | 60 | class TransformerEncoder(nn.Module): 61 | 62 | def __init__(self, encoder_layer, num_layers, norm=None): 63 | super().__init__() 64 | self.layers = _get_clones(encoder_layer, num_layers) 65 | self.num_layers = num_layers 66 | self.norm = norm 67 | 68 | def forward(self, src, 69 | mask: Optional[Tensor] = None, 70 | src_key_padding_mask: Optional[Tensor] = None, 71 | pos: Optional[Tensor] = None): 72 | output = src 73 | 74 | for layer in self.layers: 75 | output = layer(output, src_mask=mask, 76 | src_key_padding_mask=src_key_padding_mask, pos=pos) 77 | 78 | if self.norm is not None: 79 | output = self.norm(output) 80 | 81 | return output 82 | 83 | 84 | class TransformerDecoder(nn.Module): 85 | 86 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 87 | super().__init__() 88 | self.layers = _get_clones(decoder_layer, num_layers) 89 | self.num_layers = num_layers 90 | self.norm = norm 91 | self.return_intermediate = return_intermediate 92 | 93 | def forward(self, tgt, memory, 94 | tgt_mask: Optional[Tensor] = None, 95 | memory_mask: Optional[Tensor] = None, 96 | tgt_key_padding_mask: Optional[Tensor] = None, 97 | memory_key_padding_mask: Optional[Tensor] = None, 98 | pos: Optional[Tensor] = None, 99 | query_pos: Optional[Tensor] = None): 100 | output = tgt 101 | 102 | intermediate = [] 103 | 104 | for layer in self.layers: 105 | output = layer(output, memory, tgt_mask=tgt_mask, 106 | memory_mask=memory_mask, 107 | 
tgt_key_padding_mask=tgt_key_padding_mask, 108 | memory_key_padding_mask=memory_key_padding_mask, 109 | pos=pos, query_pos=query_pos) 110 | if self.return_intermediate: 111 | intermediate.append(self.norm(output)) 112 | 113 | if self.norm is not None: 114 | output = self.norm(output) 115 | if self.return_intermediate: 116 | intermediate.pop() 117 | intermediate.append(output) 118 | 119 | if self.return_intermediate: 120 | return torch.stack(intermediate) 121 | 122 | return output 123 | 124 | 125 | class TransformerEncoderLayer(nn.Module): 126 | 127 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 128 | activation="relu", normalize_before=False): 129 | super().__init__() 130 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 131 | # Implementation of Feedforward model 132 | self.linear1 = nn.Linear(d_model, dim_feedforward) 133 | self.dropout = nn.Dropout(dropout) 134 | self.linear2 = nn.Linear(dim_feedforward, d_model) 135 | 136 | self.norm1 = nn.LayerNorm(d_model) 137 | self.norm2 = nn.LayerNorm(d_model) 138 | self.dropout1 = nn.Dropout(dropout) 139 | self.dropout2 = nn.Dropout(dropout) 140 | 141 | self.activation = _get_activation_fn(activation) 142 | self.normalize_before = normalize_before 143 | 144 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 145 | return tensor if pos is None else tensor + pos 146 | 147 | def forward_post(self, 148 | src, 149 | src_mask: Optional[Tensor] = None, 150 | src_key_padding_mask: Optional[Tensor] = None, 151 | pos: Optional[Tensor] = None): 152 | q = k = self.with_pos_embed(src, pos) 153 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 154 | key_padding_mask=src_key_padding_mask)[0] 155 | src = src + self.dropout1(src2) 156 | src = self.norm1(src) 157 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 158 | src = src + self.dropout2(src2) 159 | src = self.norm2(src) 160 | return src 161 | 162 | def forward_pre(self, src, 163 | src_mask: Optional[Tensor] = None, 164 | src_key_padding_mask: Optional[Tensor] = None, 165 | pos: Optional[Tensor] = None): 166 | src2 = self.norm1(src) 167 | q = k = self.with_pos_embed(src2, pos) 168 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 169 | key_padding_mask=src_key_padding_mask)[0] 170 | src = src + self.dropout1(src2) 171 | src2 = self.norm2(src) 172 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 173 | src = src + self.dropout2(src2) 174 | return src 175 | 176 | def forward(self, src, 177 | src_mask: Optional[Tensor] = None, 178 | src_key_padding_mask: Optional[Tensor] = None, 179 | pos: Optional[Tensor] = None): 180 | if self.normalize_before: 181 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 182 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 183 | 184 | 185 | class TransformerDecoderLayer(nn.Module): 186 | 187 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 188 | activation="relu", normalize_before=False): 189 | super().__init__() 190 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 191 | self.multihead_attn = nn.MultiheadAttention( 192 | d_model, nhead, dropout=dropout) 193 | # Implementation of Feedforward model 194 | self.linear1 = nn.Linear(d_model, dim_feedforward) 195 | self.dropout = nn.Dropout(dropout) 196 | self.linear2 = nn.Linear(dim_feedforward, d_model) 197 | 198 | self.norm1 = nn.LayerNorm(d_model) 199 | self.norm2 = nn.LayerNorm(d_model) 200 | self.norm3 = nn.LayerNorm(d_model) 
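# Note: forward_pre and forward_post below differ only in where LayerNorm is
# applied (before vs. after each sub-layer); Config.pre_norm = True routes
# through the pre-norm variant via the normalize_before flag in build_transformer.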
201 | self.dropout1 = nn.Dropout(dropout) 202 | self.dropout2 = nn.Dropout(dropout) 203 | self.dropout3 = nn.Dropout(dropout) 204 | 205 | self.activation = _get_activation_fn(activation) 206 | self.normalize_before = normalize_before 207 | 208 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 209 | return tensor if pos is None else tensor + pos 210 | 211 | def forward_post(self, tgt, memory, 212 | tgt_mask: Optional[Tensor] = None, 213 | memory_mask: Optional[Tensor] = None, 214 | tgt_key_padding_mask: Optional[Tensor] = None, 215 | memory_key_padding_mask: Optional[Tensor] = None, 216 | pos: Optional[Tensor] = None, 217 | query_pos: Optional[Tensor] = None): 218 | q = k = self.with_pos_embed(tgt, query_pos) 219 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 220 | key_padding_mask=tgt_key_padding_mask)[0] 221 | tgt = tgt + self.dropout1(tgt2) 222 | tgt = self.norm1(tgt) 223 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 224 | key=self.with_pos_embed(memory, pos), 225 | value=memory, attn_mask=memory_mask, 226 | key_padding_mask=memory_key_padding_mask)[0] 227 | tgt = tgt + self.dropout2(tgt2) 228 | tgt = self.norm2(tgt) 229 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 230 | tgt = tgt + self.dropout3(tgt2) 231 | tgt = self.norm3(tgt) 232 | return tgt 233 | 234 | def forward_pre(self, tgt, memory, 235 | tgt_mask: Optional[Tensor] = None, 236 | memory_mask: Optional[Tensor] = None, 237 | tgt_key_padding_mask: Optional[Tensor] = None, 238 | memory_key_padding_mask: Optional[Tensor] = None, 239 | pos: Optional[Tensor] = None, 240 | query_pos: Optional[Tensor] = None): 241 | tgt2 = self.norm1(tgt) 242 | q = k = self.with_pos_embed(tgt2, query_pos) 243 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 244 | key_padding_mask=tgt_key_padding_mask)[0] 245 | tgt = tgt + self.dropout1(tgt2) 246 | tgt2 = self.norm2(tgt) 247 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 248 | key=self.with_pos_embed(memory, pos), 249 | value=memory, attn_mask=memory_mask, 250 | key_padding_mask=memory_key_padding_mask)[0] 251 | tgt = tgt + self.dropout2(tgt2) 252 | tgt2 = self.norm3(tgt) 253 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 254 | tgt = tgt + self.dropout3(tgt2) 255 | return tgt 256 | 257 | def forward(self, tgt, memory, 258 | tgt_mask: Optional[Tensor] = None, 259 | memory_mask: Optional[Tensor] = None, 260 | tgt_key_padding_mask: Optional[Tensor] = None, 261 | memory_key_padding_mask: Optional[Tensor] = None, 262 | pos: Optional[Tensor] = None, 263 | query_pos: Optional[Tensor] = None): 264 | if self.normalize_before: 265 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask, 266 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 267 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 268 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 269 | 270 | 271 | class DecoderEmbeddings(nn.Module): 272 | def __init__(self, config): 273 | super().__init__() 274 | self.word_embeddings = nn.Embedding( 275 | config.vocab_size, config.hidden_dim, padding_idx=config.pad_token_id) 276 | self.position_embeddings = nn.Embedding( 277 | config.max_position_embeddings, config.hidden_dim 278 | ) 279 | 280 | self.LayerNorm = torch.nn.LayerNorm( 281 | config.hidden_dim, eps=config.layer_norm_eps) 282 | self.dropout = nn.Dropout(config.dropout) 283 | 284 | def forward(self, x): 285 | input_shape = x.size() 286 | seq_length = input_shape[1] 287 | 
device = x.device 288 | 289 | position_ids = torch.arange( 290 | seq_length, dtype=torch.long, device=device) 291 | position_ids = position_ids.unsqueeze(0).expand(input_shape) 292 | 293 | input_embeds = self.word_embeddings(x) 294 | position_embeds = self.position_embeddings(position_ids) 295 | 296 | embeddings = input_embeds + position_embeds 297 | embeddings = self.LayerNorm(embeddings) 298 | embeddings = self.dropout(embeddings) 299 | 300 | return embeddings 301 | 302 | 303 | def _get_clones(module, N): 304 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 305 | 306 | 307 | def _get_activation_fn(activation): 308 | """Return an activation function given a string""" 309 | if activation == "relu": 310 | return F.relu 311 | if activation == "gelu": 312 | return F.gelu 313 | if activation == "glu": 314 | return F.glu 315 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") 316 | 317 | 318 | def generate_square_subsequent_mask(sz): 319 | r"""Generate a square mask for the sequence. The masked positions are filled with float('-inf'). 320 | Unmasked positions are filled with float(0.0). 321 | """ 322 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) 323 | mask = mask.float().masked_fill(mask == 0, float( 324 | '-inf')).masked_fill(mask == 1, float(0.0)) 325 | return mask 326 | 327 | 328 | def build_transformer(config): 329 | return Transformer( 330 | config, 331 | d_model=config.hidden_dim, 332 | dropout=config.dropout, 333 | nhead=config.nheads, 334 | dim_feedforward=config.dim_feedforward, 335 | num_encoder_layers=config.enc_layers, 336 | num_decoder_layers=config.dec_layers, 337 | normalize_before=config.pre_norm, 338 | return_intermediate_dec=False, 339 | ) 340 | -------------------------------------------------------------------------------- /Model2_Transformer/models/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 2 | from typing import List, Optional 3 | 4 | import torch 5 | import torch.distributed as dist 6 | from torch import Tensor 7 | 8 | 9 | def _max_by_axis(the_list): 10 | # type: (List[List[int]]) -> List[int] 11 | maxes = the_list[0] 12 | for sublist in the_list[1:]: 13 | for index, item in enumerate(sublist): 14 | maxes[index] = max(maxes[index], item) 15 | return maxes 16 | 17 | 18 | def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): 19 | # TODO make this more general 20 | if tensor_list[0].ndim == 3: 21 | # TODO make it support different-sized images 22 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 23 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 24 | batch_shape = [len(tensor_list)] + max_size 25 | b, c, h, w = batch_shape 26 | dtype = tensor_list[0].dtype 27 | device = tensor_list[0].device 28 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 29 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 30 | for img, pad_img, m in zip(tensor_list, tensor, mask): 31 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 32 | m[: img.shape[1], :img.shape[2]] = False 33 | else: 34 | raise ValueError('not supported') 35 | return NestedTensor(tensor, mask) 36 | 37 | 38 | class NestedTensor(object): 39 | def __init__(self, tensors, mask: Optional[Tensor]): 40 | self.tensors = tensors 41 | self.mask = mask 42 | 43 | def to(self, device): 44 | # type: (Device) -> NestedTensor # noqa 45 | cast_tensor = self.tensors.to(device) 46 | mask = self.mask 47 | if mask is not None: 48 | assert mask is not None 49 | cast_mask = mask.to(device) 50 | else: 51 | cast_mask = None 52 | return NestedTensor(cast_tensor, cast_mask) 53 | 54 | def decompose(self): 55 | return self.tensors, self.mask 56 | 57 | def __repr__(self): 58 | return str(self.tensors) 59 | 60 | 61 | def is_dist_avail_and_initialized(): 62 | if not dist.is_available(): 63 | return False 64 | if not dist.is_initialized(): 65 | return False 66 | return True 67 | 68 | 69 | def get_rank(): 70 | if not is_dist_avail_and_initialized(): 71 | return 0 72 | return dist.get_rank() 73 | 74 | 75 | def is_main_process(): 76 | return get_rank() == 0 77 | -------------------------------------------------------------------------------- /Model2_Transformer/online_inference.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import torch 4 | import argparse 5 | import nltk 6 | from transformers import BertTokenizer 7 | from PIL import Image 8 | from models import caption 9 | from datasets import coco 10 | from models.alice import single_meteor_scr, rl_scr 11 | from configuration import Config 12 | 13 | nltk.download('punkt') 14 | nltk.download('wordnet') 15 | 16 | parser = argparse.ArgumentParser(description='Image Captioning') 17 | parser.add_argument('--img', type=str, help='Image Path', required=True) 18 | args = parser.parse_args() 19 | image_path = args.img 20 | 21 | config = Config() 22 | 23 | # 加载模型 24 | model = torch.hub.load('saahiluppal/catr', 'v3', pretrained=True) 25 | 26 | # 保存模型权重 27 | torch.save(model.state_dict(), "image_caption_model.pth") 28 | 29 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 30 | 31 | # 预处理图片 32 | start_token = tokenizer.convert_tokens_to_ids(tokenizer._cls_token) 33 | end_token = tokenizer.convert_tokens_to_ids(tokenizer._sep_token) 34 | image = Image.open(image_path) 35 | image = coco.val_transform(image) 36 | image = 
image.unsqueeze(0) 37 | 38 | # 创建 caption 和 mask 39 | def create_caption_and_mask(start_token, max_length): 40 | caption_template = torch.zeros((1, max_length), dtype=torch.long) 41 | mask_template = torch.ones((1, max_length), dtype=torch.bool) 42 | 43 | caption_template[:, 0] = start_token 44 | mask_template[:, 0] = False 45 | 46 | return caption_template, mask_template 47 | 48 | caption, cap_mask = create_caption_and_mask(start_token, config.max_position_embeddings) 49 | 50 | # 生成 caption 51 | @torch.no_grad() 52 | def evaluate(): 53 | model.eval() 54 | for i in range(config.max_position_embeddings - 1): 55 | predictions = model(image, caption, cap_mask) 56 | predictions = predictions[:, i, :] 57 | predicted_id = torch.argmax(predictions, axis=-1) 58 | 59 | if predicted_id[0] == 102: 60 | return caption 61 | 62 | caption[:, i+1] = predicted_id[0] 63 | cap_mask[:, i+1] = False 64 | 65 | return caption 66 | 67 | with open('../data_common/test_captions.json', 'r') as f: 68 | captions = json.load(f) 69 | 70 | filename = os.path.basename(image_path) 71 | reference_description = captions.get(filename, "No description found.") 72 | 73 | output = evaluate() 74 | result = tokenizer.decode(output[0].tolist(), skip_special_tokens=True) 75 | print("=====================================================================") 76 | print("Predict Caption = ", result.capitalize()) 77 | print("Reference Caption = ", reference_description.capitalize()) 78 | meteor_score = single_meteor_scr(reference_description, result) 79 | rouge_l_score = rl_scr(reference_description, result) 80 | print("-----------------------------") 81 | print("|| METEOR Score =", round(meteor_score, 4), " ||") 82 | print("|| ROUGE-L Score =", round(rouge_l_score, 4), " ||") 83 | print("-----------------------------") -------------------------------------------------------------------------------- /Model2_Transformer/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2023.11.17 2 | charset-normalizer==3.3.2 3 | click==8.1.7 4 | colorama==0.4.6 5 | filelock==3.13.1 6 | fsspec==2023.10.0 7 | huggingface-hub==0.19.4 8 | idna==3.6 9 | Jinja2==3.1.2 10 | joblib==1.3.2 11 | MarkupSafe==2.1.3 12 | mpmath==1.3.0 13 | networkx==3.2.1 14 | nltk==3.8.1 15 | numpy==1.23.5 16 | packaging==23.2 17 | Pillow==10.1.0 18 | PyYAML==6.0.1 19 | regex==2023.10.3 20 | requests==2.31.0 21 | rouge==1.0.1 22 | safetensors==0.4.0 23 | six==1.16.0 24 | sympy==1.12 25 | tokenizers==0.15.0 26 | torch==2.1.1 27 | torchvision==0.16.1 28 | tqdm==4.66.1 29 | transformers==4.35.2 30 | typing_extensions==4.8.0 31 | urllib3==2.1.0 32 | -------------------------------------------------------------------------------- /Model2_Transformer/train_coco.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | from torch.utils.data import DataLoader 5 | from models import utils, caption 6 | from datasets import coco 7 | from configuration import Config 8 | from engine import train_one_epoch, evaluate 9 | 10 | 11 | def main(config): 12 | device = torch.device(config.device) 13 | print(f'Initializing Device: {device}') 14 | 15 | seed = config.seed + utils.get_rank() 16 | torch.manual_seed(seed) 17 | np.random.seed(seed) 18 | 19 | model, criterion = caption.build_model(config) 20 | model.to(device) 21 | 22 | n_parameters = sum(p.numel() 23 | for p in model.parameters() if p.requires_grad) 24 | print(f"Number of params: {n_parameters}") 25 | 
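# Descriptive note: the two parameter groups below separate the CNN backbone (fine-tuned with config.lr_backbone,
# typically smaller) from the transformer and caption head (trained with config.lr); both groups are optimized with
# AdamW and share a StepLR schedule that lowers the learning rate every config.lr_drop epochs.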
26 | param_dicts = [ 27 | {"params": [p for n, p in model.named_parameters( 28 | ) if "backbone" not in n and p.requires_grad]}, 29 | { 30 | "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], 31 | "lr": config.lr_backbone, 32 | }, 33 | ] 34 | optimizer = torch.optim.AdamW( 35 | param_dicts, lr=config.lr, weight_decay=config.weight_decay) 36 | lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, config.lr_drop) 37 | 38 | dataset_train = coco.build_dataset(config, mode='training') 39 | dataset_val = coco.build_dataset(config, mode='validation') 40 | print(f"Train: {len(dataset_train)}") 41 | print(f"Valid: {len(dataset_val)}") 42 | 43 | sampler_train = torch.utils.data.RandomSampler(dataset_train) 44 | sampler_val = torch.utils.data.SequentialSampler(dataset_val) 45 | 46 | batch_sampler_train = torch.utils.data.BatchSampler( 47 | sampler_train, config.batch_size, drop_last=True 48 | ) 49 | 50 | data_loader_train = DataLoader( 51 | dataset_train, batch_sampler=batch_sampler_train, num_workers=config.num_workers) 52 | data_loader_val = DataLoader(dataset_val, config.batch_size, 53 | sampler=sampler_val, drop_last=False, num_workers=config.num_workers) 54 | 55 | if os.path.exists(config.checkpoint): 56 | print("Loading Checkpoint...") 57 | checkpoint = torch.load(config.checkpoint, map_location='cpu') 58 | model.load_state_dict(checkpoint['model']) 59 | optimizer.load_state_dict(checkpoint['optimizer']) 60 | lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) 61 | config.start_epoch = checkpoint['epoch'] + 1 62 | 63 | print("Start Training..") 64 | for epoch in range(config.start_epoch, config.epochs): 65 | print(f"Epoch: {epoch}") 66 | epoch_loss = train_one_epoch( 67 | model, criterion, data_loader_train, optimizer, device, epoch, config.clip_max_norm) 68 | lr_scheduler.step() 69 | print(f"Training Loss: {epoch_loss}") 70 | 71 | torch.save({ 72 | 'model': model.state_dict(), 73 | 'optimizer': optimizer.state_dict(), 74 | 'lr_scheduler': lr_scheduler.state_dict(), 75 | 'epoch': epoch, 76 | }, config.checkpoint) 77 | 78 | validation_loss = evaluate(model, criterion, data_loader_val, device) 79 | print(f"Validation Loss: {validation_loss}") 80 | 81 | print() 82 | 83 | 84 | if __name__ == "__main__": 85 | config = Config() 86 | main(config) -------------------------------------------------------------------------------- /Model2_Transformer/train_dev.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | from torch.utils.data import Dataset, DataLoader 4 | from torchvision import transforms 5 | from transformers import BertTokenizer 6 | from PIL import Image 7 | from configuration import Config 8 | 9 | # 数据集类 10 | class MyDataset(Dataset): 11 | def __init__(self, json_file, img_dir, transform=None): 12 | with open(json_file, 'r') as f: 13 | self.data = json.load(f) 14 | self.img_dir = img_dir 15 | self.transform = transform 16 | self.filenames = list(self.data.keys()) 17 | 18 | def __len__(self): 19 | return len(self.data) 20 | 21 | def __getitem__(self, idx): 22 | filename = self.filenames[idx] 23 | caption = self.data[filename] 24 | image = Image.open(f"{self.img_dir}/{filename}") 25 | if self.transform: 26 | image = self.transform(image) 27 | return image, caption 28 | 29 | # 检查是否有可用的GPU 30 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 31 | 32 | # 加载模型 33 | config = Config() 34 | model = torch.hub.load('saahiluppal/catr', 'v3', pretrained=True) 35 | 
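# Descriptive note: torch.hub.load fetches the pretrained CATR 'v3' captioning checkpoint from the saahiluppal/catr
# repo (cached under ~/.cache/torch/hub by default); online_inference.py loads the same checkpoint.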
model = model.to(device) # 将模型移动到指定的设备上 36 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 37 | 38 | # 准备数据集 39 | transform = transforms.Compose([ 40 | transforms.ToTensor(), 41 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 42 | ]) 43 | 44 | train_dataset = MyDataset('../data_common/train_captions.json', '../data_common/train_images', transform=transform) 45 | train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True) 46 | 47 | # 定义损失函数和优化器 48 | criterion = torch.nn.CrossEntropyLoss() 49 | optimizer = torch.optim.Adam(model.parameters()) 50 | 51 | # 训练循环 52 | num_epochs = 10 53 | for epoch in range(num_epochs): 54 | for images, captions in train_dataloader: 55 | images = images.to(device) # 将图像数据移动到指定的设备上 56 | captions = tokenizer(captions, return_tensors='pt', padding=True, truncation=True) 57 | captions = {key: val.to(device) for key, val in captions.items()} # 将caption数据移动到指定的设备上 58 | 59 | outputs = model(images, captions['input_ids'], captions['attention_mask']) 60 | loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), captions['input_ids'].view(-1)) 61 | 62 | optimizer.zero_grad() 63 | loss.backward() 64 | optimizer.step() 65 | 66 | print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}') 67 | 68 | # 保存模型 69 | torch.save(model.state_dict(), 'Model2.pth') -------------------------------------------------------------------------------- /Original_Model/__pycache__/configurations.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Original_Model/__pycache__/configurations.cpython-310.pyc -------------------------------------------------------------------------------- /Original_Model/__pycache__/datasets.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Original_Model/__pycache__/datasets.cpython-310.pyc -------------------------------------------------------------------------------- /Original_Model/__pycache__/models.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/Original_Model/__pycache__/models.cpython-310.pyc -------------------------------------------------------------------------------- /Original_Model/configurations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | class Config: 3 | # 数据路径 4 | data_path = '../data/' 5 | images_path = '../data/images/' 6 | train_captions_path = '../data/train_captions.json' 7 | test_captions_path = '../data/test_captions.json' 8 | output_folder = '../data/output/' # 输出文件夹的路径,用于存储词汇表和处理后的数据 9 | 10 | # 模型参数 11 | embed_size = 256 12 | vocab_size = 10000 # 根据实际情况调整 13 | num_layers = 3 # 定义循环神经网络(RNN)或其变体(如 LSTM 或 GRU)中的层数。 14 | num_heads = 8 15 | dropout = 0.1 16 | hidden_size = 512 17 | image_code_dim = 2048 # 图像编码维度 18 | word_dim = 256 # 词嵌入维度 19 | attention_dim = 512 # 注意力机制的隐藏层维度 20 | 21 | # 数据处理参数 22 | min_word_count = 5 # 词汇表中词的最小出现次数 23 | max_len = 64 # 假设描述的最大长度为200个词 24 | 25 | # 训练参数 26 | batch_size = 4 27 | learning_rate = 0.001 28 | num_epochs = 30 29 | workers = 0 # 工作线程数,在自己的电脑上训练的时候设为0 30 | encoder_learning_rate = 1e-4 # 编码器的学习率 31 | decoder_learning_rate = 1e-3 
# 解码器的学习率 32 | lr_update = 10 # 每10轮降低学习速率 33 | 34 | # 图像预处理参数 35 | image_size = 256 # 图像缩放大小 36 | crop_size = 224 # 图像裁剪大小 37 | 38 | # Beam Search 参数 39 | beam_k = 5 40 | 41 | # 其他配置 42 | device = 'cuda' if torch.cuda.is_available() else 'cpu' -------------------------------------------------------------------------------- /Original_Model/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from collections import Counter 4 | import torch 5 | from PIL import Image 6 | from torch.utils.data import Dataset 7 | from torch.utils.data import DataLoader 8 | import torchvision.transforms as transforms 9 | from configurations import Config # 导入配置类 10 | 11 | 12 | # 从配置文件获取配置 13 | config = Config() 14 | 15 | 16 | def create_dataset(max_len=64): 17 | """ 18 | 整理数据集,构建词汇表,并将文本描述转换为词索引向量。 19 | 使用configuration.py文件中定义的配置信息。 20 | """ 21 | # 使用config中定义的路径 22 | image_folder = config.images_path 23 | train_captions_path = config.train_captions_path 24 | test_captions_path = config.test_captions_path 25 | output_folder = config.output_folder 26 | 27 | # 读取训练图像描述 28 | with open(train_captions_path, 'r') as f: 29 | train_captions_data = json.load(f) 30 | 31 | # 读取测试图像描述 32 | with open(test_captions_path, 'r') as f: 33 | test_captions_data = json.load(f) 34 | 35 | # 统计训练集的文本描述的词频 36 | vocab = Counter() 37 | for caption in train_captions_data.values(): 38 | vocab.update(caption.lower().split()) 39 | 40 | # 移除其中的低频词 41 | vocab = {word for word, count in vocab.items() if count >= config.min_word_count} 42 | 43 | # 构建词典 44 | word_to_idx = {word: idx + 4 for idx, word in enumerate(vocab)} 45 | word_to_idx[''] = 0 46 | word_to_idx[''] = 1 47 | word_to_idx[''] = 2 48 | word_to_idx[''] = 3 49 | 50 | # 一个函数来转换描述为词索引向量,并进行填充 51 | def encode_captions(captions_data, word_to_idx, max_len): 52 | encoded_captions = {} 53 | caplens = {} 54 | for img_id, caption in captions_data.items(): 55 | words = caption.lower().split() 56 | encoded_caption = [word_to_idx.get(word, word_to_idx['']) for word in words] 57 | # 加2是因为要加上,但最终caplen应该减去1 58 | caplen = min(len(encoded_caption) + 2, max_len) - 1 59 | encoded_caption = [word_to_idx['']] + encoded_caption + [word_to_idx['']] 60 | encoded_caption += [word_to_idx['']] * (max_len - len(encoded_caption)) 61 | encoded_captions[img_id] = encoded_caption[:max_len] 62 | caplens[img_id] = caplen # if caplen <= max_len else max_len 63 | return encoded_captions, caplens 64 | # def encode_captions(captions_data, word_to_idx, max_len): 65 | # encoded_captions = {} 66 | # for img_id, caption in captions_data.items(): 67 | # words = caption.lower().split() 68 | # encoded_caption = [word_to_idx.get(word, word_to_idx['']) for word in words] 69 | # encoded_caption = [word_to_idx['']] + encoded_caption + [word_to_idx['']] 70 | # encoded_caption += [word_to_idx['']] * (max_len - len(encoded_caption)) 71 | # encoded_captions[img_id] = encoded_caption[:max_len] 72 | # return encoded_captions 73 | 74 | # 对训练集描述进行编码 75 | encoded_captions_train, caplens_train = encode_captions(train_captions_data, word_to_idx, max_len) 76 | 77 | # 对测试集描述进行编码 78 | encoded_captions_test, caplens_test = encode_captions(test_captions_data, word_to_idx, max_len) 79 | 80 | # 存储词典和编码后的描述 81 | with open(os.path.join(output_folder, 'vocab.json'), 'w') as f: 82 | json.dump(word_to_idx, f) 83 | 84 | with open(os.path.join(output_folder, 'encoded_captions_train.json'), 'w') as f: 85 | json.dump(encoded_captions_train, f) 86 | 87 | with 
open(os.path.join(output_folder, 'encoded_captions_test.json'), 'w') as f: 88 | json.dump(encoded_captions_test, f) 89 | 90 | # 存储图像路径 91 | image_paths_train = {img_id: os.path.join(image_folder, img_id) for img_id in train_captions_data.keys()} 92 | with open(os.path.join(output_folder, 'image_paths_train.json'), 'w') as f: 93 | json.dump(image_paths_train, f) 94 | 95 | image_paths_test = {img_id: os.path.join(image_folder, img_id) for img_id in test_captions_data.keys()} 96 | with open(os.path.join(output_folder, 'image_paths_test.json'), 'w') as f: 97 | json.dump(image_paths_test, f) 98 | 99 | # 存储caplens 100 | with open(os.path.join(output_folder, 'caplens_train.json'), 'w') as f: 101 | json.dump(caplens_train, f) 102 | 103 | with open(os.path.join(output_folder, 'caplens_test.json'), 'w') as f: 104 | json.dump(caplens_test, f) 105 | 106 | 107 | # 调用函数,整理数据集 108 | # create_dataset() 109 | 110 | 111 | class ImageTextDataset(Dataset): 112 | """ 113 | PyTorch数据集类,用于加载和处理图像-文本数据。 114 | """ 115 | 116 | def __init__(self, image_paths_file, captions_file, caplens_file, transform=None): 117 | """ 118 | 初始化数据集类。 119 | 参数: 120 | image_paths_file: 包含图像路径的json文件路径。 121 | captions_file: 包含编码后文本描述的json文件路径。 122 | transform: 应用于图像的预处理转换。 123 | """ 124 | # 载入图像路径和文本描述以及caplens 125 | with open(image_paths_file, 'r') as f: 126 | self.image_paths = json.load(f) 127 | 128 | with open(captions_file, 'r') as f: 129 | self.captions = json.load(f) 130 | 131 | with open(caplens_file, 'r') as f: 132 | self.caplens = json.load(f) 133 | 134 | # 设置图像预处理方法 135 | self.transform = transform or transforms.Compose([ 136 | transforms.Resize((256, 256)), 137 | transforms.ToTensor(), 138 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 139 | ]) 140 | 141 | def __getitem__(self, index): 142 | """ 143 | 获取单个数据点。 144 | 参数: 145 | index: 数据点的索引。 146 | 返回: 147 | 一个包含图像和对应文本描述的元组。 148 | """ 149 | # 获取图像路径和文本描述以及caplen 150 | image_id = list(self.image_paths.keys())[index] 151 | image_path = self.image_paths[image_id] 152 | caption = self.captions[image_id] 153 | caplen = self.caplens[image_id] 154 | 155 | # 加载图像并应用预处理 156 | image = Image.open(image_path).convert('RGB') 157 | if self.transform is not None: 158 | image = self.transform(image) 159 | 160 | # 将文本描述转换为张量 161 | caption_tensor = torch.tensor(caption, dtype=torch.long) 162 | 163 | return image, caption_tensor, caplen 164 | 165 | def __len__(self): 166 | """ 167 | 数据集中的数据点总数。 168 | """ 169 | return len(self.image_paths) 170 | 171 | 172 | # 创建数据集实例 173 | # train_dataset = ImageTextDataset( 174 | # image_paths_file=os.path.join(config.output_folder, 'image_paths_train.json'), 175 | # captions_file=os.path.join(config.output_folder, 'encoded_captions_train.json'), 176 | # caplens_file=os.path.join(config.output_folder, 'caplens_train.json') 177 | # ) 178 | 179 | # # 示例:创建验证集实例 180 | # test_dataset = ImageTextDataset( 181 | # image_paths_file=os.path.join(config.output_folder, 'image_paths_test.json'), 182 | # captions_file=os.path.join(config.output_folder, 'encoded_captions_test.json'), 183 | # caplens_file=os.path.join(config.output_folder, 'caplens_test.json') 184 | # ) 185 | 186 | # 创建训练集和测试集的 DataLoader 187 | def create_dataloaders(config): 188 | """ 189 | 创建训练集和测试集的 DataLoader。 190 | 191 | 参数: 192 | batch_size: 每个批次的大小。 193 | num_workers: 加载数据时使用的进程数。 194 | shuffle_train: 是否打乱训练数据。 195 | 196 | 返回: 197 | train_loader: 训练数据的 DataLoader。 198 | test_loader: 测试数据的 DataLoader。 199 | """ 200 | # 图像预处理转换 201 | transform = transforms.Compose([ 202 | 
transforms.Resize((256, 256)), 203 | transforms.RandomCrop(224), 204 | transforms.ToTensor(), 205 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 206 | ]) 207 | 208 | # 加载数据时使用的进程数 209 | num_workers = 0 210 | 211 | # 创建数据集对象 212 | train_dataset = ImageTextDataset( 213 | image_paths_file=os.path.join(config.output_folder, 'image_paths_train.json'), 214 | captions_file=os.path.join(config.output_folder, 'encoded_captions_train.json'), 215 | caplens_file=os.path.join(config.output_folder, 'caplens_train.json'), 216 | transform=transform 217 | ) 218 | 219 | test_dataset = ImageTextDataset( 220 | image_paths_file=os.path.join(config.output_folder, 'image_paths_test.json'), 221 | captions_file=os.path.join(config.output_folder, 'encoded_captions_test.json'), 222 | caplens_file=os.path.join(config.output_folder, 'caplens_test.json'), 223 | transform=transform 224 | ) 225 | 226 | # 创建 DataLoader 对象 227 | train_loader = DataLoader( 228 | dataset=train_dataset, 229 | batch_size=config.batch_size, 230 | shuffle=True, 231 | num_workers=num_workers, 232 | pin_memory=True 233 | ) 234 | 235 | test_loader = DataLoader( 236 | dataset=test_dataset, 237 | batch_size=config.batch_size, 238 | shuffle=False, # 通常测试集不需要打乱 239 | num_workers=num_workers, 240 | pin_memory=True 241 | ) 242 | 243 | return train_loader, test_loader 244 | 245 | 246 | config = Config() 247 | # 使用Config类中定义的配置来创建DataLoader 248 | train_loader, test_loader = create_dataloaders(config=config) 249 | 250 | 251 | # 测试 DataLoader 是否正确创建 252 | if __name__ == '__main__': 253 | for i, (images, captions, caplens) in enumerate(train_loader): 254 | print(f"Batch {i + 1}") 255 | print(f"Images shape: {images.size()}") 256 | print(f"Captions shape: {captions.size()}") 257 | if i == 1: # 仅打印前两个批次的信息 258 | break 259 | -------------------------------------------------------------------------------- /Original_Model/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from pycocoevalcap.cider.cider import Cider 4 | import numpy as np 5 | from configurations import Config 6 | from torchvision.models import resnet101, ResNet101_Weights 7 | from torch.nn.utils.rnn import pack_padded_sequence 8 | import torch.optim as optim 9 | import json 10 | import torchvision 11 | 12 | 13 | # 图像编码器 14 | class ImageEncoder(nn.Module): 15 | def __init__(self, finetuned=True): 16 | super(ImageEncoder, self).__init__() 17 | model = torchvision.models.resnet101(weights=ResNet101_Weights.DEFAULT) 18 | # ResNet-101网格表示提取器 19 | self.grid_rep_extractor = nn.Sequential(*(list(model.children())[:-2])) 20 | for param in self.grid_rep_extractor.parameters(): 21 | param.requires_grad = finetuned 22 | 23 | def forward(self, images): 24 | out = self.grid_rep_extractor(images) 25 | return out 26 | 27 | # # 引入自注意机制后的图像编码器 28 | # class SelfAttention(nn.Module): 29 | # def __init__(self, num_channels, num_heads=8, dropout=0.1): 30 | # super(SelfAttention, self).__init__() 31 | # self.num_heads = num_heads 32 | # self.attention = nn.MultiheadAttention(num_channels, num_heads, dropout) 33 | # 34 | # def forward(self, x): 35 | # # 保存原始形状 36 | # orig_shape = x.shape 37 | # # 打印输入形状 38 | # # print("Input shape:", x.shape) 39 | # # 转换为(sequence_length, batch_size, num_channels)格式 40 | # x = x.flatten(2).permute(2, 0, 1) 41 | # attention_output, _ = self.attention(x, x, x) 42 | # # 还原形状,确保与原始输入形状匹配 43 | # attention_output = attention_output.permute(1, 2, 0)# 打印最终输出形状 44 | # # 
print("Final output shape:", attention_output.shape) 45 | # return attention_output.view(orig_shape) 46 | # 47 | # 48 | # class ImageEncoder(nn.Module): 49 | # def __init__(self, finetuned=True, num_heads=8, dropout=0.1): 50 | # super(ImageEncoder, self).__init__() 51 | # # 使用ResNet101作为基础模型 52 | # model = resnet101(weights=ResNet101_Weights.DEFAULT) 53 | # self.grid_rep_extractor = nn.Sequential(*(list(model.children())[:-2])) 54 | # # 设置参数是否可训练 55 | # for param in self.grid_rep_extractor.parameters(): 56 | # param.requires_grad = finetuned 57 | # 58 | # # 自注意力层 59 | # self.self_attention = SelfAttention(model.fc.in_features, num_heads, dropout) 60 | # 61 | # def forward(self, images): 62 | # features = self.grid_rep_extractor(images) 63 | # features = self.self_attention(features) 64 | # return features 65 | 66 | 67 | # 解码器的注意力机制 68 | class AdditiveAttention(nn.Module): 69 | def __init__(self, query_dim, key_dim, attn_dim): 70 | super(AdditiveAttention, self).__init__() 71 | self.attn_w_1_q = nn.Linear(query_dim, attn_dim) 72 | self.attn_w_1_k = nn.Linear(key_dim, attn_dim) 73 | self.attn_w_2 = nn.Linear(attn_dim, 1) 74 | self.tanh = nn.Tanh() 75 | self.softmax = nn.Softmax(dim=1) 76 | 77 | def forward(self, query, key_value): 78 | queries = self.attn_w_1_q(query).unsqueeze(1) 79 | keys = self.attn_w_1_k(key_value) 80 | attn = self.attn_w_2(self.tanh(queries+keys)).squeeze(2) 81 | attn = self.softmax(attn) 82 | output = torch.bmm(attn.unsqueeze(1), key_value).squeeze(1) 83 | return output, attn 84 | 85 | 86 | # 文本解码器 87 | class AttentionDecoder(nn.Module): 88 | def __init__(self, image_code_dim, vocab_size, word_dim, attention_dim, hidden_size, num_layers, dropout=0.5): 89 | super(AttentionDecoder, self).__init__() 90 | self.embed = nn.Embedding(vocab_size, word_dim) 91 | self.attention = AdditiveAttention(hidden_size, image_code_dim, attention_dim) 92 | self.init_state = nn.Linear(image_code_dim, num_layers * hidden_size) 93 | self.rnn = nn.GRU(word_dim + image_code_dim, hidden_size, num_layers) 94 | self.dropout = nn.Dropout(p=dropout) 95 | self.fc = nn.Linear(hidden_size, vocab_size) 96 | self.init_weights() 97 | 98 | def init_weights(self): 99 | self.embed.weight.data.uniform_(-0.1, 0.1) 100 | self.fc.bias.data.fill_(0) 101 | self.fc.weight.data.uniform_(-0.1, 0.1) 102 | 103 | def init_hidden_state(self, image_code, captions, cap_lens): 104 | batch_size, image_code_dim = image_code.size(0), image_code.size(1) 105 | image_code = image_code.permute(0, 2, 3, 1) 106 | image_code = image_code.view(batch_size, -1, image_code_dim) 107 | sorted_cap_lens, sorted_cap_indices = torch.sort(cap_lens, 0, True) 108 | captions = captions[sorted_cap_indices] 109 | image_code = image_code[sorted_cap_indices] 110 | hidden_state = self.init_state(image_code.mean(axis=1)) 111 | hidden_state = hidden_state.view( 112 | batch_size, 113 | self.rnn.num_layers, 114 | self.rnn.hidden_size).permute(1, 0, 2) 115 | return image_code, captions, sorted_cap_lens, sorted_cap_indices, hidden_state 116 | 117 | def forward_step(self, image_code, curr_cap_embed, hidden_state): 118 | context, alpha = self.attention(hidden_state[-1], image_code) 119 | x = torch.cat((context, curr_cap_embed), dim=-1).unsqueeze(0) 120 | out, hidden_state = self.rnn(x, hidden_state) 121 | preds = self.fc(self.dropout(out.squeeze(0))) 122 | return preds, alpha, hidden_state 123 | 124 | def forward(self, image_code, captions, cap_lens): 125 | image_code, captions, sorted_cap_lens, sorted_cap_indices, hidden_state \ 126 | = 
self.init_hidden_state(image_code, captions, cap_lens) 127 | batch_size = image_code.size(0) 128 | lengths = sorted_cap_lens.cpu().numpy() - 1 129 | max_cap_len = max(cap_lens) 130 | predictions = torch.zeros(batch_size, max_cap_len, self.fc.out_features).to(captions.device) 131 | alphas = torch.zeros(batch_size, max_cap_len, image_code.shape[1]).to(captions.device) 132 | cap_embeds = self.embed(captions) 133 | # Teacher-Forcing模式 134 | for step in range(lengths[0]): 135 | real_batch_size = np.where(lengths > step)[0].shape[0] 136 | preds, alpha, hidden_state = self.forward_step( 137 | image_code[:real_batch_size], 138 | cap_embeds[:real_batch_size, step, :], 139 | hidden_state[:, :real_batch_size, :].contiguous()) 140 | predictions[:real_batch_size, step, :] = preds 141 | alphas[:real_batch_size, step, :] = alpha 142 | max_cap_len = max(cap_lens) 143 | padded_predictions = torch.zeros(batch_size, max_cap_len, self.fc.out_features).to(predictions.device) 144 | for i in range(batch_size): 145 | actual_length = cap_lens[i] 146 | padded_predictions[i, :actual_length, :] = predictions[i, :actual_length, :] 147 | 148 | return padded_predictions, alphas, captions, lengths, sorted_cap_indices 149 | 150 | 151 | class ARCTIC(nn.Module): 152 | def __init__(self, image_code_dim, vocab, word_dim, attention_dim, hidden_size, num_layers): 153 | super(ARCTIC, self).__init__() 154 | self.vocab = vocab 155 | self.encoder = ImageEncoder() 156 | self.decoder = AttentionDecoder(image_code_dim, len(vocab), word_dim, attention_dim, hidden_size, num_layers) 157 | 158 | def forward(self, images, captions, cap_lens): 159 | image_code = self.encoder(images) 160 | output = self.decoder(image_code, captions, cap_lens) 161 | return output 162 | 163 | def generate_by_beamsearch(self, images, beam_k, max_len): 164 | vocab_size = len(self.vocab) 165 | image_codes = self.encoder(images) 166 | texts = [] 167 | device = images.device 168 | for image_code in image_codes: 169 | image_code = image_code.unsqueeze(0).repeat(beam_k, 1, 1, 1) 170 | cur_sents = torch.full((beam_k, 1), self.vocab[''], dtype=torch.long).to(device) 171 | cur_sent_embed = self.decoder.embed(cur_sents)[:, 0, :] 172 | sent_lens = torch.LongTensor([1] * beam_k).to(device) 173 | image_code, cur_sent_embed, _, _, hidden_state = \ 174 | self.decoder.init_hidden_state(image_code, cur_sent_embed, sent_lens) 175 | end_sents = [] 176 | end_probs = [] 177 | probs = torch.zeros(beam_k, 1).to(device) 178 | k = beam_k 179 | while True: 180 | preds, _, hidden_state = self.decoder.forward_step(image_code[:k], cur_sent_embed, 181 | hidden_state.contiguous()) 182 | preds = nn.functional.log_softmax(preds, dim=1) 183 | probs = probs.repeat(1, preds.size(1)) + preds 184 | if cur_sents.size(1) == 1: 185 | values, indices = probs[0].topk(k, 0, True, True) 186 | else: 187 | values, indices = probs.view(-1).topk(k, 0, True, True) 188 | sent_indices = torch.div(indices, vocab_size, rounding_mode='trunc') 189 | word_indices = indices % vocab_size 190 | cur_sents = torch.cat([cur_sents[sent_indices], word_indices.unsqueeze(1)], dim=1) 191 | end_indices = [idx for idx, word in enumerate(word_indices) if word == self.vocab['']] 192 | if len(end_indices) > 0: 193 | end_probs.extend(values[end_indices]) 194 | end_sents.extend(cur_sents[end_indices].tolist()) 195 | k -= len(end_indices) 196 | if k == 0: 197 | break 198 | cur_indices = [idx for idx, word in enumerate(word_indices) 199 | if word != self.vocab['']] 200 | if len(cur_indices) > 0: 201 | cur_sent_indices = 
sent_indices[cur_indices] 202 | cur_word_indices = word_indices[cur_indices] 203 | cur_sents = cur_sents[cur_indices] 204 | probs = values[cur_indices].view(-1, 1) 205 | hidden_state = hidden_state[:, cur_sent_indices, :] 206 | cur_sent_embed = self.decoder.embed( 207 | cur_word_indices.view(-1, 1))[:, 0, :] 208 | if cur_sents.size(1) >= max_len: 209 | break 210 | if len(end_sents) == 0: 211 | gen_sent = cur_sents[0].tolist() 212 | else: 213 | gen_sent = end_sents[end_probs.index(max(end_probs))] 214 | texts.append(gen_sent) 215 | return texts 216 | 217 | 218 | # 损失函数 219 | class PackedCrossEntropyLoss(nn.Module): 220 | def __init__(self): 221 | super(PackedCrossEntropyLoss, self).__init__() 222 | self.loss_fn = nn.CrossEntropyLoss() 223 | 224 | def forward(self, predictions, targets, lengths): 225 | packed_predictions = pack_padded_sequence(predictions, lengths, batch_first=True, enforce_sorted=False)[0] 226 | packed_targets = pack_padded_sequence(targets, lengths, batch_first=True, enforce_sorted=False)[0] 227 | 228 | # 计算损失,忽略填充的部分 229 | loss = self.loss_fn(packed_predictions, packed_targets) 230 | return loss 231 | 232 | 233 | def get_optimizer(model, config): 234 | encoder_params = filter(lambda p: p.requires_grad, model.encoder.parameters()) 235 | decoder_params = filter(lambda p: p.requires_grad, model.decoder.parameters()) 236 | optimizer = optim.Adam([ 237 | {"params": encoder_params, "lr": config.encoder_learning_rate}, 238 | {"params": decoder_params, "lr": config.decoder_learning_rate} 239 | ]) 240 | 241 | return optimizer 242 | 243 | def adjust_learning_rate(optimizer, epoch, config): 244 | for param_group in optimizer.param_groups: 245 | if param_group['name'] == 'encoder': 246 | param_group['lr'] = config.encoder_learning_rate * (0.1 ** (epoch // config.lr_update)) 247 | else: 248 | param_group['lr'] = config.decoder_learning_rate * (0.1 ** (epoch // config.lr_update)) 249 | 250 | 251 | # CIDEr-D 评估 252 | def filter_useless_words(sent, filterd_words): 253 | return [w for w in sent if w not in filterd_words] 254 | 255 | 256 | def evaluate_cider(data_loader, model, config): 257 | model.eval() 258 | # 存储候选文本和参考文本 259 | cands = {} 260 | refs = {} 261 | filterd_words = {model.vocab[''], model.vocab[''], model.vocab['']} 262 | device = next(model.parameters()).device 263 | 264 | # 加载词汇表并创建反向词汇表 265 | with open('../output_副本/vocab.json', 'r') as f: 266 | vocab = json.load(f) 267 | idx_to_word = {idx: word for word, idx in vocab.items()} 268 | 269 | for i, (imgs, caps, caplens) in enumerate(data_loader): 270 | imgs = imgs.to(device) 271 | preds = model.generate_by_beamsearch(imgs, config.beam_k, config.max_len) 272 | for j in range(imgs.size(0)): 273 | img_id = str(i * config.batch_size + j) 274 | cand_words = [idx_to_word.get(word, '') for word in preds[j]] 275 | cand = ' '.join(filter_useless_words(cand_words, filterd_words)) 276 | cands[img_id] = [cand] 277 | ref_words = [idx_to_word.get(word.item(), '') for word in caps[j]] 278 | refs[img_id] = [' '.join(filter_useless_words(ref_words, filterd_words))] # 参考描述 279 | 280 | # 计算CIDEr-D得分 281 | cider_evaluator = Cider() 282 | score, _ = cider_evaluator.compute_score(refs, cands) 283 | 284 | model.train() 285 | return score -------------------------------------------------------------------------------- /Original_Model/predict.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | from torchvision import transforms 4 | from models import AttentionModel 5 | 
from configurations import Config 6 | import json 7 | 8 | def load_model(model_path, vocab, config): 9 | model = AttentionModel( 10 | image_code_dim=config.image_code_dim, 11 | vocab=vocab, # 传递词汇表字典 12 | word_dim=config.word_dim, 13 | attention_dim=config.attention_dim, 14 | hidden_size=config.hidden_size, 15 | num_layers=config.num_layers 16 | ) 17 | model.load_state_dict(torch.load(model_path)) 18 | model = model.to(config.device) 19 | model.eval() # 将模型设置为评估模式 20 | return model 21 | 22 | def process_image(image_path): 23 | transform = transforms.Compose([ 24 | transforms.Resize((256, 256)), 25 | transforms.ToTensor(), 26 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 27 | ]) 28 | image = Image.open(image_path).convert('RGB') 29 | image_tensor = transform(image).unsqueeze(0) # 添加一个批次维度 30 | return image_tensor 31 | 32 | def predict_caption(model, image_tensor, vocab, config): 33 | # 生成束搜索描述 34 | predictions = model.generate_by_beamsearch(image_tensor.to(config.device), config.beam_k, config.max_len) 35 | # 将词索引转换回文字 36 | idx_to_word = {idx: word for word, idx in vocab.items()} 37 | caption_words = [idx_to_word.get(word, '') for word in predictions[0]] 38 | caption = ' '.join(caption_words) 39 | return caption 40 | 41 | # 载入配置和词汇表 42 | config = Config() 43 | with open('../data/output/vocab.json', 'r') as f: 44 | vocab = json.load(f) 45 | 46 | # 加载模型 47 | model_path = '../data/output/weights/.pth' # 使用正确的模型文件路径 48 | model = load_model(model_path, vocab, config) 49 | 50 | # 处理图片并生成描述 51 | image_path = '../data/images/MEN-Denim-id_00000080-01_7_additional.jpg' # 测试图片路径 52 | image_tensor = process_image(image_path) 53 | caption = predict_caption(model, image_tensor, vocab, config) 54 | 55 | print("Generated Caption:", caption) 56 | -------------------------------------------------------------------------------- /Original_Model/train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import torch 3 | import os 4 | from configurations import Config 5 | from models import ARCTIC, get_optimizer, PackedCrossEntropyLoss, evaluate_cider 6 | from datasets import create_dataloaders, ImageTextDataset 7 | 8 | 9 | def main(): 10 | best_test_score = float('-inf') # 初始化最佳测试得分 11 | 12 | # 加载配置 13 | config = Config() 14 | 15 | # 创建数据加载器 16 | train_loader, test_loader = create_dataloaders(config) 17 | 18 | # 加载词汇表文件 19 | with open('../data/output/vocab.json', 'r') as f: 20 | vocab = json.load(f) 21 | 22 | # 模型初始化 23 | model = ARCTIC( 24 | image_code_dim=config.image_code_dim, 25 | vocab=vocab, # 传递词汇表字典 26 | word_dim=config.word_dim, 27 | attention_dim=config.attention_dim, 28 | hidden_size=config.hidden_size, 29 | num_layers=config.num_layers 30 | ).to(config.device) 31 | 32 | # 优化器 33 | optimizer = get_optimizer(model, config) 34 | 35 | # 损失函数 36 | loss_fn = PackedCrossEntropyLoss().to(config.device) 37 | 38 | # 创建保存权重的文件夹路径 39 | weights_dir = os.path.join(config.output_folder, 'weights') 40 | os.makedirs(weights_dir, exist_ok=True) 41 | 42 | best_val_score = float('-inf') # 初始化最佳验证得分 43 | 44 | # 开始训练 45 | for epoch in range(config.num_epochs): 46 | # 训练模型 47 | model.train() 48 | for i, (imgs, caps, caplens) in enumerate(train_loader): 49 | imgs, caps = imgs.to(config.device), caps.to(config.device) 50 | caplens = caplens.cpu().to(torch.int64) 51 | 52 | optimizer.zero_grad() 53 | outputs, alphas, sorted_captions, lengths, _ = model(imgs, caps, caplens) 54 | loss = loss_fn(outputs, sorted_captions[:, 1:], lengths) # targets are the sorted captions returned by the model (minus the start token), aligned step-by-step with the predictions 55 | loss.backward() 56 | optimizer.step() 
57 | 58 | # 打印/记录损失信息 59 | if (i + 1) % 100 == 0: 60 | print(f'Epoch [{epoch + 1}/{config.num_epochs}], Step [{i + 1}/{len(train_loader)}], Loss: {loss.item():.4f}') 61 | 62 | # 在每个epoch结束时使用测试集评估模型 63 | current_test_score = evaluate_cider(test_loader, model, config) 64 | print(f"Epoch {epoch + 1}: CIDEr-D score = {current_test_score}") 65 | 66 | # 如果当前得分比之前的最佳得分要好,则保存模型 67 | if current_test_score > best_test_score: 68 | best_test_score = current_test_score 69 | best_model_path = os.path.join(weights_dir, f'Original_model_epoch_{epoch + 1}.pth') 70 | torch.save(model.state_dict(), best_model_path) 71 | print(f"Saved new best model to {best_model_path}") 72 | 73 | # 训练完成后的最终评估 74 | final_test_score = evaluate_cider(test_loader, model, config) 75 | print(f"Final CIDEr-D score = {final_test_score}") 76 | 77 | 78 | 79 | 80 | if __name__ == '__main__': 81 | main() 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Image-Caption: 基于编解码框架的图像描述 2 | 3 | > 2023 秋季北京邮电大学深度学习与神经网络课程设计(注:笔者的小组该门课程期末满分) 4 | 5 | ## 一、项目目录结构介绍 6 | 7 | ``` 8 | Image-Caption/ 9 | |-- data_new/ # 新版数据 10 | | |-- output/ # 模型1使用新版数据生成的输出结果 11 | | |-- test_images/ # 新版数据的测试集 12 | | |-- train_images_1/ # 新版数据的训练集(第一部分) 13 | | |-- train_images_2/ # 新版数据的训练集(第二部分) 14 | | |-- rename_script.py # 文件重命名脚本 15 | | |-- BLIP_test_captions.json # 多模态模型生成的测试集的图像描述文件 16 | | |-- BLIP_train_captions.json # 多模态模型生成的训练集的图像描述文件 17 | | |-- Model2_test_captions.json # 模型2生成的测试集的图像描述文件 18 | | |-- Model2_train_captions_1.json# 模型2生成的训练集的图像描述文件(第一部分) 19 | |-- data_old/ # 旧版数据 20 | | |-- output/ # 模型1使用旧版数据生成的输出结果 21 | | |-- test_images/ # 旧版数据的测试集 22 | | |-- train_images/ # 旧版数据的训练集 23 | | |-- label.json # 加入关键点后的全量json数据 24 | | |-- test_captions.json # 原始给定的测试集的图像描述文件 25 | | |-- train_captions.json # 原始给定的训练集的图像描述文件 26 | |-- doc/ # 项目的需求文档及项目报告 27 | |-- Ex1_BLIP # 附加任务1:多模态模型 28 | | |-- Salesforce/ # 模型文件 29 | | |-- run_fulldata_script.py # 全量数据运行脚本 30 | | |-- run_script.py # 单个数据运行脚本 31 | |-- Ex2_RL_Loss # 附加任务2:基于强化学习的损失函数 32 | |-- Model1_YellowOrange # 模型1:Self-Attention + Attention模型 33 | |-- Model2_Transformer # 模型2:Transformer Encoder + Decoder模型 34 | |-- Original_Model # 模型0:初始模型的图像描述模型 35 | |-- .gitignore 36 | |-- LICENSE 37 | |-- README.md # 项目的简介 38 | ``` 39 | 40 | 41 | ## 二、小组分工与时间安排 42 | 43 | | 巩羽飞 | 黄成梓 | 44 | | :------------------------------: | :-----------------------------------: | 45 | | 模型:网格表示Transformer E+D | 模型:初始、网格表示自注意力 + 注意力 | 46 | | 指标:METEOR + ROUGE-L | 指标:CIDEr-D | 47 | | 其他:多模态、强化学习的损失函数 | 其他:优化评测指标 | 48 | 49 | | 11.25 | 11.30 | 12.12 | 12.28 | 50 | | :-------: | :------------------: | :------------------: | :-------------------------------: | 51 | | 开题报告✅ | 模型跑通 + 评测指标✅ | 中期报告 + 优化指标✅ | 结题报告 + 多模态 + 强化学习Loss✅ | 52 | -------------------------------------------------------------------------------- /data_new/output/caplens_test.json: -------------------------------------------------------------------------------- 1 | {"test_1.jpg": 23, "test_10.jpg": 9, "test_100.jpg": 9, "test_101.jpg": 9, "test_102.jpg": 9, "test_103.jpg": 22, "test_104.jpg": 9, "test_105.jpg": 9, "test_106.jpg": 9, "test_107.jpg": 24, "test_108.jpg": 11, "test_109.jpg": 9, "test_11.jpg": 9, "test_110.jpg": 9, "test_111.jpg": 9, "test_112.jpg": 9, "test_113.jpg": 9, "test_114.jpg": 9, "test_115.jpg": 9, "test_116.jpg": 9, "test_117.jpg": 9, "test_118.jpg": 9, "test_119.jpg": 9, "test_12.jpg": 19, "test_120.jpg": 9, 
"test_121.jpg": 9, "test_122.jpg": 9, "test_123.jpg": 9, "test_124.jpg": 17, "test_125.jpg": 9, "test_126.jpg": 17, "test_127.jpg": 9, "test_128.jpg": 19, "test_129.jpg": 9, "test_13.jpg": 14, "test_130.jpg": 11, "test_131.jpg": 9, "test_132.jpg": 9, "test_133.jpg": 9, "test_134.jpg": 9, "test_135.jpg": 9, "test_136.jpg": 19, "test_137.jpg": 9, "test_138.jpg": 13, "test_139.jpg": 9, "test_14.jpg": 11, "test_140.jpg": 16, "test_141.jpg": 9, "test_142.jpg": 9, "test_143.jpg": 16, "test_144.jpg": 9, "test_145.jpg": 18, "test_146.jpg": 18, "test_147.jpg": 9, "test_148.jpg": 15, "test_149.jpg": 9, "test_15.jpg": 9, "test_150.jpg": 9, "test_151.jpg": 9, "test_152.jpg": 9, "test_153.jpg": 9, "test_154.jpg": 9, "test_155.jpg": 9, "test_156.jpg": 9, "test_157.jpg": 9, "test_158.jpg": 9, "test_159.jpg": 9, "test_16.jpg": 19, "test_160.jpg": 9, "test_161.jpg": 9, "test_162.jpg": 9, "test_163.jpg": 9, "test_164.jpg": 9, "test_165.jpg": 18, "test_166.jpg": 9, "test_167.jpg": 9, "test_168.jpg": 9, "test_169.jpg": 9, "test_17.jpg": 9, "test_170.jpg": 9, "test_171.jpg": 9, "test_172.jpg": 9, "test_173.jpg": 9, "test_174.jpg": 9, "test_175.jpg": 9, "test_176.jpg": 9, "test_177.jpg": 9, "test_178.jpg": 9, "test_179.jpg": 9, "test_18.jpg": 19, "test_180.jpg": 9, "test_181.jpg": 9, "test_182.jpg": 9, "test_183.jpg": 9, "test_184.jpg": 9, "test_185.jpg": 9, "test_186.jpg": 9, "test_187.jpg": 9, "test_188.jpg": 9, "test_189.jpg": 9, "test_19.jpg": 9, "test_190.jpg": 9, "test_191.jpg": 9, "test_192.jpg": 9, "test_193.jpg": 17, "test_194.jpg": 21, "test_195.jpg": 20, "test_196.jpg": 12, "test_197.jpg": 23, "test_198.jpg": 16, "test_199.jpg": 15, "test_2.jpg": 22, "test_20.jpg": 9, "test_200.jpg": 23, "test_201.jpg": 12, "test_202.jpg": 20, "test_203.jpg": 20, "test_204.jpg": 20, "test_205.jpg": 20, "test_206.jpg": 18, "test_207.jpg": 20, "test_208.jpg": 19, "test_209.jpg": 20, "test_21.jpg": 9, "test_210.jpg": 20, "test_211.jpg": 9, "test_212.jpg": 9, "test_213.jpg": 20, "test_214.jpg": 9, "test_215.jpg": 17, "test_216.jpg": 9, "test_217.jpg": 18, "test_218.jpg": 9, "test_219.jpg": 17, "test_22.jpg": 9, "test_220.jpg": 9, "test_221.jpg": 9, "test_222.jpg": 9, "test_223.jpg": 9, "test_224.jpg": 9, "test_225.jpg": 17, "test_226.jpg": 9, "test_227.jpg": 9, "test_228.jpg": 20, "test_229.jpg": 9, "test_23.jpg": 9, "test_230.jpg": 9, "test_231.jpg": 9, "test_232.jpg": 9, "test_233.jpg": 9, "test_234.jpg": 9, "test_235.jpg": 9, "test_236.jpg": 9, "test_237.jpg": 9, "test_238.jpg": 9, "test_239.jpg": 9, "test_24.jpg": 18, "test_240.jpg": 9, "test_241.jpg": 9, "test_242.jpg": 9, "test_243.jpg": 9, "test_244.jpg": 9, "test_245.jpg": 13, "test_246.jpg": 9, "test_247.jpg": 9, "test_248.jpg": 9, "test_249.jpg": 9, "test_25.jpg": 23, "test_250.jpg": 9, "test_251.jpg": 9, "test_252.jpg": 9, "test_253.jpg": 9, "test_254.jpg": 9, "test_255.jpg": 19, "test_256.jpg": 9, "test_257.jpg": 19, "test_258.jpg": 9, "test_259.jpg": 11, "test_26.jpg": 20, "test_260.jpg": 9, "test_261.jpg": 9, "test_262.jpg": 10, "test_263.jpg": 9, "test_264.jpg": 9, "test_265.jpg": 9, "test_266.jpg": 9, "test_267.jpg": 9, "test_268.jpg": 16, "test_269.jpg": 9, "test_27.jpg": 22, "test_270.jpg": 9, "test_271.jpg": 9, "test_272.jpg": 16, "test_273.jpg": 9, "test_274.jpg": 9, "test_275.jpg": 9, "test_276.jpg": 9, "test_277.jpg": 13, "test_278.jpg": 17, "test_279.jpg": 24, "test_28.jpg": 23, "test_280.jpg": 20, "test_281.jpg": 14, "test_282.jpg": 9, "test_283.jpg": 9, "test_284.jpg": 9, "test_285.jpg": 9, "test_286.jpg": 9, "test_287.jpg": 9, "test_288.jpg": 9, 
"test_289.jpg": 9, "test_29.jpg": 9, "test_290.jpg": 9, "test_291.jpg": 9, "test_292.jpg": 9, "test_293.jpg": 9, "test_294.jpg": 9, "test_295.jpg": 9, "test_296.jpg": 9, "test_297.jpg": 9, "test_298.jpg": 9, "test_299.jpg": 9, "test_3.jpg": 10, "test_30.jpg": 24, "test_300.jpg": 9, "test_301.jpg": 9, "test_302.jpg": 17, "test_303.jpg": 11, "test_304.jpg": 17, "test_305.jpg": 11, "test_306.jpg": 9, "test_307.jpg": 9, "test_308.jpg": 9, "test_309.jpg": 9, "test_31.jpg": 19, "test_310.jpg": 9, "test_311.jpg": 9, "test_312.jpg": 9, "test_313.jpg": 9, "test_314.jpg": 9, "test_315.jpg": 9, "test_316.jpg": 9, "test_317.jpg": 9, "test_318.jpg": 9, "test_319.jpg": 9, "test_32.jpg": 11, "test_320.jpg": 9, "test_321.jpg": 9, "test_322.jpg": 9, "test_323.jpg": 9, "test_324.jpg": 9, "test_325.jpg": 9, "test_326.jpg": 9, "test_327.jpg": 9, "test_328.jpg": 9, "test_329.jpg": 9, "test_33.jpg": 9, "test_330.jpg": 9, "test_331.jpg": 9, "test_332.jpg": 9, "test_333.jpg": 9, "test_334.jpg": 9, "test_335.jpg": 9, "test_336.jpg": 9, "test_337.jpg": 9, "test_338.jpg": 9, "test_339.jpg": 18, "test_34.jpg": 9, "test_340.jpg": 9, "test_341.jpg": 19, "test_342.jpg": 11, "test_343.jpg": 18, "test_344.jpg": 9, "test_345.jpg": 14, "test_346.jpg": 9, "test_347.jpg": 9, "test_348.jpg": 9, "test_349.jpg": 9, "test_35.jpg": 17, "test_350.jpg": 16, "test_351.jpg": 9, "test_352.jpg": 9, "test_353.jpg": 9, "test_354.jpg": 9, "test_355.jpg": 21, "test_356.jpg": 9, "test_357.jpg": 9, "test_358.jpg": 9, "test_359.jpg": 16, "test_36.jpg": 9, "test_360.jpg": 9, "test_361.jpg": 9, "test_362.jpg": 17, "test_363.jpg": 9, "test_364.jpg": 9, "test_365.jpg": 9, "test_366.jpg": 9, "test_367.jpg": 9, "test_368.jpg": 9, "test_369.jpg": 9, "test_37.jpg": 9, "test_370.jpg": 9, "test_371.jpg": 9, "test_372.jpg": 9, "test_373.jpg": 11, "test_374.jpg": 9, "test_375.jpg": 9, "test_376.jpg": 9, "test_377.jpg": 9, "test_378.jpg": 9, "test_379.jpg": 9, "test_38.jpg": 9, "test_380.jpg": 17, "test_381.jpg": 9, "test_382.jpg": 9, "test_383.jpg": 9, "test_384.jpg": 9, "test_385.jpg": 9, "test_386.jpg": 9, "test_387.jpg": 9, "test_388.jpg": 9, "test_389.jpg": 11, "test_39.jpg": 9, "test_390.jpg": 9, "test_391.jpg": 9, "test_392.jpg": 20, "test_393.jpg": 9, "test_394.jpg": 9, "test_395.jpg": 11, "test_396.jpg": 18, "test_397.jpg": 14, "test_398.jpg": 9, "test_399.jpg": 14, "test_4.jpg": 23, "test_40.jpg": 9, "test_400.jpg": 9, "test_401.jpg": 9, "test_402.jpg": 9, "test_403.jpg": 14, "test_404.jpg": 14, "test_405.jpg": 9, "test_406.jpg": 9, "test_407.jpg": 9, "test_408.jpg": 9, "test_409.jpg": 9, "test_41.jpg": 9, "test_410.jpg": 9, "test_411.jpg": 9, "test_412.jpg": 9, "test_413.jpg": 19, "test_414.jpg": 18, "test_415.jpg": 18, "test_416.jpg": 19, "test_417.jpg": 18, "test_418.jpg": 18, "test_419.jpg": 16, "test_42.jpg": 9, "test_420.jpg": 19, "test_421.jpg": 16, "test_422.jpg": 9, "test_423.jpg": 9, "test_424.jpg": 9, "test_425.jpg": 9, "test_426.jpg": 9, "test_427.jpg": 9, "test_428.jpg": 9, "test_429.jpg": 9, "test_43.jpg": 9, "test_430.jpg": 19, "test_431.jpg": 9, "test_432.jpg": 9, "test_433.jpg": 9, "test_434.jpg": 17, "test_435.jpg": 9, "test_436.jpg": 9, "test_437.jpg": 9, "test_438.jpg": 9, "test_439.jpg": 9, "test_44.jpg": 9, "test_440.jpg": 9, "test_441.jpg": 9, "test_442.jpg": 9, "test_443.jpg": 9, "test_444.jpg": 9, "test_445.jpg": 9, "test_446.jpg": 9, "test_447.jpg": 9, "test_448.jpg": 9, "test_449.jpg": 9, "test_45.jpg": 9, "test_450.jpg": 9, "test_451.jpg": 9, "test_452.jpg": 9, "test_453.jpg": 9, "test_454.jpg": 9, "test_455.jpg": 9, 
"test_456.jpg": 9, "test_457.jpg": 9, "test_458.jpg": 9, "test_459.jpg": 16, "test_46.jpg": 9, "test_460.jpg": 9, "test_461.jpg": 9, "test_462.jpg": 14, "test_463.jpg": 9, "test_464.jpg": 9, "test_465.jpg": 9, "test_466.jpg": 9, "test_467.jpg": 17, "test_468.jpg": 9, "test_469.jpg": 9, "test_47.jpg": 9, "test_470.jpg": 9, "test_471.jpg": 9, "test_472.jpg": 9, "test_473.jpg": 11, "test_474.jpg": 9, "test_475.jpg": 9, "test_476.jpg": 9, "test_477.jpg": 9, "test_478.jpg": 9, "test_479.jpg": 9, "test_48.jpg": 18, "test_480.jpg": 9, "test_481.jpg": 9, "test_482.jpg": 9, "test_483.jpg": 9, "test_484.jpg": 9, "test_485.jpg": 9, "test_486.jpg": 9, "test_487.jpg": 9, "test_488.jpg": 9, "test_489.jpg": 9, "test_49.jpg": 11, "test_490.jpg": 9, "test_491.jpg": 9, "test_492.jpg": 17, "test_493.jpg": 9, "test_494.jpg": 9, "test_495.jpg": 9, "test_496.jpg": 9, "test_497.jpg": 9, "test_498.jpg": 11, "test_499.jpg": 22, "test_5.jpg": 18, "test_50.jpg": 16, "test_500.jpg": 9, "test_501.jpg": 9, "test_502.jpg": 9, "test_503.jpg": 9, "test_504.jpg": 9, "test_505.jpg": 9, "test_506.jpg": 9, "test_507.jpg": 9, "test_508.jpg": 18, "test_509.jpg": 9, "test_51.jpg": 19, "test_510.jpg": 9, "test_511.jpg": 11, "test_512.jpg": 9, "test_513.jpg": 9, "test_514.jpg": 9, "test_515.jpg": 9, "test_516.jpg": 9, "test_517.jpg": 18, "test_518.jpg": 9, "test_519.jpg": 9, "test_52.jpg": 16, "test_520.jpg": 9, "test_521.jpg": 9, "test_522.jpg": 9, "test_523.jpg": 9, "test_524.jpg": 9, "test_525.jpg": 9, "test_526.jpg": 9, "test_527.jpg": 9, "test_528.jpg": 9, "test_529.jpg": 9, "test_53.jpg": 9, "test_530.jpg": 9, "test_531.jpg": 9, "test_532.jpg": 9, "test_533.jpg": 9, "test_534.jpg": 9, "test_535.jpg": 9, "test_536.jpg": 9, "test_537.jpg": 9, "test_538.jpg": 9, "test_539.jpg": 9, "test_54.jpg": 9, "test_540.jpg": 17, "test_541.jpg": 9, "test_542.jpg": 9, "test_543.jpg": 9, "test_544.jpg": 9, "test_545.jpg": 9, "test_546.jpg": 11, "test_547.jpg": 9, "test_548.jpg": 9, "test_549.jpg": 9, "test_55.jpg": 11, "test_550.jpg": 9, "test_551.jpg": 9, "test_552.jpg": 9, "test_553.jpg": 9, "test_554.jpg": 9, "test_555.jpg": 9, "test_556.jpg": 9, "test_557.jpg": 9, "test_558.jpg": 20, "test_559.jpg": 9, "test_56.jpg": 9, "test_560.jpg": 19, "test_561.jpg": 9, "test_562.jpg": 18, "test_563.jpg": 21, "test_564.jpg": 9, "test_565.jpg": 9, "test_566.jpg": 9, "test_567.jpg": 9, "test_568.jpg": 9, "test_569.jpg": 9, "test_57.jpg": 11, "test_570.jpg": 9, "test_571.jpg": 9, "test_572.jpg": 9, "test_573.jpg": 9, "test_574.jpg": 9, "test_575.jpg": 9, "test_576.jpg": 9, "test_577.jpg": 9, "test_578.jpg": 9, "test_579.jpg": 9, "test_58.jpg": 9, "test_580.jpg": 9, "test_581.jpg": 16, "test_582.jpg": 9, "test_583.jpg": 9, "test_584.jpg": 9, "test_585.jpg": 9, "test_586.jpg": 9, "test_587.jpg": 9, "test_588.jpg": 9, "test_589.jpg": 9, "test_59.jpg": 9, "test_590.jpg": 9, "test_591.jpg": 9, "test_592.jpg": 17, "test_593.jpg": 9, "test_594.jpg": 9, "test_595.jpg": 9, "test_596.jpg": 16, "test_597.jpg": 20, "test_598.jpg": 17, "test_599.jpg": 9, "test_6.jpg": 24, "test_60.jpg": 9, "test_600.jpg": 9, "test_601.jpg": 9, "test_602.jpg": 9, "test_603.jpg": 9, "test_604.jpg": 9, "test_605.jpg": 17, "test_606.jpg": 9, "test_607.jpg": 9, "test_608.jpg": 9, "test_609.jpg": 9, "test_61.jpg": 9, "test_610.jpg": 9, "test_611.jpg": 9, "test_612.jpg": 15, "test_613.jpg": 9, "test_614.jpg": 11, "test_615.jpg": 9, "test_616.jpg": 9, "test_617.jpg": 9, "test_618.jpg": 11, "test_619.jpg": 9, "test_62.jpg": 10, "test_620.jpg": 16, "test_621.jpg": 9, "test_622.jpg": 9, 
"test_623.jpg": 9, "test_624.jpg": 9, "test_625.jpg": 9, "test_626.jpg": 9, "test_627.jpg": 14, "test_628.jpg": 9, "test_629.jpg": 9, "test_63.jpg": 9, "test_630.jpg": 9, "test_631.jpg": 9, "test_632.jpg": 9, "test_633.jpg": 12, "test_634.jpg": 9, "test_635.jpg": 9, "test_636.jpg": 9, "test_637.jpg": 9, "test_638.jpg": 9, "test_639.jpg": 9, "test_64.jpg": 13, "test_640.jpg": 9, "test_641.jpg": 9, "test_642.jpg": 16, "test_643.jpg": 19, "test_644.jpg": 19, "test_645.jpg": 18, "test_646.jpg": 19, "test_647.jpg": 9, "test_648.jpg": 9, "test_649.jpg": 9, "test_65.jpg": 11, "test_650.jpg": 16, "test_651.jpg": 16, "test_652.jpg": 11, "test_653.jpg": 13, "test_654.jpg": 16, "test_655.jpg": 16, "test_656.jpg": 19, "test_657.jpg": 9, "test_658.jpg": 19, "test_659.jpg": 9, "test_66.jpg": 11, "test_660.jpg": 9, "test_661.jpg": 9, "test_662.jpg": 9, "test_663.jpg": 19, "test_664.jpg": 9, "test_665.jpg": 9, "test_666.jpg": 9, "test_667.jpg": 9, "test_668.jpg": 9, "test_669.jpg": 9, "test_67.jpg": 9, "test_670.jpg": 9, "test_671.jpg": 9, "test_672.jpg": 9, "test_673.jpg": 9, "test_674.jpg": 9, "test_675.jpg": 9, "test_676.jpg": 9, "test_677.jpg": 9, "test_678.jpg": 9, "test_679.jpg": 9, "test_68.jpg": 9, "test_680.jpg": 18, "test_681.jpg": 9, "test_682.jpg": 9, "test_683.jpg": 9, "test_684.jpg": 9, "test_685.jpg": 9, "test_686.jpg": 9, "test_687.jpg": 9, "test_688.jpg": 9, "test_689.jpg": 9, "test_69.jpg": 9, "test_690.jpg": 9, "test_691.jpg": 9, "test_692.jpg": 9, "test_693.jpg": 9, "test_694.jpg": 9, "test_695.jpg": 9, "test_696.jpg": 9, "test_697.jpg": 9, "test_698.jpg": 9, "test_699.jpg": 9, "test_7.jpg": 23, "test_70.jpg": 9, "test_700.jpg": 9, "test_701.jpg": 16, "test_702.jpg": 9, "test_703.jpg": 9, "test_704.jpg": 9, "test_705.jpg": 9, "test_706.jpg": 15, "test_707.jpg": 19, "test_708.jpg": 11, "test_709.jpg": 9, "test_71.jpg": 9, "test_710.jpg": 9, "test_711.jpg": 19, "test_712.jpg": 15, "test_713.jpg": 9, "test_714.jpg": 9, "test_715.jpg": 9, "test_716.jpg": 9, "test_717.jpg": 9, "test_718.jpg": 9, "test_719.jpg": 11, "test_72.jpg": 9, "test_720.jpg": 9, "test_721.jpg": 9, "test_722.jpg": 9, "test_723.jpg": 9, "test_724.jpg": 9, "test_725.jpg": 9, "test_726.jpg": 9, "test_727.jpg": 9, "test_728.jpg": 9, "test_729.jpg": 9, "test_73.jpg": 9, "test_730.jpg": 9, "test_731.jpg": 9, "test_732.jpg": 9, "test_733.jpg": 9, "test_734.jpg": 15, "test_735.jpg": 14, "test_736.jpg": 9, "test_737.jpg": 20, "test_738.jpg": 14, "test_739.jpg": 9, "test_74.jpg": 9, "test_740.jpg": 15, "test_741.jpg": 17, "test_742.jpg": 14, "test_743.jpg": 9, "test_744.jpg": 9, "test_745.jpg": 9, "test_746.jpg": 15, "test_747.jpg": 11, "test_748.jpg": 9, "test_749.jpg": 9, "test_75.jpg": 9, "test_750.jpg": 9, "test_751.jpg": 9, "test_752.jpg": 9, "test_753.jpg": 9, "test_754.jpg": 9, "test_755.jpg": 9, "test_756.jpg": 9, "test_757.jpg": 9, "test_758.jpg": 9, "test_759.jpg": 9, "test_76.jpg": 9, "test_760.jpg": 16, "test_761.jpg": 11, "test_762.jpg": 14, "test_763.jpg": 18, "test_764.jpg": 14, "test_765.jpg": 17, "test_766.jpg": 9, "test_767.jpg": 9, "test_768.jpg": 9, "test_769.jpg": 9, "test_77.jpg": 9, "test_770.jpg": 9, "test_771.jpg": 9, "test_772.jpg": 9, "test_773.jpg": 9, "test_774.jpg": 9, "test_775.jpg": 9, "test_776.jpg": 9, "test_777.jpg": 9, "test_778.jpg": 9, "test_779.jpg": 9, "test_78.jpg": 9, "test_780.jpg": 9, "test_781.jpg": 9, "test_782.jpg": 9, "test_783.jpg": 9, "test_784.jpg": 23, "test_785.jpg": 9, "test_786.jpg": 11, "test_787.jpg": 9, "test_788.jpg": 13, "test_789.jpg": 9, "test_79.jpg": 9, 
"test_790.jpg": 9, "test_791.jpg": 9, "test_792.jpg": 20, "test_793.jpg": 9, "test_794.jpg": 18, "test_795.jpg": 18, "test_796.jpg": 17, "test_797.jpg": 9, "test_798.jpg": 9, "test_799.jpg": 9, "test_8.jpg": 23, "test_80.jpg": 16, "test_800.jpg": 9, "test_801.jpg": 9, "test_802.jpg": 11, "test_803.jpg": 9, "test_804.jpg": 9, "test_805.jpg": 9, "test_806.jpg": 9, "test_807.jpg": 9, "test_808.jpg": 9, "test_809.jpg": 9, "test_81.jpg": 15, "test_810.jpg": 9, "test_811.jpg": 9, "test_812.jpg": 9, "test_813.jpg": 9, "test_814.jpg": 9, "test_815.jpg": 9, "test_816.jpg": 9, "test_817.jpg": 9, "test_818.jpg": 9, "test_819.jpg": 9, "test_82.jpg": 14, "test_820.jpg": 9, "test_821.jpg": 9, "test_822.jpg": 9, "test_823.jpg": 9, "test_824.jpg": 9, "test_825.jpg": 9, "test_826.jpg": 9, "test_827.jpg": 9, "test_828.jpg": 9, "test_829.jpg": 9, "test_83.jpg": 17, "test_830.jpg": 9, "test_831.jpg": 9, "test_832.jpg": 9, "test_833.jpg": 9, "test_834.jpg": 9, "test_835.jpg": 9, "test_836.jpg": 16, "test_837.jpg": 11, "test_838.jpg": 15, "test_839.jpg": 9, "test_84.jpg": 19, "test_840.jpg": 9, "test_841.jpg": 11, "test_842.jpg": 9, "test_843.jpg": 9, "test_844.jpg": 9, "test_845.jpg": 13, "test_846.jpg": 9, "test_847.jpg": 9, "test_848.jpg": 9, "test_849.jpg": 9, "test_85.jpg": 17, "test_850.jpg": 9, "test_851.jpg": 9, "test_852.jpg": 9, "test_853.jpg": 14, "test_854.jpg": 9, "test_855.jpg": 9, "test_856.jpg": 9, "test_857.jpg": 9, "test_858.jpg": 9, "test_859.jpg": 9, "test_86.jpg": 17, "test_860.jpg": 9, "test_861.jpg": 13, "test_862.jpg": 9, "test_863.jpg": 9, "test_864.jpg": 13, "test_865.jpg": 9, "test_866.jpg": 9, "test_867.jpg": 9, "test_868.jpg": 9, "test_869.jpg": 9, "test_87.jpg": 19, "test_870.jpg": 9, "test_871.jpg": 9, "test_872.jpg": 9, "test_873.jpg": 9, "test_874.jpg": 9, "test_875.jpg": 9, "test_876.jpg": 9, "test_877.jpg": 9, "test_878.jpg": 9, "test_879.jpg": 9, "test_88.jpg": 17, "test_880.jpg": 9, "test_881.jpg": 9, "test_882.jpg": 9, "test_883.jpg": 9, "test_884.jpg": 9, "test_885.jpg": 9, "test_886.jpg": 9, "test_887.jpg": 9, "test_888.jpg": 9, "test_889.jpg": 17, "test_89.jpg": 9, "test_890.jpg": 12, "test_891.jpg": 15, "test_892.jpg": 17, "test_893.jpg": 9, "test_894.jpg": 19, "test_895.jpg": 15, "test_896.jpg": 11, "test_897.jpg": 9, "test_898.jpg": 9, "test_899.jpg": 16, "test_9.jpg": 24, "test_90.jpg": 9, "test_900.jpg": 16, "test_901.jpg": 9, "test_902.jpg": 9, "test_903.jpg": 9, "test_904.jpg": 17, "test_905.jpg": 12, "test_906.jpg": 17, "test_907.jpg": 18, "test_908.jpg": 9, "test_909.jpg": 9, "test_91.jpg": 9, "test_910.jpg": 9, "test_911.jpg": 9, "test_912.jpg": 9, "test_913.jpg": 9, "test_92.jpg": 9, "test_93.jpg": 9, "test_94.jpg": 9, "test_95.jpg": 9, "test_96.jpg": 9, "test_97.jpg": 9, "test_98.jpg": 9, "test_99.jpg": 9} -------------------------------------------------------------------------------- /data_new/output/vocab.json: -------------------------------------------------------------------------------- 1 | {"bird": 4, "muscular": 5, "bear": 6, "crown": 7, "pj": 8, "brief": 9, "rose": 10, "sweat": 11, "banana": 12, "blur": 13, "his": 14, "content": 15, "wood": 16, "sweater": 17, "woman's": 18, "not": 19, "pants,": 20, "muscle": 21, "p": 22, "silver": 23, "woman": 24, "close": 25, "cat": 26, "logo": 27, "red": 28, "pair": 29, "star": 30, "concrete": 31, "white": 32, "brown": 33, "la": 34, "character": 35, "pjs": 36, "sleeve": 37, "leather": 38, "leaf": 39, "tree": 40, "cartoon": 41, "no": 42, "camouflage": 43, "vest": 44, "sky": 45, "swimsuit": 46, "squares": 47, 
"anchor": 48, "los": 49, "photo": 50, "a": 51, "words": 52, "outline": 53, "sunflower": 54, "camo": 55, "button": 56, "universe": 57, "blurred": 58, "blurry": 59, "sitting": 60, "pajamas": 61, "tropical": 62, "crop": 63, "lettering": 64, "deer": 65, "pink": 66, "blured": 67, "cup": 68, "sneakers,": 69, "over": 70, "geometric": 71, "embroidered": 72, "checker": 73, "linen": 74, "street": 75, "across": 76, "staircase": 77, "pants": 78, "brick": 79, "paint": 80, "stone": 81, "bright": 82, "overalls": 83, "mesh": 84, "text": 85, "sunset": 86, "front": 87, "in": 88, "chain": 89, "palm": 90, "tattoo": 91, "waistband": 92, "i": 93, "left": 94, "of": 95, "male": 96, "underneath": 97, "grid": 98, "couch": 99, "pattern,": 100, "swim": 101, "leopard": 102, "pool,": 103, "fade": 104, "chair": 105, "nasa": 106, "and": 107, "tattoos": 108, "cactus": 109, "-": 110, "pocket": 111, "boxer": 112, "short": 113, "contrast": 114, "butterfly": 115, "textured": 116, "paste": 117, "dye": 118, "top": 119, "striped": 120, "tan": 121, "maroon": 122, "door": 123, "trunk": 124, "word": 125, "it": 126, "pattern": 127, "shorts,": 128, "chest": 129, "ombre": 130, "cargo": 131, "triangle": 132, "mountain,": 133, "beach": 134, "car": 135, "t": 136, "floral": 137, "arms": 138, "is": 139, "orange": 140, "`": 141, "trunks": 142, "patchwork": 143, "the": 144, "torso": 145, "are": 146, "sweatshirt": 147, "flamingo": 148, "socks": 149, "to": 150, "knit": 151, "design": 152, "collar": 153, "image": 154, "around": 155, "om": 156, "york": 157, "polo": 158, "mint": 159, "yellow": 160, "beard": 161, "dinosaur": 162, "graphic": 163, "basketball": 164, "metallic": 165, "wall": 166, "down": 167, "hoe": 168, "wave": 169, "plaid": 170, "with": 171, "zipper": 172, "back": 173, "says": 174, "material": 175, "purple": 176, "sharks": 177, "flower": 178, "pool": 179, "skull": 180, "wearing": 181, "black": 182, "lines": 183, "head": 184, "dumb": 185, "background,": 186, "denim": 187, "wall,": 188, "flowers": 189, "man's": 190, "green": 191, "ripped": 192, "navy": 193, "cap": 194, "book": 195, "panda": 196, "suit": 197, "sleeved": 198, "shirt,": 199, "letters": 200, "'": 201, "written": 202, "color": 203, "fake": 204, "tie": 205, "bull": 206, "beach,": 207, "hat": 208, "face": 209, "model": 210, "burgundy": 211, "sweatpants": 212, "on": 213, "printed": 214, "blury": 215, "neckline": 216, "gold": 217, "light": 218, "ant": 219, "up": 220, "paisley": 221, "wooden": 222, "stripe": 223, "joggers": 224, "standing": 225, "checkered": 226, "v": 227, "shark": 228, "plant": 229, "shorts": 230, "lightning": 231, "blue": 232, "elephant": 233, "all": 234, "mountain": 235, "sunglasses": 236, "shirt": 237, "an": 238, "dog": 239, "beige": 240, "metal": 241, "sleeveless": 242, "trim": 243, "fireplace": 244, "sleeves": 245, "that": 246, "knitted": 247, "hawaiian": 248, "marble": 249, "trousers": 250, "man": 251, "body": 252, "holding": 253, "stripes": 254, "sneakers": 255, "neck": 256, "jacket": 257, "neon": 258, "block": 259, "holes": 260, "california": 261, "gym": 262, "fence": 263, "ball": 264, "patterned": 265, "scene": 266, "cigarette": 267, "picture": 268, "jeans": 269, "grey": 270, "walking": 271, "hoodie": 272, "gradient": 273, "legs": 274, "side": 275, "colorful": 276, "print": 277, "new": 278, "bottom": 279, "background": 280, "read": 281, "wetsuit": 282, "gray": 283, "birds": 284, "khaki": 285, "trees": 286, "tank": 287, "arm": 288, "line": 289, "scene,": 290, "eye": 291, "dragon": 292, "": 0, "": 1, "": 2, "": 3} 
-------------------------------------------------------------------------------- /data_new/rename_script.py: --------------------------------------------------------------------------------
1 | import os
2 | 
3 | def rename_images(folder_path):
4 |     # Check whether the folder exists
5 |     if not os.path.exists(folder_path):
6 |         print(f"Folder '{folder_path}' does not exist.")
7 |         return
8 | 
9 |     # Get all files in the folder
10 |     files = os.listdir(folder_path)
11 | 
12 |     # Process each file in turn
13 |     for index, file_name in enumerate(files):
14 |         # Get the full path of the file
15 |         old_path = os.path.join(folder_path, file_name)
16 | 
17 |         # Build the new file name
18 |         new_name = f"train_{index + 1}.jpg"
19 |         # new_name = f"test_{index + 1}.jpg"
20 | 
21 |         # Build the new file path
22 |         new_path = os.path.join(folder_path, new_name)
23 | 
24 |         # Rename the file
25 |         os.rename(old_path, new_path)
26 | 
27 |         print(f"Renamed file: {file_name} -> {new_name}")
28 | 
29 | if __name__ == "__main__":
30 |     # Path to the image folder
31 |     images_folder_path = "train_images"
32 |     # images_folder_path = "test_images"
33 | 
34 |     # Call the function to perform the renaming
35 |     rename_images(images_folder_path)
36 | 
-------------------------------------------------------------------------------- /data_old/output/vocab.json: --------------------------------------------------------------------------------
1 | {"wearing": 4, "tank": 5, "pants,": 6, "hands": 7, "glasses": 8, "socks": 9, "stand": 10, "other,": 11, "patterns.": 12, "sweater": 13, "shorts,": 14, "plaid": 15, "color.": 16, "knitting": 17, "trousers": 18, "pure": 19, "striped": 20, "complicated": 21, "lattice.": 22, "ring": 23, "sleeves,": 24, "her": 25, "neckline.": 26, "wrist.": 27, "denim,": 28, "an": 29, "gentleman": 30, "stand.": 31, "solid": 32, "graphic.": 33, "square.": 34, "ring.": 35, "long-sleeve": 36, "medium-sleeve": 37, "v-shape.": 38, "head.": 39, "upper": 40, "has": 41, "of": 42, "off": 43, "pants.": 44, "sleeves": 45, "socks.": 46, "pattern": 47, "cotton.": 48, "the": 49, "neckline": 50, "graphic": 51, "skirt": 52, "patterns": 53, "is": 54, "belt": 55, "wears": 56, "trousers,": 57, "its": 58, "cotton": 59, "floral.": 60, "color": 61, "sunglasses.": 62, "guy": 63, "stripe": 64, "belt.": 65, "with": 66, "three-quarter": 67, "long": 68, "a": 69, "shorts.": 70, "shirt": 71, "hat": 72, "shorts": 73, "round.": 74, "suspenders": 75, "floral": 76, "lapel": 77, "hat.": 78, "top": 79, "chiffon": 80, "neck.": 81, "woman": 82, "trousers.": 83, "leather": 84, "striped.": 85, "knitting,": 86, "mixed": 87, "female": 88, "no": 89, "clothes.": 90, "t-shirt": 91, "it": 92, "are": 93, "cotton,": 94, "lady": 95, "suspenders.": 96, "waist.": 97, "neckwear.": 98, "lattice": 99, "v-shape": 100, "three-point": 101, "crew": 102, "in": 103, "other.": 104, "accessory": 105, "chiffon.": 106, "and": 107, "round": 108, "square": 109, "denim": 110, "also": 111, "outer": 112, "length.": 113, "pants": 114, "short-sleeve": 115, "short": 116, "off,": 117, "there": 118, "person": 119, "block": 120, "fabric.": 121, "clothing": 122, "plaid.": 123, "stripe.": 124, "or": 125, "crew.": 126, "chiffon,": 127, "knitting.": 128, "leggings.": 129, "lapel.": 130, "fabric": 131, "on": 132, "skirt.": 133, "this": 134, "shoes.": 135, "clothing,": 136, "his": 137, "pair": 138, "finger.": 139, "sleeveless": 140, "man": 141, "furry": 142, "block.": 143, "skirt,": 144, "cut": 145, "other": 146, "lower": 147, "medium": 148, "": 0, "": 1, "": 2, "": 3}
-------------------------------------------------------------------------------- /doc/NNDL图像_描述指南.pdf: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL图像_描述指南.pdf -------------------------------------------------------------------------------- /doc/NNDL课设_中期报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL课设_中期报告.pdf -------------------------------------------------------------------------------- /doc/NNDL课设_开题报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL课设_开题报告.pdf -------------------------------------------------------------------------------- /doc/NNDL课设_结题报告.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL课设_结题报告.pdf -------------------------------------------------------------------------------- /doc/NNDL课设_要求说明.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/NNDL课设_要求说明.pdf -------------------------------------------------------------------------------- /doc/img/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/01.png -------------------------------------------------------------------------------- /doc/img/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/02.png -------------------------------------------------------------------------------- /doc/img/03.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/03.png -------------------------------------------------------------------------------- /doc/img/AttentionModel-first_train-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel-first_train-1.png -------------------------------------------------------------------------------- /doc/img/AttentionModel-first_train-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel-first_train-2.png -------------------------------------------------------------------------------- /doc/img/AttentionModel-first_train-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel-first_train-3.png -------------------------------------------------------------------------------- /doc/img/AttentionModel-first_train-4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel-first_train-4.png -------------------------------------------------------------------------------- /doc/img/AttentionModel_backgroundcaption.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/AttentionModel_backgroundcaption.png -------------------------------------------------------------------------------- /doc/img/BLIP_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_1.png -------------------------------------------------------------------------------- /doc/img/BLIP_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_2.png -------------------------------------------------------------------------------- /doc/img/BLIP_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_3.png -------------------------------------------------------------------------------- /doc/img/BLIP_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_demo.png -------------------------------------------------------------------------------- /doc/img/BLIP_full.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/BLIP_full.png -------------------------------------------------------------------------------- /doc/img/CNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/CNN.png -------------------------------------------------------------------------------- /doc/img/Ex_data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Ex_data.png -------------------------------------------------------------------------------- /doc/img/OriginalModel-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/OriginalModel-1.png -------------------------------------------------------------------------------- /doc/img/OriginalModel-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/OriginalModel-2.png -------------------------------------------------------------------------------- /doc/img/Out_of_Memory.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Out_of_Memory.png -------------------------------------------------------------------------------- /doc/img/RNN.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/RNN.png -------------------------------------------------------------------------------- /doc/img/Transformer_demo1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_demo1.png -------------------------------------------------------------------------------- /doc/img/Transformer_demo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_demo2.png -------------------------------------------------------------------------------- /doc/img/Transformer_demo3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_demo3.png -------------------------------------------------------------------------------- /doc/img/Transformer_demo4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_demo4.png -------------------------------------------------------------------------------- /doc/img/Transformer_framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/Transformer_framework.png -------------------------------------------------------------------------------- /doc/img/image-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/image-1.png -------------------------------------------------------------------------------- /doc/img/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/image.png -------------------------------------------------------------------------------- /doc/img/image20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Conqueror712/Image-Caption/f032929e4a0c9452e7ca52c70f5b25a9f509c91d/doc/img/image20.png --------------------------------------------------------------------------------