├── Data
│   ├── test_data
│   │   └── 14398
│   │       ├── image.png
│   │       └── data.json
│   ├── train_fill_in_blank
│   │   └── 77070
│   │       ├── image.png
│   │       └── data.json
│   └── train.jsonl
├── requirements.txt
├── README.md
├── prediction.py
└── finetuning.py

/Data/test_data/14398/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dino-chiio/blip-vqa-finetune/HEAD/Data/test_data/14398/image.png
--------------------------------------------------------------------------------

/Data/train_fill_in_blank/77070/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dino-chiio/blip-vqa-finetune/HEAD/Data/train_fill_in_blank/77070/image.png
--------------------------------------------------------------------------------

/requirements.txt:
--------------------------------------------------------------------------------
tqdm==4.66.1
datasets==2.14.6
transformers==4.35.2
torch==2.1.0
torchsummary==1.5.1
torchvision==0.16.0
Pillow==10.0.1
--------------------------------------------------------------------------------

/Data/train.jsonl:
--------------------------------------------------------------------------------
{"question": "How many shapes are purple?", "answer": "3", "ques_type": "fill_in_blank", "grade": "kindergarten", "label": "Q9", "pid": "77070"}
--------------------------------------------------------------------------------

/Data/test_data/14398/data.json:
--------------------------------------------------------------------------------
{
    "question": "Move the ruler to measure the length of the sword to the nearest inch. The sword is about (_) inches long.",
    "id": "14398"
}
--------------------------------------------------------------------------------

/Data/train_fill_in_blank/77070/data.json:
--------------------------------------------------------------------------------
{
    "question": "How many shapes are purple?",
    "answer": "3",
    "ques_type": "fill_in_blank",
    "grade": "kindergarten",
    "label": "Q9"
}
--------------------------------------------------------------------------------
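Before fine-tuning, it can be worth checking that every record in `Data/train.jsonl` points to an existing image folder. The sketch below is not a file from the repository; it assumes only the `pid` field shown in the sample record above and the `Data/train_fill_in_blank/<pid>/image.png` layout used by `finetuning.py`.

```python
import json
import os

# Check that every training record has a matching image on disk.
missing = []
with open("Data/train.jsonl", "r") as f:
    for line in f:
        record = json.loads(line)  # e.g. {"question": ..., "answer": "3", "pid": "77070", ...}
        image_path = os.path.join("Data", "train_fill_in_blank", record["pid"], "image.png")
        if not os.path.exists(image_path):
            missing.append(record["pid"])

print("All training images found." if not missing else f"Missing images for pids: {missing}")
```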

/README.md:
--------------------------------------------------------------------------------
# Visual Question Answering using the pre-trained BLIP model

This implementation fine-tunes the pre-trained BLIP model for visual question answering on icon-domain images.

![The BLIP model for the VQA task](https://i.postimg.cc/ncnxSnJw/image.png)

![Example icon image](https://i.postimg.cc/1zSYsrmm/image.png)

| Question | Answer |
|--|--|
| How many dots are there? | 36 |

# Description

**Note:** The test dataset does not have labels. I evaluated the model through a Kaggle competition and reached 96% accuracy. Alternatively, you can hold out a portion of the training set as a test set.

## Create data folder

Copy all data into the `Data` folder, following the structure shown above.
You can download the data [here](https://drive.google.com/file/d/1tt6qJbOgevyPpfkylXpKYy-KaT4_aCYZ/view?usp=sharing).

## Install requirements

    pip install -r requirements.txt

## Run finetuning code

    python finetuning.py

## Run prediction

    python prediction.py

### References

> Nguyen Van Tuan (2023). JAIST Advanced Machine Learning: Visual Question Answering.
--------------------------------------------------------------------------------

/prediction.py:
--------------------------------------------------------------------------------
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import json, os, csv
from tqdm import tqdm
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set the path to your test data directory
test_data_dir = "Data/test_data"

# Load the processor from the base checkpoint and the weights saved by finetuning.py
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Model/blip-saved-model").to(device)
model.eval()

# Create a list to store the results
results = []

# Iterate through each sample folder in the test data directory
for filename in tqdm(os.listdir(test_data_dir), desc="Processing"):
    sample_path = os.path.join(test_data_dir, filename)
    if not os.path.isdir(sample_path):
        continue  # skip stray files such as .DS_Store

    # Read the json file
    json_path = os.path.join(sample_path, "data.json")
    with open(json_path, "r") as json_file:
        data = json.load(json_file)
        question = data["question"]
        image_id = data["id"]

    # Read the corresponding image
    image_path = os.path.join(test_data_dir, image_id, "image.png")
    image = Image.open(image_path).convert("RGB")

    # Prepare inputs and generate the answer
    encoding = processor(image, question, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(**encoding)
    generated_text = processor.decode(out[0], skip_special_tokens=True)

    results.append((image_id, generated_text))

# Write the results to a CSV file
os.makedirs("Results", exist_ok=True)
csv_file_path = "Results/results.csv"
with open(csv_file_path, mode="w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["ID", "Label"])  # Write header
    csv_writer.writerows(results)

print(f"Results saved to {csv_file_path}")
--------------------------------------------------------------------------------
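For a quick sanity check on a single sample, the same `BlipProcessor` / `BlipForQuestionAnswering` calls used in `prediction.py` can be run by hand. This is a minimal sketch rather than a file from the repository; the example path and question come from `Data/test_data/14398/` above, and it assumes the fine-tuned weights have already been saved to `Model/blip-saved-model` by `finetuning.py`.

```python
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

device = "cuda" if torch.cuda.is_available() else "cpu"

# Processor from the base checkpoint, weights from the fine-tuned model directory.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Model/blip-saved-model").to(device)
model.eval()

# One sample from the test split shown earlier.
image = Image.open("Data/test_data/14398/image.png").convert("RGB")
question = ("Move the ruler to measure the length of the sword to the nearest inch. "
            "The sword is about (_) inches long.")

inputs = processor(image, question, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs)
print(processor.decode(output_ids[0], skip_special_tokens=True))
```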

/finetuning.py:
--------------------------------------------------------------------------------
import pickle

import torch
from datasets import load_dataset
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import BlipProcessor, BlipForQuestionAnswering

model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

torch.cuda.empty_cache()
torch.manual_seed(42)


class VQADataset(torch.utils.data.Dataset):
    """Fill-in-the-blank VQA dataset built from Data/train.jsonl."""

    def __init__(self, dataset, processor):
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Get image + text
        question = self.dataset[idx]['question']
        answer = self.dataset[idx]['answer']
        image_id = self.dataset[idx]['pid']
        image_path = f"Data/train_fill_in_blank/{image_id}/image.png"
        image = Image.open(image_path).convert("RGB")

        # Encode the (image, question) pair and the answer tokens used as labels
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        labels = self.processor.tokenizer.encode(
            answer, max_length=8, padding="max_length", truncation=True, return_tensors='pt'
        )
        encoding["labels"] = labels
        # Remove the batch dimension added by the processor
        for k, v in encoding.items():
            encoding[k] = v.squeeze()
        return encoding


training_dataset = load_dataset("json", data_files="Data/train.jsonl", split="train[:90%]")
valid_dataset = load_dataset("json", data_files="Data/train.jsonl", split="train[90%:]")
print("Training set: {} - Validation set: {}".format(len(training_dataset), len(valid_dataset)))

train_dataset = VQADataset(dataset=training_dataset, processor=processor)
valid_dataset = VQADataset(dataset=valid_dataset, processor=processor)

batch_size = 12
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, pin_memory=True)
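
# A note on the training loop below (added commentary, not new behaviour): the
# forward pass runs under torch.amp.autocast in float16, and GradScaler scales
# the loss before backward() so that small half-precision gradients do not
# underflow, then unscales them before the optimizer step. After every epoch the
# validation loss is computed; the checkpoint with the lowest validation loss is
# saved, and training stops early once validation loss has not improved for more
# than `patience` epochs. The learning rate decays by a factor of 0.9 per epoch
# via ExponentialLR.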
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1, verbose=False)

num_epochs = 100
patience = 10
min_eval_loss = float("inf")
early_stopping_hook = 0
tracking_information = []
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    epoch_loss = 0
    model.train()
    for batch in tqdm(train_dataloader, desc='Training batch: ...'):
        input_ids = batch.pop('input_ids').to(device)
        pixel_values = batch.pop('pixel_values').to(device)
        attention_mask = batch.pop('attention_mask').to(device)
        labels = batch.pop('labels').to(device)

        optimizer.zero_grad()
        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids,
                            pixel_values=pixel_values,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs.loss
        epoch_loss += loss.item()

        # Scale the loss, backpropagate, and step the optimizer through the GradScaler
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc='Validating batch: ...'):
            input_ids = batch.pop('input_ids').to(device)
            pixel_values = batch.pop('pixel_values').to(device)
            attention_mask = batch.pop('attention_mask').to(device)
            labels = batch.pop('labels').to(device)

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(input_ids=input_ids,
                                pixel_values=pixel_values,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = outputs.loss
            eval_loss += loss.item()

    tracking_information.append((epoch_loss / len(train_dataloader),
                                 eval_loss / len(valid_dataloader),
                                 optimizer.param_groups[0]["lr"]))
    print("Epoch: {} - Training loss: {} - Eval loss: {} - LR: {}".format(
        epoch + 1,
        epoch_loss / len(train_dataloader),
        eval_loss / len(valid_dataloader),
        optimizer.param_groups[0]["lr"]))
    scheduler.step()

    if eval_loss < min_eval_loss:
        # Keep the checkpoint with the lowest validation loss
        model.save_pretrained("Model/blip-saved-model")
        print("Saved model to Model/blip-saved-model")
        min_eval_loss = eval_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            break

pickle.dump(tracking_information, open("tracking_information.pkl", "wb"))
print("The fine-tuning process is done!")
--------------------------------------------------------------------------------
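After training, `finetuning.py` writes its per-epoch statistics to `tracking_information.pkl`, where each entry is a `(training loss, validation loss, learning rate)` tuple. A minimal sketch for inspecting that file (not part of the repository, and assuming only that tuple layout) could look like this:

```python
import pickle

# Each entry was appended once per epoch by finetuning.py:
# (average training loss, average validation loss, learning rate)
with open("tracking_information.pkl", "rb") as f:
    tracking_information = pickle.load(f)

print(f"{'epoch':>5} {'train_loss':>12} {'eval_loss':>12} {'lr':>12}")
for epoch, (train_loss, eval_loss, lr) in enumerate(tracking_information, start=1):
    print(f"{epoch:>5} {train_loss:>12.4f} {eval_loss:>12.4f} {lr:>12.2e}")
```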