├── .gitignore
├── Dockerfile
├── Images
├── VizWiz_accuracy.png
├── model_architecture.png
├── training_answer_type_distribution.png
├── training_answerable_distribution.png
├── training_losses.png
├── training_vizwiz_accuracy.png
├── training_wordcloud.png
├── user_interface.png
└── vizwiz_example.png
├── LaTeX_Paper
├── Visual_Question_Answering_Report.pdf
└── main.tex
├── Papers
├── Less is More.pdf
└── OpenAI_CLIP model.pdf
├── README.md
├── Saved_Models
├── README.md
├── answer_onehotencoder.pkl
└── answer_type_onehotencoder.pkl
├── app.py
├── notebook.ipynb
├── requirements.txt
├── static
├── image.jpg
├── script.js
└── styles.css
├── templates
├── index.html
└── user_image.jpg
└── vqa_model.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/*
2 | vizwiz/*
3 | __pycache__/modelArchitecture.cpython-311.pyc
4 | *.csv
5 | Saved_Models/*.pth
6 | *.pyc
7 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8-slim-buster
2 |
3 | WORKDIR /app
4 |
5 | COPY . /app
6 |
7 | RUN apt-get update && apt-get install -y git
8 |
9 | RUN pip install git+https://github.com/openai/CLIP.git
10 |
11 | RUN pip install --trusted-host pypi.python.org -r requirements.txt
12 |
13 | EXPOSE 5000
14 |
15 | ENV FLASK_APP=app.py
16 |
17 | CMD ["flask", "run"]
--------------------------------------------------------------------------------
/Images/VizWiz_accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/VizWiz_accuracy.png
--------------------------------------------------------------------------------
/Images/model_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/model_architecture.png
--------------------------------------------------------------------------------
/Images/training_answer_type_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/training_answer_type_distribution.png
--------------------------------------------------------------------------------
/Images/training_answerable_distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/training_answerable_distribution.png
--------------------------------------------------------------------------------
/Images/training_losses.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/training_losses.png
--------------------------------------------------------------------------------
/Images/training_vizwiz_accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/training_vizwiz_accuracy.png
--------------------------------------------------------------------------------
/Images/training_wordcloud.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/training_wordcloud.png
--------------------------------------------------------------------------------
/Images/user_interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/user_interface.png
--------------------------------------------------------------------------------
/Images/vizwiz_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Images/vizwiz_example.png
--------------------------------------------------------------------------------
/LaTeX_Paper/Visual_Question_Answering_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/LaTeX_Paper/Visual_Question_Answering_Report.pdf
--------------------------------------------------------------------------------
/LaTeX_Paper/main.tex:
--------------------------------------------------------------------------------
1 | \documentclass[final,5p,times,twocolumn,authoryear]{elsarticle}
2 | \usepackage{amssymb}
3 | \usepackage{lipsum}
4 |
5 | \newcommand{\kms}{km\,s$^{-1}$}
6 | \newcommand{\msun}{$M_\odot$}
7 |
8 | \journal{Machine Learning Research}
9 |
10 |
11 | \begin{document}
12 |
13 | \begin{frontmatter}
14 |
15 | %% Title, authors and addresses
16 |
17 | %% use the tnoteref command within \title for footnotes;
18 | %% use the tnotetext command for the associated footnote;
19 | %% use the fnref command within \author or \affiliation for footnotes;
20 | %% use the fntext command for the associated footnote;
21 | %% use the corref command within \author for corresponding author footnotes;
22 | %% use the cortext command for the associated footnote;
23 | %% use the ead command for the email address,
24 | %% and the form \ead[url] for the home page:
25 | %% \title{Title\tnoteref{label1}}
26 | %% \tnotetext[label1]{}
27 | %% \author{Name\corref{cor1}\fnref{label2}}
28 | %% \ead{email address}
29 | %% \ead[url]{home page}
30 | %% \fntext[label2]{}
31 | %% \cortext[cor1]{}
32 | %% \fntext[label3]{}
33 |
34 | \title{Less Is More: Linear Layers on CLIP Features as Powerful VizWiz Model}
35 |
36 | %% use optional labels to link authors explicitly to addresses:
37 | %% \author[label1,label2]{}
38 | %% \affiliation[label1]{organization={},
39 | %% addressline={},
40 | %% city={},
41 | %% postcode={},
42 | %% state={},
43 | %% country={}}
44 | %%
45 | %% \affiliation[label2]{organization={},
46 | %% addressline={},
47 | %% city={},
48 | %% postcode={},
49 | %% state={},
50 | %% country={}}
51 |
52 | \author[first, second, third]{Yousef Kotp, Adham Mohammed, Mohammed Farid}
53 | \affiliation[first]{organization={@Alexandria University},%Department and Organization
54 | addressline={yousefkotp@outlook.com},
55 | city={Alexandria},
56 | country={Egypt}}
57 |
58 | \affiliation[second]{organization={@Alexandria University},%Department and Organization
59 | city={Alexandria},
60 | country={Egypt}}
61 |
62 | \affiliation[third]{organization={@Alexandria University},%Department and Organization
63 | city={Alexandria},
64 | country={Egypt}}
65 |
66 |
67 | %% Abstract
68 | \begin{abstract}
69 |
70 | In this work, we develop a lightweight model for visual question answering (VQA). By leveraging OpenAI's CLIP model, which embeds both images and text, we train a simple classification head that is competitive with far heavier architectures. Our model \textbf{achieved an accuracy of 54\% and ranked 4th on the official leaderboard} for Task 1: Predict Answer to a Visual Question. We also address the complexity of current VQA architectures: curating the answer vocabulary and incorporating an auxiliary loss for the answer type improves the training process, and the mask obtained from this loss acts as a gate over the answers, yielding a further performance boost. In Task 2: Predict Answerability of a Visual Question, our model achieved an \textbf{average precision of 73.15\%} and \textbf{ranked 3rd on the official leaderboard}. Both tasks are part of the VizWiz Visual Question Answering Grand Challenge.
71 |
72 | \end{abstract}
73 |
74 |
75 | %%Graphical abstract
76 | % \begin{graphicalabstract}
77 | % \includegraphics{JHEAP_cover_image.pdf}
78 | % \end{graphicalabstract}
79 |
80 | %%Research highlights
81 | % \begin{highlights}
82 | % \item Research highlight 1
83 | % \item Research highlight 2
84 | % \end{highlights}
85 |
86 |
87 |
88 | \begin{keyword}
89 | %% keywords here, in the form: keyword \sep keyword, up to a maximum of 6 keywords
90 | VizWiz \sep Visual Question Answering \sep OpenAI CLIP \sep VQA
91 | \end{keyword}
92 |
93 |
94 | \end{frontmatter}
95 |
96 | %\tableofcontents
97 |
98 | %% main text
99 |
100 | \section{Introduction}
101 | \label{introduction}
102 |
103 | In recent years, the field of visual question answering (VQA) has seen the development of numerous new architectures that have been applied to various datasets such as VQAv2, GQA, and VizWiz-VQA. The VizWiz dataset stands out from other VQA datasets due to the presence of several challenges in the data. For instance, questions in this dataset may be unanswerable due to missing information in the accompanying images, or the image quality itself may be exceptionally poor. Additionally, the questions in the dataset are not formulated based on rigid rules but rather tend to be colloquial and informal in nature.
104 |
105 | In the previous year, the winning team in the VizWiz-VQA challenge employed an extension of OSCAR, a popular VQA model. They enhanced their model by incorporating an optical character recognition (OCR) module and introducing reference image matching. Their final system consisted of an ensemble of 96 individual models. Ensembles are crucial for achieving competitive results in VQA, but they come with a significant cost in terms of training.
106 |
107 | It is worth noting that while many new architectures have emerged in recent years, the development and training of VQA models can be computationally expensive and resource-intensive. The complexity of the models, the size of the datasets, and the need for extensive training to optimize performance all contribute to the high cost associated with these approaches. Nonetheless, the continuous advancement of VQA architectures and techniques holds great promise for tackling the challenges posed by diverse and challenging datasets like VizWiz.
108 |
109 | \begin{figure}
110 | \centering
111 | \includegraphics[width=0.4\textwidth]{model_architecture.png}
112 | \caption{Our Proposed Model Architecture}
113 | \label{fig_architecture}%
114 | \end{figure}
115 |
116 | Our approach places emphasis on simplicity and usability in the context of visual question answering (VQA). We adopt a strategy that involves utilizing pre-trained image and text encoders from CLIP and focusing solely on training a straightforward classification head. The CLIP model encodes images with either a convolutional neural network (ResNet) or a Vision Transformer, and it leverages the power of a Transformer for text representation.
117 |
118 | To facilitate the alignment of images and text, the CLIP model is pre-trained on a vast dataset of 400 million image-text pairs. This pre-training phase employs a contrastive objective, enabling both image and text modalities to be embedded within the same space. This integration ensures that images and corresponding text descriptions can be effectively compared and matched. Furthermore, due to the large-scale training of CLIP, the model also possesses optical character recognition (OCR) capabilities, enhancing its ability to extract and understand text from images.
119 |
120 |
121 |
122 | %%%%%%%%%%%%%%%%%%%%%%%%%% RELATED WORK %%%%%%%%%%%%%%%%%%%%%%%
123 | % \section{Related Work}
124 |
125 |
126 |
127 | %%%%%%%%%%%%%%%%%%%%%%%%%% PROPOSED METHOD %%%%%%%%%%%%%%%%%%%%%%%
128 | \section{Proposed Method}
129 | \subsection{Building the Perfect Vocab}
130 |
131 | We curated the perfect vocabulary for a Visual Question Answering (VQA) model using the VizWiz dataset. The goal is to select the most representative and common answers for each question in the dataset.
132 |
133 | The vocabulary-building function follows a specific policy. For each question in the dataset, there are multiple answers available (typically 10).
134 | 
135 | First, for each question in the dataset, the function counts the frequency of each answer using an intermediate counter. This counter keeps track of how many times each answer occurs for that particular question.
136 |
137 | Once the answers for a question are counted, the function checks for any ties in the answer frequencies. If there is only one most frequent answer, it is selected as the answer for that question, and it is assigned to the corresponding row in the dataframe.
138 |
139 | However, if there is a tie in the answer frequencies, the function looks at the most common answer in the entire dataset. This step ensures that if multiple answers are equally common across different questions, the most common answer overall is chosen. Again, if there is a clear winner, it is assigned to the corresponding row in the dataframe.
140 |
141 | In the case where there is still a tie after considering the overall most common answer, the function utilizes pairwise Levenshtein distance. It calculates the Levenshtein distance between all the tied answers and selects the answer that has the minimum total distance from all the other tied answers. This answer is considered the most representative answer for the tied group.
142 |
143 | Finally, after iterating through all the questions in the dataset, the function returns the curated answer vocabulary, stored in the dataframe.
144 |
145 | By following this policy, the function ensures that the curated vocabulary consists of the most frequent and representative answers for each question in the VizWiz dataset, providing a reliable basis for training a visual question answering model. The number of distinct answers (classes) becomes 5410.
146 |
147 | \subsection{Auxiliary Loss}
148 |
149 | We used an auxiliary loss called the ``Answer Type Gate'' to improve the VQA model. This loss is designed to learn an answer masking mechanism based on the answer types, which helps the model better understand and handle different types of answers. The answer types considered in this approach are ``numbers'', ``yes/no'', ``others'', and ``unanswerable''.
150 |
151 | To train the auxiliary loss, the answer types are determined by performing regular expression matching on the best-selected answer for each image-question pair in the dataset. This means that for every question, the most suitable answer type is identified based on its content.
152 |
153 | The predictions for answer types are learned using a linear projection, which maps the answer type predictions to a vector of the same dimension as the number of possible answer classes. In this case, the dimension is 5410, representing the vocabulary size of the answers.
154 |
155 | After the linear projection, a sigmoid layer is applied to obtain probabilities for each answer type. These probabilities are then multiplied element-wise with the logits (scores) of the answer vocabulary. This multiplication effectively masks answers that do not correspond to the current answer type during inference. By applying this masking mechanism, the model can focus on generating answers that are relevant to the detected answer type.
156 |
157 | Both the intermediate answer type prediction and the final answer classification contribute to the loss function. The two cross-entropy losses, one for the intermediate answer type prediction and the other for the final answer classification, are weighted equally in the overall loss calculation. This means that both tasks are considered equally important during training, encouraging the model to effectively predict both the answer type and the answer itself.
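
Formally (our notation, not taken from the original implementation), let $z \in \mathbb{R}^{5410}$ denote the answer logits and $t \in \mathbb{R}^{4}$ the answer-type logits. The gate is
\begin{equation}
\tilde{z} = \sigma(W_g\, t) \odot z,
\end{equation}
where $W_g \in \mathbb{R}^{5410 \times 4}$ is the linear projection, $\sigma$ is the sigmoid function, and $\odot$ denotes element-wise multiplication. The total loss is the equally weighted sum of the two cross-entropy terms, $\mathcal{L} = \mathrm{CE}(t, y_{\mathrm{type}}) + \mathrm{CE}(\tilde{z}, y_{\mathrm{ans}})$.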
158 |
159 | \begin{figure}
160 | \centering
161 | \includegraphics[width=0.4\textwidth]{training_losses.png}
162 | \caption{Training and Validation Losses}
163 | \label{fig_losses}%
164 | \end{figure}
165 |
166 | By incorporating the Answer Type Gate auxiliary loss, the VQA model gains the ability to understand and distinguish different types of answers. This mechanism helps the model generate more accurate and contextually appropriate answers by masking irrelevant options during inference, improving its overall performance on the VQA task.
167 |
168 |
169 | %%%%%%%%%%%%%%%%%%%%%%%%%% SETUP %%%%%%%%%%%%%%%%%%%%%%%
170 | \section{Experimental Setup}
171 |
172 |
173 | The model was trained for a total of 50 epochs, and we kept the best-performing checkpoint, which occurred at epoch 45. Training was done primarily on Kaggle using a P100 accelerator. We used a learning rate of 5e-4 with a batch size of 32 (64 was worse in terms of variance). We also held out 0.05 of the training data as a test set instead of using the official test split, which noticeably reduced the data available for training; the experiment would likely give even better results without this split.
174 |
175 |
176 |
177 | %%%%%%%%%%%%%%%%%%%%%%%%%% EVALUATION %%%%%%%%%%%%%%%%%%%%%%%
178 | \section{Evaluation}
179 | \subsection{Evaluation Metrics}
180 | \paragraph{Task 1: Predict Answer to a Visual Question}
181 | Given an image and a question about it, the task is to predict an accurate answer. Inspired by the VQA challenge, we use an accuracy evaluation metric equal to the minimum of one and the number of humans that provided that answer divided by three. Following the VQA challenge, we average over all 10 choose 9 sets of human annotators. The team which achieves the maximum average accuracy over all test visual questions wins this challenge.
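
In equation form (our notation, consistent with the description above), the accuracy credited to a predicted answer $a$ for one set of annotators is
\begin{equation}
\mathrm{acc}(a) = \min\left(1,\ \frac{\mbox{number of humans that provided } a}{3}\right),
\end{equation}
and the reported score averages this quantity over the ten leave-one-annotator-out subsets.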
182 |
183 | \paragraph{Task 2: Predict Answerability of a Visual Question}
184 | Given an image and question about it, the task is to predict if the visual question cannot be answered (with a confidence score in that prediction). The confidence score provided by a prediction model is for ‘answerable’ and should be in [0,1]. We use Python’s average precision evaluation metric which computes the weighted mean of precisions under a precision-recall curve. The team that achieves the largest average precision score for all test visual questions wins this challenge.
185 |
186 | \begin{figure}
187 | \centering
188 | \includegraphics[width=0.4\textwidth]{training_vizwiz_accuracy.png}
189 | \caption{Training and Validation VizWiz Accuracy}
190 | \label{fig_accuracy}%
191 | \end{figure}
192 |
193 | \subsection{Results}
194 | The results table (Table~\ref{Table1}) presents the performance of our VQA model on various metrics. The metrics include VizWiz accuracy, accuracy, and answerability.
195 |
196 | For VizWiz accuracy, the model achieved a training accuracy of 80.4\% and a validation accuracy of 61.5\%. This metric indicates how well the model performs on the VizWiz dataset, which is a real-world image dataset containing challenging visual question answering scenarios.
197 |
198 | The accuracy metric measures the overall correctness of the model's predictions. The training accuracy achieved by the model is 76.4\%, while the validation accuracy is 48.0\%. This metric provides insight into the model's ability to generate accurate answers across different question types and image contexts.
199 |
200 | The answerability metric focuses on the model's capability to identify questions that are unanswerable. The training and validation answerability scores are 80.2\% and 79.8\%, respectively. This metric assesses the model's skill in determining when a question does not have a meaningful answer based on the given image.
201 |
202 | Overall, the results indicate that the model performs relatively well in terms of VizWiz accuracy and answerability, achieving high scores in both training and validation phases. However, the accuracy metric suggests that there is room for improvement, as the model's performance is comparatively lower, especially on the validation set. These findings provide valuable insights for further optimizing the VQA model and enhancing its overall performance.
203 |
204 |
205 |
206 |
207 |
208 |
209 | \section{Conclusion}
210 |
211 | Our approach focuses on lightweight training by keeping the pre-trained CLIP backbone frozen, while still maintaining good accuracy. The OCR capabilities of CLIP, the large amount of pre-training data, and the multi-modality make CLIP an excellent feature extractor for this task. Unlike previous publications, we also use the text Transformer from CLIP; although it was trained on alt-texts, it extracts meaningful representations of the questions without any fine-tuning. On the VizWiz VQA task we reach 61.5\% with a single model using the ViT backbone on the validation set and 54\% on the test set.
212 | \begin{table}
213 | \begin{tabular}{l c c}
214 | \hline
215 | Metric & Training & Validation \\
216 | \hline
217 | VizWiz Accuracy & 80.4 & 61.5 \\
218 | Accuracy & 76.4 & 48.0 \\
219 | Answerability & 80.2 & 79.8 \\
220 | \hline
221 | \end{tabular}
222 | 
223 | \caption{Results (\%) for different metrics}
224 |
225 | \label{Table1}
226 | \end{table}
227 |
228 |
229 | \appendix
230 |
231 | \section{GitHub Code}
232 | \label{appendix:code}
233 | https://github.com/yousefkotp/Visual-Question-Answering/tree/main
234 |
235 |
236 | \bibliographystyle{elsarticle-harv}
237 |
238 | %% else use the following coding to input the bibitems directly in the
239 | %% TeX file.
240 |
241 | \begin{thebibliography}{00}
242 |
243 | % \bibitem[Author(year)]{label}
244 | \bibitem[Gurari et al.(2018)]{VizWiz} Gurari, D., Li, Q., Stangl, A.J., Guo, A., Lin, C., Grauman, K., Luo, J., Bigham, J.P., 2018. VizWiz Grand Challenge: Answering Visual Questions from Blind People. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).
245 |
246 |
247 | \end{thebibliography}
248 |
249 | \end{document}
250 |
251 | \endinput
--------------------------------------------------------------------------------
/Papers/Less is More.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Papers/Less is More.pdf
--------------------------------------------------------------------------------
/Papers/OpenAI_CLIP model.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Papers/OpenAI_CLIP model.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Visual Question Answering
2 | Welcome to our Visual Question Answering project! Our goal is to create an intelligent system that can answer questions about images to win the [VizWiz grand challenge](https://vizwiz.org/tasks-and-datasets/vqa/), using state-of-the-art deep learning techniques. With our approach, we aim to push the boundaries of what's possible in computer vision and natural language processing. Our dataset is carefully curated and preprocessed to ensure the best possible performance of our model. We've also created a user-friendly web application that allows you to interact with our system and see its capabilities in action. Whether you're a researcher, a developer, or just curious about the latest advances in AI, we invite you to explore our project and learn more about it.
3 |
4 | **Note:** We have an official submission for the [VizWiz VQA challenge](https://vizwiz.org/tasks-and-datasets/vqa/), where we ranked 4th with an accuracy of 54%. Our team "AFK" is on the [leaderboard](https://eval.ai/web/challenges/challenge-page/1911/leaderboard/4517) for the challenge. We are also ranked 2nd for the answerability score in the [leaderboard](https://eval.ai/web/challenges/challenge-page/1911/leaderboard/4520).
5 |
6 |
7 |
8 |
9 |
10 | ## Table of Contents
11 | - [Visual Question Answering](#visual-question-answering)
12 | - [Our Approach](#our-approach)
13 | - [Dataset](#dataset)
14 | - [Data Exploration](#data-exploration)
15 | - [Data Splitting](#data-splitting)
16 | - [Data Preprocessing](#data-preprocessing)
17 | - [Model Architecture](#model-architecture)
18 | - [Training](#training)
19 | - [Loss Graphs](#loss-graphs)
20 | - [Accuracy Graphs](#accuracy-graphs)
21 | - [Evaluation](#evaluation)
22 | - [Results](#results)
23 | - [Web Application](#web-application)
24 | - [User Interface](#user-interface)
25 | - [Deployment](#deployment)
26 | - [Using Docker](#using-docker)
27 | - [Using Python](#using-python)
28 | - [Contributors](#contributors)
29 | - [References](#references)
30 |
31 | ## Our Approach
32 | Our Visual Question Answering (VQA) solution is implemented using a fixed vocabulary approach. This means that the model is not generative, but rather selects the answer from a pre-defined set of possible answers, as discussed in the [Less is More research](https://arxiv.org/abs/2206.05281). This approach is more suitable for our use case, as it allows us to control the vocabulary and ensure that the model only outputs answers that are relevant to the question. In addition to that, it requires far less computing power than generative alternatives. We use the [CLIP](https://openai.com/blog/clip/) model to extract features from the image and the question, and then feed them into a linear layer to predict the answer. We use the [VizWiz](https://vizwiz.org/tasks-and-datasets/vqa/) dataset for training and evaluation. We use the [PyTorch](https://pytorch.org/) library for building and training the model. We have also used [Flask](https://flask.palletsprojects.com/en/2.3.x/) to create a web application that allows you to interact with our model and see its capabilities in action. The web application is deployed using [Docker](https://www.docker.com/).
33 |
34 | ## Dataset
35 | For this project, we used the [VizWiz](https://vizwiz.org/tasks-and-datasets/vqa/) dataset. It is a large-scale visual question answering dataset that contains
36 | - 20,523 training image/question pairs
37 | - 205,230 training answer/answer confidence pairs
38 | - 4,319 validation image/question pairs
39 | - 43,190 validation answer/answer confidence pairs
40 | - 8,000 test image/question pairs
41 |
42 | Dataset files are as follows:
43 | - Images: training, validation, and test sets
44 | - Annotations: Visual questions are split into three JSON files: train, validation, and test. Answers are publicly shared for the train and validation splits and hidden for the test split.
45 |
46 | - The dataset is available on [Kaggle](https://www.kaggle.com/datasets/lhanhsin/vizwiz)
47 |
48 | ### Data Exploration
49 | We performed extensive exploratory data analysis in the [notebook file](https://github.com/yousefkotp/Visual-Question-Answering/blob/main/notebook.ipynb), so it is advisable to see the notebook first. However, here is an overview of the words in the training dataset questions.
50 |
51 |
52 | ![Training questions word cloud](Images/training_wordcloud.png)
53 |
54 |
55 | ### Data Splitting
56 | For the training and validation sets, we used the official splits provided by the VizWiz dataset. For the test set, we randomly selected 5% of the training set. We used the [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html) function from the [scikit-learn](https://scikit-learn.org/stable/) library to split the data.
57 |
58 | ### Data Preprocessing
59 | According to the paper we are implementing, we follow a policy to choose the most suitable answer out of the 10 provided answers and train the model on that answer. The policy for building the vocabulary is as follows (a minimal sketch is shown after the list):
60 |
61 | 1- Choose the most frequent answer out of the 10 answers
62 |
63 | 2- If there is a tie, we will choose the most frequent one in the entire set of all answers
64 |
65 | 3- If there is still a tie, we will choose the answer with the minimum total Levenshtein distance to the other tied answers
66 |
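Below is a minimal Python sketch of this three-step policy (illustrative only; the actual implementation lives in the notebook). The `select_answer` helper and the `python-Levenshtein` dependency are assumptions, not part of this repository:

```python
from collections import Counter
import Levenshtein  # assumed dependency: pip install python-Levenshtein

def select_answer(answers, global_counts):
    """Pick one training answer from the ~10 annotator answers of a question."""
    local = Counter(answers)
    top = max(local.values())
    tied = [a for a, c in local.items() if c == top]            # step 1: most frequent per question
    if len(tied) > 1:                                           # step 2: most frequent in the whole dataset
        top_global = max(global_counts[a] for a in tied)
        tied = [a for a in tied if global_counts[a] == top_global]
    if len(tied) > 1:                                           # step 3: minimum total Levenshtein distance
        tied.sort(key=lambda a: sum(Levenshtein.distance(a, b) for b in tied))
    return tied[0]

global_counts = Counter(["yes", "yes", "yes", "no", "no", "unanswerable"])
print(select_answer(["no", "yes", "yes", "no"], global_counts))  # -> "yes" (tie broken by global frequency)
```
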
67 | We also need to one hot encode the answers, so we will have a vector of size 5410, where 5410 is the size of the vocabulary. We will use the [One Hot Encoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) from the [scikit-learn](https://scikit-learn.org/stable/) library to one hot encode the answers and answer type.
68 |
69 | Instead of using lazy processing and extracting the image and question embeddings on the fly, we extract them beforehand and save them in a pickle file. We use the [CLIP](https://openai.com/blog/clip/) model to extract the image and question embeddings, as sketched below.
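
The sketch below illustrates both steps, fitting the one-hot encoder and pre-computing CLIP embeddings. The file paths match the repository, but the toy data and the exact preprocessing flow are assumptions rather than the notebook's actual code:

```python
import pickle
import torch
import clip  # installed from git+https://github.com/openai/CLIP.git
from PIL import Image
from sklearn.preprocessing import OneHotEncoder

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-L/14@336px", device=device)

# Fit the one-hot encoder on the curated answers (5410 classes in the full dataset)
toy_answers = [["yes"], ["no"], ["unanswerable"]]
answer_encoder = OneHotEncoder(handle_unknown="ignore").fit(toy_answers)

# Pre-compute CLIP embeddings once and cache them instead of encoding lazily on the fly
with torch.no_grad():
    image_embedding = model.encode_image(preprocess(Image.open("static/image.jpg")).unsqueeze(0).to(device))
    question_embedding = model.encode_text(clip.tokenize(["What is in this picture?"]).to(device))

with open("Saved_Models/answer_onehotencoder.pkl", "wb") as f:
    pickle.dump(answer_encoder, f)
```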
70 |
71 | ## Model Architecture
72 | We will follow the architecture mentioned in [Less Is More](https://arxiv.org/abs/2206.05281) paper. The architecture goes as follows:
73 |
74 | - Each linear layer consists of Layer Norm, followed by Dropout with probability 0.5, followed by a fully connected layer of size 512.
75 | - We use Cross Entropy Loss for the answer and the answer type.
76 | - We use Binary Cross Entropy Loss for answerability.
77 |
78 | Here is an illustration of the model architecture:
79 |
80 |
81 | ![Model architecture](Images/model_architecture.png)
82 |
83 |
84 | - **Note:** We removed the dropout from the answerability linear layer, as it was causing a lot of noise when training answerability.
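
As a rough PyTorch sketch of this head (our own illustration, not the repository's `vqa_model.py`; the 1536-dimensional input assumes the 768-d CLIP ViT-L/14 image and text embeddings are concatenated, and the exact wiring of the branches may differ):

```python
import torch
import torch.nn as nn

def linear_block(in_dim, out_dim, dropout=0.5):
    # "Linear layer" as described above: Layer Norm -> Dropout(0.5) -> fully connected
    return nn.Sequential(nn.LayerNorm(in_dim), nn.Dropout(dropout), nn.Linear(in_dim, out_dim))

class VQAHead(nn.Module):
    def __init__(self, feat_dim=1536, hidden=512, num_answers=5410, num_types=4):
        super().__init__()
        self.shared = linear_block(feat_dim, hidden)
        self.answer = linear_block(hidden, num_answers)
        self.answer_type = linear_block(hidden, num_types)
        # No dropout here -- it made answerability training noisy (see note above)
        self.answerability = nn.Sequential(nn.LayerNorm(hidden), nn.Linear(hidden, 1))

    def forward(self, fused_features):
        h = self.shared(fused_features)
        return self.answer(h), self.answer_type(h), torch.sigmoid(self.answerability(h)).squeeze(-1)

# Cross entropy for answer and answer type, binary cross entropy for answerability
answer_loss, type_loss, answerability_loss = nn.CrossEntropyLoss(), nn.CrossEntropyLoss(), nn.BCELoss()
```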
85 |
86 | ## Training
87 | We trained the model on Kaggle using a P100 CUDA accelerator for a total of 50 epochs and picked the best one; each epoch took ~1 minute to finish. We used a batch size of 32 (64 was worse in terms of variance) and the Adam optimizer with a learning rate of 5e-4. No L2 regularization was used. The model was saved every 5 epochs, and the best model was picked based on the validation accuracy. We used the [PyTorch](https://pytorch.org/) library for building and training the model.
88 | 
89 | The best model was recorded at epoch 45, so we use that model for evaluation.
90 |
91 | ### Loss Graphs
92 |
93 |
94 | ![Training and validation losses](Images/training_losses.png)
95 |
96 |
97 | ### Accuracy Graphs
98 |
99 |
100 | ![Training and validation VizWiz accuracy](Images/training_vizwiz_accuracy.png)
101 |
102 |
103 | ## Evaluation
104 | According to evaluation in [VizWiz VQA Challenge](https://vizwiz.org/tasks-and-datasets/vqa/), we will use the following metrics to evaluate our model:
105 | - **VizWiz Accuracy:** Given an image and question about it, the task is to predict an accurate answer. Inspired by the VQA challenge, we use the following accuracy evaluation metric:
106 |
107 |
108 |
109 |
110 | The VizWiz accuracy metric is the minimum of 1 and the number of humans that provided that answer divided by 3.
111 |
112 | - **Answerability**: given an image and question about it, the task is to predict if the visual question cannot be answered (with a confidence score in that prediction). The confidence score provided by a prediction model is for ‘answerable’ and should be in [0,1]. We use [Python’s average precision evaluation metric](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html) which computes the weighted mean of precisions under a precision-recall curve.
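
A minimal usage example of that metric (toy numbers, not our actual results):

```python
from sklearn.metrics import average_precision_score

# y_true: 1 = the visual question is answerable; y_score: model confidence for "answerable"
y_true = [1, 0, 1, 1, 0]
y_score = [0.92, 0.40, 0.78, 0.65, 0.15]
print(average_precision_score(y_true, y_score))
```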
113 |
114 | ### Results
115 | 
116 | | Metric | Training | Validation |
117 | | ------ | -------- | ---------- |
118 | | VizWiz Accuracy | 80.4% | 61.5% |
119 | | Accuracy | 76.4% | 48.0% |
120 | | Answerability | 80.2% | 79.8% |
121 | 
139 | ## Web Application
140 | [Flask](https://flask.palletsprojects.com/en/2.3.x/) is a lightweight web application framework. It is designed to make getting started quick and easy, with the ability to scale up to complex applications. We used Flask to build a web application that allows users to upload an image or enter an image URL and ask a question about the image. The application then displays the image and the answer to the question in a user-friendly way.
141 |
142 | ### User Interface
143 |
144 |
145 | ![User interface](Images/user_interface.png)
146 |
147 |
148 | ## Deployment
149 | To ease the process of running the application, we provide two options for deployment: using Docker or using Python. We recommend Docker, as it is easier to set up and run; if you prefer Python, follow the instructions below. In either case, you need to place the trained model in the `Saved_Models` folder with the name `model.pth`.
150 |
151 | ### Using Docker
152 | 1- Make sure to install [Docker](https://www.docker.com/) on your machine.
153 |
154 | 2- Run the following command to build the Docker image
155 |
156 | ```bash
157 | docker build -t vqa-flask-app .
158 | ```
159 |
160 | 3- Run the following command to start the server
161 |
162 | ```bash
163 | docker run -p 5000:5000 vqa-flask-app
164 | ```
165 |
166 | - You can change the port number of the deployed application by changing the first number in the command above.
167 |
168 | 4- Open your browser and go to http://localhost:5000/
169 |
170 |
171 | ### Using Python
172 |
173 | 1- Make sure to install the requirements
174 |
175 | ```bash
176 | pip install -r requirements.txt
177 | ```
178 |
179 | 2- Set up the Flask application
180 |
181 | ```bash
182 | set FLASK_APP=app.py
183 | ```
184 |
185 | - If you are using Linux or macOS, use the following command instead
186 |
187 | ```bash
188 | export FLASK_APP=app.py
189 | ```
190 |
191 | 3- Run the following command to start the server
192 |
193 | ```bash
194 | python -m flask run
195 | ```
196 |
197 | 4- Open your browser and go to http://localhost:5000/
198 |
199 | ## Contributors
200 |
201 | - [Yousef Kotp](https://github.com/yousefkotp)
202 |
203 | - [Adham Mohamed](https://github.com/adhammohamed1)
204 |
205 | - [Mohamed Farid](https://github.com/MohamedFarid612)
206 |
207 | ## References
208 |
209 | - [Less Is More: Linear Layers on CLIP Features as Powerful VizWiz Model](https://arxiv.org/abs/2206.05281)
210 |
211 | - [CLIP: Connecting text and images](https://openai.com/research/clip)
212 |
213 | - [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
214 |
--------------------------------------------------------------------------------
/Saved_Models/README.md:
--------------------------------------------------------------------------------
1 | # Saved Models
2 | - This folder must contain at least one model to be used by the application. The model must be in `.pth` format and named `model.pth`. If you want to use more than one model, you can add them as `model1.pth`, `model2.pth`, etc., and the application will automatically detect the number of models and use them all.
3 |
4 | - You should also store the fitted one-hot encoders for the answer and answer type here. They must be named `answer_onehotencoder.pkl` and `answer_type_onehotencoder.pkl`, respectively.
--------------------------------------------------------------------------------
/Saved_Models/answer_onehotencoder.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Saved_Models/answer_onehotencoder.pkl
--------------------------------------------------------------------------------
/Saved_Models/answer_type_onehotencoder.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/Saved_Models/answer_type_onehotencoder.pkl
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, jsonify, render_template, url_for
2 | import torch
3 | import pickle
4 | from vqa_model import VQAModel
5 | import urllib.request
6 |
7 | app = Flask(__name__)
8 |
9 | # Loading the fitted One Hot Encoders from the disk
10 | with open('Saved_Models/answer_onehotencoder.pkl', 'rb') as f:
11 | ANSWER_ONEHOTENCODER = pickle.load(f)
12 | with open('Saved_Models/answer_type_onehotencoder.pkl', 'rb') as f:
13 | ANSWER_TYPE_ONEHOTENCODER = pickle.load(f)
14 |
15 | # Loading the model from the disk
16 | DEVICE = torch.device("cpu")
17 | MODEL_NAME = "ViT-L/14@336px"
18 | NUM_CLASSES = 5410
19 | MODEL_PATH = "Saved_Models/model.pth"
20 | model = VQAModel(num_classes=NUM_CLASSES, device= DEVICE, hidden_size=512, model_name=MODEL_NAME).to(DEVICE)
21 | model.load_model(MODEL_PATH)
22 |
23 | @app.route('/')
24 | def home():
25 | return render_template('index.html')
26 |
27 |
28 | @app.route('/predict', methods=['POST'])
29 | def predict():
30 | # Get the image and question from the request
31 | image_url = request.form.get('image_url')
32 | question = request.form.get('question')
33 |
34 | if 'image' in request.files:
35 | # The image is a file uploaded from a device
36 | image = request.files['image']
37 | image_path = 'templates/user_image.jpg'
38 | image.save(image_path)
39 | elif image_url:
40 | # The image is a URL
41 | image_path = 'templates/user_image.jpg'
42 | urllib.request.urlretrieve(image_url, image_path)
43 | else:
44 | # No image was provided
45 | return 'No image provided'
46 |
47 | # Predict the answer and answer type
48 | predicted_answer, predicted_answer_type, answerability = model.test_model(image_path = image_path, question = question)
49 | answer = ANSWER_ONEHOTENCODER.inverse_transform(predicted_answer.cpu().detach().numpy())
50 | answer_type = ANSWER_TYPE_ONEHOTENCODER.inverse_transform(predicted_answer_type.cpu().detach().numpy())
51 |
52 | # Return the predicted answer and answer type as a JSON response
53 | response = {'answer': answer[0][0], 'answer_type': answer_type[0][0], 'answerability': answerability.item()}
54 | return jsonify(response)
55 |
56 | if __name__ == '__main__':
57 | app.run(debug=True)
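
# Example request against the /predict endpoint (illustrative; the "image" and "question"
# field names match the form handling above):
#   curl -X POST -F image=@static/image.jpg -F question="What is in this image?" http://localhost:5000/predict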
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/openai/CLIP.git
2 | flask
3 | torch
4 | numpy
5 | matplotlib
6 | pillow
7 | clip
8 | requests
9 | scikit-learn
--------------------------------------------------------------------------------
/static/image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yousefkotp/Visual-Question-Answering/9777f431980fd004c5f971214a4d75a9481d6be6/static/image.jpg
--------------------------------------------------------------------------------
/static/script.js:
--------------------------------------------------------------------------------
1 | function submitForm() {
2 | var form = document.getElementById("predict-form");
3 | var formData = new FormData(form);
4 | var xhr = new XMLHttpRequest();
5 | xhr.open("POST", "/predict");
6 | xhr.onload = function() {
7 | if (xhr.status === 200) {
8 | var resultDiv = document.getElementById("result");
9 |
10 | // We will receive object as a string, so we need to parse it to JSON
11 | var objectReceived = JSON.parse(xhr.responseText);
12 |
13 | var answer = objectReceived.answer;
14 | var answer_type = objectReceived.answer_type;
15 | var answerability = 1 - objectReceived.answerability;
16 |
17 | resultDiv.innerHTML = "