├── .github
│   ├── timestamp
│   └── workflows
│       └── main.yml
├── test_image.jpg
├── .gitattributes
├── requirements.txt
├── blip-image-captioning-base
│   ├── pytorch_model.bin
│   ├── special_tokens_map.json
│   ├── preprocessor_config.json
│   ├── tokenizer_config.json
│   ├── config.json
│   └── README.md
├── readme.md
├── image-captioning.py
└── .gitignore
/.github/timestamp: -------------------------------------------------------------------------------- 1 | 2025-12-23T05:54:28Z 2 | -------------------------------------------------------------------------------- /test_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/botextractai/ai-image-captioning/HEAD/test_image.jpg -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | blip-image-captioning-base/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/botextractai/ai-image-captioning/HEAD/requirements.txt -------------------------------------------------------------------------------- /blip-image-captioning-base/pytorch_model.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d6638651a5526cc2ede56f2b5104d6851b0755816d220e5e046870430180c767 3 | size 989820849 4 | -------------------------------------------------------------------------------- /blip-image-captioning-base/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /blip-image-captioning-base/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "do_resize": true, 4 | "image_mean": [ 5 | 0.48145466, 6 | 0.4578275, 7 | 0.40821073 8 | ], 9 | "image_processor_type": "BlipImageProcessor", 10 | "image_std": [ 11 | 0.26862954, 12 | 0.26130258, 13 | 0.27577711 14 | ], 15 | "processor_class": "BlipProcessor", 16 | "size": 384 17 | } 18 | -------------------------------------------------------------------------------- /blip-image-captioning-base/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_basic_tokenize": true, 4 | "do_lower_case": true, 5 | "mask_token": "[MASK]", 6 | "model_max_length": 512, 7 | "name_or_path": "bert-base-uncased", 8 | "never_split": null, 9 | "pad_token": "[PAD]", 10 | "processor_class": "BlipProcessor", 11 | "sep_token": "[SEP]", 12 | "special_tokens_map_file": null, 13 | "strip_accents": null, 14 | "tokenize_chinese_chars": true, 15 | "tokenizer_class": "BertTokenizer", 16 | "unk_token": "[UNK]", 17 | "model_input_names": [ 18 | "input_ids", 19 | "attention_mask" 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Image captioning with a locally stored Large Language Model (LLM)
2 | 3 | This example generates a caption for an image. 4 | 5 | It runs fully locally on your computer and does not require a Graphics Processing Unit (GPU). 6 | 7 | It uses the Salesforce [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://huggingface.co/Salesforce/blip-image-captioning-base) Large Language Model (LLM) and Hugging Face Transformers. Please note that this is a very small model and its capabilities are therefore limited, but the results are still very impressive for its size. To run the example, install the dependencies from `requirements.txt` and then execute the `image-captioning.py` script. 8 | 9 | This example uses this test image: 10 | 11 | ![alt text](https://github.com/botextractai/ai-image-captioning/blob/main/test_image.jpg "Test image") 12 | 13 | to automatically generate this image caption: 14 | 15 | `a set of toy cars and traffic cones` 16 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Timestamp 2 | on: 3 | push: 4 | branches: 5 | - master 6 | schedule: 7 | - cron: '45 5 * * *' 8 | jobs: 9 | auto_commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | persist-credentials: false 15 | fetch-depth: 0 16 | - name: Modify timestamp file 17 | run: | 18 | d=`date '+%Y-%m-%dT%H:%M:%SZ'` 19 | echo $d > .github/timestamp 20 | - name: Commit changes 21 | run: | 22 | git config --local user.email "${{ secrets.USEREMAIL }}" 23 | git config --local user.name "${{ secrets.USERNAME }}" 24 | git commit -a -m "Timestamp" 25 | - name: Push Back 26 | uses: ad-m/github-push-action@master 27 | with: 28 | force: true 29 | directory: '.' 30 | github_token: ${{ secrets.GITHUB_TOKEN }} 31 | -------------------------------------------------------------------------------- /image-captioning.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from PIL import Image 3 | from transformers import AutoProcessor 4 | from transformers import BlipForConditionalGeneration 5 | from transformers.utils import logging 6 | 7 | logging.set_verbosity_error() 8 | 9 | # Suppress warning message 10 | warnings.filterwarnings("ignore", message="Using the model-agnostic default `max_length`") 11 | 12 | # Load the Large Language Model (LLM) 13 | model = BlipForConditionalGeneration.from_pretrained( 14 | "./blip-image-captioning-base") 15 | 16 | # Load the processor 17 | processor = AutoProcessor.from_pretrained( 18 | "./blip-image-captioning-base") 19 | 20 | # Load the image 21 | image = Image.open("./test_image.jpg") 22 | 23 | # Create the input 24 | inputs = processor(image, return_tensors="pt") 25 | 26 | # Get the output 27 | out = model.generate(**inputs) 28 | 29 | # Print the output 30 | print(processor.decode(out[0], skip_special_tokens=True)) 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 |
# before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /blip-image-captioning-base/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_commit_hash": null, 3 | "architectures": [ 4 | "BlipForConditionalGeneration" 5 | ], 6 | "image_text_hidden_size": 256, 7 | "initializer_factor": 1.0, 8 | "logit_scale_init_value": 2.6592, 9 | "model_type": "blip", 10 | "projection_dim": 512, 11 | "text_config": { 12 | "_name_or_path": "", 13 | "add_cross_attention": false, 14 | "architectures": null, 15 | "attention_probs_dropout_prob": 0.0, 16 | "bad_words_ids": null, 17 | "begin_suppress_tokens": null, 18 | "bos_token_id": 30522, 19 | "chunk_size_feed_forward": 0, 20 | "cross_attention_hidden_size": null, 21 | "decoder_start_token_id": null, 22 | "diversity_penalty": 0.0, 23 | "do_sample": false, 24 | "early_stopping": false, 25 | "encoder_no_repeat_ngram_size": 0, 26 | "eos_token_id": 2, 27 | "exponential_decay_length_penalty": null, 28 | "finetuning_task": null, 29 | "forced_bos_token_id": null, 30 | "forced_eos_token_id": null, 31 | "hidden_act": "gelu", 32 | "hidden_dropout_prob": 0.0, 33 | "hidden_size": 768, 34 | "id2label": { 35 | "0": "LABEL_0", 36 | "1": "LABEL_1" 37 | }, 38 | "initializer_factor": 1.0, 39 | "initializer_range": 0.02, 40 | "intermediate_size": 3072, 41 | "is_decoder": true, 42 | "is_encoder_decoder": false, 43 | "label2id": { 44 | "LABEL_0": 0, 45 | "LABEL_1": 1 46 | }, 47 | "layer_norm_eps": 1e-12, 48 | "length_penalty": 1.0, 49 | "max_length": 20, 50 | "max_position_embeddings": 512, 51 | "min_length": 0, 52 | "model_type": "blip_text_model", 53 | "no_repeat_ngram_size": 0, 54 | "num_attention_heads": 12, 55 | "num_beam_groups": 1, 56 | "num_beams": 1, 57 | "num_hidden_layers": 12, 58 | "num_return_sequences": 1, 59 | "output_attentions": false, 60 | "output_hidden_states": false, 61 | "output_scores": false, 62 | "pad_token_id": 0, 63 | "prefix": null, 64 | "problem_type": null, 65 | "projection_dim": 768, 66 | "pruned_heads": {}, 67 | "remove_invalid_values": false, 68 | "repetition_penalty": 1.0, 69 | "return_dict": true, 70 | "return_dict_in_generate": false, 71 | "sep_token_id": 102, 72 | "suppress_tokens": null, 73 | "task_specific_params": null, 74 | "temperature": 1.0, 75 | "tf_legacy_loss": false, 76 | "tie_encoder_decoder": false, 77 | "tie_word_embeddings": true, 78 | "tokenizer_class": null, 79 | "top_k": 50, 80 | "top_p": 1.0, 81 | "torch_dtype": null, 82 | "torchscript": false, 83 | "transformers_version": "4.26.0.dev0", 84 | "typical_p": 1.0, 85 | "use_bfloat16": false, 86 | "use_cache": true, 87 | "vocab_size": 30524 88 | }, 89 | "torch_dtype": "float32", 90 | "transformers_version": null, 91 | "vision_config": { 92 | "_name_or_path": "", 93 | "add_cross_attention": false, 94 | "architectures": null, 95 | "attention_dropout": 0.0, 96 | "bad_words_ids": null, 97 | "begin_suppress_tokens": null, 98 | "bos_token_id": null, 99 | "chunk_size_feed_forward": 0, 100 | "cross_attention_hidden_size": null, 101 | "decoder_start_token_id": null, 102 | "diversity_penalty": 0.0, 103 | "do_sample": false, 104 | "dropout": 0.0, 105 | "early_stopping": false, 106 | "encoder_no_repeat_ngram_size": 0, 107 | "eos_token_id": null, 108 | "exponential_decay_length_penalty": null, 109 | "finetuning_task": null, 110 | "forced_bos_token_id": null, 111 | "forced_eos_token_id": null, 112 | "hidden_act": "gelu", 113 | "hidden_size": 768, 114 | "id2label": { 115 | "0": "LABEL_0", 116 | "1": 
"LABEL_1" 117 | }, 118 | "image_size": 384, 119 | "initializer_factor": 1.0, 120 | "initializer_range": 0.02, 121 | "intermediate_size": 3072, 122 | "is_decoder": false, 123 | "is_encoder_decoder": false, 124 | "label2id": { 125 | "LABEL_0": 0, 126 | "LABEL_1": 1 127 | }, 128 | "layer_norm_eps": 1e-05, 129 | "length_penalty": 1.0, 130 | "max_length": 20, 131 | "min_length": 0, 132 | "model_type": "blip_vision_model", 133 | "no_repeat_ngram_size": 0, 134 | "num_attention_heads": 12, 135 | "num_beam_groups": 1, 136 | "num_beams": 1, 137 | "num_channels": 3, 138 | "num_hidden_layers": 12, 139 | "num_return_sequences": 1, 140 | "output_attentions": false, 141 | "output_hidden_states": false, 142 | "output_scores": false, 143 | "pad_token_id": null, 144 | "patch_size": 16, 145 | "prefix": null, 146 | "problem_type": null, 147 | "projection_dim": 512, 148 | "pruned_heads": {}, 149 | "remove_invalid_values": false, 150 | "repetition_penalty": 1.0, 151 | "return_dict": true, 152 | "return_dict_in_generate": false, 153 | "sep_token_id": null, 154 | "suppress_tokens": null, 155 | "task_specific_params": null, 156 | "temperature": 1.0, 157 | "tf_legacy_loss": false, 158 | "tie_encoder_decoder": false, 159 | "tie_word_embeddings": true, 160 | "tokenizer_class": null, 161 | "top_k": 50, 162 | "top_p": 1.0, 163 | "torch_dtype": null, 164 | "torchscript": false, 165 | "transformers_version": "4.26.0.dev0", 166 | "typical_p": 1.0, 167 | "use_bfloat16": false 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /blip-image-captioning-base/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | pipeline_tag: image-to-text 3 | tags: 4 | - image-captioning 5 | languages: 6 | - en 7 | license: bsd-3-clause 8 | --- 9 | 10 | # BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation 11 | 12 | Model card for image captioning pretrained on COCO dataset - base architecture (with ViT base backbone). 13 | 14 | | ![BLIP.gif](https://cdn-uploads.huggingface.co/production/uploads/1670928184033-62441d1d9fdefb55a0b7d12c.gif) | 15 | |:--:| 16 | | Pull figure from BLIP official repo | Image source: https://github.com/salesforce/BLIP | 17 | 18 | ## TL;DR 19 | 20 | Authors from the [paper](https://arxiv.org/abs/2201.12086) write in the abstract: 21 | 22 | *Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to videolanguage tasks in a zero-shot manner. 
Code, models, and datasets are released.* 23 | 24 | ## Usage 25 | 26 | You can use this model for conditional and unconditional image captioning. 27 | 28 | ### Using the PyTorch model 29 | 30 | #### Running the model on CPU 31 | 32 |
33 | 34 | 35 | ```python 36 | import requests 37 | from PIL import Image 38 | from transformers import BlipProcessor, BlipForConditionalGeneration 39 | 40 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 41 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") 42 | 43 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 44 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 45 | 46 | # conditional image captioning 47 | text = "a photography of" 48 | inputs = processor(raw_image, text, return_tensors="pt") 49 | 50 | out = model.generate(**inputs) 51 | print(processor.decode(out[0], skip_special_tokens=True)) 52 | # >>> a photography of a woman and her dog 53 | 54 | # unconditional image captioning 55 | inputs = processor(raw_image, return_tensors="pt") 56 | 57 | out = model.generate(**inputs) 58 | print(processor.decode(out[0], skip_special_tokens=True)) 59 | # >>> a woman sitting on the beach with her dog 60 | ``` 61 |
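This repository also ships the same checkpoint locally in the `blip-image-captioning-base` folder, so the CPU example above can be run fully offline against that copy. A minimal sketch, assuming it is run from the repository root and reuses the bundled `test_image.jpg` (the local paths come from this repository's layout, not from the original model card):

```python
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the processor and model from the locally stored copy instead of the Hugging Face Hub
processor = BlipProcessor.from_pretrained("./blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("./blip-image-captioning-base")

# Use the test image bundled with this repository
raw_image = Image.open("./test_image.jpg").convert("RGB")

# Unconditional image captioning, as in image-captioning.py
inputs = processor(raw_image, return_tensors="pt")
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a set of toy cars and traffic cones
```

Apart from the local paths (and using `BlipProcessor` directly instead of `AutoProcessor`), this mirrors what `image-captioning.py` in this repository does.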
62 | 63 | #### Running the model on GPU 64 | 65 | ##### In full precision 66 | 67 |
68 | 69 | 70 | ```python 71 | import requests 72 | from PIL import Image 73 | from transformers import BlipProcessor, BlipForConditionalGeneration 74 | 75 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 76 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda") 77 | 78 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 79 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 80 | 81 | # conditional image captioning 82 | text = "a photography of" 83 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda") 84 | 85 | out = model.generate(**inputs) 86 | print(processor.decode(out[0], skip_special_tokens=True)) 87 | # >>> a photography of a woman and her dog 88 | 89 | # unconditional image captioning 90 | inputs = processor(raw_image, return_tensors="pt").to("cuda") 91 | 92 | out = model.generate(**inputs) 93 | print(processor.decode(out[0], skip_special_tokens=True)) 94 | # >>> a woman sitting on the beach with her dog 95 | ``` 96 |
97 | 98 | ##### In half precision (`float16`) 99 | 100 |
101 | 102 | 103 | ```python 104 | import torch 105 | import requests 106 | from PIL import Image 107 | from transformers import BlipProcessor, BlipForConditionalGeneration 108 | 109 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 110 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to("cuda") 111 | 112 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 113 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 114 | 115 | # conditional image captioning 116 | text = "a photography of" 117 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16) 118 | 119 | out = model.generate(**inputs) 120 | print(processor.decode(out[0], skip_special_tokens=True)) 121 | # >>> a photography of a woman and her dog 122 | 123 | # unconditional image captioning 124 | inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16) 125 | 126 | out = model.generate(**inputs) 127 | print(processor.decode(out[0], skip_special_tokens=True)) 128 | # >>> a woman sitting on the beach with her dog 129 | ``` 130 |
131 | 132 | ## BibTeX and citation info 133 | 134 | ``` 135 | @misc{https://doi.org/10.48550/arxiv.2201.12086, 136 | doi = {10.48550/ARXIV.2201.12086}, 137 | 138 | url = {https://arxiv.org/abs/2201.12086}, 139 | 140 | author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven}, 141 | 142 | keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, 143 | 144 | title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation}, 145 | 146 | publisher = {arXiv}, 147 | 148 | year = {2022}, 149 | 150 | copyright = {Creative Commons Attribution 4.0 International} 151 | } 152 | ``` 153 | --------------------------------------------------------------------------------