├── .github
│   ├── timestamp
│   └── workflows
│       └── main.yml
├── test_image.jpg
├── .gitattributes
├── requirements.txt
├── blip-image-captioning-base
│   ├── pytorch_model.bin
│   ├── special_tokens_map.json
│   ├── preprocessor_config.json
│   ├── tokenizer_config.json
│   ├── config.json
│   └── README.md
├── readme.md
├── image-captioning.py
└── .gitignore
/.github/timestamp:
--------------------------------------------------------------------------------
1 | 2025-12-23T05:54:28Z
2 |
--------------------------------------------------------------------------------
/test_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/botextractai/ai-image-captioning/HEAD/test_image.jpg
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | blip-image-captioning-base/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/botextractai/ai-image-captioning/HEAD/requirements.txt
--------------------------------------------------------------------------------
/blip-image-captioning-base/pytorch_model.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d6638651a5526cc2ede56f2b5104d6851b0755816d220e5e046870430180c767
3 | size 989820849
4 |
--------------------------------------------------------------------------------
/blip-image-captioning-base/special_tokens_map.json:
--------------------------------------------------------------------------------
1 | {
2 | "cls_token": "[CLS]",
3 | "mask_token": "[MASK]",
4 | "pad_token": "[PAD]",
5 | "sep_token": "[SEP]",
6 | "unk_token": "[UNK]"
7 | }
8 |
--------------------------------------------------------------------------------
/blip-image-captioning-base/preprocessor_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "do_normalize": true,
3 | "do_resize": true,
4 | "image_mean": [
5 | 0.48145466,
6 | 0.4578275,
7 | 0.40821073
8 | ],
9 | "image_processor_type": "BlipImageProcessor",
10 | "image_std": [
11 | 0.26862954,
12 | 0.26130258,
13 | 0.27577711
14 | ],
15 | "processor_class": "BlipProcessor",
16 | "size": 384
17 | }
18 |
--------------------------------------------------------------------------------
/blip-image-captioning-base/tokenizer_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "cls_token": "[CLS]",
3 | "do_basic_tokenize": true,
4 | "do_lower_case": true,
5 | "mask_token": "[MASK]",
6 | "model_max_length": 512,
7 | "name_or_path": "bert-base-uncased",
8 | "never_split": null,
9 | "pad_token": "[PAD]",
10 | "processor_class": "BlipProcessor",
11 | "sep_token": "[SEP]",
12 | "special_tokens_map_file": null,
13 | "strip_accents": null,
14 | "tokenize_chinese_chars": true,
15 | "tokenizer_class": "BertTokenizer",
16 | "unk_token": "[UNK]",
17 | "model_input_names": [
18 | "input_ids",
19 | "attention_mask"
20 | ]
21 | }
22 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Image captioning with a locally stored Large Language Model (LLM)
2 |
3 | This example generates a caption of an image.
4 |
5 | It runs fully locally on your computer and does not require a Graphics Processing Unit (GPU).
6 | 
7 | It uses the Salesforce [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://huggingface.co/Salesforce/blip-image-captioning-base) Large Language Model (LLM) and Hugging Face Transformers. Please note that this is a very small model and its capabilities are therefore limited, but the results are still very impressive for its size.
8 |
9 | This example uses this test image:
10 |
11 | 
12 |
13 | to automatically generate this image caption:
14 |
15 | `a set of toy cars and traffic cones`
16 |
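17 | ## Conditional image captioning
18 | 
19 | The `image-captioning.py` script generates an unconditional caption. The same locally stored model can also complete a text prompt (conditional captioning), as documented in `blip-image-captioning-base/README.md`. The following is a minimal sketch, not part of the script, assuming the example prompt `"a photography of"` from that model card:
20 | 
21 | ```python
22 | from PIL import Image
23 | from transformers import AutoProcessor, BlipForConditionalGeneration
24 | 
25 | # Load the locally stored model and processor (same folder as in image-captioning.py)
26 | model = BlipForConditionalGeneration.from_pretrained("./blip-image-captioning-base")
27 | processor = AutoProcessor.from_pretrained("./blip-image-captioning-base")
28 | 
29 | # Conditional captioning: the prompt text becomes the start of the generated caption
30 | image = Image.open("./test_image.jpg")
31 | inputs = processor(image, "a photography of", return_tensors="pt")
32 | 
33 | out = model.generate(**inputs)
34 | print(processor.decode(out[0], skip_special_tokens=True))
35 | ```
36 | 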
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: Timestamp
2 | on:
3 |   push:
4 |     branches:
5 |       - master
6 |   schedule:
7 |     - cron: '45 5 * * *'
8 | jobs:
9 |   auto_commit:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |         with:
14 |           persist-credentials: false
15 |           fetch-depth: 0
16 |       - name: Modify timestamp file
17 |         run: |
18 |           d=`date '+%Y-%m-%dT%H:%M:%SZ'`
19 |           echo $d > .github/timestamp
20 |       - name: Commit changes
21 |         run: |
22 |           git config --local user.email "${{ secrets.USEREMAIL }}"
23 |           git config --local user.name "${{ secrets.USERNAME }}"
24 |           git commit -a -m "Timestamp"
25 |       - name: Push Back
26 |         uses: ad-m/github-push-action@master
27 |         with:
28 |           force: true
29 |           directory: '.'
30 |           github_token: ${{ secrets.GITHUB_TOKEN }}
31 |
--------------------------------------------------------------------------------
/image-captioning.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from PIL import Image
3 | from transformers import AutoProcessor
4 | from transformers import BlipForConditionalGeneration
5 | from transformers.utils import logging
6 |
7 | logging.set_verbosity_error()
8 |
9 | # Suppress warning message
10 | warnings.filterwarnings("ignore", message="Using the model-agnostic default `max_length`")
11 |
12 | # Load the Large Language Model (LLM)
13 | model = BlipForConditionalGeneration.from_pretrained(
14 |     "./blip-image-captioning-base")
15 | 
16 | # Load the processor
17 | processor = AutoProcessor.from_pretrained(
18 |     "./blip-image-captioning-base")
19 |
20 | # Load the image
21 | image = Image.open("./test_image.jpg")
22 |
23 | # Create the input
24 | inputs = processor(image, return_tensors="pt")
25 |
26 | # Get the output
27 | out = model.generate(**inputs)
28 |
29 | # Print the output
30 | print(processor.decode(out[0], skip_special_tokens=True))
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/blip-image-captioning-base/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "_commit_hash": null,
3 | "architectures": [
4 | "BlipForConditionalGeneration"
5 | ],
6 | "image_text_hidden_size": 256,
7 | "initializer_factor": 1.0,
8 | "logit_scale_init_value": 2.6592,
9 | "model_type": "blip",
10 | "projection_dim": 512,
11 | "text_config": {
12 | "_name_or_path": "",
13 | "add_cross_attention": false,
14 | "architectures": null,
15 | "attention_probs_dropout_prob": 0.0,
16 | "bad_words_ids": null,
17 | "begin_suppress_tokens": null,
18 | "bos_token_id": 30522,
19 | "chunk_size_feed_forward": 0,
20 | "cross_attention_hidden_size": null,
21 | "decoder_start_token_id": null,
22 | "diversity_penalty": 0.0,
23 | "do_sample": false,
24 | "early_stopping": false,
25 | "encoder_no_repeat_ngram_size": 0,
26 | "eos_token_id": 2,
27 | "exponential_decay_length_penalty": null,
28 | "finetuning_task": null,
29 | "forced_bos_token_id": null,
30 | "forced_eos_token_id": null,
31 | "hidden_act": "gelu",
32 | "hidden_dropout_prob": 0.0,
33 | "hidden_size": 768,
34 | "id2label": {
35 | "0": "LABEL_0",
36 | "1": "LABEL_1"
37 | },
38 | "initializer_factor": 1.0,
39 | "initializer_range": 0.02,
40 | "intermediate_size": 3072,
41 | "is_decoder": true,
42 | "is_encoder_decoder": false,
43 | "label2id": {
44 | "LABEL_0": 0,
45 | "LABEL_1": 1
46 | },
47 | "layer_norm_eps": 1e-12,
48 | "length_penalty": 1.0,
49 | "max_length": 20,
50 | "max_position_embeddings": 512,
51 | "min_length": 0,
52 | "model_type": "blip_text_model",
53 | "no_repeat_ngram_size": 0,
54 | "num_attention_heads": 12,
55 | "num_beam_groups": 1,
56 | "num_beams": 1,
57 | "num_hidden_layers": 12,
58 | "num_return_sequences": 1,
59 | "output_attentions": false,
60 | "output_hidden_states": false,
61 | "output_scores": false,
62 | "pad_token_id": 0,
63 | "prefix": null,
64 | "problem_type": null,
65 | "projection_dim": 768,
66 | "pruned_heads": {},
67 | "remove_invalid_values": false,
68 | "repetition_penalty": 1.0,
69 | "return_dict": true,
70 | "return_dict_in_generate": false,
71 | "sep_token_id": 102,
72 | "suppress_tokens": null,
73 | "task_specific_params": null,
74 | "temperature": 1.0,
75 | "tf_legacy_loss": false,
76 | "tie_encoder_decoder": false,
77 | "tie_word_embeddings": true,
78 | "tokenizer_class": null,
79 | "top_k": 50,
80 | "top_p": 1.0,
81 | "torch_dtype": null,
82 | "torchscript": false,
83 | "transformers_version": "4.26.0.dev0",
84 | "typical_p": 1.0,
85 | "use_bfloat16": false,
86 | "use_cache": true,
87 | "vocab_size": 30524
88 | },
89 | "torch_dtype": "float32",
90 | "transformers_version": null,
91 | "vision_config": {
92 | "_name_or_path": "",
93 | "add_cross_attention": false,
94 | "architectures": null,
95 | "attention_dropout": 0.0,
96 | "bad_words_ids": null,
97 | "begin_suppress_tokens": null,
98 | "bos_token_id": null,
99 | "chunk_size_feed_forward": 0,
100 | "cross_attention_hidden_size": null,
101 | "decoder_start_token_id": null,
102 | "diversity_penalty": 0.0,
103 | "do_sample": false,
104 | "dropout": 0.0,
105 | "early_stopping": false,
106 | "encoder_no_repeat_ngram_size": 0,
107 | "eos_token_id": null,
108 | "exponential_decay_length_penalty": null,
109 | "finetuning_task": null,
110 | "forced_bos_token_id": null,
111 | "forced_eos_token_id": null,
112 | "hidden_act": "gelu",
113 | "hidden_size": 768,
114 | "id2label": {
115 | "0": "LABEL_0",
116 | "1": "LABEL_1"
117 | },
118 | "image_size": 384,
119 | "initializer_factor": 1.0,
120 | "initializer_range": 0.02,
121 | "intermediate_size": 3072,
122 | "is_decoder": false,
123 | "is_encoder_decoder": false,
124 | "label2id": {
125 | "LABEL_0": 0,
126 | "LABEL_1": 1
127 | },
128 | "layer_norm_eps": 1e-05,
129 | "length_penalty": 1.0,
130 | "max_length": 20,
131 | "min_length": 0,
132 | "model_type": "blip_vision_model",
133 | "no_repeat_ngram_size": 0,
134 | "num_attention_heads": 12,
135 | "num_beam_groups": 1,
136 | "num_beams": 1,
137 | "num_channels": 3,
138 | "num_hidden_layers": 12,
139 | "num_return_sequences": 1,
140 | "output_attentions": false,
141 | "output_hidden_states": false,
142 | "output_scores": false,
143 | "pad_token_id": null,
144 | "patch_size": 16,
145 | "prefix": null,
146 | "problem_type": null,
147 | "projection_dim": 512,
148 | "pruned_heads": {},
149 | "remove_invalid_values": false,
150 | "repetition_penalty": 1.0,
151 | "return_dict": true,
152 | "return_dict_in_generate": false,
153 | "sep_token_id": null,
154 | "suppress_tokens": null,
155 | "task_specific_params": null,
156 | "temperature": 1.0,
157 | "tf_legacy_loss": false,
158 | "tie_encoder_decoder": false,
159 | "tie_word_embeddings": true,
160 | "tokenizer_class": null,
161 | "top_k": 50,
162 | "top_p": 1.0,
163 | "torch_dtype": null,
164 | "torchscript": false,
165 | "transformers_version": "4.26.0.dev0",
166 | "typical_p": 1.0,
167 | "use_bfloat16": false
168 | }
169 | }
170 |
--------------------------------------------------------------------------------
/blip-image-captioning-base/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | pipeline_tag: image-to-text
3 | tags:
4 | - image-captioning
5 | languages:
6 | - en
7 | license: bsd-3-clause
8 | ---
9 |
10 | # BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation
11 |
12 | Model card for image captioning pretrained on COCO dataset - base architecture (with ViT base backbone).
13 |
14 | |  |
15 | |:--:|
16 | | Pull figure from the BLIP official repo. Image source: https://github.com/salesforce/BLIP |
17 |
18 | ## TL;DR
19 |
20 | The authors of the [paper](https://arxiv.org/abs/2201.12086) write in the abstract:
21 |
22 | *Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to video-language tasks in a zero-shot manner. Code, models, and datasets are released.*
23 |
24 | ## Usage
25 |
26 | You can use this model for conditional and unconditional image captioning.
27 |
28 | ### Using the PyTorch model
29 |
30 | #### Running the model on CPU
31 |
32 |
33 | 
34 |
35 | ```python
36 | import requests
37 | from PIL import Image
38 | from transformers import BlipProcessor, BlipForConditionalGeneration
39 |
40 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
41 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
42 |
43 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
44 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
45 |
46 | # conditional image captioning
47 | text = "a photography of"
48 | inputs = processor(raw_image, text, return_tensors="pt")
49 |
50 | out = model.generate(**inputs)
51 | print(processor.decode(out[0], skip_special_tokens=True))
52 | # >>> a photography of a woman and her dog
53 |
54 | # unconditional image captioning
55 | inputs = processor(raw_image, return_tensors="pt")
56 |
57 | out = model.generate(**inputs)
58 | print(processor.decode(out[0], skip_special_tokens=True))
59 | # >>> a woman sitting on the beach with her dog
60 | ```
61 |
62 |
63 | #### Running the model on GPU
64 |
65 | ##### In full precision
66 |
67 |
68 | 
69 |
70 | ```python
71 | import requests
72 | from PIL import Image
73 | from transformers import BlipProcessor, BlipForConditionalGeneration
74 |
75 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
76 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda")
77 |
78 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
79 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
80 |
81 | # conditional image captioning
82 | text = "a photography of"
83 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda")
84 |
85 | out = model.generate(**inputs)
86 | print(processor.decode(out[0], skip_special_tokens=True))
87 | # >>> a photography of a woman and her dog
88 |
89 | # unconditional image captioning
90 | inputs = processor(raw_image, return_tensors="pt").to("cuda")
91 |
92 | out = model.generate(**inputs)
93 | print(processor.decode(out[0], skip_special_tokens=True))
94 | # >>> a woman sitting on the beach with her dog
95 | ```
96 |
97 |
98 | ##### In half precision (`float16`)
99 |
100 |
101 | 
102 |
103 | ```python
104 | import torch
105 | import requests
106 | from PIL import Image
107 | from transformers import BlipProcessor, BlipForConditionalGeneration
108 |
109 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
110 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to("cuda")
111 |
112 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
113 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
114 |
115 | # conditional image captioning
116 | text = "a photography of"
117 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16)
118 |
119 | out = model.generate(**inputs)
120 | print(processor.decode(out[0], skip_special_tokens=True))
121 | # >>> a photography of a woman and her dog
122 |
123 | # unconditional image captioning
124 | inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
125 |
126 | out = model.generate(**inputs)
127 | print(processor.decode(out[0], skip_special_tokens=True))
128 | # >>> a woman sitting on the beach with her dog
129 | ```
130 |
131 |
132 | ## BibTex and citation info
133 |
134 | ```
135 | @misc{https://doi.org/10.48550/arxiv.2201.12086,
136 | doi = {10.48550/ARXIV.2201.12086},
137 |
138 | url = {https://arxiv.org/abs/2201.12086},
139 |
140 | author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven},
141 |
142 | keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences},
143 |
144 | title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation},
145 |
146 | publisher = {arXiv},
147 |
148 | year = {2022},
149 |
150 | copyright = {Creative Commons Attribution 4.0 International}
151 | }
152 | ```
153 |
--------------------------------------------------------------------------------