├── .github
│   ├── timestamp
│   └── workflows
│       └── main.yml
├── test_image.jpg
├── .gitattributes
├── requirements.txt
├── blip-image-captioning-base
│   ├── pytorch_model.bin
│   ├── special_tokens_map.json
│   ├── preprocessor_config.json
│   ├── tokenizer_config.json
│   ├── config.json
│   └── README.md
├── readme.md
├── image-captioning.py
└── .gitignore
/.github/timestamp: -------------------------------------------------------------------------------- 1 | 2025-12-23T05:54:28Z 2 | -------------------------------------------------------------------------------- /test_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/botextractai/ai-image-captioning/HEAD/test_image.jpg -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | blip-image-captioning-base/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/botextractai/ai-image-captioning/HEAD/requirements.txt -------------------------------------------------------------------------------- /blip-image-captioning-base/pytorch_model.bin: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d6638651a5526cc2ede56f2b5104d6851b0755816d220e5e046870430180c767 3 | size 989820849 4 | -------------------------------------------------------------------------------- /blip-image-captioning-base/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /blip-image-captioning-base/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "do_normalize": true, 3 | "do_resize": true, 4 | "image_mean": [ 5 | 0.48145466, 6 | 0.4578275, 7 | 0.40821073 8 | ], 9 | "image_processor_type": "BlipImageProcessor", 10 | "image_std": [ 11 | 0.26862954, 12 | 0.26130258, 13 | 0.27577711 14 | ], 15 | "processor_class": "BlipProcessor", 16 | "size": 384 17 | } 18 | -------------------------------------------------------------------------------- /blip-image-captioning-base/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_basic_tokenize": true, 4 | "do_lower_case": true, 5 | "mask_token": "[MASK]", 6 | "model_max_length": 512, 7 | "name_or_path": "bert-base-uncased", 8 | "never_split": null, 9 | "pad_token": "[PAD]", 10 | "processor_class": "BlipProcessor", 11 | "sep_token": "[SEP]", 12 | "special_tokens_map_file": null, 13 | "strip_accents": null, 14 | "tokenize_chinese_chars": true, 15 | "tokenizer_class": "BertTokenizer", 16 | "unk_token": "[UNK]", 17 | "model_input_names": [ 18 | "input_ids", 19 | "attention_mask" 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Image captioning with a locally stored Large Language Model (LLM)
2 | 3 | This example generates a caption for an image. 4 | 5 | It runs fully locally on your computer and does not require a Graphics Processing Unit (GPU). 6 | 7 | It uses the Salesforce [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://huggingface.co/Salesforce/blip-image-captioning-base) Large Language Model (LLM) and Hugging Face Transformers. Please note that this is a very small model and its capabilities are therefore limited, but the results are still very impressive for its size. To run the example, install the dependencies from `requirements.txt` and then execute the `image-captioning.py` script. 8 | 9 | This example uses this test image: 10 | 11 | ![alt text](https://github.com/botextractai/ai-image-captioning/blob/main/test_image.jpg "Test image") 12 | 13 | to automatically generate this image caption: 14 | 15 | `a set of toy cars and traffic cones` 16 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Timestamp 2 | on: 3 | push: 4 | branches: 5 | - master 6 | schedule: 7 | - cron: '45 5 * * *' 8 | jobs: 9 | auto_commit: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | persist-credentials: false 15 | fetch-depth: 0 16 | - name: Modify timestamp file 17 | run: | 18 | d=`date '+%Y-%m-%dT%H:%M:%SZ'` 19 | echo $d > .github/timestamp 20 | - name: Commit changes 21 | run: | 22 | git config --local user.email "${{ secrets.USEREMAIL }}" 23 | git config --local user.name "${{ secrets.USERNAME }}" 24 | git commit -a -m "Timestamp" 25 | - name: Push Back 26 | uses: ad-m/github-push-action@master 27 | with: 28 | force: true 29 | directory: '.' 30 | github_token: ${{ secrets.GITHUB_TOKEN }} 31 | -------------------------------------------------------------------------------- /image-captioning.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from PIL import Image 3 | from transformers import AutoProcessor 4 | from transformers import BlipForConditionalGeneration 5 | from transformers.utils import logging 6 | 7 | logging.set_verbosity_error() 8 | 9 | # Suppress warning message 10 | warnings.filterwarnings("ignore", message="Using the model-agnostic default `max_length`") 11 | 12 | # Load the Large Language Model (LLM) 13 | model = BlipForConditionalGeneration.from_pretrained( 14 | "./blip-image-captioning-base") 15 | 16 | # Load the processor 17 | processor = AutoProcessor.from_pretrained( 18 | "./blip-image-captioning-base") 19 | 20 | # Load the image 21 | image = Image.open("./test_image.jpg") 22 | 23 | # Create the input 24 | inputs = processor(image, return_tensors="pt") 25 | 26 | # Get the output 27 | out = model.generate(**inputs) 28 | 29 | # Print the output 30 | print(processor.decode(out[0], skip_special_tokens=True)) 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 |
# before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /blip-image-captioning-base/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_commit_hash": null, 3 | "architectures": [ 4 | "BlipForConditionalGeneration" 5 | ], 6 | "image_text_hidden_size": 256, 7 | "initializer_factor": 1.0, 8 | "logit_scale_init_value": 2.6592, 9 | "model_type": "blip", 10 | "projection_dim": 512, 11 | "text_config": { 12 | "_name_or_path": "", 13 | "add_cross_attention": false, 14 | "architectures": null, 15 | "attention_probs_dropout_prob": 0.0, 16 | "bad_words_ids": null, 17 | "begin_suppress_tokens": null, 18 | "bos_token_id": 30522, 19 | "chunk_size_feed_forward": 0, 20 | "cross_attention_hidden_size": null, 21 | "decoder_start_token_id": null, 22 | "diversity_penalty": 0.0, 23 | "do_sample": false, 24 | "early_stopping": false, 25 | "encoder_no_repeat_ngram_size": 0, 26 | "eos_token_id": 2, 27 | "exponential_decay_length_penalty": null, 28 | "finetuning_task": null, 29 | "forced_bos_token_id": null, 30 | "forced_eos_token_id": null, 31 | "hidden_act": "gelu", 32 | "hidden_dropout_prob": 0.0, 33 | "hidden_size": 768, 34 | "id2label": { 35 | "0": "LABEL_0", 36 | "1": "LABEL_1" 37 | }, 38 | "initializer_factor": 1.0, 39 | "initializer_range": 0.02, 40 | "intermediate_size": 3072, 41 | "is_decoder": true, 42 | "is_encoder_decoder": false, 43 | "label2id": { 44 | "LABEL_0": 0, 45 | "LABEL_1": 1 46 | }, 47 | "layer_norm_eps": 1e-12, 48 | "length_penalty": 1.0, 49 | "max_length": 20, 50 | "max_position_embeddings": 512, 51 | "min_length": 0, 52 | "model_type": "blip_text_model", 53 | "no_repeat_ngram_size": 0, 54 | "num_attention_heads": 12, 55 | "num_beam_groups": 1, 56 | "num_beams": 1, 57 | "num_hidden_layers": 12, 58 | "num_return_sequences": 1, 59 | "output_attentions": false, 60 | "output_hidden_states": false, 61 | "output_scores": false, 62 | "pad_token_id": 0, 63 | "prefix": null, 64 | "problem_type": null, 65 | "projection_dim": 768, 66 | "pruned_heads": {}, 67 | "remove_invalid_values": false, 68 | "repetition_penalty": 1.0, 69 | "return_dict": true, 70 | "return_dict_in_generate": false, 71 | "sep_token_id": 102, 72 | "suppress_tokens": null, 73 | "task_specific_params": null, 74 | "temperature": 1.0, 75 | "tf_legacy_loss": false, 76 | "tie_encoder_decoder": false, 77 | "tie_word_embeddings": true, 78 | "tokenizer_class": null, 79 | "top_k": 50, 80 | "top_p": 1.0, 81 | "torch_dtype": null, 82 | "torchscript": false, 83 | "transformers_version": "4.26.0.dev0", 84 | "typical_p": 1.0, 85 | "use_bfloat16": false, 86 | "use_cache": true, 87 | "vocab_size": 30524 88 | }, 89 | "torch_dtype": "float32", 90 | "transformers_version": null, 91 | "vision_config": { 92 | "_name_or_path": "", 93 | "add_cross_attention": false, 94 | "architectures": null, 95 | "attention_dropout": 0.0, 96 | "bad_words_ids": null, 97 | "begin_suppress_tokens": null, 98 | "bos_token_id": null, 99 | "chunk_size_feed_forward": 0, 100 | "cross_attention_hidden_size": null, 101 | "decoder_start_token_id": null, 102 | "diversity_penalty": 0.0, 103 | "do_sample": false, 104 | "dropout": 0.0, 105 | "early_stopping": false, 106 | "encoder_no_repeat_ngram_size": 0, 107 | "eos_token_id": null, 108 | "exponential_decay_length_penalty": null, 109 | "finetuning_task": null, 110 | "forced_bos_token_id": null, 111 | "forced_eos_token_id": null, 112 | "hidden_act": "gelu", 113 | "hidden_size": 768, 114 | "id2label": { 115 | "0": "LABEL_0", 116 | "1": 
"LABEL_1" 117 | }, 118 | "image_size": 384, 119 | "initializer_factor": 1.0, 120 | "initializer_range": 0.02, 121 | "intermediate_size": 3072, 122 | "is_decoder": false, 123 | "is_encoder_decoder": false, 124 | "label2id": { 125 | "LABEL_0": 0, 126 | "LABEL_1": 1 127 | }, 128 | "layer_norm_eps": 1e-05, 129 | "length_penalty": 1.0, 130 | "max_length": 20, 131 | "min_length": 0, 132 | "model_type": "blip_vision_model", 133 | "no_repeat_ngram_size": 0, 134 | "num_attention_heads": 12, 135 | "num_beam_groups": 1, 136 | "num_beams": 1, 137 | "num_channels": 3, 138 | "num_hidden_layers": 12, 139 | "num_return_sequences": 1, 140 | "output_attentions": false, 141 | "output_hidden_states": false, 142 | "output_scores": false, 143 | "pad_token_id": null, 144 | "patch_size": 16, 145 | "prefix": null, 146 | "problem_type": null, 147 | "projection_dim": 512, 148 | "pruned_heads": {}, 149 | "remove_invalid_values": false, 150 | "repetition_penalty": 1.0, 151 | "return_dict": true, 152 | "return_dict_in_generate": false, 153 | "sep_token_id": null, 154 | "suppress_tokens": null, 155 | "task_specific_params": null, 156 | "temperature": 1.0, 157 | "tf_legacy_loss": false, 158 | "tie_encoder_decoder": false, 159 | "tie_word_embeddings": true, 160 | "tokenizer_class": null, 161 | "top_k": 50, 162 | "top_p": 1.0, 163 | "torch_dtype": null, 164 | "torchscript": false, 165 | "transformers_version": "4.26.0.dev0", 166 | "typical_p": 1.0, 167 | "use_bfloat16": false 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /blip-image-captioning-base/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | pipeline_tag: image-to-text 3 | tags: 4 | - image-captioning 5 | languages: 6 | - en 7 | license: bsd-3-clause 8 | --- 9 | 10 | # BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation 11 | 12 | Model card for image captioning pretrained on COCO dataset - base architecture (with ViT base backbone). 13 | 14 | | ![BLIP.gif](https://cdn-uploads.huggingface.co/production/uploads/1670928184033-62441d1d9fdefb55a0b7d12c.gif) | 15 | |:--:| 16 | | Pull figure from BLIP official repo | Image source: https://github.com/salesforce/BLIP | 17 | 18 | ## TL;DR 19 | 20 | Authors from the [paper](https://arxiv.org/abs/2201.12086) write in the abstract: 21 | 22 | *Vision-Language Pre-training (VLP) has advanced the performance for many vision-language tasks. However, most existing pre-trained models only excel in either understanding-based tasks or generation-based tasks. Furthermore, performance improvement has been largely achieved by scaling up the dataset with noisy image-text pairs collected from the web, which is a suboptimal source of supervision. In this paper, we propose BLIP, a new VLP framework which transfers flexibly to both vision-language understanding and generation tasks. BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). BLIP also demonstrates strong generalization ability when directly transferred to videolanguage tasks in a zero-shot manner. 
Code, models, and datasets are released.* 23 | 24 | ## Usage 25 | 26 | You can use this model for conditional and unconditional image captioning. 27 | 28 | ### Using the PyTorch model 29 | 30 | #### Running the model on CPU 31 | 32 |
33 | 34 | 35 | ```python 36 | import requests 37 | from PIL import Image 38 | from transformers import BlipProcessor, BlipForConditionalGeneration 39 | 40 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 41 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") 42 | 43 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 44 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 45 | 46 | # conditional image captioning 47 | text = "a photography of" 48 | inputs = processor(raw_image, text, return_tensors="pt") 49 | 50 | out = model.generate(**inputs) 51 | print(processor.decode(out[0], skip_special_tokens=True)) 52 | # >>> a photography of a woman and her dog 53 | 54 | # unconditional image captioning 55 | inputs = processor(raw_image, return_tensors="pt") 56 | 57 | out = model.generate(**inputs) 58 | print(processor.decode(out[0], skip_special_tokens=True)) 59 | # >>> a woman sitting on the beach with her dog 60 | ``` 61 |
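This repository also ships the same checkpoint locally in the `blip-image-captioning-base` folder, so the CPU example above can be run fully offline against that copy. A minimal sketch, assuming it is run from the repository root and reuses the bundled `test_image.jpg` (the local paths come from this repository's layout, not from the original model card):

```python
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the processor and model from the locally stored copy instead of the Hugging Face Hub
processor = BlipProcessor.from_pretrained("./blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("./blip-image-captioning-base")

# Use the test image bundled with this repository
raw_image = Image.open("./test_image.jpg").convert("RGB")

# Unconditional image captioning, as in image-captioning.py
inputs = processor(raw_image, return_tensors="pt")
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))
# >>> a set of toy cars and traffic cones
```

Apart from the local paths (and using `BlipProcessor` directly instead of `AutoProcessor`), this mirrors what `image-captioning.py` in this repository does.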
62 | 63 | #### Running the model on GPU 64 | 65 | ##### In full precision 66 | 67 |
68 | 69 | 70 | ```python 71 | import requests 72 | from PIL import Image 73 | from transformers import BlipProcessor, BlipForConditionalGeneration 74 | 75 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 76 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cuda") 77 | 78 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 79 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 80 | 81 | # conditional image captioning 82 | text = "a photography of" 83 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda") 84 | 85 | out = model.generate(**inputs) 86 | print(processor.decode(out[0], skip_special_tokens=True)) 87 | # >>> a photography of a woman and her dog 88 | 89 | # unconditional image captioning 90 | inputs = processor(raw_image, return_tensors="pt").to("cuda") 91 | 92 | out = model.generate(**inputs) 93 | print(processor.decode(out[0], skip_special_tokens=True)) 94 | # >>> a woman sitting on the beach with her dog 95 | ``` 96 |
97 | 98 | ##### In half precision (`float16`) 99 | 100 |
101 | 102 | 103 | ```python 104 | import torch 105 | import requests 106 | from PIL import Image 107 | from transformers import BlipProcessor, BlipForConditionalGeneration 108 | 109 | processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") 110 | model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", torch_dtype=torch.float16).to("cuda") 111 | 112 | img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 113 | raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 114 | 115 | # conditional image captioning 116 | text = "a photography of" 117 | inputs = processor(raw_image, text, return_tensors="pt").to("cuda", torch.float16) 118 | 119 | out = model.generate(**inputs) 120 | print(processor.decode(out[0], skip_special_tokens=True)) 121 | # >>> a photography of a woman and her dog 122 | 123 | # unconditional image captioning 124 | inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16) 125 | 126 | out = model.generate(**inputs) 127 | print(processor.decode(out[0], skip_special_tokens=True)) 128 | # >>> a woman sitting on the beach with her dog 129 | ``` 130 |
131 | 132 | ## BibTeX and citation info 133 | 134 | ``` 135 | @misc{https://doi.org/10.48550/arxiv.2201.12086, 136 | doi = {10.48550/ARXIV.2201.12086}, 137 | 138 | url = {https://arxiv.org/abs/2201.12086}, 139 | 140 | author = {Li, Junnan and Li, Dongxu and Xiong, Caiming and Hoi, Steven}, 141 | 142 | keywords = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences, FOS: Computer and information sciences}, 143 | 144 | title = {BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation}, 145 | 146 | publisher = {arXiv}, 147 | 148 | year = {2022}, 149 | 150 | copyright = {Creative Commons Attribution 4.0 International} 151 | } 152 | ``` 153 | --------------------------------------------------------------------------------