├── .github └── ISSUE_TEMPLATE │ ├── bug_report.yaml │ ├── config.yaml │ └── feature_request.yaml ├── .gitignore ├── BUILD.md ├── Dockerfile.qwendemo ├── Dockerfile.qwenint4openai ├── Dockerfile.qwenopenai ├── FAQ.md ├── FAQ_ja.md ├── FAQ_ko.md ├── FAQ_zh.md ├── LICENSE ├── NOTICE ├── README.md ├── README_CN.md ├── README_JA.md ├── README_KO.md ├── TUTORIAL.md ├── TUTORIAL_ja.md ├── TUTORIAL_ko.md ├── TUTORIAL_zh.md ├── assets ├── apple.jpeg ├── apple_r.jpeg ├── demo.jpeg ├── demo_highfive.jpg ├── demo_spotting_caption.jpg ├── demo_vl.gif ├── logo.jpg ├── mm_tutorial │ ├── Beijing.jpeg │ ├── Beijing_Small.jpeg │ ├── Chongqing.jpeg │ ├── Chongqing_Small.jpeg │ ├── Hospital.jpg │ ├── Hospital_Small.jpg │ ├── Menu.jpeg │ ├── Rebecca_(1939_poster).jpeg │ ├── Rebecca_(1939_poster)_Small.jpeg │ ├── Shanghai.jpg │ ├── Shanghai_Output.jpg │ ├── Shanghai_Output_Small.jpeg │ ├── Shanghai_Small.jpeg │ └── TUTORIAL.ipynb ├── qwenvl.jpeg ├── radar.png ├── radar_qwenvlplus.jpg ├── touchstone_datasets.jpg ├── touchstone_eval.png ├── touchstone_logo.png └── wechat.png ├── eval_mm ├── EVALUATION.md ├── data ├── evaluate_caption.py ├── evaluate_grounding.py ├── evaluate_multiple_choice.py ├── evaluate_vqa.py ├── infographicsvqa_eval.py ├── mmbench │ ├── MMBENCH.md │ ├── evaluate_multiple_choice_mmbench.py │ ├── mmbench_converter_dev.py │ ├── mmbench_converter_test.py │ ├── mmbench_evaluation.py │ ├── mmbench_evaluation_tricky.py │ └── mmbench_predict_to_submission.py ├── mme │ ├── EVAL_MME.md │ ├── cognition.jpg │ ├── eval.py │ ├── get_images.py │ └── perception.jpg ├── seed_bench │ ├── EVAL_SEED.md │ ├── eval.py │ ├── leaderboard.jpg │ └── trans.py ├── vqa.py └── vqa_eval.py ├── finetune.py ├── finetune ├── ds_config_zero2.json ├── ds_config_zero3.json ├── finetune_ds.sh ├── finetune_lora_ds.sh ├── finetune_lora_single_gpu.sh ├── finetune_qlora_ds.sh └── finetune_qlora_single_gpu.sh ├── openai_api.py ├── requirements.txt ├── requirements_openai_api.txt ├── requirements_web_demo.txt ├── touchstone ├── README.md ├── README_CN.md ├── README_JA.md └── README_KO.md └── web_demo_mm.py /.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: 🐞 Bug 2 | description: 提交错误报告 | File a bug/issue 3 | title: "[BUG] " 4 | labels: [] 5 | body: 6 | - type: checkboxes 7 | attributes: 8 | label: 是否已有关于该错误的issue或讨论? | Is there an existing issue / discussion for this? 9 | description: | 10 | 请先搜索您遇到的错误是否在已有的issues或讨论中提到过。 11 | Please search to see if an issue / discussion already exists for the bug you encountered. 12 | [Issues](https://github.com/QwenLM/Qwen-7B/issues) 13 | [Discussions](https://github.com/QwenLM/Qwen-7B/discussions) 14 | options: 15 | - label: 我已经搜索过已有的issues和讨论 | I have searched the existing issues / discussions 16 | required: true 17 | - type: checkboxes 18 | attributes: 19 | label: 该问题是否在FAQ中有解答? | Is there an existing answer for this in FAQ? 20 | description: | 21 | 请先搜索您遇到的错误是否已在FAQ中有相关解答。 22 | Please search to see if an answer already exists in FAQ for the bug you encountered. 23 | [FAQ-en](https://github.com/QwenLM/Qwen-7B/blob/main/FAQ.md) 24 | [FAQ-zh](https://github.com/QwenLM/Qwen-7B/blob/main/FAQ_zh.md) 25 | options: 26 | - label: 我已经搜索过FAQ | I have searched FAQ 27 | required: true 28 | - type: textarea 29 | attributes: 30 | label: 当前行为 | Current Behavior 31 | description: | 32 | 准确描述遇到的行为。 33 | A concise description of what you're experiencing. 
34 | validations: 35 | required: false 36 | - type: textarea 37 | attributes: 38 | label: 期望行为 | Expected Behavior 39 | description: | 40 | 准确描述预期的行为。 41 | A concise description of what you expected to happen. 42 | validations: 43 | required: false 44 | - type: textarea 45 | attributes: 46 | label: 复现方法 | Steps To Reproduce 47 | description: | 48 | 复现当前行为的详细步骤。 49 | Steps to reproduce the behavior. 50 | placeholder: | 51 | 1. In this environment... 52 | 2. With this config... 53 | 3. Run '...' 54 | 4. See error... 55 | validations: 56 | required: false 57 | - type: textarea 58 | attributes: 59 | label: 运行环境 | Environment 60 | description: | 61 | examples: 62 | - **OS**: Ubuntu 20.04 63 | - **Python**: 3.8 64 | - **Transformers**: 4.31.0 65 | - **PyTorch**: 2.0.1 66 | - **CUDA**: 11.4 67 | value: | 68 | - OS: 69 | - Python: 70 | - Transformers: 71 | - PyTorch: 72 | - CUDA (`python -c 'import torch; print(torch.version.cuda)'`): 73 | render: Markdown 74 | validations: 75 | required: false 76 | - type: textarea 77 | attributes: 78 | label: 备注 | Anything else? 79 | description: | 80 | 您可以在这里补充其他关于该问题背景信息的描述、链接或引用等。 81 | 82 | 您可以通过点击高亮此区域然后拖动文件的方式上传图片或日志文件。 83 | 84 | Links? References? Anything that will give us more context about the issue you are encountering! 85 | 86 | Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in. 87 | validations: 88 | required: false 89 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yaml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: "💡 Feature Request" 2 | description: 创建新功能请求 | Create a new ticket for a new feature request 3 | title: "💡 [REQUEST] - <title>" 4 | labels: [ 5 | "question" 6 | ] 7 | body: 8 | - type: input 9 | id: start_date 10 | attributes: 11 | label: "起始日期 | Start Date" 12 | description: | 13 | 起始开发日期 14 | Start of development 15 | placeholder: "month/day/year" 16 | validations: 17 | required: false 18 | - type: textarea 19 | id: implementation_pr 20 | attributes: 21 | label: "实现PR | Implementation PR" 22 | description: | 23 | 实现该功能的Pull request 24 | Pull request used 25 | placeholder: "#Pull Request ID" 26 | validations: 27 | required: false 28 | - type: textarea 29 | id: reference_issues 30 | attributes: 31 | label: "相关Issues | Reference Issues" 32 | description: | 33 | 与该功能相关的issues 34 | Common issues 35 | placeholder: "#Issues IDs" 36 | validations: 37 | required: false 38 | - type: textarea 39 | id: summary 40 | attributes: 41 | label: "摘要 | Summary" 42 | description: | 43 | 简要描述新功能的特点 44 | Provide a brief explanation of the feature 45 | placeholder: | 46 | Describe in a few lines your feature request 47 | validations: 48 | required: true 49 | - type: textarea 50 | id: basic_example 51 | attributes: 52 | label: "基本示例 | Basic Example" 53 | description: Indicate here some basic examples of your feature. 54 | placeholder: A few specific words about your feature request. 55 | validations: 56 | required: true 57 | - type: textarea 58 | id: drawbacks 59 | attributes: 60 | label: "缺陷 | Drawbacks" 61 | description: | 62 | 该新功能有哪些缺陷/可能造成哪些影响? 63 | What are the drawbacks/impacts of your feature request ? 
64 | placeholder: | 65 | Identify the drawbacks and impacts while being neutral on your feature request 66 | validations: 67 | required: true 68 | - type: textarea 69 | id: unresolved_question 70 | attributes: 71 | label: "未解决问题 | Unresolved questions" 72 | description: | 73 | 有哪些尚未解决的问题? 74 | What questions still remain unresolved ? 75 | placeholder: | 76 | Identify any unresolved issues. 77 | validations: 78 | required: false -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.so 3 | build 4 | .coverage_* 5 | *.egg-info 6 | *~ 7 | .vscode/ 8 | .idea/ 9 | .DS_Store 10 | 11 | /private/ 12 | Qwen-VL-Chat/ 13 | Qwen-VL-Chat-Int4/ 14 | SimSun.ttf 15 | -------------------------------------------------------------------------------- /BUILD.md: -------------------------------------------------------------------------------- 1 | ## qwen web demo 2 | 3 | ### build 4 | 5 | ``` 6 | docker build -t qwen-vl-chat:webdemo --platform linux/amd64 -f Dockerfile.qwendemo . 7 | ``` 8 | 9 | ### run 10 | 11 | ``` 12 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat -p 8000:8000 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:webdemo 13 | ``` 14 | 15 | ## qwen openai api 16 | 17 | ### build 18 | 19 | ``` 20 | docker build -t qwen-vl-chat:openai --platform linux/amd64 -f Dockerfile.qwenopenai . 21 | ``` 22 | 23 | ### run 24 | 25 | ``` 26 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat -p 8080:8080 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:openai 27 | ``` 28 | 29 | ## qwen-int4 openai api 30 | 31 | ### build 32 | 33 | ``` 34 | docker build -t qwen-vl-chat:int4-openai --platform linux/amd64 -f Dockerfile.qwenint4openai . 35 | ``` 36 | 37 | ### run 38 | 39 | ``` 40 | docker run -it --gpus device=0 -d --restart always -v /var/run/docker.sock:/var/run/docker.sock --name qwen-vl-chat-int4 -p 8080:8080 --user=20001:20001 --platform linux/amd64 qwen-vl-chat:int4-openai 41 | ``` 42 | -------------------------------------------------------------------------------- /Dockerfile.qwendemo: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
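# Added note (a sketch, not part of the original build flow): the Qwen-VL-Chat checkpoint
# is baked into this image by the `git clone https://huggingface.co/Qwen/Qwen-VL-Chat` step
# further below. If the checkpoint has already been downloaded next to this Dockerfile, one
# alternative is to comment out that clone step and uncomment the corresponding
# `COPY --chown=20001:20001 Qwen-VL-Chat ./Qwen-VL-Chat` line; another is to mount it at
# run time, for example:
#   docker run ... -v /path/to/Qwen-VL-Chat:/var/app/Qwen-VL-Chat qwen-vl-chat:webdemo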
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat 40 | # COPY --chown=20001:20001 Qwen-VL-Chat ./Qwen-VL-Chat 41 | # copy fonts 42 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 43 | # COPY --chown=20001:20001 SimSun.ttf ./ 44 | # copy main app 45 | COPY --chown=20001:20001 web_demo_mm.py ./ 46 | 47 | EXPOSE 8000 48 | CMD ["python3", "web_demo_mm.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8000"] 49 | -------------------------------------------------------------------------------- /Dockerfile.qwenint4openai: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat-Int4 40 | # COPY --chown=20001:20001 Qwen-VL-Chat-Int4 ./Qwen-VL-Chat-Int4 41 | 42 | # Install AutoGPTQ 43 | RUN pip install optimum 44 | # RUN git clone https://github.com/JustinLin610/AutoGPTQ.git && \ 45 | # cd AutoGPTQ && \ 46 | # pip install -v . 47 | RUN pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu117/ 48 | 49 | # Install OpenAI API dependencies 50 | WORKDIR ${workdir} 51 | COPY requirements_openai_api.txt ./ 52 | RUN pip install -r requirements_openai_api.txt 53 | # copy fonts 54 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 55 | # COPY --chown=20001:20001 SimSun.ttf ./ 56 | # copy main app 57 | COPY --chown=20001:20001 openai_api.py ./ 58 | 59 | EXPOSE 8080 60 | # CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8080"] 61 | CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat-Int4", "--server-name", "0.0.0.0", "--server-port", "8080"] 62 | -------------------------------------------------------------------------------- /Dockerfile.qwenopenai: -------------------------------------------------------------------------------- 1 | # python 3.8 and above 2 | # pytorch 1.12 and above, 2.0 and above are recommended 3 | # CUDA 11.4 and above are recommended (this is for GPU users, flash-attention users, etc.) 
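# Added note (a sketch, not part of the original instructions): once a container built from
# this file (or from Dockerfile.qwenint4openai) is running as described in BUILD.md, it can
# be smoke-tested with a request to the OpenAI-compatible endpoint that openai_api.py is
# assumed to expose on port 8080, for example:
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Qwen-VL-Chat", "messages": [{"role": "user", "content": "Hello!"}]}'
# The route, model name and payload shape here follow the usual OpenAI chat format; adjust
# them to whatever openai_api.py actually accepts.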
4 | 5 | # based on modelscope docker image 6 | # registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 7 | # registry.cn-beijing.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 8 | FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-1.8.0 9 | 10 | ARG workdir=/var/app 11 | RUN mkdir -p ${workdir} 12 | 13 | RUN git lfs install 14 | 15 | WORKDIR ${workdir} 16 | COPY requirements.txt requirements_web_demo.txt ./ 17 | 18 | # Install Qwen dependencies 19 | RUN pip install -r requirements.txt 20 | 21 | # Install webUI dependencies 22 | WORKDIR ${workdir} 23 | RUN pip install -r requirements_web_demo.txt 24 | 25 | # Offline mode, check https://huggingface.co/docs/transformers/v4.15.0/installation#offline-mode 26 | ENV HF_DATASETS_OFFLINE=1 27 | ENV TRANSFORMERS_OFFLINE=1 28 | 29 | # set TZ, make logs dir, and expose port 8080 30 | ENV TZ=Asia/Shanghai 31 | RUN mkdir -p ${workdir}/logs && chmod 777 ${workdir}/logs 32 | VOLUME /var/app/logs 33 | 34 | # create user 20001 35 | RUN useradd -r -m appuser -u 20001 -g 0 36 | 37 | WORKDIR ${workdir} 38 | # copy model 39 | RUN git clone https://huggingface.co/Qwen/Qwen-VL-Chat 40 | # COPY --chown=20001:20001 Qwen-VL-Chat ./Qwen-VL-Chat 41 | 42 | # Install OpenAI API dependencies 43 | WORKDIR ${workdir} 44 | COPY requirements_openai_api.txt ./ 45 | RUN pip install -r requirements_openai_api.txt 46 | # copy fonts 47 | ADD --chown=20001:20001 https://github.com/StellarCN/scp_zh/raw/master/fonts/SimSun.ttf ./ 48 | # COPY --chown=20001:20001 SimSun.ttf ./ 49 | # copy main app 50 | COPY --chown=20001:20001 openai_api.py ./ 51 | 52 | EXPOSE 8080 53 | CMD ["python3", "openai_api.py", "-c", "./Qwen-VL-Chat", "--server-name", "0.0.0.0", "--server-port", "8080"] 54 | -------------------------------------------------------------------------------- /FAQ.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## Installation & Environment 4 | 5 | #### Which version of transformers should I use? 6 | 7 | 4.31.0 is preferred. 8 | 9 | #### I downloaded the codes and checkpoints but I can't load the model locally. What should I do? 10 | 11 | Please check if you have updated the code to the latest, and correctly downloaded all the sharded checkpoint files. 12 | 13 | #### `qwen.tiktoken` is not found. What is it? 14 | 15 | This is the merge file of the tokenizer. You have to download it. Note that if you just git clone the repo without [git-lfs](https://git-lfs.com), you cannot download this file. 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate not found 18 | 19 | Run the command `pip install -r requirements.txt`. You can find the file at [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt). 20 | <br><br> 21 | 22 | 23 | 24 | ## Demo & Inference 25 | 26 | #### Is there any demo? 27 | 28 | Yes, see `web_demo_mm.py` for web demo. See README for more information. 29 | 30 | 31 | 32 | #### Can Qwen-VL support streaming? 33 | 34 | No. We do not support streaming yet. 35 | 36 | #### It seems that the generation is not related to the instruction... 37 | 38 | Please check if you are loading Qwen-VL-Chat instead of Qwen-VL. Qwen-VL is the base model without alignment, which behaves differently from the SFT/Chat model. 39 | 40 | #### Is quantization supported? 41 | 42 | No. 
We would support quantization asap. 43 | 44 | #### Unsatisfactory performance in processing long sequences 45 | 46 | Please ensure that NTK is applied. `use_dynamc_ntk` and `use_logn_attn` in `config.json` should be set to `true` (`true` by default). 47 | <br><br> 48 | 49 | 50 | ## Tokenizer 51 | 52 | #### bos_id/eos_id/pad_id not found 53 | 54 | In our training, we only use `<|endoftext|>` as the separator and padding token. You can set bos_id, eos_id, and pad_id to tokenizer.eod_id. Learn more about our tokenizer from our documents about the tokenizer. 55 | 56 | -------------------------------------------------------------------------------- /FAQ_ja.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## インストールと環境 4 | 5 | #### transformers のバージョンは? 6 | 7 | 4.31.0 が望ましいです。 8 | 9 | #### コードとチェックポイントをダウンロードしましたが、モデルをローカルにロードできません。どうすればよいでしょうか? 10 | 11 | コードを最新のものに更新し、すべてのシャードされたチェックポイントファイルを正しくダウンロードしたかどうか確認してください。 12 | 13 | #### `qwen.tiktoken` が見つかりません。これは何ですか? 14 | 15 | これは tokenizer のマージファイルです。ダウンロードする必要があります。[git-lfs](https://git-lfs.com) を使わずにリポジトリを git clone しただけでは、このファイルをダウンロードできないことに注意してください。 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate が見つかりません。 18 | 19 | コマンド `pip install -r requirements.txt` を実行してください。このファイルは [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt) にあります。 20 | <br><br> 21 | 22 | 23 | 24 | ## デモと推論 25 | 26 | #### デモはありますか? 27 | 28 | ウェブデモは `web_demo_mm.py` を参照してください。詳細は README を参照してください。 29 | 30 | 31 | 32 | #### Qwen-VLはストリーミングに対応していますか? 33 | 34 | いいえ、まだサポートしていません。 35 | 36 | #### 世代と命令は関係ないようですが... 37 | 38 | Qwen-VL ではなく Qwen-VL-Chat を読み込んでいないか確認してください。Qwen-VL はアライメントなしのベースモデルで、SFT/Chat モデルとは動作が異なります。 39 | 40 | #### 量子化はサポートされていますか? 41 | 42 | いいえ。早急に量子化をサポートするつもりです。 43 | 44 | #### 長いシーケンスの処理で不満足なパフォーマンス 45 | 46 | NTK が適用されていることを確認してください。`config.json` の `use_dynamc_ntk` と `use_logn_attn` を `true` に設定する必要がある(デフォルトでは `true`)。 47 | <br><br> 48 | 49 | 50 | ## Tokenizer 51 | 52 | #### bos_id/eos_id/pad_id が見つかりません。 53 | 54 | 私たちのトレーニングでは、セパレータとパディングトークンとして `<|endoftext|>` のみを使用しています。bos_id、eos_id、pad_id は tokenizer.eod_id に設定できます。私たちの tokenizer について詳しくは、tokenizer についてのドキュメントをご覧ください。 55 | 56 | -------------------------------------------------------------------------------- /FAQ_ko.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 설치 및 환경 4 | 5 | #### 어떤 버전의 transformers를 사용해야 하나요? 6 | 7 | 4.31.0 버전을 사용하는 것을 선호합니다. 8 | 9 | #### 코드와 체크포인트를 다운로드했는데 모델을 로컬에서 불러올 수 없어요. 어떻게 해야 하나요? 10 | 11 | 코드를 최신 버전으로 업데이트했는지, 그리고 모든 샤드 체크포인트 파일을 올바르게 다운로드했는지 확인해 주세요. 12 | 13 | #### `qwen.tiktoken`을 찾을 수 없어요. 이게 무엇인가요? 14 | 15 | 이것은 토크나이저의 병합 파일입니다. 이 파일을 다운로드해야 합니다. [git-lfs](https://git-lfs.com) 없이 단순히 깃 저장소를 복제했다면 이 파일을 다운로드할 수 없습니다. 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate not found 오류 18 | 19 | `pip install -r requirements.txt` 명령을 실행하세요. 이 파일은 [https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt)에서 찾을 수 있습니다. 20 | <br><br> 21 | 22 | 23 | ## Demo & Inference 24 | 25 | #### 데모가 있나요? 26 | 27 | 네, 웹 데모는 `web_demo_mm.py`를 참고하세요. 더 많은 정보는 README 파일에서 확인할 수 있습니다. 28 | 29 | 30 | 31 | #### Qwen-VL은 스트리밍을 지원하나요? 32 | 33 | 아니요. 아직 스트리밍을 지원하지 않습니다. 34 | 35 | #### 생성된 내용이 지시사항과 관련 없는 것 같습니다. 36 | 37 | Qwen-VL 대신 Qwen-VL-Chat을 로드하고 있는지 확인해 주세요. Qwen-VL은 SFT/Chat 모델과 달리 정렬이 없는 기본 모델이므로 다르게 작동합니다. 
38 | 39 | #### 양자화를 지원하나요? 40 | 41 | 아니요. 가능한 빨리 양자화를 지원할 예정입니다. 42 | 43 | #### 긴 시퀀스 처리에서 만족스럽지 못한 성능 44 | 45 | NTK가 적용되었는지 확인해 주세요. `config.json`의 `use_dynamc_ntk`과 `use_logn_attn`은 `true`로 설정되어야 합니다(`true`가 기본값). 46 | <br><br> 47 | 48 | 49 | ## Tokenizer 50 | 51 | #### bos_id/eos_id/pad_id not found 오류 52 | 53 | 저희 훈련에서는 ``을 구분자 및 패딩 토큰으로만 사용합니다. bos_id, eos_id, pad_id를 tokenizer.eod_id로 설정할 수 있습니다. 토크나이저에 대한 문서에서 토크나이저에 대해 더 알아보세요. -------------------------------------------------------------------------------- /FAQ_zh.md: -------------------------------------------------------------------------------- 1 | # FAQ 2 | 3 | ## 安装&环境 4 | 5 | #### 我应该用哪个transformers版本? 6 | 7 | 建议使用4.31.0。 8 | 9 | #### 我把模型和代码下到本地,按照教程无法使用,该怎么办? 10 | 11 | 答:别着急,先检查你的代码是不是更新到最新版本,然后确认你是否完整地将模型checkpoint下到本地。 12 | 13 | #### `qwen.tiktoken`这个文件找不到,怎么办? 14 | 15 | 这个是我们的tokenizer的merge文件,你必须下载它才能使用我们的tokenizer。注意,如果你使用git clone却没有使用git-lfs,这个文件不会被下载。如果你不了解git-lfs,可点击[官网](https://git-lfs.com/)了解。 16 | 17 | #### transformers_stream_generator/tiktoken/accelerate,这几个库提示找不到,怎么办? 18 | 19 | 运行如下命令:`pip install -r requirements.txt`。相关依赖库在[https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt](https://github.com/QwenLM/Qwen-VL/blob/main/requirements.txt) 可以找到。 20 | <br><br> 21 | 22 | 23 | ## Demo & 推理 24 | 25 | #### 是否提供Demo? 26 | 27 | `web_demo_mm.py`提供了Web UI。请查看README相关内容了解更多。 28 | 29 | #### Qwen-VL支持流式推理吗? 30 | 31 | Qwen-VL当前不支持流式推理。 32 | 33 | #### 模型的输出看起来与输入无关/没有遵循指令/看起来呆呆的 34 | 35 | 请检查是否加载的是Qwen-VL-Chat模型进行推理,Qwen-VL模型是未经align的预训练基模型,不期望具备响应用户指令的能力。我们在模型最新版本已经对`chat`接口内进行了检查,避免您误将预训练模型作为SFT/Chat模型使用。 36 | 37 | #### 是否有量化版本模型 38 | 39 | 目前Qwen-VL不支持量化,后续我们将支持高效的量化推理实现。 40 | 41 | #### 处理长序列时效果有问题 42 | 43 | 请确认是否开启ntk。若要启用这些技巧,请将`config.json`里的`use_dynamc_ntk`和`use_logn_attn`设置为`true`。最新代码默认为`true`。 44 | <br><br> 45 | 46 | 47 | ## Tokenizer 48 | 49 | #### bos_id/eos_id/pad_id,这些token id不存在,为什么? 50 | 51 | 在训练过程中,我们仅使用<|endoftext|>这一token作为sample/document之间的分隔符及padding位置占位符,你可以将bos_id, eos_id, pad_id均指向tokenizer.eod_id。请阅读我们关于tokenizer的文档,了解如何设置这些id。 52 | 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Tongyi Qianwen LICENSE AGREEMENT 2 | 3 | Tongyi Qianwen Release Date: August 23, 2023 4 | 5 | By clicking to agree or by using or distributing any portion or element of the Tongyi Qianwen Materials, you will be deemed to have recognized and accepted the content of this Agreement, which is effective immediately. 6 | 7 | 1. Definitions 8 | a. This Tongyi Qianwen LICENSE AGREEMENT (this "Agreement") shall mean the terms and conditions for use, reproduction, distribution and modification of the Materials as defined by this Agreement. 9 | b. "We"(or "Us") shall mean Alibaba Cloud. 10 | c. "You" (or "Your") shall mean a natural person or legal entity exercising the rights granted by this Agreement and/or using the Materials for any purpose and in any field of use. 11 | d. "Third Parties" shall mean individuals or legal entities that are not under common control with Us or You. 12 | e. "Tongyi Qianwen" shall mean the large language models (including Qwen-VL model and Qwen-VL-Chat model), and software and algorithms, consisting of trained model weights, parameters (including optimizer states), machine-learning model code, inference-enabling code, training-enabling code, fine-tuning enabling code and other elements of the foregoing distributed by Us. 13 | f. 
"Materials" shall mean, collectively, Alibaba Cloud's proprietary Tongyi Qianwen and Documentation (and any portion thereof) made available under this Agreement. 14 | g. "Source" form shall mean the preferred form for making modifications, including but not limited to model source code, documentation source, and configuration files. 15 | h. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, 16 | and conversions to other media types. 17 | 18 | 2. Grant of Rights 19 | You are granted a non-exclusive, worldwide, non-transferable and royalty-free limited license under Alibaba Cloud's intellectual property or other rights owned by Us embodied in the Materials to use, reproduce, distribute, copy, create derivative works of, and make modifications to the Materials. 20 | 21 | 3. Redistribution 22 | You may reproduce and distribute copies of the Materials or derivative works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: 23 | a. You shall give any other recipients of the Materials or derivative works a copy of this Agreement; 24 | b. You shall cause any modified files to carry prominent notices stating that You changed the files; 25 | c. You shall retain in all copies of the Materials that You distribute the following attribution notices within a "Notice" text file distributed as a part of such copies: "Tongyi Qianwen is licensed under the Tongyi Qianwen LICENSE AGREEMENT, Copyright (c) Alibaba Cloud. All Rights Reserved."; and 26 | d. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such derivative works as a whole, provided Your use, reproduction, and distribution of the work otherwise complies with the terms and conditions of this Agreement. 27 | 28 | 4. Restrictions 29 | If you are commercially using the Materials, and your product or service has more than 100 million monthly active users, You shall request a license from Us. You cannot exercise your rights under this Agreement without our express authorization. 30 | 31 | 5. Rules of use 32 | a. The Materials may be subject to export controls or restrictions in China, the United States or other countries or regions. You shall comply with applicable laws and regulations in your use of the Materials. 33 | b. You can not use the Materials or any output therefrom to improve any other large language model (excluding Tongyi Qianwen or derivative works thereof). 34 | 35 | 6. Intellectual Property 36 | a. We retain ownership of all intellectual property rights in and to the Materials and derivatives made by or for Us. Conditioned upon compliance with the terms and conditions of this Agreement, with respect to any derivative works and modifications of the Materials that are made by you, you are and will be the owner of such derivative works and modifications. 37 | b. No trademark license is granted to use the trade names, trademarks, service marks, or product names of Us, except as required to fulfill notice requirements under this Agreement or as required for reasonable and customary use in describing and redistributing the Materials. 38 | c. 
If you commence a lawsuit or other proceedings (including a cross-claim or counterclaim in a lawsuit) against Us or any entity alleging that the Materials or any output therefrom, or any part of the foregoing, infringe any intellectual property or other right owned or licensable by you, then all licences granted to you under this Agreement shall terminate as of the date such lawsuit or other proceeding is commenced or brought. 39 | 40 | 7. Disclaimer of Warranty and Limitation of Liability 41 | 42 | a. We are not obligated to support, update, provide training for, or develop any further version of the Tongyi Qianwen Materials or to grant any license thereto. 43 | b. THE MATERIALS ARE PROVIDED "AS IS" WITHOUT ANY EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY, NONINFRINGEMENT, OR FITNESS FOR A PARTICULAR PURPOSE. WE MAKE NO WARRANTY AND ASSUME NO RESPONSIBILITY FOR THE SAFETY OR STABILITY OF THE MATERIALS AND ANY OUTPUT THEREFROM. 44 | c. IN NO EVENT SHALL WE BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE MATERIALS OR ANY OUTPUT OF IT, NO MATTER HOW IT’S CAUSED. 45 | d. You will defend, indemnify and hold harmless Us from and against any claim by any third party arising out of or related to your use or distribution of the Materials. 46 | 47 | 8. Survival and Termination. 48 | a. The term of this Agreement shall commence upon your acceptance of this Agreement or access to the Materials and will continue in full force and effect until terminated in accordance with the terms and conditions herein. 49 | b. We may terminate this Agreement if you breach any of the terms or conditions of this Agreement. Upon termination of this Agreement, you must delete and cease use of the Materials. Sections 7 and 9 shall survive the termination of this Agreement. 50 | 51 | 9. Governing Law and Jurisdiction. 52 | a. This Agreement and any dispute arising out of or relating to it will be governed by the laws of China, without regard to conflict of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this Agreement. 53 | b. The People's Courts in Hangzhou City shall have exclusive jurisdiction over any dispute arising out of this Agreement. -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | ------------- LICENSE FOR NVIDIA Megatron-LM code -------------- 2 | 3 | Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions 7 | are met: 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of NVIDIA CORPORATION nor the names of its 14 | contributors may be used to endorse or promote products derived 15 | from this software without specific prior written permission. 
16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | ------------- LICENSE FOR OpenAI tiktoken code -------------- 31 | 32 | MIT License 33 | 34 | Copyright (c) 2022 OpenAI, Shantanu Jain 35 | 36 | Permission is hereby granted, free of charge, to any person obtaining a copy 37 | of this software and associated documentation files (the "Software"), to deal 38 | in the Software without restriction, including without limitation the rights 39 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 40 | copies of the Software, and to permit persons to whom the Software is 41 | furnished to do so, subject to the following conditions: 42 | 43 | The above copyright notice and this permission notice shall be included in all 44 | copies or substantial portions of the Software. 45 | 46 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 47 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 48 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 49 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 50 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 51 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 52 | SOFTWARE. -------------------------------------------------------------------------------- /TUTORIAL.md: -------------------------------------------------------------------------------- 1 | # Qwen-VL-Chat Tutorial 2 | Qwen-VL-Chat is a generalist multimodal large-scale language model, and it can perform a wide range of vision-language tasks. In this tutorial, we will give some concise examples to demonstrate the capabilities of Qwen-VL-Chat in **Visual Question Answering, Text Understanding, Mathematical Reasoning with Diagrams, Multi-Figure Reasoning, and Grounding**. Please note that the examples shown are far from the limit of Qwen-VL-Chat's capabilities, **you can further explore Qwen-VL-Chat's capabilities by changing the input images and prompts!** 3 | 4 | ## Initializing the Qwen-VL-Chat model 5 | Before you can use Qwen-VL-Chat, you first need to initialize Qwen-VL-Chat's tokenizer and Qwen-VL-Chat's model: 6 | 7 | ```python 8 | import torch 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | from transformers.generation import GenerationConfig 11 | 12 | # If you expect the results to be reproducible, set a random seed. 
13 | # torch.manual_seed(1234) 14 | 15 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True) 16 | 17 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True).eval() 18 | model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True) 19 | ``` 20 | After executing the above code, ```tokenizer``` will correspond to the tokenizer used by Qwen-VL-Chat, while ```model``` will correspond to the Qwen-VL-Chat model. The ```tokenizer``` is used for preprocessing the interleaved multimodal inputs, while the ```model``` is the Qwen-VL-Chat model itself. 21 | 22 | ## Using Qwen-VL-Chat 23 | ### **Multi-round visual question answering** 24 | #### **The first question** 25 | Let's get started with a simple example. As shown below, the file ```assets/mm_tutorial/Rebecca_(1939_poster).jpeg``` is a poster for the 1940 film Rebecca. 26 | 27 | ![](assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg) 28 | 29 | Let's ask Qwen-VL-Chat for the name of the film on the poster. First of all, we use ```tokenizer.from_list_format```, which preprocesses and tokenizes the input: 30 | ```python 31 | query = tokenizer.from_list_format([ 32 | {'image': 'assets/mm_tutorial/Rebecca_(1939_poster).jpeg'}, 33 | {'text': 'What is the name of the movie in the poster?'}, 34 | ]) 35 | ``` 36 | Next, we can use ```model.chat``` to ask the Qwen-VL-Chat model questions and get its response. Note that for the first question, the dialogue history is empty, so we use ```history=None```. 37 | ```python 38 | response, history = model.chat(tokenizer, query=query, history=None) 39 | print(response) 40 | ``` 41 | You are expected to get an output similar to the following: 42 | 43 | > The name of the movie in the poster is "Rebecca." 44 | 45 | This shows that the model correctly answered the given question! According to the poster, the title of the film is 46 | indeed **Rebecca**. 47 | 48 | #### **Multi-round question answering** 49 | We can also continue to ask the model other questions, such as who directed the film. The dialogue history is not empty for subsequent questions, so we use ```history=history``` to pass the history of previous conversations to ```model.chat```: 50 | 51 | ```python 52 | query = tokenizer.from_list_format([ 53 | {'text': 'Who directed this movie?'}, 54 | ]) 55 | response, history = model.chat(tokenizer, query=query, history=history) 56 | print(response) 57 | ``` 58 | 59 | You are expected to get an output similar to the following: 60 | 61 | > The movie "Rebecca" was directed by Alfred Hitchcock. 62 | 63 | Again, the model answered the given question correctly! According to the poster, the director of the film is Alfred Hitchcock. 64 | 65 | ### **Text Understanding** 66 | Qwen-VL-Chat can also understand images containing dense text. As shown below, the file ```assets/mm_tutorial/Hospital.jpg``` is a hospital sign containing dense text. 67 | 68 | ![](assets/mm_tutorial/Hospital_Small.jpg) 69 | 70 | We can ask questions about the location of different departments in the hospital. Since the dialogue history is empty, we use ```history=None```.
71 | ```python 72 | query = tokenizer.from_list_format([ 73 | {'image': 'assets/mm_tutorial/Hospital.jpg'}, 74 | {'text': 'Based on the photo, which floor is the Department of Otorhinolaryngology on?'}, 75 | ]) 76 | response, history = model.chat(tokenizer, query=query, history=None) 77 | print(response) 78 | ``` 79 | 80 | You are expected to get an output similar to the following: 81 | 82 | > The Department of Otorhinolaryngology is located on the 4th floor. 83 | 84 | You can also ask further questions. In this case you need to use ```history=history``` to pass a history of previous conversations to ```model.chat```. 85 | 86 | ```python 87 | query = tokenizer.from_list_format([ 88 | {'text': 'Based on the photo, which floor is the Department of Surgery on?'}, 89 | ]) 90 | response, history = model.chat(tokenizer, query=query, history=history) 91 | print(response) 92 | ``` 93 | 94 | You are expected to get an output similar to the following: 95 | 96 | > The Department of Surgery is located on the 3rd floor. 97 | 98 | ### **Mathematical Reasoning with Diagram** 99 | Using the model's diagram comprehension and mathematical reasoning capabilities, Qwen-VL-Chat can also perform some more complex tasks! As shown below, the file ```assets/mm_tutorial/Menu.jpeg``` is the menu of a restaurant. Now we want to know how much it would cost to purchase two Salmon Burgers and three Meat Lover's Pizzas. 100 | 101 | ![](assets/mm_tutorial/Menu.jpeg) 102 | 103 | ```python 104 | query = tokenizer.from_list_format([ 105 | {'image': 'assets/mm_tutorial/Menu.jpeg'}, 106 | {'text': 'How much would I pay if I want to order two Salmon Burger and three Meat Lover\'s Pizza? Think carefully step by step.'}, 107 | ]) 108 | response, history = model.chat(tokenizer, query=query, history=None) 109 | print(response) 110 | ``` 111 | 112 | ```Think carefully step by step.``` is a common prompt that guides the model through complex tasks step by step. So if you have a complex task to complete, try using it to improve the accuracy of the model. You are expected to get an output similar to the following: 113 | 114 | > To order two Salmon Burgers and three Meat Lover's Pizzas, you would need to pay the following: 115 | > 116 | > 1. For two Salmon Burgers: x2 Salmon Burgers at $10 each = $20 117 | > 2. For three Meat Lover's Pizzas: x3 Meat Lover's Pizzas at $12 each = $36 118 | > 119 | > Therefore, the total cost would be $56. 120 | 121 | ### **Multi-Figure Reasoning and Chinese Input** 122 | In the previous examples, we have demonstrated Qwen-VL-Chat's question-answering capability for a single image and English questions. However, Qwen-VL-Chat is actually a multilingual model that supports Chinese input and multiple images! 
In the following example, we let Qwen-VL-Chat compare the photos of two cities (Chongqing and Beijing) for us (```assets/mm_tutorial/Chongqing.jpeg``` and ```assets/mm_tutorial/Beijing.jpeg```) in Chinese: 123 | 124 | ![](assets/mm_tutorial/Chongqing_Small.jpeg) 125 | 126 | ![](assets/mm_tutorial/Beijing_Small.jpeg) 127 | 128 | ```python 129 | query = tokenizer.from_list_format([ 130 | {'image': 'assets/mm_tutorial/Chongqing.jpeg'}, 131 | {'image': 'assets/mm_tutorial/Beijing.jpeg'}, 132 | {'text': '上面两张图片分别是哪两个城市?请对它们进行对比。'}, 133 | ]) 134 | torch.manual_seed(5678) 135 | response, history = model.chat(tokenizer, query=query, history=None) 136 | print(response) 137 | ``` 138 | 139 | You are expected to get an output similar to the following: 140 | 141 | > 第一张图片是重庆的城市天际线,它反映了现代都市的繁华与喧嚣。第二张图片是北京的天际线,它象征着中国首都的现代化和国际化。两座城市都是中国的重要城市,拥有独特的文化和发展历史。 142 | 143 | **Please note that comparing cities is a fairly subjective question, so the responses generated by the model may be subject to a high degree of randomness. If you do not set the random seed using ```torch.manual_seed(5678)```, the output will be different each time. Even if you set the random seed, the results obtained may still differ from this tutorial due to differences in hardware and software environments.** 144 | 145 | ### **Grounding Capability** 146 | In the last section of the tutorial, we demonstrate the ability of the Qwen-VL-Chat model to produce a bounding box. Qwen-VL-Chat can frame a specified area of an image with a rectangular box according to your language description. This may be a bit abstract, so let's look at the following example. As shown below, the file ```assets/mm_tutorial/Shanghai.jpg``` is a photo of Shanghai, and we'll start by asking the model to describe the image with a regular prompt. 147 | 148 | ![](assets/mm_tutorial/Shanghai_Small.jpeg) 149 | 150 | ```python 151 | torch.manual_seed(1234) 152 | query = tokenizer.from_list_format([ 153 | {'image': 'assets/mm_tutorial/Shanghai.jpg'}, 154 | {'text': '图里有啥'}, 155 | ]) 156 | response, history = model.chat(tokenizer, query=query, history=None) 157 | print(response) 158 | ``` 159 | 160 | You are expected to get an output similar to the following: 161 | 162 | > 图中是中国上海的天际线,包括了上海塔、金茂大厦、上海环球金融中心、海洋大厦等著名建筑。 163 | 164 | Next, let's talk to the model by using the prompt ```请给我框出图中上海环球金融中心和东方明珠``` and see what happens. Note that at this point you need to pass the history of previous conversations to ```model.chat``` using ```history=history```. 165 | 166 | ```python 167 | query = tokenizer.from_list_format([ 168 | {'text': '请给我框出图中上海环球金融中心和东方明珠'}, 169 | ]) 170 | response, history = model.chat(tokenizer, query=query, history=history) 171 | print(response) 172 | ``` 173 | You are expected to get an output similar to the following: 174 | ```xml 175 | <ref>上海环球金融中心</ref><box>(667,437),(760,874)</box>和<ref>东方明珠</ref><box>(506,75),(582,946)</box> 176 | ``` 177 | The Qwen-VL-Chat model doesn't have a hand, but it doesn't reject your request either. Instead, it outputs something "strange" - In fact, the output of the model gives the location of the 上海环球金融中心(Shanghai World Financial Centre) and the 东方明珠(Oriental Pearl Tower) in markup language. 
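A note on the numbers: the coordinates inside the ```<box>``` tags appear to be normalized to a 0-1000 grid rather than raw pixel positions, so they can be mapped back to pixels once the image size is known. Below is a minimal sketch under that assumption (the image size used here is made up purely for illustration):

```python
# Hedged sketch: assumes <box>(x1,y1),(x2,y2)</box> values are normalized to a 0-1000 grid.
width, height = 1280, 960            # hypothetical image size; use the real one in practice
x1, y1, x2, y2 = 667, 437, 760, 874  # the box returned for 上海环球金融中心 above
pixel_box = (x1 * width / 1000, y1 * height / 1000,
             x2 * width / 1000, y2 * height / 1000)
print(pixel_box)  # (853.76, 419.52, 972.8, 839.04)
```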
You can visualise it using the following code: 178 | ```python 179 | image = tokenizer.draw_bbox_on_latest_picture(response, history) 180 | image.save('Shanghai_Output.jpg') 181 | ``` 182 | The saved ```Shanghai_Output.jpg``` will look similar to the screenshot below: 183 | 184 | ![](assets/mm_tutorial/Shanghai_Output_Small.jpeg) 185 | 186 | After that, you can still chat with Qwen-VL-Chat as before: 187 | ```python 188 | query = tokenizer.from_list_format([ 189 | {'text': '帮我写个这座城市的旅游计划'}, 190 | ]) 191 | response, history = model.chat(tokenizer, query=query, history=history) 192 | print(response) 193 | ``` 194 | 195 | You are expected to get an output similar to the following: 196 | 197 | > 好的,以下是一个简单的上海旅游计划: 198 | > 199 | > 第一天: 200 | > 上午:抵达上海,前往酒店办理入住手续。 201 | > 中午:享用午餐后,前往外滩,欣赏黄浦江畔的美景,游览上海地标性建筑如浦发银行大楼、汇丰银行大楼等。 202 | > 下午:游览南京路步行街,购买特色礼品或品尝当地美食。 203 | > 晚上:在南京路附近的餐厅享用晚餐,然后去看上海的夜景。 204 | > 205 | > 第二天: 206 | > 上午:前往上海科技馆,了解科技发展历史,观看各种科技展览。 207 | > 中午:在科技馆附近的餐厅享用午餐。 208 | > 下午:游览世纪公园,欣赏美景并放松身心。 209 | > 晚上:在南京路或附近的陆家嘴地区享用晚餐,然后去看上海的夜景。 210 | > 211 | > 第三天: 212 | > 上午:游览上海迪士尼乐园或上海海昌海洋公园,与各种迪士尼角色互动,或者在海洋公园观看海洋生物表演。 213 | > 中午:在迪士尼乐园或海洋公园附近的餐厅享用午餐。 214 | > 下午:自由活动,可以去购物、品尝当地美食或者去博物馆等。 215 | > 晚上:在酒店附近享用晚餐,然后离开上海。 216 | > 217 | > 当然,以上只是一个简单的计划,上海有许多其他景点和活动,例如参观上海博物馆、游览田子坊、观看上海话剧等。具体计划可以根据个人兴趣和时间进行调整。 218 | 219 | 220 | **Please note that travel planning is a fairly subjective question, so the responses generated by the model may be subject to a high degree of randomness. If you do not set the random seed using ```torch.manual_seed(1234)```, the output will be different each time. Even if you set the random seed, the results obtained may still differ from this tutorial due to differences in hardware and software environments.** 221 | 222 | ### Grounded Captioning 223 | Qwen-VL can output the bounding box information of the subject while captioning the image. For example: 224 | 225 | ``` 226 | img_url = 'assets/apple.jpeg' 227 | query = tokenizer.from_list_format([ 228 | {'image': img_url}, 229 | {'text': 'Generate the caption in English with grounding:'}, 230 | ]) 231 | response, history = model.chat(tokenizer, query=query, history=None) 232 | print(response) 233 | 234 | image = tokenizer.draw_bbox_on_latest_picture(response, history) 235 | if image is not None: 236 | image.save('apple.jpg') 237 | ``` 238 | 239 | The saved ```apple.jpg``` will look similar to the screenshot below: 240 | <p align="left"> 241 | <img src="assets/apple_r.jpeg" width="600"/> 242 | <p> 243 | 244 | #### How to get the caption without any box-like annotations 245 | Sometimes you may expect no box-like annotations in the response. In the case, you can stably get the cleaned text by the following post-processing. 
246 | 247 | ``` 248 | # response = '<ref> Two apples</ref><box>(302,257),(582,671)</box><box>(603,252),(878,642)</box> and<ref> a bowl</ref><box>(2,269),(304,674)</box>' 249 | import re 250 | clean_response = re.sub(r'<ref>(.*?)</ref>(?:<box>.*?</box>)*(?:<quad>.*?</quad>)*', r'\1', response).strip() 251 | print(clean_response) 252 | # clean_response = 'Two apples and a bowl' 253 | ``` 254 | -------------------------------------------------------------------------------- /TUTORIAL_ja.md: -------------------------------------------------------------------------------- 1 | # Qwen-VL-Chat チュートリアル 2 | Qwen-VL-Chat は汎用のマルチモーダル大規模言語モデルであり、幅広い視覚言語タスクを実行できます。このチュートリアルでは、Qwen-VL-Chat の**視覚的質問応答、テキスト理解、図を用いた数学的推論、多視点推論、およびグラウンディング**の機能について、いくつかの簡潔な例を挙げて説明します。Qwen-VL-Chat は、入力画像やプロンプトを変更することで、Qwen-VL-Chat の能力をさらに引き出すことができます。 3 | 4 | ## Qwen-VL-Chat モデルの初期化 5 | Qwen-VL-Chat を使用する前に、まず Qwen-VL-Chat のトークナイザと Qwen-VL-Chat のモデルを初期化する必要があります: 6 | 7 | ```python 8 | import torch 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | from transformers.generation import GenerationConfig 11 | 12 | # 結果の再現性を期待する場合は、ランダムシードを設定する。 13 | # torch.manual_seed(1234) 14 | 15 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True) 16 | 17 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True).eval() 18 | model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True) 19 | ``` 20 | 上記のコードを実行すると、```tokenizer``` は Qwen-VL-Chat で使用される分類器に対応し、```model``` は Qwen-VL-Chat のモデルに対応します。```tokenizer``` はインターリーブされたマルチモーダル入力の前処理に使用され、```model``` は Qwen-VL-Chat のモデルそのものです。 21 | 22 | ## Qwen-VL-Chat を使う 23 | ### **複数ラウンドのビジュアル質問回答** 24 | #### **最初の質問** 25 | 簡単な例から始めましょう。以下に示すように、```assets/mm_tutorial/Rebecca_(1939_poster).jpeg``` は 1940 年の映画 レベッカのポスターです。 26 | 27 | ![](assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg) 28 | 29 | Qwen-VL-Chat のポスターに描かれている映画の名前を聞いてみよう。まず初めに、入力を前処理してトークン化する ```tokenizer.from_list_format``` を使用します: 30 | ```python 31 | query = tokenizer.from_list_format([ 32 | {'image': 'assets/mm_tutorial/Rebecca_(1939_poster).jpeg'}, 33 | {'text': 'What is the name of the movie in the poster?'}, 34 | ]) 35 | ``` 36 | 次に、```model.chat``` を使って Qwen-VL-Chat モデルに質問をし、その回答を得ることができます。最初の質問では、ダイアログの履歴は空なので、```history=None``` を使用することに注意してください。 37 | ```python 38 | response, history = model.chat(tokenizer, query=query, history=None) 39 | print(response) 40 | ``` 41 | 以下のような出力が期待されます: 42 | 43 | > The name of the movie in the poster is "Rebecca." 44 | 45 | これは、モデルが与えられた問題に正しく答えたことを示しています!ポスターをみると、映画のタイトルは確かに**レベッカ**です。 46 | 47 | #### **複数ラウンドの質問回答** 48 | また、映画の監督は誰かなど、他の質問をモデルに続けることもできます。そのため、```history=history``` を使って、以前の会話の履歴を ``model.chat`` に渡します: 49 | 50 | ```python 51 | query = tokenizer.from_list_format([ 52 | {'text': 'Who directed this movie?'}, 53 | ]) 54 | response, history = model.chat(tokenizer, query=query, history=history) 55 | print(response) 56 | ``` 57 | 58 | 以下のような出力が期待されます: 59 | 60 | > The movie "Rebecca" was directed by Alfred Hitchcock. 
61 | 62 | 再びこのモデルは与えられた問題に正解しました!ポスターによると、この映画の監督はアルフレッド・ヒッチコックです。 63 | 64 | ### **テキスト理解** 65 | Qwen-VL-Chat には、高密度なテキストを含む画像を理解する機能もあります。下図に示すように、```assets/mm_tutorial/Hospital.jpeg``` というファイルは、濃いテキストを含む病院の看板です。 66 | 67 | ![](assets/mm_tutorial/Hospital_Small.jpg) 68 | 69 | 病院内のさまざまな診療科の場所について質問することができます。対話の履歴は空なので、```history=None``` を使用します。 70 | ```python 71 | query = tokenizer.from_list_format([ 72 | {'image': 'assets/mm_tutorial/Hospital.jpg'}, 73 | {'text': 'Based on the photo, which floor is the Department of Otorhinolaryngology on?'}, 74 | ]) 75 | response, history = model.chat(tokenizer, query=query, history=None) 76 | print(response) 77 | ``` 78 | 79 | 以下のような出力が期待されます: 80 | 81 | > The Department of Otorhinolaryngology is located on the 4th floor. 82 | 83 | さらに質問をすることもできます。この場合、```history=history``` を使用して、以前の会話の履歴を ```model.chat``` に渡す必要があります。 84 | 85 | ```python 86 | query = tokenizer.from_list_format([ 87 | {'text': 'Based on the photo, which floor is the Department of Surgery on?'}, 88 | ]) 89 | response, history = model.chat(tokenizer, query=query, history=history) 90 | print(response) 91 | ``` 92 | 93 | 以下のような出力が期待されます: 94 | 95 | > The Department of Surgery is located on the 3rd floor. 96 | 97 | ### **ダイアグラムによる数学的推論** 98 | Qwen-VL-Chat は、このモデルのダイアグラム理解能力と数学的推論能力を使って、より複雑なタスクを実行することもできます!下に示すように、```assets/mm_tutorial/Menu.jpeg``` というファイルはレストランのメニューです。では、Salmon Burger 2 個と Meat Lover's Pizza 3 枚を購入した場合の値段を知りたい。 99 | 100 | ![](assets/mm_tutorial/Menu.jpeg) 101 | 102 | ```python 103 | query = tokenizer.from_list_format([ 104 | {'image': 'assets/mm_tutorial/Menu.jpeg'}, 105 | {'text': 'How much would I pay if I want to order two Salmon Burger and three Meat Lover\'s Pizza? Think carefully step by step.'}, 106 | ]) 107 | response, history = model.chat(tokenizer, query=query, history=None) 108 | print(response) 109 | ``` 110 | 111 | ステップバイステップで注意深く考えてください(```Think carefully step by step.```)」は、複雑なタスクを一歩ずつでモデルをガイドする一般的なプロンプトです。複雑なタスクをこなさなければならない場合、このプロンプトを使ってモデルの精度を上げてみてください。以下のような出力が期待されます: 112 | 113 | > To order two Salmon Burgers and three Meat Lover's Pizzas, you would need to pay the following: 114 | > 115 | > 1. For two Salmon Burgers: x2 Salmon Burgers at $10 each = $20 116 | > 2. For three Meat Lover's Pizzas: x3 Meat Lover's Pizzas at $12 each = $36 117 | > 118 | > Therefore, the total cost would be $56. 
119 | 120 | ### **多視点推論と中国語入力** 121 | これまでの例では、Qwen-VL-Chat が 1 つの画像と英語の質問に対して質問応答ができることを示しました。しかし、実際には Qwen-VL-Chat は中国語入力と複数の画像をサポートする多言語モデルです!以下の例では、Qwen-VL-Chat に 2 つの都市(重慶と北京)の写真(```assets/mm_tutorial/Chongqing.jpeg``` と ```assets/mm_tutorial/Beijing.jpeg```)を中国語で比較させています: 122 | 123 | ![](assets/mm_tutorial/Chongqing_Small.jpeg) 124 | 125 | ![](assets/mm_tutorial/Beijing_Small.jpeg) 126 | 127 | ```python 128 | query = tokenizer.from_list_format([ 129 | {'image': 'assets/mm_tutorial/Chongqing.jpeg'}, 130 | {'image': 'assets/mm_tutorial/Beijing.jpeg'}, 131 | {'text': '上面两张图片分别是哪两个城市?请对它们进行对比。'}, 132 | ]) 133 | torch.manual_seed(5678) 134 | response, history = model.chat(tokenizer, query=query, history=None) 135 | print(response) 136 | ``` 137 | 138 | 以下のような出力が期待されます: 139 | 140 | > 第一张图片是重庆的城市天际线,它反映了现代都市的繁华与喧嚣。第二张图片是北京的天际线,它象征着中国首都的现代化和国际化。两座城市都是中国的重要城市,拥有独特的文化和发展历史。 141 | 142 | **都市の比較はかなり主観的な質問であるため、モデルによって生成される回答は高度なランダム性を持つ可能性があることに注意してください。```torch.manual_seed(5678)``` を使用してランダムシードを設定しない場合、出力は毎回異なります。ランダムシードを設定した場合でも、ハードウェアやソフトウェアの環境の違いにより、得られる結果がこのチュートリアルと異なる場合があります。** 143 | 144 | ### **グラウンディング能力** 145 | チュートリアルの最後のセクションでは、Qwen-VL-Chat モデルがバウンディングボックスを生成する機能を紹介します。Qwen-VL-Chat は、言語記述に従って、画像の指定された領域を矩形の枠で囲むことができます。少し抽象的なので、次の例を見てみましょう。下図のように、ファイル ```assets/mm_tutorial/Shanghai.jpg``` は上海の写真です。まず、通常のプロンプトでモデルに画像を記述してもらいます。 146 | 147 | ![](assets/mm_tutorial/Shanghai_Small.jpeg) 148 | 149 | ```python 150 | torch.manual_seed(1234) 151 | query = tokenizer.from_list_format([ 152 | {'image': 'assets/mm_tutorial/Shanghai.jpg'}, 153 | {'text': '图里有啥'}, 154 | ]) 155 | response, history = model.chat(tokenizer, query=query, history=None) 156 | print(response) 157 | ``` 158 | 159 | 以下のような出力が期待されます: 160 | 161 | > 图中是中国上海的天际线,包括了上海塔、金茂大厦、上海环球金融中心、海洋大厦等著名建筑。 162 | 163 | 次に、プロンプト ```请给我框出图中上海环球金融中心和东方明珠``` を使ってモデルと会話してみましょう。このとき、```history=history``` を使って、以前の会話の履歴を ```model.chat``` に渡す必要があることに注意してください。 164 | 165 | ```python 166 | query = tokenizer.from_list_format([ 167 | {'text': '请给我框出图中上海环球金融中心和东方明珠'}, 168 | ]) 169 | response, history = model.chat(tokenizer, query=query, history=history) 170 | print(response) 171 | ``` 172 | 以下のような出力が期待されます: 173 | ```xml 174 | <ref>上海环球金融中心</ref><box>(667,437),(760,874)</box>和<ref>东方明珠</ref><box>(506,75),(582,946)</box> 175 | ``` 176 | Qwen-VL-Chat モデルには手はありませんが、だからといってリクエストを拒否することもありません。その代わりに、"奇妙な"ものが出力されます - 実際、モデルの出力は上海环球金融中心(上海ワールド・フィナンシャル・センター)と东方明珠(東方テレビタワー)の位置をマークアップ言語で示しています。次のコードで視覚化できます: 177 | ```python 178 | image = tokenizer.draw_bbox_on_latest_picture(response, history) 179 | image.save('Shanghai_Output.jpg') 180 | ``` 181 | 保存された ```Shanghai_Output.jpg``` は以下のスクリーンショットのようになります: 182 | 183 | ![](assets/mm_tutorial/Shanghai_Output_Small.jpeg) 184 | 185 | その後、Qwen-VL-Chat で以前と同じようにチャットすることができます: 186 | ```python 187 | query = tokenizer.from_list_format([ 188 | {'text': '帮我写个这座城市的旅游计划'}, 189 | ]) 190 | response, history = model.chat(tokenizer, query=query, history=history) 191 | print(response) 192 | ``` 193 | 194 | 以下のような出力が期待されます: 195 | 196 | > 好的,以下是一个简单的上海旅游计划: 197 | > 198 | > 第一天: 199 | > 上午:抵达上海,前往酒店办理入住手续。 200 | > 中午:享用午餐后,前往外滩,欣赏黄浦江畔的美景,游览上海地标性建筑如浦发银行大楼、汇丰银行大楼等。 201 | > 下午:游览南京路步行街,购买特色礼品或品尝当地美食。 202 | > 晚上:在南京路附近的餐厅享用晚餐,然后去看上海的夜景。 203 | > 204 | > 第二天: 205 | > 上午:前往上海科技馆,了解科技发展历史,观看各种科技展览。 206 | > 中午:在科技馆附近的餐厅享用午餐。 207 | > 下午:游览世纪公园,欣赏美景并放松身心。 208 | > 晚上:在南京路或附近的陆家嘴地区享用晚餐,然后去看上海的夜景。 209 | > 210 | > 第三天: 211 | > 上午:游览上海迪士尼乐园或上海海昌海洋公园,与各种迪士尼角色互动,或者在海洋公园观看海洋生物表演。 212 | > 中午:在迪士尼乐园或海洋公园附近的餐厅享用午餐。 213 | > 下午:自由活动,可以去购物、品尝当地美食或者去博物馆等。 214 | > 
晚上:在酒店附近享用晚餐,然后离开上海。 215 | > 216 | > 当然,以上只是一个简单的计划,上海有许多其他景点和活动,例如参观上海博物馆、游览田子坊、观看上海话剧等。具体计划可以根据个人兴趣和时间进行调整。 217 | 218 | 219 | **旅行計画はかなり主観的な質問であるため、モデルによって生成される回答は高いランダム性を持つ可能性があることに注意してください。```torch.manual_seed(1234)``` を使用してランダムシードを設定しない場合、出力は毎回異なります。ランダムシードを設定した場合でも、ハードウェアやソフトウェアの環境の違いにより、得られる結果がこのチュートリアルと異なる場合があります。** 220 | -------------------------------------------------------------------------------- /TUTORIAL_ko.md: -------------------------------------------------------------------------------- 1 | # Qwen-VL-Chat Tutorial 2 | 3 | Qwen-VL-Chat은 범용 멀티모달 대규모 언어 모델이며 광범위한 시각 언어 작업을 수행할 수 있습니다. 이 튜토리얼에서는 **시각적 질문 답변, 텍스트 이해, 다이어그램을 사용한 수학적 추론, 다중 그림 추론 및 그라운딩(Grounding) 작업**에서 Qwen-VL-Chat의 기능을 보여주는 몇 가지 간결한 예제를 제시합니다. Qwen-VL-Chat의 기능의 한계가 아니며, **입력 이미지와 프롬프트를 변경하여 Qwen-VL-Chat의 기능**을 더 자세히 살펴보실 수도 있습니다. 4 | 5 | ## Initializing the Qwen-VL-Chat model 6 | Qwen-VL-Chat을 사용하기 전에 먼저 Qwen-VL-Chat의 Tokenizer와 Qwen-VL-Chat의 모델을 초기화해야 합니다. 7 | 8 | ```python 9 | import torch 10 | from transformers import AutoModelForCausalLM, AutoTokenizer 11 | from transformers.generation import GenerationConfig 12 | 13 | # If you expect the results to be reproducible, set a random seed. 14 | # torch.manual_seed(1234) 15 | 16 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True) 17 | 18 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True).eval() 19 | model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True) 20 | ``` 21 | 22 | 위 코드를 실행하시면 ```tokenizer```변수에 Qwen-VL-Chat에서 사용하는 분류기(classifier)가 할당되고, ```model```변수에는 Qwen-VL-Chat의 모델을 할당하게 됩니다. ```tokenizer```는 인터리브된 멀티모달 입력(interleaved multimodal inputs)을 전처리하는 데 사용되며, ``model``은 Qwen-VL-Chat 모델입니다. 23 | 24 | ## Using Qwen-VL-Chat 25 | ### **Multi-round visual question answering** 26 | #### **첫 질문하기** 27 | 28 | 간단한 예제를 확인해보겠습니다. 아래에서 볼 수 있듯이, ```assets/mm_tutorial/Rebecca_(1939_poster).jpeg``` 파일은 1940년 영화 <레베카>의 포스터입니다. 29 | 30 | ![](assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg) 31 | 32 | Qwen-VL-Chat 포스터에 있는 영화 제목이 무엇인지 물어봅시다. 우선, 입력을 전처리하고 토큰화할 수 있는 ```tokenizer.from_list_format```을 사용합니다. 33 | ```python 34 | query = tokenizer.from_list_format([ 35 | {'image': 'assets/mm_tutorial/Rebecca_(1939_poster).jpeg'}, 36 | {'text': 'What is the name of the movie in the poster?'}, 37 | ]) 38 | ``` 39 | 다음으로, ```model.chat```을 사용하여 Qwen-VL-Chat 모델에 질문하고 응답을 얻을 수 있습니다. 첫 번째 질문의 경우 대화 기록이 비어 있으므로 ``history=None``을 사용합니다. 40 | ```python 41 | response, history = model.chat(tokenizer, query=query, history=None) 42 | print(response) 43 | ``` 44 | 다음과 비슷한 출력이 나올 것입니다. 45 | 46 | > The name of the movie in the poster is "Rebecca." 47 | 48 | 모델이 주어진 질문에 정답을 맞혔습니다. 포스터에 따르면, 영화의 제목은 실제로 **레베카**입니다. 49 | 50 | #### **Multi-round question answering** 51 | 또한 모델에게 영화 감독이 누구인지와 같은 다른 질문을 계속할 수도 있습니다. 대화 기록은 후속 질문을 위해 비어 있지 않으므로 ``history=history``를 사용하여 이전 대화의 기록을 ``model.chat``에 전달합니다: 52 | 53 | ```python 54 | query = tokenizer.from_list_format([ 55 | {'text': 'Who directed this movie?'}, 56 | ]) 57 | response, history = model.chat(tokenizer, query=query, history=history) 58 | print(response) 59 | ``` 60 | 61 | 다음과 비슷한 출력이 나올 것입니다. 62 | 63 | > The movie "Rebecca" was directed by Alfred Hitchcock. 64 | 65 | 다시 한 번, 모델이 주어진 질문에 대한 정답을 맞혔습니다. 포스터에 따르면 이 영화의 감독은 <알프레드 히치콕>입니다. 66 | 67 | ### **Text Understanding** 68 | Qwen-VL-Chat은 촘촘한 텍스트가 포함된 이미지도 이해할 수 있습니다. 아래 그림과 같이 ``assets/mm_tutorial/Hospital.jpeg`` 파일은 촘촘한 텍스트가 포함된 병원 간판입니다. 
69 | 70 | ![](assets/mm_tutorial/Hospital_Small.jpg) 71 | 72 | 병원 내 여러 부서의 위치에 대해 질문할 수 있습니다. 첫 질문으로 대화에 대한 이전 기록이 없으므로 ```history=None```을 사용합니다. 73 | 74 | ```python 75 | query = tokenizer.from_list_format([ 76 | {'image': 'assets/mm_tutorial/Hospital.jpg'}, 77 | {'text': 'Based on the photo, which floor is the Department of Otorhinolaryngology on?'}, 78 | ]) 79 | response, history = model.chat(tokenizer, query=query, history=None) 80 | print(response) 81 | ``` 82 | 83 | 다음과 비슷한 출력이 나올 것입니다. 84 | 85 | > The Department of Otorhinolaryngology is located on the 4th floor. 86 | 87 | 추가 질문을 하실 수도 있습니다. 이 경우 ```history=history```를 사용하여 이전 대화의 기록을 ```model.chat```에 전달해야 합니다. 88 | 89 | ```python 90 | query = tokenizer.from_list_format([ 91 | {'text': 'Based on the photo, which floor is the Department of Surgery on?'}, 92 | ]) 93 | response, history = model.chat(tokenizer, query=query, history=history) 94 | print(response) 95 | ``` 96 | 97 | 다음과 비슷한 출력이 나올 것입니다. 98 | 99 | > The Department of Surgery is located on the 3rd floor. 100 | 101 | ### **Mathematical Reasoning with Diagram** 102 | 모델의 다이어그램 이해와 수학적 추론 기능을 사용하여 Qwen-VL-Chat은 좀 더 복잡한 작업도 수행할 수 있습니다. 아래에서 볼 수 있듯이 ``assets/mm_tutorial/Menu.jpeg`` 파일은 레스토랑의 메뉴 이미지 입니다. 이제 연어 버거 두 개와 미트 러버스 피자 세 개를 구매하는 데 드는 비용을 알아봅시다. 103 | ![](assets/mm_tutorial/Menu.jpeg) 104 | 105 | ```python 106 | query = tokenizer.from_list_format([ 107 | {'image': 'assets/mm_tutorial/Menu.jpeg'}, 108 | {'text': 'How much would I pay if I want to order two Salmon Burger and three Meat Lover\'s Pizza? Think carefully step by step.'}, 109 | ]) 110 | response, history = model.chat(tokenizer, query=query, history=None) 111 | print(response) 112 | ``` 113 | 114 | ``단계별로 신중하게 생각하세요``는 복잡한 작업을 단계별로 모델에 안내하는 일반적인 프롬프트입니다. 따라서 완료해야 할 복잡한 작업이 있는 경우에는 이 프롬프트를 사용하여 모델의 정확도를 향상시켜 보세요. 다음과 유사한 출력이 나올 것입니다. 115 | 116 | > To order two Salmon Burgers and three Meat Lover's Pizzas, you would need to pay the following: 117 | > 118 | > 1. For two Salmon Burgers: x2 Salmon Burgers at $10 each = $20 119 | > 2. For three Meat Lover's Pizzas: x3 Meat Lover's Pizzas at $12 each = $36 120 | > 121 | > Therefore, the total cost would be $56. 122 | 123 | ### **Multi-Figure Reasoning and Chinese Input** 124 | 이전 예제에서는 단일 이미지와 영어 질문에 대한 Qwen-VL-Chat의 질문 답변 기능을 시연했습니다. 하지만 실제로는 중국어 입력과 여러 이미지를 지원하는 다국어 모델입니다. 다음 예제에서는 두 도시(충칭과 베이징)의 사진(`assets/mm_tutorial/Chongqing.jpeg` 및 `assets/mm_tutorial/Beijing.jpeg`)을 중국어로 비교하도록 Qwen-VL-Chat을 설정했습니다. 125 | 126 | ![](assets/mm_tutorial/Chongqing_Small.jpeg) 127 | 128 | ![](assets/mm_tutorial/Beijing_Small.jpeg) 129 | 130 | ```python 131 | query = tokenizer.from_list_format([ 132 | {'image': 'assets/mm_tutorial/Chongqing.jpeg'}, 133 | {'image': 'assets/mm_tutorial/Beijing.jpeg'}, 134 | {'text': '上面两张图片分别是哪两个城市?请对它们进行对比。'}, 135 | ]) 136 | torch.manual_seed(5678) 137 | response, history = model.chat(tokenizer, query=query, history=None) 138 | print(response) 139 | ``` 140 | 141 | 다음과 유사한 출력이 나올 것입니다. 142 | 143 | > 第一张图片是重庆的城市天际线,它反映了现代都市的繁华与喧嚣。第二张图片是北京的天际线,它象征着中国首都的现代化和国际化。两座城市都是中国的重要城市,拥有独特的文化和发展历史。 144 | 145 | **도시 비교는 상당히 주관적인 질문이므로 모델에 의해 생성된 응답에는 매우 다양하게 무작위의 시드가 적용될 수 있다는 점을 유의하세요. ``torch.manual_seed(5678)```를 사용하여 무작위 시드를 설정하지 않으면 매번 출력이 달라집니다. 랜덤 시드를 설정하더라도 하드웨어 및 소프트웨어 환경의 차이로 인해 얻은 결과가 이 튜토리얼과 다를 수 있습니다**. 146 | 147 | 148 | ### **Grounding Capability** 149 | 튜토리얼의 마지막 섹션에서는 Qwen-VL-Chat 모델이 바운딩 박스를 생성하는 기능을 보여드립니다. Qwen-VL-Chat은 언어 설명에 따라 직사각형 상자로 이미지의 지정된 영역에 프레임을 지정할 수 있습니다. 다소 추상적일 수 있으므로 다음 예제를 살펴보겠습니다. 
아래 그림과 같이 ```assets/mm_tutorial/Shanghai.jpg``` 파일은 상하이의 사진이며, 모델에게 일반 프롬프트로 이미지를 설명하도록 요청하는 것으로 시작하겠습니다. 150 | 151 | ![](assets/mm_tutorial/Shanghai_Small.jpeg) 152 | 153 | ```python 154 | torch.manual_seed(1234) 155 | query = tokenizer.from_list_format([ 156 | {'image': 'assets/mm_tutorial/Shanghai.jpg'}, 157 | {'text': '图里有啥'}, 158 | ]) 159 | response, history = model.chat(tokenizer, query=query, history=None) 160 | print(response) 161 | ``` 162 | 163 | 다음과 유사한 출력을 보실 수 있습니다. 164 | 165 | > 图中是中国上海的天际线,包括了上海塔、金茂大厦、上海环球金融中心、海洋大厦等著名建筑。 166 | 167 | 다음으로 ``请给我框出图中上海环球金融中心和东方明珠``라는 프롬프트를 사용하여 모델과 대화하고 어떤 일이 발생하는지 살펴봅시다. 이 시점에서 ``history=history``를 사용하여 이전 대화의 기록을 ``model.chat``에 전달해야 합니다. 168 | 169 | ```python 170 | query = tokenizer.from_list_format([ 171 | {'text': '请给我框出图中上海环球金融中心和东方明珠'}, 172 | ]) 173 | response, history = model.chat(tokenizer, query=query, history=history) 174 | print(response) 175 | ``` 176 | 177 | 다음과 유사한 출력을 보실 수 있습니다. 178 | 179 | ```xml 180 | <ref>上海环球金融中心</ref><box>(667,437),(760,874)</box>和<ref>东方明珠</ref><box>(506,75),(582,946)</box> 181 | ``` 182 | 183 | Qwen-VL-Chat 모델에는 손이 없지만 사용자의 요청을 거부하지도 않습니다. 대신 "이상한" 결과를 출력하는데, 실제로 이 모델의 출력은 上海环球金融中心(상하이 월드 파이낸셜 센터)와 东方明珠(동방명주)의 위치를 마크업 언어로 제공합니다. 다음 코드를 사용하여 시각화할 수 있습니다. 184 | 185 | ```python 186 | image = tokenizer.draw_bbox_on_latest_picture(response, history) 187 | image.save('Shanghai_Output.jpg') 188 | ``` 189 | 저장된 ```Shanghai_Output.jpg``` 파일은 아래 스크린샷과 유사하게 보일 것입니다. 190 | 191 | ![](assets/mm_tutorial/Shanghai_Output_Small.jpeg) 192 | 193 | 그 후에도 이전처럼 Qwen-VL-Chat으로 계속 채팅할 수 있습니다. 194 | 195 | ```python 196 | query = tokenizer.from_list_format([ 197 | {'text': '帮我写个这座城市的旅游计划'}, 198 | ]) 199 | response, history = model.chat(tokenizer, query=query, history=history) 200 | print(response) 201 | ``` 202 | 203 | 다음과 유사한 출력을 보실 수 있습니다. 204 | 205 | > 好的,以下是一个简单的上海旅游计划: 206 | > 207 | > 第一天: 208 | > 上午:抵达上海,前往酒店办理入住手续。 209 | > 中午:享用午餐后,前往外滩,欣赏黄浦江畔的美景,游览上海地标性建筑如浦发银行大楼、汇丰银行大楼等。 210 | > 下午:游览南京路步行街,购买特色礼品或品尝当地美食。 211 | > 晚上:在南京路附近的餐厅享用晚餐,然后去看上海的夜景。 212 | > 213 | > 第二天: 214 | > 上午:前往上海科技馆,了解科技发展历史,观看各种科技展览。 215 | > 中午:在科技馆附近的餐厅享用午餐。 216 | > 下午:游览世纪公园,欣赏美景并放松身心。 217 | > 晚上:在南京路或附近的陆家嘴地区享用晚餐,然后去看上海的夜景。 218 | > 219 | > 第三天: 220 | > 上午:游览上海迪士尼乐园或上海海昌海洋公园,与各种迪士尼角色互动,或者在海洋公园观看海洋生物表演。 221 | > 中午:在迪士尼乐园或海洋公园附近的餐厅享用午餐。 222 | > 下午:自由活动,可以去购物、品尝当地美食或者去博物馆等。 223 | > 晚上:在酒店附近享用晚餐,然后离开上海。 224 | > 225 | > 当然,以上只是一个简单的计划,上海有许多其他景点和活动,例如参观上海博物馆、游览田子坊、观看上海话剧等。具体计划可以根据个人兴趣和时间进行调整。 226 | 227 | **여행 계획은 상당히 주관적인 질문이므로 모델이 생성하는 응답은 매우 높은 무작위성을 가질 수 있다는 점에 유의하세요. ``torch.manual_seed(1234)``를 사용하여 무작위 시드를 설정하지 않으면 매번 다른 출력이 나오게 됩니다. 랜덤 시드를 일정하게 설정하더라도 하드웨어 및 소프트웨어 환경의 차이로 인해 얻은 결과가 이 튜토리얼과 다를 수 있습니다**. 228 | 229 | ### Grounded Captioning 230 | Qwen-VL은 다음과 같이 이미지 캡션을 생성하면서 피사체의 바운딩 박스 정보를 함께 출력할 수 있습니다. 231 | 232 | ``` 233 | img_url = 'assets/apple.jpeg' 234 | query = tokenizer.from_list_format([ 235 | {'image': img_url}, 236 | {'text': 'Generate the caption in English with grounding:'}, 237 | ]) 238 | response, history = model.chat(tokenizer, query=query, history=None) 239 | print(response) 240 | 241 | image = tokenizer.draw_bbox_on_latest_picture(response, history) 242 | if image is not None: 243 | image.save('apple.jpg') 244 | ``` 245 | 246 | 저장된 ``apple.jpg`` 이미지는 아래 스크린샷과 비슷하게 보일 것입니다. 247 | <p align="left"> 248 | <img src="assets/apple_r.jpeg" width="600"/> 249 | </p> 250 | 251 | #### How to get the caption without any box-like annotations 252 | 때로는 응답에 박스형 주석이 없을 수도 있습니다. 이 경우 다음과 같은 후처리를 통해 안정적으로 정리된 텍스트를 얻을 수 있습니다.
253 | 254 | ``` 255 | # response = '<ref> Two apples</ref><box>(302,257),(582,671)</box><box>(603,252),(878,642)</box> and<ref> a bowl</ref><box>(2,269),(304,674)</box>' 256 | import re 257 | clean_response = re.sub(r'<ref>(.*?)</ref>(?:<box>.*?</box>)*(?:<quad>.*?</quad>)*', r'\1', response).strip() 258 | print(clean_response) 259 | # clean_response = 'Two apples and a bowl' 260 | ``` 261 | -------------------------------------------------------------------------------- /TUTORIAL_zh.md: -------------------------------------------------------------------------------- 1 | # Qwen-VL-Chat使用教程 2 | Qwen-VL-Chat是通用多模态大规模语言模型,因此它可以完成多种视觉语言任务。在本教程之中,我们会给出一些简明的例子,用以展示Qwen-VL-Chat在**视觉问答,文字理解,图表数学推理,多图理解和Grounding**(根据指令标注图片中指定区域的包围框)等多方面的能力。请注意,展示的例子远非Qwen-VL-Chat能力的极限,**您可以通过更换不同的输入图像和提示词(Prompt),来进一步挖掘Qwen-VL-Chat的能力!** 3 | 4 | ## 初始化Qwen-VL-Chat模型 5 | 在使用Qwen-VL-Chat之前,您首先需要初始化Qwen-VL-Chat的分词器(Tokenizer)和Qwen-VL-Chat的模型: 6 | ```python 7 | import torch 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | from transformers.generation import GenerationConfig 10 | 11 | # 如果您希望结果可复现,可以设置随机数种子。 12 | # torch.manual_seed(1234) 13 | 14 | tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True) 15 | 16 | model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-VL-Chat", device_map="cuda", trust_remote_code=True).eval() 17 | model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True) 18 | ``` 19 | 在执行完上述代码后,```tokenizer```将对应Qwen-VL-Chat使用的分词器,而```model```将对应Qwen-VL-Chat的模型。```tokenizer```用于对图文混排输入进行分词和预处理,而```model```则是Qwen-VL-Chat模型本身。 20 | 21 | ## 使用Qwen-VL-Chat 22 | ### **多轮视觉问答** 23 | #### **第一个问题** 24 | 首先我们来看一个最简单的例子,如下图所示,文件```assets/mm_tutorial/Rebecca_(1939_poster).jpeg```是1940年电影Rebecca的于1939发布的海报。 25 | 26 | ![](assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg) 27 | 28 | 我们来问一问Qwen-VL-Chat海报上电影的名称是什么。首先,我们使用tokenizer.from_list_format可以对图文混排输入进行分词与处理: 29 | ```python 30 | query = tokenizer.from_list_format([ 31 | {'image': 'assets/mm_tutorial/Rebecca_(1939_poster).jpeg'}, 32 | {'text': 'What is the name of the movie in the poster?'}, 33 | ]) 34 | ``` 35 | 接下来,我们可以使用```model.chat```向Qwen-VL-Chat模型提问并获得回复。注意在第一次提问时,对话历史为空,因此我们使用```history=None```。 36 | ```python 37 | response, history = model.chat(tokenizer, query=query, history=None) 38 | print(response) 39 | ``` 40 | 您应该会得到类似下列的输出结果: 41 | 42 | > The name of the movie in the poster is "Rebecca." 43 | 44 | 这说明模型正确的回答了问题!根据海报,该电影的名称的确是**Rebecca**。 45 | 46 | #### **多轮问答** 47 | 我们还可以继续向模型发问,例如询问电影的导演是谁。在后续提问时,对话历史并不为空,我们使用```history=history```向```model.chat```传递之前的对话历史: 48 | ```python 49 | query = tokenizer.from_list_format([ 50 | {'text': 'Who directed this movie?'}, 51 | ]) 52 | response, history = model.chat(tokenizer, query=query, history=history) 53 | print(response) 54 | ``` 55 | 56 | 您应该会得到类似下列的输出结果: 57 | 58 | > The movie "Rebecca" was directed by Alfred Hitchcock. 
59 | 60 | 模型再次正确回答了问题!根据海报,该电影的导演是Alfred Hitchcock。 61 | 62 | ### **文字理解** 63 | Qwen-VL-Chat具有一定的针对包含密集文字图片的理解能力。如下图所示,文件```assets/mm_tutorial/Hospital.jpeg```是一个包含密集文字的医院指示牌。 64 | 65 | ![](assets/mm_tutorial/Hospital_Small.jpg) 66 | 67 | 我们可以像之前一样向模型询问医院中各个科室的位置,对话历史为空,因此我们使用```history=None```。 68 | ```python 69 | query = tokenizer.from_list_format([ 70 | {'image': 'assets/mm_tutorial/Hospital.jpg'}, 71 | {'text': 'Based on the photo, which floor is the Department of Otorhinolaryngology on?'}, 72 | ]) 73 | response, history = model.chat(tokenizer, query=query, history=None) 74 | print(response) 75 | ``` 76 | 77 | 您应该会得到类似下列的输出结果: 78 | 79 | > The Department of Otorhinolaryngology is located on the 4th floor. 80 | 81 | 您同样可以进一步提出后续问题,此时需要使用```history=history```向```model.chat```传递之前的对话历史。 82 | 83 | ```python 84 | query = tokenizer.from_list_format([ 85 | {'text': 'Based on the photo, which floor is the Department of Surgery on?'}, 86 | ]) 87 | response, history = model.chat(tokenizer, query=query, history=history) 88 | print(response) 89 | ``` 90 | 91 | 您应该会得到类似下列的输出结果: 92 | 93 | > The Department of Surgery is located on the 3rd floor. 94 | 95 | ### **图表数学推理** 96 | 利用模型的图表理解和数学推理能力,Qwen-VL-Chat还可以完成更复杂的一些任务!如下图所示,文件```assets/mm_tutorial/Menu.jpeg```展示了一家餐厅的菜单。现在我们想知道,如果购买两个Salmon Burger和三个Meat Lover's Pizza需要花多少钱呢? 97 | 98 | ![](assets/mm_tutorial/Menu.jpeg) 99 | 100 | ```python 101 | query = tokenizer.from_list_format([ 102 | {'image': 'assets/mm_tutorial/Menu.jpeg'}, 103 | {'text': 'How much would I pay if I want to order two Salmon Burger and three Meat Lover\'s Pizza? Think carefully step by step.'}, 104 | ]) 105 | response, history = model.chat(tokenizer, query=query, history=None) 106 | print(response) 107 | ``` 108 | 109 | ```Think carefully step by step.```是一个引导模型分步处理复杂任务的常见提示词,如果您需要完成的任务较为复杂,可以试着使用它来提高准确率。您应该会得到类似下列的输出结果: 110 | 111 | > To order two Salmon Burgers and three Meat Lover's Pizzas, you would need to pay the following: 112 | > 113 | > 1. For two Salmon Burgers: x2 Salmon Burgers at $10 each = $20 114 | > 2. For three Meat Lover's Pizzas: x3 Meat Lover's Pizzas at $12 each = $36 115 | > 116 | > Therefore, the total cost would be $56. 
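如果您希望对模型回答中的算术进行自动核对,可以参考下面这个简单的示意脚本(仅作演示:单价取自上面回答中给出的 $10 和 $12,变量 ```response``` 即上文 ```model.chat``` 返回的回复):

```python
import re

# 按照上面回答中给出的单价独立核算一次总价(Salmon Burger 每个 $10,Meat Lover's Pizza 每个 $12)
expected_total = 2 * 10 + 3 * 12  # = 56

# 从模型回复中提取最后出现的美元金额(例如 "$56"),并与核算结果进行比较
amounts = re.findall(r'\$(\d+(?:\.\d+)?)', response)
model_total = float(amounts[-1]) if amounts else None
print(expected_total, model_total, model_total == expected_total)
```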
117 | 118 | ### **多图理解与中文输入** 119 | 在之前的例子中,我们主要展示了Qwen-VL-Chat针对单张图像和英文问题的问答能力。但实际上,Qwen-VL-Chat是支持中文输入的多语言模型,而且也支持多张图片的输入!下面的例子中,我们用中文让Qwen-VL-Chat来为我们比较重庆和北京这两个城市的照片(```assets/mm_tutorial/Chongqing.jpeg```和```assets/mm_tutorial/Beijing.jpeg```): 120 | 121 | ![](assets/mm_tutorial/Chongqing_Small.jpeg) 122 | 123 | ![](assets/mm_tutorial/Beijing_Small.jpeg) 124 | 125 | ```python 126 | query = tokenizer.from_list_format([ 127 | {'image': 'assets/mm_tutorial/Chongqing.jpeg'}, 128 | {'image': 'assets/mm_tutorial/Beijing.jpeg'}, 129 | {'text': '上面两张图片分别是哪两个城市?请对它们进行对比。'}, 130 | ]) 131 | torch.manual_seed(5678) 132 | response, history = model.chat(tokenizer, query=query, history=None) 133 | print(response) 134 | ``` 135 | 136 | 您应该会得到类似下列的输出结果: 137 | 138 | > 第一张图片是重庆的城市天际线,它反映了现代都市的繁华与喧嚣。第二张图片是北京的天际线,它象征着中国首都的现代化和国际化。两座城市都是中国的重要城市,拥有独特的文化和发展历史。 139 | 140 | **请注意,城市间的比较是一个具有相当主观性的问题,因此模型产生的回复可能具有相当高的随机性。若不使用```torch.manual_seed(5678)```设置随机数种子,每次的输出结果会不一样。即使您设置了随机数种子,由于软硬件环境的差异,得到的结果也可能与本文档中的有所不同。** 141 | 142 | ### **Grounding能力** 143 | 在最后,我们展示Qwen-VL-Chat模型产生包围框的能力。Qwen-VL-Chat可以根据您的语言描述,在图像中用矩形框框出指定区域。这样说可能有些抽象,让我们来看下面的例子。如下图所示,文件```assets/mm_tutorial/Shanghai.jpg```是上海的一张照片,我们先用常规的提示词,问一下模型图里有什么。 144 | 145 | ![](assets/mm_tutorial/Shanghai_Small.jpeg) 146 | 147 | ```python 148 | torch.manual_seed(1234) 149 | query = tokenizer.from_list_format([ 150 | {'image': 'assets/mm_tutorial/Shanghai.jpg'}, 151 | {'text': '图里有啥'}, 152 | ]) 153 | response, history = model.chat(tokenizer, query=query, history=None) 154 | print(response) 155 | ``` 156 | 157 | 您应该会得到类似下列的输出结果: 158 | 159 | > 图中是中国上海的天际线,包括了上海塔、金茂大厦、上海环球金融中心、海洋大厦等著名建筑。 160 | 161 | 接下来,我们通过使用```请给我框出图中上海环球金融中心和东方明珠```这个提示词来和模型对话,看看会发生什么。注意此时需要使用```history=history```向```model.chat```传递之前的对话历史。 162 | ```python 163 | query = tokenizer.from_list_format([ 164 | {'text': '请给我框出图中上海环球金融中心和东方明珠'}, 165 | ]) 166 | response, history = model.chat(tokenizer, query=query, history=history) 167 | print(response) 168 | ``` 169 | 您应该会得到类似下列的输出结果: 170 | ```xml 171 | <ref>上海环球金融中心</ref><box>(667,437),(760,874)</box>和<ref>东方明珠</ref><box>(506,75),(582,946)</box> 172 | ``` 173 | Qwen-VL-Chat模型没有手,但也没有拒绝您的请求,而是输出了一些“奇怪”的东西——并不是,实际上,模型的输出以标记语言的形式给出了上海环球金融中心和东方明珠在图中的具体位置。您可以使用下列代码将其可视化: 174 | ```python 175 | image = tokenizer.draw_bbox_on_latest_picture(response, history) 176 | image.save('Shanghai_Output.jpg') 177 | ``` 178 | 保存下来的```Shanghai_Output.jpg```结果将类似于下面的截图: 179 | 180 | ![](assets/mm_tutorial/Shanghai_Output_Small.jpeg) 181 | 182 | 在此之后,您还可以继续照常和Qwen-VL-Chat对话: 183 | ```python 184 | query = tokenizer.from_list_format([ 185 | {'text': '帮我写个这座城市的旅游计划'}, 186 | ]) 187 | response, history = model.chat(tokenizer, query=query, history=history) 188 | print(response) 189 | ``` 190 | 191 | 您应该会得到类似下列的输出结果: 192 | 193 | > 好的,以下是一个简单的上海旅游计划: 194 | > 195 | > 第一天: 196 | > 上午:抵达上海,前往酒店办理入住手续。 197 | > 中午:享用午餐后,前往外滩,欣赏黄浦江畔的美景,游览上海地标性建筑如浦发银行大楼、汇丰银行大楼等。 198 | > 下午:游览南京路步行街,购买特色礼品或品尝当地美食。 199 | > 晚上:在南京路附近的餐厅享用晚餐,然后去看上海的夜景。 200 | > 201 | > 第二天: 202 | > 上午:前往上海科技馆,了解科技发展历史,观看各种科技展览。 203 | > 中午:在科技馆附近的餐厅享用午餐。 204 | > 下午:游览世纪公园,欣赏美景并放松身心。 205 | > 晚上:在南京路或附近的陆家嘴地区享用晚餐,然后去看上海的夜景。 206 | > 207 | > 第三天: 208 | > 上午:游览上海迪士尼乐园或上海海昌海洋公园,与各种迪士尼角色互动,或者在海洋公园观看海洋生物表演。 209 | > 中午:在迪士尼乐园或海洋公园附近的餐厅享用午餐。 210 | > 下午:自由活动,可以去购物、品尝当地美食或者去博物馆等。 211 | > 晚上:在酒店附近享用晚餐,然后离开上海。 212 | > 213 | > 当然,以上只是一个简单的计划,上海有许多其他景点和活动,例如参观上海博物馆、游览田子坊、观看上海话剧等。具体计划可以根据个人兴趣和时间进行调整。 214 | 215 | 
**请注意,旅游计划是一个具有相当主观性的问题,因此模型产生的回复可能具有相当高的随机性。若不使用```torch.manual_seed(1234)```设置随机数种子,每次的输出结果会不一样。即使您设置了随机数种子,由于软硬件环境的差异,得到的结果也可能与本文档中的有所不同。** 216 | -------------------------------------------------------------------------------- /assets/apple.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/apple.jpeg -------------------------------------------------------------------------------- /assets/apple_r.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/apple_r.jpeg -------------------------------------------------------------------------------- /assets/demo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/demo.jpeg -------------------------------------------------------------------------------- /assets/demo_highfive.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/demo_highfive.jpg -------------------------------------------------------------------------------- /assets/demo_spotting_caption.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/demo_spotting_caption.jpg -------------------------------------------------------------------------------- /assets/demo_vl.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/demo_vl.gif -------------------------------------------------------------------------------- /assets/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/logo.jpg -------------------------------------------------------------------------------- /assets/mm_tutorial/Beijing.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Beijing.jpeg -------------------------------------------------------------------------------- /assets/mm_tutorial/Beijing_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Beijing_Small.jpeg -------------------------------------------------------------------------------- /assets/mm_tutorial/Chongqing.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Chongqing.jpeg -------------------------------------------------------------------------------- /assets/mm_tutorial/Chongqing_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Chongqing_Small.jpeg 
-------------------------------------------------------------------------------- /assets/mm_tutorial/Hospital.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Hospital.jpg -------------------------------------------------------------------------------- /assets/mm_tutorial/Hospital_Small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Hospital_Small.jpg -------------------------------------------------------------------------------- /assets/mm_tutorial/Menu.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Menu.jpeg -------------------------------------------------------------------------------- /assets/mm_tutorial/Rebecca_(1939_poster).jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Rebecca_(1939_poster).jpeg -------------------------------------------------------------------------------- /assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Rebecca_(1939_poster)_Small.jpeg -------------------------------------------------------------------------------- /assets/mm_tutorial/Shanghai.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Shanghai.jpg -------------------------------------------------------------------------------- /assets/mm_tutorial/Shanghai_Output.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Shanghai_Output.jpg -------------------------------------------------------------------------------- /assets/mm_tutorial/Shanghai_Output_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Shanghai_Output_Small.jpeg -------------------------------------------------------------------------------- /assets/mm_tutorial/Shanghai_Small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/Shanghai_Small.jpeg -------------------------------------------------------------------------------- /assets/mm_tutorial/TUTORIAL.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/mm_tutorial/TUTORIAL.ipynb -------------------------------------------------------------------------------- /assets/qwenvl.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/qwenvl.jpeg -------------------------------------------------------------------------------- /assets/radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/radar.png -------------------------------------------------------------------------------- /assets/radar_qwenvlplus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/radar_qwenvlplus.jpg -------------------------------------------------------------------------------- /assets/touchstone_datasets.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/touchstone_datasets.jpg -------------------------------------------------------------------------------- /assets/touchstone_eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/touchstone_eval.png -------------------------------------------------------------------------------- /assets/touchstone_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/touchstone_logo.png -------------------------------------------------------------------------------- /assets/wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/assets/wechat.png -------------------------------------------------------------------------------- /eval_mm/data: -------------------------------------------------------------------------------- 1 | /cpfs01/shared/public/shusheng.yss/datasets/qwenvl_evaluation -------------------------------------------------------------------------------- /eval_mm/evaluate_caption.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import json 4 | import os 5 | import random 6 | import time 7 | from functools import partial 8 | 9 | import torch 10 | from pycocoevalcap.eval import COCOEvalCap 11 | from pycocotools.coco import COCO 12 | from tqdm import tqdm 13 | from transformers import AutoModelForCausalLM, AutoTokenizer 14 | 15 | ds_collections = { 16 | 'flickr': { 17 | 'train': 'data/flickr30k/flickr30k_karpathy_test.json', 18 | 'test': 'data/flickr30k/flickr30k_karpathy_test.json', 19 | }, 20 | 'nocaps': { 21 | 'train': '', 22 | 'test': 'data/nocaps/nocaps_val.json', 23 | }, 24 | } 25 | 26 | 27 | class CaptionDataset(torch.utils.data.Dataset): 28 | 29 | def __init__(self, train, test, prompt, few_shot=0): 30 | self.images = json.load(open(test))['images'] 31 | self.prompt = prompt 32 | 33 | self.few_shot = few_shot 34 | if few_shot > 0: 35 | self.train = json.load(open(train))['annotations'] 36 | 37 | def __len__(self): 38 | return len(self.images) 39 | 40 | def __getitem__(self, idx): 41 | image_id, image_path = self.images[idx]['id'], self.images[idx][ 42 | 'image'] 43 | 44 | few_shot_prompt = '' 45 | if self.few_shot > 0: 46 | few_shot_samples = 
random.sample(self.train, self.few_shot) 47 | for sample in few_shot_samples: 48 | few_shot_prompt += self.prompt.format( 49 | sample['image']) + f" {sample['caption']}" 50 | 51 | return { 52 | 'image_id': image_id, 53 | 'input_text': few_shot_prompt + self.prompt.format(image_path) 54 | } 55 | 56 | 57 | def collate_fn(inputs, tokenizer): 58 | 59 | image_ids = [_['image_id'] for _ in inputs] 60 | input_texts = [_['input_text'] for _ in inputs] 61 | input_tokens = tokenizer(input_texts, 62 | return_tensors='pt', 63 | padding='longest') 64 | 65 | return image_ids, input_tokens.input_ids, input_tokens.attention_mask 66 | 67 | 68 | class InferenceSampler(torch.utils.data.sampler.Sampler): 69 | 70 | def __init__(self, size): 71 | self._size = int(size) 72 | assert size > 0 73 | self._rank = torch.distributed.get_rank() 74 | self._world_size = torch.distributed.get_world_size() 75 | self._local_indices = self._get_local_indices(size, self._world_size, 76 | self._rank) 77 | 78 | @staticmethod 79 | def _get_local_indices(total_size, world_size, rank): 80 | shard_size = total_size // world_size 81 | left = total_size % world_size 82 | shard_sizes = [shard_size + int(r < left) for r in range(world_size)] 83 | 84 | begin = sum(shard_sizes[:rank]) 85 | end = min(sum(shard_sizes[:rank + 1]), total_size) 86 | return range(begin, end) 87 | 88 | def __iter__(self): 89 | yield from self._local_indices 90 | 91 | def __len__(self): 92 | return len(self._local_indices) 93 | 94 | 95 | if __name__ == '__main__': 96 | 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument('--checkpoint', type=str, default='') 99 | parser.add_argument('--dataset', type=str, default='') 100 | parser.add_argument('--batch-size', type=int, default=1) 101 | parser.add_argument('--num-workers', type=int, default=1) 102 | parser.add_argument('--few-shot', type=int, default=0) 103 | parser.add_argument('--seed', type=int, default=0) 104 | args = parser.parse_args() 105 | 106 | torch.distributed.init_process_group( 107 | backend='nccl', 108 | world_size=int(os.getenv('WORLD_SIZE', '1')), 109 | rank=int(os.getenv('RANK', '0')), 110 | ) 111 | 112 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 113 | 114 | prompt = '<img>{}</img>Describe the image in English:' 115 | 116 | model = AutoModelForCausalLM.from_pretrained( 117 | args.checkpoint, device_map='cuda', trust_remote_code=True).eval() 118 | 119 | tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, 120 | trust_remote_code=True) 121 | tokenizer.padding_side = 'left' 122 | tokenizer.pad_token_id = tokenizer.eod_id 123 | 124 | random.seed(args.seed) 125 | dataset = CaptionDataset( 126 | train=ds_collections[args.dataset]['train'], 127 | test=ds_collections[args.dataset]['test'], 128 | prompt=prompt, 129 | few_shot=args.few_shot, 130 | ) 131 | coco_karpathy_test_loader = torch.utils.data.DataLoader( 132 | dataset=dataset, 133 | sampler=InferenceSampler(len(dataset)), 134 | batch_size=args.batch_size, 135 | num_workers=args.num_workers, 136 | pin_memory=True, 137 | drop_last=False, 138 | collate_fn=partial(collate_fn, tokenizer=tokenizer), 139 | ) 140 | 141 | image_ids = [] 142 | captions = [] 143 | for _, (ids, input_ids, 144 | attention_mask) in tqdm(enumerate(coco_karpathy_test_loader)): 145 | pred = model.generate( 146 | input_ids=input_ids.cuda(), 147 | attention_mask=attention_mask.cuda(), 148 | do_sample=False, 149 | num_beams=1, 150 | max_new_tokens=30, 151 | min_new_tokens=8, 152 | length_penalty=0, 153 | num_return_sequences=1, 154 | use_cache=True, 155 | 
pad_token_id=tokenizer.eod_id, 156 | eos_token_id=tokenizer.eod_id, 157 | ) 158 | image_ids.extend(ids) 159 | captions.extend([ 160 | tokenizer.decode(_[input_ids.size(1):].cpu(), 161 | skip_special_tokens=True).strip() for _ in pred 162 | ]) 163 | 164 | torch.distributed.barrier() 165 | 166 | world_size = torch.distributed.get_world_size() 167 | merged_ids = [None for _ in range(world_size)] 168 | merged_captions = [None for _ in range(world_size)] 169 | torch.distributed.all_gather_object(merged_ids, image_ids) 170 | torch.distributed.all_gather_object(merged_captions, captions) 171 | 172 | merged_ids = [_ for _ in itertools.chain.from_iterable(merged_ids)] 173 | merged_captions = [ 174 | _ for _ in itertools.chain.from_iterable(merged_captions) 175 | ] 176 | 177 | if torch.distributed.get_rank() == 0: 178 | print(f"Evaluating {args.dataset} ...") 179 | 180 | results = [] 181 | for image_id, caption in zip(merged_ids, merged_captions): 182 | results.append({ 183 | 'image_id': int(image_id), 184 | 'caption': caption, 185 | }) 186 | time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime()) 187 | results_file = f'{args.dataset}_{time_prefix}.json' 188 | json.dump(results, open(results_file, 'w')) 189 | 190 | coco = COCO(ds_collections[args.dataset]['test']) 191 | coco_result = coco.loadRes(results_file) 192 | coco_eval = COCOEvalCap(coco, coco_result) 193 | coco_eval.evaluate() 194 | 195 | print(coco_eval.eval.items()) 196 | torch.distributed.barrier() 197 | -------------------------------------------------------------------------------- /eval_mm/evaluate_grounding.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import json 4 | import os 5 | import re 6 | from functools import partial 7 | 8 | import torch 9 | from torchvision.ops.boxes import box_area 10 | from tqdm import tqdm 11 | from transformers import AutoModelForCausalLM, AutoTokenizer 12 | 13 | ds_collections = { 14 | 'refcoco_val': 'data/refcoco/refcoco_val.jsonl', 15 | 'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl', 16 | 'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl', 17 | 'refcoco+_val': 'data/refcoco+/refcoco+_val.jsonl', 18 | 'refcoco+_testA': 'data/refcoco+/refcoco+_testA.jsonl', 19 | 'refcoco+_testB': 'data/refcoco+/refcoco+_testB.jsonl', 20 | 'refcocog_val': 'data/refcocog/refcocog_val.jsonl', 21 | 'refcocog_test': 'data/refcocog/refcocog_test.jsonl', 22 | } 23 | 24 | 25 | def box_iou(boxes1, boxes2): 26 | area1 = box_area(boxes1) 27 | area2 = box_area(boxes2) 28 | 29 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 30 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 31 | 32 | wh = (rb - lt).clamp(min=0) # [N,M,2] 33 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 34 | 35 | union = area1[:, None] + area2 - inter 36 | 37 | iou = inter / union 38 | return iou, union 39 | 40 | 41 | def collate_fn(batches, tokenizer): 42 | 43 | texts = [_['text'] for _ in batches] 44 | bboxes = [_['bbox'] for _ in batches] 45 | hws = [_['hw'] for _ in batches] 46 | 47 | input_ids = tokenizer(texts, return_tensors='pt', padding='longest') 48 | 49 | return input_ids.input_ids, input_ids.attention_mask, bboxes, hws 50 | 51 | 52 | class RefCOCODataset(torch.utils.data.Dataset): 53 | 54 | def __init__(self, test, tokenizer, prompt): 55 | self.datas = open(test).readlines() 56 | self.tokenizer = tokenizer 57 | self.prompt = prompt 58 | 59 | def __len__(self): 60 | return len(self.datas) 61 | 62 | def __getitem__(self, idx): 63 | data 
= json.loads(self.datas[idx].strip()) 64 | image = data['image'] 65 | text = data['sent'] 66 | bbox = data['bbox'] 67 | 68 | w, h = data['width'], data['height'] 69 | 70 | return { 71 | 'text': self.prompt.format(image, text), 72 | 'bbox': bbox, 73 | 'hw': (h, w), 74 | } 75 | 76 | 77 | class InferenceSampler(torch.utils.data.sampler.Sampler): 78 | 79 | def __init__(self, size): 80 | self._size = int(size) 81 | assert size > 0 82 | self._rank = torch.distributed.get_rank() 83 | self._world_size = torch.distributed.get_world_size() 84 | self._local_indices = self._get_local_indices(size, self._world_size, 85 | self._rank) 86 | 87 | @staticmethod 88 | def _get_local_indices(total_size, world_size, rank): 89 | shard_size = total_size // world_size 90 | left = total_size % world_size 91 | shard_sizes = [shard_size + int(r < left) for r in range(world_size)] 92 | 93 | begin = sum(shard_sizes[:rank]) 94 | end = min(sum(shard_sizes[:rank + 1]), total_size) 95 | return range(begin, end) 96 | 97 | def __iter__(self): 98 | yield from self._local_indices 99 | 100 | def __len__(self): 101 | return len(self._local_indices) 102 | 103 | 104 | if __name__ == '__main__': 105 | 106 | parser = argparse.ArgumentParser() 107 | parser.add_argument('--checkpoint', type=str, default='') 108 | parser.add_argument('--dataset', type=str, default='') 109 | parser.add_argument('--batch-size', type=int, default=1) 110 | parser.add_argument('--num-workers', type=int, default=1) 111 | args = parser.parse_args() 112 | 113 | torch.distributed.init_process_group( 114 | backend='nccl', 115 | world_size=int(os.getenv('WORLD_SIZE', '1')), 116 | rank=int(os.getenv('RANK', '0')), 117 | ) 118 | 119 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 120 | 121 | model = AutoModelForCausalLM.from_pretrained( 122 | args.checkpoint, device_map='cuda', trust_remote_code=True).eval() 123 | 124 | tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, 125 | trust_remote_code=True) 126 | tokenizer.padding_side = 'left' 127 | tokenizer.pad_token_id = tokenizer.eod_id 128 | 129 | prompt = '<img>{}</img><ref>{}</ref><box>' 130 | 131 | dataset = RefCOCODataset(test=ds_collections[args.dataset], 132 | tokenizer=tokenizer, 133 | prompt=prompt) 134 | 135 | dataloader = torch.utils.data.DataLoader( 136 | dataset=dataset, 137 | sampler=InferenceSampler(len(dataset)), 138 | batch_size=args.batch_size, 139 | num_workers=args.num_workers, 140 | pin_memory=True, 141 | drop_last=True, 142 | collate_fn=partial(collate_fn, tokenizer=tokenizer), 143 | ) 144 | 145 | outputs = [] 146 | for _, (input_ids, attention_mask, bboxes, 147 | hws) in tqdm(enumerate(dataloader)): 148 | pred = model.generate( 149 | input_ids=input_ids.cuda(), 150 | attention_mask=attention_mask.cuda(), 151 | do_sample=False, 152 | num_beams=1, 153 | max_new_tokens=28, 154 | min_new_tokens=10, 155 | length_penalty=1, 156 | num_return_sequences=1, 157 | use_cache=True, 158 | pad_token_id=tokenizer.eod_id, 159 | eos_token_id=tokenizer.eod_id, 160 | ) 161 | answers = [ 162 | tokenizer.decode(_[input_ids.size(1):].cpu(), 163 | skip_special_tokens=True) for _ in pred 164 | ] 165 | 166 | for bbox, hw, answer in zip(bboxes, hws, answers): 167 | outputs.append({ 168 | 'answer': answer, 169 | 'gt_bbox': bbox, 170 | 'hw': hw, 171 | }) 172 | 173 | torch.distributed.barrier() 174 | 175 | world_size = torch.distributed.get_world_size() 176 | merged_outputs = [None for _ in range(world_size)] 177 | torch.distributed.all_gather_object(merged_outputs, outputs) 178 | 179 | merged_outputs = [_ 
for _ in itertools.chain.from_iterable(merged_outputs)] 180 | PATTERN = re.compile(r'\((.*?)\),\((.*?)\)') 181 | 182 | if torch.distributed.get_rank() == 0: 183 | correct = total_cnt = 0 184 | for i, output in enumerate(merged_outputs): 185 | predict_bbox = re.findall(PATTERN, output['answer']) 186 | try: 187 | if ',' not in predict_bbox[0][0] or ',' not in predict_bbox[0][ 188 | 1]: 189 | predict_bbox = (0., 0., 0., 0.) 190 | else: 191 | x1, y1 = [ 192 | float(tmp) for tmp in predict_bbox[0][0].split(',') 193 | ] 194 | x2, y2 = [ 195 | float(tmp) for tmp in predict_bbox[0][1].split(',') 196 | ] 197 | predict_bbox = (x1, y1, x2, y2) 198 | except: 199 | predict_bbox = (0., 0., 0., 0.) 200 | target_bbox = torch.tensor(output['gt_bbox'], 201 | dtype=torch.float32).view(-1, 4) 202 | predict_bbox = torch.tensor(predict_bbox, 203 | dtype=torch.float32).view(-1, 4) / 999 204 | predict_bbox[:, 0::2] *= output['hw'][1] 205 | predict_bbox[:, 1::2] *= output['hw'][0] 206 | iou, _ = box_iou(predict_bbox, target_bbox) 207 | iou = iou.item() 208 | total_cnt += 1 209 | if iou >= 0.5: 210 | correct += 1 211 | 212 | print(f"Evaluating {args.dataset} ...") 213 | print(f'Precision @ 1: {correct / total_cnt} \n') 214 | torch.distributed.barrier() 215 | -------------------------------------------------------------------------------- /eval_mm/evaluate_multiple_choice.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import json 4 | import os 5 | from functools import partial 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | multiple_choices = ['A', 'B', 'C', 'D', 'E'] 12 | 13 | ds_collections = { 14 | 'scienceqa_test_img': { 15 | 'test': 'data/scienceqa/scienceqa_test_img.jsonl', 16 | } 17 | } 18 | 19 | 20 | def collate_fn(batches, pad_token_id): 21 | 22 | input_tokens = [_['input_tokens'] for _ in batches] 23 | target_lengths = [_['target_lengths'] for _ in batches] 24 | answers = [_['answer'] for _ in batches] 25 | 26 | chunk_sizes = [len(_) for _ in input_tokens] 27 | 28 | input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)] 29 | 30 | max_lengths = max([len(_) for _ in input_tokens]) 31 | input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _ 32 | for _ in input_tokens] 33 | input_tokens = torch.LongTensor(input_tokens) 34 | 35 | attention_mask = 1 - input_tokens.eq(pad_token_id).float() 36 | 37 | return input_tokens, attention_mask, target_lengths, answers, chunk_sizes 38 | 39 | 40 | class MultipleChoiceDataste(torch.utils.data.Dataset): 41 | 42 | def __init__(self, test, prompt, tokenizer): 43 | self.datas = open(test).readlines() 44 | self.prompt = prompt 45 | self.tokenizer = tokenizer 46 | 47 | def __len__(self): 48 | return len(self.datas) 49 | 50 | def __getitem__(self, idx): 51 | 52 | data = json.loads(self.datas[idx].strip()) 53 | image = data['image'] 54 | hint = data['hint'] if data['hint'] else 'N/A' 55 | question = data['question'] 56 | 57 | choices = data['choices'] 58 | choice_list = [] 59 | for i, c in enumerate(choices): 60 | choice_list.append('{}. 
{}'.format(multiple_choices[i], c)) 61 | choice_txt = '\n'.join(choice_list) 62 | 63 | prompt = self.prompt.format(image, hint, question, choice_txt) 64 | 65 | prompt_tokens = self.tokenizer(prompt).input_ids 66 | target_tokens = [ 67 | self.tokenizer(' ' + _).input_ids 68 | for _ in multiple_choices[:len(choices)] 69 | ] 70 | 71 | return { 72 | 'input_tokens': [prompt_tokens + _ for _ in target_tokens], 73 | 'target_lengths': [len(_) for _ in target_tokens], 74 | 'answer': data['answer'], 75 | } 76 | 77 | 78 | class InferenceSampler(torch.utils.data.sampler.Sampler): 79 | 80 | def __init__(self, size): 81 | self._size = int(size) 82 | assert size > 0 83 | self._rank = torch.distributed.get_rank() 84 | self._world_size = torch.distributed.get_world_size() 85 | self._local_indices = self._get_local_indices(size, self._world_size, 86 | self._rank) 87 | 88 | @staticmethod 89 | def _get_local_indices(total_size, world_size, rank): 90 | shard_size = total_size // world_size 91 | left = total_size % world_size 92 | shard_sizes = [shard_size + int(r < left) for r in range(world_size)] 93 | 94 | begin = sum(shard_sizes[:rank]) 95 | end = min(sum(shard_sizes[:rank + 1]), total_size) 96 | return range(begin, end) 97 | 98 | def __iter__(self): 99 | yield from self._local_indices 100 | 101 | def __len__(self): 102 | return len(self._local_indices) 103 | 104 | 105 | if __name__ == '__main__': 106 | 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument('--checkpoint', type=str, default='') 109 | parser.add_argument('--dataset', type=str, default='') 110 | parser.add_argument('--batch-size', type=int, default=1) 111 | parser.add_argument('--num-workers', type=int, default=1) 112 | args = parser.parse_args() 113 | 114 | torch.distributed.init_process_group( 115 | backend='nccl', 116 | world_size=int(os.getenv('WORLD_SIZE', '1')), 117 | rank=int(os.getenv('RANK', '0')), 118 | ) 119 | 120 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 121 | 122 | model = AutoModelForCausalLM.from_pretrained( 123 | args.checkpoint, device_map='cuda', trust_remote_code=True).eval() 124 | 125 | tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, 126 | trust_remote_code=True) 127 | 128 | prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:' 129 | 130 | dataset = MultipleChoiceDataste(test=ds_collections[args.dataset]['test'], 131 | prompt=prompt, 132 | tokenizer=tokenizer) 133 | dataloader = torch.utils.data.DataLoader( 134 | dataset=dataset, 135 | sampler=InferenceSampler(len(dataset)), 136 | batch_size=args.batch_size, 137 | num_workers=args.num_workers, 138 | pin_memory=True, 139 | drop_last=False, 140 | collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id), 141 | ) 142 | 143 | results = [] 144 | with torch.no_grad(): 145 | for _, (input_tokens, attention_mask, target_lengths, answer, 146 | chunk_sizes) in tqdm(enumerate(dataloader)): 147 | 148 | outputs = model( 149 | input_ids=input_tokens[:, :-1].cuda(), 150 | attention_mask=attention_mask[:, :-1].cuda(), 151 | return_dict=True, 152 | ) 153 | losses = torch.nn.functional.cross_entropy(outputs.logits.permute( 154 | 0, 2, 1), 155 | input_tokens[:, 156 | 1:].cuda(), 157 | reduction='none') 158 | 159 | losses = losses.split(chunk_sizes, dim=0) 160 | 161 | for loss, target_length, answer in zip(losses, target_lengths, 162 | answer): 163 | 164 | target_loss = loss.mean(-1) 165 | for _ in range(len(target_length)): 166 | target_loss[_] = loss[_, -target_length[_]:].mean() 167 | pred = target_loss.argmin().item() 168 | if 
pred == answer: 169 | results.append(1) 170 | else: 171 | results.append(0) 172 | 173 | torch.distributed.barrier() 174 | 175 | world_size = torch.distributed.get_world_size() 176 | merged_results = [None for _ in range(world_size)] 177 | torch.distributed.all_gather_object(merged_results, results) 178 | 179 | merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)] 180 | 181 | if torch.distributed.get_rank() == 0: 182 | print(f"Evaluating {args.dataset} ...") 183 | print(f'Acc@1: {sum(merged_results) / len(merged_results)}') 184 | 185 | torch.distributed.barrier() 186 | -------------------------------------------------------------------------------- /eval_mm/infographicsvqa_eval.py: -------------------------------------------------------------------------------- 1 | # This file can be downloaded from: https://www.docvqa.org/datasets/infographicvqa and https://rrc.cvc.uab.es/?ch=17&com=introduction 2 | 3 | import os, json 4 | import argparse 5 | 6 | question_ids_to_exclude = [] 7 | 8 | # answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span', 'non span': 'None span', 'list': 'List'} 9 | answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span', 'non span': 'None span'} 10 | evidence_types = {'table/list': 'Table/list', 'textual': 'Text', 'photo/pciture/visual_objects': 'Visual/Layout', 'figure': 'Figure', 'map': 'Map'} 11 | reasoning_requirements = {'comparison': 'Sorting', 'arithmetic': 'Arithmetic', 'counting':'Counting'} 12 | 13 | 14 | def save_json(file_path, data): 15 | with open(file_path, 'w+') as json_file: 16 | json.dump(data, json_file) 17 | 18 | 19 | 20 | def levenshtein_distance(s1, s2): 21 | if len(s1) > len(s2): 22 | s1, s2 = s2, s1 23 | 24 | distances = range(len(s1) + 1) 25 | for i2, c2 in enumerate(s2): 26 | distances_ = [i2+1] 27 | for i1, c1 in enumerate(s1): 28 | if c1 == c2: 29 | distances_.append(distances[i1]) 30 | else: 31 | distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1]))) 32 | distances = distances_ 33 | return distances[-1] 34 | 35 | 36 | def validate_data(gtFilePath, submFilePath): 37 | """ 38 | Method validate_data: validates that all files in the results folder are correct (have the correct name contents). 39 | Validates also that there are no missing files in the folder. 40 | If some error detected, the method raises the error 41 | """ 42 | 43 | gtJson = json.load(open(gtFilePath,'rb')); 44 | submJson = json.load(open(submFilePath,'rb')); 45 | 46 | if not 'data' in gtJson: 47 | raise Exception("The GT file is not valid (no data key)") 48 | 49 | if not 'dataset_name' in gtJson: 50 | raise Exception("The GT file is not valid (no dataset_name key)") 51 | 52 | if isinstance(submJson, list) == False : 53 | raise Exception("The Det file is not valid (root item must be an array)") 54 | 55 | if len(submJson) != len(gtJson['data']) : 56 | raise Exception("The Det file is not valid (invalid number of answers. Expected:" + str(len(gtJson['data'])) + " Found:" + str(len(submJson)) + ")") 57 | 58 | gtQuestions = sorted([r['questionId'] for r in gtJson['data']]) 59 | res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)} 60 | detQuestions = sorted([r['questionId'] for r in submJson]) 61 | 62 | if( (gtQuestions == detQuestions) == False ): 63 | raise Exception("The Det file is not valid. 
Question IDs must much GT") 64 | 65 | for gtObject in gtJson['data']: 66 | 67 | try: 68 | q_id = int(gtObject['questionId']); 69 | res_ix = res_id_to_index[q_id]; 70 | 71 | except: 72 | raise Exception("The Det file is not valid. Question " + str(gtObject['questionId']) + " not present") 73 | 74 | else: 75 | detObject = submJson[res_ix]; 76 | 77 | # if detObject['questionId'] != gtObject['questionId'] : 78 | # raise Exception("Answer #" + str(i) + " not valid (invalid question ID. Expected:" + str(gtObject['questionId']) + "Found:" + detObject['questionId'] + ")") 79 | 80 | if not 'answer' in detObject: 81 | raise Exception("Question " + str(gtObject['questionId']) + " not valid (no answer key)") 82 | 83 | if isinstance(detObject['answer'], list) == True : 84 | raise Exception("Question " + str(gtObject['questionId']) + " not valid (answer key has to be a single string)") 85 | 86 | 87 | def evaluate_method(gtFilePath, submFilePath, evaluationParams): 88 | """ 89 | Method evaluate_method: evaluate method and returns the results 90 | Results. Dictionary with the following values: 91 | - method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 } 92 | - samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 } 93 | """ 94 | 95 | show_scores_per_answer_type = evaluationParams.answer_types 96 | 97 | gtJson = json.load(open(gtFilePath,'rb')); 98 | submJson = json.load(open(submFilePath,'rb')); 99 | 100 | res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)} 101 | 102 | 103 | perSampleMetrics = {} 104 | 105 | totalScore = 0 106 | row = 0 107 | 108 | if show_scores_per_answer_type: 109 | answerTypeTotalScore = {x:0 for x in answer_types.keys()} 110 | answerTypeNumQuestions = {x:0 for x in answer_types.keys()} 111 | 112 | evidenceTypeTotalScore = {x:0 for x in evidence_types.keys()} 113 | evidenceTypeNumQuestions = {x:0 for x in evidence_types.keys()} 114 | 115 | reasoningTypeTotalScore = {x:0 for x in reasoning_requirements.keys()} 116 | reasoningTypeNumQuestions = {x:0 for x in reasoning_requirements.keys()} 117 | 118 | for gtObject in gtJson['data']: 119 | 120 | q_id = int(gtObject['questionId']); 121 | res_ix = res_id_to_index[q_id]; 122 | detObject = submJson[res_ix]; 123 | 124 | if q_id in question_ids_to_exclude: 125 | question_result = 0 126 | info = 'Question EXCLUDED from the result' 127 | 128 | else: 129 | info = '' 130 | values = [] 131 | for answer in gtObject['answers']: 132 | # preprocess both the answers - gt and prediction 133 | gt_answer = ' '.join(answer.strip().lower().split()) 134 | det_answer = ' '.join(detObject['answer'].strip().lower().split()) 135 | 136 | #dist = levenshtein_distance(answer.lower(), detObject['answer'].lower()) 137 | dist = levenshtein_distance(gt_answer,det_answer) 138 | length = max( len(answer.upper()), len(detObject['answer'].upper()) ) 139 | values.append( 0.0 if length == 0 else float(dist) / float(length) ) 140 | 141 | question_result = 1 - min(values) 142 | 143 | if (question_result < evaluationParams.anls_threshold) : 144 | question_result = 0 145 | 146 | totalScore += question_result 147 | 148 | if show_scores_per_answer_type: 149 | for q_type in gtObject["answer_type"]: 150 | answerTypeTotalScore[q_type] += question_result 151 | answerTypeNumQuestions[q_type] += 1 152 | 153 | for q_type in gtObject["evidence"]: 154 | evidenceTypeTotalScore[q_type] += question_result 155 | evidenceTypeNumQuestions[q_type] += 1 156 | 157 | for 
q_type in gtObject["operation/reasoning"]: 158 | reasoningTypeTotalScore[q_type] += question_result 159 | reasoningTypeNumQuestions[q_type] += 1 160 | 161 | 162 | perSampleMetrics[str(gtObject['questionId'])] = { 163 | 'score':question_result, 164 | 'question':gtObject['question'], 165 | 'gt':gtObject['answers'], 166 | 'det':detObject['answer'], 167 | 'info': info 168 | } 169 | row = row + 1 170 | 171 | 172 | methodMetrics = { 173 | 'score': 0 if len(gtJson['data']) == 0 else totalScore/ (len(gtJson['data']) - len(question_ids_to_exclude) ) 174 | } 175 | 176 | answer_types_scores = {} 177 | evidence_types_scores = {} 178 | operation_types_scores = {} 179 | 180 | if show_scores_per_answer_type: 181 | for a_type, ref in answer_types.items(): 182 | answer_types_scores[ref] = 0 if len(gtJson['data']) == 0 else answerTypeTotalScore[a_type] / (answerTypeNumQuestions[a_type] ) 183 | 184 | for e_type, ref in evidence_types.items(): 185 | evidence_types_scores[ref] = 0 if len(gtJson['data']) == 0 else evidenceTypeTotalScore[e_type] / (evidenceTypeNumQuestions[e_type] ) 186 | 187 | for r_type, ref in reasoning_requirements.items(): 188 | operation_types_scores[ref] = 0 if len(gtJson['data']) == 0 else reasoningTypeTotalScore[r_type] / (reasoningTypeNumQuestions[r_type] ) 189 | 190 | 191 | resDict = { 192 | 'result': methodMetrics, 193 | 'scores_by_types': {'answer_types': answer_types_scores, 'evidence_types': evidence_types_scores, 'operation_types': operation_types_scores}, 194 | 'per_sample_result':perSampleMetrics 195 | } 196 | 197 | return resDict; 198 | 199 | 200 | def display_results(results, show_answer_types): 201 | print("\nOverall ANLS: {:2.4f}".format(results['result']['score'])) 202 | 203 | if show_answer_types: 204 | print("\nAnswer types:") 205 | for a_type in answer_types.values(): 206 | print("\t{:12s} {:2.4f}".format(a_type, results['scores_by_types']['answer_types'][a_type])) 207 | 208 | print("\nEvidence types:") 209 | for e_type in evidence_types.values(): 210 | print("\t{:12s} {:2.4f}".format(e_type, results['scores_by_types']['evidence_types'][e_type])) 211 | 212 | print("\nOperation required:") 213 | for r_type in reasoning_requirements.values(): 214 | print("\t{:12s} {:2.4f}".format(r_type, results['scores_by_types']['operation_types'][r_type])) 215 | 216 | 217 | 218 | if __name__=='__main__': 219 | parser = argparse.ArgumentParser(description="InfographVQA evaluation script.") 220 | 221 | parser.add_argument('-g', '--ground_truth', type=str, help="Path of the Ground Truth file.", required=True) 222 | parser.add_argument('-s', '--submission_file', type=str, help="Path of your method's results file.", required=True) 223 | 224 | parser.add_argument('-t', '--anls_threshold', type=float, default=0.5, help="ANLS threshold to use (See Scene-Text VQA paper for more info.).", required=False) 225 | parser.add_argument('-a', '--answer_types', type=bool, default=False, help="Score break down by answer types (special gt file required).", required=False) 226 | parser.add_argument('-o', '--output', type=str, help="Path to a directory where to copy the file 'results.json' that contains per-sample results.", required=False) 227 | 228 | args = parser.parse_args() 229 | 230 | # Validate the format of ground truth and submission files. 
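    # validate_data() checks that the submission matches the ground-truth question IDs and
    # answer format; evaluate_method() then computes ANLS: 1 minus the normalized Levenshtein
    # distance per question, zeroed when it falls below args.anls_threshold (default 0.5) and
    # averaged over all questions not listed in question_ids_to_exclude.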
231 | validate_data(args.ground_truth, args.submission_file) 232 | 233 | # Evaluate method 234 | results = evaluate_method(args.ground_truth, args.submission_file, args) 235 | 236 | display_results(results, args.answer_types) 237 | 238 | if args.output: 239 | output_dir = args.output 240 | 241 | if not os.path.exists(output_dir): 242 | os.makedirs(output_dir) 243 | 244 | resultsOutputname = os.path.join(output_dir, 'results.json') 245 | save_json(resultsOutputname, results) 246 | 247 | print("All results including per-sample result has been correctly saved!") 248 | 249 | -------------------------------------------------------------------------------- /eval_mm/mmbench/MMBENCH.md: -------------------------------------------------------------------------------- 1 | # MMBench Evaluation 2 | 3 | ## Data 4 | 5 | ```bash 6 | /cpfs01/shared/public/shusheng.yss/workspace/23082502_qwenvl_eval_test/eval_mm/data/mmbench 7 | ``` 8 | 9 | ## Dev 10 | 11 | ```bash 12 | checkpoint=/PATH/TO/CHECKPOINT 13 | ds=mmbench_dev_20230712 14 | python -m torch.distributed.launch --use-env \ 15 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 16 | --nnodes ${WORLD_SIZE:-1} \ 17 | --node_rank ${RANK:-0} \ 18 | --master_addr ${MASTER_ADDR:-127.0.0.1} \ 19 | --master_port ${MASTER_PORT:-12345} \ 20 | evaluate_multiple_choice_mmbench.py \ 21 | --checkpoint $checkpoint \ 22 | --dataset $ds \ 23 | --batch-size 2 \ 24 | --num-workers 2 25 | 26 | # the results will be saved to mmbench_dev_20230712.json 27 | 28 | # without consistency constrain 29 | 30 | python mmbench_evaluation.py 31 | 32 | # with consistency constrain 33 | 34 | python mmbench_evaluation_tricky.py 35 | 36 | ``` 37 | 38 | ## Test 39 | 40 | ```bash 41 | checkpoint=/PATH/TO/CHECKPOINT 42 | ds=mmbench_test_20230712 43 | python -m torch.distributed.launch --use-env \ 44 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 45 | --nnodes ${WORLD_SIZE:-1} \ 46 | --node_rank ${RANK:-0} \ 47 | --master_addr ${MASTER_ADDR:-127.0.0.1} \ 48 | --master_port ${MASTER_PORT:-12345} \ 49 | evaluate_multiple_choice_mmbench.py \ 50 | --checkpoint $checkpoint \ 51 | --dataset $ds \ 52 | --batch-size 2 \ 53 | --num-workers 2 54 | 55 | # the results will be saved to mmbench_test_20230712.json 56 | 57 | # convert to submission format with consistency constrain 58 | 59 | python mmbench_predict_to_submission.py 60 | 61 | ``` 62 | -------------------------------------------------------------------------------- /eval_mm/mmbench/evaluate_multiple_choice_mmbench.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import json 4 | import os 5 | from functools import partial 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | 11 | multiple_choices = ['A', 'B', 'C', 'D', 'E'] 12 | 13 | ds_collections = { 14 | 'mmbench_dev_20230712': { 15 | 'test': 'data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 16 | }, 17 | 'mmbench_test_20230712': { 18 | 'test': 'data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl', 19 | } 20 | } 21 | 22 | def collate_fn(batches, pad_token_id): 23 | 24 | indexes = [_['index'] for _ in batches] 25 | 26 | input_tokens = [_['input_tokens'] for _ in batches] 27 | target_lengths = [_['target_lengths'] for _ in batches] 28 | 29 | chunk_sizes = [len(_) for _ in input_tokens] 30 | 31 | input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)] 32 | 33 | max_lengths = max([len(_) for _ in input_tokens]) 34 | input_tokens 
= [[pad_token_id] * (max_lengths - len(_)) + _ 35 | for _ in input_tokens] 36 | input_tokens = torch.LongTensor(input_tokens) 37 | 38 | attention_mask = 1 - input_tokens.eq(pad_token_id).float() 39 | 40 | return input_tokens, attention_mask, target_lengths, chunk_sizes, indexes 41 | 42 | 43 | class MultipleChoiceDataste(torch.utils.data.Dataset): 44 | 45 | def __init__(self, test, prompt, tokenizer): 46 | self.datas = open(test).readlines() 47 | self.prompt = prompt 48 | self.tokenizer = tokenizer 49 | 50 | def __len__(self): 51 | return len(self.datas) 52 | 53 | def __getitem__(self, idx): 54 | 55 | data = json.loads(self.datas[idx].strip()) 56 | index = data['index'] 57 | image = data['image'] 58 | hint = data['hint'] if data['hint'] else 'N/A' 59 | question = data['question'] 60 | 61 | choices = data['choices'] 62 | choice_list = [] 63 | for i, c in enumerate(choices): 64 | choice_list.append('{}. {}'.format(multiple_choices[i], c)) 65 | choice_txt = '\n'.join(choice_list) 66 | 67 | prompt = self.prompt.format(image, hint, question, choice_txt) 68 | 69 | prompt_tokens = self.tokenizer(prompt).input_ids 70 | target_tokens = [ 71 | self.tokenizer(' ' + _).input_ids 72 | for _ in multiple_choices[:len(choices)] 73 | ] 74 | 75 | return { 76 | 'index': index, 77 | 'input_tokens': [prompt_tokens + _ for _ in target_tokens], 78 | 'target_lengths': [len(_) for _ in target_tokens], 79 | # 'answer': data['answer'], 80 | } 81 | 82 | 83 | class InferenceSampler(torch.utils.data.sampler.Sampler): 84 | 85 | def __init__(self, size): 86 | self._size = int(size) 87 | assert size > 0 88 | self._rank = torch.distributed.get_rank() 89 | self._world_size = torch.distributed.get_world_size() 90 | self._local_indices = self._get_local_indices(size, self._world_size, 91 | self._rank) 92 | 93 | @staticmethod 94 | def _get_local_indices(total_size, world_size, rank): 95 | shard_size = total_size // world_size 96 | left = total_size % world_size 97 | shard_sizes = [shard_size + int(r < left) for r in range(world_size)] 98 | 99 | begin = sum(shard_sizes[:rank]) 100 | end = min(sum(shard_sizes[:rank + 1]), total_size) 101 | return range(begin, end) 102 | 103 | def __iter__(self): 104 | yield from self._local_indices 105 | 106 | def __len__(self): 107 | return len(self._local_indices) 108 | 109 | 110 | if __name__ == '__main__': 111 | 112 | parser = argparse.ArgumentParser() 113 | parser.add_argument('--checkpoint', type=str, default='') 114 | parser.add_argument('--dataset', type=str, default='') 115 | parser.add_argument('--batch-size', type=int, default=1) 116 | parser.add_argument('--num-workers', type=int, default=1) 117 | args = parser.parse_args() 118 | 119 | torch.distributed.init_process_group( 120 | backend='nccl', 121 | world_size=int(os.getenv('WORLD_SIZE', '1')), 122 | rank=int(os.getenv('RANK', '0')), 123 | ) 124 | 125 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 126 | 127 | model = AutoModelForCausalLM.from_pretrained( 128 | args.checkpoint, device_map='cuda', trust_remote_code=True).eval() 129 | 130 | tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, 131 | trust_remote_code=True) 132 | 133 | prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:' 134 | 135 | dataset = MultipleChoiceDataste(test=ds_collections[args.dataset]['test'], 136 | prompt=prompt, 137 | tokenizer=tokenizer) 138 | dataloader = torch.utils.data.DataLoader( 139 | dataset=dataset, 140 | sampler=InferenceSampler(len(dataset)), 141 | batch_size=args.batch_size, 142 | num_workers=args.num_workers, 
143 | pin_memory=True, 144 | drop_last=False, 145 | collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id), 146 | ) 147 | 148 | results = [] 149 | with torch.no_grad(): 150 | for _, (input_tokens, attention_mask, target_lengths, 151 | chunk_sizes, indexes) in tqdm(enumerate(dataloader)): 152 | 153 | outputs = model( 154 | input_ids=input_tokens[:, :-1].cuda(), 155 | attention_mask=attention_mask[:, :-1].cuda(), 156 | return_dict=True, 157 | ) 158 | losses = torch.nn.functional.cross_entropy(outputs.logits.permute( 159 | 0, 2, 1), 160 | input_tokens[:, 161 | 1:].cuda(), 162 | reduction='none') 163 | 164 | losses = losses.split(chunk_sizes, dim=0) 165 | 166 | for loss, target_length, index in zip(losses, target_lengths, indexes): 167 | 168 | target_loss = loss.mean(-1) 169 | for _ in range(len(target_length)): 170 | target_loss[_] = loss[_, -target_length[_]:].mean() 171 | pred = target_loss.argmin().item() 172 | 173 | results.append({ 174 | "index": index, 175 | "prediction": pred, 176 | }) 177 | 178 | torch.distributed.barrier() 179 | 180 | world_size = torch.distributed.get_world_size() 181 | merged_results = [None for _ in range(world_size)] 182 | torch.distributed.all_gather_object(merged_results, results) 183 | 184 | merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)] 185 | 186 | if torch.distributed.get_rank() == 0: 187 | json.dump(merged_results, open(f"{args.dataset}.json", "w")) 188 | 189 | torch.distributed.barrier() 190 | -------------------------------------------------------------------------------- /eval_mm/mmbench/mmbench_converter_dev.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import base64 4 | import json 5 | from PIL import Image 6 | 7 | ''' 8 | This scripts convert mmbench_dev tsv file to jsonl 9 | ''' 10 | 11 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 12 | 13 | global_choices = ['A', 'B', 'C', 'D'] 14 | 15 | def decode_base64_to_image(base64_string): 16 | image_data = base64.b64decode(base64_string) 17 | image = Image.open(io.BytesIO(image_data)) 18 | return image 19 | 20 | 21 | with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f: 22 | for idx in range(len(datas)): 23 | data = datas.iloc[idx] 24 | 25 | index = int(data['index']) 26 | question = data['question'] 27 | hint = data['hint'] if not pd.isna(data['hint']) else 'N/A' 28 | 29 | choices = [] 30 | for opt in global_choices: 31 | if pd.isna(data[opt]): 32 | continue 33 | choices.append(data[opt]) 34 | 35 | answer = global_choices.index(data['answer']) 36 | 37 | image = decode_base64_to_image(data['image']) 38 | image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index) 39 | 40 | f.write(json.dumps({ 41 | "index": index, 42 | "image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index, 43 | "hint": hint, 44 | "question": question, 45 | "choices": choices, 46 | "answer": answer, 47 | }) + "\n") 48 | 49 | -------------------------------------------------------------------------------- /eval_mm/mmbench/mmbench_converter_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import io 3 | import base64 4 | import json 5 | from PIL import Image 6 | 7 | ''' 8 | This script convert mmbench_test tsv file to jsonl 9 | This script is very similar to mmbench_converter_dev except there's no answer for accuracy calculation 10 | ''' 11 | 12 | datas = 
pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t') 13 | 14 | global_choices = ['A', 'B', 'C', 'D'] 15 | 16 | def decode_base64_to_image(base64_string): 17 | image_data = base64.b64decode(base64_string) 18 | image = Image.open(io.BytesIO(image_data)) 19 | return image 20 | 21 | 22 | with open('./data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl', 'w') as f: 23 | for idx in range(len(datas)): 24 | data = datas.iloc[idx] 25 | 26 | index = int(data['index']) 27 | question = data['question'] 28 | hint = data['hint'] if not pd.isna(data['hint']) else 'N/A' 29 | 30 | choices = [] 31 | for opt in global_choices: 32 | if pd.isna(data[opt]): 33 | continue 34 | choices.append(data[opt]) 35 | 36 | # answer = global_choices.index(data['answer']) 37 | 38 | image = decode_base64_to_image(data['image']) 39 | image.save("data/mmbench/mmbench_test_20230712/images/%d.jpg" % index) 40 | 41 | f.write(json.dumps({ 42 | "index": index, 43 | "image": "data/mmbench/mmbench_test_20230712/images/%d.jpg" % index, 44 | "hint": hint, 45 | "question": question, 46 | "choices": choices, 47 | # "answer": answer, 48 | }) + "\n") 49 | 50 | -------------------------------------------------------------------------------- /eval_mm/mmbench/mmbench_evaluation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | 4 | ''' 5 | This script provides `global top-1 accuracy` metric calculation for mmbench_dev. 6 | ''' 7 | 8 | predictions = json.load(open('mmbench_dev_20230712.json')) 9 | 10 | index2predictions = {} 11 | for pred in predictions: 12 | index2predictions[pred['index']] = pred['prediction'] 13 | 14 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 15 | 16 | glb_opts = ['A', 'B', 'C', 'D'] 17 | index2answer = {} 18 | for idx in range(len(datas)): 19 | data = datas.iloc[idx] 20 | index2answer[data['index']] = glb_opts.index(data['answer']) 21 | 22 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 23 | 24 | correct = 0 25 | total = 0 26 | for index in identity_indexes: 27 | for _ in range(4): 28 | cycle_index = int(_ * 1e6 + index) 29 | if index2predictions.get(cycle_index, None) is not None: 30 | if index2predictions[cycle_index] == index2answer[cycle_index]: 31 | continue 32 | else: 33 | print(cycle_index) 34 | break 35 | else: 36 | correct += 1 37 | total += 1 38 | 39 | print(correct, total) 40 | -------------------------------------------------------------------------------- /eval_mm/mmbench/mmbench_evaluation_tricky.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import random 4 | 5 | ''' 6 | This script provides metric calculation for mmbench_dev with the same accuarcy algo as OpenCompass server 7 | ''' 8 | 9 | predictions = json.load(open('mmbench_dev_20230712.json')) 10 | 11 | index2predictions = {} 12 | for pred in predictions: 13 | index2predictions[pred['index']] = pred['prediction'] 14 | 15 | 16 | from collections import Counter 17 | 18 | def most_common_elements(lst): 19 | counter = Counter(lst) 20 | max_count = max(counter.values()) 21 | most_common = [element for element, count in counter.items() if count == max_count] 22 | return random.choice(most_common) # random sample from random choice 23 | 24 | datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t') 25 | 26 | glb_opts = ['A', 'B', 'C', 'D'] 27 | 
index2answer = {} 28 | index2choices = {} 29 | index2rawanswer = {} 30 | for idx in range(len(datas)): 31 | data = datas.iloc[idx] 32 | 33 | choices = [] 34 | for opt in glb_opts: 35 | if not pd.isna(data[opt]): 36 | choices.append(data[opt]) 37 | index2choices[data['index']] = choices 38 | 39 | index2answer[data['index']] = glb_opts.index(data['answer']) 40 | index2rawanswer[data['index']] = choices[glb_opts.index(data['answer'])] 41 | 42 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 43 | 44 | correct = 0 45 | total = 0 46 | for index in identity_indexes: 47 | raw_preds = [] 48 | raw_answer = [] 49 | for _ in range(4): 50 | cycle_index = int(_ * 1e6 + index) 51 | if index2predictions.get(cycle_index, None) is not None: 52 | raw_answer = index2rawanswer[cycle_index] 53 | raw_pred = index2choices[cycle_index][index2predictions[cycle_index]] 54 | raw_preds.append(raw_pred) 55 | 56 | if len(set(raw_preds)) == 1: 57 | if raw_preds[0] == raw_answer: 58 | correct += 1 59 | else: 60 | result = most_common_elements(raw_preds) 61 | if result == raw_answer: 62 | correct += 1 63 | 64 | total += 1 65 | 66 | print(correct, total, correct / total * 100.) 67 | -------------------------------------------------------------------------------- /eval_mm/mmbench/mmbench_predict_to_submission.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import json 3 | import random 4 | 5 | ''' 6 | This script convert the output file of our inference processor to target formation of OpenCompass evaluator server 7 | ''' 8 | 9 | predictions = json.load(open('mmbench_test_20230712.json')) 10 | 11 | index2predictions = {} 12 | for pred in predictions: 13 | index2predictions[pred['index']] = pred['prediction'] 14 | 15 | from collections import Counter 16 | 17 | def most_common_elements(lst): 18 | counter = Counter(lst) 19 | max_count = max(counter.values()) 20 | most_common = [element for element, count in counter.items() if count == max_count] 21 | print(most_common) 22 | return random.choice(most_common) 23 | # return most_common 24 | 25 | datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t') 26 | 27 | datas = datas.drop('image', axis=1) 28 | 29 | glb_opts = ['A', 'B', 'C', 'D'] 30 | index2choices = {} 31 | for idx in range(len(datas)): 32 | data = datas.iloc[idx] 33 | 34 | choices = [] 35 | for opt in glb_opts: 36 | if not pd.isna(data[opt]): 37 | choices.append(data[opt]) 38 | index2choices[data['index']] = choices 39 | 40 | identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()])) 41 | 42 | 43 | processed_index2predictions = {} 44 | for index in identity_indexes: 45 | raw_preds = [] 46 | for _ in range(4): 47 | cycle_index = int(_ * 1e6 + index) 48 | if index2predictions.get(cycle_index, None) is not None: 49 | raw_pred = index2choices[cycle_index][index2predictions[cycle_index]] 50 | raw_preds.append(raw_pred) 51 | 52 | if len(set(raw_preds)) == 1: 53 | pred_answer = raw_preds[0] 54 | else: 55 | pred_answer = most_common_elements(raw_preds) 56 | 57 | print(index, pred_answer) 58 | for _ in range(4): 59 | cycle_index = int(_ * 1e6 + index) 60 | if index2predictions.get(cycle_index, None) is not None: 61 | processed_index2predictions[cycle_index] = index2choices[cycle_index].index(pred_answer) 62 | 63 | 64 | predictions = [] 65 | for idx in range(len(datas)): 66 | data = datas.iloc[idx] 67 | index = data['index'] 68 | prediction = 
glb_opts[processed_index2predictions[index]] 69 | predictions.append(prediction) 70 | 71 | datas['prediction'] = predictions 72 | datas.to_excel("mmbench_test_20230712_230831_constrained.xlsx", index=False) 73 | # constrained means we force the model to predict the same answer when a question is tested multiple times 74 | -------------------------------------------------------------------------------- /eval_mm/mme/EVAL_MME.md: -------------------------------------------------------------------------------- 1 | # MME Benchmark 2 | 3 | [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) is a comprehensive evaluation benchmark for multimodal large language models. It measures both perception and cognition abilities on a total of 14 subtasks, including existence, count, position, color, poster, celebrity, scene, landmark, artwork, OCR, commonsense reasoning, numerical calculation, text translation, and code reasoning. 4 | 5 | Qwen-VL-Chat achieves SOTA results on both the perception and cognition evaluations. 6 | 7 | Perception Evaluation 8 | 9 | | Rank | Model | Version | Score | 10 | |:----:|:---------------:|:------------------------:|:-------:| 11 | | 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)**| **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **1487.57** | 12 | | 2 | Skywork-MM | Skywork-MM-13B | 1419.08 | 13 | | 3 | MMICL | FlanT5xxl | 1376.00 | 14 | | 4 | Lynx | vicuna-7b | 1373.23 | 15 | | 5 | BLIVA | FlanT5xxl | 1337.73 | 16 | 17 | Cognition Evaluation 18 | 19 | | Rank | Model | Version | Score | 20 | |:----:|:----------------:|:--------------:|:----------:| 21 | | 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)** | **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **360.71** | 22 | | 2 | MMICL | FlanT5xxl | 360.36 | 23 | | 3 | Skywork-MM | Skywork-MM-13B | 356.43 | 24 | | 4 | BLIVA | FlanT5xxl | 331.43 | 25 | | 5 | LRV-Instruction | LRV-7B | 328.21 | 26 | 27 | Full Metrics 28 | 29 | ``` 30 | =========== Perception =========== 31 | total score: 1487.576330532213 32 | 33 | existence score: 158.33333333333331 34 | count score: 150.0 35 | position score: 128.33333333333334 36 | color score: 170.0 37 | posters score: 178.57142857142856 38 | celebrity score: 120.58823529411764 39 | scene score: 152.25 40 | landmark score: 164.0 41 | artwork score: 125.5 42 | OCR score: 140.0 43 | 44 | 45 | =========== Cognition =========== 46 | total score: 360.71428571428567 47 | 48 | commonsense_reasoning score: 130.7142857142857 49 | numerical_calculation score: 40.0 50 | text_translation score: 147.5 51 | code_reasoning score: 42.5 52 | ``` 53 | 54 | ## How To Reproduce Results of MME Benchmark 55 | 56 | 1. Download MME images and eval_tool from the [MME repo](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/blob/Evaluation/README.md) 57 | 2. Rearrange images by executing `python get_images.py` 58 | 3. Produce Qwen-VL-Chat results by executing `python eval.py` 59 | 4. Calculate MME results by executing `python calculation.py --results_dir Qwen-VL-Chat`; the calculation script comes from the MME eval_tool.
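For a quick sanity check of the answer files produced by `eval.py` (each line in `Qwen-VL-Chat/<subtask>.txt` is tab-separated `image`, `question`, `ground truth`, `response`), a rough per-subtask accuracy can be computed with a short script like the sketch below. This is only an informal check that counts a reply as correct when it starts with the Yes/No label, and `quick_check.py` is a hypothetical helper rather than part of the MME eval_tool; the official acc/acc+ scores should come from the eval_tool's `calculation.py`.

```python
# quick_check.py -- informal sanity check of eval.py outputs (hypothetical helper,
# not part of the MME eval_tool; use calculation.py for the official scores).
import os

output_dir = 'Qwen-VL-Chat'  # directory written by eval.py
for name in sorted(os.listdir(output_dir)):
    correct = total = 0
    with open(os.path.join(output_dir, name)) as fin:
        for line in fin:
            img, question, gt, response = line.rstrip('\n').split('\t', 3)
            total += 1
            # MME ground truths are Yes/No; count a hit if the reply starts with the label
            correct += response.strip().lower().startswith(gt.strip().lower())
    if total:
        print(f'{name}: {correct}/{total} = {correct / total:.2%}')
```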
60 | -------------------------------------------------------------------------------- /eval_mm/mme/cognition.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/eval_mm/mme/cognition.jpg -------------------------------------------------------------------------------- /eval_mm/mme/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | from transformers.generation import GenerationConfig 6 | 7 | checkpoint = 'Qwen/Qwen-VL-Chat' 8 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True) 9 | model = AutoModelForCausalLM.from_pretrained( 10 | checkpoint, device_map='cuda', trust_remote_code=True).eval() 11 | 12 | model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True) 13 | model.generation_config.top_p = 0.01 14 | 15 | 16 | root = 'Your_Results' 17 | output = 'Qwen-VL-Chat' 18 | os.makedirs(output, exist_ok=True) 19 | for filename in os.listdir(root): 20 | with open(os.path.join(root, filename), 'r') as fin, open(os.path.join(output, filename), 'w') as fout: 21 | lines = fin.read().splitlines() 22 | filename = filename.replace('.txt', '') 23 | for line in tqdm(lines): 24 | img, question, gt = line.strip().split('\t') 25 | img_path = os.path.join('images', filename, img) 26 | assert os.path.exists(img_path), img_path 27 | query = f'<img>{img_path}</img>\n{question}' 28 | response, _ = model.chat(tokenizer, query=query, history=None) 29 | 30 | print(img, question, gt, response, sep='\t', file=fout) 31 | -------------------------------------------------------------------------------- /eval_mm/mme/get_images.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | 4 | os.system('rm -rf images') 5 | os.system('mkdir images') 6 | 7 | os.system('cp -r ../MME_Benchmark_release/OCR images/') 8 | 9 | os.system('mkdir images/artwork') 10 | os.system('cp ../MME_Benchmark_release/artwork/questions_answers_YN/* images/artwork/') 11 | with open('LaVIN/artwork.txt') as fin: 12 | paths = [ line.strip().split('\t', 1)[0] for line in fin ] 13 | paths = list(set(paths)) 14 | for path in tqdm(paths): 15 | os.system(f'cp ../MME_Benchmark_release/artwork/images/toy_dataset/{path} images/artwork/{path}') 16 | 17 | os.system('mkdir images/celebrity') 18 | os.system('cp ../MME_Benchmark_release/celebrity/images/* images/celebrity/') 19 | os.system('cp ../MME_Benchmark_release/celebrity/questions_answers_YN/* images/celebrity/') 20 | 21 | os.system('cp -r ../MME_Benchmark_release/code_reasoning images/') 22 | 23 | os.system('cp -r ../MME_Benchmark_release/color images/') 24 | 25 | os.system('cp -r ../MME_Benchmark_release/commonsense_reasoning images/') 26 | 27 | os.system('cp -r ../MME_Benchmark_release/count images/') 28 | 29 | os.system('cp -r ../MME_Benchmark_release/existence images/') 30 | 31 | os.system('mkdir images/landmark') 32 | os.system('cp ../MME_Benchmark_release/landmark/images/* images/landmark/') 33 | os.system('cp ../MME_Benchmark_release/landmark/questions_answers_YN/* images/landmark/') 34 | 35 | os.system('cp -r ../MME_Benchmark_release/numerical_calculation images/') 36 | 37 | os.system('cp -r ../MME_Benchmark_release/position images/') 38 | 39 | os.system('mkdir images/posters') 40 | os.system('cp 
../MME_Benchmark_release/posters/images/* images/posters/') 41 | os.system('cp ../MME_Benchmark_release/posters/questions_answers_YN/* images/posters/') 42 | 43 | os.system('mkdir images/scene') 44 | os.system('cp ../MME_Benchmark_release/scene/images/* images/scene/') 45 | os.system('cp ../MME_Benchmark_release/scene/questions_answers_YN/* images/scene/') 46 | 47 | os.system('cp -r ../MME_Benchmark_release/text_translation images/') 48 | -------------------------------------------------------------------------------- /eval_mm/mme/perception.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/eval_mm/mme/perception.jpg -------------------------------------------------------------------------------- /eval_mm/seed_bench/EVAL_SEED.md: -------------------------------------------------------------------------------- 1 | # Seed-Bench Evaluation 2 | 3 | [SEED-Bench](https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard) is a multimodal benchmark of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both **image** and **video** understanding. 4 | 5 | Qwen-VL and Qwen-VL-Chat achieve SOTA results on this benchmark. 6 | 7 | <p align="center"> 8 | <img src="leaderboard.jpg"/> 9 | </p> 10 | 11 | ## How To Process Video by Qwen-VL 12 | 13 | Qwen-VL and Qwen-VL-Chat were not trained on any video data or video tasks, but they can understand some videos in a zero-shot way. For the video question-answering task, we use four uniformly sampled frames per video sample. These frames are treated as separate images and are stitched into the context. For example: 14 | 15 | ``` 16 | { 17 | "question_id": "v0", 18 | "prompt": "<img>video_imgs_4/v0_0.jpg</img>\n<img>video_imgs_4/v0_1.jpg</img>\n<img>video_imgs_4/v0_2.jpg</img>\n<img>video_imgs_4/v0_3.jpg</img>\nQuestion: Can you identify the action taking place in the video?\nOptions: A. pretending to take something out of something\nB. pretending to take something from somewhere\nC. feigning to insert something into something\nD. simulating putting something onto something\nAnswer:" 19 | } 20 | ``` 21 | 22 | The above JSON line can be used as input to `eval_mm/seed_bench/eval.py`, which outputs the following result: 23 | ``` 24 | {"question_id": "v0", "prediction": "B"} 25 | ``` 26 | 27 | Please see [eval_mm/seed_bench/eval.py](eval.py) for more inference details. 28 | 29 | ## How To Reproduce Results of Seed-Bench 30 | 31 | 1. Download all images and videos by following the [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md). Then modify the root paths in `eval_mm/seed_bench/trans.py` with your customized paths.
32 | ``` 33 | # path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json 34 | seed_bench_input_path = 'SEED-Bench.json' 35 | # root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md 36 | cc3m_dir = "/YOUR_PATH_TO/seed_bench_image" 37 | # root directory of evaluation dimension 10 38 | dimension10_dir = "/YOUR_PATH_TO/SSV2/videos" 39 | # root directory of evaluation dimension 11 40 | dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test" 41 | # root directory of evaluation dimension 12 42 | dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync" 43 | ``` 44 | 45 | 2. Generate input files of Qwen-VL with the JSON formatting. 46 | ``` 47 | cd eval_mm/seed_bench/ 48 | python trans.py 49 | ``` 50 | This script will output two JSONL files and one directory. `image_input.jsonl` is the input file of image evaluation and `video_input_4.jsonl` is the input file of video evaluation by 4 frames. The directory `video_imgs_4` contains all 4-framed images extracted from videos. We provide our [image_input.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/image_input.jsonl) and [video_input_4.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/video_input_4.jsonl) here for reference. 51 | 52 | 3. Produce the results of Seed-Bench. 53 | ``` 54 | # The number of available GPUs 55 | export NPROC_PER_NODE=8 56 | 57 | # Produce the Qwen-VL-Chat results of image understanding 58 | python -m torch.distributed.launch --use-env \ 59 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 60 | --nnodes ${WORLD_SIZE:-1} \ 61 | --node_rank ${RANK:-0} \ 62 | --master_addr ${MASTER_ADDR:-127.0.0.1} \ 63 | --master_port ${MASTER_PORT:-12345} \ 64 | eval.py \ 65 | --checkpoint Qwen/Qwen-VL-Chat \ 66 | --dataset image_input.jsonl \ 67 | --batch-size 4 \ 68 | --num-workers 2 69 | # Collect the result files 70 | cat result_?.jsonl >results_chat_img.jsonl 71 | rm result_?.jsonl 72 | 73 | # Produce the results of video understanding 74 | python -m torch.distributed.launch --use-env \ 75 | --nproc_per_node ${NPROC_PER_NODE:-8} \ 76 | --nnodes ${WORLD_SIZE:-1} \ 77 | --node_rank ${RANK:-0} \ 78 | --master_addr ${MASTER_ADDR:-127.0.0.1} \ 79 | --master_port ${MASTER_PORT:-12345} \ 80 | eval.py \ 81 | --checkpoint Qwen/Qwen-VL-Chat \ 82 | --dataset video_input_4.jsonl \ 83 | --batch-size 2 \ 84 | --num-workers 1 85 | # Collect the result files 86 | cat result_?.jsonl >results_chat_vid.jsonl 87 | rm result_?.jsonl 88 | 89 | # The file `results_chat.jsonl` can be submitted to the leaderboard 90 | cat results_chat_img.jsonl results_chat_vid.jsonl >results_chat.jsonl 91 | ``` 92 | 93 | You can reproduce the Seed-Bench results of Qwen-VL by replacing `Qwen/Qwen-VL-Chat` with `Qwen/Qwen-VL` on the above script. 
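If you want a quick local accuracy number before (or instead of) submitting to the leaderboard, the predictions in `results_chat.jsonl` can be joined with the ground-truth answers that `trans.py` writes into the input files. The helper below is a minimal sketch and not part of this repository; `score_seed.py` is a hypothetical name, and it assumes the default file names from the steps above.

```python
# score_seed.py -- hypothetical local scorer: joins predictions with the 'answer'
# field that trans.py copies from SEED-Bench.json into the input JSONL files.
import json

def load_jsonl(path):
    with open(path) as fin:
        return [json.loads(line) for line in fin]

# question_id -> ground-truth option letter ('A'..'D')
answers = {}
for input_file in ('image_input.jsonl', 'video_input_4.jsonl'):
    for item in load_jsonl(input_file):
        answers[item['question_id']] = item['answer']

correct = total = 0
for item in load_jsonl('results_chat.jsonl'):
    gt = answers.get(item['question_id'])
    if gt is None:  # skip ids that are not present in the input files
        continue
    total += 1
    correct += item['prediction'] == gt
print(f'Accuracy: {correct}/{total} = {correct / total:.2%}')
```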
94 | -------------------------------------------------------------------------------- /eval_mm/seed_bench/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import itertools 3 | import json 4 | import os 5 | from functools import partial 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoModelForCausalLM, AutoTokenizer 10 | from transformers.generation import GenerationConfig 11 | 12 | 13 | def collate_fn(batches, pad_token_id): 14 | 15 | input_tokens = [_['input_tokens'] for _ in batches] 16 | target_lengths = [_['target_lengths'] for _ in batches] 17 | answers = [_['answer'] for _ in batches] 18 | question_id = [_['question_id'] for _ in batches] 19 | 20 | chunk_sizes = [len(_) for _ in input_tokens] 21 | 22 | input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)] 23 | 24 | max_lengths = max([len(_) for _ in input_tokens]) 25 | input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _ 26 | for _ in input_tokens] 27 | input_tokens = torch.LongTensor(input_tokens) 28 | 29 | attention_mask = 1 - input_tokens.eq(pad_token_id).float() 30 | 31 | return input_tokens, attention_mask, target_lengths, answers, chunk_sizes, question_id 32 | 33 | 34 | class MultipleChoiceDataste(torch.utils.data.Dataset): 35 | 36 | def __init__(self, test, tokenizer): 37 | self.datas = [] 38 | with open(test) as fin: 39 | for line in tqdm(fin): 40 | self.datas.append(json.loads(line.strip())) 41 | self.tokenizer = tokenizer 42 | 43 | def __len__(self): 44 | return len(self.datas) 45 | 46 | def __getitem__(self, idx): 47 | 48 | data = self.datas[idx] 49 | prompt = data['prompt'] 50 | 51 | prompt_tokens = self.tokenizer(prompt).input_ids 52 | target_tokens = [ 53 | self.tokenizer(' ' + _).input_ids 54 | for _ in ['A', 'B', 'C', 'D'] 55 | ] 56 | 57 | return { 58 | 'input_tokens': [prompt_tokens + _ for _ in target_tokens], 59 | 'target_lengths': [len(_) for _ in target_tokens], 60 | 'answer': data['answer'], 61 | 'question_id': data['question_id'], 62 | } 63 | 64 | 65 | class InferenceSampler(torch.utils.data.sampler.Sampler): 66 | 67 | def __init__(self, size): 68 | self._size = int(size) 69 | assert size > 0 70 | self._rank = torch.distributed.get_rank() 71 | self._world_size = torch.distributed.get_world_size() 72 | self._local_indices = self._get_local_indices(size, self._world_size, 73 | self._rank) 74 | 75 | @staticmethod 76 | def _get_local_indices(total_size, world_size, rank): 77 | shard_size = total_size // world_size 78 | left = total_size % world_size 79 | shard_sizes = [shard_size + int(r < left) for r in range(world_size)] 80 | 81 | begin = sum(shard_sizes[:rank]) 82 | end = min(sum(shard_sizes[:rank + 1]), total_size) 83 | return range(begin, end) 84 | 85 | def __iter__(self): 86 | yield from self._local_indices 87 | 88 | def __len__(self): 89 | return len(self._local_indices) 90 | 91 | 92 | if __name__ == '__main__': 93 | 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument('--checkpoint', type=str, default='') 96 | parser.add_argument('--dataset', type=str, default='') 97 | parser.add_argument('--batch-size', type=int, default=1) 98 | parser.add_argument('--num-workers', type=int, default=1) 99 | args = parser.parse_args() 100 | 101 | torch.distributed.init_process_group( 102 | backend='nccl', 103 | world_size=int(os.getenv('WORLD_SIZE', '1')), 104 | rank=int(os.getenv('RANK', '0')), 105 | ) 106 | 107 | torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0))) 108 | 109 | model = 
AutoModelForCausalLM.from_pretrained( 110 | args.checkpoint, device_map='cuda', trust_remote_code=True).eval() 111 | 112 | tokenizer = AutoTokenizer.from_pretrained(args.checkpoint, 113 | trust_remote_code=True) 114 | model.generation_config = GenerationConfig.from_pretrained(args.checkpoint, trust_remote_code=True) 115 | model.generation_config.top_p = 0.01 116 | 117 | dataset = MultipleChoiceDataste(test=args.dataset, tokenizer=tokenizer) 118 | dataloader = torch.utils.data.DataLoader( 119 | dataset=dataset, 120 | # sampler=InferenceSampler(1000), 121 | sampler=InferenceSampler(len(dataset)), 122 | batch_size=args.batch_size, 123 | num_workers=args.num_workers, 124 | pin_memory=True, 125 | drop_last=False, 126 | collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id), 127 | ) 128 | 129 | results = [] 130 | fout = open('result_{}.jsonl'.format(torch.distributed.get_rank()), 'w') 131 | with torch.no_grad(): 132 | for _, (input_tokens, attention_mask, target_lengths, answers, 133 | chunk_sizes, question_ids) in tqdm(enumerate(dataloader)): 134 | 135 | outputs = model( 136 | input_ids=input_tokens[:, :-1].cuda(), 137 | attention_mask=attention_mask[:, :-1].cuda(), 138 | return_dict=True, 139 | ) 140 | losses = torch.nn.functional.cross_entropy(outputs.logits.permute( 141 | 0, 2, 1), 142 | input_tokens[:, 143 | 1:].cuda(), 144 | reduction='none') 145 | 146 | losses = losses.split(chunk_sizes, dim=0) 147 | 148 | for loss, target_length, answer, question_id in zip(losses, target_lengths, 149 | answers, question_ids): 150 | 151 | target_loss = loss.mean(-1) 152 | for _ in range(len(target_length)): 153 | target_loss[_] = loss[_, -target_length[_]:].mean() 154 | pred = target_loss.argmin().item() 155 | pred = chr(pred + 65) 156 | if pred == answer: 157 | results.append(1) 158 | else: 159 | results.append(0) 160 | answer_record = { 161 | 'question_id': question_id, 162 | 'prediction': pred 163 | } 164 | print(json.dumps(answer_record), file=fout) 165 | fout.close() 166 | 167 | torch.distributed.barrier() 168 | 169 | world_size = torch.distributed.get_world_size() 170 | merged_results = [None for _ in range(world_size)] 171 | torch.distributed.all_gather_object(merged_results, results) 172 | 173 | merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)] 174 | 175 | if torch.distributed.get_rank() == 0: 176 | print(f"Evaluating {args.dataset} ...") 177 | print(f'Acc@1: {sum(merged_results) / len(merged_results)}') 178 | 179 | torch.distributed.barrier() 180 | -------------------------------------------------------------------------------- /eval_mm/seed_bench/leaderboard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/QwenLM/Qwen-VL/aa00ed04091eea5fcdd32985e7915f1c53e7d599/eval_mm/seed_bench/leaderboard.jpg -------------------------------------------------------------------------------- /eval_mm/seed_bench/trans.py: -------------------------------------------------------------------------------- 1 | import os 2 | import av 3 | import json 4 | 5 | import torch 6 | import numpy as np 7 | from PIL import Image 8 | from tqdm import tqdm 9 | from decord import VideoReader, cpu 10 | 11 | # path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json 12 | seed_bench_input_path = 'SEED-Bench.json' 13 | # root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md 14 | cc3m_dir = 
"/YOUR_PATH_TO/seed_bench_image" 15 | # root directory of evaluation dimension 10 16 | dimension10_dir = "/YOUR_PATH_TO/SSV2/videos" 17 | # root directory of evaluation dimension 11 18 | dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test" 19 | # root directory of evaluation dimension 12 20 | dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync" 21 | 22 | def is_integer_string(s): 23 | try: 24 | int(s) 25 | return True 26 | except ValueError: 27 | return False 28 | 29 | def filter_questions(data, task='all'): 30 | if task == "image": 31 | return [q for q in data if 1 <= q["question_type_id"] <= 9] 32 | elif task == "video": 33 | return [q for q in data if 10 <= q["question_type_id"] <= 12] 34 | elif task == "all": 35 | return data 36 | elif is_integer_string(task): 37 | return [q for q in data if q["question_type_id"] == int(task)] 38 | else: 39 | raise ValueError(f"Invalid task: {task}") 40 | 41 | def get_index(num_frames, num_segments): 42 | if num_segments > num_frames: 43 | offsets = np.array([ 44 | idx for idx in range(num_frames) 45 | ]) 46 | else: 47 | # uniform sampling 48 | seg_size = float(num_frames - 1) / num_segments 49 | start = int(seg_size / 2) 50 | offsets = np.array([ 51 | start + int(np.round(seg_size * idx)) for idx in range(num_segments) 52 | ]) 53 | return offsets 54 | 55 | with open(seed_bench_input_path) as fin: 56 | qa_anno = json.load(fin)['questions'] 57 | 58 | fout = open('image_input.jsonl', 'w') 59 | i_anno = filter_questions(qa_anno, 'image') 60 | for qa_item in tqdm(i_anno): 61 | data_path = cc3m_dir + qa_item['data_id'] 62 | choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']] 63 | choice_list = [] 64 | for i, c in enumerate(choices): 65 | choice_list.append('{}. 
{}'.format(chr(i + 65), c)) 66 | choice_txt = '\n'.join(choice_list) 67 | prompt = '<img>{}</img>\nQuestion: {}\nOptions: {}\nAnswer:'.format( 68 | data_path, qa_item['question'], choice_txt) 69 | print(json.dumps({ 70 | 'question_id': qa_item['question_id'], 71 | 'prompt': prompt, 72 | 'answer': qa_item['answer'], 73 | }), file=fout) 74 | fout.close() 75 | 76 | n_frames = 8 77 | os.system('rm -rf video_input_' + str(n_frames)) 78 | os.makedirs('video_imgs_' + str(n_frames), exist_ok=True) 79 | 80 | fout = open('video_input_{}.jsonl'.format(n_frames), 'w') 81 | v_anno = filter_questions(qa_anno, 'video') 82 | for qa_item in tqdm(v_anno): 83 | if qa_item['question_type_id'] == 12: 84 | data_path = dimension12_dir + qa_item['data_id'] 85 | elif qa_item['question_type_id'] == 11: 86 | data_path = dimension11_dir + qa_item['data_id'].split('/')[-1] 87 | elif qa_item['question_type_id'] == 10: 88 | data_path = dimension10_dir + qa_item['data_id'] 89 | else: 90 | assert False, str(qa_item) 91 | print(data_path) 92 | 93 | use_pyav = False 94 | if 'segment' in qa_item.keys(): 95 | segment = qa_item['segment'] 96 | if isinstance(segment[0], int): 97 | # using pyav for decoding videos in evaluation dimension 12 98 | use_pyav = True 99 | start, end = segment[0], segment[1] 100 | else: 101 | start = 0.0 102 | end = 0.0 103 | 104 | if use_pyav: 105 | # using pyav for decoding videos in evaluation dimension 12 106 | reader = av.open(data_path) 107 | frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)] 108 | video_len = len(frames) 109 | start_frame, end_frame = start, end 110 | end_frame = min(end_frame, video_len) 111 | offset = get_index(end_frame - start_frame, n_frames) 112 | frame_indices = offset + start_frame 113 | images = torch.stack([frames[idx] for idx in frame_indices]).numpy() 114 | else: 115 | # using decord for decoding videos in evaluation dimension 10-11 116 | try: 117 | vr = VideoReader(data_path, num_threads=1, ctx=cpu(0)) 118 | video_len = len(vr) 119 | fps = vr.get_avg_fps() 120 | if 'segment' in qa_item.keys(): 121 | # obtain start and end frame for the video segment in evaluation dimension 11 122 | start_frame = int(min(max(start * fps, 0), video_len - 1)) 123 | end_frame = int(min(max(end * fps, 0), video_len - 1)) 124 | tot_frames = int(end_frame - start_frame) 125 | offset = get_index(tot_frames, n_frames) 126 | frame_indices = offset + start_frame 127 | else: 128 | # sample frames of the video in evaluation dimension 10 129 | frame_indices = get_index(video_len - 1, n_frames) 130 | vr.seek(0) 131 | images = vr.get_batch(frame_indices).asnumpy() 132 | except Exception as e: 133 | print(json.dumps({ 134 | 'question_id': qa_item['question_id'], 135 | 'prompt': "Error" + str(e), 136 | 'answer': qa_item['answer'], 137 | }), file=fout) 138 | continue 139 | 140 | prompt = '' 141 | for i in range(images.shape[0]): 142 | data = Image.fromarray(images[i]) 143 | img_path = 'video_imgs_{}/{}_{}.jpg'.format(n_frames, qa_item['question_id'], i) 144 | data.save(img_path) 145 | prompt += '<img>' + img_path + '</img>\n' 146 | 147 | choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']] 148 | choice_list = [] 149 | for i, c in enumerate(choices): 150 | choice_list.append('{}. 
{}'.format(chr(i + 65), c)) 151 | choice_txt = '\n'.join(choice_list) 152 | 153 | prompt += 'Question: {}\nOptions: {}\nAnswer:'.format(qa_item['question'], choice_txt) 154 | print(json.dumps({ 155 | 'question_id': qa_item['question_id'], 156 | 'prompt': prompt, 157 | 'answer': qa_item['answer'], 158 | }), file=fout) 159 | fout.close() 160 | -------------------------------------------------------------------------------- /eval_mm/vqa.py: -------------------------------------------------------------------------------- 1 | """Copyright (c) 2022, salesforce.com, inc. 2 | 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = 'aagrawal' 9 | __version__ = '0.9' 10 | 11 | # Interface for accessing the VQA dataset. 12 | 13 | # This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link: 14 | # (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py). 15 | 16 | # The following functions are defined: 17 | # VQA - VQA class that loads VQA annotation file and prepares data structures. 18 | # getQuesIds - Get question ids that satisfy given filter conditions. 19 | # getImgIds - Get image ids that satisfy given filter conditions. 20 | # loadQA - Load questions and answers with the specified question ids. 21 | # showQA - Display the specified questions and answers. 22 | # loadRes - Load result file and create result object. 23 | 24 | # Help on each function can be accessed by: "help(COCO.function)" 25 | 26 | import copy 27 | import datetime 28 | import json 29 | 30 | 31 | class VQA: 32 | 33 | def __init__(self, annotation_file=None, question_file=None): 34 | """Constructor of VQA helper class for reading and visualizing 35 | questions and answers. 36 | 37 | :param annotation_file (str): location of VQA annotation file 38 | :return: 39 | """ 40 | # load dataset 41 | self.dataset = {} 42 | self.questions = {} 43 | self.qa = {} 44 | self.qqa = {} 45 | self.imgToQA = {} 46 | if not annotation_file == None and not question_file == None: 47 | print('loading VQA annotations and questions into memory...') 48 | time_t = datetime.datetime.utcnow() 49 | dataset = json.load(open(annotation_file, 'r')) 50 | questions = json.load(open(question_file, 'r')) 51 | self.dataset = dataset 52 | self.questions = questions 53 | self.createIndex() 54 | 55 | def createIndex(self): 56 | # create index 57 | print('creating index...') 58 | imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']} 59 | qa = {ann['question_id']: [] for ann in self.dataset['annotations']} 60 | qqa = {ann['question_id']: [] for ann in self.dataset['annotations']} 61 | for ann in self.dataset['annotations']: 62 | imgToQA[ann['image_id']] += [ann] 63 | qa[ann['question_id']] = ann 64 | for ques in self.questions['questions']: 65 | qqa[ques['question_id']] = ques 66 | print('index created!') 67 | 68 | # create class members 69 | self.qa = qa 70 | self.qqa = qqa 71 | self.imgToQA = imgToQA 72 | 73 | def info(self): 74 | """Print information about the VQA annotation file. 75 | 76 | :return: 77 | """ 78 | for key, value in self.datset['info'].items(): 79 | print('%s: %s' % (key, value)) 80 | 81 | def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]): 82 | """Get question ids that satisfy given filter conditions. default skips 83 | that filter. 
84 | 85 | :param imgIds (int array) : get question ids for given imgs 86 | quesTypes (str array) : get question ids for given question types 87 | ansTypes (str array) : get question ids for given answer types 88 | :return: ids (int array) : integer array of question ids 89 | """ 90 | imgIds = imgIds if type(imgIds) == list else [imgIds] 91 | quesTypes = quesTypes if type(quesTypes) == list else [quesTypes] 92 | ansTypes = ansTypes if type(ansTypes) == list else [ansTypes] 93 | 94 | if len(imgIds) == len(quesTypes) == len(ansTypes) == 0: 95 | anns = self.dataset['annotations'] 96 | else: 97 | if not len(imgIds) == 0: 98 | anns = sum( 99 | [ 100 | self.imgToQA[imgId] 101 | for imgId in imgIds if imgId in self.imgToQA 102 | ], 103 | [], 104 | ) 105 | else: 106 | anns = self.dataset['annotations'] 107 | anns = (anns if len(quesTypes) == 0 else 108 | [ann for ann in anns if ann['question_type'] in quesTypes]) 109 | anns = (anns if len(ansTypes) == 0 else 110 | [ann for ann in anns if ann['answer_type'] in ansTypes]) 111 | ids = [ann['question_id'] for ann in anns] 112 | return ids 113 | 114 | def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]): 115 | """Get image ids that satisfy given filter conditions. default skips 116 | that filter. 117 | 118 | :param quesIds (int array) : get image ids for given question ids 119 | quesTypes (str array) : get image ids for given question types 120 | ansTypes (str array) : get image ids for given answer types 121 | :return: ids (int array) : integer array of image ids 122 | """ 123 | quesIds = quesIds if type(quesIds) == list else [quesIds] 124 | quesTypes = quesTypes if type(quesTypes) == list else [quesTypes] 125 | ansTypes = ansTypes if type(ansTypes) == list else [ansTypes] 126 | 127 | if len(quesIds) == len(quesTypes) == len(ansTypes) == 0: 128 | anns = self.dataset['annotations'] 129 | else: 130 | if not len(quesIds) == 0: 131 | anns = sum([ 132 | self.qa[quesId] for quesId in quesIds if quesId in self.qa 133 | ], []) 134 | else: 135 | anns = self.dataset['annotations'] 136 | anns = (anns if len(quesTypes) == 0 else 137 | [ann for ann in anns if ann['question_type'] in quesTypes]) 138 | anns = (anns if len(ansTypes) == 0 else 139 | [ann for ann in anns if ann['answer_type'] in ansTypes]) 140 | ids = [ann['image_id'] for ann in anns] 141 | return ids 142 | 143 | def loadQA(self, ids=[]): 144 | """Load questions and answers with the specified question ids. 145 | 146 | :param ids (int array) : integer ids specifying question ids 147 | :return: qa (object array) : loaded qa objects 148 | """ 149 | if type(ids) == list: 150 | return [self.qa[id] for id in ids] 151 | elif type(ids) == int: 152 | return [self.qa[ids]] 153 | 154 | def showQA(self, anns): 155 | """Display the specified annotations. 156 | 157 | :param anns (array of object): annotations to display 158 | :return: None 159 | """ 160 | if len(anns) == 0: 161 | return 0 162 | for ann in anns: 163 | quesId = ann['question_id'] 164 | print('Question: %s' % (self.qqa[quesId]['question'])) 165 | for ans in ann['answers']: 166 | print('Answer %d: %s' % (ans['answer_id'], ans['answer'])) 167 | 168 | def loadRes(self, resFile, quesFile): 169 | """Load result file and return a result object. 
170 | 171 | :param resFile (str) : file name of result file 172 | :return: res (obj) : result api object 173 | """ 174 | res = VQA() 175 | res.questions = json.load(open(quesFile)) 176 | res.dataset['info'] = copy.deepcopy(self.questions['info']) 177 | res.dataset['task_type'] = copy.deepcopy(self.questions['task_type']) 178 | res.dataset['data_type'] = copy.deepcopy(self.questions['data_type']) 179 | res.dataset['data_subtype'] = copy.deepcopy( 180 | self.questions['data_subtype']) 181 | res.dataset['license'] = copy.deepcopy(self.questions['license']) 182 | 183 | print('Loading and preparing results... ') 184 | time_t = datetime.datetime.utcnow() 185 | anns = json.load(open(resFile)) 186 | assert type(anns) == list, 'results is not an array of objects' 187 | annsQuesIds = [ann['question_id'] for ann in anns] 188 | assert set(annsQuesIds) == set( 189 | self.getQuesIds() 190 | ), 'Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file.' 191 | for ann in anns: 192 | quesId = ann['question_id'] 193 | if res.dataset['task_type'] == 'Multiple Choice': 194 | assert ( 195 | ann['answer'] in self.qqa[quesId]['multiple_choices'] 196 | ), 'predicted answer is not one of the multiple choices' 197 | qaAnn = self.qa[quesId] 198 | ann['image_id'] = qaAnn['image_id'] 199 | ann['question_type'] = qaAnn['question_type'] 200 | ann['answer_type'] = qaAnn['answer_type'] 201 | print('DONE (t=%0.2fs)' % 202 | ((datetime.datetime.utcnow() - time_t).total_seconds())) 203 | 204 | res.dataset['annotations'] = anns 205 | res.createIndex() 206 | return res 207 | -------------------------------------------------------------------------------- /eval_mm/vqa_eval.py: -------------------------------------------------------------------------------- 1 | """Copyright (c) 2022, salesforce.com, inc. 2 | 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | # coding=utf-8 9 | 10 | __author__ = 'aagrawal' 11 | 12 | import re 13 | # This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link: 14 | # (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py). 
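# VQAEval implements the official VQA accuracy: answers are normalized (punctuation,
# contractions, number words, articles) and each prediction is scored as
# min(#matching human answers / 3, 1), averaged over the ground-truth annotators.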
15 | import sys 16 | 17 | 18 | class VQAEval: 19 | 20 | def __init__(self, vqa=None, vqaRes=None, n=2): 21 | self.n = n 22 | self.accuracy = {} 23 | self.evalQA = {} 24 | self.evalQuesType = {} 25 | self.evalAnsType = {} 26 | self.vqa = vqa 27 | self.vqaRes = vqaRes 28 | if vqa is not None: 29 | self.params = {'question_id': vqa.getQuesIds()} 30 | self.contractions = { 31 | 'aint': "ain't", 32 | 'arent': "aren't", 33 | 'cant': "can't", 34 | 'couldve': "could've", 35 | 'couldnt': "couldn't", 36 | "couldn'tve": "couldn't've", 37 | "couldnt've": "couldn't've", 38 | 'didnt': "didn't", 39 | 'doesnt': "doesn't", 40 | 'dont': "don't", 41 | 'hadnt': "hadn't", 42 | "hadnt've": "hadn't've", 43 | "hadn'tve": "hadn't've", 44 | 'hasnt': "hasn't", 45 | 'havent': "haven't", 46 | 'hed': "he'd", 47 | "hed've": "he'd've", 48 | "he'dve": "he'd've", 49 | 'hes': "he's", 50 | 'howd': "how'd", 51 | 'howll': "how'll", 52 | 'hows': "how's", 53 | "Id've": "I'd've", 54 | "I'dve": "I'd've", 55 | 'Im': "I'm", 56 | 'Ive': "I've", 57 | 'isnt': "isn't", 58 | 'itd': "it'd", 59 | "itd've": "it'd've", 60 | "it'dve": "it'd've", 61 | 'itll': "it'll", 62 | "let's": "let's", 63 | 'maam': "ma'am", 64 | 'mightnt': "mightn't", 65 | "mightnt've": "mightn't've", 66 | "mightn'tve": "mightn't've", 67 | 'mightve': "might've", 68 | 'mustnt': "mustn't", 69 | 'mustve': "must've", 70 | 'neednt': "needn't", 71 | 'notve': "not've", 72 | 'oclock': "o'clock", 73 | 'oughtnt': "oughtn't", 74 | "ow's'at": "'ow's'at", 75 | "'ows'at": "'ow's'at", 76 | "'ow'sat": "'ow's'at", 77 | 'shant': "shan't", 78 | "shed've": "she'd've", 79 | "she'dve": "she'd've", 80 | "she's": "she's", 81 | 'shouldve': "should've", 82 | 'shouldnt': "shouldn't", 83 | "shouldnt've": "shouldn't've", 84 | "shouldn'tve": "shouldn't've", 85 | "somebody'd": 'somebodyd', 86 | "somebodyd've": "somebody'd've", 87 | "somebody'dve": "somebody'd've", 88 | 'somebodyll': "somebody'll", 89 | 'somebodys': "somebody's", 90 | 'someoned': "someone'd", 91 | "someoned've": "someone'd've", 92 | "someone'dve": "someone'd've", 93 | 'someonell': "someone'll", 94 | 'someones': "someone's", 95 | 'somethingd': "something'd", 96 | "somethingd've": "something'd've", 97 | "something'dve": "something'd've", 98 | 'somethingll': "something'll", 99 | 'thats': "that's", 100 | 'thered': "there'd", 101 | "thered've": "there'd've", 102 | "there'dve": "there'd've", 103 | 'therere': "there're", 104 | 'theres': "there's", 105 | 'theyd': "they'd", 106 | "theyd've": "they'd've", 107 | "they'dve": "they'd've", 108 | 'theyll': "they'll", 109 | 'theyre': "they're", 110 | 'theyve': "they've", 111 | 'twas': "'twas", 112 | 'wasnt': "wasn't", 113 | "wed've": "we'd've", 114 | "we'dve": "we'd've", 115 | 'weve': "we've", 116 | 'werent': "weren't", 117 | 'whatll': "what'll", 118 | 'whatre': "what're", 119 | 'whats': "what's", 120 | 'whatve': "what've", 121 | 'whens': "when's", 122 | 'whered': "where'd", 123 | 'wheres': "where's", 124 | 'whereve': "where've", 125 | 'whod': "who'd", 126 | "whod've": "who'd've", 127 | "who'dve": "who'd've", 128 | 'wholl': "who'll", 129 | 'whos': "who's", 130 | 'whove': "who've", 131 | 'whyll': "why'll", 132 | 'whyre': "why're", 133 | 'whys': "why's", 134 | 'wont': "won't", 135 | 'wouldve': "would've", 136 | 'wouldnt': "wouldn't", 137 | "wouldnt've": "wouldn't've", 138 | "wouldn'tve": "wouldn't've", 139 | 'yall': "y'all", 140 | "yall'll": "y'all'll", 141 | "y'allll": "y'all'll", 142 | "yall'd've": "y'all'd've", 143 | "y'alld've": "y'all'd've", 144 | "y'all'dve": "y'all'd've", 145 | 'youd': "you'd", 
146 | "youd've": "you'd've", 147 | "you'dve": "you'd've", 148 | 'youll': "you'll", 149 | 'youre': "you're", 150 | 'youve': "you've", 151 | } 152 | self.manualMap = { 153 | 'none': '0', 154 | 'zero': '0', 155 | 'one': '1', 156 | 'two': '2', 157 | 'three': '3', 158 | 'four': '4', 159 | 'five': '5', 160 | 'six': '6', 161 | 'seven': '7', 162 | 'eight': '8', 163 | 'nine': '9', 164 | 'ten': '10', 165 | } 166 | self.articles = ['a', 'an', 'the'] 167 | 168 | self.periodStrip = re.compile('(?!<=\d)(\.)(?!\d)') 169 | self.commaStrip = re.compile('(\d)(,)(\d)') 170 | self.punct = [ 171 | ';', 172 | r'/', 173 | '[', 174 | ']', 175 | '"', 176 | '{', 177 | '}', 178 | '(', 179 | ')', 180 | '=', 181 | '+', 182 | '\\', 183 | '_', 184 | '-', 185 | '>', 186 | '<', 187 | '@', 188 | '`', 189 | ',', 190 | '?', 191 | '!', 192 | ] 193 | 194 | def evaluate(self, quesIds=None): 195 | if quesIds == None: 196 | quesIds = [quesId for quesId in self.params['question_id']] 197 | gts = {} 198 | res = {} 199 | for quesId in quesIds: 200 | gts[quesId] = self.vqa.qa[quesId] 201 | res[quesId] = self.vqaRes.qa[quesId] 202 | 203 | # ================================================= 204 | # Compute accuracy 205 | # ================================================= 206 | accQA = [] 207 | accQuesType = {} 208 | accAnsType = {} 209 | print('computing accuracy') 210 | step = 0 211 | for quesId in quesIds: 212 | resAns = res[quesId]['answer'] 213 | resAns = resAns.replace('\n', ' ') 214 | resAns = resAns.replace('\t', ' ') 215 | resAns = resAns.strip() 216 | resAns = self.processPunctuation(resAns) 217 | resAns = self.processDigitArticle(resAns) 218 | gtAcc = [] 219 | gtAnswers = [ans['answer'] for ans in gts[quesId]['answers']] 220 | if len(set(gtAnswers)) > 1: 221 | for ansDic in gts[quesId]['answers']: 222 | ansDic['answer'] = self.processPunctuation( 223 | ansDic['answer']) 224 | for gtAnsDatum in gts[quesId]['answers']: 225 | otherGTAns = [ 226 | item for item in gts[quesId]['answers'] 227 | if item != gtAnsDatum 228 | ] 229 | matchingAns = [ 230 | item for item in otherGTAns if item['answer'] == resAns 231 | ] 232 | acc = min(1, float(len(matchingAns)) / 3) 233 | gtAcc.append(acc) 234 | quesType = gts[quesId]['question_type'] 235 | ansType = gts[quesId]['answer_type'] 236 | avgGTAcc = float(sum(gtAcc)) / len(gtAcc) 237 | accQA.append(avgGTAcc) 238 | if quesType not in accQuesType: 239 | accQuesType[quesType] = [] 240 | accQuesType[quesType].append(avgGTAcc) 241 | if ansType not in accAnsType: 242 | accAnsType[ansType] = [] 243 | accAnsType[ansType].append(avgGTAcc) 244 | self.setEvalQA(quesId, avgGTAcc) 245 | self.setEvalQuesType(quesId, quesType, avgGTAcc) 246 | self.setEvalAnsType(quesId, ansType, avgGTAcc) 247 | if step % 100 == 0: 248 | self.updateProgress(step / float(len(quesIds))) 249 | step = step + 1 250 | 251 | self.setAccuracy(accQA, accQuesType, accAnsType) 252 | print('Done computing accuracy') 253 | 254 | def processPunctuation(self, inText): 255 | outText = inText 256 | for p in self.punct: 257 | if (p + ' ' in inText or ' ' + p 258 | in inText) or (re.search(self.commaStrip, inText) != None): 259 | outText = outText.replace(p, '') 260 | else: 261 | outText = outText.replace(p, ' ') 262 | outText = self.periodStrip.sub('', outText, re.UNICODE) 263 | return outText 264 | 265 | def processDigitArticle(self, inText): 266 | outText = [] 267 | tempText = inText.lower().split() 268 | for word in tempText: 269 | word = self.manualMap.setdefault(word, word) 270 | if word not in self.articles: 271 | outText.append(word) 
272 | else: 273 | pass 274 | for wordId, word in enumerate(outText): 275 | if word in self.contractions: 276 | outText[wordId] = self.contractions[word] 277 | outText = ' '.join(outText) 278 | return outText 279 | 280 | def setAccuracy(self, accQA, accQuesType, accAnsType): 281 | self.accuracy['overall'] = round(100 * float(sum(accQA)) / len(accQA), 282 | self.n) 283 | self.accuracy['perQuestionType'] = { 284 | quesType: round( 285 | 100 * float(sum(accQuesType[quesType])) / 286 | len(accQuesType[quesType]), 287 | self.n, 288 | ) 289 | for quesType in accQuesType 290 | } 291 | self.accuracy['perAnswerType'] = { 292 | ansType: round( 293 | 100 * float(sum(accAnsType[ansType])) / 294 | len(accAnsType[ansType]), self.n) 295 | for ansType in accAnsType 296 | } 297 | 298 | def setEvalQA(self, quesId, acc): 299 | self.evalQA[quesId] = round(100 * acc, self.n) 300 | 301 | def setEvalQuesType(self, quesId, quesType, acc): 302 | if quesType not in self.evalQuesType: 303 | self.evalQuesType[quesType] = {} 304 | self.evalQuesType[quesType][quesId] = round(100 * acc, self.n) 305 | 306 | def setEvalAnsType(self, quesId, ansType, acc): 307 | if ansType not in self.evalAnsType: 308 | self.evalAnsType[ansType] = {} 309 | self.evalAnsType[ansType][quesId] = round(100 * acc, self.n) 310 | 311 | def updateProgress(self, progress): 312 | barLength = 20 313 | status = '' 314 | if isinstance(progress, int): 315 | progress = float(progress) 316 | if not isinstance(progress, float): 317 | progress = 0 318 | status = 'error: progress var must be float\r\n' 319 | if progress < 0: 320 | progress = 0 321 | status = 'Halt...\r\n' 322 | if progress >= 1: 323 | progress = 1 324 | status = 'Done...\r\n' 325 | block = int(round(barLength * progress)) 326 | text = '\rFinshed Percent: [{0}] {1}% {2}'.format( 327 | '#' * block + '-' * (barLength - block), int(progress * 100), 328 | status) 329 | sys.stdout.write(text) 330 | sys.stdout.flush() 331 | -------------------------------------------------------------------------------- /finetune.py: -------------------------------------------------------------------------------- 1 | # This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca. 
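# It wires a HuggingFace Trainer together with optional LoRA / Q-LoRA adapters (see
# LoraArguments below) and DeepSpeed ZeRO support for fine-tuning Qwen checkpoints.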
2 | 3 | 4 | from dataclasses import dataclass, field 5 | import json 6 | import math 7 | import logging 8 | import os 9 | from typing import Dict, Optional, List 10 | import torch 11 | from torch.utils.data import Dataset 12 | from deepspeed import zero 13 | from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus 14 | import transformers 15 | from transformers import Trainer, GPTQConfig, deepspeed 16 | from transformers.trainer_pt_utils import LabelSmoother 17 | from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training 18 | from accelerate.utils import DistributedType 19 | 20 | IGNORE_TOKEN_ID = LabelSmoother.ignore_index 21 | 22 | 23 | @dataclass 24 | class ModelArguments: 25 | model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B") 26 | 27 | 28 | @dataclass 29 | class DataArguments: 30 | data_path: str = field( 31 | default=None, metadata={"help": "Path to the training data."} 32 | ) 33 | eval_data_path: str = field( 34 | default=None, metadata={"help": "Path to the evaluation data."} 35 | ) 36 | lazy_preprocess: bool = False 37 | 38 | 39 | @dataclass 40 | class TrainingArguments(transformers.TrainingArguments): 41 | cache_dir: Optional[str] = field(default=None) 42 | optim: str = field(default="adamw_torch") 43 | model_max_length: int = field( 44 | default=8192, 45 | metadata={ 46 | "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)." 47 | }, 48 | ) 49 | use_lora: bool = False 50 | fix_vit: bool = True 51 | 52 | 53 | @dataclass 54 | class LoraArguments: 55 | lora_r: int = 64 56 | lora_alpha: int = 16 57 | lora_dropout: float = 0.05 58 | lora_target_modules: List[str] = field( 59 | default_factory=lambda: ["c_attn", "attn.c_proj", "w1", "w2"] ##["in_proj","out_proj","c_fc"] 60 | ) 61 | lora_weight_path: str = "" 62 | lora_bias: str = "none" 63 | q_lora: bool = False 64 | 65 | 66 | def maybe_zero_3(param): 67 | if hasattr(param, "ds_id"): 68 | assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE 69 | with zero.GatheredParameters([param]): 70 | param = param.data.detach().cpu().clone() 71 | else: 72 | param = param.detach().cpu().clone() 73 | return param 74 | 75 | 76 | # Borrowed from peft.utils.get_peft_model_state_dict 77 | def get_peft_state_maybe_zero_3(named_params, bias): 78 | if bias == "none": 79 | to_return = {k: t for k, t in named_params if "lora_" in k} 80 | elif bias == "all": 81 | to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k} 82 | elif bias == "lora_only": 83 | to_return = {} 84 | maybe_lora_bias = {} 85 | lora_bias_names = set() 86 | for k, t in named_params: 87 | if "lora_" in k: 88 | to_return[k] = t 89 | bias_name = k.split("lora_")[0] + "bias" 90 | lora_bias_names.add(bias_name) 91 | elif "bias" in k: 92 | maybe_lora_bias[k] = t 93 | for k, t in maybe_lora_bias: 94 | if bias_name in lora_bias_names: 95 | to_return[bias_name] = t 96 | else: 97 | raise NotImplementedError 98 | to_return = {k: maybe_zero_3(v) for k, v in to_return.items()} 99 | return to_return 100 | 101 | local_rank = None 102 | 103 | def rank0_print(*args): 104 | if local_rank == 0: 105 | print(*args) 106 | 107 | 108 | def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str, bias="none"): 109 | """Collects the state dict and dump to disk.""" 110 | # check if zero3 mode enabled 111 | if deepspeed.is_deepspeed_zero3_enabled(): 112 | state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict() 113 | else: 114 | if trainer.args.use_lora: 115 | state_dict = 
get_peft_state_maybe_zero_3( 116 | trainer.model.named_parameters(), bias 117 | ) 118 | else: 119 | state_dict = trainer.model.state_dict() 120 | if trainer.args.should_save and trainer.args.local_rank == 0: 121 | trainer._save(output_dir, state_dict=state_dict) 122 | 123 | 124 | def preprocess( 125 | sources, 126 | tokenizer: transformers.PreTrainedTokenizer, 127 | max_len: int, 128 | system_message: str = "You are a helpful assistant." 129 | ) -> Dict: 130 | roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"} 131 | 132 | im_start = tokenizer.im_start_id 133 | im_end = tokenizer.im_end_id 134 | nl_tokens = tokenizer('\n').input_ids 135 | _system = tokenizer('system').input_ids + nl_tokens 136 | _user = tokenizer('user').input_ids + nl_tokens 137 | _assistant = tokenizer('assistant').input_ids + nl_tokens 138 | 139 | # Apply prompt templates 140 | input_ids, targets = [], [] 141 | for i, source in enumerate(sources): 142 | if roles[source[0]["from"]] != roles["user"]: 143 | source = source[1:] 144 | 145 | input_id, target = [], [] 146 | system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens 147 | input_id += system 148 | target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens 149 | assert len(input_id) == len(target) 150 | for j, sentence in enumerate(source): 151 | role = roles[sentence["from"]] 152 | _input_id = tokenizer(role).input_ids + nl_tokens + \ 153 | tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens 154 | input_id += _input_id 155 | if role == '<|im_start|>user': 156 | _target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens 157 | elif role == '<|im_start|>assistant': 158 | _target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \ 159 | _input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens 160 | else: 161 | raise NotImplementedError 162 | target += _target 163 | assert len(input_id) == len(target) 164 | input_id += [tokenizer.pad_token_id] * (max_len - len(input_id)) 165 | target += [IGNORE_TOKEN_ID] * (max_len - len(target)) 166 | input_ids.append(input_id[:max_len]) 167 | targets.append(target[:max_len]) 168 | input_ids = torch.tensor(input_ids, dtype=torch.int) 169 | targets = torch.tensor(targets, dtype=torch.int) 170 | 171 | return dict( 172 | input_ids=input_ids, 173 | labels=targets, 174 | attention_mask=input_ids.ne(tokenizer.pad_token_id), 175 | ) 176 | 177 | 178 | class SupervisedDataset(Dataset): 179 | """Dataset for supervised fine-tuning.""" 180 | 181 | def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int): 182 | super(SupervisedDataset, self).__init__() 183 | 184 | rank0_print("Formatting inputs...") 185 | sources = [example["conversations"] for example in raw_data] 186 | data_dict = preprocess(sources, tokenizer, max_len) 187 | 188 | self.input_ids = data_dict["input_ids"] 189 | self.labels = data_dict["labels"] 190 | self.attention_mask = data_dict["attention_mask"] 191 | 192 | def __len__(self): 193 | return len(self.input_ids) 194 | 195 | def __getitem__(self, i) -> Dict[str, torch.Tensor]: 196 | return dict( 197 | input_ids=self.input_ids[i], 198 | labels=self.labels[i], 199 | attention_mask=self.attention_mask[i], 200 | ) 201 | 202 | 203 | class LazySupervisedDataset(Dataset): 204 | """Dataset for supervised fine-tuning.""" 205 | 206 | def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int): 207 | super(LazySupervisedDataset, 
self).__init__() 208 | self.tokenizer = tokenizer 209 | self.max_len = max_len 210 | 211 | rank0_print("Formatting inputs...Skip in lazy mode") 212 | self.tokenizer = tokenizer 213 | self.raw_data = raw_data 214 | self.cached_data_dict = {} 215 | 216 | def __len__(self): 217 | return len(self.raw_data) 218 | 219 | def __getitem__(self, i) -> Dict[str, torch.Tensor]: 220 | if i in self.cached_data_dict: 221 | return self.cached_data_dict[i] 222 | 223 | ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len) 224 | ret = dict( 225 | input_ids=ret["input_ids"][0], 226 | labels=ret["labels"][0], 227 | attention_mask=ret["attention_mask"][0], 228 | ) 229 | self.cached_data_dict[i] = ret 230 | 231 | return ret 232 | 233 | 234 | def make_supervised_data_module( 235 | tokenizer: transformers.PreTrainedTokenizer, data_args, max_len, 236 | ) -> Dict: 237 | """Make dataset and collator for supervised fine-tuning.""" 238 | dataset_cls = ( 239 | LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset 240 | ) 241 | rank0_print("Loading data...") 242 | 243 | train_json = json.load(open(data_args.data_path, "r")) 244 | train_dataset = dataset_cls(train_json, tokenizer=tokenizer, max_len=max_len) 245 | 246 | if data_args.eval_data_path: 247 | eval_json = json.load(open(data_args.eval_data_path, "r")) 248 | eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, max_len=max_len) 249 | else: 250 | eval_dataset = None 251 | 252 | return dict(train_dataset=train_dataset, eval_dataset=eval_dataset) 253 | 254 | 255 | def train(): 256 | global local_rank 257 | 258 | parser = transformers.HfArgumentParser( 259 | (ModelArguments, DataArguments, TrainingArguments, LoraArguments) 260 | ) 261 | ( 262 | model_args, 263 | data_args, 264 | training_args, 265 | lora_args, 266 | ) = parser.parse_args_into_dataclasses() 267 | 268 | if getattr(training_args, 'deepspeed', None) and getattr(lora_args, 'q_lora', False): 269 | training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED 270 | 271 | compute_dtype = ( 272 | torch.float16 273 | if training_args.fp16 274 | else (torch.bfloat16 if training_args.bf16 else torch.float32) 275 | ) 276 | 277 | local_rank = training_args.local_rank 278 | 279 | device_map = None 280 | world_size = int(os.environ.get("WORLD_SIZE", 1)) 281 | ddp = world_size != 1 282 | if lora_args.q_lora: 283 | device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else None 284 | if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled(): 285 | logging.warning( 286 | "FSDP or ZeRO3 is incompatible with QLoRA."
287 | ) 288 | 289 | # Set RoPE scaling factor 290 | config = transformers.AutoConfig.from_pretrained( 291 | model_args.model_name_or_path, 292 | cache_dir=training_args.cache_dir, 293 | trust_remote_code=True, 294 | ) 295 | config.use_cache = False 296 | 297 | # Load model and tokenizer 298 | model = transformers.AutoModelForCausalLM.from_pretrained( 299 | model_args.model_name_or_path, 300 | config=config, 301 | cache_dir=training_args.cache_dir, 302 | device_map=device_map, 303 | trust_remote_code=True, 304 | quantization_config=GPTQConfig( 305 | bits=4, disable_exllama=True 306 | ) 307 | if training_args.use_lora and lora_args.q_lora 308 | else None, 309 | ) 310 | 311 | if not training_args.use_lora: 312 | if training_args.fix_vit and hasattr(model,'transformer') and hasattr(model.transformer,'visual'): 313 | model.transformer.visual.requires_grad_(False) 314 | if hasattr(model.transformer.visual,'attn_pool'): 315 | model.transformer.visual.attn_pool.requires_grad_(True) 316 | tokenizer = transformers.AutoTokenizer.from_pretrained( 317 | model_args.model_name_or_path, 318 | cache_dir=training_args.cache_dir, 319 | model_max_length=training_args.model_max_length, 320 | padding_side="right", 321 | use_fast=False, 322 | trust_remote_code=True, 323 | ) 324 | tokenizer.pad_token_id = tokenizer.eod_id 325 | 326 | if training_args.use_lora: 327 | if lora_args.q_lora or "chat" in model_args.model_name_or_path.lower(): 328 | modules_to_save = None 329 | else: 330 | modules_to_save = ["wte", "lm_head"] 331 | lora_config = LoraConfig( 332 | r=lora_args.lora_r, 333 | lora_alpha=lora_args.lora_alpha, 334 | target_modules=lora_args.lora_target_modules, 335 | lora_dropout=lora_args.lora_dropout, 336 | bias=lora_args.lora_bias, 337 | task_type="CAUSAL_LM", 338 | modules_to_save=modules_to_save # This argument serves for adding new tokens. 
339 | ) 340 | if lora_args.q_lora: 341 | model = prepare_model_for_kbit_training( 342 | model, use_gradient_checkpointing=training_args.gradient_checkpointing 343 | ) 344 | 345 | model = get_peft_model(model, lora_config) 346 | 347 | if training_args.gradient_checkpointing: 348 | model.enable_input_require_grads() 349 | 350 | # Load data 351 | data_module = make_supervised_data_module( 352 | tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length 353 | ) 354 | 355 | # Start trainner 356 | trainer = Trainer( 357 | model=model, tokenizer=tokenizer, args=training_args, **data_module 358 | ) 359 | 360 | trainer.train() 361 | trainer.save_state() 362 | 363 | safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias) 364 | 365 | 366 | if __name__ == "__main__": 367 | train() 368 | -------------------------------------------------------------------------------- /finetune/ds_config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "scheduler": { 24 | "type": "WarmupLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 2, 34 | "offload_optimizer": { 35 | "device": "none", 36 | "pin_memory": true 37 | }, 38 | "allgather_partitions": true, 39 | "allgather_bucket_size": 2e8, 40 | "overlap_comm": true, 41 | "reduce_scatter": true, 42 | "reduce_bucket_size": 2e8, 43 | "contiguous_gradients": true 44 | }, 45 | 46 | "gradient_accumulation_steps": "auto", 47 | "gradient_clipping": "auto", 48 | "steps_per_print": 100, 49 | "train_batch_size": "auto", 50 | "train_micro_batch_size_per_gpu": "auto", 51 | "wall_clock_breakdown": false 52 | } -------------------------------------------------------------------------------- /finetune/ds_config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "scheduler": { 24 | "type": "WarmupLR", 25 | "params": { 26 | "warmup_min_lr": "auto", 27 | "warmup_max_lr": "auto", 28 | "warmup_num_steps": "auto" 29 | } 30 | }, 31 | 32 | "zero_optimization": { 33 | "stage": 3, 34 | "offload_optimizer": { 35 | "device": "none", 36 | "pin_memory": true 37 | }, 38 | "offload_param": { 39 | "device": "none", 40 | "pin_memory": true 41 | }, 42 | "overlap_comm": true, 43 | "contiguous_gradients": true, 44 | "sub_group_size": 1e9, 45 | "reduce_bucket_size": "auto", 46 | "stage3_prefetch_bucket_size": "auto", 47 | "stage3_param_persistence_threshold": "auto", 48 | "stage3_max_live_parameters": 1e9, 49 | "stage3_max_reuse_distance": 1e9, 50 | "stage3_gather_16bit_weights_on_model_save": true 51 | }, 52 | 53 | "gradient_accumulation_steps": 
"auto", 54 | "gradient_clipping": "auto", 55 | "steps_per_print": 100, 56 | "train_batch_size": "auto", 57 | "train_micro_batch_size_per_gpu": "auto", 58 | "wall_clock_breakdown": false 59 | } 60 | -------------------------------------------------------------------------------- /finetune/finetune_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 14 | DATA="path_to_data" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen \ 30 | --num_train_epochs 5 \ 31 | --per_device_train_batch_size 1 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 16 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 1000 \ 37 | --save_total_limit 10 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "none" \ 45 | --model_max_length 2048 \ 46 | --gradient_checkpointing True \ 47 | --lazy_preprocess True \ 48 | --deepspeed finetune/ds_config_zero3.json 49 | -------------------------------------------------------------------------------- /finetune/finetune_lora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="path_to_data" 15 | 16 | DISTRIBUTED_ARGS=" 17 | --nproc_per_node $GPUS_PER_NODE \ 18 | --nnodes $NNODES \ 19 | --node_rank $NODE_RANK \ 20 | --master_addr $MASTER_ADDR \ 21 | --master_port $MASTER_PORT 22 | " 23 | 24 | torchrun $DISTRIBUTED_ARGS finetune.py \ 25 | --model_name_or_path $MODEL \ 26 | --data_path $DATA \ 27 | --bf16 True \ 28 | --fix_vit True \ 29 | --output_dir output_qwen \ 30 | --num_train_epochs 5 \ 31 | --per_device_train_batch_size 2 \ 32 | --per_device_eval_batch_size 1 \ 33 | --gradient_accumulation_steps 8 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 1000 \ 37 | --save_total_limit 10 \ 38 | --learning_rate 1e-5 \ 39 | --weight_decay 0.1 \ 40 | --adam_beta2 0.95 \ 41 | --warmup_ratio 0.01 \ 42 | --lr_scheduler_type "cosine" \ 43 | --logging_steps 1 \ 44 | --report_to "none" \ 45 | --model_max_length 2048 \ 46 | --lazy_preprocess True \ 47 | --use_lora \ 48 | --gradient_checkpointing \ 49 | --deepspeed finetune/ds_config_zero2.json -------------------------------------------------------------------------------- /finetune/finetune_lora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | 6 | MODEL="Qwen/Qwen-VL-Chat" #"Qwen/Qwen-VL-Chat"/"Qwen/Qwen-VL" # Set the path if you do not want to load from huggingface directly 7 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 8 | # See the section for finetuning in README for more information. 9 | DATA="path_to_data" 10 | 11 | export CUDA_VISIBLE_DEVICES=0 12 | 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --bf16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora -------------------------------------------------------------------------------- /finetune/finetune_qlora_ds.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | GPUS_PER_NODE=8 6 | NNODES=1 7 | NODE_RANK=0 8 | MASTER_ADDR=localhost 9 | MASTER_PORT=6001 10 | 11 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 12 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 13 | # See the section for finetuning in README for more information. 
14 | DATA="path_to_data" 15 | 16 | 17 | DISTRIBUTED_ARGS=" 18 | --nproc_per_node $GPUS_PER_NODE \ 19 | --nnodes $NNODES \ 20 | --node_rank $NODE_RANK \ 21 | --master_addr $MASTER_ADDR \ 22 | --master_port $MASTER_PORT 23 | " 24 | 25 | # Remember to use --fp16 instead of --bf16 due to autogptq 26 | torchrun $DISTRIBUTED_ARGS finetune.py \ 27 | --model_name_or_path $MODEL \ 28 | --data_path $DATA \ 29 | --fp16 True \ 30 | --fix_vit True \ 31 | --output_dir output_qwen \ 32 | --num_train_epochs 5 \ 33 | --per_device_train_batch_size 2 \ 34 | --per_device_eval_batch_size 1 \ 35 | --gradient_accumulation_steps 8 \ 36 | --evaluation_strategy "no" \ 37 | --save_strategy "steps" \ 38 | --save_steps 1000 \ 39 | --save_total_limit 10 \ 40 | --learning_rate 1e-5 \ 41 | --weight_decay 0.1 \ 42 | --adam_beta2 0.95 \ 43 | --warmup_ratio 0.01 \ 44 | --lr_scheduler_type "cosine" \ 45 | --logging_steps 1 \ 46 | --report_to "none" \ 47 | --model_max_length 2048 \ 48 | --lazy_preprocess True \ 49 | --use_lora \ 50 | --q_lora \ 51 | --gradient_checkpointing \ 52 | --deepspeed finetune/ds_config_zero2.json -------------------------------------------------------------------------------- /finetune/finetune_qlora_single_gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export CUDA_DEVICE_MAX_CONNECTIONS=1 3 | DIR=`pwd` 4 | 5 | MODEL="Qwen/Qwen-VL-Chat-Int4" # Qwen/Qwen-VL-Chat-Int4 Set the path if you do not want to load from huggingface directly 6 | # ATTENTION: specify the path to your training data, which should be a json file consisting of a list of conversations. 7 | # See the section for finetuning in README for more information. 8 | DATA="path_to_data" 9 | 10 | export CUDA_VISIBLE_DEVICES=0 11 | 12 | # Remember to use --fp16 instead of --bf16 due to autogptq 13 | python finetune.py \ 14 | --model_name_or_path $MODEL \ 15 | --data_path $DATA \ 16 | --fp16 True \ 17 | --fix_vit True \ 18 | --output_dir output_qwen \ 19 | --num_train_epochs 5 \ 20 | --per_device_train_batch_size 1 \ 21 | --per_device_eval_batch_size 1 \ 22 | --gradient_accumulation_steps 8 \ 23 | --evaluation_strategy "no" \ 24 | --save_strategy "steps" \ 25 | --save_steps 1000 \ 26 | --save_total_limit 10 \ 27 | --learning_rate 1e-5 \ 28 | --weight_decay 0.1 \ 29 | --adam_beta2 0.95 \ 30 | --warmup_ratio 0.01 \ 31 | --lr_scheduler_type "cosine" \ 32 | --logging_steps 1 \ 33 | --report_to "none" \ 34 | --model_max_length 2048 \ 35 | --lazy_preprocess True \ 36 | --gradient_checkpointing \ 37 | --use_lora \ 38 | --q_lora \ 39 | --deepspeed finetune/ds_config_zero2.json 40 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.32.0 2 | accelerate 3 | tiktoken 4 | einops 5 | transformers_stream_generator==0.0.4 6 | scipy 7 | torchvision 8 | pillow 9 | tensorboard 10 | matplotlib 11 | -------------------------------------------------------------------------------- /requirements_openai_api.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn 3 | openai 4 | pydantic 5 | sse_starlette 6 | -------------------------------------------------------------------------------- /requirements_web_demo.txt: -------------------------------------------------------------------------------- 1 | gradio 2 | modelscope 3 | -------------------------------------------------------------------------------- 
/touchstone/README.md: -------------------------------------------------------------------------------- 1 | <br> 2 | 3 | <p align="center"> 4 | <img src="../assets/touchstone_logo.png" width="300"/> 5 | <p> 6 | <br> 7 | 8 | <p align="center"> 9 | <a href="../touchstone/README_CN.md">中文</a> | English | <a href="../touchstone/README_JA.md">日本語</a> | <a href="../touchstone/README_KO.md">한국어</a> 10 | </p> 11 | <br><br> 12 | 13 | **TOUCHSTONE** is a comprehensive assessment of multimodal language models, covering not only basic recognition and comprehension but also literary creation. By automating the evaluation process and converting multimodal information into text, TouchStone enables efficient and accurate assessment of dialogue quality, leveraging the power of advanced language models without the need for manual intervention. 14 | 15 | ## DATASET 16 | 17 | To evaluate the abilities of LVLMs, we construct a diverse and comprehensive dataset that covers five key dimensions: basic descriptive ability, visual recognition ability, visual comprehension ability, visual storytelling ability, and multi-image analysis ability. 18 | 19 | - **Basic Descriptive Ability** Image description involves the ability of a model to describe the information contained in an image, including simple and detailed descriptions. Simple descriptions are typically short phrases that describe the main subject and action of the image, while detailed descriptions provide more in-depth information about the scene, the attributes of its objects, and their relationships. 20 | 21 | - **Visual Recognition Ability** Image recognition is the task of recognizing objects or scenes within an image and inferring relevant information. This area can be further divided into several sub-tasks, including attribute QA, movie/TV recognition, art recognition, landmark recognition, celebrity recognition, emotion recognition, text recognition, object recognition, and structure content recognition. 22 | 23 | - **Visual Comprehension Ability** Image understanding involves the ability of a model to understand the meaning of an image and associated tasks. This area encompasses several sub-tasks, such as style appreciation, abstract image understanding, meme understanding, image analysis, chart analysis, general problem-solving, and reasoning QA. 24 | 25 | - **Visual Storytelling Ability** The visual storytelling ability is the process of literary creation based on visual content, including writing emails, poetry, stories, ads/commodity recommendations, and brainstorming. 26 | 27 | - **Multi-Image Analysis Ability** Multi-image analysis is the task of analyzing and comparing multiple images. This area includes tasks such as comparing two/multiple images, summarizing multiple image information, comparing commodities, and step-by-step analysis of images. 28 | 29 | 30 | <p align="center"> 31 | <img src="../assets/touchstone_datasets.jpg" width="600"/> 32 | <p> 33 | 34 | We comprehensively evaluate a model's ability along these five dimensions. As shown in the figure above, examples of the 27 subtasks are given. From perception to cognition to creativity, the demands placed on models grow as the difficulty increases. Currently, LVLM capabilities are still at an early stage. Our dataset contains 800+ questions across 27 categories. 35 | 36 | ## Methods 37 | 38 | 39 | We apply a powerful LLM as a judge to enable automated evaluation.
To effectively comprehend the contents of an image, we manually substitute the actual image input with fine-grained textual annotations. By inputting these annotations and corresponding questions to a powerful LLM like GPT4, we obtain reference answers. 40 | 41 | For the evaluation of the LVLMs, we provide actual images and questions as input and obtain their respective answers. Finally, we employ GPT4 to score the answers generated by the LVLMs based on the fine-grained annotations and questions. The scoring instructions require the model to assess the usefulness, relevance, and accuracy of the answers, considering the annotations as the content of the images. To ensure fairness in the evaluation, each model's answer is compared against a consistent reference answer from GPT4. The average score of the model in all questions is taken as the final score. 42 | 43 | To eliminate the influence of answer position, we perform a second scoring round by swapping the positions of the answers and then compute the average of the two scores obtained. This approach aims to mitigate any bias introduced by the placement of the answers. 44 | 45 | <p align="center"> 46 | <img src="../assets/touchstone_eval.png" width="600"/> 47 | <p> 48 | 49 | ### Evaluation 50 | 51 | #### Evaluation in English-based Multimodal Dialogue 52 | 53 | | Model | Score | 54 | |---------------|-------| 55 | | PandaGPT | 488.5 | 56 | | MiniGPT4 | 531.7 | 57 | | InstructBLIP | 552.4 | 58 | | LLaMA-AdapterV2 | 590.1 | 59 | | mPLUG-Owl | 605.4 | 60 | | LLaVA | 602.7 | 61 | | Qwen-VL-Chat | 645.2 | 62 | 63 | #### Evaluation in Chinese-based Multimodal Dialogue 64 | 65 | | Model | Score | 66 | |---------------|-------| 67 | | VisualGLM | 247.1 | 68 | | Qwen-VL-Chat | 401.2 | 69 | 70 | -------------------------------------------------------------------------------- /touchstone/README_CN.md: -------------------------------------------------------------------------------- 1 | <br> 2 | 3 | <p align="center"> 4 | <img src="../assets/touchstone_logo.png" width="300"/> 5 | <p> 6 | <br> 7 | 8 | <p align="center"> 9 | 中文  |  <a href="../touchstone/README.md">English</a> |  <a href="../touchstone/README_JA.md">日本語</a> 10 | </p> 11 | <br><br> 12 | 13 | **TOUCHSTONE** 是一种针对多模态语言模型(LVLM)的自动化综合评估方法,评估不仅包括基本的认知和理解,还延伸到文学创作。通过人类注解将多模态信息转换为文本,我们的 TouchStone 可以利用SOTA的语言模型来自动化地完成对LVLMs的多模态对话质量评估。 14 | 15 | ## 数据集 16 | 17 | 为了评估 LVLMs 的能力,我们构建了一个多样化且全面的数据集,涵盖五个关键维度:基本描述能力、视觉识别能力、视觉理解能力、视觉叙事能力和多图分析能力。 18 | 19 | - **基本描述能力** 图像描述考验模型总结图片信息的能力,包括简单描述和详细描述。 简单描述通常是描述图像的主要内容和关系的简短短语,而详细描述则提供有关图像场景、其属性和关系的更深入的信息。 20 | 21 | - **视觉识别能力** 图像识别考察模型提取图像中内容的属性以及关联到知识库的能力。为了考察这方面能力,测试的问题包括属性QA、影视识别、艺术识别、地标识别、名人识别、情感识别、文本识别、物体识别和结构内容识别。 22 | 23 | - **视觉理解能力** 图像理解需要模型理解图像内容并完成推理进行相关任务。 这方面包含了例如风格欣赏、抽象图像理解、模因理解、图像分析、图表分析、一般问题解决和推理问答等任务。 24 | 25 | - **视觉叙事能力** 视觉叙事能力是基于视觉内容的文学创作能力,包括撰写电子邮件、诗歌、故事、广告/商品推荐、头脑风暴等。 26 | 27 | - **多图分析能力** 多图分析是分析和比较多幅图像的任务。该领域包括比较两个/多个图像、总结多个图像信息、比较商品以及逐步分析图像等任务。 28 | 29 | <p align="center"> 30 | <img src="../assets/touchstone_datasets.jpg" width="600"/> 31 | <p> 32 | 33 | 我们从五个维度综合评估了模型的能力。 如上图所示,给出了27个子任务的示例。 从感知到认知,再到创造力,随着难度的增加,对模型的要求也越来越高。 目前,LVLM的能力还处于早期阶段。 我们的数据集包含800+道题目、27个类别。 34 | 35 | ## 测评方式 36 | 37 | 我们应用SOTA的LLM进行自动化评估。 为了有效地理解图像的内容,我们人工用细粒度的文本注释替换实际的图像输入。 通过将这些注释和相应的问题输入到像GPT4这样强LLM中,我们可以获得参考答案。 38 | 39 | 对于待测评的LVLM,我们提供实际图像和问题作为输入并获得各自的答案。 最后,我们使用GPT4根据细粒度注释和问题对LVLM生成的答案进行评分。 评分指令要求模型评估答案的有用性、相关性和准确性,并将人工注解视为图像的内容。 为了确保评估的公平性,每个模型的答案都会与 GPT4生成的参考答案进行比较。 模型在所有问题上的平均得分作为最终得分。 40 | 41 | 
为了消除答案位置的影响,我们通过交换答案的位置来进行第二轮评分,然后计算获得的两次分数的平均值。 42 | 43 | <p align="center"> 44 | <img src="../assets/touchstone_eval.png" width="600"/> 45 | <p> 46 | 47 | 48 | ## 测评结果 49 | 50 | #### 英文版本测评 51 | 52 | | Model | Score | 53 | |---------------|-------| 54 | | PandaGPT | 488.5 | 55 | | MiniGPT4 | 531.7 | 56 | | InstructBLIP | 552.4 | 57 | | LLaMA-AdapterV2 | 590.1 | 58 | | mPLUG-Owl | 605.4 | 59 | | LLaVA | 602.7 | 60 | | Qwen-VL-Chat | 645.2 | 61 | 62 | #### 中文版本测评 63 | 64 | | Model | Score | 65 | |---------------|-------| 66 | | VisualGLM | 247.1 | 67 | | Qwen-VL-Chat | 401.2 | 68 | 69 | -------------------------------------------------------------------------------- /touchstone/README_JA.md: -------------------------------------------------------------------------------- 1 | <br> 2 | 3 | <p align="center"> 4 | <img src="../assets/touchstone_logo.png" width="300"/> 5 | <p> 6 | <br> 7 | 8 | <p align="center"> 9 | <a href="touchstone/README_CN.md">中文</a>  |  <a href="../touchstone/README.md">English</a>|  日本語 10 | </p> 11 | <br><br> 12 | 13 | **TOUCHSTONE** は、マルチモーダル言語モデルの包括的な評価であり、基本的な認識や理解だけでなく、文学的な創作にまで及びます。評価プロセスを自動化し、マルチモーダル情報をテキストに変換することで、私達の TouchStone は、人手を介することなく高度な言語モデルの力を活用し、対話の質を効率的かつ正確に評価することができます。 14 | 15 | ## DATASET 16 | 17 | LVLMの能力を評価するために、基本的な記述能力、視覚認識能力、視覚理解能力、視覚ストーリーテリング能力、複数画像解析能力の5つの主要な次元をカバーする多様で包括的なデータセットを構築する。 18 | 19 | - **基本的描写力** 画像記述には、単純な記述と詳細な記述を含め、画像に含まれる情報を記述するモデルの能力が含まれる。単純な記述は、通常、画像の主な主題とアクションを記述する短いフレーズであり、詳細な記述は、画像のシーン、それらの属性、および関係についてのより詳細な情報を提供します。 20 | 21 | - **視覚認識能力** 画像認識とは、画像内のオブジェクトやシーンを認識し、関連情報を推論するタスクである。この分野はさらに、属性QA、映画/テレビ認識、アート認識、ランドマーク認識、有名人認識、感情認識、テキスト認識、オブジェクト認識、構造コンテンツ認識など、いくつかのサブタスクに分けることができる。 22 | 23 | - **視覚理解能力** 画像理解とは、モデルが画像の意味や関連するタスクを理解する能力のことである。この分野には、スタイル理解、抽象画像理解、ミーム理解、画像分析、チャート分析、一般的な問題解決、推論QAなど、いくつかのサブタスクが含まれる。 24 | 25 | - **視覚的ストーリーテリング能力** ビジュアルストーリーテリング能力とは、メール、詩、物語、広告/商品推薦、ブレーンストーミングの執筆など、ビジュアルコンテンツに基づいた文学創作のプロセスである。 26 | 27 | - **マルチ画像解析能力** 複数画像解析とは、複数の画像を解析・比較する作業である。この分野には、2つまたは複数の画像を比較する、複数の画像情報を要約する、商品を比較する、画像を段階的に分析するなどのタスクが含まれます。 28 | 29 | 30 | <p align="center"> 31 | <img src="../assets/touchstone_datasets.jpg" width="600"/> 32 | <p> 33 | 34 | モデルの能力を 5 つの次元から総合的に評価する。上図のように、27 のサブタスクの例を示す。知覚から認知、創造性まで、難易度が上がるにつれて、モデルに求められる要件もどんどん高くなっている。現在、LVLM の機能は初期段階にある。我々のデータセットには 800 以上の質問と 27 のカテゴリーが含まれている。 35 | 36 | ## 方法 37 | 38 | 39 | 自動評価を可能にするために、強力な LLM を判定器として適用する。画像の内容を効果的に理解するために、実際の画像入力をきめ細かいテキスト注釈に手動で置き換える。これらの注釈と対応する質問を GPT4 のような強力な LLM に入力することで、参照解答を得る。 40 | 41 | LVLMの評価には、実際の画像と質問を入力として与え、それぞれの回答を得る。最後に、GPT4を用いて、LVLMが生成した回答を、細かいアノテーションと質問に基づいてスコアリングする。スコアリングの指示は、注釈を画像の内容とみなして、回答の有用性、関連性、正確性を評価するようモデルに要求する。評価の公平性を確保するため、各モデルの回答はGPT4の一貫した参照回答と比較されます。全問題におけるモデルの平均スコアを最終スコアとする。 42 | 43 | 解答位置の影響を排除するために、解答位置を入れ替えて2回目の採点ラウンドを行い、得られた2つのスコアの平均を計算します。このアプローチは、解答の配置によって生じるバイアスを軽減することを目的としています。 44 | <p align="center"> 45 | <img src="../assets/touchstone_eval.png" width="600"/> 46 | <p> 47 | 48 | ### 評価 49 | 50 | #### 英語ベースのマルチモーダル対話における評価 51 | 52 | | Model | Score | 53 | |---------------|-------| 54 | | PandaGPT | 488.5 | 55 | | MiniGPT4 | 531.7 | 56 | | InstructBLIP | 552.4 | 57 | | LLaMA-AdapterV2 | 590.1 | 58 | | mPLUG-Owl | 605.4 | 59 | | LLaVA | 602.7 | 60 | | Qwen-VL-Chat | 645.2 | 61 | 62 | #### 中国語ベースのマルチモーダル対話における評価 63 | 64 | | Model | Score | 65 | |---------------|-------| 66 | | VisualGLM | 247.1 | 67 | | Qwen-VL-Chat | 401.2 | 68 | 69 | -------------------------------------------------------------------------------- /touchstone/README_KO.md: 
-------------------------------------------------------------------------------- 1 | <br> 2 | 3 | <p align="center"> 4 | <img src="../assets/touchstone_logo.png" width="300"/> 5 | <p> 6 | <br> 7 | 8 | <p align="center"> 9 | <a href="../touchstone/README_CN.md">中文</a>  |  English |  <a href="../touchstone/README_JA.md">日本語</a> |  <a href="../touchstone/README_KO.md">한국어</a> 10 | </p> 11 | <br><br> 12 | 13 | **터치스톤, TOUCHSTONE**은 기본적인 인식과 이해력뿐만 아니라 문학 창작까지 아우르는 종합적인 멀티모달 언어 모델 평가입니다. 평가 프로세스를 자동화하고 멀티모달 정보를 텍스트로 변환하는 터치스톤은 수동 개입 없이도 고급 언어 모델의 성능을 활용하여 대화 품질을 효율적이고 정확하게 평가할 수 있도록 지원합니다. 14 | 15 | ## DATASET 16 | 17 | 머신러닝의 능력을 평가하기 위해 기본 설명 능력, 시각 인식 능력, 시각 이해 능력, 시각 스토리텔링 능력, 다중 이미지 분석 능력 등 5가지 주요 모달을 포괄하는 다양하고 광범위한 데이터 세트를 구축합니다. 18 | 19 | - **기본 설명 능력, Basic Descriptive Ability** 이미지 설명에는 단순 설명과 상세 설명을 포함하여 이미지에 포함된 정보를 설명하는 모델의 능력이 포함됩니다. 단순 설명은 일반적으로 이미지의 주요 주제와 동작을 설명하는 짧은 문구로 상세 설명은 이미지 장면, 속성 및 관계에 대한 보다 심층적인 정보를 제공합니다. 20 | 21 | - **시각적 인식 능력, Visual Recognition Ability** 이미지 인식은 이미지 내의 사물이나 장면을 인식하고 관련 정보를 추론하는 작업입니다. 이 영역은 속성 QA, 영화/TV 인식, 예술 인식, 랜드마크 인식, 유명인 인식, 감정 인식, 텍스트 인식, 사물 인식, 구조물 내용 인식 등 여러 하위 작업으로 세분화할 수 있습니다. 22 | 23 | - **시각적 이해 능력, Visual Comprehension Ability** 이미지 이해에는 이미지의 의미와 관련 작업을 이해하는 모델의 능력이 포함됩니다. 이 영역에는 스타일 감상, 추상적 이미지 이해, 밈 이해, 이미지 분석, 차트 분석, 일반적인 문제 해결, 추론 QA와 같은 여러 하위 작업이 포함됩니다. 24 | 25 | - **시각적 스토리텔링 능력, Visual Storytelling Ability** 시각적 스토리텔링 능력은 이메일, 시, 스토리, 광고/상품 추천, 브레인스토밍 등 시각적 콘텐츠를 기반으로 문학적 창작을 하는 과정입니다. 26 | 27 | - **다중 이미지 분석 능력, Multi-Image Analysis Ability** 다중 이미지 분석은 여러 이미지를 분석하고 비교하는 작업입니다. 이 영역에는 두 개/여러 개의 이미지 비교, 여러 이미지 정보 요약, 상품 비교, 이미지의 단계별 분석 등의 작업이 포함됩니다. 28 | 29 | 30 | 31 | <p align="center"> 32 | <img src="../assets/touchstone_datasets.jpg" width="600"/> 33 | <p> 34 | 35 | 5가지 측면에서 모델의 능력을 종합적으로 평가합니다. 위 그림과 같이 27개의 하위 과제를 예로 들었습니다. 지각부터 인지, 창의력까지 난이도가 높아질수록 모델에 대한 요구 사항도 점점 더 높아지고 있습니다. 현재 LVLM 기능은 초기 단계에 있습니다. 데이터 세트에는 800개 이상의 질문과 27개 카테고리가 포함되어 있습니다. 36 | 37 | ## Methods 38 | 39 | 당사는 자동화된 평가를 위해 강력한 LLM을 심사자로 적용합니다. 이미지의 내용을 효과적으로 이해하기 위해 실제 이미지 입력을 세분화된 텍스트 주석으로 수동으로 대체합니다. 이러한 주석과 해당 질문을 GPT4와 같은 강력한 LLM에 입력하면 참조 답변을 얻을 수 있습니다. 40 | 41 | LVLM의 평가를 위해 실제 이미지와 질문을 입력으로 제공하고 각각의 답변을 얻습니다. 마지막으로, 세분화된 주석과 질문을 기반으로 LVLM이 생성한 답변에 GPT4를 사용하여 점수를 매깁니다. 채점 지침에 따라 모델은 주석을 이미지의 콘텐츠로 간주하여 답변의 유용성, 관련성 및 정확성을 평가해야 합니다. 평가의 공정성을 보장하기 위해 각 모델의 답변은 GPT4의 일관된 참조 답변과 비교됩니다. 모든 문제에서 모델의 평균 점수가 최종 점수로 사용됩니다. 42 | 43 | 답안 위치의 영향을 제거하기 위해 답안 위치를 바꿔서 두 번째 채점 라운드를 수행한 다음 얻은 두 점수의 평균을 계산합니다. 이 접근 방식은 답안 배치로 인해 발생하는 편향을 완화하는 것을 목표로 합니다. 44 | 45 | <p align="center"> 46 | <img src="../assets/touchstone_eval.png" width="600"/> 47 | <p> 48 | 49 | ### Evaluation 50 | 51 | #### Evaluation in English-based Multimodal Dialogue 52 | 53 | | Model | Score | 54 | |---------------|-------| 55 | | PandaGPT | 488.5 | 56 | | MiniGPT4 | 531.7 | 57 | | InstructBLIP | 552.4 | 58 | | LLaMA-AdapterV2 | 590.1 | 59 | | mPLUG-Owl | 605.4 | 60 | | LLaVA | 602.7 | 61 | | Qwen-VL-Chat | 645.2 | 62 | 63 | #### Evaluation in Chinese-based Multimodal Dialogue 64 | 65 | | Model | Score | 66 | |---------------|-------| 67 | | VisualGLM | 247.1 | 68 | | Qwen-VL-Chat | 401.2 | 69 | 70 | -------------------------------------------------------------------------------- /web_demo_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 2 | # 3 | # This source code is licensed under the license found in the 4 | # LICENSE file in the root directory of this source tree. 
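# Example launch (illustrative; the checkpoint id, host and port below are simply the
# defaults declared in _get_args()):
#   pip install -r requirements.txt -r requirements_web_demo.txt
#   python web_demo_mm.py -c qwen/Qwen-VL-Chat --server-name 127.0.0.1 --server-port 8000
# Use --cpu-only to run without CUDA, --share to create a public Gradio link, and
# --inbrowser to open the UI in the default browser automatically.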
5 | 6 | """A simple web interactive chat demo based on gradio.""" 7 | 8 | from argparse import ArgumentParser 9 | from pathlib import Path 10 | 11 | import copy 12 | import gradio as gr 13 | import os 14 | import re 15 | import secrets 16 | import tempfile 17 | from modelscope import ( 18 | snapshot_download, AutoModelForCausalLM, AutoTokenizer, GenerationConfig 19 | ) 20 | 21 | DEFAULT_CKPT_PATH = 'qwen/Qwen-VL-Chat' 22 | BOX_TAG_PATTERN = r"<box>([\s\S]*?)</box>" 23 | PUNCTUATION = "!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." 24 | 25 | 26 | def _get_args(): 27 | parser = ArgumentParser() 28 | parser.add_argument("-c", "--checkpoint-path", type=str, default=DEFAULT_CKPT_PATH, 29 | help="Checkpoint name or path, default to %(default)r") 30 | parser.add_argument("--cpu-only", action="store_true", help="Run demo with CPU only") 31 | 32 | parser.add_argument("--share", action="store_true", default=False, 33 | help="Create a publicly shareable link for the interface.") 34 | parser.add_argument("--inbrowser", action="store_true", default=False, 35 | help="Automatically launch the interface in a new tab on the default browser.") 36 | parser.add_argument("--server-port", type=int, default=8000, 37 | help="Demo server port.") 38 | parser.add_argument("--server-name", type=str, default="127.0.0.1", 39 | help="Demo server name.") 40 | 41 | args = parser.parse_args() 42 | return args 43 | 44 | 45 | def _load_model_tokenizer(args): 46 | tokenizer = AutoTokenizer.from_pretrained( 47 | args.checkpoint_path, trust_remote_code=True, resume_download=True, revision='master', 48 | ) 49 | 50 | if args.cpu_only: 51 | device_map = "cpu" 52 | else: 53 | device_map = "cuda" 54 | 55 | model = AutoModelForCausalLM.from_pretrained( 56 | args.checkpoint_path, 57 | device_map=device_map, 58 | trust_remote_code=True, 59 | resume_download=True, 60 | revision='master', 61 | ).eval() 62 | model.generation_config = GenerationConfig.from_pretrained( 63 | args.checkpoint_path, trust_remote_code=True, resume_download=True, revision='master', 64 | ) 65 | 66 | return model, tokenizer 67 | 68 | 69 | def _parse_text(text): 70 | lines = text.split("\n") 71 | lines = [line for line in lines if line != ""] 72 | count = 0 73 | for i, line in enumerate(lines): 74 | if "```" in line: 75 | count += 1 76 | items = line.split("`") 77 | if count % 2 == 1: 78 | lines[i] = f'<pre><code class="language-{items[-1]}">' 79 | else: 80 | lines[i] = f"<br></code></pre>" 81 | else: 82 | if i > 0: 83 | if count % 2 == 1: 84 | line = line.replace("`", r"\`") 85 | line = line.replace("<", "<") 86 | line = line.replace(">", ">") 87 | line = line.replace(" ", " ") 88 | line = line.replace("*", "*") 89 | line = line.replace("_", "_") 90 | line = line.replace("-", "-") 91 | line = line.replace(".", ".") 92 | line = line.replace("!", "!") 93 | line = line.replace("(", "(") 94 | line = line.replace(")", ")") 95 | line = line.replace("$", "$") 96 | lines[i] = "<br>" + line 97 | text = "".join(lines) 98 | return text 99 | 100 | def _remove_image_special(text): 101 | text = text.replace('<ref>', '').replace('</ref>', '') 102 | return re.sub(r'<box>.*?(</box>|$)', '', text) 103 | 104 | def _launch_demo(args, model, tokenizer): 105 | uploaded_file_dir = os.environ.get("GRADIO_TEMP_DIR") or str( 106 | Path(tempfile.gettempdir()) / "gradio" 107 | ) 108 | 109 | def predict(_chatbot, task_history): 110 | chat_query = _chatbot[-1][0] 111 | query = task_history[-1][0] 112 | print("User: " + _parse_text(query)) 113 | history_cp = 
copy.deepcopy(task_history) 114 | full_response = "" 115 | 116 | history_filter = [] 117 | pic_idx = 1 118 | pre = "" 119 | for i, (q, a) in enumerate(history_cp): 120 | if isinstance(q, (tuple, list)): 121 | q = f'Picture {pic_idx}: <img>{q[0]}</img>' 122 | pre += q + '\n' 123 | pic_idx += 1 124 | else: 125 | pre += q 126 | history_filter.append((pre, a)) 127 | pre = "" 128 | history, message = history_filter[:-1], history_filter[-1][0] 129 | # response, history = model.chat(tokenizer, message, history=history) 130 | for response in model.chat_stream(tokenizer, message, history=history): 131 | _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response))) 132 | 133 | yield _chatbot 134 | full_response = _parse_text(response) 135 | 136 | response = full_response 137 | history.append((message, response)) 138 | image = tokenizer.draw_bbox_on_latest_picture(response, history) 139 | if image is not None: 140 | temp_dir = secrets.token_hex(20) 141 | temp_dir = Path(uploaded_file_dir) / temp_dir 142 | temp_dir.mkdir(exist_ok=True, parents=True) 143 | name = f"tmp{secrets.token_hex(5)}.jpg" 144 | filename = temp_dir / name 145 | image.save(str(filename)) 146 | _chatbot.append((None, (str(filename),))) 147 | else: 148 | _chatbot[-1] = (_parse_text(chat_query), response) 149 | # full_response = _parse_text(response) 150 | 151 | task_history[-1] = (query, full_response) 152 | print("Qwen-VL-Chat: " + _parse_text(full_response)) 153 | yield _chatbot 154 | 155 | def regenerate(_chatbot, task_history): 156 | if not task_history: 157 | return _chatbot 158 | item = task_history[-1] 159 | if item[1] is None: 160 | return _chatbot 161 | task_history[-1] = (item[0], None) 162 | chatbot_item = _chatbot.pop(-1) 163 | if chatbot_item[0] is None: 164 | _chatbot[-1] = (_chatbot[-1][0], None) 165 | else: 166 | _chatbot.append((chatbot_item[0], None)) 167 | return predict(_chatbot, task_history) 168 | 169 | def add_text(history, task_history, text): 170 | task_text = text 171 | if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION: 172 | task_text = text[:-1] 173 | history = history + [(_parse_text(text), None)] 174 | task_history = task_history + [(task_text, None)] 175 | return history, task_history, "" 176 | 177 | def add_file(history, task_history, file): 178 | history = history + [((file.name,), None)] 179 | task_history = task_history + [((file.name,), None)] 180 | return history, task_history 181 | 182 | def reset_user_input(): 183 | return gr.update(value="") 184 | 185 | def reset_state(task_history): 186 | task_history.clear() 187 | return [] 188 | 189 | with gr.Blocks() as demo: 190 | gr.Markdown("""\ 191 | <p align="center"><img src="https://modelscope.cn/api/v1/models/qwen/Qwen-7B-Chat/repo? 192 | Revision=master&FilePath=assets/logo.jpeg&View=true" style="height: 80px"/><p>""") 193 | gr.Markdown("""<center><font size=8>Qwen-VL-Chat Bot</center>""") 194 | gr.Markdown( 195 | """\ 196 | <center><font size=3>This WebUI is based on Qwen-VL-Chat, developed by Alibaba Cloud. 
\ 197 | (本WebUI基于Qwen-VL-Chat打造,实现聊天机器人功能。)</center>""") 198 | gr.Markdown("""\ 199 | <center><font size=4>Qwen-VL <a href="https://modelscope.cn/models/qwen/Qwen-VL/summary">🤖 </a> 200 | | <a href="https://huggingface.co/Qwen/Qwen-VL">🤗</a>  | 201 | Qwen-VL-Chat <a href="https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary">🤖 </a> | 202 | <a href="https://huggingface.co/Qwen/Qwen-VL-Chat">🤗</a>  | 203 |  <a href="https://github.com/QwenLM/Qwen-VL">Github</a></center>""") 204 | 205 | chatbot = gr.Chatbot(label='Qwen-VL-Chat', elem_classes="control-height", height=750) 206 | query = gr.Textbox(lines=2, label='Input') 207 | task_history = gr.State([]) 208 | 209 | with gr.Row(): 210 | empty_bin = gr.Button("🧹 Clear History (清除历史)") 211 | submit_btn = gr.Button("🚀 Submit (发送)") 212 | regen_btn = gr.Button("🤔️ Regenerate (重试)") 213 | addfile_btn = gr.UploadButton("📁 Upload (上传文件)", file_types=["image"]) 214 | 215 | submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history]).then( 216 | predict, [chatbot, task_history], [chatbot], show_progress=True 217 | ) 218 | submit_btn.click(reset_user_input, [], [query]) 219 | empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True) 220 | regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True) 221 | addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True) 222 | 223 | gr.Markdown("""\ 224 | <font size=2>Note: This demo is governed by the original license of Qwen-VL. \ 225 | We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, \ 226 | including hate speech, violence, pornography, deception, etc. \ 227 | (注:本演示受Qwen-VL的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,\ 228 | 包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)""") 229 | 230 | demo.queue().launch( 231 | share=args.share, 232 | inbrowser=args.inbrowser, 233 | server_port=args.server_port, 234 | server_name=args.server_name, 235 | ) 236 | 237 | 238 | def main(): 239 | args = _get_args() 240 | 241 | model, tokenizer = _load_model_tokenizer(args) 242 | 243 | _launch_demo(args, model, tokenizer) 244 | 245 | 246 | if __name__ == '__main__': 247 | main() 248 | --------------------------------------------------------------------------------