├── .github └── FUNDING.yml ├── .gitignore ├── CITATION.cff ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── assets ├── images │ ├── logo │ │ ├── opendrivelab.jpeg │ │ └── tubingen.jpeg │ ├── page │ │ ├── front_page.png │ │ ├── front_page_dark.png │ │ ├── galaxy.jpeg │ │ └── video_front_page.png │ ├── repo │ │ ├── drivelm_teaser.jpg │ │ ├── drivelm_timeline.jpg │ │ ├── drivelm_timeline_v2.jpg │ │ ├── drivelm_timeline_v3.jpg │ │ ├── paper_data.jpg │ │ ├── paper_data_comp.png │ │ ├── paper_model_pipeline.jpg │ │ ├── paper_qualitative.jpg │ │ ├── paper_teaser.jpg │ │ ├── point_1.png │ │ ├── point_2.png │ │ ├── point_3.png │ │ ├── title.jpg │ │ └── title_v2.jpg │ └── svg │ │ ├── 404.svg │ │ ├── dialog.svg │ │ ├── divider.svg │ │ ├── faq.svg │ │ ├── featured-light.svg │ │ ├── featured.svg │ │ ├── graphic-1.svg │ │ ├── graphic-2.svg │ │ ├── master-card-1.svg │ │ ├── pattern-lg-light.svg │ │ ├── pattern-lg.svg │ │ ├── pattern.svg │ │ ├── pin-light.svg │ │ ├── pin.svg │ │ ├── quotes.svg │ │ ├── shadow.svg │ │ └── visa-1.svg └── video │ ├── DriveLM.mp4 │ └── graph.mp4 ├── challenge ├── README.md ├── __init__.py ├── convert2llama.py ├── convert_data.py ├── data │ └── train_sample.json ├── evaluation.py ├── extract_data.py ├── gpt_eval.py ├── llama_adapter_v2_multimodal7b │ ├── README.md │ ├── data │ │ ├── dataset.py │ │ └── nuscenes │ │ │ └── samples │ │ │ ├── CAM_BACK │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291002287558.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291005187558.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291010637558.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291013637558.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280036187525.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280039187525.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280042637525.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280045137525.jpg │ │ │ └── n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280048187525.jpg │ │ │ ├── CAM_BACK_LEFT │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291002297405.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291005197405.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291010647405.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291013647405.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280036197423.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280039197423.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280042647423.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280045147423.jpg │ │ │ └── n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280048197442.jpg │ │ │ ├── CAM_BACK_RIGHT │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291002278113.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291005178113.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291010628113.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291013628113.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280036177893.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280039177893.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280042627893.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280045127893.jpg │ │ │ └── n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280048177893.jpg │ │ │ ├── CAM_FRONT │ │ │ ├── 
n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291002262404.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291005162404.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291010612404.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291013612404.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280036162460.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280039162460.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280042612460.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280045112460.jpg │ │ │ └── n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280048162460.jpg │ │ │ ├── CAM_FRONT_LEFT │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291002254799.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291005154799.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291010604799.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291013604799.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280036154844.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280039154844.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280042604844.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280045104844.jpg │ │ │ └── n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280048154844.jpg │ │ │ └── CAM_FRONT_RIGHT │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291002270482.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291005170482.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291010620482.jpg │ │ │ ├── n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291013620482.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280036170339.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280039170339.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280042620339.jpg │ │ │ ├── n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280045120339.jpg │ │ │ └── n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280048170339.jpg │ ├── demo.py │ ├── docs │ │ ├── eval.md │ │ └── train.md │ ├── engine_finetune.py │ ├── engine_pretrain.py │ ├── exps │ │ ├── finetune.sh │ │ └── pretrain.sh │ ├── finetune_data_config.yaml │ ├── gradio_app.py │ ├── llama │ │ ├── __init__.py │ │ ├── llama.py │ │ ├── llama_adapter.py │ │ ├── tokenizer.py │ │ └── utils.py │ ├── main_finetune.py │ ├── main_pretrain.py │ ├── requirements.txt │ └── util │ │ ├── evaluate_mme.py │ │ ├── extract_adapter_from_checkpoint.py │ │ ├── lr_sched.py │ │ └── misc.py ├── output.json ├── prepare_submission.py ├── submission.json ├── test.json ├── test_eval.json └── test_llama.json ├── docs ├── data_details.md ├── data_prep_nus.md └── gvqa.md ├── environment.yml ├── index.html └── sample.html /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [OpenDriveLab] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace 
with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | .DS_Store 162 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - name: "DriveLM Contributors" 5 | title: "Drive on Language" 6 | date-released: 2023-08-25 7 | url: "https://github.com/OpenDriveLab/DriveLM/" 8 | license: Apache-2.0 -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at contact@opendrivelab.com 63 | 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /assets/images/logo/opendrivelab.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/logo/opendrivelab.jpeg -------------------------------------------------------------------------------- /assets/images/logo/tubingen.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/logo/tubingen.jpeg -------------------------------------------------------------------------------- /assets/images/page/front_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/page/front_page.png -------------------------------------------------------------------------------- /assets/images/page/front_page_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/page/front_page_dark.png -------------------------------------------------------------------------------- /assets/images/page/galaxy.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/page/galaxy.jpeg -------------------------------------------------------------------------------- /assets/images/page/video_front_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/page/video_front_page.png -------------------------------------------------------------------------------- /assets/images/repo/drivelm_teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/drivelm_teaser.jpg -------------------------------------------------------------------------------- /assets/images/repo/drivelm_timeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/drivelm_timeline.jpg -------------------------------------------------------------------------------- /assets/images/repo/drivelm_timeline_v2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/drivelm_timeline_v2.jpg -------------------------------------------------------------------------------- /assets/images/repo/drivelm_timeline_v3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/drivelm_timeline_v3.jpg -------------------------------------------------------------------------------- /assets/images/repo/paper_data.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/paper_data.jpg -------------------------------------------------------------------------------- /assets/images/repo/paper_data_comp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/paper_data_comp.png -------------------------------------------------------------------------------- /assets/images/repo/paper_model_pipeline.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/paper_model_pipeline.jpg -------------------------------------------------------------------------------- /assets/images/repo/paper_qualitative.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/paper_qualitative.jpg -------------------------------------------------------------------------------- /assets/images/repo/paper_teaser.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/paper_teaser.jpg -------------------------------------------------------------------------------- /assets/images/repo/point_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/point_1.png -------------------------------------------------------------------------------- /assets/images/repo/point_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/point_2.png -------------------------------------------------------------------------------- /assets/images/repo/point_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/point_3.png -------------------------------------------------------------------------------- /assets/images/repo/title.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/title.jpg -------------------------------------------------------------------------------- /assets/images/repo/title_v2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/images/repo/title_v2.jpg -------------------------------------------------------------------------------- /assets/images/svg/404.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /assets/images/svg/dialog.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /assets/images/svg/divider.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /assets/images/svg/faq.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /assets/images/svg/featured-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /assets/images/svg/featured.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /assets/images/svg/graphic-1.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /assets/images/svg/graphic-2.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /assets/images/svg/master-card-1.svg: -------------------------------------------------------------------------------- 1 | mastercard-color -------------------------------------------------------------------------------- /assets/images/svg/pattern-lg-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /assets/images/svg/pattern-lg.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /assets/images/svg/pattern.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /assets/images/svg/pin-light.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /assets/images/svg/pin.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /assets/images/svg/quotes.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 
-------------------------------------------------------------------------------- /assets/images/svg/shadow.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /assets/images/svg/visa-1.svg: -------------------------------------------------------------------------------- 1 | visa-color -------------------------------------------------------------------------------- /assets/video/DriveLM.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/video/DriveLM.mp4 -------------------------------------------------------------------------------- /assets/video/graph.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/assets/video/graph.mp4 -------------------------------------------------------------------------------- /challenge/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt_eval import GPTEvaluation -------------------------------------------------------------------------------- /challenge/convert2llama.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | 4 | 5 | def convert2llama(root, dst): 6 | with open(root, 'r') as f: 7 | test_file = json.load(f) 8 | 9 | output = [] 10 | for scene_id in test_file.keys(): 11 | scene_data = test_file[scene_id]['key_frames'] 12 | 13 | for frame_id in scene_data.keys(): 14 | image_paths = scene_data[frame_id]['image_paths'] 15 | image_paths = [image_paths[key].replace("..", "data") for key in image_paths.keys()] 16 | 17 | frame_data_qa = scene_data[frame_id]['QA'] 18 | QA_pairs = frame_data_qa["perception"] + frame_data_qa["prediction"] + frame_data_qa["planning"] + frame_data_qa["behavior"] 19 | 20 | for idx, qa in enumerate(QA_pairs): 21 | question = qa['Q'] 22 | answer = qa['A'] 23 | output.append( 24 | { 25 | "id": scene_id + "_" + frame_id + "_" + str(idx), 26 | "image": image_paths, 27 | "conversations": [ 28 | { 29 | "from": "human", 30 | "value": "\n" + question 31 | }, 32 | { 33 | "from": "gpt", 34 | "value": answer 35 | }, 36 | ] 37 | } 38 | ) 39 | 40 | with open(dst, 'w') as f: 41 | json.dump(output, f, indent=4) 42 | 43 | 44 | if __name__ == '__main__': 45 | root = "test_eval.json" 46 | dst = "test_llama.json" 47 | convert2llama(root, dst) 48 | -------------------------------------------------------------------------------- /challenge/convert_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import random 4 | 5 | 6 | def rule_based1(question, answer): 7 | rule = ["Going ahead.", "Turn right.", "Turn left.", "Stopped.", "Back up.", "Reverse parking.", "Drive backward."] 8 | rule.remove(answer) 9 | choices = random.sample(rule, 3) 10 | choices.append(answer) 11 | random.shuffle(choices) 12 | idx = choices.index(answer) 13 | question += f" Please select the correct answer from the following options: A. {choices[0]} B. {choices[1]} C. {choices[2]} D. 
{choices[3]}" 14 | mapping = {0: "A", 1: "B", 2: "C", 3: "D"} 15 | return {"Q": question, "A": mapping[idx]} 16 | 17 | def rule_based2(question, answer): 18 | rule = ['The ego vehicle is slightly steering to the left. The ego vehicle is driving very fast.', 'The ego vehicle is steering to the left. The ego vehicle is driving with normal speed.', 'The ego vehicle is steering to the left. The ego vehicle is driving fast.', 'The ego vehicle is slightly steering to the right. The ego vehicle is driving fast.', 'The ego vehicle is going straight. The ego vehicle is driving slowly.', 'The ego vehicle is going straight. The ego vehicle is driving with normal speed.', 'The ego vehicle is slightly steering to the left. The ego vehicle is driving with normal speed.', 'The ego vehicle is slightly steering to the left. The ego vehicle is driving slowly.', 'The ego vehicle is slightly steering to the right. The ego vehicle is driving slowly.', 'The ego vehicle is slightly steering to the right. The ego vehicle is driving very fast.', 'The ego vehicle is steering to the right. The ego vehicle is driving fast.', 'The ego vehicle is steering to the right. The ego vehicle is driving very fast.', 'The ego vehicle is slightly steering to the left. The ego vehicle is driving fast.', 'The ego vehicle is steering to the left. The ego vehicle is driving very fast.', 'The ego vehicle is going straight. The ego vehicle is not moving.', 'The ego vehicle is slightly steering to the right. The ego vehicle is driving with normal speed.', 'The ego vehicle is steering to the right. The ego vehicle is driving slowly.', 'The ego vehicle is steering to the right. The ego vehicle is driving with normal speed.', 'The ego vehicle is going straight. The ego vehicle is driving very fast.', 'The ego vehicle is going straight. The ego vehicle is driving fast.', 'The ego vehicle is steering to the left. The ego vehicle is driving slowly.'] 19 | rule.remove(answer) 20 | choices = random.sample(rule, 3) 21 | choices.append(answer) 22 | random.shuffle(choices) 23 | idx = choices.index(answer) 24 | question += f" Please select the correct answer from the following options: A. {choices[0]} B. {choices[1]} C. {choices[2]} D. 
{choices[3]}" 25 | mapping = {0: "A", 1: "B", 2: "C", 3: "D"} 26 | return {"Q": question, "A": mapping[idx]} 27 | 28 | 29 | def loop_test(root, dst): 30 | with open(root, 'r') as f: 31 | test_file = json.load(f) 32 | 33 | for scene_id in test_file.keys(): 34 | scene_data = test_file[scene_id]['key_frames'] 35 | 36 | for frame_id in scene_data.keys(): 37 | # frame_data_infos = scene_data[frame_id]['key_object_infos'] 38 | frame_data_qa = scene_data[frame_id]['QA'] 39 | image_paths = scene_data[frame_id]['image_paths'] 40 | 41 | test_file[scene_id]['key_frames'][frame_id] = dict() 42 | # test_file[scene_id]['key_frames'][frame_id]['key_object_infos'] = frame_data_infos 43 | test_file[scene_id]['key_frames'][frame_id]['QA'] = dict() 44 | test_file[scene_id]['key_frames'][frame_id]['QA']['perception'] = [] 45 | # add all prediction and planning 46 | test_file[scene_id]['key_frames'][frame_id]['QA']['prediction'] = frame_data_qa["prediction"] 47 | test_file[scene_id]['key_frames'][frame_id]['QA']['planning'] = frame_data_qa["planning"] 48 | 49 | test_file[scene_id]['key_frames'][frame_id]['QA']['behavior'] = [] 50 | test_file[scene_id]['key_frames'][frame_id]['image_paths'] = image_paths 51 | 52 | for qa in frame_data_qa["perception"]: 53 | question = qa['Q'] 54 | answer = qa['A'] 55 | if "What is the moving status of object".lower() in question.lower(): 56 | qa.update(rule_based1(question, answer)) 57 | test_file[scene_id]['key_frames'][frame_id]['QA']['perception'].append(qa) 58 | else: 59 | test_file[scene_id]['key_frames'][frame_id]['QA']['perception'].append(qa) 60 | 61 | for qa in frame_data_qa["behavior"]: 62 | question = qa['Q'] 63 | answer = qa['A'] 64 | qa.update(rule_based2(question, answer)) 65 | test_file[scene_id]['key_frames'][frame_id]['QA']['behavior'].append(qa) 66 | 67 | with open(dst, 'w') as f: 68 | json.dump(test_file, f, indent=4) 69 | 70 | 71 | 72 | if __name__ == '__main__': 73 | root = "test.json" 74 | dst = "test_eval.json" 75 | loop_test(root, dst) 76 | -------------------------------------------------------------------------------- /challenge/evaluation.py: -------------------------------------------------------------------------------- 1 | import re 2 | import argparse 3 | import json 4 | import numpy as np 5 | import torch.nn as nn 6 | import language_evaluation 7 | from multiprocessing import Pool 8 | 9 | import sys 10 | sys.path.append(".") 11 | from gpt_eval import GPTEvaluation 12 | 13 | 14 | class evaluation_suit(): 15 | def __init__(self): 16 | self.language_eval = language_evaluation.CocoEvaluator(coco_types=["BLEU", "ROUGE_L", "CIDEr"]) 17 | self.chatgpt_eval = GPTEvaluation() 18 | self.GPT = [] 19 | self.accuracy = {"answer": [], "GT": []} 20 | self.language = {"answer": [], "GT": []} 21 | self.match = {"match": {"answer": [], "GT": []}, "GPT": []} 22 | 23 | def eval_acc(self): 24 | scores = [] 25 | for i in range(len(self.accuracy["answer"])): 26 | answer = self.accuracy["answer"][i] 27 | GT = self.accuracy["GT"][i] 28 | if answer == GT: 29 | scores.append(1.0) 30 | else: 31 | scores.append(0.0) 32 | 33 | scores = sum(scores) / len(scores) 34 | return scores 35 | 36 | def eval_chatGPT(self, data): 37 | with Pool(32) as p: # Change the number based on your CPU cores 38 | scores = p.map(self.chatgpt_eval.forward, data) 39 | 40 | scores = list(map(float, scores)) 41 | scores = sum(scores) / len(scores) 42 | return scores 43 | 44 | def eval_language(self): 45 | """ 46 | return the dict evaluation results 47 | """ 48 | answer = self.language["answer"] 49 | GT 
= self.language["GT"] 50 | results_gen = self.language_eval.run_evaluation(answer, GT) 51 | results_gen_dict = { 52 | f"val/{k}": v for k, v in results_gen.items() 53 | } 54 | return results_gen_dict 55 | 56 | def eval_match(self): 57 | outs1 = [] 58 | for i in range(len(self.match["match"]["answer"])): 59 | answer = self.match["match"]["answer"][i] 60 | GT = self.match["match"]["GT"][i] 61 | _, F1_score = self.match_result(answer, GT) 62 | outs1.append(F1_score * 100) 63 | 64 | outs1 = sum(outs1) / len(outs1) 65 | outs2 = self.eval_chatGPT(self.match["GPT"]) 66 | scores = (outs1 + outs2) / 2.0 67 | return scores 68 | 69 | def eval_graph(self, question): 70 | # check if answer in self.graph 71 | question_nums = re.findall(r'\d+\.\d+', question) 72 | question_nums = np.array([list(map(float, x.split()))[0] for x in question_nums]).reshape(-1, 2) 73 | question_nums = [list(i) for i in question_nums] 74 | for q in question_nums: 75 | if q not in self.graph: 76 | return False 77 | return True 78 | 79 | def match_result(self, answer, GT): 80 | """ 81 | answer: [[1.,2.], [2., 3.]] 82 | GT: [[1., 2.], [2., 3.]] 83 | """ 84 | answer_nums = re.findall(r'\d+\.\d+', answer) 85 | GT_nums = re.findall(r'\d+\.\d+', GT) 86 | # transform string into float 87 | if len(answer_nums) % 2 != 0: 88 | answer_nums = answer_nums[:-1] 89 | answer_nums = np.array([list(map(float, x.split()))[0] for x in answer_nums]).reshape(-1, 2) 90 | GT_nums = np.array([list(map(float, x.split()))[0] for x in GT_nums]).reshape(-1, 2) 91 | length = len(GT_nums) 92 | 93 | matched_out = [] 94 | true_positives = 0 95 | false_positives = 0 96 | false_negatives = 0 97 | for pred in answer_nums: 98 | closest_distance = float('inf') 99 | closest_gt = None 100 | closest_id = None 101 | for i, gt in enumerate(GT_nums): 102 | distance = np.sum(np.abs(pred - gt)) 103 | if distance < closest_distance: 104 | closest_distance = distance 105 | closest_gt = gt 106 | closest_id = i 107 | 108 | if closest_distance < 16: 109 | true_positives += 1 110 | matched_out.append(closest_gt) 111 | GT_nums = np.delete(GT_nums, closest_id, axis=0) 112 | else: 113 | false_positives += 1 114 | 115 | false_negatives = length - true_positives 116 | precision = true_positives / (true_positives + false_positives + 1e-8) 117 | recall = true_positives / (true_positives + false_negatives + 1e-8) 118 | F1 = 2 * precision * recall / (precision + recall + 1e-8) 119 | 120 | return matched_out, F1 121 | 122 | def set_graph(self, answer, GT): 123 | self.graph, _ = self.match_result(answer, GT) 124 | self.graph = [list(i) for i in self.graph] 125 | 126 | def forward(self, tag, answer, GT): 127 | if 0 in tag: 128 | self.accuracy["answer"].append(answer) 129 | self.accuracy["GT"].append(GT) 130 | if 1 in tag: 131 | self.GPT.append((answer, GT)) 132 | if 2 in tag: 133 | self.language["GT"].append(GT) 134 | self.language["answer"].append(answer) 135 | if 3 in tag: 136 | self.match["match"]["GT"].append(GT) 137 | self.match["match"]["answer"].append(answer) 138 | self.match["GPT"].append((answer, GT)) 139 | 140 | 141 | def evaluation(self): 142 | print("evaluation start!") 143 | scores = {} 144 | scores["accuracy"] = self.eval_acc() 145 | scores["chatgpt"] = self.eval_chatGPT(self.GPT) 146 | scores["language"] = self.eval_language() 147 | scores["match"] = self.eval_match() 148 | 149 | return scores 150 | 151 | if __name__ == '__main__': 152 | # get args 153 | parser = argparse.ArgumentParser(description='Evaluation') 154 | parser.add_argument('--root_path1', type=str, 
default="./llama-adapter-DriveLM.json", help='path to prediction file') 155 | parser.add_argument('--root_path2', type=str, default="./test_v1.json", help='path to test file') 156 | args = parser.parse_args() 157 | 158 | with open(args.root_path1, 'r') as f :#, \ 159 | pred_file = json.load(f) 160 | pred_file = {pred_file[i]["id"]: pred_file[i] for i in range(len(pred_file))} 161 | 162 | with open(args.root_path2, 'r') as f: 163 | test_file = json.load(f) 164 | 165 | evaluation = evaluation_suit() 166 | for scene_id in test_file.keys(): 167 | scene_data = test_file[scene_id]['key_frames'] 168 | 169 | for frame_id in scene_data.keys(): 170 | frame_data_qa = scene_data[frame_id]['QA'] 171 | first_flag = True 172 | 173 | for i, qa in enumerate(frame_data_qa["perception"] + frame_data_qa["prediction"] + frame_data_qa["planning"] + frame_data_qa["behavior"]): 174 | question = qa['Q'] 175 | GT = qa['A'] 176 | tag = qa['tag'] 177 | idx = scene_id + "_" + frame_id + "_" + str(i) 178 | predict = pred_file[idx]["answer"] 179 | # assert pred_file[idx]["gt_answer"] == GT, print(pred_file[idx]["gt_answer"], GT) 180 | if first_flag: 181 | first_flag = False 182 | evaluation.set_graph(predict, GT) 183 | evaluation.forward(tag, predict, GT) 184 | else: 185 | if evaluation.eval_graph(question): 186 | res = evaluation.forward(tag, predict, GT) 187 | 188 | output = evaluation.evaluation() 189 | print("accuracy score: ", output["accuracy"]) 190 | print("chatgpt score: ", output["chatgpt"]) 191 | print("match score: ", output["match"]) 192 | print("language score: ", output["language"]) 193 | 194 | # Normalize to 0-1 and combine the scores: chatgpt, language, match, accuracy 195 | scores = [] 196 | weights = [0.4, 0.2, 0.2, 0.2] 197 | 198 | # chatGPT 199 | score = output["chatgpt"] / 100. 200 | scores.append(score) 201 | 202 | # language 203 | score = 0 204 | for idx, key in enumerate(output["language"].keys()): 205 | if idx < 4: 206 | score += output["language"][key] / 4. / 3. 207 | elif idx == 4: 208 | score += output["language"][key] / 3. 209 | else: 210 | score += output["language"][key] / 10. / 3. 211 | 212 | scores.append(score) 213 | 214 | # match 215 | score = output["match"] / 100. 
216 | scores.append(score) 217 | 218 | # accuracy 219 | score = output["accuracy"] 220 | scores.append(score) 221 | 222 | final_score = sum([x * y for x, y in zip(scores, weights)]) 223 | print("final score: ", final_score) 224 | 225 | 226 | -------------------------------------------------------------------------------- /challenge/extract_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | 5 | 6 | def extract_data(root_path, save_path): 7 | 8 | with open(root_path, 'r') as f :#, \ 9 | train_file = json.load(f) 10 | 11 | test_data=dict() 12 | 13 | # TODO: convert the data into test data, containing the importance, multiple choice questions, graph questions 14 | for scene_id in train_file.keys(): 15 | scene_data = train_file[scene_id]['key_frames'] 16 | 17 | # for test file 18 | test_data[scene_id] = dict() 19 | test_data[scene_id]['key_frames'] = dict() 20 | 21 | for frame_id in scene_data.keys(): 22 | frame_data_infos = scene_data[frame_id]['key_object_infos'] 23 | frame_data_qa = scene_data[frame_id]['QA'] 24 | image_paths = scene_data[frame_id]['image_paths'] 25 | 26 | # for test file 27 | test_data[scene_id]['key_frames'][frame_id] = dict() 28 | # test_data[scene_id]['key_frames'][frame_id]['key_object_infos'] = frame_data_infos 29 | test_data[scene_id]['key_frames'][frame_id]['QA'] = dict() 30 | test_data[scene_id]['key_frames'][frame_id]['image_paths'] = image_paths 31 | test_data[scene_id]['key_frames'][frame_id]['QA']['perception'] = [] 32 | test_data[scene_id]['key_frames'][frame_id]['QA']['prediction'] = [] 33 | test_data[scene_id]['key_frames'][frame_id]['QA']['planning'] = [] 34 | test_data[scene_id]['key_frames'][frame_id]['QA']['behavior'] = [] 35 | 36 | # get the classes of the important objects 37 | classes = [] 38 | for obj_id in frame_data_infos.keys(): 39 | obj_data = frame_data_infos[obj_id] 40 | classes.append(obj_data['Visual_description'].split('.')[0]) 41 | print(classes) 42 | 43 | # get the location of the important objects 44 | locations = [] 45 | for obj_id in frame_data_infos.keys(): 46 | locations.append(obj_id) 47 | print(locations) 48 | 49 | # get the questions and answers of the perception 50 | perception = frame_data_qa["perception"] 51 | prediction = frame_data_qa["prediction"] 52 | planning = frame_data_qa["planning"] 53 | behavior = frame_data_qa["behavior"] 54 | 55 | for qa in perception: 56 | question = qa['Q'] 57 | answer = qa['A'] 58 | 59 | # according to the classes to select the corresponding question 60 | flag = 1 61 | for cl in classes: 62 | if cl.lower() not in answer.lower(): 63 | flag = 0 64 | if flag == 1: 65 | qa['tag'] = [2] 66 | test_data[scene_id]['key_frames'][frame_id]['QA']['perception'].append(qa) 67 | break 68 | 69 | # get the multiple choice questions and answers 70 | for qa in perception: 71 | question = qa['Q'] 72 | answer = qa['A'] 73 | if "What is the moving status of object".lower() in question.lower(): 74 | qa['tag'] = [0] 75 | test_data[scene_id]['key_frames'][frame_id]['QA']['perception'].append(qa) 76 | break 77 | 78 | # get the graph questions and answers 79 | for qa in prediction: 80 | question = qa['Q'] 81 | answer = qa['A'] 82 | 83 | # according to the location to select the corresponding question 84 | flag = 1 85 | for loc in locations: 86 | if loc.lower() not in answer.lower(): 87 | flag = 0 88 | if flag == 1: 89 | qa['tag'] = [3] 90 | test_data[scene_id]['key_frames'][frame_id]['QA']['prediction'].append(qa) 91 | break 92 | 93 | # 
get the yes or no questions and answers 94 | for qa in prediction: 95 | question = qa['Q'] 96 | answer = qa['A'] 97 | if "yes" in answer.lower() or "no" in answer.lower(): 98 | qa['tag'] = [0] 99 | test_data[scene_id]['key_frames'][frame_id]['QA']['prediction'].append(qa) 100 | break 101 | 102 | # get the three questions from the planning "safe actions", "collision", "" 103 | actions_question_added = False 104 | collision_question_added = False 105 | safe_actions_question_added = False 106 | for qa in planning: 107 | question = qa['Q'] 108 | answer = qa['A'] 109 | if "What actions could the ego vehicle take".lower() in question.lower() and not actions_question_added: 110 | qa['tag'] = [1] 111 | test_data[scene_id]['key_frames'][frame_id]['QA']['planning'].append(qa) 112 | actions_question_added = True 113 | if "lead to a collision" in question.lower() and not collision_question_added: 114 | qa['tag'] = [1] 115 | test_data[scene_id]['key_frames'][frame_id]['QA']['planning'].append(qa) 116 | collision_question_added = True 117 | if "safe actions" in question.lower() and not safe_actions_question_added: 118 | qa['tag'] = [1] 119 | test_data[scene_id]['key_frames'][frame_id]['QA']['planning'].append(qa) 120 | safe_actions_question_added = True 121 | 122 | # Check if all question types have been added and exit the loop 123 | if actions_question_added and collision_question_added and safe_actions_question_added: 124 | break 125 | 126 | for qa in behavior: 127 | question = qa['Q'] 128 | answer = qa['A'] 129 | qa['tag'] = [0] 130 | test_data[scene_id]['key_frames'][frame_id]['QA']['behavior'].append(qa) 131 | 132 | with open(save_path, 'w') as f: 133 | json.dump(test_data, f, indent=4) 134 | 135 | if __name__ == "__main__": 136 | # extract the data from the training json file 137 | root_path = "data/train_sample.json" 138 | save_path = "test.json" 139 | extract_data(root_path, save_path) 140 | 141 | 142 | -------------------------------------------------------------------------------- /challenge/gpt_eval.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pdb 3 | import numpy as np 4 | import torch 5 | import json 6 | import argparse 7 | from multiprocessing import Pool 8 | from openai import OpenAI 9 | 10 | 11 | class GPTEvaluation: 12 | def __init__(self): 13 | self.client = OpenAI(api_key="you need to use your own openai key for evaluation on your local machine") 14 | 15 | def call_chatgpt(self, chatgpt_messages, max_tokens=40, model="gpt-3.5-turbo"): 16 | response = self.client.chat.completions.create( 17 | model=model, messages=chatgpt_messages, temperature=0.6, max_tokens=max_tokens 18 | ) 19 | reply = response.choices[0].message.content 20 | total_tokens = response.usage.total_tokens 21 | return reply, total_tokens 22 | 23 | def prepare_chatgpt_message(self, prompt): 24 | system_message = "an evaluator who rates my answer based on the correct answer" 25 | messages = [{"role": "system", "content": system_message}] 26 | messages.append({"role": "user", "content": "{}".format(prompt)}) 27 | 28 | return messages 29 | 30 | def forward(self, data): 31 | answer, GT = data 32 | prompts = "Rate my answer based on the correct answer out of 100, with higher scores indicating that the answer is closer to the correct answer, and you should be accurate to single digits like 62, 78, 41,etc. 
Output the number only" 33 | prompts = prompts + "This is the correct answer: " + GT + "This is my answer: " + answer 34 | 35 | output = "" 36 | messages = self.prepare_chatgpt_message(prompts) 37 | reply, total_tokens = self.call_chatgpt(messages, max_tokens=3000) 38 | 39 | output += reply 40 | output += "\n\n" 41 | 42 | output = output[:-2] 43 | 44 | return output 45 | 46 | 47 | if __name__ == "__main__": 48 | data = [ 49 | ("The ego vehicle should notice the bus next, as it is the third object in the image. The bus is stopped at the intersection, and the ego vehicle should be cautious when approaching the intersection to ensure it does not collide with the bus.", "Firstly, notice . The object is a traffic sign, so the ego vehicle should continue at the same speed. Secondly, notice . The object is a traffic sign, so the ego vehicle should accelerate and continue ahead. Thirdly, notice . The object is stationary, so the ego vehicle should continue ahead at the same speed."), 50 | # Add more data here 51 | ] 52 | 53 | eval = GPTEvaluation() 54 | 55 | with Pool(5) as p: # Change the number based on your CPU cores 56 | scores = p.map(eval.forward, data) 57 | 58 | print(scores) 59 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/README.md: -------------------------------------------------------------------------------- 1 | # LLaMA-Adapter-V2 Multi-modal 2 | 3 | ## News 4 | * [Oct 11, 2023] Release LLaMA-Adapter V2.1 and evaluation on MME. 5 | * [July 5, 2023] Release pre-traininig and fine-tuning codes. 6 | * [May 26, 2023] Initial release. 7 | 8 | 9 | ## Setup 10 | 11 | * setup up a new conda env and install necessary packages. 12 | ```bash 13 | conda create -n llama_adapter_v2 python=3.8 -y 14 | pip install -r requirements.txt 15 | ``` 16 | 17 | * Obtain the LLaMA backbone weights using [this form](https://forms.gle/jk851eBVbX1m5TAv5). Please note that checkpoints from unofficial sources (e.g., BitTorrent) may contain malicious code and should be used with care. Organize the downloaded file in the following structure 18 | ``` 19 | /path/to/llama_model_weights 20 | ├── 7B 21 | │   ├── checklist.chk 22 | │   ├── consolidated.00.pth 23 | │   └── params.json 24 | └── tokenizer.model 25 | ``` 26 | 27 | ## Inference 28 | 29 | Here is a simple inference script for LLaMA-Adapter V2. The pre-trained model will be downloaded directly from [Github Release](https://github.com/OpenGVLab/LLaMA-Adapter/releases/tag/v.2.0.0). 30 | 31 | ```python 32 | import cv2 33 | import llama 34 | import torch 35 | from PIL import Image 36 | 37 | device = "cuda" if torch.cuda.is_available() else "cpu" 38 | 39 | llama_dir = "/path/to/LLaMA/" 40 | 41 | # choose from BIAS-7B, LORA-BIAS-7B, LORA-BIAS-7B-v21 42 | model, preprocess = llama.load("BIAS-7B", llama_dir, llama_type="7B", device=device) 43 | model.eval() 44 | 45 | prompt = llama.format_prompt("Please introduce this painting.") 46 | img = Image.fromarray(cv2.imread("../docs/logo_v1.png")) 47 | img = preprocess(img).unsqueeze(0).to(device) 48 | 49 | result = model.generate(img, [prompt])[0] 50 | 51 | print(result) 52 | ``` 53 | 54 | The output will look like the following: 55 | ``` 56 | The painting features a cute white lama, or llama, standing on a wooden floor. The llama is holding a variety of tools and accessories, such as a paintbrush, a pencil, a ruler, a pair of scissors, and a paint can. The llama is dressed in a suit, which adds a touch of sophistication to the scene. 
The painting is a creative and whimsical representation of a person or animal holding various tools and accessories, making it an interesting and unique piece of art. 57 | ``` 58 | 59 | ## Evaluation 60 | Check [eval.md](./docs/eval.md) for details. 61 | 62 | ## Online demo 63 | 64 | We provide an online demo at [OpenGVLab](http://llama-adapter.opengvlab.com). 65 | 66 | You can also start it locally with: 67 | ```bash 68 | python gradio_app.py 69 | ``` 70 | 71 | ## Models 72 | 73 | You can check our models by running: 74 | ```python 75 | import llama 76 | print(llama.available_models()) 77 | ``` 78 | 79 | We currently provide `BIAS-7B`, which fine-tunes the `bias` and `norm` parameters of LLaMA, and `LORA-BIAS-7B`, which fine-tunes the `bias`, `norm` and `lora` parameters. We will include more pretrained models in the future, such as the LoRA fine-tuning model `LORA-7B` and the partial-tuning model `PARTIAL-7B`. 80 | 81 | ## Pre-training & Fine-tuning 82 | See [train.md](docs/train.md) for details. 83 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/dataset.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import yaml 3 | from torch.utils.data import Dataset 4 | from PIL import Image 5 | import json 6 | import llama.utils 7 | from llama import Tokenizer 8 | import copy 9 | import torchvision.transforms as transforms 10 | import pandas as pd 11 | import random 12 | import cv2 13 | import re 14 | 15 | try: 16 | from torchvision.transforms import InterpolationMode 17 | BICUBIC = InterpolationMode.BICUBIC 18 | except ImportError: 19 | BICUBIC = Image.BICUBIC 20 | 21 | 22 | PROMPT_DICT = { 23 | "prompt_input": ( 24 | "Below is an instruction that describes a task, paired with an input that provides further context. " 25 | "Write a response that appropriately completes the request.\n\n" 26 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 27 | ), 28 | "prompt_no_input": ( 29 | "Below is an instruction that describes a task. 
" 30 | "Write a response that appropriately completes the request.\n\n" 31 | "### Instruction:\n{instruction}\n\n### Response:" 32 | ), 33 | } 34 | 35 | # create data 36 | transform_train = transforms.Compose([ 37 | transforms.Resize( 38 | (224, 224), interpolation=InterpolationMode.BICUBIC 39 | ), # 3 is bicubic 40 | transforms.ToTensor(), 41 | transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])]) 42 | 43 | class FinetuneDataset(Dataset): 44 | def __init__(self, config_path, transform, max_words=30, tokenizer_path=None): 45 | print(f"read dataset config from {config_path}") 46 | with open(config_path, 'r') as f: 47 | self.config = yaml.load(f, Loader=yaml.FullLoader) 48 | print("DATASET CONFIG:") 49 | print(self.config) 50 | ann = [] 51 | for meta_path in self.config['META']: 52 | meta_l = json.load(open(meta_path)) 53 | print(f"{meta_path}: len {len(meta_l)}") 54 | ann += meta_l 55 | self.ann = ann 56 | print(f"total length: {len(self)}") 57 | self.transform = transform 58 | self.max_words = max_words 59 | self.tokenizer = Tokenizer(model_path=tokenizer_path) 60 | 61 | def __len__(self): 62 | return len(self.ann) 63 | 64 | def __getitem__(self, index): 65 | data_item = self.ann[index] 66 | if 'image' in data_item.keys(): 67 | filename = data_item['image'] 68 | question = data_item['conversations'][0]['value'] 69 | answer = data_item['conversations'][1]['value'] 70 | if isinstance(filename, list): 71 | image_all = [] 72 | for img_path in filename: 73 | image = cv2.imread(img_path) 74 | image = Image.fromarray(image) 75 | image = self.transform(image) 76 | image_all.append(image) 77 | image = torch.stack(image_all) 78 | else: 79 | image = cv2.imread(filename) 80 | image = Image.fromarray(image) 81 | image = self.transform(image) 82 | format_instruction = question 83 | format_input = None 84 | else: 85 | image = torch.zeros(3, 224, 224) 86 | format_instruction = data_item['instruction'], 87 | format_input = data_item['input'] 88 | answer = data_item['output'] 89 | input1 = llama.utils.format_prompt(format_instruction, format_input) 90 | input2 = input1 + answer 91 | input1 = torch.tensor(self.tokenizer.encode(input1, bos=True, eos=False), dtype=torch.int64) 92 | input2 = torch.tensor(self.tokenizer.encode(input2, bos=True, eos=True), dtype=torch.int64) 93 | padding = self.max_words - input2.shape[0] 94 | if padding > 0: 95 | input2 = torch.cat((input2, torch.zeros(padding, dtype=torch.int64) - 1)) 96 | elif padding < 0: 97 | input2 = input2[:self.max_words] 98 | labels = copy.deepcopy(input2) 99 | labels[:len(input1)] = -1 100 | input2_mask = input2.ge(0) 101 | label_mask = labels.ge(0) 102 | input2[~input2_mask] = 0 103 | labels[~label_mask] = 0 104 | input2_mask = input2_mask.float() 105 | label_mask = label_mask.float() 106 | return input2, labels, input2_mask, image 107 | 108 | 109 | class PretrainDataset(Dataset): 110 | def __init__(self, config_path, transform, max_words=30, tokenizer_path=None): 111 | print(f"read dataset config from {config_path}") 112 | with open(config_path, 'r') as f: 113 | self.config = yaml.load(f, Loader=yaml.FullLoader) 114 | print("DATASET CONFIG:") 115 | print(self.config) 116 | images, captions = [], [] 117 | for meta_path in self.config['META']: 118 | images_this_meta, captions_this_meta = [], [] 119 | for chunk in pd.read_csv(meta_path, sep='\t', lineterminator='\n', chunksize=10 ** 6): 120 | images_this_meta.extend(chunk['url'].tolist()) 121 | captions_this_meta.extend(chunk['caption'].tolist()) 122 | 
print(f"{meta_path}: len {len(images_this_meta)}") 123 | images.extend(images_this_meta) 124 | captions.extend(captions_this_meta) 125 | 126 | self.data_list = [] 127 | for x, y in zip(images, captions): 128 | self.data_list.append({'url': x, 'caption': y}) 129 | print(f"total length: {len(self)}") 130 | self.transform = transform 131 | self.max_words = max_words 132 | self.tokenizer = Tokenizer(model_path=tokenizer_path) 133 | 134 | def __len__(self): 135 | return len(self.data_list) 136 | 137 | def __getitem__(self, index): 138 | sample = self.data_list[index] 139 | image_path, caption = sample['url'], sample['caption'] 140 | if isinstance(caption, list): 141 | caption = random.choice(caption) 142 | caption = str(caption) 143 | 144 | image = cv2.imread(image_path) 145 | image = Image.fromarray(image) 146 | image = self.transform(image) 147 | 148 | format_instruction = "Generate caption of this image" 149 | input1 = llama.utils.format_prompt(format_instruction, None) 150 | input2 = input1 + caption 151 | 152 | input1 = torch.tensor(self.tokenizer.encode(input1, bos=True, eos=False), dtype=torch.int64) 153 | input2 = torch.tensor(self.tokenizer.encode(input2, bos=True, eos=True), dtype=torch.int64) 154 | padding = self.max_words - input2.shape[0] 155 | if padding > 0: 156 | input2 = torch.cat((input2, torch.zeros(padding, dtype=torch.int64) - 1)) 157 | elif padding < 0: 158 | input2 = input2[:self.max_words] 159 | labels = copy.deepcopy(input2) 160 | labels[:len(input1)] = -1 161 | input2_mask = input2.ge(0) 162 | label_mask = labels.ge(0) 163 | input2[~input2_mask] = 0 164 | labels[~label_mask] = 0 165 | input2_mask = input2_mask.float() 166 | label_mask = label_mask.float() 167 | return input2, labels, input2_mask, image 168 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291002287558.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291002287558.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291005187558.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291005187558.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291010637558.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291010637558.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291013637558.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n008-2018-09-18-13-10-39-0400__CAM_BACK__1537291013637558.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280036187525.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280036187525.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280039187525.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280039187525.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280042637525.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280042637525.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280045137525.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280045137525.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280048187525.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK/n015-2018-08-03-15-00-36+0800__CAM_BACK__1533280048187525.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291002297405.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291002297405.jpg 
-------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291005197405.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291005197405.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291010647405.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291010647405.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291013647405.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n008-2018-09-18-13-10-39-0400__CAM_BACK_LEFT__1537291013647405.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280036197423.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280036197423.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280039197423.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280039197423.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280042647423.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280042647423.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280045147423.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280045147423.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280048197442.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-03-15-00-36+0800__CAM_BACK_LEFT__1533280048197442.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291002278113.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291002278113.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291005178113.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291005178113.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291010628113.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291010628113.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291013628113.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_BACK_RIGHT__1537291013628113.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280036177893.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280036177893.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280039177893.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280039177893.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280042627893.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280042627893.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280045127893.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280045127893.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280048177893.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_BACK_RIGHT__1533280048177893.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291002262404.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291002262404.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291005162404.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291005162404.jpg 
-------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291010612404.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291010612404.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291013612404.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n008-2018-09-18-13-10-39-0400__CAM_FRONT__1537291013612404.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280036162460.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280036162460.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280039162460.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280039162460.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280042612460.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280042612460.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280045112460.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280045112460.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280048162460.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT/n015-2018-08-03-15-00-36+0800__CAM_FRONT__1533280048162460.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291002254799.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291002254799.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291005154799.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291005154799.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291010604799.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291010604799.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291013604799.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_LEFT__1537291013604799.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280036154844.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280036154844.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280039154844.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280039154844.jpg 
-------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280042604844.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280042604844.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280045104844.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280045104844.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280048154844.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_LEFT__1533280048154844.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291002270482.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291002270482.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291005170482.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291005170482.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291010620482.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291010620482.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291013620482.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-09-18-13-10-39-0400__CAM_FRONT_RIGHT__1537291013620482.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280036170339.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280036170339.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280039170339.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280039170339.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280042620339.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280042620339.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280045120339.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280045120339.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280048170339.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenDriveLab/DriveLM/0e8dff72eaf13c42b508289a34ea758782089e8c/challenge/llama_adapter_v2_multimodal7b/data/nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-03-15-00-36+0800__CAM_FRONT_RIGHT__1533280048170339.jpg -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/demo.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import llama 3 | import torch 4 | from PIL import Image 5 | from tqdm import tqdm 6 | import json 7 | import argparse 8 | import torchvision.transforms as transforms 9 | from torch.utils.data import Dataset, DataLoader 10 | from 
threading import Thread 11 | import math 12 | 13 | try: 14 | from torchvision.transforms import InterpolationMode 15 | BICUBIC = InterpolationMode.BICUBIC 16 | except ImportError: 17 | BICUBIC = Image.BICUBIC 18 | 19 | class LLamaDataset(Dataset): 20 | def __init__(self, data, transform=None): 21 | self.data = data 22 | self.transform = transform 23 | 24 | def __len__(self): 25 | return len(self.data) 26 | 27 | def __getitem__(self, idx): 28 | data_item = self.data[idx] 29 | filename = data_item['image'] 30 | ids = data_item['id'] 31 | question = data_item['conversations'][0]['value'] 32 | answer = data_item['conversations'][1]['value'] 33 | 34 | prompt = llama.format_prompt(question) 35 | 36 | if isinstance(filename, list): 37 | image_all = [] 38 | for img_path in filename: 39 | image = cv2.imread(img_path) 40 | image = Image.fromarray(image) 41 | if self.transform: 42 | image = self.transform(image) 43 | image_all.append(image) 44 | image = torch.stack(image_all, dim=0) 45 | else: 46 | image = cv2.imread(filename) 47 | image = Image.fromarray(image) 48 | if self.transform: 49 | image = self.transform(image) 50 | 51 | return image, prompt, ids, question, answer 52 | 53 | def worker(rank, gpu_id, args, data_dict): 54 | torch.cuda.set_device(gpu_id) 55 | device = torch.device("cuda") 56 | llama_dir = args.llama_dir 57 | 58 | model, preprocess = llama.load(args.checkpoint, llama_dir, llama_type="7B", device=device) 59 | model.eval() 60 | 61 | transform_train = transforms.Compose([ 62 | transforms.Resize((224, 224), interpolation=BICUBIC), 63 | transforms.ToTensor(), 64 | transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])]) 65 | 66 | with open(args.data, 'r') as f: 67 | data_all = json.load(f) 68 | 69 | num_processes = args.num_processes 70 | data_per_process = math.ceil(len(data_all) / num_processes) 71 | start_idx = rank * data_per_process 72 | end_idx = min((rank + 1) * data_per_process, len(data_all)) 73 | data_to_process = data_all[start_idx:end_idx] 74 | 75 | dataset = LLamaDataset(data_to_process, transform=transform_train) 76 | dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=8) 77 | 78 | for batch in tqdm(dataloader): 79 | images, prompts, ids, questions, gt_answers = batch 80 | images = images.to(device) 81 | results = model.generate(images, prompts, temperature=0.2, top_p=0.1) 82 | 83 | for i, result in enumerate(results): 84 | print(f"Thread {rank}: Result - {result}") 85 | data_dict.append({'id': ids[i], 'question': questions[i], 'answer': result}) 86 | 87 | print(f"Thread {rank} finished") 88 | 89 | # add args 90 | parser = argparse.ArgumentParser(description='LLAMA Adapter') 91 | parser.add_argument('--llama_dir', type=str, default="/path/to/llama_model_weights", help='path to llama model weights') 92 | parser.add_argument('--checkpoint', type=str, default="/path/to/pre-trained/checkpoint.pth", help='path to pre-trained checkpoint') 93 | parser.add_argument('--data', type=str, default="../test_llama.json", help='path to test data') 94 | parser.add_argument('--output', type=str, default="../output.json", help='path to output file') 95 | parser.add_argument('--batch_size', type=int, default=8, help='batch size for parallel processing') 96 | parser.add_argument('--num_processes', type=int, default=8, help='number of gpus to use') 97 | args = parser.parse_args() 98 | 99 | if __name__ == '__main__': 100 | num_gpus = args.num_processes 101 | print(f"Using {num_gpus} GPUs") 102 | 103 | 
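# Each thread started below runs worker(), which pins itself to one GPU and
# processes an equal slice of the test set: with N samples and P workers,
# worker r handles indices [r * ceil(N / P), min((r + 1) * ceil(N / P), N)).
# For example (illustrative numbers), with 803 samples and 8 workers, workers
# 0-6 each get 101 samples and worker 7 gets the remaining 96.
# Appending results to the shared `data_dict` list from several threads is
# effectively safe in CPython, since list.append is performed under the GIL.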
data_dict = [] 104 | threads = [] 105 | for rank in range(num_gpus): 106 | t = Thread(target=worker, args=(rank, rank, args, data_dict)) 107 | t.start() 108 | threads.append(t) 109 | 110 | for t in threads: 111 | t.join() 112 | 113 | with open(args.output, "w") as f: 114 | json.dump(data_dict, f, indent=4) 115 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/docs/eval.md: -------------------------------------------------------------------------------- 1 | # Evaluation on MME Benchmark 2 | 3 | [MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) is a comprehensive evaluation benchmark for multimodal large language models. It measures both perception and cognition abilities on a total of 14 subtasks, including existence, count, position, color, poster, celebrity, scene, landmark, artwork, OCR, commonsense reasoning, numerical calculation, text translation, and code reasoning. 4 | 5 | ## Setup & Evaluation 6 | 7 | 1. Download MME datasets and `eval_tool` from the [MME repo](https://github.com/bradyfu/awesome-multimodal-large-language-models#our-mllm-works), and put them under `MME_Benchmark_release_version`. Now the folder structure will be: 8 | ``` 9 | MME_Benchmark_release_version 10 | ├── artwork 11 | ├── celebrity 12 | ├── code_reasoning 13 | ├── color 14 | ├── commonsense_reasoning 15 | ├── count 16 | ├── eval_tool 17 | │ ├── calculation.py 18 | │ ├── LaVIN 19 | │ └── Your_Results 20 | ├── existence 21 | ├── landmark 22 | ├── numerical_calculation 23 | ├── OCR 24 | ├── position 25 | ├── posters 26 | ├── scene 27 | └── text_translation 28 | ``` 29 | 2. Generate MME results using: `python util/evaluate_mme.py --pretrained_path [MODEL_PATH] --llama_path [LLAMA_DIR] --output_path [RESULT_FILE_PATH]` 30 | 3. Evaluate LLaMA-Adapter V2.1 with MME's eval_tool: `python MME_Benchmark_release_version/eval_tool/calculation.py --results_dir [RESULT_FILE_PATH]` 31 | 32 | ## Results 33 | 34 | > For comparisons with other works, please check [MME Leaderboard](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation). 35 | 36 | * **LLaMA-Adapter V2.1** 37 | 38 | ``` 39 | =========== Perception =========== 40 | total score: 1326.0875953396435 41 | 42 | existence score: 185.0 43 | count score: 133.33333333333331 44 | position score: 56.666666666666664 45 | color score: 118.33333333333334 46 | posters score: 147.9591836734694 47 | celebrity score: 134.70588235294116 48 | scene score: 156.25 49 | landmark score: 167.8391959798995 50 | artwork score: 123.5 51 | OCR score: 102.5 52 | 53 | 54 | =========== Cognition =========== 55 | total score: 356.42857142857144 56 | 57 | commonsense_reasoning score: 106.42857142857144 58 | numerical_calculation score: 47.5 59 | text_translation score: 112.5 60 | code_reasoning score: 90.0 61 | 62 | ``` 63 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/docs/train.md: -------------------------------------------------------------------------------- 1 | The training process of LLaMA-Adapter V2 consists of the pre-training and fine-tuning phases. 2 | 3 | ## Pre-training 4 | 5 | ### Data 6 | 7 | * We use multiple datasets with **image-text pairs** for pre-training. The texts are English-only. 
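As a hedged illustration (not part of the original instructions), a meta file in the layout described in the next bullet can be assembled with pandas; the `url`/`caption` column names and the tab separator are assumptions taken from the provided `PretrainDataset`, which reads the file back with `pd.read_csv(..., sep='\t')`:

```python
import pandas as pd

# Hypothetical image/caption pairs -- replace with your own data source.
records = [
    {"url": "/data/cc3m/images/000001.jpg", "caption": "a dog running on the beach"},
    {"url": "/data/cc3m/images/000002.jpg", "caption": "a bowl of fruit on a wooden table"},
]

# PretrainDataset reads the meta file with sep='\t', so write it tab-separated.
pd.DataFrame(records).to_csv("/path/to/cc3m.csv", sep="\t", index=False)
```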
8 | * For each dataset, the meta file should be organized in the `.csv` format as following: 9 | 10 | ``` 11 | url caption 12 | /path/to/image1 caption1 13 | /path/to/image2 caption2 14 | ... 15 | ``` 16 | 17 | Alternatively, you may modify the [`PretrainDataset`](/data/dataset.py) implementation to adapt to your own meta file format. 18 | * Write a `.yaml` config file to specify the datasets for pre-training: 19 | 20 | ``` 21 | META: 22 | - '/path/to/cc3m.csv' 23 | - '/path/to/coco.csv' 24 | ... 25 | ``` 26 | 27 | ### Start pre-training 28 | 29 | We are now ready to start pre-training (please make sure that the original LLaMA weights are available in `/path/to/llama_model_weights`). 30 | 31 | ```bash 32 | . exps/pretrain.sh /path/to/llama_model_weights /path/to/pretrain-data-config.yaml /output/path 33 | ``` 34 | 35 | ## Fine-tuning 36 | 37 | ### Data 38 | 39 | * We fine-tune LLaMA-Adapter V2 on text-only as well as image-text instruction following datasets. 40 | * The following lists the datasets we use for training our release weights: 41 | 42 | | Name | Link | 43 | | ------------------------ | ------------------------------------------------------------------------------------------------------------ | 44 | | alpaca_gpt4_data.json | [File Link](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data.json) | 45 | | alpaca_gpt4_data_zh.json | [File Link](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/blob/main/data/alpaca_gpt4_data_zh.json) | 46 | | llava_instruct_150k.json | [File Link](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/raw/main/llava_instruct_150k.json) | 47 | | alpaca_data_zh_51k.json | [File Link](https://github.com/ymcui/Chinese-LLaMA-Alpaca/blob/main/data/alpaca_data_zh_51k.json) | 48 | * Similar to pre-training, write a `.yaml` config file to specify the datasets for fine-tuning: 49 | 50 | ``` 51 | META: 52 | - '/path/to/alpaca_gpt4_data.json' 53 | - '/path/to/alpaca_gpt4_data_zh.json' 54 | ... 55 | ``` 56 | 57 | ### Start fine-tuning 58 | 59 | ```bash 60 | . 
exps/finetune.sh \ 61 | /path/to/llama_model_weights /path/to/pre-trained/checkpoint.pth \ 62 | /path/to/finetune-data-config.yaml /output/path 63 | ``` 64 | 65 | ### Test and Save 66 | 67 | ```python 68 | import os 69 | from llama.llama_adapter import LLaMA_adapter 70 | import util.misc as misc 71 | import util.extract_adapter_from_checkpoint as extract 72 | from PIL import Image 73 | import cv2 74 | import torch 75 | import llama 76 | 77 | device = "cuda" if torch.cuda.is_available() else "cpu" 78 | 79 | llama_dir = "path/to/llama/" 80 | llama_type = '7B' 81 | llama_ckpt_dir = os.path.join(llama_dir, llama_type) 82 | llama_tokenzier_path = os.path.join(llama_dir, 'tokenizer.model') 83 | model = LLaMA_adapter(llama_ckpt_dir, llama_tokenzier_path) 84 | 85 | misc.load_model(model, 'path/to/finetune/checkpoint.pth') 86 | model.eval() 87 | model.to(device) 88 | 89 | prompt = llama.format_prompt('your prompt') 90 | img = Image.fromarray(cv2.imread("your image")) 91 | img = model.clip_transform(img).unsqueeze(0).to(device) 92 | 93 | result = model.generate(img, [prompt])[0] 94 | print(result) 95 | 96 | extract.save(model,'path/to/adapter-7B.pth','BIAS') # Please end it with -llama_type.pth 97 | ``` 98 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/engine_finetune.py: -------------------------------------------------------------------------------- 1 | import math 2 | import sys 3 | from typing import Iterable 4 | 5 | import torch 6 | 7 | import util.misc as misc 8 | import util.lr_sched as lr_sched 9 | 10 | from llama import LLaMA_adapter 11 | 12 | def train_one_epoch(model: LLaMA_adapter, 13 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 14 | device: torch.device, epoch: int, loss_scaler, 15 | log_writer=None, 16 | args=None): 17 | model.train(True) 18 | # model.module.set_default_trainability() 19 | 20 | metric_logger = misc.MetricLogger(delimiter=" ") 21 | metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) 22 | header = 'Epoch: [{}]'.format(epoch) 23 | print_freq = 10 24 | 25 | accum_iter = args.accum_iter 26 | 27 | optimizer.zero_grad() 28 | 29 | if log_writer is not None: 30 | print('log_dir: {}'.format(log_writer.log_dir)) 31 | for data_iter_step, (examples, labels, example_mask, imgs) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 32 | # we use a per iteration (instead of per epoch) lr scheduler 33 | if data_iter_step % accum_iter == 0: 34 | lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) 35 | 36 | imgs = imgs.to(device, non_blocking=True) 37 | with torch.cuda.amp.autocast(): 38 | c_loss, m_loss = model(examples, labels, imgs) 39 | loss = c_loss + m_loss * 0 40 | loss_value = loss.item() 41 | c_loss_value = c_loss.item() 42 | m_loss_value = m_loss 43 | if not math.isfinite(loss_value): 44 | print("Loss is {}, stopping training".format(loss_value)) 45 | sys.exit(1) 46 | 47 | loss /= accum_iter 48 | loss_scaler(loss, optimizer, parameters=model.parameters(), 49 | update_grad=(data_iter_step + 1) % accum_iter == 0) 50 | if (data_iter_step + 1) % accum_iter == 0: 51 | optimizer.zero_grad() 52 | 53 | torch.cuda.synchronize() 54 | 55 | metric_logger.update(closs=c_loss_value) 56 | metric_logger.update(mloss=m_loss_value) 57 | 58 | lr = optimizer.param_groups[0]["lr"] 59 | metric_logger.update(lr=lr) 60 | 61 | loss_value_reduce = misc.all_reduce_mean(loss_value) 62 | c_loss_value_reduce = 
misc.all_reduce_mean(c_loss_value) 63 | m_loss_value_reduce = misc.all_reduce_mean(m_loss_value) 64 | if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: 65 | """ We use epoch_1000x as the x-axis in tensorboard. 66 | This calibrates different curves when batch size changes. 67 | """ 68 | epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) 69 | log_writer.add_scalar('c_train_loss', c_loss_value_reduce, epoch_1000x) 70 | log_writer.add_scalar('m_train_loss', m_loss_value_reduce, epoch_1000x) 71 | log_writer.add_scalar('lr', lr, epoch_1000x) 72 | 73 | 74 | # gather the stats from all processes 75 | metric_logger.synchronize_between_processes() 76 | print("Averaged stats:", metric_logger) 77 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 78 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/engine_pretrain.py: -------------------------------------------------------------------------------- 1 | import math 2 | import sys 3 | from typing import Iterable 4 | 5 | import torch 6 | 7 | import util.misc as misc 8 | import util.lr_sched as lr_sched 9 | 10 | from llama import LLaMA_adapter 11 | 12 | def train_one_epoch(model: LLaMA_adapter, 13 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 14 | device: torch.device, epoch: int, loss_scaler, 15 | log_writer=None, 16 | args=None): 17 | model.train(True) 18 | # model.module.set_default_trainability() 19 | 20 | metric_logger = misc.MetricLogger(delimiter=" ") 21 | metric_logger.add_meter('lr', misc.SmoothedValue(window_size=1, fmt='{value:.6f}')) 22 | header = 'Epoch: [{}]'.format(epoch) 23 | print_freq = 10 24 | 25 | accum_iter = args.accum_iter 26 | 27 | optimizer.zero_grad() 28 | 29 | if log_writer is not None: 30 | print('log_dir: {}'.format(log_writer.log_dir)) 31 | for data_iter_step, (examples, labels, example_mask, imgs) in enumerate(metric_logger.log_every(data_loader, print_freq, header)): 32 | # we use a per iteration (instead of per epoch) lr scheduler 33 | if data_iter_step % accum_iter == 0: 34 | lr_sched.adjust_learning_rate(optimizer, data_iter_step / len(data_loader) + epoch, args) 35 | 36 | imgs = imgs.to(device, non_blocking=True) 37 | with torch.cuda.amp.autocast(): 38 | c_loss, m_loss = model(examples, labels, imgs) 39 | loss = c_loss + m_loss * 0 40 | loss_value = loss.item() 41 | c_loss_value = c_loss.item() 42 | m_loss_value = m_loss 43 | if not math.isfinite(loss_value): 44 | print("Loss is {}, stopping training".format(loss_value)) 45 | sys.exit(1) 46 | 47 | loss /= accum_iter 48 | loss_scaler(loss, optimizer, parameters=model.parameters(), 49 | update_grad=(data_iter_step + 1) % accum_iter == 0) 50 | if (data_iter_step + 1) % accum_iter == 0: 51 | optimizer.zero_grad() 52 | 53 | torch.cuda.synchronize() 54 | 55 | metric_logger.update(closs=c_loss_value) 56 | metric_logger.update(mloss=m_loss_value) 57 | 58 | lr = optimizer.param_groups[0]["lr"] 59 | metric_logger.update(lr=lr) 60 | 61 | loss_value_reduce = misc.all_reduce_mean(loss_value) 62 | c_loss_value_reduce = misc.all_reduce_mean(c_loss_value) 63 | m_loss_value_reduce = misc.all_reduce_mean(m_loss_value) 64 | if log_writer is not None and (data_iter_step + 1) % accum_iter == 0: 65 | """ We use epoch_1000x as the x-axis in tensorboard. 66 | This calibrates different curves when batch size changes. 
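For example, with 1000 iterations per epoch, iteration 250 of epoch 3 is logged at x = int((250 / 1000 + 3) * 1000) = 3250, so every epoch spans exactly 1000 ticks on the x-axis regardless of the effective batch size.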
67 | """ 68 | epoch_1000x = int((data_iter_step / len(data_loader) + epoch) * 1000) 69 | log_writer.add_scalar('c_train_loss', c_loss_value_reduce, epoch_1000x) 70 | log_writer.add_scalar('m_train_loss', m_loss_value_reduce, epoch_1000x) 71 | log_writer.add_scalar('lr', lr, epoch_1000x) 72 | 73 | 74 | # gather the stats from all processes 75 | metric_logger.synchronize_between_processes() 76 | print("Averaged stats:", metric_logger) 77 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 78 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/exps/finetune.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | LLAMA_PATH="$1" 4 | PRETRAINED_PATH="$2" # path to pre-trained checkpoint 5 | CONFIG="$3" 6 | OUTPUT_DIR="$4" 7 | 8 | mkdir -p $OUTPUT_DIR 9 | 10 | python -u -m torch.distributed.launch --master_port=1112 --nproc_per_node=8 --use_env \ 11 | main_finetune.py --data_config "$CONFIG" --batch_size 4 \ 12 | --epochs 4 --warmup_epochs 1 --blr 10e-4 --weight_decay 0.02 \ 13 | --llama_path "$LLAMA_PATH" \ 14 | --output_dir "$OUTPUT_DIR" \ 15 | --pretrained_path "$PRETRAINED_PATH" \ 16 | &>> "$OUTPUT_DIR"/output.log & -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/exps/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/bash 2 | 3 | LLAMA_PATH="$1" 4 | CONFIG="$2" 5 | OUTPUT_DIR="$3" 6 | 7 | mkdir -p "$OUTPUT_DIR" 8 | 9 | python -u -m torch.distributed.launch --master_port=1112 --nproc_per_node=8 --use_env \ 10 | main_pretrain.py --data_config "$CONFIG" --batch_size 4 \ 11 | --epochs 150 --split_epoch 50 --warmup_epochs 5 --blr 1.0e-4 --weight_decay 0.05 \ 12 | --llama_path "$LLAMA_PATH" \ 13 | --output_dir "$OUTPUT_DIR" \ 14 | &>> "$OUTPUT_DIR"/output.log & -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/finetune_data_config.yaml: -------------------------------------------------------------------------------- 1 | META: 2 | - 'test_llama.json' 3 | 4 | 5 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/gradio_app.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gradio as gr 3 | import torch 4 | from PIL import Image 5 | 6 | import llama 7 | 8 | 9 | device = "cuda" if torch.cuda.is_available() else "cpu" 10 | 11 | llama_dir = "/path/to/LLaMA/" 12 | 13 | model, preprocess = llama.load("BIAS-7B", llama_dir, device) 14 | model.half() 15 | model.eval() 16 | 17 | def multi_modal_generate( 18 | img_path: str, 19 | prompt: str, 20 | max_gen_len=256, 21 | temperature: float = 0.1, 22 | top_p: float = 0.75, 23 | ): 24 | try: 25 | img = Image.fromarray(cv2.imread(img_path)) 26 | except: 27 | return "" 28 | 29 | img = preprocess(img).unsqueeze(0).half().to(device) 30 | prompt = llama.format_prompt(prompt) 31 | 32 | result = model.generate(img, [prompt], 33 | max_gen_len=max_gen_len, 34 | temperature=temperature, 35 | top_p=top_p) 36 | print(result[0]) 37 | return result[0] 38 | 39 | 40 | def create_multi_modal_demo(): 41 | with gr.Blocks() as instruct_demo: 42 | with gr.Row(): 43 | with gr.Column(): 44 | img = gr.Image(label='Input', type='filepath') 45 | question = gr.Textbox(lines=2, label="Prompt") 46 | max_len = 
gr.Slider(minimum=1, maximum=512, 47 | value=256, label="Max length") 48 | with gr.Accordion(label='Advanced options', open=False): 49 | temp = gr.Slider(minimum=0, maximum=1, 50 | value=0.1, label="Temperature") 51 | top_p = gr.Slider(minimum=0, maximum=1, 52 | value=0.75, label="Top p") 53 | 54 | run_botton = gr.Button("Run") 55 | 56 | with gr.Column(): 57 | outputs = gr.Textbox(lines=10, label="Output") 58 | 59 | inputs = [img, question, max_len, temp, top_p] 60 | 61 | examples = [ 62 | ["../docs/logo_v1.png", "Please introduce this painting.", 256, 0.1, 0.75], 63 | ] 64 | 65 | gr.Examples( 66 | examples=examples, 67 | inputs=inputs, 68 | outputs=outputs, 69 | fn=multi_modal_generate, 70 | cache_examples=False 71 | ) 72 | run_botton.click(fn=multi_modal_generate, 73 | inputs=inputs, outputs=outputs) 74 | return instruct_demo 75 | 76 | 77 | description = """ 78 | # LLaMA-Adapter V2🚀 79 | The official demo for **LLaMA-Adapter V2: Parameter-Efficient Visual Instruction Model**. 80 | 81 | Please refer to our [arXiv paper](https://arxiv.org/abs/2304.15010) and [github](https://github.com/ZrrSkywalker/LLaMA-Adapter) for more details. 82 | 83 | The demo for **LLaMA-Adapter V1** is available at: [Huggingface Spaces](https://huggingface.co/spaces/csuhan/LLaMA-Adapter). 84 | """ 85 | 86 | with gr.Blocks(css="h1,p {text-align: center;}") as demo: 87 | gr.Markdown(description) 88 | with gr.TabItem("Multi-Modal Interaction"): 89 | create_multi_modal_demo() 90 | 91 | demo.queue(api_open=True, concurrency_count=1).launch(share=True) 92 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/llama/__init__.py: -------------------------------------------------------------------------------- 1 | from .llama import ModelArgs, Transformer 2 | from .tokenizer import Tokenizer 3 | from .llama_adapter import * 4 | from .utils import format_prompt -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/llama/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the GNU General Public License version 3. 
3 | 4 | from sentencepiece import SentencePieceProcessor 5 | from logging import getLogger 6 | from typing import List 7 | import os 8 | 9 | 10 | logger = getLogger() 11 | 12 | 13 | class Tokenizer: 14 | def __init__(self, model_path: str): 15 | # reload tokenizer 16 | assert os.path.isfile(model_path), model_path 17 | self.sp_model = SentencePieceProcessor(model_file=model_path) 18 | logger.info(f"Reloaded SentencePiece model from {model_path}") 19 | 20 | # BOS / EOS token IDs 21 | self.n_words: int = self.sp_model.vocab_size() 22 | self.bos_id: int = self.sp_model.bos_id() 23 | self.eos_id: int = self.sp_model.eos_id() 24 | self.pad_id: int = self.sp_model.pad_id() 25 | logger.info( 26 | f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}" 27 | ) 28 | assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() 29 | 30 | def encode(self, s: str, bos: bool, eos: bool) -> List[int]: 31 | assert type(s) is str 32 | t = self.sp_model.encode(s) 33 | if bos: 34 | t = [self.bos_id] + t 35 | if eos: 36 | t = t + [self.eos_id] 37 | return t 38 | 39 | def decode(self, t: List[int]) -> str: 40 | return self.sp_model.decode(t) 41 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/llama/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import urllib 3 | import hashlib 4 | import warnings 5 | 6 | from tqdm import tqdm 7 | import torch 8 | 9 | 10 | def sample_top_p(probs, p): 11 | probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) 12 | probs_sum = torch.cumsum(probs_sort, dim=-1) 13 | mask = probs_sum - probs_sort > p 14 | probs_sort[mask] = 0.0 15 | probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) 16 | next_token = torch.multinomial(probs_sort, num_samples=1) 17 | next_token = torch.gather(probs_idx, -1, next_token) 18 | return next_token 19 | 20 | 21 | def format_prompt(instruction, input=None): 22 | 23 | PROMPT_DICT = { 24 | "prompt_input": ( 25 | "Below is an instruction that describes a task, paired with an input that provides further context. " 26 | "Write a response that appropriately completes the request.\n\n" 27 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 28 | ), 29 | "prompt_no_input": ( 30 | "Below is an instruction that describes a task. 
" 31 | "Write a response that appropriately completes the request.\n\n" 32 | "### Instruction:\n{instruction}\n\n### Response:" 33 | ), 34 | } 35 | if input is None: 36 | return PROMPT_DICT['prompt_no_input'].format_map({'instruction': instruction}) 37 | else: 38 | return PROMPT_DICT["prompt_input"].format_map({'instruction': instruction, 'input': input}) 39 | 40 | 41 | def _download(url: str, root: str): 42 | os.makedirs(root, exist_ok=True) 43 | filename = os.path.basename(url) 44 | # assume the url is https://some/path/sha256_model.pth 45 | expected_sha256 = url.split("/")[-1].split('_')[0] 46 | # expected_sha256 = url.split("/")[-2] 47 | download_target = os.path.join(root, filename) 48 | 49 | if os.path.exists(download_target) and not os.path.isfile(download_target): 50 | raise RuntimeError(f"{download_target} exists and is not a regular file") 51 | 52 | if os.path.isfile(download_target): 53 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: 54 | return download_target 55 | else: 56 | warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") 57 | 58 | with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: 59 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop: 60 | while True: 61 | buffer = source.read(8192) 62 | if not buffer: 63 | break 64 | 65 | output.write(buffer) 66 | loop.update(len(buffer)) 67 | 68 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: 69 | raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match") 70 | 71 | return download_target 72 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/main_finetune.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.backends.cudnn as cudnn 3 | from torch.utils.tensorboard import SummaryWriter 4 | from torch.utils.data import Dataset 5 | 6 | import util.misc as misc 7 | from util.misc import NativeScalerWithGradNormCount as NativeScaler 8 | from llama.llama_adapter import LLaMA_adapter 9 | 10 | from data.dataset import FinetuneDataset, transform_train 11 | 12 | import argparse 13 | import datetime 14 | import json 15 | import numpy as np 16 | import os 17 | import time 18 | from pathlib import Path 19 | 20 | from engine_finetune import train_one_epoch 21 | 22 | 23 | def get_args_parser(): 24 | parser = argparse.ArgumentParser('llama_adapterV2 pre-training', add_help=False) 25 | parser.add_argument('--batch_size', default=64, type=int, 26 | help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') 27 | parser.add_argument('--epochs', default=400, type=int) 28 | parser.add_argument('--accum_iter', default=1, type=int, 29 | help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') 30 | 31 | # Model parameters 32 | parser.add_argument('--llama_type', default='7B', type=str, 33 | help='Type of LLaMA model') # 34 | parser.add_argument('--llama_path', default='/path/to/llama', type=str, 35 | help='path to LLaMA pretrained checkpoint') 36 | parser.add_argument('--pretrained_path', default='/path/to/pretrained', type=str, 37 | help='path to checkpoint from pretrain stage') 38 | parser.add_argument('--max_words', default=512, type=int, 39 | help='max number of input words') 40 | 
41 | # Optimizer parameters 42 | parser.add_argument('--weight_decay', type=float, default=0.05, 43 | help='weight decay (default: 0.05)') 44 | 45 | parser.add_argument('--lr', type=float, default=None, metavar='LR', 46 | help='learning rate (absolute lr)') 47 | parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', 48 | help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') 49 | parser.add_argument('--min_lr', type=float, default=0., metavar='LR', 50 | help='lower lr bound for cyclic schedulers that hit 0') 51 | 52 | parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', 53 | help='epochs to warmup LR') 54 | 55 | # Dataset parameters 56 | parser.add_argument('--data_config', default='configs/data/finetune/EN.yaml', type=str, 57 | help='dataset config path') 58 | parser.add_argument('--num_workers', default=10, type=int) 59 | parser.add_argument('--pin_mem', action='store_true', 60 | help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') 61 | parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') 62 | parser.set_defaults(pin_mem=True) 63 | 64 | 65 | parser.add_argument('--output_dir', default='./output', 66 | help='path where to save, empty for no saving') 67 | parser.add_argument('--log_dir', default='./output', 68 | help='path where to tensorboard log') 69 | parser.add_argument('--device', default='cuda', 70 | help='device to use for training / testing') 71 | parser.add_argument('--seed', default=0, type=int) 72 | 73 | 74 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N', 75 | help='start epoch') 76 | 77 | # distributed training parameters 78 | parser.add_argument('--world_size', default=1, type=int, 79 | help='number of distributed processes') 80 | parser.add_argument('--local_rank', default=-1, type=int) 81 | parser.add_argument('--dist_on_itp', action='store_true') 82 | parser.add_argument('--dist_url', default='env://', 83 | help='url used to set up distributed training') 84 | 85 | return parser 86 | 87 | 88 | def main(args): 89 | misc.init_distributed_mode(args) 90 | 91 | print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) 92 | print("{}".format(args).replace(', ', ',\n')) 93 | 94 | device = torch.device(args.device) 95 | 96 | # fix the seed for reproducibility 97 | seed = args.seed + misc.get_rank() 98 | torch.manual_seed(seed) 99 | np.random.seed(seed) 100 | cudnn.benchmark = True 101 | 102 | # define the model 103 | llama_type = args.llama_type 104 | llama_ckpt_dir = os.path.join(args.llama_path, llama_type) 105 | llama_tokenzier_path = os.path.join(args.llama_path, 'tokenizer.model') 106 | model = LLaMA_adapter(llama_ckpt_dir, llama_tokenzier_path) 107 | 108 | model.to(device) 109 | 110 | model_without_ddp = model 111 | print("Model = %s" % str(model_without_ddp)) 112 | 113 | print("Trainable Params:") 114 | print([(key, val.shape) for key, val in model.named_parameters() if val.requires_grad]) 115 | 116 | if args.distributed: 117 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True) 118 | model_without_ddp = model.module 119 | 120 | # training detail 121 | eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() 122 | 123 | if args.lr is None: # only base_lr is specified 124 | args.lr = args.blr * eff_batch_size / 256 125 | 126 | print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) 127 | print("actual lr: %.2e" % args.lr) 128 | 129 | print("accumulate grad 
iterations: %d" % args.accum_iter) 130 | print("effective batch size: %d" % eff_batch_size) 131 | 132 | # following timm: set wd as 0 for bias and norm layers 133 | param_groups = misc.add_weight_decay(model_without_ddp, args.weight_decay) 134 | optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) 135 | print(optimizer) 136 | loss_scaler = NativeScaler() 137 | 138 | misc.load_model(model_without_ddp, args.pretrained_path) 139 | 140 | 141 | dataset_train = FinetuneDataset(args.data_config, transform=transform_train, 142 | max_words=args.max_words, tokenizer_path=llama_tokenzier_path) 143 | print(dataset_train) 144 | num_tasks = misc.get_world_size() 145 | global_rank = misc.get_rank() 146 | sampler_train = torch.utils.data.DistributedSampler( 147 | dataset_train, num_replicas=num_tasks, rank=global_rank, shuffle=True 148 | ) 149 | print("Sampler_train = %s" % str(sampler_train)) 150 | 151 | data_loader_train = torch.utils.data.DataLoader( 152 | dataset_train, sampler=sampler_train, 153 | batch_size=args.batch_size, 154 | num_workers=args.num_workers, 155 | pin_memory=args.pin_mem, 156 | drop_last=True, 157 | ) 158 | 159 | # SummaryWrite 160 | if global_rank == 0 and args.log_dir is not None: 161 | os.makedirs(args.log_dir, exist_ok=True) 162 | log_writer = SummaryWriter(log_dir=args.log_dir) 163 | else: 164 | log_writer = None 165 | 166 | 167 | print(f"Start training for {args.epochs} epochs") 168 | start_time = time.time() 169 | for epoch in range(args.start_epoch, args.epochs): 170 | if args.distributed: 171 | data_loader_train.sampler.set_epoch(epoch) 172 | 173 | train_stats = train_one_epoch( 174 | model, data_loader_train, 175 | optimizer, device, epoch, loss_scaler, 176 | log_writer=log_writer, 177 | args=args 178 | ) 179 | 180 | if args.output_dir and (epoch % 5 == 0 or epoch + 1 == args.epochs): 181 | misc.save_model( 182 | args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, 183 | loss_scaler=loss_scaler, epoch=epoch) 184 | 185 | log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, 186 | 'epoch': epoch, 187 | **{f'val_{k}': v for k, v in train_stats.items()}} 188 | 189 | if args.output_dir and misc.is_main_process(): 190 | if log_writer is not None: 191 | log_writer.flush() 192 | with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: 193 | f.write(json.dumps(log_stats) + "\n") 194 | 195 | total_time = time.time() - start_time 196 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 197 | print('Training time {}'.format(total_time_str)) 198 | 199 | 200 | if __name__ == '__main__': 201 | args = get_args_parser() 202 | args = args.parse_args() 203 | if args.output_dir: 204 | Path(args.output_dir).mkdir(parents=True, exist_ok=True) 205 | main(args) 206 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/main_pretrain.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.backends.cudnn as cudnn 3 | from torch.utils.tensorboard import SummaryWriter 4 | from torch.utils.data import Dataset 5 | 6 | import util.misc as misc 7 | from util.misc import NativeScalerWithGradNormCount as NativeScaler 8 | from llama.llama_adapter import LLaMA_adapter 9 | 10 | from data.dataset import PretrainDataset, transform_train 11 | 12 | import argparse 13 | import datetime 14 | import json 15 | import numpy as np 16 | import os 17 | import time 18 | from pathlib 
import Path 19 | 20 | from engine_pretrain import train_one_epoch 21 | 22 | 23 | def get_args_parser(): 24 | parser = argparse.ArgumentParser('llama_adapterV2 pre-training', add_help=False) 25 | parser.add_argument('--batch_size', default=64, type=int, 26 | help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus') 27 | parser.add_argument('--epochs', default=400, type=int) 28 | parser.add_argument('--accum_iter', default=1, type=int, 29 | help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)') 30 | 31 | # Model parameters 32 | parser.add_argument('--llama_type', default='7B', type=str, 33 | help='Type of LLaMA model') # 34 | parser.add_argument('--llama_path', default='/path/to/llama', type=str, 35 | help='path to LLaMA pretrained checkpoint') 36 | parser.add_argument('--max_words', default=96, type=int, 37 | help='max number of input words') 38 | 39 | # Optimizer parameters 40 | parser.add_argument('--weight_decay', type=float, default=0.05, 41 | help='weight decay (default: 0.05)') 42 | 43 | parser.add_argument('--lr', type=float, default=None, metavar='LR', 44 | help='learning rate (absolute lr)') 45 | parser.add_argument('--blr', type=float, default=1e-3, metavar='LR', 46 | help='base learning rate: absolute_lr = base_lr * total_batch_size / 256') 47 | parser.add_argument('--min_lr', type=float, default=0., metavar='LR', 48 | help='lower lr bound for cyclic schedulers that hit 0') 49 | 50 | parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N', 51 | help='epochs to warmup LR') 52 | 53 | # Dataset parameters 54 | parser.add_argument('--data_config', default='configs/data/pretrain/EN.yaml', type=str, 55 | help='dataset config path') 56 | parser.add_argument('--num_workers', default=10, type=int) 57 | parser.add_argument('--pin_mem', action='store_true', 58 | help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') 59 | parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem') 60 | parser.set_defaults(pin_mem=True) 61 | 62 | 63 | parser.add_argument('--output_dir', default='./output', 64 | help='path where to save, empty for no saving') 65 | parser.add_argument('--log_dir', default='./output', 66 | help='path where to tensorboard log') 67 | parser.add_argument('--device', default='cuda', 68 | help='device to use for training / testing') 69 | parser.add_argument('--seed', default=0, type=int) 70 | 71 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N', 72 | help='start epoch') 73 | 74 | # distributed training parameters 75 | parser.add_argument('--world_size', default=1, type=int, 76 | help='number of distributed processes') 77 | parser.add_argument('--local_rank', default=-1, type=int) 78 | parser.add_argument('--dist_on_itp', action='store_true') 79 | parser.add_argument('--dist_url', default='env://', 80 | help='url used to set up distributed training') 81 | 82 | parser.add_argument('--split_epoch', type=int, default=50) 83 | 84 | return parser 85 | 86 | 87 | def main(args): 88 | misc.init_distributed_mode(args) 89 | 90 | print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__)))) 91 | print("{}".format(args).replace(', ', ',\n')) 92 | 93 | device = torch.device(args.device) 94 | 95 | # fix the seed for reproducibility 96 | seed = args.seed + misc.get_rank() 97 | torch.manual_seed(seed) 98 | np.random.seed(seed) 99 | cudnn.benchmark = True 100 | 101 | # define the model 102 | llama_type = args.llama_type 103 | 
llama_ckpt_dir = os.path.join(args.llama_path, llama_type) 104 | llama_tokenzier_path = os.path.join(args.llama_path, 'tokenizer.model') 105 | model = LLaMA_adapter(llama_ckpt_dir, llama_tokenzier_path, phase="pretrain") 106 | 107 | model.to(device) 108 | 109 | model_without_ddp = model 110 | print("Model = %s" % str(model_without_ddp)) 111 | 112 | print("Trainable Params:") 113 | print([(key, val.shape) for key, val in model.named_parameters() if val.requires_grad]) 114 | 115 | if args.distributed: 116 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True) 117 | model_without_ddp = model.module 118 | 119 | # training detail 120 | eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size() 121 | 122 | if args.lr is None: # only base_lr is specified 123 | args.lr = args.blr * eff_batch_size / 256 124 | 125 | print("base lr: %.2e" % (args.lr * 256 / eff_batch_size)) 126 | print("actual lr: %.2e" % args.lr) 127 | 128 | print("accumulate grad iterations: %d" % args.accum_iter) 129 | print("effective batch size: %d" % eff_batch_size) 130 | 131 | # following timm: set wd as 0 for bias and norm layers 132 | param_groups = misc.add_weight_decay(model_without_ddp, args.weight_decay) 133 | optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=(0.9, 0.95)) 134 | print(optimizer) 135 | loss_scaler = NativeScaler() 136 | 137 | 138 | 139 | dataset_train = PretrainDataset(args.data_config, transform=transform_train, 140 | max_words=args.max_words, tokenizer_path=llama_tokenzier_path) 141 | print(dataset_train) 142 | num_tasks = misc.get_world_size() 143 | global_rank = misc.get_rank() 144 | sampler_train = misc.DistributedSubEpochSampler( 145 | dataset_train, num_replicas=num_tasks, rank=global_rank, split_epoch=args.split_epoch, shuffle=True 146 | ) 147 | print("Sampler_train = %s" % str(sampler_train)) 148 | 149 | data_loader_train = torch.utils.data.DataLoader( 150 | dataset_train, sampler=sampler_train, 151 | batch_size=args.batch_size, 152 | num_workers=args.num_workers, 153 | pin_memory=args.pin_mem, 154 | drop_last=True, 155 | ) 156 | 157 | # SummaryWrite 158 | if global_rank == 0 and args.log_dir is not None: 159 | os.makedirs(args.log_dir, exist_ok=True) 160 | log_writer = SummaryWriter(log_dir=args.log_dir) 161 | else: 162 | log_writer = None 163 | 164 | 165 | print(f"Start training for {args.epochs} epochs") 166 | start_time = time.time() 167 | for epoch in range(args.start_epoch, args.epochs): 168 | if args.distributed: 169 | data_loader_train.sampler.set_epoch(epoch) 170 | 171 | train_stats = train_one_epoch( 172 | model, data_loader_train, 173 | optimizer, device, epoch, loss_scaler, 174 | log_writer=log_writer, 175 | args=args 176 | ) 177 | 178 | if args.output_dir and (epoch % 2 == 0 or epoch + 1 == args.epochs): 179 | misc.save_model( 180 | args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, 181 | loss_scaler=loss_scaler, epoch=epoch) 182 | 183 | log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, 184 | 'epoch': epoch} 185 | 186 | if args.output_dir and misc.is_main_process(): 187 | if log_writer is not None: 188 | log_writer.flush() 189 | with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f: 190 | f.write(json.dumps(log_stats) + "\n") 191 | 192 | total_time = time.time() - start_time 193 | total_time_str = str(datetime.timedelta(seconds=int(total_time))) 194 | print('Training time {}'.format(total_time_str)) 195 | 196 | 197 | if 
__name__ == '__main__': 198 | args = get_args_parser() 199 | args = args.parse_args() 200 | if args.output_dir: 201 | Path(args.output_dir).mkdir(parents=True, exist_ok=True) 202 | main(args) 203 | -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/requirements.txt: -------------------------------------------------------------------------------- 1 | --extra-index-url https://download.pytorch.org/whl/cu117 2 | torch==2.0.0+cu117 3 | torchvision==0.15.1+cu117 4 | tensorboard 5 | fairscale 6 | sentencepiece 7 | Pillow 8 | opencv-python 9 | gradio 10 | tqdm 11 | tenacity 12 | openai 13 | git+https://github.com/csuhan/timm_0_3_2.git 14 | git+https://github.com/openai/CLIP.git -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/util/evaluate_mme.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import argparse 4 | from tqdm import tqdm 5 | import PIL 6 | from PIL import Image 7 | import torch 8 | import torch.distributed as dist 9 | from torch.utils.data import Dataset 10 | import cv2 11 | from llama.llama_adapter import LLaMA_adapter 12 | 13 | DATA_DIR = "./MME_Benchmark_release_version" 14 | 15 | def get_image(image): 16 | if type(image) is str: 17 | try: 18 | return Image.open(image).convert("RGB") 19 | except Exception as e: 20 | print(f"Fail to read image: {image}") 21 | exit(-1) 22 | elif type(image) is Image.Image: 23 | return image 24 | elif type(image) is PIL.JpegImagePlugin.JpegImageFile: 25 | return image 26 | elif type(image) is PIL.PngImagePlugin.PngImageFile: 27 | return image 28 | elif type(image) is PIL.MpoImagePlugin.MpoImageFile: 29 | return image 30 | else: 31 | raise NotImplementedError(f"Invalid type of Image: {type(image)}") 32 | 33 | 34 | class MMEDataset(Dataset): 35 | def __init__( 36 | self, 37 | dataset_name 38 | ): 39 | self.dataset_name = dataset_name 40 | self.dataset = [] 41 | jpg_sets = ["artwork", "celebrity", "color", "count", "existence", "landmark", "OCR", "position", "posters", "scene"] 42 | png_sets = ["code_reasoning", "commonsense_reasoning", "numerical_calculation", "text_translation"] 43 | image_suffix = '.jpg' if dataset_name in jpg_sets else ".png" 44 | 45 | assert (dataset_name in jpg_sets) or (dataset_name in png_sets), f"Invalid dataset name for MME benchmark: {dataset_name}" 46 | 47 | if os.path.exists(f"{DATA_DIR}/{dataset_name}/images") and os.path.exists(f"{DATA_DIR}/{dataset_name}/questions_answers_YN"): 48 | question_files = os.listdir(f"{DATA_DIR}/{dataset_name}/questions_answers_YN") 49 | for question_file in question_files: 50 | image_file_name = os.path.join(DATA_DIR, dataset_name, "images", question_file.replace('.txt', image_suffix)) 51 | with open(os.path.join(DATA_DIR, dataset_name, "questions_answers_YN", question_file), 'r', encoding='utf-8') as f: 52 | for line in f.readlines(): 53 | try: 54 | question, gt_answer = line.replace('\n', '').split('\t') 55 | self.dataset.append({ 56 | "image_path": image_file_name, 57 | "gt_answers": gt_answer, 58 | "question": question 59 | }) 60 | except: 61 | pass 62 | 63 | else: 64 | question_files = glob.glob(f"{DATA_DIR}/{dataset_name}/*.txt") 65 | for question_file in question_files: 66 | image_file_name = question_file.replace(".txt", image_suffix) 67 | with open(question_file, 'r', encoding='utf-8') as f: 68 | for line in f.readlines(): 69 | try: 70 | question, gt_answer = 
line.replace('\n', '').split('\t') 71 | self.dataset.append({ 72 | "image_path": image_file_name, 73 | "gt_answers": gt_answer, 74 | "question": question 75 | }) 76 | except: 77 | pass 78 | 79 | def __len__(self): 80 | return len(self.dataset) 81 | 82 | def __getitem__(self, idx): 83 | return self.dataset[idx] 84 | 85 | 86 | def get_args_parser(): 87 | parser = argparse.ArgumentParser('Single-turn (conversation) demo', add_help=False) 88 | # Model parameters 89 | parser.add_argument('--llama_path', default='/path/to/llama', type=str, 90 | help='path to LLaMA pretrained checkpoint') 91 | parser.add_argument('--pretrained_path', default='/path/to/pretrained', type=str, 92 | help='directory containing pre-trained checkpoints') 93 | parser.add_argument('--lora', default=16, type=int) 94 | parser.add_argument('--output_path', default='/path/to/output_results', type=str) 95 | return parser 96 | 97 | 98 | if __name__ == "__main__": 99 | args = get_args_parser().parse_args() 100 | 101 | device = "cuda" if torch.cuda.is_available() else "cpu" 102 | 103 | llama_dir = args.llama_path 104 | llama_type = '7B' 105 | llama_ckpt_dir = os.path.join(llama_dir, llama_type) 106 | llama_tokenzier_path = os.path.join(llama_dir, 'tokenizer.model') 107 | 108 | model_path = args.pretrained_path 109 | # load llama_adapter weights and model_cfg 110 | print(f'Loading LLaMA-Adapter from {model_path}') 111 | ckpt = torch.load(model_path, map_location='cpu') 112 | 113 | w_bias = True 114 | w_lora = args.lora > 0 115 | print('Lora:', w_lora) 116 | lora_rank = args.lora 117 | model = LLaMA_adapter( 118 | llama_ckpt_dir, llama_tokenzier_path, 119 | max_seq_len=512, max_batch_size=32, 120 | clip_model='ViT-L/14', 121 | v_embed_dim=768, v_depth=8, 122 | v_num_heads=16, v_mlp_ratio=4.0, 123 | query_len=10, query_layer=31, 124 | w_bias=w_bias, 125 | w_lora=w_lora, 126 | lora_rank=lora_rank, 127 | w_new_gate=w_lora, # for compatibility 128 | phase='finetune') 129 | 130 | load_result = model.load_state_dict(ckpt['model'], strict=False) 131 | print(load_result) 132 | 133 | model = model.to(device) 134 | model.half() 135 | model.eval() 136 | preprocess = model.clip_transform 137 | 138 | prompt_format = ( 139 | "Below is an instruction that describes a task. 
" 140 | "Write a response that appropriately completes the request using a single word or phrase.\n\n" 141 | "### Instruction:\n{instruction}\n\n### Response:" 142 | ) 143 | 144 | def multi_modal_generate( 145 | img_path: str, 146 | prompt: str, 147 | max_gen_len=30, 148 | temperature: float = 0, 149 | top_p: float = 0.75, 150 | ): 151 | img = Image.fromarray(cv2.imread(img_path)) 152 | img = preprocess(img).unsqueeze(0).half().to(device) 153 | prompt = prompt_format.format_map({'instruction': prompt}) 154 | 155 | result = model.generate(img, [prompt], 156 | max_gen_len=max_gen_len, 157 | temperature=temperature, 158 | top_p=top_p) 159 | return result[0] 160 | 161 | 162 | result = {} 163 | dataset_names = ["artwork", "celebrity", "color", "count", "existence", "OCR", "position", "posters", "scene", "code_reasoning", "commonsense_reasoning", "numerical_calculation", "text_translation", "landmark"] # landmark (03d5e3bfc958be38.jpg) 164 | answer_path = args.output_path 165 | batch_size = 1 166 | 167 | print("Starting...") 168 | for dataset_name in dataset_names: 169 | dataset = MMEDataset(dataset_name) 170 | 171 | predictions = [] 172 | with torch.no_grad(): 173 | for data in tqdm(dataset, desc=f"Inferencing {dataset_name}"): 174 | pred = multi_modal_generate(data['image_path'], data['question']) 175 | predictions.append({'image_path': data['image_path'], 'question': data['question'], 'answer': pred, 'gt_answers': data['gt_answers']}) 176 | 177 | os.makedirs(answer_path, exist_ok=True) 178 | prediction_file = os.path.join(answer_path, f"{dataset_name}.txt") 179 | out_datas = [ 180 | f"{data['image_path']}\t{data['question']}\t{data['gt_answers']}\t{data['answer']}" 181 | for data in predictions 182 | ] 183 | with open(prediction_file, 'w') as f: 184 | f.write('\n'.join(out_datas)) -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/util/extract_adapter_from_checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def save(full_model, path, model_type = 'BIAS'): 4 | if model_type == 'BIAS': 5 | keys = [ 6 | f'visual_blocks.{i}.{key}.{suffix}' 7 | for i in range(8) 8 | for key in ['norm1', 'attn.qkv', 'attn.proj', 'norm2', 'mlp.fc1', 'mlp.fc2'] 9 | for suffix in ['weight', 'bias'] 10 | ] + [ 11 | f'llama.layers.{i}.{key}' 12 | for i in range(32) 13 | for key in ['attention.gate', 'attention.wq.bias', 'attention.wo.bias', 'feed_forward.w1.bias', 'feed_forward.w2.bias', 'feed_forward.w3.bias', 'attention_norm.weight', 'ffn_norm.weight'] 14 | ] + [ 15 | f'{base_key}.{suffix}' 16 | for base_key in ['clip_proj_norm', 'visual_proj_norm', 'visual_proj', 'clip_proj'] 17 | for suffix in ['weight', 'bias'] 18 | ] + ['llama.norm.weight', 'visual_query.weight', 'adapter_query.weight'] 19 | 20 | 21 | elif model_type == 'LORA': 22 | keys = [ 23 | f'visual_blocks.{i}.{key}.{suffix}' 24 | for i in range(8) 25 | for key in [f'norm{j}' for j in range(1, 3)] + ['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2'] 26 | for suffix in ['weight', 'bias'] 27 | ] + [ 28 | f'llama.layers.{i}.{key}' 29 | for i in range(32) 30 | for key in ['attention.gate', 'attention.wq.bias', 'attention.wo.bias', 'feed_forward.w1.bias', 'feed_forward.w2.bias', 'feed_forward.w3.bias', 'attention_norm.weight', 'ffn_norm.weight'] 31 | + [f'attention.lora_wk_l{j}.weight' for j in range(1, 3)] 32 | + [f'attention.lora_wo_l{j}.weight' for j in range(1, 3)] 33 | + [f'feed_forward.lora_w{k}_l{j}.weight' for k in 
range(1, 4) for j in range(1, 3)] 34 | + [f'attention.lora_wq_l{j}.weight' for j in range(1, 3)] 35 | + [f'attention.lora_wv_l{j}.weight' for j in range(1, 3)] 36 | + ['attention.new_gate'] 37 | ] + [ 38 | f'{base_key}.{suffix}' 39 | for base_key in ['clip_proj_norm', 'visual_proj_norm', 'visual_proj', 'clip_proj'] 40 | for suffix in ['weight', 'bias'] 41 | ] + ['llama.norm.weight', 'visual_query.weight', 'adapter_query.weight'] 42 | 43 | ## TODO: Add other model types 44 | 45 | full_model_state_dict = full_model.state_dict() 46 | small_weights = {key: full_model_state_dict[key] for key in keys} 47 | if model_type == 'BIAS': 48 | wrapped_small_weights = {'model': small_weights,'config': {'w_bias': True, 'w_lora': False, 'lora_rank': 16}} 49 | elif model_type == 'LORA': 50 | wrapped_small_weights = {'model': small_weights,'config': {'w_bias': True, 'w_lora': True, 'lora_rank': 16}} 51 | # Save the wrapped small weights 52 | torch.save(wrapped_small_weights, path) -------------------------------------------------------------------------------- /challenge/llama_adapter_v2_multimodal7b/util/lr_sched.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | 9 | def adjust_learning_rate(optimizer, epoch, args): 10 | """Decay the learning rate with half-cycle cosine after warmup""" 11 | if epoch < args.warmup_epochs: 12 | lr = args.lr * epoch / args.warmup_epochs 13 | else: 14 | lr = args.min_lr + (args.lr - args.min_lr) * 0.5 * \ 15 | (1. + math.cos(math.pi * (epoch - args.warmup_epochs) / (args.epochs - args.warmup_epochs))) 16 | for param_group in optimizer.param_groups: 17 | if "lr_scale" in param_group: 18 | param_group["lr"] = lr * param_group["lr_scale"] 19 | else: 20 | param_group["lr"] = lr 21 | return lr 22 | -------------------------------------------------------------------------------- /challenge/prepare_submission.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | # Please fill in your team information here 4 | method = "" # -- name of the method 5 | team = "" # -- name of the team, !!!identical to the Google Form!!! 6 | authors = [""] # -- list of str, authors 7 | email = "" # -- e-mail address 8 | institution = "" # -- institution or company 9 | country = "" # -- country or region 10 | 11 | 12 | def main(): 13 | with open('output.json', 'r') as file: 14 | output_res = json.load(file) 15 | 16 | submission_content = { 17 | "method": method, 18 | "team": team, 19 | "authors": authors, 20 | "email": email, 21 | "institution": institution, 22 | "country": country, 23 | "results": output_res 24 | } 25 | 26 | with open('submission.json', 'w') as file: 27 | json.dump(submission_content, file, indent=4) 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /docs/data_details.md: -------------------------------------------------------------------------------- 1 | ## Features of the DriveLM-Data 2 | 3 | - 🛣 Completeness in functionality (covering **Perception**, **Prediction**, and **Planning** QA pairs). 4 | 5 | 6 |
9 | 10 | 11 | - 🔜 Reasoning for future events that have not yet happened. 12 | - Many **"What If"**-style questions: imagine the future by language. 13 | 14 | 15 |
18 | 19 | - ♻ Task-driven decomposition. 20 | - **One** scene-level description into **many** frame-level trajectories & planning QA pairs. 21 | 22 |
25 | 26 | ## How about the annotation process? 27 | 28 | The annotation process is different for DriveLM-nuScenes and DriveLM-CARLA. 29 | 30 |
33 | 34 | **For DriveLM-nuScenes**, we divide the annotation process into three steps: 35 | 36 | 1️⃣ Keyframe selection. Given all frames in one clip, the annotator selects the keyframes that need annotation. The criterion is that those frames should involve changes in ego-vehicle movement status (lane changes, sudden stops, start after a stop, etc.). 37 | 38 | 2️⃣ Key objects selection. Given keyframes, the annotator needs to pick up key objects in the six surrounding images. The criterion is that those objects should be able to affect the action of the ego vehicle (traffic signals, pedestrians crossing the road, other vehicles that move in the direction of the ego vehicle, etc.). 39 | 40 | 3️⃣ Question and answer annotation. Given those key objects, we automatically generate questions regarding single or multiple objects about perception, prediction, and planning. More details can be found in our data. 41 | 42 | **For DriveLM-CARLA**, we employ an automated annotation approach: 43 | 44 | We collect data using CARLA 0.9.14 in the Leaderboard 2.0 framework with a privileged rule-based expert. We set up a series of routes in urban, residential, and rural areas and execute the expert on these routes. During this process, we collect the necessary sensor data, generate relevant QAs based on privileged information about objects and the scene, and organize the logical relationships to connect this series of QAs into a graph. 45 | -------------------------------------------------------------------------------- /docs/data_prep_nus.md: -------------------------------------------------------------------------------- 1 | ## Download data 2 | We kindly hope you to fill out the [form](https://docs.google.com/forms/d/e/1FAIpQLSeX6CR3u-15IV-TKx2uPv1wiKjydjZ__NNW98H4nR5JZtQa2Q/viewform) before downloading. To get started, download nuScenes subset image data and DriveLM-nuScenes QA json files below. For v1.1 data, please visit the [DriveLM/challenge](https://github.com/OpenDriveLab/DriveLM/tree/main/challenge) folder. 3 | 4 | 7 | 8 | | nuScenes subset images | DriveLM-nuScenes version-1.0| 9 | |-------|-------| 10 | | [Google Drive](https://drive.google.com/file/d/1DeosPGYeM2gXSChjMODGsQChZyYDmaUz/view?usp=sharing) | [Google Drive](https://drive.google.com/file/d/1LK7pYHytv64neN1626u6eTQBy1Uf4IQH/view?usp=sharing) | 11 | |[Baidu Netdisk](https://pan.baidu.com/s/11xvxPzUY5xTIsJQrYFogqg?pwd=mk95)|[Baidu Netdisk](https://pan.baidu.com/s/1PAdotDY0MN3nkz8w_XhDsw?pwd=l4wf) | 12 | |[HuggingFace](https://huggingface.co/datasets/OpenDriveLab/DriveLM/blob/main/drivelm_nus_imgs_train.zip)|[HuggingFace](https://huggingface.co/datasets/OpenDriveLab/DriveLM/blob/main/v1_0_train_nus.json) 13 | 14 | You can also download the full nuScenes dataset [HERE](https://www.nuscenes.org/download) to enable video input. 15 | 16 | Our DriveLM dataset contains a collection of questions and answers. Currently, only the training set is publicly available. The dataset is named `v1_0_train_nus.json`. 17 | 18 | 19 | 20 | 21 | ## Prepare the dataset 22 | 23 | Organize the data structure as follows: 24 | 25 | ``` 26 | DriveLM 27 | ├── data/ 28 | │ ├── QA_dataset_nus/ 29 | │ │ ├── v1_0_train_nus.json 30 | │ ├── nuscenes/ 31 | │ │ ├── samples/ 32 | ``` 33 | 34 | 35 | #### File structure 36 | 37 | The QA pairs are in the `v1_0_train_nus.json`. Below is the json file structure. 
All `coordinates` mentioned are referenced from the `upper-left` corner of the respective camera, with the `right` and `bottom` directions serving as the positive x and y axes, respectively. 38 | ``` 39 | v1_0_train_nus.json 40 | ├── scene_token:{ 41 | │ ├── "scene_description": "The ego vehicle proceeds along the current road, preparing to enter the main road after a series of consecutive right turns.", 42 | │ ├── "key_frames":{ 43 | │ │ ├── "frame_token_1":{ 44 | │ │ │ ├── "key_object_infos":{"": {"Category": "Vehicle", "Status": "Moving", "Visual_description": "White Sedan", "2d_bbox": [x_min, y_min, x_max, y_max]}, ...}, 45 | │ │ │ ├── "QA":{ 46 | │ │ │ │ ├── "perception":[ 47 | │ │ │ │ │ ├── {"Q": "What are the important objects in the current scene?", "A": "The important objects are , , ...", "C": None, "con_up": None, "con_down": None, "cluster": None, "layer": None}, 48 | │ │ │ │ │ ├── {"Q": "xxx", "A": "xxx", "C": None, "con_up": None, "con_down": None, "cluster": None, "layer": None}, ... 49 | │ │ │ │ ├── ], 50 | │ │ │ │ ├── "prediction":[ 51 | │ │ │ │ │ ├── {"Q": "What is the future state of ?", "A": "Slightly offset to the left in maneuvering.", "C": None, "con_up": None, "con_down": None, "cluster": None, "layer": None}, ... 52 | │ │ │ │ ├── ], 53 | │ │ │ │ ├── "planning":[ 54 | │ │ │ │ │ ├── {"Q": "In this scenario, what are safe actions to take for the ego vehicle?", "A": "Brake gently to a stop, turn right, turn left.", "C": None, "con_up": None, "con_down": None, "cluster": None, "layer": None}, ... 55 | │ │ │ │ ├── ], 56 | │ │ │ │ ├── "behavior":[ 57 | │ │ │ │ │ ├── {"Q": "Predict the behavior of the ego vehicle.", "A": "The ego vehicle is going straight. The ego vehicle is driving slowly.", "C": None, "con_up": None, "con_down": None, "cluster": None, "layer": None} 58 | │ │ │ │ ├── ] 59 | │ │ │ ├── }, 60 | │ │ │ ├── "image_paths":{ 61 | │ │ │ │ ├── "CAM_FRONT": "xxx", 62 | │ │ │ │ ├── "CAM_FRONT_LEFT": "xxx", 63 | │ │ │ │ ├── "CAM_FRONT_RIGHT": "xxx", 64 | │ │ │ │ ├── "CAM_BACK": "xxx", 65 | │ │ │ │ ├── "CAM_BACK_LEFT": "xxx", 66 | │ │ │ │ ├── "CAM_BACK_RIGHT": "xxx", 67 | │ │ │ ├── } 68 | │ │ ├── }, 69 | │ │ ├── "frame_token_2":{ 70 | │ │ │ ├── "key_object_infos":{"": {"Category": "Traffic element", "Status": "None", "Visual_description": "Stop sign", "2d_bbox": [x_min, y_min, x_max, y_max]}, ...}, 71 | │ │ │ ├── "QA":{ 72 | │ │ │ │ ├── "perception":[...], 73 | │ │ │ │ ├── "prediction":[...], 74 | │ │ │ │ ├── "planning":[...], 75 | │ │ │ │ ├── "behavior":[...] 76 | │ │ │ ├── }, 77 | │ │ │ ├── "image_paths":{...} 78 | │ │ ├── } 79 | │ ├── } 80 | ├── } 81 | ``` 82 | 83 | - `scene_token` is the same as in nuScenes dataset. 84 | - `scene_description` is a one-sentence summary of ego-vehicle behavior in the about 20-second video clip (the notion of a scene in nuScenes dataset). 85 | - Under `key_frames`, each key frame is identified by the `frame_token`, which corresponds to the `token` in the nuScenes dataset. 86 | - The `key_object_infos` is a mapping between `c tag` (i.e. \) and more information about the related key objects such as the category, the status, the visual description, and the 2d bounding box. 87 | - `QA` is divided into different tasks, and QA pairs under each task are formulated as a list of dictionaries. Each dictionary encompasses keys of `Q` (question), `A` (answer), `C` (context), `con_up`, `con_down`, `cluster`, and `layer`. 
Currently, the values of context related keys are set to None, serving as a tentative placeholder for future fields related to DriveLM-CARLA. 88 | 89 | 90 | **Note:** The `c tag` label is used to indicate key objects selected during the annotation process. These objects include not only those present in the ground truth but also objects that are not, such as landmarks and traffic lights. Each key frame contains a minimum of three and a maximum of six key objects. The organization format of the `c tag` is ``, where c is the identifier, CAM indicates the camera where the key object’s center point is situated, and x, y represent the horizontal and vertical coordinates of the 2D bounding box in the respective camera’s coordinate system with the `upper-left` corner as the `origin`, and the `right` and `bottom` as the `positive x and y axes`, respectively. 91 | 92 | In contrast to the `c tag`, for the question "Identify all the traffic elements in the front view," the output is presented as a list formatted as `[(c, s, x1, y1, x2, y2), ...]`. Here, `c` denotes the category, `s` represents the status, and `x1, y1, x2, y2` indicate the offsets of the top-left and bottom-right corners of the box relative to the center point. 93 | 94 | 95 |
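As an illustration of how these fields can be consumed, the sketch below walks the JSON hierarchy described above and splits a `c tag` into its identifier, camera, and pixel coordinates. This is a minimal example rather than part of the official toolkit: the file path is a placeholder for your local copy, the tag pattern assumes identifiers spelled like `c1`, `c2` in the `<c,CAM,x,y>` format documented here, and only the keys listed above are used.

```python
import json
import re

# Minimal sketch (not part of the official toolkit): walk the QA hierarchy of
# v1_0_train_nus.json and parse c tags such as <c1,CAM_FRONT,1075.5,382.8>.
# The path below is a placeholder; point it to your local copy of the file.
with open("data/QA_dataset_nus/v1_0_train_nus.json", "r") as f:
    data = json.load(f)

# <identifier, camera, x, y> -- the identifier spelling (c1, c2, ...) is an assumption
C_TAG = re.compile(r"<(c\d+),([A-Z_]+),([\d.]+),([\d.]+)>")

def parse_c_tag(tag):
    """Split a c tag string into (identifier, camera, x, y)."""
    m = C_TAG.fullmatch(tag)
    if m is None:
        raise ValueError(f"not a valid c tag: {tag}")
    identifier, camera, x, y = m.groups()
    return identifier, camera, float(x), float(y)

for scene_token, scene in data.items():
    for frame_token, frame in scene["key_frames"].items():
        # key_object_infos maps c tags to category, status, visual description and 2d bbox
        for tag, info in frame["key_object_infos"].items():
            identifier, camera, x, y = parse_c_tag(tag)
            print(identifier, camera, x, y, info["Category"], info["2d_bbox"])
        # QA pairs are grouped by task: perception, prediction, planning, behavior
        for task, qa_list in frame["QA"].items():
            for qa in qa_list:
                question, answer = qa["Q"], qa["A"]
```

Anything beyond the keys documented above (for example, the exact spelling of the identifiers in the released JSON) should be verified against `v1_0_train_nus.json` itself.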
96 | data 97 |
98 | 99 | -------------------------------------------------------------------------------- /docs/gvqa.md: -------------------------------------------------------------------------------- 1 | ### What is GVQA? 2 | The most exciting aspect of the dataset is that the questions and answers (`QA pairs`) are connected in a graph-style structure, with QA pairs as every node and potential logical progression as the edges. The reason for doing this in the AD domain is that AD tasks are well-defined per stage, from raw sensor input to final control action through perception, prediction and planning. 3 | 4 | Its key difference to prior VQA tasks for AD is the availability of logical dependencies between QAs, which can be used to guide the answering process. Below is a demo video illustrating the idea. 5 | 6 | https://github.com/OpenDriveLab/DriveLM/assets/54334254/988472a8-d7b9-4685-b4b8-7a0e77f68265 7 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: drivelm 2 | channels: 3 | - omgarcia 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _openmp_mutex=5.1=1_gnu 9 | - appdirs=1.4.4=pyhd3eb1b0_0 10 | - blas=1.0=mkl 11 | - brotlipy=0.7.0=py38h27cfd23_1003 12 | - ca-certificates=2023.7.22=hbcca054_0 13 | - certifi=2023.7.22=pyhd8ed1ab_0 14 | - cffi=1.15.1=py38h5eee18b_3 15 | - cryptography=41.0.2=py38h22a60cf_0 16 | - cudatoolkit=11.7.0=hd8887f6_10 17 | - gcc-6=6.1.0=2 18 | - gmp=6.1.0=1 19 | - idna=3.4=py38h06a4308_0 20 | - intel-openmp=2023.1.0=hdb19cb5_46305 21 | - isl=0.17.1=0 22 | - ld_impl_linux-64=2.38=h1181459_1 23 | - libffi=3.4.4=h6a678d5_0 24 | - libgcc-ng=11.2.0=h1234567_1 25 | - libgfortran-ng=11.2.0=h00389a5_1 26 | - libgfortran5=11.2.0=h1234567_1 27 | - libgomp=11.2.0=h1234567_1 28 | - libstdcxx-ng=11.2.0=h1234567_1 29 | - mkl=2023.1.0=h6d00ec8_46342 30 | - mkl-service=2.4.0=py38h5eee18b_1 31 | - mkl_fft=1.3.6=py38h417a72b_1 32 | - mkl_random=1.2.2=py38h417a72b_1 33 | - mpc=1.0.1=0 34 | - mpfr=3.1.2=0 35 | - ncurses=6.4=h6a678d5_0 36 | - openssl=3.0.10=h7f8727e_2 37 | - pip=23.2.1=py38h06a4308_0 38 | - pooch=1.4.0=pyhd3eb1b0_0 39 | - pycparser=2.21=pyhd3eb1b0_0 40 | - pyopenssl=23.2.0=py38h06a4308_0 41 | - pysocks=1.7.1=py38h06a4308_0 42 | - python=3.8.17=h955ad1f_0 43 | - readline=8.2=h5eee18b_0 44 | - sqlite=3.41.2=h5eee18b_0 45 | - tbb=2021.8.0=hdb19cb5_0 46 | - tk=8.6.12=h1ccaba5_0 47 | - wheel=0.38.4=py38h06a4308_0 48 | - xz=5.4.2=h5eee18b_0 49 | - zlib=1.2.13=h5eee18b_0 50 | - pip: 51 | - absl-py==1.4.0 52 | - accelerate==0.21.0 53 | - addict==2.4.0 54 | - aiohttp==3.8.5 55 | - aiosignal==1.3.1 56 | - aliyun-python-sdk-core==2.13.36 57 | - aliyun-python-sdk-kms==2.16.1 58 | - ansi2html==1.8.0 59 | - antlr4-python3-runtime==4.9.3 60 | - anyio==3.7.1 61 | - argon2-cffi==23.1.0 62 | - argon2-cffi-bindings==21.2.0 63 | - arrow==1.2.3 64 | - asttokens==2.2.1 65 | - async-lru==2.0.4 66 | - async-timeout==4.0.2 67 | - attrs==23.1.0 68 | - babel==2.12.1 69 | - backcall==0.2.0 70 | - beautifulsoup4==4.12.2 71 | - bert-score 72 | - bitsandbytes==0.41.1 73 | - black==23.7.0 74 | - bleach==6.0.0 75 | - cachetools==5.3.1 76 | - cchardet==2.1.7 77 | - chardet==5.2.0 78 | - charset-normalizer==3.2.0 79 | - click==8.1.6 80 | - cmake==3.27.0 81 | - colorama==0.4.6 82 | - colorlog==6.7.0 83 | - comm==0.1.4 84 | - configargparse==1.7 85 | - contourpy==1.1.0 86 | # - cosine-annealing-warmup # no need for torch lightning 87 | - 
crcmod==1.7 88 | - cycler==0.11.0 89 | - dash==2.13.0 90 | - dash-core-components==2.0.0 91 | - dash-html-components==2.0.0 92 | - dash-table==5.0.0 93 | - datasets==2.14.3 94 | - debugpy==1.6.7.post1 95 | - decorator==5.1.1 96 | - defusedxml==0.7.1 97 | - descartes==1.1.0 98 | - dill==0.3.7 99 | - docker-pycreds==0.4.0 100 | - evaluate==0.4.0 101 | - exceptiongroup==1.1.3 102 | - executing==1.2.0 103 | - fastjsonschema==2.18.0 104 | - filelock==3.12.2 105 | - fire==0.5.0 106 | - flake8==6.1.0 107 | - flask==2.2.5 108 | - fonttools==4.42.0 109 | - fqdn==1.5.1 110 | - frozenlist==1.4.0 111 | - fsspec==2023.6.0 112 | - gitdb==4.0.10 113 | - gitpython==3.1.32 114 | - google-auth 115 | - google-auth-oauthlib 116 | - grpcio==1.56.2 117 | - huggingface-hub==0.16.4 118 | - hydra-core==1.3.2 119 | - imageio==2.31.1 120 | - importlib-metadata==6.8.0 121 | - importlib-resources==6.0.0 122 | - iniconfig==2.0.0 123 | - inquirerpy==0.3.4 124 | - ipykernel==6.25.1 125 | - ipython==8.12.2 126 | - ipython-genutils==0.2.0 127 | - ipywidgets==8.1.0 128 | - isoduration==20.11.0 129 | - itsdangerous==2.1.2 130 | - jedi==0.19.0 131 | - jinja2==3.1.2 132 | - jmespath==0.10.0 133 | - joblib==1.3.1 134 | - json5==0.9.14 135 | - jsonpointer==2.4 136 | - jsonschema==4.19.0 137 | - jsonschema-specifications==2023.7.1 138 | - jupyter==1.0.0 139 | - jupyter-client==8.3.1 140 | - jupyter-console==6.6.3 141 | - jupyter-core==5.3.1 142 | - jupyter-events==0.7.0 143 | - jupyter-lsp==2.2.0 144 | - jupyter-server==2.7.2 145 | - jupyter-server-terminals==0.4.4 146 | - jupyterlab==4.0.5 147 | - jupyterlab-pygments==0.2.2 148 | - jupyterlab-server==2.24.0 149 | - jupyterlab-widgets==3.0.8 150 | - kiwisolver==1.4.4 151 | # - language-evaluation # should install separately 152 | - lazy-loader==0.3 153 | - lightning-utilities==0.9.0 154 | - line-profiler==4.0.3 155 | - lit==16.0.6 156 | - llvmlite==0.31.0 157 | - lyft-dataset-sdk==0.0.8 158 | - markdown==3.4.4 159 | - markdown-it-py==3.0.0 160 | - markupsafe==2.1.3 161 | - matplotlib==3.5.2 162 | - matplotlib-inline==0.1.6 163 | - mccabe==0.7.0 164 | - mdurl==0.1.2 165 | - mistune==2.0.5 166 | # - mmcv==1.4.0 # no need for loading drivelm, need for loading nuScenes 167 | # - mmdet==2.14.0 # no need for loading drivelm, need for loading nuScenes 168 | # - mmengine==0.8.4 # no need for loading drivelm, need for loading nuScenes 169 | # - mmsegmentation==0.14.1 # no need for loading drivelm, need for loading nuScenes 170 | - model-index==0.1.11 171 | - more-itertools==10.1.0 172 | - mpmath==1.3.0 173 | - multidict==6.0.4 174 | - multiprocess==0.70.15 175 | - mypy-extensions==1.0.0 176 | - nbclient==0.8.0 177 | - nbconvert==7.4.0 178 | - nbformat==5.5.0 179 | - nest-asyncio==1.5.7 180 | - networkx==2.2 181 | - nltk==3.8.1 182 | - notebook==7.0.2 183 | - notebook-shim==0.2.3 184 | - numba==0.48.0 185 | - numpy 186 | - nuscenes-devkit==1.1.10 187 | - nvidia-cublas-cu11==11.10.3.66 188 | - nvidia-cuda-cupti-cu11==11.7.101 189 | - nvidia-cuda-nvrtc-cu11==11.7.99 190 | - nvidia-cuda-runtime-cu11==11.7.99 191 | - nvidia-cudnn-cu11==8.5.0.96 192 | - nvidia-cufft-cu11==10.9.0.58 193 | - nvidia-curand-cu11==10.2.10.91 194 | - nvidia-cusolver-cu11==11.4.0.1 195 | - nvidia-cusparse-cu11==11.7.4.91 196 | - nvidia-nccl-cu11==2.14.3 197 | - nvidia-nvtx-cu11==11.7.91 198 | - oauthlib==3.2.2 199 | - omegaconf==2.3.0 200 | # - open3d # visualization packaage, no need for loading drivelm 201 | - opencv-python==4.8.0.74 202 | # - opendatalab==0.0.10 # data download tools, no need for loading drivelm 
203 | # - openmim==0.3.9 # mmlab package manager, no need for loading drivelm 204 | # - openxlab==0.0.22 # mmlab package, no need for loading drivelm 205 | - ordered-set==4.1.0 206 | - oss2==2.17.0 207 | - overrides==7.4.0 208 | - packaging==23.1 209 | - pandas==1.4.4 210 | - pandocfilters==1.5.0 211 | - parso==0.8.3 212 | - pathspec==0.11.2 213 | - pathtools==0.1.2 214 | - peft==0.4.0 215 | - pexpect==4.8.0 216 | - pfzy==0.3.4 217 | - pickleshare==0.7.5 218 | - pillow==10.0.0 219 | - pkgutil-resolve-name==1.3.10 220 | - platformdirs==3.10.0 221 | - plotly==5.16.1 222 | - pluggy==1.3.0 223 | - plyfile==1.0.1 224 | - prettytable==3.8.0 225 | - prometheus-client==0.17.1 226 | - prompt-toolkit==3.0.39 227 | - protobuf==4.23.4 228 | - psutil==5.9.5 229 | - ptyprocess==0.7.0 230 | - pure-eval==0.2.2 231 | - pyarrow==12.0.1 232 | - pyasn1==0.5.0 233 | - pyasn1-modules==0.3.0 234 | - pycocotools==2.0.7 235 | - pycodestyle==2.11.0 236 | - pycryptodome==3.18.0 237 | - pydeprecate==0.3.2 238 | - pyflakes==3.1.0 239 | - pygments==2.16.1 240 | - pyparsing==3.0.9 241 | - pyquaternion==0.9.9 242 | - pytest==7.4.0 243 | - python-dateutil==2.8.2 244 | - python-json-logger==2.0.7 245 | - pytorch-lightning==1.7.0 246 | - pytz==2023.3 247 | - pywavelets==1.4.1 248 | - pyyaml==6.0.1 249 | - pyzmq==25.1.1 250 | - qtconsole==5.4.3 251 | - qtpy==2.4.0 252 | - referencing==0.30.2 253 | - regex==2023.6.3 254 | - requests 255 | - requests-oauthlib 256 | - responses==0.18.0 257 | - retrying==1.3.4 258 | - rfc3339-validator==0.1.4 259 | - rfc3986-validator==0.1.1 260 | - rich==13.4.2 261 | - rouge-score==0.1.2 262 | - rpds-py==0.10.0 263 | - rsa==4.9 264 | - safetensors==0.3.1 265 | - scikit-image==0.19.3 266 | - scikit-learn==1.3.0 267 | - scipy==1.7.3 268 | - send2trash==1.8.2 269 | - sentencepiece==0.1.99 270 | - sentry-sdk==1.29.2 271 | - setproctitle==1.3.2 272 | - setuptools==60.2.0 273 | - shapely==1.8.5 274 | - six==1.16.0 275 | - smmap==5.0.0 276 | - sniffio==1.3.0 277 | - soupsieve==2.4.1 278 | - stack-data==0.6.2 279 | - sympy==1.12 280 | - tabulate==0.9.0 281 | - tenacity==8.2.3 282 | - tensorboard==2.13.0 283 | - tensorboard-data-server==0.7.1 284 | - termcolor==2.3.0 285 | - terminado==0.17.1 286 | - terminaltables==3.1.10 287 | - threadpoolctl==3.2.0 288 | - tifffile==2023.7.10 289 | - tinycss2==1.2.1 290 | - tokenizers==0.13.3 291 | - tomli==2.0.1 292 | - torch==2.0.1 293 | - torchaudio==2.0.2 294 | - torchmetrics==0.11.1 295 | - torchvision==0.15.2 296 | - tornado==6.3.3 297 | - tqdm==4.65.0 298 | - traitlets==5.9.0 299 | - transformers==4.31.0 300 | - trimesh==2.35.39 301 | - triton==2.0.0 302 | - typing-extensions==4.7.1 303 | - tzdata==2023.3 304 | - uri-template==1.3.0 305 | - urllib3==2.0.4 306 | - wandb==0.15.8 307 | - wcwidth==0.2.6 308 | - webcolors==1.13 309 | - webencodings==0.5.1 310 | - websocket-client==1.6.2 311 | - werkzeug==2.2.3 312 | - widgetsnbextension==4.0.8 313 | - xxhash==3.3.0 314 | - yapf==0.40.1 315 | - yarl==1.9.2 316 | - zipp==3.16.2 317 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |

You will be redirected to opendrivelab.com/DriveLM/ soon!

8 | 9 | 10 | --------------------------------------------------------------------------------