├── .gitignore ├── LICENSE ├── README.md ├── figures ├── logo_Infinigence-ai.png └── logo_nicsefc.jpg ├── main.py ├── qllm_eval ├── evaluation │ ├── __init__.py │ ├── q_dialogue │ │ ├── README.md │ │ ├── fschat_dat │ │ │ ├── judge_prompts.jsonl │ │ │ ├── mt_bench │ │ │ │ ├── misc │ │ │ │ │ └── radar.png │ │ │ │ ├── model_answer │ │ │ │ │ └── llama2-7b-chat_quant_w_4_kv_4.jsonl │ │ │ │ ├── question.jsonl │ │ │ │ └── reference_answer │ │ │ │ │ └── gpt-4.jsonl │ │ │ └── vicuna_bench │ │ │ │ ├── question.jsonl │ │ │ │ └── reference_answer │ │ │ │ └── gpt-4.jsonl │ │ ├── gen_judgment.py │ │ ├── gen_model_answer.py │ │ └── show_result.py │ ├── q_ethic │ │ ├── README.md │ │ └── main.py │ ├── q_harness │ │ ├── README.md │ │ ├── __init__.py │ │ ├── lm_eval_adaptor.py │ │ └── main.py │ ├── q_long │ │ ├── README.md │ │ ├── __init__.py │ │ ├── longeval │ │ │ ├── LICENSE │ │ │ ├── __init__.py │ │ │ ├── generate_testcase.py │ │ │ ├── generate_testcases_configs.yaml │ │ │ └── utils.py │ │ ├── main_litm.py │ │ └── main_longeval.py │ └── q_opencompass │ │ ├── README.md │ │ ├── main.py │ │ ├── tasks │ │ ├── __init__.py │ │ └── openicl_infer_quant.py │ │ └── utils │ │ ├── __init__.py │ │ ├── build.py │ │ ├── models.py │ │ └── run.py ├── methods │ ├── __init__.py │ └── rep │ │ ├── __init__.py │ │ ├── apply_rep.py │ │ └── utils.py ├── quantization │ ├── __init__.py │ ├── qattn │ │ ├── __init__.py │ │ └── sw │ │ │ ├── __init__.py │ │ │ ├── bloom_attn.py │ │ │ ├── deepseekv2_attn.py │ │ │ ├── falcon_attn.py │ │ │ ├── flash_attn_triton.py │ │ │ ├── gemma_attn.py │ │ │ ├── glm2_attn.py │ │ │ ├── glm3_attn.py │ │ │ ├── llama_attn.py │ │ │ ├── mistral_attn.py │ │ │ ├── mixtral_attn.py │ │ │ ├── mpt_attn.py │ │ │ ├── opt_attn.py │ │ │ └── stable_attn.py │ ├── qlinear │ │ ├── __init__.py │ │ └── sqwa.py │ ├── quant_funcs.py │ └── quant_wrapper.py ├── utils.py └── visualization │ ├── __init__.py │ ├── basic │ ├── assets │ │ └── lambada_w_only.py │ ├── heatmap_plot.py │ ├── plot.py │ ├── ppl_distribution_plot.py │ ├── radar_plot.py │ ├── radar_ratio_range_plot.py │ ├── storage_overhead_barplot.py │ ├── storage_overhead_curveplot.py │ ├── storage_overhead_plot.py │ └── utils.py │ ├── dialogue │ ├── assets │ │ ├── __init__.py │ │ ├── mt_bench_kv_cache.py │ │ ├── mt_bench_w_a.py │ │ └── mt_bench_w_only.py │ └── gen_table.py │ ├── emergent │ ├── emergent │ │ ├── arc-c_kv.txt │ │ ├── arc-c_w.txt │ │ ├── arc-c_wa.txt │ │ ├── arc-e_kv.txt │ │ ├── arc-e_w.txt │ │ ├── arc-e_wa.txt │ │ ├── ceval_kv.txt │ │ ├── ceval_w.txt │ │ ├── ceval_wa.txt │ │ ├── gsm8k_kv.txt │ │ ├── gsm8k_w.txt │ │ ├── gsm8k_wa.txt │ │ ├── hellaswag_kv.txt │ │ ├── hellaswag_w.txt │ │ ├── hellaswag_wa.txt │ │ ├── mmlu-calib_kv.txt │ │ ├── mmlu-calib_w.txt │ │ ├── mmlu-calib_wa.txt │ │ ├── mmlu_kv.txt │ │ ├── mmlu_w.txt │ │ ├── mmlu_wa.txt │ │ ├── strategyqa_kv.txt │ │ ├── strategyqa_w.txt │ │ └── strategyqa_wa.txt │ ├── plot_ability.py │ └── plot_other_set.py │ ├── long │ ├── litm │ │ ├── plot.py │ │ └── pos │ │ │ ├── kv_length_result.py │ │ │ ├── w_length_result.py │ │ │ └── wa_length_result.py │ └── longeval │ │ ├── length │ │ ├── kv_length_result.py │ │ ├── w_length_result.py │ │ ├── wa_length_result.py │ │ └── wkv_length_result.py │ │ ├── plot.py │ │ └── pos │ │ ├── kv_length_result.py │ │ ├── w_length_result.py │ │ ├── wa_length_result.py │ │ └── wkv_length_result.py │ └── trustworthiness │ ├── adv_glue │ ├── assets │ │ ├── hf_chatglm3_6b │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── 
w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_chatglm3_6b_32k │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_falcon_40b_instruct │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_falcon_7b_instruct │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_llama2_13b_chat │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_llama2_70b_chat │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_llama2_7b_chat │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_mistral_7b_instruct │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ └── hf_mixtral_8x7b_instruct │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ └── proc_results.py │ ├── ethics │ ├── ethics │ │ ├── commonsense │ │ │ ├── kv_result.py │ │ │ ├── w_result.py │ │ │ └── wa_result.py │ │ ├── deontology │ │ │ ├── kv_result.py │ │ │ ├── w_result.py │ │ │ └── wa_result.py │ │ ├── justice │ │ │ ├── kv_result.py │ │ │ ├── w_result.py │ │ │ └── wa_result.py │ │ └── virtue │ │ │ ├── kv_result.py │ │ │ ├── w_result.py │ │ │ └── wa_result.py │ └── plot.py │ └── truthfulqa │ ├── plot.py │ └── results │ ├── kv_mc1_result.py │ ├── kv_mc2_result.py │ ├── w_mc1_result.py │ ├── w_mc2_result.py │ ├── wa_mc1_result.py │ └── wa_mc2_result.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.vscode/ 6 | *outputs* 7 | *bkp* 8 | test/ 9 | statistic/ 10 | statistic_tensor 11 | *.sh* 12 | *.zip* 13 | *.tar.gz* 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
169 | .idea/ 170 | 171 | # OpenCompass outputs 172 | outputs/ 173 | # OpenCompass data 174 | data/ 175 | # OpenCompass intermediate files 176 | tmp/ 177 | icl_inference_output/ 178 | # OpenCompass config files 179 | qllm_eval/evaluation/q_opencompass/configs/ 180 | # OpenCompass Evaluation Scripts 181 | qllm_eval/evaluation/q_opencompass/scripts/ 182 | 183 | # OpenCompass tools for listing configs 184 | tools/ 185 | 186 | # vim cache files 187 | *.swp 188 | 189 | # visualization related files 190 | ## basic capacity plots 191 | qllm_eval/visualization/basic/results/ 192 | qllm_eval/visualization/basic/assets/* 193 | !qllm_eval/visualization/basic/assets/lambada_w_only.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2024] [Shiyao Li, Xuefei Ning, Luning Wang, Tengxuan Liu, Xiangsheng Shi, Shengen Yan, Guohao Dai, Huazhong Yang, and Yu Wang] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /figures/logo_Infinigence-ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/figures/logo_Infinigence-ai.png -------------------------------------------------------------------------------- /figures/logo_nicsefc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/figures/logo_nicsefc.jpg -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from qllm_eval.quantization.quant_wrapper import quantize_model 5 | from qllm_eval.utils import build_model_and_enc 6 | from qllm_eval.methods.rep.apply_rep import apply_awq 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--model_path", type=str, help="path of the hf model") 10 | parser.add_argument("--output_path", type=str, help="path to save the quantized model") 11 | parser.add_argument("--rep_file", type=str, help="path to load the reparameterization factors") 12 | parser.add_argument("--use_flash_attn", action="store_true") 13 | parser.add_argument("--w_group_size", type=int, default=128) 14 | parser.add_argument("--w_bit", type=int, default=16) 15 | parser.add_argument("--a_group_size", type=int, default=128) 16 | parser.add_argument("--a_bit", type=int, default=16) 17 | parser.add_argument("--kv_group_size", type=int, default=128) 18 | parser.add_argument("--kv_bit", type=int, default=16) 19 | args = parser.parse_args() 20 | 21 | 22 | def main(): 23 | # a hack here to auto set model group 24 | model, enc = build_model_and_enc(args.model_path, args.use_flash_attn, args.kv_bit, args.kv_group_size) 25 | 26 | if args.rep_file: 27 | rep_results = torch.load(args.rep_file, map_location="cpu") 28 | apply_awq(model, rep_results) 29 | 30 | # quantize model 31 | model = quantize_model(model, args) 32 | 33 | # save the quantized model 34 | if args.output_path: 35 | model.save_pretrained(args.output_path, safe_serialization=False) 36 | enc.save_pretrained(args.output_path) 37 | 38 | # evaluation 39 | # TODO: add evaluation functions 40 | prompt = "Hello, my name is human, and I like drinking" 41 | input_ids = enc(prompt, return_tensors="pt")['input_ids'].to(next(model.parameters()).device) 42 | output = model.generate(input_ids, do_sample=True, max_length=50, top_p=0.95, top_k=60) 43 | print(enc.decode(output[0])) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /qllm_eval/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/__init__.py -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_dialogue/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation with LLM_judge 2 | Basic instructions on evaluating quantized LLMs with [LLM_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). 
3 | 
4 | > Note that this repository only contains the code required by our experiments, including code for quantized LLMs' dialogue generation and "single"-mode GPT-4 judgement. Also be aware that the scripts are adapted from the original ones provided by [LLM_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge), so the script arguments are not exactly the same; please follow this document's instructions whenever they conflict. If you want to learn more, please refer to the original [LLM_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) repository.
5 | 
6 | ## Installation
7 | Change directory to `QLLM-Evaluation` and run the following commands to install our repository's requirements:
8 | ```
9 | cd path/to/QLLM-Evaluation
10 | pip install -e .
11 | pip install -r requirements.txt
12 | ```
13 | You don't need to download the FastChat source code, since the `fschat` package is already included in our `requirements.txt`.
14 | 
15 | ## Evaluate a quantized model on MT-bench
16 | ### Generate model answers to MT-bench questions
17 | Change directory to `QLLM-Evaluation/qllm_eval/evaluation/q_dialogue` and run `gen_model_answer.py`:
18 | ```
19 | python gen_model_answer.py \
20 |     --model-path [MODEL-PATH] \
21 |     --model-id [MODEL-ID] \
22 |     [--quant] \
23 |     [--w_bit [W-BIT]] \
24 |     [--w_group_size [W-GROUP-SIZE]] \
25 |     [--a_bit [A-BIT]] \
26 |     [--a_group_size [A-GROUP-SIZE]] \
27 |     [--kv_bit [KV-BIT]] \
28 |     [--kv_group_size [KV-GROUP-SIZE]] \
29 |     [--rep_file [REP-FILE]] \
30 |     [--use_flash_attn]
31 | ```
32 | The following arguments are required:
33 | - `[MODEL-PATH]` is the path to the weights, which can be a local folder or a HuggingFace repo ID.
34 | - `[MODEL-ID]` is a name you give to the model.
35 | 
36 | The following arguments are optional:
37 | - `--quant` indicates whether you want to generate dialogues using a quantized model.
38 | - `[W-BIT]`, `[A-BIT]`, `[KV-BIT]` are the quantization bit-widths for weight, activation and KV cache; all default to 16.
39 | - `[W-GROUP-SIZE]`, `[A-GROUP-SIZE]`, `[KV-GROUP-SIZE]` are the group sizes for group-wise quantization; all default to 128.
40 | - `[REP-FILE]` is the path to the cache file if you want to use AWQ or SmoothQuant.
41 | - `--use_flash_attn` indicates whether you want to use flash-attention to save memory and speed up decoding.
42 | 
43 | The answers will be saved to: `./fschat_dat/mt_bench/model_answer/[MODEL-ID].jsonl`.
44 | 
45 | > Also, please note that the `--model-id` argument should match one of [FastChat's supported models](https://github.com/lm-sys/FastChat/blob/main/docs/model_support.md) to get the proper prompt template. The matching rule of each supported model can be found in [model_adapter.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_adapter.py). For example, `Llama2Adapter`'s matching rule is defined as:
46 | > ```python
47 | > def match(self, model_path: str):
48 | >     return "llama-2" in model_path.lower()
49 | > ```
50 | > Therefore, when evaluating llama-2 models, you should specify a `--model-id` argument containing the pattern "llama-2".
51 | > 
52 | > For unsupported models, you can either add support for your model according to [model_support.md](https://github.com/lm-sys/FastChat/blob/main/docs/model_support.md), or just use the default model adapter without any modification.
53 | 
54 | e.g.
55 | ```
56 | python gen_model_answer.py \
57 |     --model-path meta-llama/Llama-2-7b-chat-hf \
58 |     --model-id llama-2-7b-chat_quant_w_4_a_4 \
59 |     --quant \
60 |     --w_bit 4 \
61 |     --a_bit 4 \
62 |     --use_flash_attn
63 | ```
64 | 
65 | ### Generate GPT-4 judgements
66 | We only support single-answer grading here. This mode asks GPT-4 to grade and score the model's answer directly, without pairwise comparison. For each turn, GPT-4 gives a score on a scale of 10; we then compute the average score over all turns. Please note that while the original script supports passing more than one model_id to the `--model-list` argument, we recommend passing only one model_id each time for clarity.
67 | ```
68 | python gen_judgment.py \
69 |     --model-list [MODEL-ID] \
70 |     --save_name [MODEL-SAVENAME] \
71 |     [--parallel [PARALLEL]]
72 | ```
73 | The evaluation results will be saved to: `./fschat_dat/mt_bench/model_judgement/[MODEL-SAVENAME].jsonl`. Here we recommend setting `[MODEL-SAVENAME]` the same as `[MODEL-ID]` for simplicity.
74 | 
75 | e.g.
76 | ```
77 | python gen_judgment.py \
78 |     --model-list llama-2-7b-chat_quant_w_4_a_4 \
79 |     --parallel 4 \
80 |     --save_name llama-2-7b-chat_quant_w_4_a_4
81 | ```
82 | 
83 | ### Show MT-bench scores
84 | You can show all the available results by simply running:
85 | ```
86 | python show_result.py
87 | ```
88 | If you want to show results only for model_ids matching a specified pattern, you can pass the `--only_show` argument. For example, if you only want to see the results of llama models:
89 | ```
90 | python show_result.py --only_show llama
91 | ```
92 | 
93 | ### Results
94 | We open-source our results in [this directory](https://github.com/LSY-noya/QLLM-Evaluation/tree/main/qllm_eval/visualization/dialogue/assets), corresponding to the data we present in our paper.
95 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_dialogue/fschat_dat/mt_bench/misc/radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/q_dialogue/fschat_dat/mt_bench/misc/radar.png -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_dialogue/fschat_dat/mt_bench/model_answer/llama2-7b-chat_quant_w_4_kv_4.jsonl: -------------------------------------------------------------------------------- 1 | {"question_id": 95, "answer_id": "28UCTvfjRRMHyiD5H2tUFn", "model_id": "llama2-7b-chat_quant_w_4_kv_4", "choices": [{"index": 0, "turns": ["Thank you for entrusting me with this task! I'm happy to help you improve your language skills. The text you provided translates to \"I'm so excited for the party! I can't wait to see my friends!\" in English.\nHowever, I noticed a few minor errors in the original text. \"\u8863\u5e26\" should be \"clothes,\" \"\u6e10\u5bbd\" should be \"wide,\" and \"\u7ec8\u4e0d\u6094\" should be \"won't regret.\" Here's the refined and polished translation:\n\"I'm so excited for the party! I can't wait to see my friends and have a great time with them. I won't regret it!\"\nI hope this helps, and please feel free to ask me any questions or provide more texts for me to correct and enhance!", "Ah, I see! \"Ich verstehe\" is German for \"I understand.\" In English, we would say \"I understand.\" So, \"Ich verstehe\" can be translated to \"I understand.\" Is there anything else you'd like me to help you with?"]}], "tstamp": 1706367441.4954596}
\"Ich verstehe\" is German for \"I understand.\" In English, we would say \"I understand.\" So, \"Ich verstehe\" can be translated to \"I understand.\" Is there anything else you'd like me to help you with?"]}], "tstamp": 1706367441.4954596} 2 | -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_ethic/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation with ETHICS benchmark 2 | Basic instructions on evaluating quantized LLMs with ETHICS benchmark. 3 | 4 | ## Run ETHICS benchmark 5 | We implement an end-to-end evaluation flow for `ETHICS` benchmark. You can use the following command below to evaluate four subdatasets, including `commonsense`, `deontology`, `justice`, and `virtue`. 6 | 7 | ``` 8 | CUDA_VISIBLE_DEVICES=0 python main.py \ 9 | --model_path /Your/LLM/Path --output_dir /Your/Path/to/Results \ 10 | --w_group_size w_group_size --w_bit w_bit \ 11 | --a_group_size a_group_size --a_bit a_bit \ 12 | --kv_group_size kv_group_size --kv_bit kv_bit 13 | ``` 14 | 15 | > The evaluation results can be found in `/Your/Path/to/Results`. 16 | 17 | > Here, the main.py script is in `qllm_eval/evaluation/q_ethic`. 18 | -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_harness/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation with lm_evaluation_harness 2 | Basic instructions on evaluating quantized LLMs with [lm_evaluation_harness](https://github.com/EleutherAI/lm-evaluation-harness). 3 | 4 | ## Installation 5 | Install the lm_eval package: 6 | ``` 7 | pip install lm_eval==0.3.0 8 | ``` 9 | The package `lm_eval` has been included in our `requirements.txt`. 10 | 11 | 12 | ## Use lm_eval for evaluation 13 | Use the main.py in `qllm_eval/evaluation/q_harness` as an example, you can use the `--tasks A,B,C` to select different tasks for evaluation, where the `A,B,C` represents three different tasks in `lm_eval`. 14 | 15 | ``` 16 | CUDA_VISIBLE_DEVICES=0 python main.py \ 17 | --model_path /Your/LLM/Path --tasks A,B,C \ 18 | --w_group_size w_group_size --w_bit w_bit \ 19 | --a_group_size a_group_size --a_bit a_bit \ 20 | --kv_group_size kv_group_size --kv_bit kv_bit 21 | ``` 22 | 23 | > The tasks supported by `lm_eval` can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md). 24 | 25 | > In our paper, we use `--tasks truthfulqa_mc` to evaluate quantized LLMs on the `TruthfulQA` dataset. 
26 | 27 | -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_harness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/q_harness/__init__.py -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_harness/lm_eval_adaptor.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import torch 3 | from lm_eval.base import BaseLM 4 | 5 | 6 | class LMEvalAdaptor(BaseLM): 7 | 8 | def __init__(self, model_name, model, tokenizer, batch_size=1, max_length=-1): 9 | super().__init__() 10 | 11 | assert isinstance(batch_size, int) 12 | 13 | self.model_name = model_name 14 | self.model = model 15 | self.model.eval() 16 | 17 | self.tokenizer = tokenizer 18 | 19 | # assert isinstance(self.tokenizer, ( 20 | # transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast, 21 | # transformers.T5Tokenizer, transformers.T5TokenizerFast, 22 | # )), "this tokenizer has not been checked for compatibility yet!" 23 | 24 | self.vocab_size = self.tokenizer.vocab_size 25 | 26 | self._batch_size = batch_size 27 | 28 | self._max_length = max_length 29 | 30 | @property 31 | def eot_token_id(self): 32 | # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* 33 | return self.tokenizer.eos_token_id 34 | 35 | @property 36 | def max_length(self): 37 | if self._max_length != -1: 38 | return self._max_length 39 | if hasattr(self.model.config, 'n_ctx'): 40 | return self.model.config.n_ctx 41 | elif hasattr(self.model.config, 'max_position_embeddings'): 42 | return self.model.config.max_position_embeddings 43 | elif hasattr(self.model.config, 'n_positions'): 44 | return self.model.config.n_positions 45 | elif 'bloom' in self.model_name: 46 | return 2048 47 | elif 'llama' in self.model_name: 48 | return 2048 # TODO: did not check this 49 | else: 50 | print(self.model.config) 51 | raise NotImplementedError 52 | 53 | @property 54 | def max_gen_toks(self): 55 | return 256 56 | 57 | @property 58 | def batch_size(self): 59 | return self._batch_size 60 | 61 | @property 62 | def device(self): 63 | return "cuda" 64 | 65 | def tok_encode(self, string: str): 66 | return self.tokenizer.encode(string, add_special_tokens=False) 67 | 68 | def tok_decode(self, tokens): 69 | return self.tokenizer.decode(tokens) 70 | 71 | def _model_call(self, inps): 72 | """ 73 | inps: a torch tensor of shape [batch, sequence] 74 | the size of sequence may vary from call to call 75 | 76 | returns: a torch tensor of shape [batch, sequence, vocab] with the 77 | logits returned from the model 78 | """ 79 | with torch.no_grad(): 80 | if isinstance(self.model, transformers.models.t5.modeling_t5.T5ForConditionalGeneration): 81 | dec_inps = torch.cat( 82 | [ 83 | torch.tensor( 84 | self.model.generation_config.decoder_start_token_id, 85 | ) 86 | .tile(len(inps), 1) 87 | .to(inps), 88 | inps, 89 | ], 90 | dim=1, 91 | ) 92 | 93 | kwargs = {"decoder_input_ids": dec_inps,} 94 | else: 95 | kwargs = {} 96 | out = self.model(inps, **kwargs)[0] 97 | if "opt" in self.model_name: # there are a few extra tokens in opt, which we should omit 98 | return out[:, :, :50257] 99 | else: 100 | return out # [:, :, :self.tokenizer.vocab_size] 101 | 102 | def _model_generate(self, context, max_length, eos_token_id): 103 | return 
self.model.generate(
104 |             context,
105 |             max_length=max_length,
106 |             eos_token_id=eos_token_id,
107 |             do_sample=False
108 |         )
109 | 
110 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_harness/main.py: -------------------------------------------------------------------------------- 1 | import os
2 | import json
3 | import argparse
4 | 
5 | from qllm_eval.quantization.quant_wrapper import quantize_model
6 | from qllm_eval.utils import build_model_and_enc
7 | from qllm_eval.evaluation.q_harness.lm_eval_adaptor import LMEvalAdaptor
8 | from lm_eval import evaluator
9 | 
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("--model_path", type=str, help="path of the hf model")
12 | parser.add_argument("--output_path", type=str, help="path to save the quantized model")
13 | parser.add_argument("--use_flash_attn", action="store_true")
14 | parser.add_argument("--tasks", type=str, default="truthfulqa_mc")
15 | parser.add_argument("--metrics", type=str, default="mc1,mc2")
16 | parser.add_argument("--w_group_size", type=int, default=128)
17 | parser.add_argument("--w_bit", type=int, default=16)
18 | parser.add_argument("--a_group_size", type=int, default=128)
19 | parser.add_argument("--a_bit", type=int, default=16)
20 | parser.add_argument("--kv_group_size", type=int, default=128)
21 | parser.add_argument("--kv_bit", type=int, default=16)
22 | args = parser.parse_args()
23 | 
24 | 
25 | def main():
26 |     print("* Quantization Format: kv_{}_w_{}_a_{}".format(args.kv_bit, args.w_bit, args.a_bit))
27 |     if 'falcon' in args.model_path.lower():
28 |         args.kv_group_size = 64
29 |         args.w_group_size = 64
30 | 
31 |     # a hack here to auto set model group
32 |     model, enc = build_model_and_enc(args.model_path, args.use_flash_attn, args.kv_bit, args.kv_group_size)
33 | 
34 |     # quantize model
35 |     model = quantize_model(model, args)
36 | 
37 |     # # save the quantized model
38 |     # if args.output_path:
39 |     #     model.save_pretrained(args.output_path, safe_serialization=False)
40 |     #     enc.save_pretrained(args.output_path)
41 | 
42 |     # evaluation
43 |     lm_eval_model = LMEvalAdaptor(args.model_path, model, enc, 1)
44 | 
45 |     if args.tasks is not None:
46 |         task_names = args.tasks.split(",")
47 | 
48 |         results = evaluator.simple_evaluate(
49 |             model=lm_eval_model,
50 |             tasks=task_names,
51 |             batch_size=1,
52 |             no_cache=True,
53 |             num_fewshot=0,
54 |         )
55 |         # print(results)
56 |         # print(evaluator.make_table(results))
57 |         for task_name in task_names:
58 |             output_path = "{}/{}/kv_{}_w_{}_a_{}.jsonl".format(task_name, args.model_path, args.kv_bit, args.w_bit, args.a_bit)
59 |             print("* Output: ", output_path)
60 |             if not os.path.exists("{}/{}".format(task_name, args.model_path)):
61 |                 os.makedirs("{}/{}".format(task_name, args.model_path))
62 |             with open(output_path, 'w') as f:
63 |                 f.write(json.dumps(results['results'][task_name]) + "\n")
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     main()
68 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation on Long-Context Tasks
2 | Basic instructions on evaluating quantized LLMs with the [LongEval](https://github.com/DachengLi1/LongChat) and [Lost-in-the-middle](https://github.com/nelson-liu/lost-in-the-middle) benchmarks. You need to install the qllm_eval package first.
3 | 
4 | ## For LongEval dataset
5 | 
6 | 1. Generate new key-value retrieval test cases.
7 | ```
8 | cd qllm_eval/evaluation/q_long
9 | 
10 | python longeval/generate_testcase.py longeval/generate_testcases_configs.yaml
11 | ```
12 | 
13 | 2. Evaluation with `LongEval`.
14 | ```
15 | CUDA_VISIBLE_DEVICES=0 python main_longeval.py \
16 |     --model-name-or-path /Your/LLM/Path --use_flash_attn \
17 |     --task lines --test_dir new_cases \
18 |     --w_group_size w_group_size --w_bit w_bit \
19 |     --a_group_size a_group_size --a_bit a_bit \
20 |     --kv_group_size kv_group_size --kv_bit kv_bit
21 | ```
22 | 
23 | 
24 | ## For Lost-in-the-middle dataset
25 | 
26 | 1. Git clone [Lost-in-the-middle](https://github.com/nelson-liu/lost-in-the-middle) and install it locally in the qllm_eval conda environment.
27 | ```
28 | conda activate qllm_eval
29 | git clone git@github.com:nelson-liu/lost-in-the-middle.git
30 | cd lost-in-the-middle
31 | # install the package following the repository's instructions
32 | ```
33 | 
34 | 2. Evaluation with `lost-in-the-middle`.
35 | ```
36 | cd qllm_eval/evaluation/q_long
37 | 
38 | CUDA_VISIBLE_DEVICES=0 python main_litm.py \
39 |     --model_name /Your/LLM/Path --use_flash_attn \
40 |     --w_group_size w_group_size --w_bit w_bit \
41 |     --a_group_size a_group_size --a_bit a_bit \
42 |     --kv_group_size kv_group_size --kv_bit kv_bit \
43 |     --input_path /qa_data/30_total_documents/nq-open-30_total_documents_gold_at_0.jsonl.gz \
44 |     --max_new_tokens 100 --output_path /Your/Path/to/Results
45 | ```
46 | 
47 | > The input data file is `/qa_data/30_total_documents/nq-open-30_total_documents_gold_at_0.jsonl.gz`.
48 | 
49 | > The evaluation results can be found in `/Your/Path/to/Results`.
50 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/q_long/__init__.py -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/longeval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/q_long/longeval/__init__.py -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/longeval/generate_testcase.py: -------------------------------------------------------------------------------- 1 | import os
2 | from utils import generate_lines_testcases, retrieve_cmd_args
3 | 
4 | if __name__ == "__main__":
5 |     cfgs = retrieve_cmd_args()
6 | 
7 |     output_dir = os.path.join(cfgs["output_dir"], cfgs["task"], "testcases/")
8 |     if not os.path.exists(output_dir):
9 |         os.makedirs(output_dir)
10 |     else:
11 |         raise RuntimeError("Output directory already exists! Remove this line to overwrite the original testcases.")
12 | 
13 |     if cfgs["task"] == "topics":
14 |         pass
15 |     else:
16 |         generate_lines_testcases(cfgs, output_dir)
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/longeval/generate_testcases_configs.yaml: -------------------------------------------------------------------------------- 1 | ---
2 | output_dir: new_cases/ # directory to put the generated testcases
3 | task: lines # choose between topics and lines
4 | num_test_samples: 500 # number of testcases for each test
5 | 
6 | 
7 | # lines evaluation options
8 | num_lines: [200,300,400,500,600,680] # number of lines in each testcase
9 | line_idx_opt: LRT-NL # type of line index, choose among LRT, LRT-ABCindex, LRT-UUID, LRT-NL
10 | 
11 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_opencompass/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation with OpenCompass
2 | Basic instructions on evaluating quantized LLMs with [OpenCompass](https://github.com/open-compass/opencompass). You need to install the qllm_eval package first.
3 | 
4 | 
5 | ## Installation
6 | 
7 | 1. Git clone [OpenCompass](https://github.com/open-compass/opencompass) and install it locally in the qllm_eval conda environment. See the [requirements of OpenCompass](https://github.com/open-compass/opencompass/blob/main/requirements.txt).
8 | ```
9 | conda activate qllm
10 | git clone git@github.com:open-compass/opencompass.git
11 | cd opencompass
12 | ```
13 | 
14 | 2. Install the required packages from the source.
15 | 
16 | ```
17 | pip install -e .
18 | ```
19 | 
20 | 3. Note that LLaMA should be installed manually. Take the following steps to ensure LLaMA works properly:
21 | 
22 | ```
23 | git clone https://github.com/facebookresearch/llama.git
24 | cd llama
25 | pip install -r requirements.txt
26 | pip install -e .
27 | ```
28 | 
29 | ## Evaluation
30 | 
31 | 1. Prepare datasets. Change directory to `QLLM-Evaluation/qllm_eval/evaluation/q_opencompass/` and create a new folder:
32 | 
33 | ```
34 | cd qllm_eval/evaluation/q_opencompass
35 | mkdir data
36 | cd data
37 | ```
38 | 
39 | Run the following commands to download the datasets and place them in the `./qllm_eval/evaluation/q_opencompass/data` directory, which completes dataset preparation.
40 | 
41 | ```
42 | # Run in the OpenCompass directory
43 | wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
44 | unzip OpenCompassData-core-20231110.zip
45 | ```
46 | 
47 | You may also use the pre-downloaded zip file, which is located at `/share/datasets/public_datasets/`.
48 | 
49 | 2. Run the following demo command to evaluate `OPT-125m` with weights quantized to 8-bit on the `SuperGLUE_BoolQ_ppl` dataset:
50 | 
51 | ```
52 | cd qllm_eval/evaluation/q_opencompass
53 | CUDA_VISIBLE_DEVICES=0 python main.py --models hf_opt_125m --datasets SuperGLUE_BoolQ_ppl --work-dir ./outputs/debug/api_test --w_bit 8
54 | ```
55 | 
56 | 3. If you want to evaluate models with different quantization settings, please modify `./qllm_eval/evaluation/q_opencompass/utils/build.py`. If you want to support new datasets and new models, please add their configs to `./qllm_eval/evaluation/q_opencompass/configs`; the original configs can be found in the OpenCompass repo.
57 | 
58 | * In particular, if you want to evaluate models with the KV cache quantized, please modify the imported model class in the model configuration file.
We provide the class `HuggingFaceCausalLM_` for this specific need.
59 | 
60 | ```python
61 | from qllm_eval.evaluation.q_opencompass.utils.models import HuggingFaceCausalLM_
62 | ```
63 | 
64 | ## Reference Table for Evaluation Failure Cases
65 | 
66 | From time to time we get unexpected evaluation results from OpenCompass. Hopefully this section can help you solve the problem quickly.
67 | 
68 | 1. Evaluation failure due to unparsed model outputs.
69 | 
70 | When you evaluate a quantized model on a generation task, the model might output paired curly braces, which may be loaded as a dict and cause errors in the subsequent string processing. In this case, you can modify the local opencompass package to avoid this:
71 | 
72 | ```
73 | opencompass/opencompass/tasks/openicl_eval.py
74 | ```
75 | 
76 | Adding a `try-except` for exception handling might be helpful, as sketched below.
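The following is only a hypothetical sketch of such a patch: the exact surrounding code depends on your OpenCompass version, and `pred` / `postprocess` are stand-in names for the prediction string and the existing post-processing call in `openicl_eval.py`.

```python
# Hypothetical sketch: wrap the per-prediction post-processing in a try-except
# so that outputs containing stray curly braces don't crash the whole task.
try:
    pred = postprocess(pred)  # the existing post-processing call
except Exception:
    pred = str(pred)  # fall back to the raw output string
```
77 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_opencompass/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .openicl_infer_quant import * # noqa: F401, F403
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_opencompass/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import * # noqa
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_opencompass/utils/build.py: -------------------------------------------------------------------------------- 1 | import copy
2 | import argparse
3 | 
4 | import torch
5 | from mmengine.config import ConfigDict
6 | from opencompass.registry import LOAD_DATASET, MODELS
7 | 
8 | # self import
9 | from qllm_eval.methods.rep.apply_rep import apply_awq
10 | from qllm_eval.quantization.quant_wrapper import quantize_model
11 | 
12 | 
13 | 
14 | def build_dataset_from_cfg(dataset_cfg: ConfigDict):
15 |     dataset_cfg = copy.deepcopy(dataset_cfg)
16 |     dataset_cfg.pop('infer_cfg', None)
17 |     dataset_cfg.pop('eval_cfg', None)
18 |     dataset_cfg.pop('abbr', None)
19 |     return LOAD_DATASET.build(dataset_cfg)
20 | 
21 | 
22 | def build_model_from_cfg(model_cfg: ConfigDict):
23 |     model_cfg = copy.deepcopy(model_cfg)
24 |     model_cfg.pop('run_cfg', None)
25 |     model_cfg.pop('max_out_len', None)
26 |     model_cfg.pop('batch_size', None)
27 |     model_cfg.pop('abbr', None)
28 |     model_cfg.pop('pred_postprocessor', None)
29 |     # dump the quantization configs
30 |     quant_cfg = {}
31 |     quant_cfg['w_group_size'] = model_cfg.pop('w_group_size', 128)
32 |     quant_cfg['w_bit'] = model_cfg.pop('w_bit', None)
33 |     quant_cfg['a_group_size'] = model_cfg.pop('a_group_size', 128)
34 |     quant_cfg['a_bit'] = model_cfg.pop('a_bit', None)
35 |     quant_cfg['kv_group_size'] = model_cfg.pop('kv_group_size', 64)
36 |     quant_cfg['kv_bit'] = model_cfg.pop('kv_bit', None)
37 |     use_flash_attn = model_cfg.pop('use_flash_attn', False)
38 |     rep_file = model_cfg.pop('rep_file', None)
39 | 
40 |     # # if kv cache quantization is specified, we should add the config to the model.
41 |     # if quant_cfg['kv_bit'] is not None:
42 |     #     model_cfg['model_kwargs'].update({
43 |     #         'kv_bit': quant_cfg['kv_bit'],
44 |     #         'kv_group_size': quant_cfg['kv_group_size'],
45 |     #         'use_flash_attn': use_flash_attn,
46 |     #     })
47 | 
48 |     # build the original llm
49 |     lm_model = MODELS.build(model_cfg)
50 |     '''
51 |     Implement Your Quantization Code Here.
52 |     Below is a demo quantizing weights only.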
53 | ''' 54 | raw_model = lm_model.model # shallow copy 55 | 56 | if rep_file is not None: 57 | rep_results = torch.load(rep_file, map_location="cpu") 58 | apply_awq(raw_model, rep_results) 59 | 60 | lm_model.model = quantize_model(raw_model, argparse.Namespace(**quant_cfg)) 61 | return lm_model 62 | -------------------------------------------------------------------------------- /qllm_eval/methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/methods/__init__.py -------------------------------------------------------------------------------- /qllm_eval/methods/rep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/methods/rep/__init__.py -------------------------------------------------------------------------------- /qllm_eval/methods/rep/apply_rep.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers.models.bloom.modeling_bloom import BloomBlock, BloomGelu 4 | from transformers.models.opt.modeling_opt import OPTDecoderLayer 5 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm 6 | from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralRMSNorm 7 | from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer, MixtralRMSNorm 8 | from transformers.activations import GELUActivation 9 | 10 | from .utils import get_op_by_name, get_op_name, set_op_by_name 11 | 12 | class ScaledActivation(nn.Module): 13 | def __init__(self, module, scales): 14 | super().__init__() 15 | self.act = module 16 | self.scales = nn.Parameter(scales.data) 17 | 18 | def forward(self, x): 19 | return self.act(x) / self.scales.view(1, 1, -1).to(x.device) 20 | 21 | @torch.no_grad() 22 | def get_weight_scale(weight, q_group_size=-1): 23 | org_shape = weight.shape 24 | if q_group_size > 0: 25 | weight = weight.view(-1, q_group_size) 26 | scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True) 27 | scale = scale.view(org_shape) 28 | scale = scale.mean(0) 29 | return scale 30 | 31 | 32 | @torch.no_grad() 33 | def get_act_scale(x): 34 | return x.abs().view(-1, x.shape[-1]).mean(0) 35 | 36 | 37 | @torch.no_grad() 38 | def scale_ln_fcs(ln, fcs, scales): 39 | if not isinstance(fcs, list): 40 | fcs = [fcs] 41 | 42 | scales = scales.to(ln.weight.device) 43 | 44 | ln.weight.div_(scales) 45 | if hasattr(ln, 'bias') and ln.bias is not None: 46 | ln.bias.div_(scales) 47 | 48 | for fc in fcs: 49 | fc.weight.mul_(scales.view(1, -1)) 50 | 51 | for p in ln.parameters(): 52 | assert torch.isnan(p).sum() == 0 53 | for fc in fcs: 54 | for p in fc.parameters(): 55 | assert torch.isnan(p).sum() == 0 56 | 57 | 58 | @torch.no_grad() 59 | def scale_fc_fc(fc1, fc2, scales): 60 | assert isinstance(fc1, nn.Linear) 61 | assert isinstance(fc2, nn.Linear) 62 | # assert fc1.out_features == fc2.in_features 63 | 64 | scales = scales.to(fc1.weight.device) 65 | 66 | # fc1.weight.div_(scales.view(-1, 1)) 67 | fc1.weight[-scales.size(0):].div_(scales.view(-1, 1)) 68 | if fc1.bias is not None: 69 | fc1.bias.div_(scales.view(-1)) 70 | 71 | fc2.weight.mul_(scales.view(1, -1)) 72 | 73 | for p in fc1.parameters(): 74 | assert torch.isnan(p).sum() == 0 75 | for p in fc2.parameters(): 76 | assert 
torch.isnan(p).sum() == 0 77 | 78 | 79 | @torch.no_grad() 80 | def scale_gelu_fc(gelu, fc, scales): 81 | assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation)) 82 | assert isinstance(fc, nn.Linear) 83 | 84 | fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) 85 | 86 | for p in fc.parameters(): 87 | assert torch.isnan(p).sum() == 0 88 | 89 | def apply_scale(module, scales_list, input_feat_dict=None): 90 | for prev_op_name, layer_names, scales in scales_list: 91 | prev_op = get_op_by_name(module, prev_op_name) 92 | layers = [get_op_by_name(module, name) for name in layer_names] 93 | 94 | if isinstance(prev_op, nn.Linear): 95 | assert len(layers) == 1 96 | scale_fc_fc(prev_op, layers[0], scales) 97 | elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm, MixtralRMSNorm, MistralRMSNorm)) or 'rmsnorm' in str(prev_op.__class__).lower(): 98 | scale_ln_fcs(prev_op, layers, scales) 99 | elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)): 100 | new_module = ScaledActivation(prev_op, scales) 101 | set_op_by_name(module, prev_op_name, new_module) 102 | scale_gelu_fc(prev_op, layers[0], scales) 103 | else: 104 | raise NotImplementedError( 105 | f"prev_op {type(prev_op)} not supported yet!") 106 | 107 | # apply the scaling to input feat if given; prepare it for clipping 108 | if input_feat_dict is not None: 109 | for layer_name in layer_names: 110 | inp = input_feat_dict[layer_name] 111 | inp.div_(scales.view(1, -1).to(inp.device)) 112 | 113 | @torch.no_grad() 114 | def apply_clip(module, clip_list): 115 | from .utils import get_op_by_name 116 | for name, max_val in clip_list: 117 | layer = get_op_by_name(module, name) 118 | max_val = max_val.to(layer.weight.device) 119 | org_shape = layer.weight.shape 120 | layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) 121 | layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) 122 | layer.weight.data = layer.weight.data.reshape(org_shape) 123 | 124 | def apply_awq(model, awq_results): 125 | apply_scale(model, awq_results["scale"]) 126 | apply_clip(model, awq_results["clip"]) -------------------------------------------------------------------------------- /qllm_eval/methods/rep/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def get_op_by_name(module, op_name): 4 | # get the op by its name relative to the module 5 | for name, m in module.named_modules(): 6 | if name == op_name: 7 | return m 8 | raise ValueError(f"Cannot find op {op_name} in module {module}") 9 | 10 | 11 | def set_op_by_name(layer, name, new_module): 12 | levels = name.split('.') 13 | if len(levels) > 1: 14 | mod_ = layer 15 | for l_idx in range(len(levels)-1): 16 | if levels[l_idx].isdigit(): 17 | mod_ = mod_[int(levels[l_idx])] 18 | else: 19 | mod_ = getattr(mod_, levels[l_idx]) 20 | setattr(mod_, levels[-1], new_module) 21 | else: 22 | setattr(layer, name, new_module) 23 | 24 | 25 | def get_op_name(module, op): 26 | # get the name of the op relative to the module 27 | for name, m in module.named_modules(): 28 | if m is op: 29 | return name 30 | raise ValueError(f"Cannot find op {op} in module {module}") 31 | 32 | 33 | def append_str_prefix(x, prefix): 34 | if isinstance(x, str): 35 | return prefix + x 36 | elif isinstance(x, tuple): 37 | return tuple([append_str_prefix(y, prefix) for y in x]) 38 | elif isinstance(x, list): 39 | return [append_str_prefix(y, prefix) for y in x] 40 | else: 41 | return x -------------------------------------------------------------------------------- 
/qllm_eval/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /qllm_eval/quantization/qattn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/quantization/qattn/__init__.py -------------------------------------------------------------------------------- /qllm_eval/quantization/qattn/sw/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/quantization/qattn/sw/__init__.py -------------------------------------------------------------------------------- /qllm_eval/quantization/qlinear/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/quantization/qlinear/__init__.py -------------------------------------------------------------------------------- /qllm_eval/quantization/qlinear/sqwa.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from functools import partial 4 | from ..quant_funcs import * 5 | 6 | class WALinear(nn.Module): 7 | def __init__(self, in_features, out_features, bias=True, act_quant='per_token', a_bit=8, w_bit=8, quantize_output=False, dev='cuda'): 8 | super().__init__() 9 | self.in_features = in_features 10 | self.out_features = out_features 11 | self.a_bit = a_bit 12 | self.w_bit = w_bit 13 | 14 | self.register_buffer('weight', torch.zeros(self.out_features, 15 | self.in_features, dtype=torch.float16, requires_grad=False, device=dev)) 16 | if bias: 17 | self.register_buffer('bias', torch.zeros( 18 | (1, self.out_features), dtype=torch.float16, requires_grad=False, device=dev)) 19 | else: 20 | self.register_buffer('bias', None) 21 | 22 | if act_quant == 'per_token': 23 | self.act_quant_name = 'per_token' 24 | self.act_quant = partial( 25 | quantize_activation_per_token_absmax, n_bits=self.a_bit) 26 | elif act_quant == 'per_tensor': 27 | self.act_quant_name = 'per_tensor' 28 | self.act_quant = partial( 29 | quantize_activation_per_tensor_absmax, n_bits=self.a_bit) 30 | else: 31 | raise ValueError(f'Invalid act_quant: {act_quant}') 32 | 33 | if quantize_output: 34 | self.output_quant_name = self.act_quant_name 35 | self.output_quant = self.act_quant 36 | else: 37 | self.output_quant_name = 'None' 38 | self.output_quant = lambda x: x 39 | 40 | def to(self, *args, **kwargs): 41 | super(WALinear, self).to(*args, **kwargs) 42 | self.weight = self.weight.to(*args, **kwargs) 43 | if self.bias is not None: 44 | self.bias = self.bias.to(*args, **kwargs) 45 | return self 46 | 47 | @torch.no_grad() 48 | def forward(self, x): 49 | q_x = self.act_quant(x) 50 | y = torch.functional.F.linear(q_x, self.weight, self.bias) 51 | q_y = self.output_quant(y) 52 | return q_y 53 | 54 | @staticmethod 55 | def from_float(module, weight_quant='per_channel', act_quant='per_token', w_bit=4, a_bit=8, weight_group=128, quantize_output=False): 56 | assert isinstance(module, torch.nn.Linear) 57 | new_module = WALinear( 58 | module.in_features, module.out_features, module.bias is not None, act_quant=act_quant, a_bit=a_bit, w_bit=w_bit, 
quantize_output=quantize_output, dev=module.weight.device)
59 | 
60 |         # Quantize the weight matrices
61 |         if weight_quant == 'per_channel':
62 |             new_module.weight = quantize_weight_per_channel_absmax(module.weight, n_bits=w_bit)
63 |         elif weight_quant == 'per_tensor':
64 |             new_module.weight = quantize_weight_per_tensor_absmax(module.weight, n_bits=w_bit)
65 |         elif weight_quant == 'per_group':
66 |             new_module.weight = pseudo_quantize_tensor(module.weight, n_bits=w_bit, q_group_size=weight_group, inplace=True)
67 |         else:
68 |             raise ValueError(f'Invalid weight_quant: {weight_quant}')
69 | 
70 |         new_module.weight_quant_name = weight_quant
71 |         if module.bias is not None:
72 |             new_module.bias = module.bias
73 |         del module
74 |         return new_module
75 | 
76 |     def __repr__(self):
77 |         return 'W{}A{}Linear'.format(self.w_bit, self.a_bit)
78 | 
79 | 
-------------------------------------------------------------------------------- /qllm_eval/quantization/quant_funcs.py: -------------------------------------------------------------------------------- 1 | import torch
2 | 
3 | @torch.no_grad()
4 | def pseudo_quantize_tensor(tensor, n_bits=8, zero_point=True, q_group_size=-1, per_tensor=False, inplace=False):
5 |     """
6 |     The basic quantization function for weight, activation and KV cache.
7 |     """
8 |     org_tensor_shape = tensor.shape
9 |     if q_group_size > 0:
10 |         assert org_tensor_shape[-1] % q_group_size == 0
11 |         tensor = tensor.reshape(-1, q_group_size)
12 |     if per_tensor:
13 |         tensor = tensor.reshape(1, -1)
14 |     assert tensor.dim() == 2
15 |     if zero_point:
16 |         max_val = tensor.amax(dim=1, keepdim=True)
17 |         min_val = tensor.amin(dim=1, keepdim=True)
18 |         max_int = 2**n_bits - 1
19 |         min_int = 0
20 |         scales = (max_val - min_val).clamp(min=1e-5) / max_int
21 |         zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)
22 |     else:
23 |         max_val = tensor.abs().amax(dim=1, keepdim=True)
24 |         max_val = max_val.clamp(min=1e-5)
25 |         max_int = 2 ** (n_bits - 1) - 1
26 |         min_int = -(2 ** (n_bits - 1))
27 |         scales = max_val / max_int
28 |         zeros = 0
29 | 
30 |     if inplace:
31 |         (
32 |             (tensor.div_(scales).round_().add_(zeros)).clamp_(min_int, max_int).sub_(zeros)
33 |         ).mul_(scales)
34 |     else:
35 |         tensor = (
36 |             torch.clamp(torch.round(tensor / scales) + zeros, min_int, max_int) - zeros
37 |         ) * scales
38 | 
39 |     assert torch.isnan(tensor).sum() == 0
40 | 
41 |     tensor = tensor.reshape(org_tensor_shape)
42 | 
43 |     # return the quantized tensor, the scaling factor and the zero point value
44 |     # return tensor, scales.view(tensor.shape[0], -1), zeros.view(tensor.shape[0], -1)
45 |     return tensor
46 | 
47 | 
48 | @torch.no_grad()
49 | def quantize_weight_per_channel_absmax(w, n_bits=8):
50 |     """
51 |     Per-channel symmetric (absmax) weight quantization.
52 |     """
53 |     tensor = pseudo_quantize_tensor(w, n_bits=n_bits, zero_point=False, q_group_size=-1, per_tensor=False, inplace=False)
54 |     return tensor
55 | 
56 | @torch.no_grad()
57 | def quantize_activation_per_token_absmax(t, n_bits=8):
58 |     t_shape = t.shape
59 |     t = t.view(-1, t_shape[-1])
60 |     t = pseudo_quantize_tensor(t, n_bits=n_bits, zero_point=True, q_group_size=-1, per_tensor=False, inplace=False)
61 |     return t.reshape(t_shape)
62 | 
63 | @torch.no_grad()
64 | def quantize_weight_per_tensor_absmax(w, n_bits=8):
65 |     """
66 |     Per-tensor symmetric (absmax) weight quantization.
67 | """ 68 | tensor = pseudo_quantize_tensor(w, n_bits=n_bits, zero_point=False, q_group_size=-1, per_tensor=True, inplace=False) 69 | return tensor 70 | 71 | @torch.no_grad() 72 | def quantize_activation_per_tensor_absmax(t, n_bits=8): 73 | t_shape = t.shape 74 | t = t.view(-1, t_shape[-1]) 75 | t = pseudo_quantize_tensor(t, n_bits=n_bits, zero_point=True, q_group_size=-1, per_tensor=True, inplace=False) 76 | return t.reshape(t_shape) 77 | -------------------------------------------------------------------------------- /qllm_eval/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/visualization/__init__.py -------------------------------------------------------------------------------- /qllm_eval/visualization/basic/assets/lambada_w_only.py: -------------------------------------------------------------------------------- 1 | # opt series 2 | # demo data for the plot script 3 | # the elements in each list is the result of precision [FP16, W8, W4, W3, W2] 4 | OPT_125M= [39.63, 39.69, 38.70, 18.61, 0.06] 5 | OPT_1B3 = [59.60, 59.60, 56.37, 26.57, 0.02] 6 | OPT_2B7 = [64.60, 64.76, 61.96, 3.09, 0.00] 7 | OPT_6B7 = [68.08, 67.98, 65.13, 45.60, 0.04] 8 | OPT_13B = [69.96, 69.78, 68.12, 20.30, 0.04] 9 | OPT_30B = [71.49, 71.36, 69.88, 51.66, 0.06] 10 | OPT_66B = [73.43, 73.45, 72.95, 8.11, 0.04] 11 | 12 | # falcon series 13 | Falcon_7B = [74.11, 74.09, 73.39, 70.60, 0.35] 14 | Falcon_40B = [76.23, 76.21, 75.70, 74.52, 0.14] 15 | Falcon_180B = [79.62, 79.60, 78.27, 75.53, 50.48] 16 | 17 | # llama2 series 18 | LlaMA2_7B = [73.32, 73.37, 71.73, 66.61, 0.00] 19 | LlaMA2_13B = [76.48, 76.58, 75.99, 71.73, 1.71] 20 | LlaMA2_70B = [78.96, 78.89, 77.88, 76.46, 16.96] 21 | 22 | # Bloom series 23 | Bloom_560M = [36.13, 36.21, 32.68, 21.21, 0.06] 24 | Bloom_1B1 = [44.15, 44.13, 42.29, 34.12, 1.14] 25 | Bloom_1B7 = [48.36, 48.22, 48.22, 38.15, 0.14] 26 | Bloom_3B = [54.16, 54.14, 51.87, 46.73, 4.39] 27 | Bloom_7B1 = [58.26, 58.06, 56.90, 52.11, 23.54] 28 | Bloom_175B = [67.53, 67.59, 67.01, 66.37, 38.31] 29 | 30 | # Bloomz series 31 | Bloomz_560M = [41.59, 41.12, 39.36, 26.59, 0.10] 32 | Bloomz_1B1 = [49.70, 49.60, 48.98, 40.27, 3.14] 33 | Bloomz_1B7 = [50.71, 50.77, 48.24, 43.82, 0.25] 34 | Bloomz_3B = [52.49, 52.65, 51.85, 47.33, 6.64] 35 | Bloomz_7B1 = [56.59, 56.39, 56.74, 53.11, 28.93] 36 | Bloomz_175B = [68.23, 68.12, 68.15, 67.15, 46.44] 37 | 38 | # ChatGLM3 series 39 | ChatGLM3_6B = [51.97, 52.01, 48.71, 32.72, 0.14] 40 | ChatGLM3_6B_32K = [63.79, 60.39, 63.30, 63.63, 0.02] 41 | 42 | # MOE models 43 | Mistral_7B = [75.04, 75.18, 73.90, 68.00, 0.21] 44 | Mixtral_8x7B = [76.79, 76.79, 74.85, 70.02, 0.02] 45 | 46 | -------------------------------------------------------------------------------- /qllm_eval/visualization/basic/heatmap_plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | import matplotlib.pyplot as plt 7 | 8 | from scipy import stats 9 | from importlib import import_module 10 | 11 | from utils import * 12 | 13 | save_name = None 14 | save_path = './results/heatmap_plot/' 15 | 16 | # whether use manually processed order or not 17 | normalize = False 18 | 19 | mode = 'w_only' 20 | modes = ['w_only', 'w_a', 'kv_cache'] 21 | selected_models = 'llama2+falcon+bloomz+chatglm3+moe+opt' 22 | # selected_models 
-------------------------------------------------------------------------------- /qllm_eval/visualization/basic/heatmap_plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | import matplotlib.pyplot as plt 7 | 8 | from scipy import stats 9 | from importlib import import_module 10 | 11 | from utils import * 12 | 13 | save_name = None 14 | save_path = './results/heatmap_plot/' 15 | 16 | # whether to normalize the results or not 17 | normalize = False 18 | 19 | mode = 'w_only' 20 | modes = ['w_only', 'w_a', 'kv_cache'] 21 | selected_models = 'llama2+falcon+bloomz+chatglm3+moe+opt' 22 | # selected_models = 'llama2+bloomz+opt' 23 | selected_bitwidth = 'w3' 24 | 25 | label_size = 20 26 | 27 | 28 | ''' 29 | # full list of the tested models. 30 | models_to_plot = { 31 | 'opt': ['OPT_125M', 'OPT_1B3', 'OPT_2B7', 'OPT_6B7', 'OPT_13B', 'OPT_30B', 'OPT_66B'], 32 | 'falcon': ['Falcon_7B', 'Falcon_40B', 'Falcon_180B'], 33 | 'llama2': ['LlaMA2_7B', 'LlaMA2_13B', 'LlaMA2_70B'], 34 | 'bloom': ['Bloom_560M', 'Bloom_1B1', 'Bloom_1B7', 'Bloom_3B', 'Bloom_7B1', 'Bloom_175B'], 35 | 'bloomz': ['Bloomz_560M', 'Bloomz_1B1', 'Bloomz_1B7', 'Bloomz_3B', 'Bloomz_7B1', 'Bloomz_175B'], 36 | 'chatglm3': ['ChatGLM3_6B'], 37 | } 38 | ''' 39 | # selected models for easier display. 40 | models_to_plot = { 41 | 'opt': ['OPT_6B7', 'OPT_13B'], 42 | 'falcon': ['Falcon_7B', 'Falcon_180B'], 43 | 'llama2': ['LlaMA2_7B', 'LlaMA2_70B'], 44 | # 'bloom': ['Bloom_3B', 'Bloom_7B1'], 45 | 'bloomz': ['Bloomz_3B', 'Bloomz_175B'], 46 | 'chatglm3': ['ChatGLM3_6B'], 47 | 'moe': ['Mistral_7B', 'Mixtral_8x7B'], 48 | } 49 | 50 | # datasets = ['chid', 'winogrande', 'race', 'lambada', 'rte', 'piqa', 'siqa'] 51 | datasets = ['chid', 'winogrande', 'race', 'lambada', 'piqa', 'siqa'] 52 | datasets_ = ['CHID', 'Winogrande', 'Race', 'LAMBADA', 'PIQA', 'SIQA'] 53 | 54 | if __name__ == '__main__': 55 | if not os.path.exists(save_path): 56 | os.makedirs(save_path) 57 | 58 | save_name = save_name + '.pdf' if save_name is not None else \ 59 | 'heatmap_' + selected_bitwidth + '.pdf' 60 | save_name = save_path + save_name 61 | 62 | 63 | # first gather the test results. 64 | model_perfs = {} 65 | selected_models = selected_models.split('+') 66 | for dataset in datasets: 67 | fp_perfs = [] 68 | quant_perfs = [] 69 | results_module = import_module('assets.' + dataset + '_' + mode) 70 | for selected_model_family in selected_models: 71 | for model in models_to_plot[selected_model_family]: 72 | perfs = getattr(results_module, model) 73 | fp_perfs.append(perfs[0]) 74 | quant_idx = bitwidth_idx_mapping[mode][selected_bitwidth] 75 | quant_perfs.append(perfs[quant_idx]) 76 | model_perfs.update({dataset: {'fp_perfs': fp_perfs, 'quant_perfs': quant_perfs}}) 77 | 78 | spearman_corr = np.zeros((12, 12)) 79 | # The order of the datasets matches datasets above: the FP16 results occupy the first 6 rows & columns, 80 | # followed by the 6 quantized results.
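    # Illustrative aside: stats.spearmanr (imported above) measures rank agreement in [-1, 1], e.g.
    #   stats.spearmanr([1, 2, 3], [10, 20, 30]).correlation ==  1.0   (identical ranking)
    #   stats.spearmanr([1, 2, 3], [30, 20, 10]).correlation == -1.0   (reversed ranking)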
81 | for row in range(12): 82 | for col in range(12): 83 | 84 | row_dataset = datasets[row % 6] 85 | row_fp_flag = not(row // 6) 86 | row_perfs = np.array(model_perfs[row_dataset]['fp_perfs' if row_fp_flag else 'quant_perfs']) 87 | 88 | col_dataset = datasets[col % 6] 89 | col_fp_flag = not(col // 6) 90 | col_perfs = np.array(model_perfs[col_dataset]['fp_perfs' if col_fp_flag else 'quant_perfs']) 91 | 92 | sp_corr_value = stats.spearmanr(row_perfs, col_perfs).correlation 93 | spearman_corr[row][col] = sp_corr_value 94 | 95 | # plot the heatmap figure 96 | spearman_corr = pd.DataFrame(spearman_corr) 97 | 98 | x_labels = ["FP16 " + datasets_[i % 6] if i < 6 else selected_bitwidth.upper() + " " + datasets_[i % 6] for i in range(12)] 99 | y_labels = ["FP16 " + datasets_[i % 6] if i < 6 else selected_bitwidth.upper() + " " + datasets_[i % 6] for i in range(12)] 100 | 101 | plot = sns.heatmap(spearman_corr, cmap="YlGnBu", xticklabels=x_labels, yticklabels=y_labels, annot=True, 102 | annot_kws={'fontsize': 6}, fmt='.2g') 103 | 104 | # plot.set(xlabel=x_labels, ylabel=y_labels) 105 | plot.xaxis.tick_top() 106 | plt.xticks(rotation=45, ha='left', rotation_mode='anchor') 107 | # plt.yticks(rotation=45) 108 | plt.subplots_adjust(left=0.22, right=0.89, top=0.77, bottom=0.13) 109 | # plt.xlabel('Dataset - Quantization Bitwidth', fontsize=label_size) 110 | # plt.ylabel('Dataset - Quantization Bitwidth', fontsize=label_size) 111 | plt.savefig(save_name) 112 | plt.show() 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /qllm_eval/visualization/basic/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # dataset related properties 4 | dataset_minimum = { 5 | 'chid': 16.67, 6 | 'winogrande': 50., 7 | 'race': 25., 8 | 'lambada': 0., 9 | 'rte': 50., 10 | 'piqa': 50., 11 | 'siqa': 33.33, 12 | } 13 | 14 | bitwidth_idx_mapping = { 15 | 'w_only': {'fp16': 0, 'w8': 1, 'w4': 2, 'w3': 3, 'w2': 4}, 16 | 'w_a': {'fp16': 0, 'w8a8': 1, 'w4a8': 2, 'w4a4': 3}, 17 | 'kv_cache': {'fp16': 0, 'kv8': 1, 'kv4': 2, 'kv3': 3, 'kv2': 4} 18 | } 19 | 20 | 21 | # helper functions 22 | def normalize_results(raw_results, fp_idx=0, minimal=None, range=1, w_fp=True): 23 | # raw_results = result_pad(raw_results) 24 | has_nonzero_fp_result = raw_results[fp_idx] is not None and raw_results[fp_idx] != 0 25 | if has_nonzero_fp_result and w_fp: 26 | # do not consider the minimal value of the dataset 27 | fp_result = raw_results[fp_idx] 28 | if minimal is None: 29 | norm_results = [i / fp_result if i is not None else None for i in raw_results] 30 | print('Result Normalization Succeeded.') 31 | else: 32 | norm_results = [max((i - minimal) / (fp_result - minimal), 0) \ 33 | if i is not None else None for i in raw_results] 34 | else: 35 | norm_results = raw_results 36 | print('The input results have no FP precision, return original results.') 37 | assert range in [1, 100] 38 | if range == 100: 39 | norm_results = [i * 100 if i is not None else i for i in norm_results] 40 | print('Result Normalization Succeeded.') 41 | elif range == 1 and norm_results == raw_results: 42 | norm_results = [i / 100. 
if i is not None else i for i in norm_results] 43 | print('Result Normalization Succeeded.') 44 | return norm_results 45 | -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/assets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import mt_bench_w_only as w_only 2 | from . import mt_bench_w_a as wa 3 | from . import mt_bench_kv_cache as kv -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/assets/mt_bench_kv_cache.py: -------------------------------------------------------------------------------- 1 | # results for mt-bench 2 | # the elements in each list are the results of round1 and round2 3 | # the elements in each sub-list are the results of precision [[FP16, KV8, KV4, KV3], [FP16, KV8, KV4, KV3]] 4 | 5 | results = { 6 | # llama2-chat series 7 | 'llama-2_7b_chat' : [[5.31, 5.25, 5.46, 4.38], [4.14, 4.21, 4.16, 2.76]], 8 | 'llama-2_13b_chat' : [[5.72, 5.84, 5.86, 5.53], [5.05, 5.04, 4.38, 3.85]], 9 | 'llama-2_70b_chat' : [[6.26, 6.41, 6.30, 6.25], [5.99, 5.91, 6.03, 5.66]], 10 | 11 | # falcon-instruct series 12 | 'falcon_7b_instruct' : [[3.79, 3.64, 3.60, 2.92], [2.30, 2.27, 2.24, 1.86]], 13 | 'falcon_40b_instruct' : [[4.92, 4.86, 4.76, 4.45], [3.36, 3.35, 3.40, 3.34]], 14 | 'falcon_180b_chat' : [[6.35, 6.68, 6.61, 6.58], [5.59, 5.63, 5.70, 5.46]], 15 | 16 | # mistral-instruct series 17 | 'mistral_7b_instruct' : [[6.70, 6.70, 6.55, 6.26], [6.00, 6.42, 6.22, 4.96]], 18 | 'mixtral_8x7b_instruct' : [[7.89, 7.62, 6.97, 6.26], [6.55, 6.94, 6.34, 4.96]], 19 | 20 | # chatglm3 series 21 | 'chatglm3_6b' : [[5.13, 5.08, 4.58, 4.81], [3.68, 3.54, 3.26, 3.18]], 22 | 23 | # stablelm series 24 | 'stablelm_zephyr_3b' : [[5.03, 5.08, 5.08, 4.38], [4.25, 4.14, 3.78, 3.19]], 25 | 26 | # gemma series 27 | 'gemma_2b_it' : [[4.06, 3.98, 3.41, 3.34], [2.81, 2.62, 2.80, 2.52]], 28 | 'gemma_7b_it' : [[5.25, 5.28, 4.98, 4.94], [3.61, 3.63, 3.49, 3.37]], 29 | } 30 | -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/assets/mt_bench_w_a.py: -------------------------------------------------------------------------------- 1 | # results for mt-bench 2 | # the elements in each list are the results of round1 and round2 3 | # the elements in each sub-list are the results of precision [[FP16, W8A8, W4A8, W4A4], [FP16, W8A8, W4A8, W4A4]] 4 | 5 | results = { 6 | # llama2-chat series 7 | 'llama-2_7b_chat' : [[5.31, 4.94, 5.12, 1.00], [4.14, 3.79, 4.22, 1.00]], 8 | 'llama-2_13b_chat' : [[5.72, 5.83, 5.88, 1.00], [5.05, 5.12, 4.97, 1.00]], 9 | 'llama-2_70b_chat' : [[6.26, 6.17, 6.11, 1.00], [5.99, 5.86, 5.86, 1.00]], 10 | 11 | # falcon-instruct series 12 | 'falcon_7b_instruct' : [[3.79, 3.59, 3.15, 1.00], [2.30, 2.14, 2.05, 1.00]], 13 | 'falcon_40b_instruct' : [[4.92, 4.71, 4.40, 1.00], [3.36, 3.55, 3.14, 1.00]], 14 | 'falcon_180b_chat' : [[6.35, 6.35, 6.56, 1.00], [5.59, 5.43, 5.88, 1.00]], 15 | 16 | # mistral-instruct series 17 | 'mistral_7b_instruct' : [[6.70, 6.74, 6.53, 1.00], [6.00, 6.08, 5.76, 1.00]], 18 | 'mixtral_8x7b_instruct' : [[7.89, 7.22, 7.42, 1.00], [6.55, 6.61, 6.49, 1.00]], 19 | 20 | # chatglm3 series 21 | 'chatglm3_6b' : [[5.13, 5.14, 4.85, 1.00], [3.68, 3.92, 3.50, 1.00]], 22 | 23 | # stablelm series 24 | 'stablelm_zephyr_3b' : [[5.03, 5.29, 5.48, 1.14], [4.25, 4.10, 3.94, 1.00]], 25 | 26 | # gemma series 27 | 'gemma_2b_it' : [[4.06, 3.91, 3.64, 1.00], [2.81, 
2.95, 2.73, 1.00]], 28 | 'gemma_7b_it' : [[5.25, 5.09, 5.14, 1.00], [3.62, 3.79, 3.28, 1.00]], 29 | 30 | # mamba series 31 | 'mamba_2b8_chat' : [[1.95, 2.08, 1.83, 1.41], [1.40, 1.45, 1.12, 1.00]], 32 | } 33 | 34 | -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/assets/mt_bench_w_only.py: -------------------------------------------------------------------------------- 1 | # results for mt-bench 2 | # the elements in each list are the results of round1 and round2 3 | # the elements in each sub-list are the results of precision [[FP16, W8, W4, W3], [FP16, W8, W4, W3]] 4 | 5 | results = { 6 | # llama2-chat series 7 | 'llama-2_7b_chat' : [[5.31, 5.16, 5.47, 4.94], [4.14, 4.04, 4.30, 3.52]], 8 | 'llama-2_13b_chat' : [[5.72, 5.95, 5.74, 5.38], [5.05, 5.31, 4.65, 4.26]], 9 | 'llama-2_70b_chat' : [[6.26, 6.49, 5.91, 5.86], [5.99, 5.83, 5.55, 5.12]], 10 | 11 | # falcon-instruct series 12 | 'falcon_7b_instruct' : [[3.79, 3.71, 3.39, 3.06], [2.30, 2.19, 2.27, 1.89]], 13 | 'falcon_40b_instruct' : [[4.92, 4.81, 4.66, 4.38], [3.36, 3.01, 3.69, 3.54]], 14 | 'falcon_180b_chat' : [[6.35, 6.62, 6.25, 5.79], [5.59, 5.70, 6.05, 5.17]], 15 | 16 | # mistral-instruct series 17 | 'mistral_7b_instruct' : [[6.70, 6.78, 6.44, 6.18], [6.00, 6.01, 5.88, 5.49]], 18 | 'mixtral_8x7b_instruct' : [[7.89, 7.53, 7.21, 6.72], [6.55, 6.64, 6.21, 5.53]], 19 | 20 | # chatglm3 series 21 | 'chatglm3_6b' : [[5.13, 4.91, 5.06, 4.35], [3.68, 3.88, 4.09, 3.12]], 22 | 23 | # stablelm series 24 | 'stablelm_zephyr_3b' : [[5.03, 5.09, 5.58, 3.15], [4.25, 3.86, 4.09, 2.11]], 25 | 26 | # gemma series 27 | 'gemma_2b_it' : [[4.06, 4.18, 3.67, 3.39], [2.81, 3.11, 3.06, 2.60]], 28 | 'gemma_7b_it' : [[5.25, 5.29, 4.94, 4.74], [3.62, 3.88, 3.51, 3.19]], 29 | 30 | # mamba series 31 | 'mamba_2b8_chat' : [[1.95, 2.06, 1.73, 1.09], [1.40, 1.29, 1.29, 1.00]], 32 | } 33 | -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/gen_table.py: -------------------------------------------------------------------------------- 1 | import os 2 | from assets import w_only, wa, kv 3 | 4 | model_name_map = { 5 | 'llama-2_7b_chat': 'LLaMA2-7B-chat', 6 | 'llama-2_13b_chat': 'LLaMA2-13B-chat', 7 | 'llama-2_70b_chat': 'LLaMA2-70B-chat', 8 | 'falcon_7b_instruct': 'Falcon-7B-instruct', 9 | 'falcon_40b_instruct': 'Falcon-40B-instruct', 10 | 'falcon_180b_chat': 'Falcon-180B-chat', 11 | 'mistral_7b_instruct': 'Mistral-7B-instruct-v0.2', 12 | 'mixtral_8x7b_instruct': 'Mixtral-8x7B-instruct-v0.1', 13 | 'chatglm3_6b': 'ChatGLM3-6B', 14 | 'stablelm_zephyr_3b': 'StableLM-Zephyr-3B', 15 | 'gemma_2b_it': 'Gemma-2B-it', 16 | 'gemma_7b_it': 'Gemma-7B-it', 17 | 'mamba_2b8_chat': "Mamba-2.8B-chat" 18 | } 19 | 20 | def gen_tex_table_lines(save_path='./tables/table.tex'): 21 | """ 22 | \midrule 23 | \multirow{2}{*}{MODEL-NAME} & 1 & rd1_fp16 & rd1_w8 & rd1_w4 & rd1_w3 & rd1_w3_awq & rd1_w8a8 & rd1_w4a8 & rd1_w4a4 & rd1_w4a4_sq & rd1_kv8 & rd1_kv4 & rd1_kv3 \\ 24 | & 2 & rd2_fp16 & rd2_w8 & rd2_w4 & rd2_w3 & rd2_w3_awq & rd2_w8a8 & rd2_w4a8 & rd2_w4a4 & rd2_w4a4_sq & rd2_kv8 & rd2_kv4 & rd2_kv3 \\ 25 | """ 26 | # clear the previous table 27 | if os.path.exists(save_path): 28 | os.remove(save_path) 29 | os.makedirs(os.path.split(save_path)[0], exist_ok=True) 30 | 31 | with open(save_path, 'a') as f: 32 | for model, model_name in model_name_map.items(): 33 | f.write(r'\midrule' + '\n') 34 | if 'mamba' in model.lower(): # mamba doesn't have kv cache 35 | #
round 1 36 | f.write(r'\multirow{2}{*}{' + model_name + r'} & 1 ' + \ 37 | ''.join([f'& {w_only.results[model][0][i]:.2f} ' for i in range(len(w_only.results[model][0]))]) + '& - ' + \ 38 | ''.join([f'& {wa.results[model][0][i]:.2f} ' for i in range(1, len(wa.results[model][0]))]) + '& - ' + \ 39 | ''.join([f'& - ' for _ in range(3)]) + \ 40 | r'\\' + '\n') 41 | # round2 42 | f.write(r'& 2 ' + \ 43 | ''.join([f'& {w_only.results[model][1][i]:.2f} ' for i in range(len(w_only.results[model][1]))]) + '& - ' + \ 44 | ''.join([f'& {wa.results[model][1][i]:.2f} ' for i in range(1, len(wa.results[model][1]))]) + '& - ' + \ 45 | ''.join([f'& - ' for _ in range(3)]) + \ 46 | r'\\' + '\n') 47 | else: 48 | # round1 49 | f.write(r'\multirow{2}{*}{' + model_name + r'} & 1 ' + \ 50 | ''.join([f'& {w_only.results[model][0][i]:.2f} ' for i in range(len(w_only.results[model][0]))]) + '& - ' + \ 51 | ''.join([f'& {wa.results[model][0][i]:.2f} ' for i in range(1, len(wa.results[model][0]))]) + '& - ' + \ 52 | ''.join([f'& {kv.results[model][0][i]:.2f} ' for i in range(1, len(kv.results[model][0]))]) + \ 53 | r'\\' + '\n') 54 | # round2 55 | f.write(r'& 2 ' + \ 56 | ''.join([f'& {w_only.results[model][1][i]:.2f} ' for i in range(len(w_only.results[model][1]))]) + '& - ' + \ 57 | ''.join([f'& {wa.results[model][1][i]:.2f} ' for i in range(1, len(wa.results[model][1]))]) + '& - ' + \ 58 | ''.join([f'& {kv.results[model][1][i]:.2f} ' for i in range(1, len(kv.results[model][1]))]) + \ 59 | r'\\' + '\n') 60 | 61 | def gen_md_table_lines(save_path='./tables/table.md'): 62 | pass 63 | 64 | if __name__ == '__main__': 65 | gen_tex_table_lines() -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-c_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 36.61 36.61 36.27 33.56 22.37 2 | LLaMA2-13B-chat 41.69 42.37 40.34 37.97 31.19 3 | LLaMA2-70B-chat 40.34 39.32 38.64 38.31 38.31 4 | Falcon-7B-instruct 34.58 34.58 35.25 32.88 23.05 5 | Falcon-40B-instruct 45.76 45.76 45.08 41.36 32.2 6 | Falcon-180B-chat 36.95 36.27 35.93 34.24 34.92 7 | ChatGLM3-6B 30.85 31.19 30.17 28.14 26.44 8 | ChatGLM3-6B-32k 42.37 42.71 40.68 44.75 29.49 9 | Mistral-7B-instruct 48.14 48.14 48.14 51.53 37.63 10 | Mixtral-8x7B-instruct 55.25 53.56 51.86 50.85 28.81 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-c_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 36.61 36.61 34.92 31.19 24.75 2 | LLaMA2-13B-chat 41.69 42.37 41.02 40.34 20.68 3 | LLaMA2-70B-chat 40.34 40.34 36.61 37.97 26.44 4 | Falcon-7B-instruct 34.58 34.58 31.86 32.2 24.75 5 | Falcon-40B-instruct 45.76 45.42 45.08 45.76 22.37 6 | Falcon-180B-chat 36.95 36.27 34.92 33.22 24.75 7 | ChatGLM3-6B 30.85 31.19 28.47 30.17 21.02 8 | ChatGLM3-6B-32k 42.37 42.37 42.37 38.98 24.41 9 | Mistral-7B-instruct 48.14 48.81 49.49 44.41 21.02 10 | Mixtral-8x7B-instruct 55.25 56.61 52.2 46.78 24.41 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-c_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 36.61 37.63 34.92 25.42 2 | LLaMA2-13B-chat 41.69 43.05 40.34 22.37 3 | LLaMA2-70B-chat 40.34 41.36 37.97 21.69 4 | Falcon-7B-instruct 34.58 33.9 32.88 20.68 5 | Falcon-40B-instruct 45.76 45.76 45.42 
22.03 6 | Falcon-180B-chat 36.95 35.59 35.93 22.37 7 | ChatGLM3-6B 30.85 31.86 29.15 24.41 8 | ChatGLM3-6B-32k 42.37 42.71 41.36 22.03 9 | Mistral-7B-instruct 48.14 48.81 48.14 27.8 10 | Mixtral-8x7B-instruct 55.25 53.56 49.83 27.46 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-e_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 60.32 60.49 59.08 57.67 39.51 2 | LLaMA2-13B-chat 64.2 64.02 63.84 62.61 50.79 3 | LLaMA2-70B-chat 60.67 60.67 60.85 61.02 61.55 4 | Falcon-7B-instruct 58.91 58.73 58.73 54.67 37.39 5 | Falcon-40B-instruct 71.08 70.72 71.25 70.02 58.73 6 | Falcon-180b-chat 55.73 55.73 55.56 55.56 55.38 7 | ChatGLM3-6B 44.27 44.44 45.5 43.03 40.21 8 | ChatGLM3-6B-32k 66.67 66.31 67.37 63.14 48.5 9 | Mistral-7B-instruct 67.37 67.37 67.37 63.84 55.73 10 | Mixtral-8x7B-instruct 79.19 78.84 78.13 71.08 50.79 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-e_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 60.32 60.14 56.44 51.85 25.4 2 | LLaMA2-13B-chat 64.2 64.02 63.32 61.9 28.22 3 | LLaMA2-70B-chat 60.67 61.02 59.79 56.26 48.5 4 | Falcon-7B-instruct 58.91 58.91 58.2 55.73 38.1 5 | Falcon-40B-instruct 71.08 70.9 70.55 71.6 29.98 6 | Falcon-180B-chat 55.73 56.26 56.26 58.38 40.39 7 | ChatGLM3-6B 44.27 43.92 42.86 41.62 27.34 8 | ChatGLM3-6B-32k 66.67 66.84 65.96 64.37 29.63 9 | Mistral-7B-instruct 67.37 67.55 65.26 62.43 32.1 10 | Mixtral-8x7B-instruct 79.19 79.72 76.19 72.49 26.28 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-e_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 60.32 60.14 54.85 25.04 2 | LLaMA2-13B-chat 64.2 64.73 63.14 29.28 3 | LLaMA2-70B-chat 60.67 62.26 59.96 29.63 4 | Falcon-7B-instruct 58.91 59.08 58.91 35.98 5 | Falcon-40B-instruct 71.08 71.08 70.19 23.46 6 | Falcon-180B-chat 55.73 54.5 56.97 24.87 7 | ChatGLM3-6B 44.27 42.5 42.86 29.98 8 | ChatGLM3-6B-32k 66.67 67.02 66.14 29.28 9 | Mistral-7B-instruct 67.37 66.49 65.61 37.04 10 | Mixtral-8x7B-instruct 79.19 76.54 74.96 29.98 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/ceval_kv.txt: -------------------------------------------------------------------------------- 1 | ChatGLM3-6B 53.84 53.92 52.36 51.84 34.53 2 | ChatGLM3-6B-32k 60.74 60.43 59.39 50.74 24.97 3 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/ceval_w.txt: -------------------------------------------------------------------------------- 1 | ChatGLM3-6B 53.84 53.85 53.19 49.87 24.82 2 | ChatGLM3-6B-32k 60.74 60.73 59.69 57.36 26.2 -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/ceval_wa.txt: -------------------------------------------------------------------------------- 1 | ChatGLM3-6B 53.84 53.78 53.51 22.89 2 | ChatGLM3-6B-32k 60.74 60.55 59.02 26.33 -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/gsm8k_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 26.76 
25.78 23.96 12.05 0.83 2 | LLaMA2-13B 42.84 42.68 39.35 31.61 0.91 3 | LLaMA2-70B 59.14 59.14 58 53.75 3.79 4 | Falcon-7B 3.64 5 | Falcon-40B 38.36 37.68 37.23 29.57 2.2 6 | Falcon-180B 67.63 67.7 66.87 59.29 2.35 7 | ChatGLM3-6B 55.8 56.79 56.79 47.31 3.49 8 | ChatGLM3-6B-32k 54.21 55.57 51.1 37.76 9 | Mistral-7B 48.52 47.61 43.52 29.87 1.52 10 | Mixtral-8x7B 71.11 72.4 66.79 49.2 0.53 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/gsm8k_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 26.76 25.47 25.17 15.62 1.9 2 | LLaMA2-13B 42.84 42.61 37.38 28.43 0.15 3 | LLaMA2-70B 59.14 59.82 58.07 50.87 0.99 4 | Falcon-7B 3.64 3.41 3.94 5 | Falcon-40B 38.36 38.29 36.24 32.37 0 6 | Falcon-180B 67.63 67.7 67.02 47.46 1.59 7 | ChatGLM3-6B 55.8 57.39 53.68 45.26 0.3 8 | ChatGLM3-6B-32k 54.21 54.28 52.54 42.3 1.52 9 | Mistral-7B 48.52 48.52 44.5 29.72 1.52 10 | Mixtral-8x7B 71.11 70.96 69.52 51.71 0.68 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/gsm8k_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 26.76 25.93 22.37 0.91 2 | LLaMA2-13B 42.84 42.46 38.36 0.23 3 | LLaMA2-70B 59.14 58.23 56.48 0.99 4 | Falcon-7B 3.64 5 | Falcon-40B 38.36 36.16 34.95 1.29 6 | Falcon-180B 67.63 67.32 68.61 0.08 7 | ChatGLM3-6B 55.8 55.04 53.68 1.21 8 | ChatGLM3-6B-32k 54.21 52.39 49.96 0.53 9 | Mistral-7B 48.52 47.08 43.44 1.36 10 | Mixtral-8x7B 71.11 71.95 70.36 0.15 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/hellaswag_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 70.28 70.24 69.84 68.66 42.7 2 | LLaMA2-13B-chat 74.32 74.37 74.09 73.29 55.37 3 | LLaMA2-70B-chat 77.09 77.11 76.96 76.28 71.87 4 | Falcon-7B-instruct 62.81 62.75 62.4 60.32 34.82 5 | Falcon-40B-instruct 76.6 76.6 76.57 75.51 67.11 6 | Falcon-180B-chat 82.22 82.21 82.4 81.59 76.98 7 | ChatGLM3-6B 59.83 59.79 59.41 58.05 51.38 8 | ChatGLM3-6B-32k 66.09 66.04 65.45 63.22 42.86 9 | Mistral-7B-instruct 74.16 74.14 73.34 73.46 61.45 10 | Mixtral-8x7B-instruct 80.43 80.29 80.17 79.24 64.26 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/hellaswag_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 70.28 70.3 68.9 65.47 24.82 2 | LLaMA2-13B-chat 74.32 74.31 73.78 70.94 32.61 3 | LLaMA2-70B-chat 77.09 77.04 76.38 74.37 52.72 4 | Falcon-7B-instruct 62.81 62.74 61.91 59.24 41.44 5 | Falcon-40B-instruct 76.6 76.59 76.65 76.25 27.37 6 | Falcon-180B-chat 82.22 82.18 82.04 79.89 60.75 7 | ChatGLM3-6B 59.83 59.74 58.98 57.58 33.7 8 | ChatGLM3-6B-32k 66.09 66.04 65.67 61.85 33.2 9 | Mistral-7B-instruct 74.16 74.19 73.45 70.75 32.65 10 | Mixtral-8x7B-instruct 80.43 80.4 80.21 77.67 27.72 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/hellaswag_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 70.28 69.66 68.81 30.12 2 | LLaMA2-13B-chat 74.32 74.47 73.58 28.28 3 | LLaMA2-70B-chat 77.09 77.2 76.11 27.39 4 | Falcon-7B-instruct 62.81 62.88 61.77 33.36 5 | Falcon-40B-instruct 76.6 76.5 76.31 25.89 
6 | Falcon-180B-chat 82.22 81.58 81.76 25.71 7 | ChatGLM3-6B 59.83 59.18 58.49 30.02 8 | ChatGLM3-6B-32k 66.09 65.94 65.46 27.39 9 | Mistral-7B-instruct 74.16 73.44 73.21 44.99 10 | Mixtral-8x7B-instruct 80.43 80.34 79.9 33.6 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu-calib_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 56.05/56.33 56.42/56.45 54.64/54.68 51.17/51.37 50.1/50.15 2 | LLaMA2-13B 59.88/59.48 59.82/59.29 61.02/60.66 58.08/57.71 49.96/50.09 3 | LLaMA2-70B 70.26/70.47 70.09/70.57 69.54/69.56 66.87/67.19 50.73/50.61 4 | Falcon-7B 51.59/51.43 51.62/51.54 51.82/51.68 50.24/50.16 49.54/49.59 5 | Falcon-40B 61.59/61.87 61.49/61.43 60.7/60.51 57.13/57.13 49.55/49.65 6 | Falcon-180B 72.87/73.62 73/73.82 72.01/72.55 70.02/70.34 51.67/51.46 7 | ChatGLM3-6B 55.66/55.38 55.32/55.18 55.68/55.43 53.62/53.4 50.13/49.91 8 | Mistral-7B 67.83/67.27 67.55/66.84 67.47/66.79 64.09/63.53 49.97/49.88 9 | Mixtral-8x7B 73.29/72.98 73.25/73.16 71/70.58 62.93/62.95 50.04/50.12 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu-calib_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 56.05/56.33 56.15/56.31 55.29/54.94 50.35/50.47 49.75/49.91 2 | LLaMA2-13B 59.88/59.48 59.9/59.58 60.13/59.6 57.46/57.11 50.06/50.16 3 | LLaMA2-70B 70.26/70.47 69.75/70.08 67.9/68.6 66.81/67.09 49.89/50 4 | Falcon-7B 51.59/51.43 51.89/51.98 52.14/52.12 50.1/50.08 5 | Falcon-40B 61.59/61.87 61.44/61.68 61.13/60.91 53.3/53.31 50/50.01 6 | Falcon-180B 72.87/73.62 72.3/73.57 72.81/73.79 69.59/69.48 53.04/53.13 7 | ChatGLM3-6B 55.66/55.38 55.58/55.21 55.13/54.84 52.08/52.01 50.6/50.57 8 | Mistral-7B 67.83/67.27 67.58/66.98 66.26/65.56 63.64/62.99 49.99/50.03 9 | Mixtral-8x7B 73.29/72.98 73.18/73.45 71.97/71.2 66.85/66.2 50.05/49.81 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu-calib_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 56.05/56.33 56.34/56.32 54.57/54.24 49.67/49.83 2 | LLaMA2-13B 59.88/59.48 59.21/59.06 60.28/59.55 49.94/49.94 3 | LLaMA2-70B 70.26/70.47 70.41/70.53 69/69.07 50.18/50.17 4 | Falcon-7B 51.59/51.43 52.04/51.97 51.91/51.95 49.54/49.5 5 | Falcon-40B 61.59/61.87 60.84/60.82 60.18/60.07 50/50 6 | Falcon-180B 72.87/73.62 72.89/73.54 72.15/73.18 49.66/49.93 7 | ChatGLM3-6B 55.66/55.38 54.91/54.76 55.04/54.71 49.68/49.8 8 | Mistral-7B 67.83/67.27 67.76/67.24 66.03/65.74 49.86/49.55 9 | Mixtral-8x7B 73.29/72.98 70.72/70.5 69.18/68.66 50.68/49.90 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 46.69/45.89 46.64/45.86 46.19/44.98 41.15/40.44 24.98/25.21 2 | LLaMA2-13B 55.81/55.68 55.75/55.6 54.51/54.35 49.96/49.27 25.54/25.49 3 | LLaMA2-70B 69.80/69.13 69.81/69.09 68.97/68.56 67.06/66.17 35.98/36.16 4 | Falcon-7B 27.55/27.11 5 | Falcon-40B 57.03/56 57.04/56.02 55.9/55.31 52.03/50.83 26.96/27.2 6 | Falcon-180B 70.45/70.41 70.5/70.45 69.78/69.74 67.51/66.95 42.83/41.79 7 | ChatGLM3-6B 51.93/50.58 51.95/50.6 51.52/50.11 49.04/47.71 34.64/34.02 8 | ChatGLM3-6B-32k 58.01/57.29 58.08/57.36 57.11/56.15 51.26/50.17
24.44/24.98 9 | Mistral-7B 64.06/62.57 64.1/62.65 62.81/61.33 59.08/57.76 27.36/27.08 10 | Mixtral-8x7B 71.91/70.49 71.63/70.21 68.07/66.98 58.77/57.77 25.02/24.91 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 46.69/45.89 46.92/45.98 45.45/44.91 38.52/37.87 24.23/24.39 2 | LLaMA2-13B 55.81/55.68 55.73/55.69 54.55/54.28 51.07/50.51 25.08/24.94 3 | LLaMA2-70B 69.80/69.13 69.74/69.1 68.49/68.02 65.21/64.69 27.13/26.63 4 | Falcon-7B 27.55/27.11 27.71/27.08 28.6/28.27 28.42/27.71 24.78/24.6 5 | Falcon-40B 57.03/56 57.09/56.02 56.1/55.26 53.7/53.3 23.75/23.84 6 | Falcon-180B 70.45/70.41 70.4/70.42 69.38/69.57 65.91/65.58 37.76/36.83 7 | ChatGLM3-6B 51.93/50.58 51.99/50.58 51.75/50.24 48.91/47.33 25.25/25.11 8 | ChatGLM3-6B-32k 58.01/57.29 58.17/57.35 57.55/56.84 54/52.9 24.41/24.13 9 | Mistral-7B 64.06/62.57 64.07/62.53 62.64/61.15 56.04/54.96 23.08/23.13 10 | Mixtral-8x7B 71.91/70.49 71.93/70.54 70.48/69.1 64.27/63.16 24.91/24.72 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 46.69/45.89 47.05/46.14 45.7/44.7 23.82/23.82 2 | LLaMA2-13B 55.81/55.68 55.77/55.32 54.28/54.06 24.17/24.08 3 | LLaMA2-70B 69.80/69.13 69.55/68.66 67.8/67.12 23.79/23.62 4 | Falcon-7B 27.55/27.11 5 | Falcon-40B 57.03/56 56.3/55.38 55.24/54.42 27.23/27 6 | Falcon-180B 70.45/70.41 70.1/70.05 69.82/69.76 24/23.61 7 | ChatGLM3-6B 51.93/50.58 51.87/50.36 51.16/49.85 24.07/23.99 8 | ChatGLM3-6B-32k 58.01/57.29 57.96/57.09 57.12/56.45 23.88/23.61 9 | Mistral-7B 64.06/62.57 63.73/62.26 62.43/60.89 24.86/24.75 10 | Mixtral-8x7B 71.91/70.49 69.29/68.15 67.35/66.52 24.27/24.14 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/strategyqa_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 64.37 64.59 62.4 60.83 25.76 2 | LLaMA2-13B 67.69 67.95 68.03 66.46 44.59 3 | LLaMA2-70B 77.16 77.38 76.9 76.77 50.48 4 | Falcon-7B 54.32 54.59 54.1 53.97 0.04 5 | Falcon-40B 69.43 69.52 68.56 67.69 50.9 6 | Falcon-180B 75.37 75.33 75.41 75.63 52.23 7 | ChatGLM3-6B 48.86 48.73 50.31 50.31 47.99 8 | ChatGLM3-6B-32k 81.14 81.27 79.87 72.88 28.56 9 | Mistral-7B 67.69 67.51 66.72 63.28 40.66 10 | Mixtral-8x7B 75.76 75.81 74.72 70.48 31.7 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/strategyqa_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 64.37 64.28 61.44 60.39 11.48 2 | LLaMA2-13B 67.69 68.08 66.51 63.76 25.24 3 | LLaMA2-70B 77.16 77.12 75.24 73.14 49.56 4 | Falcon-7B 54.32 54.37 53.23 54.63 9.96 5 | Falcon-40B 69.43 69.48 69.43 66.77 31.92 6 | Falcon-180B 75.37 75.02 75.85 72.1 23.06 7 | ChatGLM3-6B 48.86 51.35 49.34 47.95 1.66 8 | ChatGLM3-6B-32k 81.14 81.09 78.91 71.79 2.62 9 | Mistral-7B 67.69 67.86 66.16 61.05 18.25 10 | Mixtral-8x7B 75.76 75.9 76.07 68.56 1.22 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/strategyqa_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 64.37 
64.59 61.4 2.23 2 | LLaMA2-13B 67.69 68.17 66.55 5.46 3 | LLaMA2-70B 77.16 76.77 75.37 4.41 4 | Falcon-7B 54.32 54.5 54.98 44.8 5 | Falcon-40B 69.43 68.73 69.61 1.18 6 | Falcon-180B 75.37 75.76 75.28 1.12 7 | ChatGLM3-6B 48.86 49.83 48.69 17.25 8 | ChatGLM3-6B-32k 81.14 80.52 79.43 9.91 9 | Mistral-7B 67.69 67.42 65.9 47.51 10 | Mixtral-8x7B 75.76 75.81 74.72 2.71 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/plot_ability.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | modes = ["w", "wa", "kv"] 5 | model_names = ["Falcon-180B"] 6 | datasets = ["mmlu", "arc-c", "arc-e", "hellaswag", "gsm8k", "strategyqa", "mmlu-calib"] 7 | for mode in modes: 8 | for model_name in model_names: 9 | acc = {} 10 | minimum = {"mmlu": 25, "arc-c": 25, "arc-e": 25, "hellaswag": 25, "gsm8k": 0, "strategyqa": 0, "mmlu-calib": 50} 11 | for dataset in datasets: 12 | data_file_name = dataset + "_" + mode + ".txt" 13 | with open("./emergent/" + data_file_name, "r") as f: 14 | while True: 15 | txt = f.readline() 16 | if txt == "": 17 | break 18 | fmodel_name = txt.split("\t")[0] 19 | if "32k" in fmodel_name: 20 | continue 21 | if "instruct" in fmodel_name or "chat" in fmodel_name: 22 | fmodel_name = fmodel_name.replace("-instruct", "") 23 | fmodel_name = fmodel_name.replace("-chat", "") 24 | if fmodel_name != model_name: 25 | continue 26 | acc0 = txt.split("\t")[1:] 27 | for i in range(len(acc0)): 28 | if acc0[i].split("/")[-1] == "" or acc0[i].split("/")[-1] == "\n": 29 | acc0[i] = None 30 | else: 31 | acc0[i] = float(acc0[i].split("/")[-1]) 32 | if i == 0: 33 | std = acc0[i] 34 | acc0[i] = max((acc0[i] - minimum[dataset]) / (std - minimum[dataset]), 0) 35 | acc[dataset] = acc0 36 | ability = {} 37 | # print(acc) 38 | if mode == "w": 39 | x_label = ["FP16", "W8", "W4", "W3", "W2"] 40 | elif mode == "wa": 41 | x_label = ["FP16", "W8A8", "W4A8", "W4A4"] 42 | else: 43 | x_label = ["FP16", "KV8", "KV4", "KV3", "KV2"] 44 | 45 | for i in range(len(x_label)): 46 | ability[x_label[i]] = [] 47 | ability[x_label[i]].append(acc["mmlu"][i]) 48 | ability[x_label[i]].append(acc["strategyqa"][i]) 49 | ability[x_label[i]].append(acc["gsm8k"][i]) 50 | ability[x_label[i]].append(acc["arc-c"][i]) 51 | ability[x_label[i]].append(acc["mmlu-calib"][i]) 52 | ability[x_label[i]].append(acc["mmlu"][i]) 53 | angles = np.linspace(0, 2 * np.pi, 5, endpoint=False) 54 | angles = np.concatenate((angles, [angles[0]])) 55 | angles += np.pi / 10 56 | feature = ["ICL", "C-MR", "M-MR", "IF", "SC", "ICL"] 57 | fig = plt.figure() 58 | ax = fig.add_subplot(111, polar=True) 59 | 60 | for k, v in ability.items(): 61 | ax.plot(angles, v, "o-", linewidth=2, label=k) 62 | ax.fill(angles, v, alpha=0.25) 63 | ax.set_thetagrids(angles * 180 / np.pi, feature, size=15) 64 | plt.legend(loc="upper right", fontsize=13) 65 | plt.savefig("./figure/radarpdf/" + model_name + "_" + mode + ".pdf", bbox_inches="tight") 66 | plt.close() 67 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/litm/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | from matplotlib.lines import Line2D 5 | 6 | # Considering we plot the figures with the GUI mode, the options below should be set mannually and locally in the 7 | # script instead of being 
passed as command args. 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--save_path', type=str, default='./figures/') 11 | parser.add_argument('--save_name', type=str, default=None) 12 | parser.add_argument('--plot_mode', type=str, default='kv', choices=['w', 'wa', 'kv']) 13 | parser.add_argument('--dataset_name', type=str, default='Multi-Doc_QA') 14 | parser.add_argument('--norm', action='store_true') 15 | parser.add_argument("--num_bins", type=int, default=30, help="bin number") 16 | parser.add_argument('--x_label', type=str, default='Position', choices=['Position']) 17 | parser.add_argument('--y_label', type=str, default='Accuracy') 18 | parser.add_argument('--legend_loc', type=str, default='lower left') 19 | args = parser.parse_args() 20 | 21 | 22 | ##################################### Helper Functions ###################################### 23 | def result_pad(raw_results, fp_idx=0): 24 | raw_fp_result = raw_results[fp_idx] 25 | new_results = [x if x is not None else 0 for x in raw_results] 26 | new_results[fp_idx] = raw_fp_result 27 | return new_results 28 | 29 | def normalize_results(raw_results, fp_idx=0): 30 | # raw_results = result_pad(raw_results) 31 | if raw_results[fp_idx] is not None and raw_results[fp_idx] != 0: 32 | norm_results = [i / raw_results[fp_idx] * 100 if i is not None else None for i in raw_results] 33 | print('Result Normalization Succeeded.') 34 | else: 35 | norm_results = raw_results 36 | print('The input results have no FP precision, return original results.') 37 | return norm_results 38 | 39 | MARKERS = ['o', '^', 's', 'd', '+'] 40 | COLORS = ['tab:blue', 'tab:red', 'tab:green', 'tab:orange', 'tab:purple', 'tab:brown'] 41 | MODELS = ["Mistral_7B", "Vicuna_7B"] 42 | 43 | 44 | ###################################### Plot Functions ###################################### 45 | if __name__ == '__main__': 46 | if args.plot_mode == 'kv': 47 | import pos.kv_length_result as data 48 | elif args.plot_mode == 'w': 49 | import pos.w_length_result as data 50 | elif args.plot_mode == 'wa': 51 | import pos.wa_length_result as data 52 | 53 | # create a folder to save the resulting plot. 
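# (an equivalent one-liner, if preferred: os.makedirs(args.save_path, exist_ok=True))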
54 | if not os.path.exists(args.save_path): 55 | os.makedirs(args.save_path) 56 | 57 | # plot name 58 | save_name = args.save_name + '.pdf' if args.save_name is not None else \ 59 | args.dataset_name + '_' + args.plot_mode + '_' + args.x_label + '.pdf' 60 | save_name = args.save_path + save_name 61 | 62 | # init canvas 63 | fig, ax = plt.subplots(figsize=[8., 6.]) 64 | 65 | # make the plot compact 66 | plt.subplots_adjust(left=0.11, right=0.99, top=0.99, bottom=0.11) 67 | 68 | # set figure labels 69 | plt.xlabel(args.x_label, fontsize=20) 70 | plt.ylabel(args.y_label if not args.norm else 'Normalized ' + args.y_label, fontsize=20) 71 | plt.ylim(0.2, 0.75) 72 | 73 | # set axes font size 74 | ax.tick_params(axis='x', labelsize=20) 75 | ax.tick_params(axis='y', labelsize=20) 76 | 77 | x_axis = { 78 | 'Position': ['0', '1', '2', '3', '4', '5', '6'], 79 | }[args.x_label] 80 | 81 | # mode = { 82 | # 'w': ['FP16', 'W8', 'W4', 'W3', 'W2'], 83 | # 'wa': ['FP16', 'W8A8', 'W4A8', 'W4A4'], 84 | # 'kv': ['FP16', 'KV8', 'KV4', 'KV3', 'KV2'], 85 | # }[args.plot_mode] 86 | 87 | mode = { 88 | 'w': ['FP16', 'W8', 'W4', 'W3'], 89 | 'wa': ['FP16', 'W8A8', 'W4A8'], 90 | 'kv': ['FP16', 'KV8', 'KV4', 'KV3'], 91 | }[args.plot_mode] 92 | 93 | for i, model_name in enumerate(MODELS): 94 | for j, _ in enumerate(mode): 95 | # ax.plot(x_axis, eval("data.{}".format(model_name))[j], marker=MARKERS[j], markersize=13, label=model_name + '-' + mode[j], linestyle="-", color=COLORS[i]) 96 | y_value = eval("data.{}".format(model_name))[j] 97 | ax.plot(x_axis[:len(y_value)], y_value, marker=MARKERS[j], markersize=13, linestyle="-", color=COLORS[i]) 98 | 99 | # set legend location 100 | # ax.legend(loc=args.legend_loc) 101 | legend_elements = [] 102 | for i, _ in enumerate(mode): 103 | legend_elements.append(Line2D([0], [0], marker=MARKERS[i], color='w', label=mode[i], markerfacecolor='tab:brown', markersize=16)) 104 | ax.legend(handles=legend_elements, fontsize=20) 105 | 106 | # for i, _ in enumerate(MODELS): 107 | # legend_elements.append(Line2D([0], [0], color=COLORS[i], label=MODELS[i], linewidth=4)) 108 | # ax.legend(handles=legend_elements, fontsize=20) 109 | 110 | # Warning: please do not change the figure you are previewing 111 | # preview the figure 112 | plt.savefig(save_name) 113 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/litm/pos/kv_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_7B = [[0.6467043314500942, 0.4952919020715631, 0.4768361581920904, 0.4583804143126177, 0.44670433145009414, 0.45649717514124294, 0.5028248587570622], [0.6429378531073446, 0.49642184557438795, 0.4726930320150659, 0.4553672316384181, 0.4542372881355932, 0.45800376647834273, 0.503578154425612], [0.6071563088512241, 0.487758945386064, 0.46440677966101696, 0.45084745762711864, 0.4369114877589454, 0.44143126177024483, 0.48851224105461394], [0.4512241054613936, 0.3630885122410546, 0.34199623352165726, 0.34764595103578155, 0.3694915254237288, 0.36271186440677966, 0.4056497175141243]] 2 | LongChat_13B = [[0.6700564971751413, 0.5480225988700564, 0.5276836158192091, 0.5291902071563088, 0.5205273069679849, 0.5133709981167608, 0.5506591337099812], [0.6625235404896422, 0.5532956685499059, 0.5318267419962335, 0.5250470809792843, 0.5212806026365349, 0.5133709981167608, 0.5427495291902071], [0.6557438794726931, 0.5318267419962335, 0.5205273069679849, 0.5129943502824859, 0.5096045197740113, 0.4919020715630885, 
0.5299435028248588], [0.5811676082862524, 0.439924670433145, 0.423728813559322, 0.4233521657250471, 0.4263653483992467, 0.4248587570621469, 0.43615819209039547]] 3 | Vicuna_7B = [[0.6749529190207156, 0.5585687382297552, 0.4726930320150659, 0.43954802259887005, 0.4421845574387947, 0.43578154425612053, 0.47871939736346514], [0.6817325800376648, 0.5555555555555556, 0.47532956685499056, 0.4384180790960452, 0.4425612052730697, 0.4354048964218456, 0.47796610169491527], [0.6429378531073446, 0.5065913370998116, 0.4568738229755179, 0.4256120527306968, 0.42259887005649716, 0.41883239171374764, 0.4689265536723164], [0.49227871939736345, 0.38267419962335214, 0.2998116760828625, 0.29303201506591336, 0.3013182674199623, 0.2568738229755179, 0.32354048964218457]] 4 | Vicuna_13B = [[0.6896421845574388, 0.5013182674199623, 0.4463276836158192, 0.42259887005649716, 0.4256120527306968, 0.4312617702448211, 0.4384180790960452], [0.688512241054614, 0.5020715630885122, 0.44745762711864406, 0.4256120527306968, 0.43163841807909603, 0.4312617702448211, 0.44143126177024483], [0.6700564971751413, 0.5114877589453861, 0.4651600753295669, 0.44030131826741997, 0.43088512241054616, 0.4350282485875706, 0.4497175141242938], [0.5329566854990584, 0.3728813559322034, 0.351789077212806, 0.34048964218455746, 0.3480225988700565, 0.33634651600753296, 0.3951035781544256]] 5 | ChatGLM2_6B = [[0.4847457627118644, 0.30922787193973633, 0.31186440677966104, 0.2941619585687382, 0.29152542372881357, 0.3065913370998117, 0.4011299435028249], [0.48436911487758944, 0.30772128060263654, 0.3047080979284369, 0.2903954802259887, 0.30282485875706217, 0.31148775894538605, 0.39623352165725045], [0.47005649717514125, 0.2885122410546139, 0.2952919020715631, 0.2847457627118644, 0.2768361581920904, 0.2760828625235405, 0.37589453860640304], [0.3871939736346516, 0.23126177024482109, 0.2519774011299435, 0.2384180790960452, 0.2192090395480226, 0.21242937853107344, 0.30998116760828626]] 6 | ChatGLM3_6B = [[0.5966101694915255, 0.4915254237288136, 0.4847457627118644, 0.5020715630885122, 0.45461393596986815, 0.5009416195856874, 0.5235404896421846], [0.5902071563088512, 0.49566854990583803, 0.4888888888888889, 0.49491525423728816, 0.46365348399246703, 0.5069679849340867, 0.5209039548022599], [0.5834274952919021, 0.4873822975517891, 0.4806026365348399, 0.47909604519774013, 0.4591337099811676, 0.5001883239171375, 0.5152542372881356], [0.4651600753295669, 0.35555555555555557, 0.3465160075329567, 0.35480225988700564, 0.32919020715630887, 0.36610169491525424, 0.37928436911487756]] 7 | Mistral_7B = [[0.6022598870056497, 0.5951035781544256, 0.6146892655367232, 0.639924670433145, 0.6365348399246704, 0.6414312617702448, 0.6602636534839925], [0.5969868173258004, 0.5898305084745763, 0.6143126177024482, 0.639924670433145, 0.64030131826742, 0.639924670433145, 0.6606403013182675], [0.5947269303201507, 0.5792843691148776, 0.5992467043314501, 0.6369114877589453, 0.6380414312617703, 0.6391713747645951, 0.6538606403013183], [0.5370998116760829, 0.46779661016949153, 0.4975517890772128, 0.5329566854990584, 0.5370998116760829, 0.5664783427495292, 0.5807909604519774]] 8 | Mixtral_8x7B = [[0.6489642184557439, 0.6519774011299435, 0.6704331450094162, 0.6753295668549906, 0.664783427495292, 0.6613935969868173, 0.7239171374764595], [0.6467043314500942, 0.655367231638418, 0.6775894538606403, 0.672316384180791, 0.6595103578154425, 0.663653483992467, 0.7224105461393597], [0.6595103578154425, 0.6451977401129944, 0.6538606403013183, 0.6621468926553672, 0.6632768361581921, 0.6583804143126177, 
0.7096045197740113], [0.5875706214689266, 0.5653483992467043, 0.567984934086629, 0.5770244821092279, 0.5642184557438795, 0.592467043314501, 0.6587570621468927]] 9 | LLaMA2_70B = [[0.5789077212806026, 0.5276836158192091, 0.5393596986817326, 0.5883239171374764, 0.695668549905838], [0.5819209039548022, 0.5205273069679849, 0.5389830508474577, 0.5883239171374764, 0.696045197740113], [0.5725047080979284, 0.5295668549905838, 0.5386064030131826, 0.5905838041431262, 0.6934086629001883], [0.5269303201506591, 0.4990583804143126, 0.5269303201506591, 0.5774011299435028, 0.6677966101694915]] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/litm/pos/w_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_7B = [[0.6467043314500942, 0.4952919020715631, 0.4768361581920904, 0.4583804143126177, 0.44670433145009414, 0.45649717514124294, 0.5028248587570622], [0.6440677966101694, 0.4967984934086629, 0.47871939736346514, 0.4610169491525424, 0.4512241054613936, 0.4632768361581921, 0.503578154425612], [0.6165725047080979, 0.46440677966101696, 0.45725047080979286, 0.4444444444444444, 0.44105461393596984, 0.432015065913371, 0.4463276836158192], [0.40941619585687383, 0.30772128060263654, 0.33973634651600754, 0.3344632768361582, 0.32693032015065915, 0.3167608286252354, 0.36271186440677966]] 2 | LongChat_13B = [[0.6700564971751413, 0.5480225988700564, 0.5276836158192091, 0.5291902071563088, 0.5205273069679849, 0.5133709981167608, 0.5506591337099812], [0.6741996233521658, 0.5435028248587571, 0.5310734463276836, 0.5242937853107345, 0.5193973634651601, 0.5163841807909605, 0.5510357815442561], [0.6666666666666666, 0.5276836158192091, 0.5054613935969868, 0.49227871939736345, 0.5005649717514125, 0.49416195856873824, 0.5291902071563088], [0.6060263653483993, 0.43427495291902074, 0.4214689265536723, 0.423728813559322, 0.4346516007532957, 0.4312617702448211, 0.40414312617702447]] 3 | Vicuna_7B = [[0.6749529190207156, 0.5585687382297552, 0.4726930320150659, 0.43954802259887005, 0.4421845574387947, 0.43578154425612053, 0.47871939736346514], [0.6813559322033899, 0.5574387947269304, 0.47306967984934084, 0.4429378531073446, 0.44105461393596984, 0.4327683615819209, 0.48361581920903957], [0.6387947269303201, 0.5322033898305085, 0.4674199623352166, 0.4463276836158192, 0.44708097928436913, 0.4534839924670433, 0.4937853107344633], [0.5570621468926553, 0.42297551789077215, 0.392090395480226, 0.37175141242937854, 0.36610169491525424, 0.3694915254237288, 0.3559322033898305]] 4 | Vicuna_13B = [[0.6896421845574388, 0.5013182674199623, 0.4463276836158192, 0.42259887005649716, 0.4256120527306968, 0.4312617702448211, 0.4384180790960452], [0.6836158192090396, 0.5005649717514125, 0.4485875706214689, 0.42674199623352166, 0.42674199623352166, 0.4331450094161959, 0.43804143126177025], [0.6937853107344633, 0.5382297551789077, 0.46817325800376647, 0.4335216572504708, 0.4274952919020716, 0.44143126177024483, 0.4527306967984934], [0.5638418079096045, 0.46854990583804146, 0.44595103578154427, 0.4290018832391714, 0.4105461393596987, 0.42109227871939736, 0.44745762711864406]] 5 | ChatGLM2_6B = [[0.4847457627118644, 0.30922787193973633, 0.31186440677966104, 0.2941619585687382, 0.29152542372881357, 0.3065913370998117, 0.4011299435028249], [0.4858757062146893, 0.3088512241054614, 0.30583804143126175, 0.29792843691148774, 0.29303201506591336, 0.30847457627118646, 0.3935969868173258], [0.4854990583804143, 0.303954802259887, 
0.2858757062146893, 0.28210922787193976, 0.3013182674199623, 0.3160075329566855, 0.41883239171374764], [0.41506591337099813, 0.25348399246704334, 0.24180790960451978, 0.232015065913371, 0.24858757062146894, 0.2749529190207156, 0.367984934086629]] 6 | ChatGLM3_6B = [[0.5966101694915255, 0.4915254237288136, 0.4847457627118644, 0.5020715630885122, 0.45461393596986815, 0.5009416195856874, 0.5235404896421846], [0.60075329566855, 0.49566854990583803, 0.48512241054613936, 0.4915254237288136, 0.47532956685499056, 0.5096045197740113, 0.5231638418079096], [0.5849340866290019, 0.4839924670433145, 0.456120527306968, 0.46779661016949153, 0.45310734463276836, 0.47721280602636534, 0.5009416195856874], [0.5261770244821092, 0.4233521657250471, 0.416195856873823, 0.42523540489642186, 0.42862523540489644, 0.455743879472693, 0.480225988700565]] 7 | Mistral_7B = [[0.6022598870056497, 0.5951035781544256, 0.6146892655367232, 0.639924670433145, 0.6365348399246704, 0.6414312617702448, 0.6602636534839925], [0.5977401129943503, 0.5951035781544256, 0.6131826741996234, 0.6444444444444445, 0.6414312617702448, 0.6380414312617703, 0.6598870056497175], [0.5728813559322034, 0.5630885122410546, 0.5894538606403014, 0.6203389830508474, 0.6109227871939736, 0.6120527306967984, 0.6621468926553672], [0.5005649717514125, 0.5137476459510358, 0.5397363465160075, 0.5777777777777777, 0.5917137476459511, 0.5894538606403014, 0.6188323917137476]] 8 | Mixtral_8x7B = [[0.6489642184557439, 0.6519774011299435, 0.6704331450094162, 0.6753295668549906, 0.664783427495292, 0.6613935969868173, 0.7239171374764595], [0.6516007532956686, 0.6508474576271186, 0.6700564971751413, 0.672316384180791, 0.6685499058380414, 0.6659133709981168, 0.7182674199623352], [0.6278719397363465, 0.6380414312617703, 0.6583804143126177, 0.6681732580037665, 0.6467043314500942, 0.6523540489642184, 0.7250470809792844], [0.6030131826741997, 0.5834274952919021, 0.6022598870056497, 0.6165725047080979, 0.6116760828625235, 0.6222222222222222, 0.672316384180791]] 9 | LLaMA2_70B = [[0.5789077212806026, 0.5276836158192091, 0.5393596986817326, 0.5883239171374764, 0.695668549905838], [0.5807909604519774, 0.528060263653484, 0.5419962335216573, 0.591337099811676, 0.6949152542372882], [0.551789077212806, 0.5190207156308851, 0.5340866290018832, 0.5856873822975518, 0.6937853107344633], [0.5043314500941619, 0.46817325800376647, 0.48851224105461394, 0.5013182674199623, 0.6444444444444445]] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/litm/pos/wa_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_7B = [[0.6467043314500942, 0.4952919020715631, 0.4768361581920904, 0.4583804143126177, 0.44670433145009414, 0.45649717514124294, 0.5028248587570622], [0.6459510357815442, 0.49227871939736345, 0.47721280602636534, 0.45084745762711864, 0.4346516007532957, 0.4485875706214689, 0.49830508474576274], [0.6146892655367232, 0.4711864406779661, 0.46365348399246703, 0.4519774011299435, 0.45160075329566857, 0.44030131826741997, 0.4598870056497175]] 2 | Vicuna_7B = [[0.6749529190207156, 0.5585687382297552, 0.4726930320150659, 0.43954802259887005, 0.4421845574387947, 0.43578154425612053, 0.47871939736346514], [0.6760828625235404, 0.5555555555555556, 0.4741996233521657, 0.4406779661016949, 0.4354048964218456, 0.43728813559322033, 0.4768361581920904], [0.6282485875706215, 0.5269303201506591, 0.4610169491525424, 0.43163841807909603, 0.44595103578154427, 0.448210922787194, 
0.4817325800376648]] 3 | Vicuna_13B = [[0.6896421845574388, 0.5013182674199623, 0.4463276836158192, 0.42259887005649716, 0.4256120527306968, 0.4312617702448211, 0.4384180790960452], [0.6952919020715631, 0.511864406779661, 0.46854990583804146, 0.4354048964218456, 0.43653483992467046, 0.44519774011299434, 0.4576271186440678], [0.6934086629001883, 0.5374764595103578, 0.4704331450094162, 0.4425612052730697, 0.4331450094161959, 0.45235404896421844, 0.4519774011299435]] 4 | ChatGLM2_6B = [[0.4847457627118644, 0.30922787193973633, 0.31186440677966104, 0.2941619585687382, 0.29152542372881357, 0.3065913370998117, 0.4011299435028249], [0.4711864406779661, 0.3129943502824859, 0.30583804143126175, 0.28775894538606406, 0.296045197740113, 0.3016949152542373, 0.4011299435028249], [0.4821092278719397, 0.2994350282485876, 0.2952919020715631, 0.28210922787193976, 0.2983050847457627, 0.319774011299435, 0.4143126177024482]] 5 | ChatGLM3_6B = [[0.5966101694915255, 0.4915254237288136, 0.4847457627118644, 0.5020715630885122, 0.45461393596986815, 0.5009416195856874, 0.5235404896421846], [0.5947269303201507, 0.5016949152542373, 0.4749529190207156, 0.4817325800376648, 0.45875706214689266, 0.5028248587570622, 0.5212806026365349], [0.5958568738229755, 0.4817325800376648, 0.46026365348399245, 0.4775894538606403, 0.4429378531073446, 0.4760828625235405, 0.4986817325800377]] 6 | Mistral_7B = [[0.6022598870056497, 0.5951035781544256, 0.6146892655367232, 0.639924670433145, 0.6365348399246704, 0.6414312617702448, 0.6602636534839925], [0.5981167608286252, 0.5856873822975518, 0.6210922787193973, 0.6474576271186441, 0.6425612052730697, 0.63954802259887, 0.663653483992467], [0.5709981167608287, 0.5634651600753295, 0.5935969868173258, 0.616195856873823, 0.6116760828625235, 0.6154425612052731, 0.6527306967984934]] 7 | Mixtral_8x7B = [[0.6489642184557439, 0.6519774011299435, 0.6704331450094162, 0.6753295668549906, 0.664783427495292, 0.6613935969868173, 0.7239171374764595], [0.6448210922787194, 0.6610169491525424, 0.6662900188323917, 0.6693032015065913, 0.6651600753295669, 0.6591337099811676, 0.7227871939736347], [0.6278719397363465, 0.6308851224105462, 0.6538606403013183, 0.6651600753295669, 0.6444444444444445, 0.647834274952919, 0.7182674199623352]] 8 | LLaMA2_70B = [[0.5789077212806026, 0.5276836158192091, 0.5393596986817326, 0.5883239171374764, 0.695668549905838], [0.5672316384180791, 0.5163841807909605, 0.5333333333333333, 0.5871939736346516, 0.6839924670433145], [0.544256120527307, 0.5020715630885122, 0.5261770244821092, 0.5702448210922787, 0.6903954802259887]] 9 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/length/kv_length_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM2_6B = [[0.952, 0.956, 0.896, 0.9, 0.882, 0.854], [0.954, 0.954, 0.896, 0.896, 0.882, 0.856], [0.936, 0.928, 0.868, 0.872, 0.814, 0.822], [0.478, 0.412, 0.410, 0.310, 0.240, 0.222], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 2 | ChatGLM3_6B = [[0.476, 0.284, 0.614, 0.854, 0.824, 0.85], [0.484, 0.288, 0.628, 0.864, 0.826, 0.864], [0.03, 0.024, 0.104, 0.05, 0.120, 0.206], [0.012, 0.024, 0.008, 0.022, 0.038, 0.014], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 3 | LongChat_13B = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.966, 0.920, 0.630, 0.556, 0.456, 0.342], [0.910, 0.866, 0.552, 0.458, 0.336, 0.256], [0.518, 0.398, 0.164, 0.062, 0.032, 0.028], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 4 | LongChat_7B = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], [0.946, 
0.866, 0.686, 0.518, 0.396, 0.184], [0.846, 0.738, 0.522, 0.348, 0.23, 0.106], [0.242, 0.074, 0.064, 0.020, 0.01, 0.002], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 5 | Vicuna_7B = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.988, 0.964, 0.902, 0.73, 0.536], [0.964, 0.964, 0.904, 0.754, 0.55, 0.36], [0.300, 0.242, 0.070, 0.04, 0.01, 0.004], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 6 | Vicuna_13B = [[0.99, 0.974, 0.956, 0.898, 0.672, 0.416], [0.992, 0.974, 0.96, 0.896, 0.668, 0.394], [0.962, 0.96, 0.910, 0.804, 0.564, 0.312], [0.500, 0.296, 0.164, 0.042, 0.064, 0.002], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 7 | Mixtral_8x7B = [[1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [1.0, 0.998, 1.0, 0.996, 0.998, 1.0], [0.998, 0.996, 0.994, 0.994, 0.982, 0.982], [0.692, 0.644, 0.602, 0.544, 0.46, 0.452], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 8 | Mistral_7B = [[0.998, 0.994, 0.994, 0.978, 0.97, 0.948], [0.998, 0.994, 0.994, 0.958, 0.954, 0.934], [0.996, 0.982, 0.96, 0.928, 0.914, 0.904], [0.784, 0.712, 0.628, 0.542, 0.5, 0.406], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 9 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/length/w_length_result.py: -------------------------------------------------------------------------------- 1 | # ChatGLM3_6B = [[0.476, 0.284, 0.614, 0.854, 0.824, 0.85], [0.45, 0.27, 0.602, 0.832, 0.816, 0.856], [0.874, 0.77, 0.908, 0.928, 0.93, 0.908], [0.816, 0.42, 0.794, 0.768, 0.868, 0.852], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 2 | ChatGLM2_6B = [[0.952, 0.956, 0.896, 0.9, 0.882, 0.854], [0.95, 0.95, 0.896, 0.89, 0.882, 0.854], [0.978, 0.968, 0.91, 0.906, 0.88, 0.86], [0.904, 0.88, 0.83, 0.774, 0.752, 0.676], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 3 | LongChat_13B = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.964, 0.924, 0.888, 0.88, 0.778, 0.722], [0.966, 0.898, 0.832, 0.83, 0.708, 0.608], [0.806, 0.786, 0.702, 0.602, 0.45, 0.392], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 4 | LongChat_7B = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], [0.946, 0.874, 0.798, 0.648, 0.588, 0.384], [0.924, 0.808, 0.724, 0.654, 0.44, 0.286], [0.676, 0.54, 0.422, 0.292, 0.16, 0.082], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 5 | Vicuna_7B = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.988, 0.968, 0.91, 0.74, 0.578], [0.996, 0.992, 0.98, 0.902, 0.754, 0.614], [0.928, 0.838, 0.66, 0.524, 0.392, 0.304], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 6 | Vicuna_13B = [[0.99, 0.974, 0.956, 0.898, 0.672, 0.416], [0.992, 0.974, 0.96, 0.904, 0.668, 0.428], [0.976, 0.95, 0.902, 0.834, 0.67, 0.348], [0.674, 0.864, 0.84, 0.642, 0.482, 0.282], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 7 | Mixtral_8x7B = [[1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [1.0, 0.998, 1.0, 0.996, 0.998, 1.0], [1.0, 1.0, 0.998, 0.998, 0.998, 0.996], [0.954, 0.93, 0.914, 0.846, 0.828, 0.82], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 8 | Mistral_7B = [[0.998, 0.994, 0.994, 0.978, 0.97, 0.948], [0.998, 0.994, 0.994, 0.974, 0.968, 0.944], [0.996, 0.998, 0.988, 0.978, 0.954, 0.922], [0.988, 0.944, 0.91, 0.852, 0.78, 0.76], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 9 | 10 | ChatGLM2_6B_AWQ = [[0.952, 0.956, 0.896, 0.9, 0.882, 0.854], [0.952, 0.954, 0.896, 0.896, 0.882, 0.856], [0.94, 0.932, 0.872, 0.86, 0.862, 0.822], [0.92, 0.91, 0.848, 0.798, 0.808, 0.758], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 11 | LongChat_13B_AWQ = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.966, 0.924, 0.884, 0.878, 0.772, 0.724], [0.976, 0.914, 0.864, 0.83, 0.714, 0.61], [0.928, 0.876, 0.778, 0.7, 0.556, 0.438], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 12 | LongChat_7B_AWQ = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], 
[0.946, 0.874, 0.794, 0.656, 0.594, 0.388], [0.918, 0.868, 0.786, 0.684, 0.562, 0.372], [0.77, 0.604, 0.5, 0.476, 0.326, 0.264], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 13 | Vicuna_7B_AWQ = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.99, 0.964, 0.912, 0.736, 0.582], [0.998, 0.992, 0.964, 0.892, 0.71, 0.578], [0.974, 0.860, 0.904, 0.772, 0.57, 0.412], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 14 | Vicuna_13B_AWQ = [[0.99, 0.974, 0.956, 0.898, 0.672, 0.416], [0.992, 0.974, 0.956, 0.896, 0.67, 0.408], [0.986, 0.966, 0.944, 0.852, 0.58, 0.33], [0.948, 0.918, 0.714, 0.528, 0.36, 0.164], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 15 | Mixtral_8x7B_AWQ = [[1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [0.998, 0.998, 1.0, 0.998, 0.994, 0.996], [0.968, 0.95, 0.882, 0.924, 0.886, 0.902], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 16 | Mistral_7B_AWQ = [[0.998, 0.994, 0.994, 0.978, 0.97, 0.948], [0.998, 0.994, 0.994, 0.98, 0.97, 0.948], [0.998, 0.998, 0.988, 0.964, 0.954, 0.926], [0.996, 0.996, 0.966, 0.942, 0.926, 0.87], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 17 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/length/wa_length_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM2_6B = [[0.952, 0.956, 0.896, 0.9, 0.882, 0.854], [0.972, 0.964, 0.916, 0.904, 0.888, 0.864], [0.972, 0.96, 0.912, 0.918, 0.894, 0.866], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 2 | ChatGLM3_6B = [[0.476, 0.284, 0.614, 0.854, 0.824, 0.85], [0.74, 0.64, 0.82, 0.908, 0.808, 0.86], [0.894, 0.896, 0.912, 0.922, 0.914, 0.902], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 3 | LongChat_13B = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.974, 0.92, 0.904, 0.902, 0.794, 0.708], [0.964, 0.862, 0.798, 0.802, 0.71, 0.618], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 4 | LongChat_7B = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], [0.956, 0.884, 0.824, 0.75, 0.618, 0.448], [0.93, 0.848, 0.776, 0.61, 0.42, 0.308], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 5 | Vicuna_7B = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.99, 0.972, 0.908, 0.72, 0.586], [0.996, 0.996, 0.966, 0.888, 0.696, 0.586], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 6 | Vicuna_13B = [[0.99, 0.974, 0.956, 0.898, 0.672, 0.416], [0.99, 0.974, 0.95, 0.906, 0.716, 0.462], [0.978, 0.948, 0.882, 0.862, 0.696, 0.446], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 7 | Mixtral_8x7B = [[1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [1.0, 0.996, 1.0, 0.994, 0.994, 0.996], [1.0, 0.998, 0.998, 1.0, 0.998, 0.996], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 8 | Mistral_7B = [[0.998, 0.994, 0.994, 0.978, 0.97, 0.948], [0.998, 0.996, 0.996, 0.978, 0.954, 0.932], [0.998, 0.994, 0.99, 0.96, 0.944, 0.908], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 9 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/length/wkv_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_13B = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.968, 0.924, 0.618, 0.562, 0.456, 0.342], [0.962, 0.904, 0.63, 0.566, 0.494, 0.378], [0.924, 0.872, 0.55, 0.466, 0.374, 0.248], [0.91, 0.836, 0.552, 0.44, 0.36, 0.266]] 2 | LongChat_7B = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], [0.946, 0.872, 0.682, 0.506, 0.394, 0.18], [0.912, 0.814, 0.7, 0.544, 0.338, 0.198], [0.83, 0.752, 0.52, 0.332, 0.19, 0.102], [0.81, 0.664, 0.534, 0.326, 0.198, 0.108]] 3 | Vicuna_7B = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.988, 0.966, 0.898, 0.72, 0.544], [0.996, 0.994, 0.974, 0.904, 
0.73, 0.596], [0.964, 0.966, 0.888, 0.744, 0.554, 0.372], [0.964, 0.952, 0.872, 0.788, 0.586, 0.37]] 4 | Vicuna_13B = [[0.990, 0.974, 0.956, 0.898, 0.672, 0.416], [0.992, 0.976, 0.96, 0.894, 0.678, 0.408], [0.978, 0.946, 0.91, 0.83, 0.632, 0.364], [0.966, 0.958, 0.904, 0.798, 0.554, 0.336], [0.942, 0.91, 0.682, 0.732, 0.508, 0.29]] 5 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/pos/wa_length_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM2_6B = [[0.8125, 0.9661016949152542, 0.8055555555555556, 0.875, 0.6825396825396826, 0.6585365853658537, 0.8771929824561403, 0.9056603773584906, 0.9818181818181818, 0.9375], [0.8333333333333334, 0.9491525423728814, 0.8333333333333334, 0.875, 0.6984126984126984, 0.7560975609756098, 0.8421052631578947, 0.9245283018867925, 0.9818181818181818, 0.9375], [0.8958333333333334, 0.9661016949152542, 0.8888888888888888, 0.85, 0.7936507936507936, 0.5609756097560976, 0.8596491228070176, 0.8867924528301887, 0.9636363636363636, 0.9375], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 2 | ChatGLM3_6B = [[0.8125, 0.6949152542372882, 0.8055555555555556, 0.925, 0.7936507936507936, 0.8292682926829268, 0.8421052631578947, 0.8867924528301887, 1.0, 0.9375], [0.9166666666666666, 0.7627118644067796, 0.6944444444444444, 0.925, 0.8253968253968254, 0.8292682926829268, 0.8771929824561403, 0.8679245283018868, 0.9454545454545454, 0.9375], [0.9375, 0.8813559322033898, 0.8888888888888888, 0.95, 0.8571428571428571, 0.8780487804878049, 0.9298245614035088, 0.9245283018867925, 0.9090909090909091, 0.875], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 3 | LongChat_13B = [[0.3333333333333333, 0.423728813559322, 0.6111111111111112, 0.675, 0.7777777777777778, 0.8048780487804879, 0.7719298245614035, 0.8867924528301887, 0.9272727272727272, 0.9375], [0.3541666666666667, 0.3389830508474576, 0.5833333333333334, 0.725, 0.7619047619047619, 0.7804878048780488, 0.8070175438596491, 0.8867924528301887, 0.8909090909090909, 0.9375], [0.3333333333333333, 0.2033898305084746, 0.5, 0.625, 0.6349206349206349, 0.5853658536585366, 0.7368421052631579, 0.7735849056603774, 0.9090909090909091, 0.8541666666666666], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 4 | LongChat_7B = [[0.125, 0.22033898305084745, 0.4444444444444444, 0.5, 0.5396825396825397, 0.3170731707317073, 0.3157894736842105, 0.41509433962264153, 0.5636363636363636, 0.5416666666666666], [0.125, 0.1864406779661017, 0.4444444444444444, 0.65, 0.5396825396825397, 0.3902439024390244, 0.45614035087719296, 0.49056603773584906, 0.6181818181818182, 0.6041666666666666], [0.16666666666666666, 0.06779661016949153, 0.2777777777777778, 0.475, 0.31746031746031744, 0.2926829268292683, 0.3157894736842105, 0.3584905660377358, 0.45454545454545453, 0.3958333333333333], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 5 | Vicuna_7B = [[0.10416666666666667, 0.2711864406779661, 0.3055555555555556, 0.7, 0.49206349206349204, 0.3902439024390244, 0.6842105263157895, 0.8301886792452831, 0.9636363636363636, 0.9583333333333334], [0.14583333333333334, 0.2542372881355932, 0.3888888888888889, 0.7, 0.49206349206349204, 0.4146341463414634, 0.6842105263157895, 0.8679245283018868, 0.9090909090909091, 0.9583333333333334], [0.0625, 0.2711864406779661, 0.3333333333333333, 0.525, 0.5873015873015873, 0.4878048780487805, 0.7368421052631579, 0.8679245283018868, 0.9272727272727272, 0.9375], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 6 | 
Vicuna_13B = [[0.16666666666666666, 0.0847457627118644, 0.05555555555555555, 0.325, 0.3333333333333333, 0.1951219512195122, 0.543859649122807, 0.6415094339622641, 0.8181818181818182, 0.8541666666666666], [0.22916666666666666, 0.11864406779661017, 0.08333333333333333, 0.275, 0.2698412698412698, 0.4634146341463415, 0.6140350877192983, 0.7358490566037735, 0.8363636363636363, 0.8958333333333334], [0.25, 0.11864406779661017, 0.05555555555555555, 0.475, 0.3968253968253968, 0.21951219512195122, 0.5263157894736842, 0.6226415094339622, 0.7818181818181819, 0.8958333333333334], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 7 | Mixtral_8x7B = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 0.9722222222222222, 1.0, 1.0, 1.0, 1.0, 0.9811320754716981, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9824561403508771, 1.0, 0.9818181818181818, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 8 | Mistral_7B = [[0.9791666666666666, 1.0, 1.0, 0.975, 0.9047619047619048, 0.8292682926829268, 0.8596491228070176, 0.9433962264150944, 1.0, 1.0], [1.0, 1.0, 1.0, 0.975, 0.8412698412698413, 0.7804878048780488, 0.8245614035087719, 0.9245283018867925, 1.0, 1.0], [0.9791666666666666, 0.9830508474576272, 0.9444444444444444, 0.925, 0.7936507936507936, 0.7073170731707317, 0.7894736842105263, 0.9811320754716981, 0.9818181818181818, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 9 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/pos/wkv_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_13B = [[0.3333333333333333, 0.423728813559322, 0.6111111111111112, 0.675, 0.7777777777777778, 0.8048780487804879, 0.7719298245614035, 0.8867924528301887, 0.9272727272727272, 0.9375], [0.22916666666666666, 0.1694915254237288, 0.19444444444444445, 0.55, 0.4126984126984127, 0.2682926829268293, 0.3157894736842105, 0.2830188679245283, 0.45454545454545453, 0.5416666666666666], [0.25, 0.11864406779661017, 0.25, 0.45, 0.47619047619047616, 0.21951219512195122, 0.38596491228070173, 0.4339622641509434, 0.5636363636363636, 0.5833333333333334], [0.1875, 0.1016949152542373, 0.0, 0.3, 0.38095238095238093, 0.24390243902439024, 0.21052631578947367, 0.18867924528301888, 0.38181818181818183, 0.4166666666666667], [0.20833333333333334, 0.0847457627118644, 0.1111111111111111, 0.25, 0.3333333333333333, 0.1951219512195122, 0.24561403508771928, 0.22641509433962265, 0.34545454545454546, 0.625]] 2 | LongChat_7B = [[0.125, 0.22033898305084745, 0.4444444444444444, 0.5, 0.5396825396825397, 0.3170731707317073, 0.3157894736842105, 0.41509433962264153, 0.5636363636363636, 0.5416666666666666], [0.08333333333333333, 0.11864406779661017, 0.2222222222222222, 0.25, 0.23809523809523808, 0.1951219512195122, 0.12280701754385964, 0.1509433962264151, 0.2, 0.25], [0.1875, 0.05084745762711865, 0.1111111111111111, 0.325, 0.2698412698412698, 0.17073170731707318, 0.10526315789473684, 0.18867924528301888, 0.2727272727272727, 0.3125], [0.0625, 
0.01694915254237288, 0.1111111111111111, 0.15, 0.1111111111111111, 0.0975609756097561, 0.07017543859649122, 0.09433962264150944, 0.18181818181818182, 0.14583333333333334], [0.10416666666666667, 0.03389830508474576, 0.05555555555555555, 0.05, 0.14285714285714285, 0.07317073170731707, 0.08771929824561403, 0.018867924528301886, 0.2727272727272727, 0.20833333333333334]] 3 | Vicuna_7B = [[0.10416666666666667, 0.2711864406779661, 0.3055555555555556, 0.7, 0.49206349206349204, 0.3902439024390244, 0.6842105263157895, 0.8301886792452831, 0.9636363636363636, 0.9583333333333334], [0.10416666666666667, 0.2542372881355932, 0.3333333333333333, 0.6, 0.3968253968253968, 0.4146341463414634, 0.6140350877192983, 0.8679245283018868, 0.8909090909090909, 0.9166666666666666], [0.10416666666666667, 0.23728813559322035, 0.4444444444444444, 0.525, 0.6031746031746031, 0.5609756097560976, 0.6842105263157895, 0.9245283018867925, 0.8909090909090909, 0.9166666666666666], [0.020833333333333332, 0.13559322033898305, 0.2777777777777778, 0.425, 0.36507936507936506, 0.14634146341463414, 0.40350877192982454, 0.5094339622641509, 0.6909090909090909, 0.6875], [0.041666666666666664, 0.1016949152542373, 0.2222222222222222, 0.3, 0.2698412698412698, 0.24390243902439024, 0.3508771929824561, 0.5471698113207547, 0.7636363636363637, 0.8125]] 4 | Vicuna_13B = [[0.16666666666666666, 0.0847457627118644, 0.05555555555555555, 0.325, 0.3333333333333333, 0.1951219512195122, 0.543859649122807, 0.6415094339622641, 0.8181818181818182, 0.8541666666666666], [0.1875, 0.1016949152542373, 0.08333333333333333, 0.325, 0.19047619047619047, 0.1951219512195122, 0.5087719298245614, 0.7169811320754716, 0.7818181818181819, 0.8958333333333334], [0.125, 0.05084745762711865, 0.08333333333333333, 0.35, 0.31746031746031744, 0.1951219512195122, 0.40350877192982454, 0.41509433962264153, 0.8, 0.8125], [0.10416666666666667, 0.05084745762711865, 0.027777777777777776, 0.25, 0.20634920634920634, 0.17073170731707318, 0.3508771929824561, 0.660377358490566, 0.6, 0.8541666666666666], [0.125, 0.05084745762711865, 0.05555555555555555, 0.15, 0.30158730158730157, 0.17073170731707318, 0.19298245614035087, 0.41509433962264153, 0.6, 0.75]] 5 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.27 3 | adv_mnli,bd8ef0,acc_after,gen,39.67 4 | adv_mnli,bd8ef0,acc_before,gen,54.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.26 6 | adv_mnli_mm,bd8ef0,acc_after,gen,28.57 7 | adv_mnli_mm,bd8ef0,acc_before,gen,38.66 8 | adv_qnli,0b7326,acc_drop,gen,0.14 9 | adv_qnli,0b7326,acc_after,gen,54.89 10 | adv_qnli,0b7326,acc_before,gen,63.91 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.06 15 | adv_rte,8cc547,acc_after,gen,55.56 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,0.18 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,60.31 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.21 3 | 
adv_mnli,bd8ef0,acc_after,gen,39.67 4 | adv_mnli,bd8ef0,acc_before,gen,50.41 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.10 6 | adv_mnli_mm,bd8ef0,acc_after,gen,39.50 7 | adv_mnli_mm,bd8ef0,acc_before,gen,43.70 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.09 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,46.48 14 | adv_rte,8cc547,acc_drop,gen,0.04 15 | adv_rte,8cc547,acc_after,gen,55.56 16 | adv_rte,8cc547,acc_before,gen,58.02 17 | adv_sst2,ee8d3b,acc_drop,gen,0.03 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,51.15 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.17 3 | adv_mnli,bd8ef0,acc_after,gen,47.11 4 | adv_mnli,bd8ef0,acc_before,gen,57.02 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.31 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.09 7 | adv_mnli_mm,bd8ef0,acc_before,gen,45.38 8 | adv_qnli,0b7326,acc_drop,gen,0.28 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,69.92 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.10 3 | adv_mnli,bd8ef0,acc_after,gen,45.45 4 | adv_mnli,bd8ef0,acc_before,gen,50.41 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.37 6 | adv_mnli_mm,bd8ef0,acc_after,gen,27.73 7 | adv_mnli_mm,bd8ef0,acc_before,gen,43.70 8 | adv_qnli,0b7326,acc_drop,gen,0.25 9 | adv_qnli,0b7326,acc_after,gen,53.38 10 | adv_qnli,0b7326,acc_before,gen,71.43 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.16 3 | adv_mnli,bd8ef0,acc_after,gen,39.67 4 | adv_mnli,bd8ef0,acc_before,gen,47.11 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.29 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.09 7 | adv_mnli_mm,bd8ef0,acc_before,gen,43.70 8 | adv_qnli,0b7326,acc_drop,gen,0.15 9 | adv_qnli,0b7326,acc_after,gen,57.14 10 | adv_qnli,0b7326,acc_before,gen,66.92 11 | adv_qqp,cdc277,acc_drop,gen,-0.04 12 | adv_qqp,cdc277,acc_after,gen,39.44 13 | adv_qqp,cdc277,acc_before,gen,38.03 14 | adv_rte,8cc547,acc_drop,gen,-0.04 15 | 
adv_rte,8cc547,acc_after,gen,61.73 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,-0.09 18 | adv_sst2,ee8d3b,acc_after,gen,48.09 19 | adv_sst2,ee8d3b,acc_before,gen,44.27 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.32 3 | adv_mnli,bd8ef0,acc_after,gen,35.54 4 | adv_mnli,bd8ef0,acc_before,gen,52.07 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.14 6 | adv_mnli_mm,bd8ef0,acc_after,gen,30.25 7 | adv_mnli_mm,bd8ef0,acc_before,gen,35.29 8 | adv_qnli,0b7326,acc_drop,gen,0.14 9 | adv_qnli,0b7326,acc_after,gen,52.63 10 | adv_qnli,0b7326,acc_before,gen,60.90 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.08 15 | adv_rte,8cc547,acc_after,gen,60.49 16 | adv_rte,8cc547,acc_before,gen,65.43 17 | adv_sst2,ee8d3b,acc_drop,gen,0.18 18 | adv_sst2,ee8d3b,acc_after,gen,51.91 19 | adv_sst2,ee8d3b,acc_before,gen,63.36 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.25 3 | adv_mnli,bd8ef0,acc_after,gen,36.36 4 | adv_mnli,bd8ef0,acc_before,gen,48.76 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.31 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.05 7 | adv_mnli_mm,bd8ef0,acc_before,gen,37.82 8 | adv_qnli,0b7326,acc_drop,gen,0.19 9 | adv_qnli,0b7326,acc_after,gen,52.63 10 | adv_qnli,0b7326,acc_before,gen,64.66 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.08 15 | adv_rte,8cc547,acc_after,gen,59.26 16 | adv_rte,8cc547,acc_before,gen,64.20 17 | adv_sst2,ee8d3b,acc_drop,gen,0.12 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,56.49 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.18 3 | adv_mnli,bd8ef0,acc_after,gen,45.45 4 | adv_mnli,bd8ef0,acc_before,gen,55.37 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.26 6 | adv_mnli_mm,bd8ef0,acc_after,gen,29.41 7 | adv_mnli_mm,bd8ef0,acc_before,gen,39.50 8 | adv_qnli,0b7326,acc_drop,gen,-0.01 9 | adv_qnli,0b7326,acc_after,gen,57.14 10 | adv_qnli,0b7326,acc_before,gen,56.39 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,45.07 14 | adv_rte,8cc547,acc_drop,gen,-0.04 15 | adv_rte,8cc547,acc_after,gen,59.26 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.13 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,58.02 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | 
adv_mnli,bd8ef0,acc_drop,gen,0.17 3 | adv_mnli,bd8ef0,acc_after,gen,45.45 4 | adv_mnli,bd8ef0,acc_before,gen,54.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.22 6 | adv_mnli_mm,bd8ef0,acc_after,gen,33.61 7 | adv_mnli_mm,bd8ef0,acc_before,gen,42.86 8 | adv_qnli,0b7326,acc_drop,gen,0.08 9 | adv_qnli,0b7326,acc_after,gen,55.64 10 | adv_qnli,0b7326,acc_before,gen,60.15 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.06 15 | adv_rte,8cc547,acc_after,gen,55.56 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,-0.06 18 | adv_sst2,ee8d3b,acc_after,gen,58.02 19 | adv_sst2,ee8d3b,acc_before,gen,54.96 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.35 3 | adv_mnli,bd8ef0,acc_after,gen,59.50 4 | adv_mnli,bd8ef0,acc_before,gen,91.74 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.17 6 | adv_mnli_mm,bd8ef0,acc_after,gen,73.11 7 | adv_mnli_mm,bd8ef0,acc_before,gen,88.24 8 | adv_qnli,0b7326,acc_drop,gen,0.19 9 | adv_qnli,0b7326,acc_after,gen,66.17 10 | adv_qnli,0b7326,acc_before,gen,81.20 11 | adv_qqp,cdc277,acc_drop,gen,0.18 12 | adv_qqp,cdc277,acc_after,gen,69.01 13 | adv_qqp,cdc277,acc_before,gen,84.51 14 | adv_rte,8cc547,acc_drop,gen,0.15 15 | adv_rte,8cc547,acc_after,gen,77.78 16 | adv_rte,8cc547,acc_before,gen,91.36 17 | adv_sst2,ee8d3b,acc_drop,gen,0.37 18 | adv_sst2,ee8d3b,acc_after,gen,55.73 19 | adv_sst2,ee8d3b,acc_before,gen,87.79 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,58.68 4 | adv_mnli,bd8ef0,acc_before,gen,77.69 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.18 6 | adv_mnli_mm,bd8ef0,acc_after,gen,70.59 7 | adv_mnli_mm,bd8ef0,acc_before,gen,85.71 8 | adv_qnli,0b7326,acc_drop,gen,0.18 9 | adv_qnli,0b7326,acc_after,gen,58.65 10 | adv_qnli,0b7326,acc_before,gen,71.43 11 | adv_qqp,cdc277,acc_drop,gen,0.23 12 | adv_qqp,cdc277,acc_after,gen,67.61 13 | adv_qqp,cdc277,acc_before,gen,87.32 14 | adv_rte,8cc547,acc_drop,gen,0.21 15 | adv_rte,8cc547,acc_after,gen,72.84 16 | adv_rte,8cc547,acc_before,gen,92.59 17 | adv_sst2,ee8d3b,acc_drop,gen,0.38 18 | adv_sst2,ee8d3b,acc_after,gen,53.44 19 | adv_sst2,ee8d3b,acc_before,gen,86.26 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.40 3 | adv_mnli,bd8ef0,acc_after,gen,52.89 4 | adv_mnli,bd8ef0,acc_before,gen,87.60 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,67.23 7 | adv_mnli_mm,bd8ef0,acc_before,gen,84.03 8 | adv_qnli,0b7326,acc_drop,gen,0.18 9 | adv_qnli,0b7326,acc_after,gen,69.17 10 | adv_qnli,0b7326,acc_before,gen,84.21 11 | adv_qqp,cdc277,acc_drop,gen,0.11 12 | adv_qqp,cdc277,acc_after,gen,76.06 13 | 
adv_qqp,cdc277,acc_before,gen,85.92 14 | adv_rte,8cc547,acc_drop,gen,0.21 15 | adv_rte,8cc547,acc_after,gen,74.07 16 | adv_rte,8cc547,acc_before,gen,93.83 17 | adv_sst2,ee8d3b,acc_drop,gen,0.32 18 | adv_sst2,ee8d3b,acc_after,gen,58.78 19 | adv_sst2,ee8d3b,acc_before,gen,87.02 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.31 3 | adv_mnli,bd8ef0,acc_after,gen,62.81 4 | adv_mnli,bd8ef0,acc_before,gen,90.91 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.19 6 | adv_mnli_mm,bd8ef0,acc_after,gen,71.43 7 | adv_mnli_mm,bd8ef0,acc_before,gen,88.24 8 | adv_qnli,0b7326,acc_drop,gen,0.19 9 | adv_qnli,0b7326,acc_after,gen,68.42 10 | adv_qnli,0b7326,acc_before,gen,84.21 11 | adv_qqp,cdc277,acc_drop,gen,0.21 12 | adv_qqp,cdc277,acc_after,gen,76.06 13 | adv_qqp,cdc277,acc_before,gen,95.77 14 | adv_rte,8cc547,acc_drop,gen,0.12 15 | adv_rte,8cc547,acc_after,gen,82.72 16 | adv_rte,8cc547,acc_before,gen,93.83 17 | adv_sst2,ee8d3b,acc_drop,gen,0.32 18 | adv_sst2,ee8d3b,acc_after,gen,60.31 19 | adv_sst2,ee8d3b,acc_before,gen,89.31 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.27 3 | adv_mnli,bd8ef0,acc_after,gen,61.98 4 | adv_mnli,bd8ef0,acc_before,gen,85.12 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,73.11 7 | adv_mnli_mm,bd8ef0,acc_before,gen,86.55 8 | adv_qnli,0b7326,acc_drop,gen,0.18 9 | adv_qnli,0b7326,acc_after,gen,63.91 10 | adv_qnli,0b7326,acc_before,gen,78.20 11 | adv_qqp,cdc277,acc_drop,gen,0.17 12 | adv_qqp,cdc277,acc_after,gen,70.42 13 | adv_qqp,cdc277,acc_before,gen,84.51 14 | adv_rte,8cc547,acc_drop,gen,0.09 15 | adv_rte,8cc547,acc_after,gen,79.01 16 | adv_rte,8cc547,acc_before,gen,86.42 17 | adv_sst2,ee8d3b,acc_drop,gen,0.29 18 | adv_sst2,ee8d3b,acc_after,gen,61.07 19 | adv_sst2,ee8d3b,acc_before,gen,85.50 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.33 3 | adv_mnli,bd8ef0,acc_after,gen,61.98 4 | adv_mnli,bd8ef0,acc_before,gen,92.56 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,71.43 7 | adv_mnli_mm,bd8ef0,acc_before,gen,89.08 8 | adv_qnli,0b7326,acc_drop,gen,0.13 9 | adv_qnli,0b7326,acc_after,gen,69.17 10 | adv_qnli,0b7326,acc_before,gen,79.70 11 | adv_qqp,cdc277,acc_drop,gen,0.12 12 | adv_qqp,cdc277,acc_after,gen,71.83 13 | adv_qqp,cdc277,acc_before,gen,81.69 14 | adv_rte,8cc547,acc_drop,gen,0.16 15 | adv_rte,8cc547,acc_after,gen,70.37 16 | adv_rte,8cc547,acc_before,gen,83.95 17 | adv_sst2,ee8d3b,acc_drop,gen,0.26 18 | adv_sst2,ee8d3b,acc_after,gen,62.60 19 | adv_sst2,ee8d3b,acc_before,gen,84.73 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w4a8.csv: 
-------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.32 3 | adv_mnli,bd8ef0,acc_after,gen,61.16 4 | adv_mnli,bd8ef0,acc_before,gen,90.08 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.19 6 | adv_mnli_mm,bd8ef0,acc_after,gen,73.95 7 | adv_mnli_mm,bd8ef0,acc_before,gen,90.76 8 | adv_qnli,0b7326,acc_drop,gen,0.18 9 | adv_qnli,0b7326,acc_after,gen,66.17 10 | adv_qnli,0b7326,acc_before,gen,80.45 11 | adv_qqp,cdc277,acc_drop,gen,0.21 12 | adv_qqp,cdc277,acc_after,gen,69.01 13 | adv_qqp,cdc277,acc_before,gen,87.32 14 | adv_rte,8cc547,acc_drop,gen,0.19 15 | adv_rte,8cc547,acc_after,gen,70.37 16 | adv_rte,8cc547,acc_before,gen,86.42 17 | adv_sst2,ee8d3b,acc_drop,gen,0.22 18 | adv_sst2,ee8d3b,acc_after,gen,64.12 19 | adv_sst2,ee8d3b,acc_before,gen,82.44 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.33 3 | adv_mnli,bd8ef0,acc_after,gen,61.16 4 | adv_mnli,bd8ef0,acc_before,gen,91.74 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,70.59 7 | adv_mnli_mm,bd8ef0,acc_before,gen,88.24 8 | adv_qnli,0b7326,acc_drop,gen,0.15 9 | adv_qnli,0b7326,acc_after,gen,69.92 10 | adv_qnli,0b7326,acc_before,gen,81.95 11 | adv_qqp,cdc277,acc_drop,gen,0.07 12 | adv_qqp,cdc277,acc_after,gen,74.65 13 | adv_qqp,cdc277,acc_before,gen,80.28 14 | adv_rte,8cc547,acc_drop,gen,0.14 15 | adv_rte,8cc547,acc_after,gen,77.78 16 | adv_rte,8cc547,acc_before,gen,90.12 17 | adv_sst2,ee8d3b,acc_drop,gen,0.36 18 | adv_sst2,ee8d3b,acc_after,gen,57.25 19 | adv_sst2,ee8d3b,acc_before,gen,89.31 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.37 3 | adv_mnli,bd8ef0,acc_after,gen,57.85 4 | adv_mnli,bd8ef0,acc_before,gen,91.74 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.18 6 | adv_mnli_mm,bd8ef0,acc_after,gen,72.27 7 | adv_mnli_mm,bd8ef0,acc_before,gen,88.24 8 | adv_qnli,0b7326,acc_drop,gen,0.17 9 | adv_qnli,0b7326,acc_after,gen,66.92 10 | adv_qnli,0b7326,acc_before,gen,80.45 11 | adv_qqp,cdc277,acc_drop,gen,0.21 12 | adv_qqp,cdc277,acc_after,gen,67.61 13 | adv_qqp,cdc277,acc_before,gen,85.92 14 | adv_rte,8cc547,acc_drop,gen,0.10 15 | adv_rte,8cc547,acc_after,gen,79.01 16 | adv_rte,8cc547,acc_before,gen,87.65 17 | adv_sst2,ee8d3b,acc_drop,gen,0.34 18 | adv_sst2,ee8d3b,acc_after,gen,57.25 19 | adv_sst2,ee8d3b,acc_before,gen,87.02 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,43.80 4 | adv_mnli,bd8ef0,acc_before,gen,54.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.17 6 | adv_mnli_mm,bd8ef0,acc_after,gen,42.02 7 | adv_mnli_mm,bd8ef0,acc_before,gen,50.42 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | 
adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.02 3 | adv_mnli,bd8ef0,acc_after,gen,44.63 4 | adv_mnli,bd8ef0,acc_before,gen,45.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.03 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,47.37 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.08 3 | adv_mnli,bd8ef0,acc_after,gen,48.76 4 | adv_mnli,bd8ef0,acc_before,gen,52.89 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.07 6 | adv_mnli_mm,bd8ef0,acc_after,gen,43.70 7 | adv_mnli_mm,bd8ef0,acc_before,gen,47.06 8 | adv_qnli,0b7326,acc_drop,gen,0.01 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,50.38 11 | adv_qqp,cdc277,acc_drop,gen,0.26 12 | adv_qqp,cdc277,acc_after,gen,49.30 13 | adv_qqp,cdc277,acc_before,gen,66.20 14 | adv_rte,8cc547,acc_drop,gen,0.06 15 | adv_rte,8cc547,acc_after,gen,60.49 16 | adv_rte,8cc547,acc_before,gen,64.20 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,43.80 4 | adv_mnli,bd8ef0,acc_before,gen,54.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,43.70 7 | adv_mnli_mm,bd8ef0,acc_before,gen,52.10 8 | adv_qnli,0b7326,acc_drop,gen,0.10 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,54.89 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | 
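Note on the adv_glue asset schema: each CSV stores results in long format, one row per (dataset, metric) pair, with the model's score in the last column. Throughout these assets, acc_drop is consistent (up to two-decimal rounding) with the relative degradation 1 - acc_after / acc_before, where acc_before and acc_after are presumably the accuracies (%) on the benign and adversarially perturbed GLUE inputs; a negative acc_drop means accuracy was higher on the adversarial set. A minimal sketch that recomputes the drop from one of these files; check_acc_drop is a hypothetical helper name, not a function in this repo:

import pandas as pd

def check_acc_drop(csv_path: str, model_col: str) -> pd.DataFrame:
    # Long format -> one row per dataset with acc_before/acc_after/acc_drop columns.
    df = pd.read_csv(csv_path)
    wide = df.pivot(index="dataset", columns="metric", values=model_col)
    # The stored acc_drop is the relative drop, rounded to two decimals.
    wide["acc_drop_recomputed"] = (1.0 - wide["acc_after"] / wide["acc_before"]).round(2)
    return wide[["acc_before", "acc_after", "acc_drop", "acc_drop_recomputed"]]

# e.g. check_acc_drop("hf_chatglm3_6b/fp16.csv", "chatglm3-6b") reproduces
# adv_mnli: acc_before=54.55, acc_after=39.67 -> acc_drop=0.27.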
-------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.06 3 | adv_mnli,bd8ef0,acc_after,gen,41.32 4 | adv_mnli,bd8ef0,acc_before,gen,38.84 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.09 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.93 7 | adv_mnli_mm,bd8ef0,acc_before,gen,29.41 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,47.37 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,57.75 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,59.26 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.11 3 | adv_mnli,bd8ef0,acc_after,gen,47.93 4 | adv_mnli,bd8ef0,acc_before,gen,53.72 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.06 6 | adv_mnli_mm,bd8ef0,acc_after,gen,55.46 7 | adv_mnli_mm,bd8ef0,acc_before,gen,58.82 8 | adv_qnli,0b7326,acc_drop,gen,0.17 9 | adv_qnli,0b7326,acc_after,gen,48.12 10 | adv_qnli,0b7326,acc_before,gen,57.89 11 | adv_qqp,cdc277,acc_drop,gen,-0.08 12 | adv_qqp,cdc277,acc_after,gen,59.15 13 | adv_qqp,cdc277,acc_before,gen,54.93 14 | adv_rte,8cc547,acc_drop,gen,-0.02 15 | adv_rte,8cc547,acc_after,gen,72.84 16 | adv_rte,8cc547,acc_before,gen,71.60 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.09 3 | adv_mnli,bd8ef0,acc_after,gen,48.76 4 | adv_mnli,bd8ef0,acc_before,gen,53.72 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.06 6 | adv_mnli_mm,bd8ef0,acc_after,gen,52.10 7 | adv_mnli_mm,bd8ef0,acc_before,gen,55.46 8 | adv_qnli,0b7326,acc_drop,gen,0.22 9 | adv_qnli,0b7326,acc_after,gen,48.12 10 | adv_qnli,0b7326,acc_before,gen,61.65 11 | adv_qqp,cdc277,acc_drop,gen,0.05 12 | adv_qqp,cdc277,acc_after,gen,52.11 13 | adv_qqp,cdc277,acc_before,gen,54.93 14 | adv_rte,8cc547,acc_drop,gen,-0.09 15 | adv_rte,8cc547,acc_after,gen,76.54 16 | adv_rte,8cc547,acc_before,gen,70.37 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,44.63 4 | adv_mnli,bd8ef0,acc_before,gen,57.02 5 | 
adv_mnli_mm,bd8ef0,acc_drop,gen,0.12 6 | adv_mnli_mm,bd8ef0,acc_after,gen,44.54 7 | adv_mnli_mm,bd8ef0,acc_before,gen,50.42 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,43.80 4 | adv_mnli,bd8ef0,acc_before,gen,56.20 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.07 6 | adv_mnli_mm,bd8ef0,acc_after,gen,42.02 7 | adv_mnli_mm,bd8ef0,acc_before,gen,45.38 8 | adv_qnli,0b7326,acc_drop,gen,0.12 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,57.14 11 | adv_qqp,cdc277,acc_drop,gen,-0.11 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,-0.02 15 | adv_rte,8cc547,acc_after,gen,59.26 16 | adv_rte,8cc547,acc_before,gen,58.02 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,33.06 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.89 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,-0.07 9 | adv_qnli,0b7326,acc_after,gen,48.87 10 | adv_qnli,0b7326,acc_before,gen,45.86 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,-0.05 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,46.91 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,27.27 4 | adv_mnli,bd8ef0,acc_before,gen,33.88 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.30 6 | adv_mnli_mm,bd8ef0,acc_after,gen,29.41 7 | adv_mnli_mm,bd8ef0,acc_before,gen,22.69 8 | adv_qnli,0b7326,acc_drop,gen,-0.10 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,45.11 11 | adv_qqp,cdc277,acc_drop,gen,0.14 12 | adv_qqp,cdc277,acc_after,gen,43.66 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.09 15 | adv_rte,8cc547,acc_after,gen,39.51 16 
| adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.02 3 | adv_mnli,bd8ef0,acc_after,gen,33.88 4 | adv_mnli,bd8ef0,acc_before,gen,33.06 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.05 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.09 8 | adv_qnli,0b7326,acc_drop,gen,0.01 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,51.88 11 | adv_qqp,cdc277,acc_drop,gen,0.05 12 | adv_qqp,cdc277,acc_after,gen,54.93 13 | adv_qqp,cdc277,acc_before,gen,57.75 14 | adv_rte,8cc547,acc_drop,gen,-0.07 15 | adv_rte,8cc547,acc_after,gen,54.32 16 | adv_rte,8cc547,acc_before,gen,50.62 17 | adv_sst2,ee8d3b,acc_drop,gen,-0.03 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,48.85 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.05 3 | adv_mnli,bd8ef0,acc_after,gen,32.23 4 | adv_mnli,bd8ef0,acc_before,gen,30.58 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.89 7 | adv_mnli_mm,bd8ef0,acc_before,gen,30.25 8 | adv_qnli,0b7326,acc_drop,gen,-0.08 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,46.62 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,-0.05 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,46.91 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,32.23 4 | adv_mnli,bd8ef0,acc_before,gen,41.32 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.03 6 | adv_mnli_mm,bd8ef0,acc_after,gen,28.57 7 | adv_mnli_mm,bd8ef0,acc_before,gen,27.73 8 | adv_qnli,0b7326,acc_drop,gen,-0.06 9 | adv_qnli,0b7326,acc_after,gen,42.86 10 | adv_qnli,0b7326,acc_before,gen,40.60 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,46.91 16 | adv_rte,8cc547,acc_before,gen,46.91 17 | adv_sst2,ee8d3b,acc_drop,gen,0.04 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,52.67 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w4.csv: -------------------------------------------------------------------------------- 1 | 
dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.03 3 | adv_mnli,bd8ef0,acc_after,gen,31.40 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.04 6 | adv_mnli_mm,bd8ef0,acc_after,gen,22.69 7 | adv_mnli_mm,bd8ef0,acc_before,gen,23.53 8 | adv_qnli,0b7326,acc_drop,gen,-0.05 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,48.12 11 | adv_qqp,cdc277,acc_drop,gen,-0.03 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,46.48 14 | adv_rte,8cc547,acc_drop,gen,-0.10 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,51.85 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.03 3 | adv_mnli,bd8ef0,acc_after,gen,31.40 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.07 6 | adv_mnli_mm,bd8ef0,acc_after,gen,22.69 7 | adv_mnli_mm,bd8ef0,acc_before,gen,24.37 8 | adv_qnli,0b7326,acc_drop,gen,-0.05 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,48.12 11 | adv_qqp,cdc277,acc_drop,gen,0.09 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.05 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,51.85 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,33.06 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.89 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,-0.06 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,46.62 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,-0.05 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,46.91 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,31.40 4 | adv_mnli,bd8ef0,acc_before,gen,30.58 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,28.57 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,-0.08 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | 
adv_qqp,cdc277,acc_drop,gen,-0.06 12 | adv_qqp,cdc277,acc_after,gen,50.70 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,-0.11 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,44.44 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.16 3 | adv_mnli,bd8ef0,acc_after,gen,38.02 4 | adv_mnli,bd8ef0,acc_before,gen,45.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.09 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,36.13 8 | adv_qnli,0b7326,acc_drop,gen,0.09 9 | adv_qnli,0b7326,acc_after,gen,56.39 10 | adv_qnli,0b7326,acc_before,gen,61.65 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,62.96 16 | adv_rte,8cc547,acc_before,gen,62.96 17 | adv_sst2,ee8d3b,acc_drop,gen,0.26 18 | adv_sst2,ee8d3b,acc_after,gen,45.80 19 | adv_sst2,ee8d3b,acc_before,gen,61.83 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,28.93 4 | adv_mnli,bd8ef0,acc_before,gen,38.02 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.05 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,63.91 10 | adv_qnli,0b7326,acc_before,gen,68.42 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,45.07 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,58.02 16 | adv_rte,8cc547,acc_before,gen,58.02 17 | adv_sst2,ee8d3b,acc_drop,gen,0.06 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,53.44 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.33 3 | adv_mnli,bd8ef0,acc_after,gen,41.32 4 | adv_mnli,bd8ef0,acc_before,gen,61.98 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.25 6 | adv_mnli_mm,bd8ef0,acc_after,gen,39.50 7 | adv_mnli_mm,bd8ef0,acc_before,gen,52.94 8 | adv_qnli,0b7326,acc_drop,gen,0.09 9 | adv_qnli,0b7326,acc_after,gen,60.90 10 | adv_qnli,0b7326,acc_before,gen,66.92 11 | adv_qqp,cdc277,acc_drop,gen,-0.13 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.02 15 | adv_rte,8cc547,acc_after,gen,60.49 16 | adv_rte,8cc547,acc_before,gen,61.73 17 | adv_sst2,ee8d3b,acc_drop,gen,0.28 18 | adv_sst2,ee8d3b,acc_after,gen,45.04 19 | adv_sst2,ee8d3b,acc_before,gen,62.60 20 | -------------------------------------------------------------------------------- 
/qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.16 3 | adv_mnli,bd8ef0,acc_after,gen,38.02 4 | adv_mnli,bd8ef0,acc_before,gen,45.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.07 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.93 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.03 9 | adv_qnli,0b7326,acc_after,gen,57.89 10 | adv_qnli,0b7326,acc_before,gen,59.40 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,62.96 16 | adv_rte,8cc547,acc_before,gen,62.96 17 | adv_sst2,ee8d3b,acc_drop,gen,0.18 18 | adv_sst2,ee8d3b,acc_after,gen,45.04 19 | adv_sst2,ee8d3b,acc_before,gen,54.96 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,39.67 4 | adv_mnli,bd8ef0,acc_before,gen,49.59 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.08 6 | adv_mnli_mm,bd8ef0,acc_after,gen,50.42 7 | adv_mnli_mm,bd8ef0,acc_before,gen,54.62 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,60.15 10 | adv_qnli,0b7326,acc_before,gen,67.67 11 | adv_qqp,cdc277,acc_drop,gen,-0.26 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,38.03 14 | adv_rte,8cc547,acc_drop,gen,0.12 15 | adv_rte,8cc547,acc_after,gen,46.91 16 | adv_rte,8cc547,acc_before,gen,53.09 17 | adv_sst2,ee8d3b,acc_drop,gen,0.02 18 | adv_sst2,ee8d3b,acc_after,gen,47.33 19 | adv_sst2,ee8d3b,acc_before,gen,48.09 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.08 3 | adv_mnli,bd8ef0,acc_after,gen,29.75 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.05 6 | adv_mnli_mm,bd8ef0,acc_after,gen,36.97 7 | adv_mnli_mm,bd8ef0,acc_before,gen,35.29 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,54.14 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,43.66 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,41.98 16 | adv_rte,8cc547,acc_before,gen,41.98 17 | adv_sst2,ee8d3b,acc_drop,gen,0.10 18 | adv_sst2,ee8d3b,acc_after,gen,48.85 19 | adv_sst2,ee8d3b,acc_before,gen,54.20 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,32.23 4 | adv_mnli,bd8ef0,acc_before,gen,31.40 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.00 6 | adv_mnli_mm,bd8ef0,acc_after,gen,40.34 7 | adv_mnli_mm,bd8ef0,acc_before,gen,40.34 8 | 
adv_qnli,0b7326,acc_drop,gen,0.05 9 | adv_qnli,0b7326,acc_after,gen,53.38 10 | adv_qnli,0b7326,acc_before,gen,56.39 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,0.03 15 | adv_rte,8cc547,acc_after,gen,40.74 16 | adv_rte,8cc547,acc_before,gen,41.98 17 | adv_sst2,ee8d3b,acc_drop,gen,0.12 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,56.49 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.15 3 | adv_mnli,bd8ef0,acc_after,gen,38.84 4 | adv_mnli,bd8ef0,acc_before,gen,45.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.14 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.09 7 | adv_mnli_mm,bd8ef0,acc_before,gen,36.13 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,58.65 10 | adv_qnli,0b7326,acc_before,gen,58.65 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.06 15 | adv_rte,8cc547,acc_after,gen,62.96 16 | adv_rte,8cc547,acc_before,gen,66.67 17 | adv_sst2,ee8d3b,acc_drop,gen,0.11 18 | adv_sst2,ee8d3b,acc_after,gen,48.09 19 | adv_sst2,ee8d3b,acc_before,gen,54.20 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.29 3 | adv_mnli,bd8ef0,acc_after,gen,37.19 4 | adv_mnli,bd8ef0,acc_before,gen,52.07 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.35 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.05 7 | adv_mnli_mm,bd8ef0,acc_before,gen,40.34 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,57.14 10 | adv_qnli,0b7326,acc_before,gen,61.65 11 | adv_qqp,cdc277,acc_drop,gen,-0.23 12 | adv_qqp,cdc277,acc_after,gen,52.11 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.04 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,0.15 18 | adv_sst2,ee8d3b,acc_after,gen,52.67 19 | adv_sst2,ee8d3b,acc_before,gen,61.83 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.25 3 | adv_mnli,bd8ef0,acc_after,gen,48.76 4 | adv_mnli,bd8ef0,acc_before,gen,65.29 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.23 6 | adv_mnli_mm,bd8ef0,acc_after,gen,52.10 7 | adv_mnli_mm,bd8ef0,acc_before,gen,68.07 8 | adv_qnli,0b7326,acc_drop,gen,0.10 9 | adv_qnli,0b7326,acc_after,gen,63.91 10 | adv_qnli,0b7326,acc_before,gen,70.68 11 | adv_qqp,cdc277,acc_drop,gen,-0.32 12 | adv_qqp,cdc277,acc_after,gen,46.48 13 | adv_qqp,cdc277,acc_before,gen,35.21 14 | adv_rte,8cc547,acc_drop,gen,0.19 15 | adv_rte,8cc547,acc_after,gen,64.20 16 | adv_rte,8cc547,acc_before,gen,79.01 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | 
adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.77 3 | adv_mnli,bd8ef0,acc_after,gen,19.01 4 | adv_mnli,bd8ef0,acc_before,gen,10.74 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.17 6 | adv_mnli_mm,bd8ef0,acc_after,gen,15.97 7 | adv_mnli_mm,bd8ef0,acc_before,gen,19.33 8 | adv_qnli,0b7326,acc_drop,gen,-0.05 9 | adv_qnli,0b7326,acc_after,gen,33.08 10 | adv_qnli,0b7326,acc_before,gen,31.58 11 | adv_qqp,cdc277,acc_drop,gen,-1.77 12 | adv_qqp,cdc277,acc_after,gen,50.70 13 | adv_qqp,cdc277,acc_before,gen,18.31 14 | adv_rte,8cc547,acc_drop,gen,-0.75 15 | adv_rte,8cc547,acc_after,gen,17.28 16 | adv_rte,8cc547,acc_before,gen,9.88 17 | adv_sst2,ee8d3b,acc_drop,gen,0.18 18 | adv_sst2,ee8d3b,acc_after,gen,28.24 19 | adv_sst2,ee8d3b,acc_before,gen,34.35 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.00 3 | adv_mnli,bd8ef0,acc_after,gen,14.88 4 | adv_mnli,bd8ef0,acc_before,gen,14.88 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.47 6 | adv_mnli_mm,bd8ef0,acc_after,gen,14.29 7 | adv_mnli_mm,bd8ef0,acc_before,gen,26.89 8 | adv_qnli,0b7326,acc_drop,gen,0.20 9 | adv_qnli,0b7326,acc_after,gen,29.32 10 | adv_qnli,0b7326,acc_before,gen,36.84 11 | adv_qqp,cdc277,acc_drop,gen,0.33 12 | adv_qqp,cdc277,acc_after,gen,33.80 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.37 15 | adv_rte,8cc547,acc_after,gen,14.81 16 | adv_rte,8cc547,acc_before,gen,23.46 17 | adv_sst2,ee8d3b,acc_drop,gen,0.38 18 | adv_sst2,ee8d3b,acc_after,gen,19.08 19 | adv_sst2,ee8d3b,acc_before,gen,30.53 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.13 3 | adv_mnli,bd8ef0,acc_after,gen,14.05 4 | adv_mnli,bd8ef0,acc_before,gen,12.40 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.13 6 | adv_mnli_mm,bd8ef0,acc_after,gen,23.53 7 | adv_mnli_mm,bd8ef0,acc_before,gen,26.89 8 | adv_qnli,0b7326,acc_drop,gen,-0.14 9 | adv_qnli,0b7326,acc_after,gen,30.08 10 | adv_qnli,0b7326,acc_before,gen,26.32 11 | adv_qqp,cdc277,acc_drop,gen,0.42 12 | adv_qqp,cdc277,acc_after,gen,29.58 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,-0.25 15 | adv_rte,8cc547,acc_after,gen,12.35 16 | adv_rte,8cc547,acc_before,gen,9.88 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,25.95 19 | adv_sst2,ee8d3b,acc_before,gen,25.95 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,45.45 4 | adv_mnli,bd8ef0,acc_before,gen,59.50 5 | 
adv_mnli_mm,bd8ef0,acc_drop,gen,0.23 6 | adv_mnli_mm,bd8ef0,acc_after,gen,47.90 7 | adv_mnli_mm,bd8ef0,acc_before,gen,62.18 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,54.14 10 | adv_qnli,0b7326,acc_before,gen,60.90 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,50.70 13 | adv_qqp,cdc277,acc_before,gen,52.11 14 | adv_rte,8cc547,acc_drop,gen,0.13 15 | adv_rte,8cc547,acc_after,gen,72.84 16 | adv_rte,8cc547,acc_before,gen,83.95 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.15 3 | adv_mnli,bd8ef0,acc_after,gen,50.41 4 | adv_mnli,bd8ef0,acc_before,gen,59.50 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.24 6 | adv_mnli_mm,bd8ef0,acc_after,gen,51.26 7 | adv_mnli_mm,bd8ef0,acc_before,gen,67.23 8 | adv_qnli,0b7326,acc_drop,gen,0.10 9 | adv_qnli,0b7326,acc_after,gen,62.41 10 | adv_qnli,0b7326,acc_before,gen,69.17 11 | adv_qqp,cdc277,acc_drop,gen,-0.13 12 | adv_qqp,cdc277,acc_after,gen,49.30 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.14 15 | adv_rte,8cc547,acc_after,gen,67.90 16 | adv_rte,8cc547,acc_before,gen,79.01 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.16 3 | adv_mnli,bd8ef0,acc_after,gen,50.41 4 | adv_mnli,bd8ef0,acc_before,gen,60.33 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.27 6 | adv_mnli_mm,bd8ef0,acc_after,gen,46.22 7 | adv_mnli_mm,bd8ef0,acc_before,gen,63.03 8 | adv_qnli,0b7326,acc_drop,gen,0.02 9 | adv_qnli,0b7326,acc_after,gen,64.66 10 | adv_qnli,0b7326,acc_before,gen,66.17 11 | adv_qqp,cdc277,acc_drop,gen,-0.19 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,38.03 14 | adv_rte,8cc547,acc_drop,gen,0.18 15 | adv_rte,8cc547,acc_after,gen,67.90 16 | adv_rte,8cc547,acc_before,gen,82.72 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.21 3 | adv_mnli,bd8ef0,acc_after,gen,47.93 4 | adv_mnli,bd8ef0,acc_before,gen,60.33 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.23 6 | adv_mnli_mm,bd8ef0,acc_after,gen,52.10 7 | adv_mnli_mm,bd8ef0,acc_before,gen,68.07 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,63.16 10 | adv_qnli,0b7326,acc_before,gen,70.68 11 | adv_qqp,cdc277,acc_drop,gen,-0.15 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,36.62 14 | adv_rte,8cc547,acc_drop,gen,0.13 15 | adv_rte,8cc547,acc_after,gen,64.20 16 | 
adv_rte,8cc547,acc_before,gen,74.07 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,48.76 4 | adv_mnli,bd8ef0,acc_before,gen,62.81 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.22 6 | adv_mnli_mm,bd8ef0,acc_after,gen,51.26 7 | adv_mnli_mm,bd8ef0,acc_before,gen,65.55 8 | adv_qnli,0b7326,acc_drop,gen,0.09 9 | adv_qnli,0b7326,acc_after,gen,64.66 10 | adv_qnli,0b7326,acc_before,gen,70.68 11 | adv_qqp,cdc277,acc_drop,gen,-0.10 12 | adv_qqp,cdc277,acc_after,gen,46.48 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,64.20 16 | adv_rte,8cc547,acc_before,gen,69.14 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.06 3 | adv_mnli,bd8ef0,acc_after,gen,23.97 4 | adv_mnli,bd8ef0,acc_before,gen,25.62 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.05 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,52.63 10 | adv_qnli,0b7326,acc_before,gen,56.39 11 | adv_qqp,cdc277,acc_drop,gen,0.11 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,51.85 16 | adv_rte,8cc547,acc_before,gen,55.56 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,26.45 4 | adv_mnli,bd8ef0,acc_before,gen,34.71 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.04 6 | adv_mnli_mm,bd8ef0,acc_after,gen,36.97 7 | adv_mnli_mm,bd8ef0,acc_before,gen,38.66 8 | adv_qnli,0b7326,acc_drop,gen,-0.02 9 | adv_qnli,0b7326,acc_after,gen,48.12 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.13 12 | adv_qqp,cdc277,acc_after,gen,46.48 13 | adv_qqp,cdc277,acc_before,gen,53.52 14 | adv_rte,8cc547,acc_drop,gen,0.25 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,65.43 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | 
adv_mnli,bd8ef0,acc_drop,gen,0.06 3 | adv_mnli,bd8ef0,acc_after,gen,24.79 4 | adv_mnli,bd8ef0,acc_before,gen,26.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.00 6 | adv_mnli_mm,bd8ef0,acc_after,gen,33.61 7 | adv_mnli_mm,bd8ef0,acc_before,gen,33.61 8 | adv_qnli,0b7326,acc_drop,gen,0.04 9 | adv_qnli,0b7326,acc_after,gen,55.64 10 | adv_qnli,0b7326,acc_before,gen,57.89 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,-0.19 15 | adv_rte,8cc547,acc_after,gen,54.32 16 | adv_rte,8cc547,acc_before,gen,45.68 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.06 3 | adv_mnli,bd8ef0,acc_after,gen,23.97 4 | adv_mnli,bd8ef0,acc_before,gen,25.62 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.00 6 | adv_mnli_mm,bd8ef0,acc_after,gen,34.45 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,51.88 10 | adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,0.16 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,53.52 14 | adv_rte,8cc547,acc_drop,gen,0.02 15 | adv_rte,8cc547,acc_after,gen,51.85 16 | adv_rte,8cc547,acc_before,gen,53.09 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.10 3 | adv_mnli,bd8ef0,acc_after,gen,36.36 4 | adv_mnli,bd8ef0,acc_before,gen,40.50 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,40.34 7 | adv_mnli_mm,bd8ef0,acc_before,gen,45.38 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,46.62 10 | adv_qnli,0b7326,acc_before,gen,52.63 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,59.15 13 | adv_qqp,cdc277,acc_before,gen,57.75 14 | adv_rte,8cc547,acc_drop,gen,-0.04 15 | adv_rte,8cc547,acc_after,gen,60.49 16 | adv_rte,8cc547,acc_before,gen,58.02 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.00 3 | adv_mnli,bd8ef0,acc_after,gen,23.97 4 | adv_mnli,bd8ef0,acc_before,gen,23.97 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,35.29 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,47.37 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.16 12 | adv_qqp,cdc277,acc_after,gen,36.62 13 | 
adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,50.62 16 | adv_rte,8cc547,acc_before,gen,54.32 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,27.27 4 | adv_mnli,bd8ef0,acc_before,gen,26.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.08 6 | adv_mnli_mm,bd8ef0,acc_after,gen,35.29 7 | adv_mnli_mm,bd8ef0,acc_before,gen,32.77 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,47.37 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,40.85 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.09 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,54.32 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.18 3 | adv_mnli,bd8ef0,acc_after,gen,23.14 4 | adv_mnli,bd8ef0,acc_before,gen,28.10 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.05 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,52.63 10 | adv_qnli,0b7326,acc_before,gen,56.39 11 | adv_qqp,cdc277,acc_drop,gen,0.11 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,51.85 16 | adv_rte,8cc547,acc_before,gen,55.56 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.12 3 | adv_mnli,bd8ef0,acc_after,gen,24.79 4 | adv_mnli,bd8ef0,acc_before,gen,28.10 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.02 6 | adv_mnli_mm,bd8ef0,acc_after,gen,33.61 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,51.88 10 | adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,-0.06 12 | adv_qqp,cdc277,acc_after,gen,52.11 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.05 15 | adv_rte,8cc547,acc_after,gen,64.20 16 | adv_rte,8cc547,acc_before,gen,67.90 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/fp16.csv: 
-------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,57.02 4 | adv_mnli,bd8ef0,acc_before,gen,72.73 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,64.71 7 | adv_mnli_mm,bd8ef0,acc_before,gen,80.67 8 | adv_qnli,0b7326,acc_drop,gen,0.06 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.06 15 | adv_rte,8cc547,acc_after,gen,45.68 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.29 3 | adv_mnli,bd8ef0,acc_after,gen,53.72 4 | adv_mnli,bd8ef0,acc_before,gen,76.03 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.15 6 | adv_mnli_mm,bd8ef0,acc_after,gen,63.87 7 | adv_mnli_mm,bd8ef0,acc_before,gen,74.79 8 | adv_qnli,0b7326,acc_drop,gen,0.05 9 | adv_qnli,0b7326,acc_after,gen,58.65 10 | adv_qnli,0b7326,acc_before,gen,61.65 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.03 15 | adv_rte,8cc547,acc_after,gen,41.98 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,54.55 4 | adv_mnli,bd8ef0,acc_before,gen,67.77 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,62.18 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.15 8 | adv_qnli,0b7326,acc_drop,gen,0.08 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.03 15 | adv_rte,8cc547,acc_after,gen,41.98 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,57.02 4 | adv_mnli,bd8ef0,acc_before,gen,72.73 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,64.71 7 | adv_mnli_mm,bd8ef0,acc_before,gen,80.67 8 | adv_qnli,0b7326,acc_drop,gen,0.06 9 | 
adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.03 15 | adv_rte,8cc547,acc_after,gen,45.68 16 | adv_rte,8cc547,acc_before,gen,44.44 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,52.89 4 | adv_mnli,bd8ef0,acc_before,gen,67.77 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.19 6 | adv_mnli_mm,bd8ef0,acc_after,gen,56.30 7 | adv_mnli_mm,bd8ef0,acc_before,gen,69.75 8 | adv_qnli,0b7326,acc_drop,gen,0.12 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,57.14 11 | adv_qqp,cdc277,acc_drop,gen,-0.05 12 | adv_qqp,cdc277,acc_after,gen,60.56 13 | adv_qqp,cdc277,acc_before,gen,57.75 14 | adv_rte,8cc547,acc_drop,gen,-0.03 15 | adv_rte,8cc547,acc_after,gen,44.44 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.02 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.21 3 | adv_mnli,bd8ef0,acc_after,gen,57.85 4 | adv_mnli,bd8ef0,acc_before,gen,73.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.15 6 | adv_mnli_mm,bd8ef0,acc_after,gen,64.71 7 | adv_mnli_mm,bd8ef0,acc_before,gen,76.47 8 | adv_qnli,0b7326,acc_drop,gen,0.04 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,53.38 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.16 15 | adv_rte,8cc547,acc_after,gen,45.68 16 | adv_rte,8cc547,acc_before,gen,39.51 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.18 3 | adv_mnli,bd8ef0,acc_after,gen,60.33 4 | adv_mnli,bd8ef0,acc_before,gen,73.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.19 6 | adv_mnli_mm,bd8ef0,acc_after,gen,63.87 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.99 8 | adv_qnli,0b7326,acc_drop,gen,0.04 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,53.38 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.08 15 | adv_rte,8cc547,acc_after,gen,53.09 16 | adv_rte,8cc547,acc_before,gen,49.38 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | 
adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,57.85 4 | adv_mnli,bd8ef0,acc_before,gen,76.03 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.21 6 | adv_mnli_mm,bd8ef0,acc_after,gen,62.18 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.99 8 | adv_qnli,0b7326,acc_drop,gen,0.06 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.06 15 | adv_rte,8cc547,acc_after,gen,45.68 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,57.85 4 | adv_mnli,bd8ef0,acc_before,gen,72.73 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,63.03 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.99 8 | adv_qnli,0b7326,acc_drop,gen,0.03 9 | adv_qnli,0b7326,acc_after,gen,54.14 10 | adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.03 15 | adv_rte,8cc547,acc_after,gen,43.21 16 | adv_rte,8cc547,acc_before,gen,44.44 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,60.33 4 | adv_mnli,bd8ef0,acc_before,gen,79.34 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,74.79 7 | adv_mnli_mm,bd8ef0,acc_before,gen,84.03 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,75.19 10 | adv_qnli,0b7326,acc_before,gen,81.20 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,61.97 13 | adv_qqp,cdc277,acc_before,gen,66.20 14 | adv_rte,8cc547,acc_drop,gen,0.09 15 | adv_rte,8cc547,acc_after,gen,71.60 16 | adv_rte,8cc547,acc_before,gen,79.01 17 | adv_sst2,ee8d3b,acc_drop,gen,0.31 18 | adv_sst2,ee8d3b,acc_after,gen,58.78 19 | adv_sst2,ee8d3b,acc_before,gen,84.73 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | 
adv_mnli,bd8ef0,acc_after,gen,62.81 4 | adv_mnli,bd8ef0,acc_before,gen,78.51 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.10 6 | adv_mnli_mm,bd8ef0,acc_after,gen,68.91 7 | adv_mnli_mm,bd8ef0,acc_before,gen,76.47 8 | adv_qnli,0b7326,acc_drop,gen,0.02 9 | adv_qnli,0b7326,acc_after,gen,71.43 10 | adv_qnli,0b7326,acc_before,gen,72.93 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.08 15 | adv_rte,8cc547,acc_after,gen,66.67 16 | adv_rte,8cc547,acc_before,gen,72.84 17 | adv_sst2,ee8d3b,acc_drop,gen,0.19 18 | adv_sst2,ee8d3b,acc_after,gen,51.15 19 | adv_sst2,ee8d3b,acc_before,gen,63.36 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.14 3 | adv_mnli,bd8ef0,acc_after,gen,61.98 4 | adv_mnli,bd8ef0,acc_before,gen,71.90 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.12 6 | adv_mnli_mm,bd8ef0,acc_after,gen,71.43 7 | adv_mnli_mm,bd8ef0,acc_before,gen,81.51 8 | adv_qnli,0b7326,acc_drop,gen,0.06 9 | adv_qnli,0b7326,acc_after,gen,72.93 10 | adv_qnli,0b7326,acc_before,gen,77.44 11 | adv_qqp,cdc277,acc_drop,gen,0.09 12 | adv_qqp,cdc277,acc_after,gen,59.15 13 | adv_qqp,cdc277,acc_before,gen,64.79 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,71.60 16 | adv_rte,8cc547,acc_before,gen,71.60 17 | adv_sst2,ee8d3b,acc_drop,gen,0.25 18 | adv_sst2,ee8d3b,acc_after,gen,58.78 19 | adv_sst2,ee8d3b,acc_before,gen,78.63 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,61.16 4 | adv_mnli,bd8ef0,acc_before,gen,78.51 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.15 6 | adv_mnli_mm,bd8ef0,acc_after,gen,72.27 7 | adv_mnli_mm,bd8ef0,acc_before,gen,84.87 8 | adv_qnli,0b7326,acc_drop,gen,0.05 9 | adv_qnli,0b7326,acc_after,gen,74.44 10 | adv_qnli,0b7326,acc_before,gen,78.20 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,64.79 13 | adv_qqp,cdc277,acc_before,gen,69.01 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,70.37 16 | adv_rte,8cc547,acc_before,gen,75.31 17 | adv_sst2,ee8d3b,acc_drop,gen,0.31 18 | adv_sst2,ee8d3b,acc_after,gen,58.78 19 | adv_sst2,ee8d3b,acc_before,gen,84.73 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,59.50 4 | adv_mnli,bd8ef0,acc_before,gen,76.03 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.17 6 | adv_mnli_mm,bd8ef0,acc_after,gen,53.78 7 | adv_mnli_mm,bd8ef0,acc_before,gen,64.71 8 | adv_qnli,0b7326,acc_drop,gen,0.08 9 | adv_qnli,0b7326,acc_after,gen,75.94 10 | adv_qnli,0b7326,acc_before,gen,82.71 11 | adv_qqp,cdc277,acc_drop,gen,-0.07 12 | adv_qqp,cdc277,acc_after,gen,60.56 13 | 
adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.13 15 | adv_rte,8cc547,acc_after,gen,76.54 16 | adv_rte,8cc547,acc_before,gen,87.65 17 | adv_sst2,ee8d3b,acc_drop,gen,0.26 18 | adv_sst2,ee8d3b,acc_after,gen,53.44 19 | adv_sst2,ee8d3b,acc_before,gen,71.76 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.21 3 | adv_mnli,bd8ef0,acc_after,gen,63.64 4 | adv_mnli,bd8ef0,acc_before,gen,80.99 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.13 6 | adv_mnli_mm,bd8ef0,acc_after,gen,70.59 7 | adv_mnli_mm,bd8ef0,acc_before,gen,81.51 8 | adv_qnli,0b7326,acc_drop,gen,0.14 9 | adv_qnli,0b7326,acc_after,gen,73.68 10 | adv_qnli,0b7326,acc_before,gen,85.71 11 | adv_qqp,cdc277,acc_drop,gen,0.07 12 | adv_qqp,cdc277,acc_after,gen,60.56 13 | adv_qqp,cdc277,acc_before,gen,64.79 14 | adv_rte,8cc547,acc_drop,gen,0.14 15 | adv_rte,8cc547,acc_after,gen,69.14 16 | adv_rte,8cc547,acc_before,gen,80.25 17 | adv_sst2,ee8d3b,acc_drop,gen,0.25 18 | adv_sst2,ee8d3b,acc_after,gen,59.54 19 | adv_sst2,ee8d3b,acc_before,gen,79.39 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.19 3 | adv_mnli,bd8ef0,acc_after,gen,65.29 4 | adv_mnli,bd8ef0,acc_before,gen,80.99 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.13 6 | adv_mnli_mm,bd8ef0,acc_after,gen,68.91 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.99 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,76.69 10 | adv_qnli,0b7326,acc_before,gen,85.71 11 | adv_qqp,cdc277,acc_drop,gen,0.07 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,61.97 14 | adv_rte,8cc547,acc_drop,gen,0.02 15 | adv_rte,8cc547,acc_after,gen,72.84 16 | adv_rte,8cc547,acc_before,gen,74.07 17 | adv_sst2,ee8d3b,acc_drop,gen,0.23 18 | adv_sst2,ee8d3b,acc_after,gen,61.07 19 | adv_sst2,ee8d3b,acc_before,gen,79.39 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,59.50 4 | adv_mnli,bd8ef0,acc_before,gen,78.51 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.12 6 | adv_mnli_mm,bd8ef0,acc_after,gen,72.27 7 | adv_mnli_mm,bd8ef0,acc_before,gen,82.35 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,75.19 10 | adv_qnli,0b7326,acc_before,gen,81.20 11 | adv_qqp,cdc277,acc_drop,gen,0.04 12 | adv_qqp,cdc277,acc_after,gen,61.97 13 | adv_qqp,cdc277,acc_before,gen,64.79 14 | adv_rte,8cc547,acc_drop,gen,0.12 15 | adv_rte,8cc547,acc_after,gen,70.37 16 | adv_rte,8cc547,acc_before,gen,80.25 17 | adv_sst2,ee8d3b,acc_drop,gen,0.30 18 | adv_sst2,ee8d3b,acc_after,gen,56.49 19 | adv_sst2,ee8d3b,acc_before,gen,80.15 20 | -------------------------------------------------------------------------------- 
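Note: the adv_glue asset CSVs in this directory all share one five-column layout (dataset, version, metric, mode, and a score column whose header is the model name), with one acc_drop / acc_after / acc_before block per AdvGLUE subtask. Consistent with the stored values (e.g. llama2-13b-chat kv8, adv_mnli: 1 - 38.02/45.45 = 0.16), acc_drop appears to be the relative accuracy drop 1 - acc_after / acc_before rounded to two decimals, not the absolute difference. A minimal sketch that loads one of these files and re-derives acc_drop; load_adv_glue is a hypothetical helper, not part of the repo:

import csv

def load_adv_glue(path):
    # hypothetical helper: parse one asset CSV into {dataset: {metric: score}};
    # the last column's header varies by model, so rows are unpacked by position
    table = {}
    with open(path, newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for dataset, _version, metric, _mode, score in reader:
            table.setdefault(dataset, {})[metric] = float(score)
    return table

scores = load_adv_glue('assets/hf_llama2_13b_chat/kv8.csv')
for dataset, m in scores.items():
    derived = round(1.0 - m['acc_after'] / m['acc_before'], 2)
    print(dataset, m['acc_drop'], derived)  # the two values should agree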
/qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.18 3 | adv_mnli,bd8ef0,acc_after,gen,65.29 4 | adv_mnli,bd8ef0,acc_before,gen,79.34 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.15 6 | adv_mnli_mm,bd8ef0,acc_after,gen,71.43 7 | adv_mnli_mm,bd8ef0,acc_before,gen,84.03 8 | adv_qnli,0b7326,acc_drop,gen,0.05 9 | adv_qnli,0b7326,acc_after,gen,75.19 10 | adv_qnli,0b7326,acc_before,gen,78.95 11 | adv_qqp,cdc277,acc_drop,gen,0.08 12 | adv_qqp,cdc277,acc_after,gen,63.38 13 | adv_qqp,cdc277,acc_before,gen,69.01 14 | adv_rte,8cc547,acc_drop,gen,0.02 15 | adv_rte,8cc547,acc_after,gen,74.07 16 | adv_rte,8cc547,acc_before,gen,75.31 17 | adv_sst2,ee8d3b,acc_drop,gen,0.27 18 | adv_sst2,ee8d3b,acc_after,gen,56.49 19 | adv_sst2,ee8d3b,acc_before,gen,77.10 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/proc_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | root_path = 'outputs' 5 | out_path = 'assets' 6 | 7 | # create the output folders; tolerate re-runs 8 | os.makedirs(out_path, exist_ok=True) 9 | 10 | for model in os.listdir(root_path): 11 | os.makedirs(os.path.join(out_path, model), exist_ok=True) 12 | for precision in os.listdir(os.path.join(root_path, model)): 13 | # use the first run (timestamp) that produced a 'summary' folder 14 | for timestamp in sorted(os.listdir(os.path.join(root_path, model, precision))): 15 | if os.path.exists(os.path.join(root_path, model, precision, timestamp, 'summary')): 16 | break 17 | # copy the first .csv file found in that summary folder 18 | for file_name in os.listdir(os.path.join(root_path, model, precision, timestamp, 'summary')): 19 | if file_name.endswith('.csv'): 20 | break 21 | src_file = os.path.join(root_path, model, precision, timestamp, 'summary', file_name) 22 | dst_file = os.path.join(out_path, model, f'{precision}.csv') 23 | shutil.copy(src_file, dst_file) 24 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/commonsense/kv_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.5927927927927928, 0.5976833976833977, 0.5812097812097812, 0.5554697554697555] 2 | Mistral_7B = [0.584041184041184, 0.583011583011583, 0.5976833976833977, 0.4800514800514801] 3 | Mixtral_8x7B = [0.7799227799227799, 0.7773487773487774, 0.759073359073359, 0.6800514800514801] 4 | LLaMA2_7B = [0.425997425997426, 0.4252252252252252, 0.43217503217503217, 0.29523809523809524] 5 | LLaMA2_13B = [0.6146718146718146, 0.6164736164736164, 0.6138996138996139, 0.49034749034749037] 6 | LLaMA2_70B = [0.7235521235521235, 0.7235521235521235, 0.7196911196911197, 0.7184041184041184] 7 | Falcon_7B = [0.41132561132561135, 0.41338481338481337, 0.33745173745173745, 0.2494208494208494] 8 | Falcon_40B = [0.007207207207207207, 0.007207207207207207, 0.008494208494208495, 0.01904761904761905] 9 | Falcon_180B = [0.6478764478764478, 0.6483912483912484, 0.631917631917632, 0.5680823680823681] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/commonsense/w_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.5927927927927928, 0.6, 0.5745173745173745, 0.5096525096525096] 2 
| Mistral_7B = [0.584041184041184, 0.5812097812097812, 0.5747747747747748, 0.564993564993565] 3 | Mixtral_8x7B = [0.7799227799227799, 0.7727155727155727, 0.7693693693693694, 0.6586872586872586] 4 | LLaMA2_7B = [0.425997425997426, 0.42368082368082366, 0.45302445302445304, 0.5181467181467182] 5 | LLaMA2_13B = [0.6146718146718146, 0.6128700128700129, 0.5783783783783784, 0.6015444015444016] 6 | LLaMA2_70B = [0.7235521235521235, 0.7240669240669241, 0.7132561132561133, 0.6563706563706564] 7 | Falcon_7B = [0.41132561132561135, 0.41235521235521233, 0.43346203346203344, 0.27953667953667954] 8 | Falcon_40B = [0.007207207207207207, 0.0066924066924066924, 0.011583011583011582, 0.23191763191763193] 9 | Falcon_180B = [0.6478764478764478, 0.6507078507078508, 0.6756756756756757, 0.668983268983269] 10 | LLaMA2_7B_AWQ = [0.425997425997426, 0.425997425997426, 0.39510939510939513, 0.4936936936936937] 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/commonsense/wa_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.5927927927927928, 0.5871299871299871, 0.563963963963964] 2 | Mistral_7B = [0.584041184041184, 0.5791505791505791, 0.5716859716859717] 3 | Mixtral_8x7B = [0.7799227799227799, 0.7711711711711712, 0.767052767052767] 4 | LLaMA2_7B = [0.425997425997426, 0.4095238095238095, 0.435006435006435] 5 | LLaMA2_13B = [0.6146718146718146, 0.6162162162162163, 0.5752895752895753] 6 | LLaMA2_70B = [0.7235521235521235, 0.7202059202059202, 0.722007722007722] 7 | Falcon_7B = [0.41132561132561135, 0.4200772200772201, 0.43346203346203344] 8 | Falcon_40B = [0.007207207207207207, 0.005920205920205921, 0.010553410553410553] 9 | Falcon_180B = [0.6478764478764478, 0.6316602316602317, 0.6398970398970399] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/deontology/kv_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.585928809788654, 0.5914905450500556, 0.5817575083426029, 0.5745272525027809] 2 | Mistral_7B = [0.5670189098998888, 0.5667408231368187, 0.5656284760845384, 0.5444938820912124] 3 | Mixtral_8x7B = [0.6270856507230256, 0.628476084538376, 0.6084538375973304, 0.5684093437152391] 4 | LLaMA2_7B = [0.5795328142380423, 0.5792547274749722, 0.5561735261401557, 0.49638487208008897] 5 | LLaMA2_13B = [0.3576195773081201, 0.3542825361512792, 0.46662958843159064, 0.4051724137931034] 6 | LLaMA2_70B = [0.5203003337041157, 0.5191879866518354, 0.482202447163515, 0.5439377085650723] 7 | Falcon_7B = [0.4635706340378198, 0.4624582869855395, 0.43325917686318133, 0.36012235817575083] 8 | Falcon_40B = [0.09983314794215796, 0.10066740823136819, 0.12903225806451613, 0.1735261401557286] 9 | Falcon_180B = [0.5525583982202447, 0.5517241379310345, 0.5439377085650723, 0.39265850945494996] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/deontology/w_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.585928809788654, 0.5862068965517241, 0.5976084538375973, 0.5781423804226918] 2 | Mistral_7B = [0.5670189098998888, 0.5672969966629589, 0.550055617352614, 0.5478309232480534] 3 | Mixtral_8x7B = [0.6270856507230256, 0.6307007786429366, 0.6031701890989989, 0.564238042269188] 4 | LLaMA2_7B = 
[0.5795328142380423, 0.5759176863181312, 0.5439377085650723, 0.3879310344827586] 5 | LLaMA2_13B = [0.3576195773081201, 0.3681868743047831, 0.13264738598442713, 0.27335928809788657] 6 | LLaMA2_70B = [0.5203003337041157, 0.5186318131256952, 0.514182424916574, 0.4972191323692992] 7 | Falcon_7B = [0.4635706340378198, 0.4582869855394883, 0.47302558398220246, 0.4435483870967742] 8 | Falcon_40B = [0.09983314794215796, 0.09955506117908787, 0.05589543937708565, 0.21885428253615127] 9 | Falcon_180B = [0.5525583982202447, 0.550055617352614, 0.5389321468298109, 0.16879866518353726] 10 | LLaMA2_7B_AWQ = [0.5795328142380423, 0.5803670745272526, 0.5689655172413793, 0.5150166852057843] 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/deontology/wa_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.585928809788654, 0.5862068965517241, 0.5906562847608454] 2 | Mistral_7B = [0.5670189098998888, 0.5709121245828699, 0.5556173526140156] 3 | Mixtral_8x7B = [0.6270856507230256, 0.6256952169076752, 0.6048387096774194] 4 | LLaMA2_7B = [0.5795328142380423, 0.5700778642936596, 0.5419911012235817] 5 | LLaMA2_13B = [0.3576195773081201, 0.3973859844271413, 0.15878754171301446] 6 | LLaMA2_70B = [0.5203003337041157, 0.546440489432703, 0.5414349276974416] 7 | Falcon_7B = [0.4635706340378198, 0.4363181312569522, 0.47163515016685204] 8 | Falcon_40B = [0.09983314794215796, 0.10372636262513904, 0.04922135706340378] 9 | Falcon_180B = [0.5525583982202447, 0.47914349276974416, 0.48526140155728587] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/justice/kv_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.7255917159763313, 0.7189349112426036, 0.7089497041420119, 0.6501479289940828] 2 | Mistral_7B = [0.6664201183431953, 0.6667899408284024, 0.6697485207100592, 0.6793639053254438] 3 | Mixtral_8x7B = [0.7340976331360947, 0.7344674556213018, 0.7181952662721893, 0.650887573964497] 4 | LLaMA2_7B = [0.6035502958579881, 0.6042899408284024, 0.6142751479289941, 0.5232988165680473] 5 | LLaMA2_13B = [0.33986686390532544, 0.33986686390532544, 0.4345414201183432, 0.3890532544378698] 6 | LLaMA2_70B = [0.6590236686390533, 0.6605029585798816, 0.6649408284023669, 0.5968934911242604] 7 | Falcon_7B = [0.46079881656804733, 0.45710059171597633, 0.45968934911242604, 0.3816568047337278] 8 | Falcon_40B = [0.04105029585798817, 0.04105029585798817, 0.0695266272189349, 0.13942307692307693] 9 | Falcon_180B = [0.6268491124260355, 0.6290680473372781, 0.621301775147929, 0.4478550295857988] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/justice/w_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.7255917159763313, 0.7200443786982249, 0.7218934911242604, 0.6527366863905325] 2 | Mistral_7B = [0.6664201183431953, 0.6671597633136095, 0.6797337278106509, 0.6338757396449705] 3 | Mixtral_8x7B = [0.7340976331360947, 0.7329881656804734, 0.7233727810650887, 0.5732248520710059] 4 | LLaMA2_7B = [0.6035502958579881, 0.6057692307692307, 0.5972633136094675, 0.46449704142011833] 5 | LLaMA2_13B = [0.33986686390532544, 0.34985207100591714, 0.14497041420118342, 0.24149408284023668] 6 | LLaMA2_70B = [0.6590236686390533, 
0.6593934911242604, 0.6105769230769231, 0.5277366863905325] 7 | Falcon_7B = [0.46079881656804733, 0.46116863905325445, 0.46486686390532544, 0.4992603550295858] 8 | Falcon_40B = [0.04105029585798817, 0.039571005917159764, 0.015162721893491124, 0.21930473372781065] 9 | Falcon_180B = [0.6268491124260355, 0.6316568047337278, 0.6183431952662722, 0.21375739644970415] 10 | LLaMA2_7B_AWQ = [0.6035502958579881, 0.6042899408284024, 0.6076183431952663, 0.4966715976331361] 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/justice/wa_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.7255917159763313, 0.7230029585798816, 0.7211538461538461] 2 | Mistral_7B = [0.6664201183431953, 0.6704881656804734, 0.6826923076923077] 3 | Mixtral_8x7B = [0.7340976331360947, 0.7326183431952663, 0.7270710059171598] 4 | LLaMA2_7B = [0.6035502958579881, 0.6150147928994083, 0.5832100591715976] 5 | LLaMA2_13B = [0.33986686390532544, 0.36797337278106507, 0.18010355029585798] 6 | LLaMA2_70B = [0.6590236686390533, 0.6886094674556213, 0.6264792899408284] 7 | Falcon_7B = [0.46079881656804733, 0.4430473372781065, 0.4467455621301775] 8 | Falcon_40B = [0.04105029585798817, 0.059911242603550297, 0.01257396449704142] 9 | Falcon_180B = [0.6268491124260355, 0.5329142011834319, 0.5417899408284024] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/virtue/kv_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.8578894472361809, 0.8655276381909548, 0.8625125628140704, 0.834572864321608] 2 | Mistral_7B = [0.6840201005025126, 0.6850251256281407, 0.6974874371859296, 0.6148743718592965] 3 | Mixtral_8x7B = [0.9252261306532663, 0.9246231155778895, 0.9214070351758794, 0.878391959798995] 4 | LLaMA2_7B = [0.6753768844221105, 0.6757788944723618, 0.700502512562814, 0.6858291457286432] 5 | LLaMA2_13B = [0.7995979899497487, 0.7989949748743719, 0.8142713567839196, 0.7155778894472362] 6 | LLaMA2_70B = [0.9242211055276381, 0.9246231155778895, 0.9232160804020101, 0.914572864321608] 7 | Falcon_7B = [0.16522613065326633, 0.16241206030150754, 0.22914572864321608, 0.3210050251256281] 8 | Falcon_40B = [0.08321608040201005, 0.08442211055276382, 0.06793969849246231, 0.04904522613065326] 9 | Falcon_180B = [0.9401005025125628, 0.939497487437186, 0.938894472361809, 0.9298492462311557] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/virtue/w_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.8578894472361809, 0.8613065326633166, 0.8574874371859297, 0.8315577889447237] 2 | Mistral_7B = [0.6840201005025126, 0.6846231155778895, 0.6671356783919598, 0.76] 3 | Mixtral_8x7B = [0.9252261306532663, 0.9256281407035176, 0.9238190954773869, 0.9053266331658292] 4 | LLaMA2_7B = [0.6753768844221105, 0.6719597989949748, 0.7141708542713567, 0.7382914572864322] 5 | LLaMA2_13B = [0.7995979899497487, 0.8008040201005026, 0.8168844221105528, 0.8247236180904522] 6 | LLaMA2_70B = [0.9242211055276381, 0.9248241206030151, 0.9230150753768844, 0.9139698492462311] 7 | Falcon_7B = [0.16522613065326633, 0.17427135678391958, 0.31316582914572866, 0.2064321608040201] 8 | Falcon_40B = [0.08321608040201005, 0.0850251256281407, 0.05849246231155779, 
0.05628140703517588] 9 | Falcon_180B = [0.9401005025125628, 0.938894472361809, 0.9350753768844221, 0.9306532663316583] 10 | LLaMA2_7B_AWQ = [0.6753768844221105, 0.675175879396985, 0.6605025125628141, 0.647035175879397] 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/virtue/wa_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.8578894472361809, 0.8574874371859297, 0.8536683417085427] 2 | Mistral_7B = [0.6840201005025126, 0.6727638190954773, 0.650251256281407] 3 | Mixtral_8x7B = [0.9252261306532663, 0.925427135678392, 0.9228140703517588] 4 | LLaMA2_7B = [0.6753768844221105, 0.6655276381909547, 0.7035175879396985] 5 | LLaMA2_13B = [0.7995979899497487, 0.8044221105527638, 0.815678391959799] 6 | LLaMA2_70B = [0.9242211055276381, 0.9232160804020101, 0.9189949748743719] 7 | Falcon_7B = [0.16522613065326633, 0.1306532663316583, 0.3499497487437186] 8 | Falcon_40B = [0.08321608040201005, 0.05748743718592965, 0.027537688442211054] 9 | Falcon_180B = [0.9401005025125628, 0.938894472361809, 0.935678391959799] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | from matplotlib.lines import Line2D 5 | from matplotlib.ticker import FormatStrFormatter 6 | import importlib 7 | import numpy as np 8 | 9 | # Since we plot the figures in GUI mode, the options below should be set manually and locally in the 10 | # script instead of being passed as command-line args. 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--save_path', type=str, default='./figures/') 14 | parser.add_argument('--save_name', type=str, default=None) 15 | parser.add_argument('--plot_mode', type=str, default='kv', choices=['w', 'wa', 'kv']) 16 | parser.add_argument('--dataset_name', type=str, default='Multi-Doc_QA') 17 | parser.add_argument('--norm', action='store_true') 18 | parser.add_argument("--num_bins", type=int, default=30, help="bin number") 19 | parser.add_argument('--x_label', type=str, default='Bit-width') 20 | parser.add_argument('--y_label', type=str, default='Accuracy') 21 | parser.add_argument('--legend_loc', type=str, default='lower left') 22 | args = parser.parse_args() 23 | 24 | 25 | ##################################### Helper Functions ###################################### 26 | def result_pad(raw_results, fp_idx=0): 27 | raw_fp_result = raw_results[fp_idx] 28 | new_results = [x if x is not None else 0 for x in raw_results] 29 | new_results[fp_idx] = raw_fp_result 30 | return new_results 31 | 32 | def normalize_results(raw_results, fp_idx=0): 33 | # raw_results = result_pad(raw_results) 34 | if raw_results[fp_idx] is not None and raw_results[fp_idx] != 0: 35 | norm_results = [i / raw_results[fp_idx] * 100 if i is not None else None for i in raw_results] 36 | print('Result Normalization Succeeded.') 37 | else: 38 | norm_results = raw_results 39 | print('The input results have no FP baseline; returning the original results.') 40 | return norm_results 41 | 42 | MARKERS = ['o', '^', 's', 'd', '+'] 43 | COLORS = ['tab:blue', 'tab:red', 'tab:green', 'tab:orange', 'tab:purple', 'tab:brown'] 44 | MODELS = ["Mistral_7B", "Mixtral_8x7B", "LLaMA2_7B", "LLaMA2_70B"] 45 | TASKS = ['commonsense', 'deontology',
'justice', 'virtue'] 46 | MIN = [0.2, 0.2, 0.5, 0.6] 47 | MAX = [0.81, 0.71, 0.8, 0.95] 48 | 49 | ###################################### Plot Functions ###################################### 50 | if __name__ == '__main__': 51 | # create a folder to save the resulting plot. 52 | if not os.path.exists(args.save_path): 53 | os.makedirs(args.save_path) 54 | 55 | # plot different tasks 56 | for t_i, task in enumerate(TASKS): 57 | # plot name 58 | save_name = args.save_name + '.pdf' if args.save_name is not None else \ 59 | task + '_' + args.plot_mode + '_' + args.x_label + '.pdf' 60 | save_name = args.save_path + save_name 61 | 62 | # import data 63 | data = importlib.import_module("ethics.{}.{}_result".format(task, args.plot_mode)) 64 | 65 | # init canvas 66 | fig, ax = plt.subplots(figsize=[8., 6.]) 67 | 68 | # make the plot compact 69 | plt.subplots_adjust(left=0.11, right=0.99, top=0.99, bottom=0.11) 70 | 71 | # set figure labels 72 | plt.xlabel(args.x_label, fontsize=20) 73 | plt.ylabel(args.y_label if not args.norm else 'Normalized ' + args.y_label, fontsize=20) 74 | plt.ylim(MIN[t_i], MAX[t_i]) 75 | plt.yticks(np.arange(MIN[t_i], MAX[t_i], 0.1)) 76 | 77 | # set axes font size 78 | ax.tick_params(axis='x', labelsize=20) 79 | ax.tick_params(axis='y', labelsize=20) 80 | 81 | x_axis = { 82 | 'w': ['FP16', 'W8', 'W4', 'W3'], 83 | 'wa': ['FP16', 'W8A8', 'W4A8'], 84 | 'kv': ['FP16', 'KV8', 'KV4', 'KV3'], 85 | }[args.plot_mode] 86 | 87 | for i, model_name in enumerate(MODELS): 88 | # fetch the result list by attribute name (safer than eval) 89 | y_value = getattr(data, model_name) 90 | ax.plot(x_axis, y_value, marker=MARKERS[0], markersize=16, label=model_name, linestyle="-", color=COLORS[i]) 91 | # ax.plot(x_axis[:len(y_value)], y_value, marker=MARKERS[j], markersize=13, linestyle="-", color=COLORS[i]) 92 | 93 | # set legend location 94 | # ax.legend(loc=args.legend_loc, fontsize=20) 95 | ax.legend(fontsize=20) 96 | 97 | # legend_elements = [] 98 | # for i, _ in enumerate(MODELS): 99 | # legend_elements.append(Line2D([0], [0], marker=MARKERS[i], color='w', label=mode[i], markerfacecolor='tab:brown', markersize=16)) 100 | # ax.legend(handles=legend_elements, fontsize=20) 101 | 102 | plt.savefig(save_name) 103 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/truthfulqa/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | from matplotlib.lines import Line2D 5 | from matplotlib.ticker import FormatStrFormatter 6 | import importlib 7 | import numpy as np 8 | 9 | # Since we plot the figures in GUI mode, the options below should be set manually and locally in the 10 | # script instead of being passed as command-line args.
11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--save_path', type=str, default='./figures/') 14 | parser.add_argument('--save_name', type=str, default=None) 15 | parser.add_argument('--plot_mode', type=str, default='kv', choices=['w', 'wa', 'kv']) 16 | parser.add_argument("--num_bins", type=int, default=30, help="bin number") 17 | parser.add_argument("--metric", type=str, default="mc1", choices=["mc1", "mc2"]) 18 | parser.add_argument('--x_label', type=str, default='Bit-width') 19 | parser.add_argument('--y_label', type=str, default='Accuracy') 20 | parser.add_argument('--legend_loc', type=str, default='lower left') 21 | args = parser.parse_args() 22 | 23 | 24 | ##################################### Helper Functions ###################################### 25 | def result_pad(raw_results, fp_idx=0): 26 | raw_fp_result = raw_results[fp_idx] 27 | new_results = [x if x is not None else 0 for x in raw_results] 28 | new_results[fp_idx] = raw_fp_result 29 | return new_results 30 | 31 | MARKERS = ['o', '^', 's', 'd', '+'] 32 | COLORS = ['tab:blue', 'tab:red', 'tab:green', 'tab:orange', 'tab:purple', 'tab:brown', 'tab:pink'] 33 | # MODELS = ["Mistral_7B", "Mixtral_8x7B", "LLaMA2_7B", "LLaMA2_13B", "LLaMA2_70B", "Falcon_7B", "Falcon_40B"] 34 | MODELS = ["Mistral_7B", "Mixtral_8x7B", "LLaMA2_7B", "LLaMA2_13B", "LLaMA2_70B"] 35 | 36 | ###################################### Plot Functions ###################################### 37 | if __name__ == '__main__': 38 | # create a folder to save the resulting plot. 39 | if not os.path.exists(args.save_path): 40 | os.makedirs(args.save_path) 41 | 42 | # plot name 43 | save_name = args.save_name + '.pdf' if args.save_name is not None else \ 44 | args.plot_mode + '_' + args.x_label + '.pdf' 45 | save_name = args.save_path + save_name 46 | 47 | # import data 48 | data = importlib.import_module("results.{}_{}_result".format(args.plot_mode, args.metric)) 49 | 50 | # init canvas 51 | fig, ax = plt.subplots(figsize=[8., 6.]) 52 | 53 | # make the plot compact 54 | plt.subplots_adjust(left=0.11, right=0.99, top=0.99, bottom=0.11) 55 | 56 | # set figure labels 57 | plt.xlabel(args.x_label, fontsize=20) 58 | plt.ylabel(args.y_label, fontsize=20) 59 | plt.ylim(0.2, 0.6) 60 | plt.yticks(np.arange(0.2, 0.6, 0.1)) 61 | 62 | # set axes font size 63 | ax.tick_params(axis='x', labelsize=20) 64 | ax.tick_params(axis='y', labelsize=20) 65 | 66 | # x_axis = { 67 | # 'w': ['FP16', 'W8', 'W4', 'W3'], 68 | # 'wa': ['FP16', 'W8A8', 'W4A8'], 69 | # 'kv': ['FP16', 'KV8', 'KV4', 'KV3'], 70 | # }[args.plot_mode] 71 | x_axis = { 72 | 'w': ['FP16', 'W8', 'W4', 'W3', 'W2'], 73 | 'wa': ['FP16', 'W8A8', 'W4A8', 'W4A4'], 74 | 'kv': ['FP16', 'KV8', 'KV4', 'KV3', 'KV2'], 75 | }[args.plot_mode] 76 | 77 | for i, model_name in enumerate(MODELS): 78 | # fetch the result list by attribute name (safer than eval) 79 | y_value = getattr(data, model_name) 80 | ax.plot(x_axis, y_value, marker=MARKERS[0], markersize=16, label=model_name, linestyle="-", color=COLORS[i]) 81 | 82 | ax.legend(fontsize=20) 83 | 84 | plt.savefig(save_name) 85 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/truthfulqa/results/kv_mc1_result.py: -------------------------------------------------------------------------------- 1 | Mistral_7B = [0.5177478580171359, 0.5177478580171359, 0.49938800489596086, 0.47368421052631576, 0.27539779681762544] 2 | Mixtral_8x7B = [0.4969400244798042, 0.49326805385556916, 0.4541003671970624, 0.4222766217870257, 0.24112607099143207] 3 | LLaMA2_7B = 
LLaMA2_7B = [0.29498164014687883, 0.2974296205630355, 0.30599755201958384, 0.24969400244798043, 0.2386780905752754]
LLaMA2_13B = [0.28518971848225216, 0.2827417380660955, 0.2741738066095471, 0.2913096695226438, 0.2252141982864137]
LLaMA2_70B = [0.3561811505507956, 0.3574051407588739, 0.3561811505507956, 0.34149326805385555, 0.23011015911872704]
Falcon_7B = [0.2876376988984088, 0.2876376988984088, 0.2876376988984088, 0.26560587515299877, 0.22643818849449204]
Falcon_40B = [0.3733170134638923, 0.37209302325581395, 0.3806609547123623, 0.37454100367197063, 0.26805385556915545]
Falcon_180B = [0.36964504283965727, 0.36964504283965727, 0.36964504283965727, 0.3708690330477356, 0.29498164014687883]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/kv_mc2_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.6688017220512057, 0.6691715738543765, 0.6564044217068896, 0.6427100548674619, 0.48027649255784544]
Mixtral_8x7B = [0.6484817260542265, 0.6495993649117421, 0.629318751867763, 0.5994892449410594, 0.4926746070653021]
LLaMA2_7B = [0.44926537410693956, 0.44936809200977723, 0.45751973212284464, 0.39193280615715015, 0.47475433863980704]
LLaMA2_13B = [0.4406564723437067, 0.43997647262514694, 0.43465132855093525, 0.4525000742515952, 0.4510122041900766]
LLaMA2_70B = [0.5275437344080325, 0.5279487941769914, 0.526220891244331, 0.5207520579654967, 0.4378068545954345]
Falcon_7B = [0.4407104445643683, 0.4410793932995058, 0.44445645688981794, 0.41868173734281455, 0.45280967780345155]
Falcon_40B = [0.5247662521731141, 0.5246133475850496, 0.5284034579971917, 0.522891477364859, 0.46078848831981034]
Falcon_180B = [0.5335740147660477, 0.5333871339139281, 0.5332405981470844, 0.5334614774880382, 0.46799068584054326]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/w_mc1_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.5177478580171359, 0.5165238678090576, 0.4883720930232558, 0.45532435740514077, 0.27050183598531213]
Mixtral_8x7B = [0.4969400244798042, 0.4944920440636475, 0.4724602203182375, 0.4724602203182375, 0.24112607099143207]
LLaMA2_7B = [0.29498164014687883, 0.2962056303549572, 0.29498164014687883, 0.2937576499388005, 0.26193390452876375]
LLaMA2_13B = [0.28518971848225216, 0.2802937576499388, 0.2864137086903305, 0.26805385556915545, 0.25458996328029376]
LLaMA2_70B = [0.3561811505507956, 0.3561811505507956, 0.34516523867809057, 0.32068543451652387, 0.24969400244798043]
Falcon_7B = [0.2876376988984088, 0.28886168910648713, 0.2778457772337821, 0.2668298653610771, 0.2350061199510404]
Falcon_40B = [0.3733170134638923, 0.3733170134638923, 0.36964504283965727, 0.36107711138310894, 0.23133414932680538]
Falcon_180B = [0.36964504283965727, 0.36964504283965727, 0.3574051407588739, 0.34761321909424725, 0.2876376988984088]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/w_mc2_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.6688017220512057, 0.6696166302279059, 0.6577095905502711, 0.6138682987896682, 0.5058903110407331]
Mixtral_8x7B = [0.6484817260542265, 0.6515374278400258, 0.6247955109142671, 0.6397627935078577, 0.4830758940244642]
LLaMA2_7B = [0.44926537410693956, 0.4496653069702225, 0.456922166094929, 0.45485949562068634, 0.511788507924486]
LLaMA2_13B = [0.4406564723437067, 0.43933660745531866, 0.43538230062712213, 0.411918030468475, 0.47564118986968823]
LLaMA2_70B = [0.5275437344080325, 0.5275547367866754, 0.5159843363616804, 0.49391937697797156, 0.39547118299925027]
Falcon_7B = [0.4407104445643683, 0.4409154529480796, 0.43489265554092676, 0.4297975191130771, 0.4701201417345436]
Falcon_40B = [0.5247662521731141, 0.5249681834735866, 0.515695665047965, 0.5164459705459207, 0.4500814796622027]
Falcon_180B = [0.5335740147660477, 0.5335126034400929, 0.5318512061393834, 0.5068406829747578, 0.477815457771522]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/wa_mc1_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.5177478580171359, 0.5201958384332925, 0.4969400244798042, 0.28886168910648713]
Mixtral_8x7B = [0.4969400244798042, 0.4834761321909425, 0.4834761321909425, 0.25703794369645044]
LLaMA2_7B = [0.29498164014687883, 0.2962056303549572, 0.2937576499388005, 0.25458996328029376]
LLaMA2_13B = [0.28518971848225216, 0.28151774785801714, 0.27906976744186046, 0.2558139534883721]
LLaMA2_70B = [0.3561811505507956, 0.34394124847001223, 0.3378212974296206, 0.23133414932680538]
Falcon_7B = [0.2876376988984088, 0.2937576499388005, 0.2802937576499388, 0.23255813953488372]
Falcon_40B = [0.3733170134638923, 0.3769889840881273, 0.3623011015911873, 0.22031823745410037]
Falcon_180B = [0.36964504283965727, 0.37209302325581395, 0.3623011015911873, 0.22766217870257038]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/wa_mc2_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.6688017220512057, 0.6723008908877496, 0.6595733109987866, 0.4852569797437285]
Mixtral_8x7B = [0.6484817260542265, 0.6450170044402032, 0.6305351114450428, 0.4898087829899215]
LLaMA2_7B = [0.44926537410693956, 0.4491759979927774, 0.4563169228667865, 0.4912608412585181]
LLaMA2_13B = [0.4406564723437067, 0.4387751750922574, 0.42988544244530175, 0.5008732700965536]
LLaMA2_70B = [0.5275437344080325, 0.520717869815872, 0.5098099728598039, 0.48717163966923716]
Falcon_7B = [0.4407104445643683, 0.4450711520882632, 0.4350850113436507, 0.4324344947777]
Falcon_40B = [0.5247662521731141, 0.5216728790262783, 0.5123887987899755, 0.4816507929107791]
Falcon_180B = [0.5335740147660477, 0.5367397258662, 0.5329333474493646, 0.4821977821247397]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# basic requirements
torch>=2.0
xopen
anthropic

# huggingface requirements
accelerate>=0.21.0
transformers==4.44.2
datasets
evaluate

# for logging or special models, please try installing the following packages
wandb
sentencepiece

# for longbench
tqdm
rouge
jieba
fuzzywuzzy
einops

# lm_eval
lm_eval==0.3.0

# for visualization
seaborn

# for conversation tasks
anthropic
openai==0.28
fschat

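# note: flash-attn itself stays commented out below since building it needs a
# CUDA toolchain; a common route (an assumption, not part of this repo's docs)
# is to install it separately once packaging and ninja are in place, e.g.
#   pip install flash-attn --no-build-isolation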
# for flash attention
packaging
ninja
# flash-attn
triton>=2.1.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from setuptools import setup, find_packages

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="qllm_eval",
    version="0.1.0",
    description="QLLM_Evaluation",
    author="Shiyao Li",
    author_email="shiyao1620@gmail.com",
    # url="https://github.com/LSY-noya/QLLM-Evaluation.git",
    packages=find_packages(),
    license="MIT",
    long_description=long_description,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
)
--------------------------------------------------------------------------------