├── .gitignore ├── LICENSE ├── README.md ├── figures ├── logo_Infinigence-ai.png └── logo_nicsefc.jpg ├── main.py ├── qllm_eval ├── evaluation │ ├── __init__.py │ ├── q_dialogue │ │ ├── README.md │ │ ├── fschat_dat │ │ │ ├── judge_prompts.jsonl │ │ │ ├── mt_bench │ │ │ │ ├── misc │ │ │ │ │ └── radar.png │ │ │ │ ├── model_answer │ │ │ │ │ └── llama2-7b-chat_quant_w_4_kv_4.jsonl │ │ │ │ ├── question.jsonl │ │ │ │ └── reference_answer │ │ │ │ │ └── gpt-4.jsonl │ │ │ └── vicuna_bench │ │ │ │ ├── question.jsonl │ │ │ │ └── reference_answer │ │ │ │ └── gpt-4.jsonl │ │ ├── gen_judgment.py │ │ ├── gen_model_answer.py │ │ └── show_result.py │ ├── q_ethic │ │ ├── README.md │ │ └── main.py │ ├── q_harness │ │ ├── README.md │ │ ├── __init__.py │ │ ├── lm_eval_adaptor.py │ │ └── main.py │ ├── q_long │ │ ├── README.md │ │ ├── __init__.py │ │ ├── longeval │ │ │ ├── LICENSE │ │ │ ├── __init__.py │ │ │ ├── generate_testcase.py │ │ │ ├── generate_testcases_configs.yaml │ │ │ └── utils.py │ │ ├── main_litm.py │ │ └── main_longeval.py │ └── q_opencompass │ │ ├── README.md │ │ ├── main.py │ │ ├── tasks │ │ ├── __init__.py │ │ └── openicl_infer_quant.py │ │ └── utils │ │ ├── __init__.py │ │ ├── build.py │ │ ├── models.py │ │ └── run.py ├── methods │ ├── __init__.py │ └── rep │ │ ├── __init__.py │ │ ├── apply_rep.py │ │ └── utils.py ├── quantization │ ├── __init__.py │ ├── qattn │ │ ├── __init__.py │ │ └── sw │ │ │ ├── __init__.py │ │ │ ├── bloom_attn.py │ │ │ ├── deepseekv2_attn.py │ │ │ ├── falcon_attn.py │ │ │ ├── flash_attn_triton.py │ │ │ ├── gemma_attn.py │ │ │ ├── glm2_attn.py │ │ │ ├── glm3_attn.py │ │ │ ├── llama_attn.py │ │ │ ├── mistral_attn.py │ │ │ ├── mixtral_attn.py │ │ │ ├── mpt_attn.py │ │ │ ├── opt_attn.py │ │ │ └── stable_attn.py │ ├── qlinear │ │ ├── __init__.py │ │ └── sqwa.py │ ├── quant_funcs.py │ └── quant_wrapper.py ├── utils.py └── visualization │ ├── __init__.py │ ├── basic │ ├── assets │ │ └── lambada_w_only.py │ ├── heatmap_plot.py │ ├── plot.py │ ├── ppl_distribution_plot.py │ ├── radar_plot.py │ ├── radar_ratio_range_plot.py │ ├── storage_overhead_barplot.py │ ├── storage_overhead_curveplot.py │ ├── storage_overhead_plot.py │ └── utils.py │ ├── dialogue │ ├── assets │ │ ├── __init__.py │ │ ├── mt_bench_kv_cache.py │ │ ├── mt_bench_w_a.py │ │ └── mt_bench_w_only.py │ └── gen_table.py │ ├── emergent │ ├── emergent │ │ ├── arc-c_kv.txt │ │ ├── arc-c_w.txt │ │ ├── arc-c_wa.txt │ │ ├── arc-e_kv.txt │ │ ├── arc-e_w.txt │ │ ├── arc-e_wa.txt │ │ ├── ceval_kv.txt │ │ ├── ceval_w.txt │ │ ├── ceval_wa.txt │ │ ├── gsm8k_kv.txt │ │ ├── gsm8k_w.txt │ │ ├── gsm8k_wa.txt │ │ ├── hellaswag_kv.txt │ │ ├── hellaswag_w.txt │ │ ├── hellaswag_wa.txt │ │ ├── mmlu-calib_kv.txt │ │ ├── mmlu-calib_w.txt │ │ ├── mmlu-calib_wa.txt │ │ ├── mmlu_kv.txt │ │ ├── mmlu_w.txt │ │ ├── mmlu_wa.txt │ │ ├── strategyqa_kv.txt │ │ ├── strategyqa_w.txt │ │ └── strategyqa_wa.txt │ ├── plot_ability.py │ └── plot_other_set.py │ ├── long │ ├── litm │ │ ├── plot.py │ │ └── pos │ │ │ ├── kv_length_result.py │ │ │ ├── w_length_result.py │ │ │ └── wa_length_result.py │ └── longeval │ │ ├── length │ │ ├── kv_length_result.py │ │ ├── w_length_result.py │ │ ├── wa_length_result.py │ │ └── wkv_length_result.py │ │ ├── plot.py │ │ └── pos │ │ ├── kv_length_result.py │ │ ├── w_length_result.py │ │ ├── wa_length_result.py │ │ └── wkv_length_result.py │ └── trustworthiness │ ├── adv_glue │ ├── assets │ │ ├── hf_chatglm3_6b │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── 
w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_chatglm3_6b_32k │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_falcon_40b_instruct │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_falcon_7b_instruct │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_llama2_13b_chat │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_llama2_70b_chat │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_llama2_7b_chat │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ ├── hf_mistral_7b_instruct │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ │ └── hf_mixtral_8x7b_instruct │ │ │ ├── fp16.csv │ │ │ ├── kv3.csv │ │ │ ├── kv4.csv │ │ │ ├── kv8.csv │ │ │ ├── w3.csv │ │ │ ├── w4.csv │ │ │ ├── w4a8.csv │ │ │ ├── w8.csv │ │ │ └── w8a8.csv │ └── proc_results.py │ ├── ethics │ ├── ethics │ │ ├── commonsense │ │ │ ├── kv_result.py │ │ │ ├── w_result.py │ │ │ └── wa_result.py │ │ ├── deontology │ │ │ ├── kv_result.py │ │ │ ├── w_result.py │ │ │ └── wa_result.py │ │ ├── justice │ │ │ ├── kv_result.py │ │ │ ├── w_result.py │ │ │ └── wa_result.py │ │ └── virtue │ │ │ ├── kv_result.py │ │ │ ├── w_result.py │ │ │ └── wa_result.py │ └── plot.py │ └── truthfulqa │ ├── plot.py │ └── results │ ├── kv_mc1_result.py │ ├── kv_mc2_result.py │ ├── w_mc1_result.py │ ├── w_mc2_result.py │ ├── wa_mc1_result.py │ └── wa_mc2_result.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.vscode/ 6 | *outputs* 7 | *bkp* 8 | test/ 9 | statistic/ 10 | statistic_tensor 11 | *.sh* 12 | *.zip* 13 | *.tar.gz* 14 | 15 | # C extensions 16 | *.so 17 | 18 | # Distribution / packaging 19 | .Python 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | wheels/ 32 | share/python-wheels/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .nox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | *.py,cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | cover/ 62 | 63 | # Translations 64 | *.mo 65 | *.pot 66 | 67 | # Django stuff: 68 | *.log 69 | local_settings.py 70 | db.sqlite3 71 | db.sqlite3-journal 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
169 | .idea/ 170 | 171 | # OpenCompass outputs 172 | outputs/ 173 | # OpenCompass data 174 | data/ 175 | # OpenCompass intermediate files 176 | tmp/ 177 | icl_inference_output/ 178 | # OpenCompass config files 179 | qllm_eval/evaluation/q_opencompass/configs/ 180 | # OpenCompass Evaluation Scripts 181 | qllm_eval/evaluation/q_opencompass/scripts/ 182 | 183 | # OpenCompass tools for listing configs 184 | tools/ 185 | 186 | # vim cache files 187 | *.swp 188 | 189 | # visualization related files 190 | ## basic capacity plots 191 | qllm_eval/visualization/basic/results/ 192 | qllm_eval/visualization/basic/assets/* 193 | !qllm_eval/visualization/basic/assets/lambada_w_only.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [2024] [Shiyao Li, Xuefei Ning, Luning Wang, Tengxuan Liu, Xiangsheng Shi, Shengen Yan, Guohao Dai, Huazhong Yang, and Yu Wang] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /figures/logo_Infinigence-ai.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/figures/logo_Infinigence-ai.png -------------------------------------------------------------------------------- /figures/logo_nicsefc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/figures/logo_nicsefc.jpg -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | 4 | from qllm_eval.quantization.quant_wrapper import quantize_model 5 | from qllm_eval.utils import build_model_and_enc 6 | from qllm_eval.methods.rep.apply_rep import apply_awq 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--model_path", type=str, help="path of the hf model") 10 | parser.add_argument("--output_path", type=str, help="path to save the quantized model") 11 | parser.add_argument("--rep_file", type=str, help="path to load the reparameterization factors") 12 | parser.add_argument("--use_flash_attn", action="store_true") 13 | parser.add_argument("--w_group_size", type=int, default=128) 14 | parser.add_argument("--w_bit", type=int, default=16) 15 | parser.add_argument("--a_group_size", type=int, default=128) 16 | parser.add_argument("--a_bit", type=int, default=16) 17 | parser.add_argument("--kv_group_size", type=int, default=128) 18 | parser.add_argument("--kv_bit", type=int, default=16) 19 | args = parser.parse_args() 20 | 21 | 22 | def main(): 23 | # a hack here to auto set model group 24 | model, enc = build_model_and_enc(args.model_path, args.use_flash_attn, args.kv_bit, args.kv_group_size) 25 | 26 | if args.rep_file: 27 | rep_results = torch.load(args.rep_file, map_location="cpu") 28 | apply_awq(model, rep_results) 29 | 30 | # quantize model 31 | model = quantize_model(model, args) 32 | 33 | # save the quantized model 34 | if args.output_path: 35 | model.save_pretrained(args.output_path, safe_serialization=False) 36 | enc.save_pretrained(args.output_path) 37 | 38 | # evaluation 39 | # TODO: add evaluation functions 40 | prompt = "Hello, my name is human, and I like drinking" 41 | input_ids = enc(prompt, return_tensors="pt")['input_ids'].to(next(model.parameters()).device) 42 | output = model.generate(input_ids, do_sample=True, max_length=50, top_p=0.95, top_k=60) 43 | print(enc.decode(output[0])) 44 | 45 | 46 | if __name__ == "__main__": 47 | main() 48 | -------------------------------------------------------------------------------- /qllm_eval/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/__init__.py -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_dialogue/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation with LLM_judge 2 | Basic instructions on evaluating quantized LLMs with [LLM_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge). 
3 | 
4 | > Note that this repository only contains the code required by our experiments, including code for quantized LLMs' dialogue generation and "single"-mode GPT-4 judgement. Also be aware that the scripts are adapted from the original ones provided by [LLM_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge), so the script arguments are not exactly the same; please follow this document's instructions whenever they conflict. If you want to learn more, please refer to the original [LLM_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) repository.
5 | 
6 | ## Installation
7 | Change directory to `QLLM-Evaluation` and run the following commands to install our repository's requirements:
8 | ```
9 | cd path/to/QLLM-Evaluation
10 | pip install -e .
11 | pip install -r requirements.txt
12 | ```
13 | You don't need to download the FastChat source code, since the `fschat` package is already included in our `requirements.txt`.
14 | 
15 | ## Evaluate a quantized model on MT-bench
16 | ### Generate model answers to MT-bench questions
17 | Change directory to `QLLM-Evaluation/qllm_eval/evaluation/q_dialogue` and run `gen_model_answer.py`:
18 | ```
19 | python gen_model_answer.py \
20 |     --model-path [MODEL-PATH] \
21 |     --model-id [MODEL-ID] \
22 |     [--quant] \
23 |     [--w_bit [W-BIT]] \
24 |     [--w_group_size [W-GROUP-SIZE]] \
25 |     [--a_bit [A-BIT]] \
26 |     [--a_group_size [A-GROUP-SIZE]] \
27 |     [--kv_bit [KV-BIT]] \
28 |     [--kv_group_size [KV-GROUP-SIZE]] \
29 |     [--rep_file [REP-FILE]] \
30 |     [--use_flash_attn]
31 | ```
32 | The following arguments are required:
33 | - `[MODEL-PATH]` is the path to the weights, which can be a local folder or a HuggingFace repo ID.
34 | - `[MODEL-ID]` is a name you give to the model.
35 | 
36 | The following arguments are optional:
37 | - `--quant` indicates whether you want to generate dialogues using a quantized model.
38 | - `[W-BIT]`, `[A-BIT]`, `[KV-BIT]` are the quantization bit-widths for weight, activation and KV cache; all default to 16.
39 | - `[W-GROUP-SIZE]`, `[A-GROUP-SIZE]`, `[KV-GROUP-SIZE]` are the group sizes for group-wise quantization; all default to 128.
40 | - `[REP-FILE]` is the path to the cache file if you want to use AWQ or SmoothQuant.
41 | - `--use_flash_attn` indicates whether you want to use flash-attention to save memory and speed up decoding.
42 | 
43 | The answers will be saved to: `./fschat_dat/mt_bench/model_answer/[MODEL-ID].jsonl`.
44 | 
45 | > Also, please note that the `--model-id` argument should match one of [FastChat's supported models](https://github.com/lm-sys/FastChat/blob/main/docs/model_support.md) to get the proper prompt template. The matching rule of each supported model can be found in [model_adapter.py](https://github.com/lm-sys/FastChat/blob/main/fastchat/model/model_adapter.py). For example, `Llama2Adapter`'s matching rule is defined as:
46 | > ```python
47 | > def match(self, model_path: str):
48 | >     return "llama-2" in model_path.lower()
49 | > ```
50 | > Therefore, when evaluating llama-2 models, you should specify a `--model-id` argument containing the pattern "llama-2".
51 | > 
52 | > For unsupported models, you can either add support for your model according to [model_support.md](https://github.com/lm-sys/FastChat/blob/main/docs/model_support.md), or just use the default model adapter without any modification.
53 | 
54 | e.g.
55 | ```
56 | python gen_model_answer.py \
57 |     --model-path meta-llama/Llama-2-7b-chat-hf \
58 |     --model-id llama-2-7b-chat_quant_w_4_a_4 \
59 |     --quant \
60 |     --w_bit 4 \
61 |     --a_bit 4 \
62 |     --use_flash_attn
63 | ```
64 | 
65 | ### Generate GPT-4 judgements
66 | We only support single-answer grading here. This mode asks GPT-4 to grade and score the model's answer directly, without pairwise comparison. For each turn, GPT-4 gives a score on a scale of 10; we then compute the average score over all turns. Please note that while the original script supports passing more than one model_id to the `--model-list` argument, we recommend passing only one model_id each time for clarity.
67 | ```
68 | python gen_judgment.py \
69 |     --model-list [MODEL-ID] \
70 |     --save_name [MODEL-SAVENAME] \
71 |     [--parallel [PARALLEL]]
72 | ```
73 | The evaluation results will be saved to: `./fschat_dat/mt_bench/model_judgement/[MODEL-SAVENAME].jsonl`. Here we recommend setting `[MODEL-SAVENAME]` the same as `[MODEL-ID]` for simplicity.
74 | 
75 | e.g.
76 | ```
77 | python gen_judgment.py \
78 |     --model-list llama-2-7b-chat_quant_w_4_a_4 \
79 |     --parallel 4 \
80 |     --save_name llama-2-7b-chat_quant_w_4_a_4
81 | ```
82 | 
83 | ### Show MT-bench scores
84 | You can show all the available results by simply running:
85 | ```
86 | python show_result.py
87 | ```
88 | If you want to show results only for model_ids matching a specified pattern, you can pass the `--only_show` argument. For example, if you only want to see the results of llama models:
89 | ```
90 | python show_result.py --only_show llama
91 | ```
92 | 
93 | ### Results
94 | We open-source our results in [this directory](https://github.com/LSY-noya/QLLM-Evaluation/tree/main/qllm_eval/visualization/dialogue/assets), corresponding to the data we present in our paper.
95 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_dialogue/fschat_dat/mt_bench/misc/radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/q_dialogue/fschat_dat/mt_bench/misc/radar.png -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_dialogue/fschat_dat/mt_bench/model_answer/llama2-7b-chat_quant_w_4_kv_4.jsonl: -------------------------------------------------------------------------------- 1 | {"question_id": 95, "answer_id": "28UCTvfjRRMHyiD5H2tUFn", "model_id": "llama2-7b-chat_quant_w_4_kv_4", "choices": [{"index": 0, "turns": ["Thank you for entrusting me with this task! I'm happy to help you improve your language skills. The text you provided translates to \"I'm so excited for the party! I can't wait to see my friends!\" in English.\nHowever, I noticed a few minor errors in the original text. \"\u8863\u5e26\" should be \"clothes,\" \"\u6e10\u5bbd\" should be \"wide,\" and \"\u7ec8\u4e0d\u6094\" should be \"won't regret.\" Here's the refined and polished translation:\n\"I'm so excited for the party! I can't wait to see my friends and have a great time with them. I won't regret it!\"\nI hope this helps, and please feel free to ask me any questions or provide more texts for me to correct and enhance!", "Ah, I see! \"Ich verstehe\" is German for \"I understand.\" In English, we would say \"I understand.\" So, \"Ich verstehe\" can be translated to \"I understand.\" Is there anything else you'd like me to help you with?"]}], "tstamp": 1706367441.4954596}
\"Ich verstehe\" is German for \"I understand.\" In English, we would say \"I understand.\" So, \"Ich verstehe\" can be translated to \"I understand.\" Is there anything else you'd like me to help you with?"]}], "tstamp": 1706367441.4954596} 2 | -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_ethic/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation with ETHICS benchmark 2 | Basic instructions on evaluating quantized LLMs with ETHICS benchmark. 3 | 4 | ## Run ETHICS benchmark 5 | We implement an end-to-end evaluation flow for `ETHICS` benchmark. You can use the following command below to evaluate four subdatasets, including `commonsense`, `deontology`, `justice`, and `virtue`. 6 | 7 | ``` 8 | CUDA_VISIBLE_DEVICES=0 python main.py \ 9 | --model_path /Your/LLM/Path --output_dir /Your/Path/to/Results \ 10 | --w_group_size w_group_size --w_bit w_bit \ 11 | --a_group_size a_group_size --a_bit a_bit \ 12 | --kv_group_size kv_group_size --kv_bit kv_bit 13 | ``` 14 | 15 | > The evaluation results can be found in `/Your/Path/to/Results`. 16 | 17 | > Here, the main.py script is in `qllm_eval/evaluation/q_ethic`. 18 | -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_harness/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation with lm_evaluation_harness 2 | Basic instructions on evaluating quantized LLMs with [lm_evaluation_harness](https://github.com/EleutherAI/lm-evaluation-harness). 3 | 4 | ## Installation 5 | Install the lm_eval package: 6 | ``` 7 | pip install lm_eval==0.3.0 8 | ``` 9 | The package `lm_eval` has been included in our `requirements.txt`. 10 | 11 | 12 | ## Use lm_eval for evaluation 13 | Use the main.py in `qllm_eval/evaluation/q_harness` as an example, you can use the `--tasks A,B,C` to select different tasks for evaluation, where the `A,B,C` represents three different tasks in `lm_eval`. 14 | 15 | ``` 16 | CUDA_VISIBLE_DEVICES=0 python main.py \ 17 | --model_path /Your/LLM/Path --tasks A,B,C \ 18 | --w_group_size w_group_size --w_bit w_bit \ 19 | --a_group_size a_group_size --a_bit a_bit \ 20 | --kv_group_size kv_group_size --kv_bit kv_bit 21 | ``` 22 | 23 | > The tasks supported by `lm_eval` can be found [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md). 24 | 25 | > In our paper, we use `--tasks truthfulqa_mc` to evaluate quantized LLMs on the `TruthfulQA` dataset. 
26 | 27 | -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_harness/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/q_harness/__init__.py -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_harness/lm_eval_adaptor.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import torch 3 | from lm_eval.base import BaseLM 4 | 5 | 6 | class LMEvalAdaptor(BaseLM): 7 | 8 | def __init__(self, model_name, model, tokenizer, batch_size=1, max_length=-1): 9 | super().__init__() 10 | 11 | assert isinstance(batch_size, int) 12 | 13 | self.model_name = model_name 14 | self.model = model 15 | self.model.eval() 16 | 17 | self.tokenizer = tokenizer 18 | 19 | # assert isinstance(self.tokenizer, ( 20 | # transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast, 21 | # transformers.T5Tokenizer, transformers.T5TokenizerFast, 22 | # )), "this tokenizer has not been checked for compatibility yet!" 23 | 24 | self.vocab_size = self.tokenizer.vocab_size 25 | 26 | self._batch_size = batch_size 27 | 28 | self._max_length = max_length 29 | 30 | @property 31 | def eot_token_id(self): 32 | # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* 33 | return self.tokenizer.eos_token_id 34 | 35 | @property 36 | def max_length(self): 37 | if self._max_length != -1: 38 | return self._max_length 39 | if hasattr(self.model.config, 'n_ctx'): 40 | return self.model.config.n_ctx 41 | elif hasattr(self.model.config, 'max_position_embeddings'): 42 | return self.model.config.max_position_embeddings 43 | elif hasattr(self.model.config, 'n_positions'): 44 | return self.model.config.n_positions 45 | elif 'bloom' in self.model_name: 46 | return 2048 47 | elif 'llama' in self.model_name: 48 | return 2048 # TODO: did not check this 49 | else: 50 | print(self.model.config) 51 | raise NotImplementedError 52 | 53 | @property 54 | def max_gen_toks(self): 55 | return 256 56 | 57 | @property 58 | def batch_size(self): 59 | return self._batch_size 60 | 61 | @property 62 | def device(self): 63 | return "cuda" 64 | 65 | def tok_encode(self, string: str): 66 | return self.tokenizer.encode(string, add_special_tokens=False) 67 | 68 | def tok_decode(self, tokens): 69 | return self.tokenizer.decode(tokens) 70 | 71 | def _model_call(self, inps): 72 | """ 73 | inps: a torch tensor of shape [batch, sequence] 74 | the size of sequence may vary from call to call 75 | 76 | returns: a torch tensor of shape [batch, sequence, vocab] with the 77 | logits returned from the model 78 | """ 79 | with torch.no_grad(): 80 | if isinstance(self.model, transformers.models.t5.modeling_t5.T5ForConditionalGeneration): 81 | dec_inps = torch.cat( 82 | [ 83 | torch.tensor( 84 | self.model.generation_config.decoder_start_token_id, 85 | ) 86 | .tile(len(inps), 1) 87 | .to(inps), 88 | inps, 89 | ], 90 | dim=1, 91 | ) 92 | 93 | kwargs = {"decoder_input_ids": dec_inps,} 94 | else: 95 | kwargs = {} 96 | out = self.model(inps, **kwargs)[0] 97 | if "opt" in self.model_name: # there are a few extra tokens in opt, which we should omit 98 | return out[:, :, :50257] 99 | else: 100 | return out # [:, :, :self.tokenizer.vocab_size] 101 | 102 | def _model_generate(self, context, max_length, eos_token_id): 103 | return 
self.model.generate(
104 |             context,
105 |             max_length=max_length,
106 |             eos_token_id=eos_token_id,
107 |             do_sample=False
108 |         )
109 | 
110 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_harness/main.py: -------------------------------------------------------------------------------- 1 | import os
2 | import json
3 | import argparse
4 | 
5 | from qllm_eval.quantization.quant_wrapper import quantize_model
6 | from qllm_eval.utils import build_model_and_enc
7 | from qllm_eval.evaluation.q_harness.lm_eval_adaptor import LMEvalAdaptor
8 | from lm_eval import evaluator
9 | 
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument("--model_path", type=str, help="path of the hf model")
12 | parser.add_argument("--output_path", type=str, help="path to save the quantized model")
13 | parser.add_argument("--use_flash_attn", action="store_true")
14 | parser.add_argument("--tasks", type=str, default="truthfulqa_mc")
15 | parser.add_argument("--metrics", type=str, default="mc1,mc2")
16 | parser.add_argument("--w_group_size", type=int, default=128)
17 | parser.add_argument("--w_bit", type=int, default=16)
18 | parser.add_argument("--a_group_size", type=int, default=128)
19 | parser.add_argument("--a_bit", type=int, default=16)
20 | parser.add_argument("--kv_group_size", type=int, default=128)
21 | parser.add_argument("--kv_bit", type=int, default=16)
22 | args = parser.parse_args()
23 | 
24 | 
25 | def main():
26 |     print("* Quantization Format: kv_{}_w_{}_a_{}".format(args.kv_bit, args.w_bit, args.a_bit))
27 |     if 'falcon' in args.model_path.lower():
28 |         args.kv_group_size = 64
29 |         args.w_group_size = 64
30 | 
31 |     # a hack here to auto set model group
32 |     model, enc = build_model_and_enc(args.model_path, args.use_flash_attn, args.kv_bit, args.kv_group_size)
33 | 
34 |     # quantize model
35 |     model = quantize_model(model, args)
36 | 
37 |     # # save the quantized model
38 |     # if args.output_path:
39 |     #     model.save_pretrained(args.output_path, safe_serialization=False)
40 |     #     enc.save_pretrained(args.output_path)
41 | 
42 |     # evaluation
43 |     lm_eval_model = LMEvalAdaptor(args.model_path, model, enc, 1)
44 | 
45 |     if args.tasks is not None:
46 |         task_names = args.tasks.split(",")
47 | 
48 |         results = evaluator.simple_evaluate(
49 |             model=lm_eval_model,
50 |             tasks=task_names,
51 |             batch_size=1,
52 |             no_cache=True,
53 |             num_fewshot=0,
54 |         )
55 |         # print(results)
56 |         # print(evaluator.make_table(results))
57 |         for task_name in task_names:
58 |             output_path = "{}/{}/kv_{}_w_{}_a_{}.jsonl".format(task_name, args.model_path, args.kv_bit, args.w_bit, args.a_bit)
59 |             print("* Output: ", output_path)
60 |             if not os.path.exists("{}/{}".format(task_name, args.model_path)):
61 |                 os.makedirs("{}/{}".format(task_name, args.model_path))
62 |             with open(output_path, 'w') as f:
63 |                 f.write(json.dumps(results['results'][task_name]) + "\n")
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     main()
68 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation on Long-Context Tasks
2 | Basic instructions on evaluating quantized LLMs with the [LongEval](https://github.com/DachengLi1/LongChat) and [Lost-in-the-middle](https://github.com/nelson-liu/lost-in-the-middle) benchmarks. You need to install the qllm_eval package first.
3 | 
4 | ## For LongEval dataset
5 | 
6 | 1. Generate new key-value retrieval test cases.
7 | ```
8 | cd qllm_eval/evaluation/q_long
9 | 
10 | python longeval/generate_testcase.py longeval/generate_testcases_configs.yaml
11 | ```
12 | 
13 | 2. Evaluation with `LongEval`.
14 | ```
15 | CUDA_VISIBLE_DEVICES=0 python main_longeval.py \
16 |     --model-name-or-path /Your/LLM/Path --use_flash_attn \
17 |     --task lines --test_dir new_cases \
18 |     --w_group_size w_group_size --w_bit w_bit \
19 |     --a_group_size a_group_size --a_bit a_bit \
20 |     --kv_group_size kv_group_size --kv_bit kv_bit
21 | ```
22 | 
23 | 
24 | ## For Lost-in-the-middle dataset
25 | 
26 | 1. Git clone [Lost-in-the-middle](https://github.com/nelson-liu/lost-in-the-middle) and install it locally in the qllm_eval conda environment.
27 | ```
28 | conda activate qllm_eval
29 | git clone git@github.com:nelson-liu/lost-in-the-middle.git
30 | cd lost-in-the-middle
31 | # install the package following the repository's instructions
32 | ```
33 | 
34 | 2. Evaluation with `lost-in-the-middle`.
35 | ```
36 | cd qllm_eval/evaluation/q_long
37 | 
38 | CUDA_VISIBLE_DEVICES=0 python main_litm.py \
39 |     --model_name /Your/LLM/Path --use_flash_attn \
40 |     --w_group_size w_group_size --w_bit w_bit \
41 |     --a_group_size a_group_size --a_bit a_bit \
42 |     --kv_group_size kv_group_size --kv_bit kv_bit \
43 |     --input_path /qa_data/30_total_documents/nq-open-30_total_documents_gold_at_0.jsonl.gz \
44 |     --max_new_tokens 100 --output_path /Your/Path/to/Results
45 | ```
46 | 
47 | > The input data file is `/qa_data/30_total_documents/nq-open-30_total_documents_gold_at_0.jsonl.gz`.
48 | 
49 | > The evaluation results can be found in `/Your/Path/to/Results`.
50 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/q_long/__init__.py -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/longeval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/evaluation/q_long/longeval/__init__.py -------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/longeval/generate_testcase.py: -------------------------------------------------------------------------------- 1 | import os
2 | from utils import generate_lines_testcases, retrieve_cmd_args
3 | 
4 | if __name__ == "__main__":
5 |     cfgs = retrieve_cmd_args()
6 | 
7 |     output_dir = os.path.join(cfgs["output_dir"], cfgs["task"], "testcases/")
8 |     if not os.path.exists(output_dir):
9 |         os.makedirs(output_dir)
10 |     else:
11 |         raise RuntimeError("Output directory already exists! Remove this line to overwrite the original testcases.")
12 | 
13 |     if cfgs["task"] == "topics":
14 |         pass
15 |     else:
16 |         generate_lines_testcases(cfgs, output_dir)
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_long/longeval/generate_testcases_configs.yaml: -------------------------------------------------------------------------------- 1 | ---
2 | output_dir: new_cases/ # directory to put the generated testcases
3 | task: lines # choose between topics and lines
4 | num_test_samples: 500 # number of testcases for each test
5 | 
6 | 
7 | # lines evaluation options
8 | num_lines: [200,300,400,500,600,680] # number of lines in each testcase
9 | line_idx_opt: LRT-NL # type of line index, choose among LRT, LRT-ABCindex, LRT-UUID, LRT-NL
10 | 
11 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_opencompass/README.md: -------------------------------------------------------------------------------- 1 | # Evaluation with OpenCompass
2 | Basic instructions on evaluating quantized LLMs with [OpenCompass](https://github.com/open-compass/opencompass). You need to install the qllm_eval package first.
3 | 
4 | 
5 | ## Installation
6 | 
7 | 1. Git clone [OpenCompass](https://github.com/open-compass/opencompass) and install it locally in the qllm_eval conda environment. See the [requirements of OpenCompass](https://github.com/open-compass/opencompass/blob/main/requirements.txt).
8 | ```
9 | conda activate qllm
10 | git clone git@github.com:open-compass/opencompass.git
11 | cd opencompass
12 | ```
13 | 
14 | 2. Install the required packages from the source.
15 | 
16 | ```
17 | pip install -e .
18 | ```
19 | 
20 | 3. Note that LLaMA should be installed manually. Take the following steps to ensure LLaMA works properly:
21 | 
22 | ```
23 | git clone https://github.com/facebookresearch/llama.git
24 | cd llama
25 | pip install -r requirements.txt
26 | pip install -e .
27 | ```
28 | 
29 | ## Evaluation
30 | 
31 | 1. Prepare datasets. Change directory to `QLLM-Evaluation/qllm_eval/evaluation/q_opencompass/` and create a new folder:
32 | 
33 | ```
34 | cd qllm_eval/evaluation/q_opencompass
35 | mkdir data
36 | cd data
37 | ```
38 | 
39 | Run the following commands to download the datasets and place them in the `./qllm_eval/evaluation/q_opencompass/data` directory, which completes dataset preparation.
40 | 
41 | ```
42 | # Run in the OpenCompass directory
43 | wget https://github.com/open-compass/opencompass/releases/download/0.1.8.rc1/OpenCompassData-core-20231110.zip
44 | unzip OpenCompassData-core-20231110.zip
45 | ```
46 | 
47 | You may also use the pre-downloaded zip file, which is located at `/share/datasets/public_datasets/`.
48 | 
49 | 2. Run the following demo command to evaluate `OPT-125m` with weights quantized to 8-bit on the `SuperGLUE_BoolQ_ppl` dataset:
50 | 
51 | ```
52 | cd qllm_eval/evaluation/q_opencompass
53 | CUDA_VISIBLE_DEVICES=0 python main.py --models hf_opt_125m --datasets SuperGLUE_BoolQ_ppl --work-dir ./outputs/debug/api_test --w_bit 8
54 | ```
55 | 
56 | 3. If you want to evaluate models with different quantization settings, please modify `./qllm_eval/evaluation/q_opencompass/utils/build.py`. If you want to support new datasets and new models, please add their configs to `./qllm_eval/evaluation/q_opencompass/configs`; the original configs can be found in the OpenCompass repo.
57 | 
58 | * In particular, if you want to evaluate models with the KV cache quantized, please modify the imported model class in the model configuration file.
We provide the class `HuggingFaceCausalLM_` for this specific need.
59 | 
60 | ```python
61 | from qllm_eval.evaluation.q_opencompass.utils.models import HuggingFaceCausalLM_
62 | ```
63 | 
64 | ## Reference Table for Evaluation Failure Cases
65 | 
66 | From time to time we get unexpected evaluation results from OpenCompass. Hopefully this section can help you solve the problem quickly.
67 | 
68 | 1. Evaluation failure due to unparsed model outputs.
69 | 
70 | When you evaluate a quantized model on a generation task, the model might output paired curly braces, which may be loaded as a dict and cause errors in the subsequent string processing. In this case, you can modify the local opencompass package to avoid this:
71 | 
72 | ```
73 | opencompass/opencompass/tasks/openicl_eval.py
74 | ```
75 | 
76 | Adding a `try-except` for exception handling might be helpful, as sketched below.
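The following is only a hypothetical sketch of such a patch: the exact surrounding code depends on your OpenCompass version, and `pred` / `postprocess` are stand-in names for the prediction string and the existing post-processing call in `openicl_eval.py`.

```python
# Hypothetical sketch: wrap the per-prediction post-processing in a try-except
# so that outputs containing stray curly braces don't crash the whole task.
try:
    pred = postprocess(pred)  # the existing post-processing call
except Exception:
    pred = str(pred)  # fall back to the raw output string
```
77 | 
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_opencompass/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .openicl_infer_quant import * # noqa: F401, F403
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_opencompass/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .build import * # noqa
-------------------------------------------------------------------------------- /qllm_eval/evaluation/q_opencompass/utils/build.py: -------------------------------------------------------------------------------- 1 | import copy
2 | import argparse
3 | 
4 | import torch
5 | from mmengine.config import ConfigDict
6 | from opencompass.registry import LOAD_DATASET, MODELS
7 | 
8 | # self import
9 | from qllm_eval.methods.rep.apply_rep import apply_awq
10 | from qllm_eval.quantization.quant_wrapper import quantize_model
11 | 
12 | 
13 | 
14 | def build_dataset_from_cfg(dataset_cfg: ConfigDict):
15 |     dataset_cfg = copy.deepcopy(dataset_cfg)
16 |     dataset_cfg.pop('infer_cfg', None)
17 |     dataset_cfg.pop('eval_cfg', None)
18 |     dataset_cfg.pop('abbr', None)
19 |     return LOAD_DATASET.build(dataset_cfg)
20 | 
21 | 
22 | def build_model_from_cfg(model_cfg: ConfigDict):
23 |     model_cfg = copy.deepcopy(model_cfg)
24 |     model_cfg.pop('run_cfg', None)
25 |     model_cfg.pop('max_out_len', None)
26 |     model_cfg.pop('batch_size', None)
27 |     model_cfg.pop('abbr', None)
28 |     model_cfg.pop('pred_postprocessor', None)
29 |     # dump the quantization configs
30 |     quant_cfg = {}
31 |     quant_cfg['w_group_size'] = model_cfg.pop('w_group_size', 128)
32 |     quant_cfg['w_bit'] = model_cfg.pop('w_bit', None)
33 |     quant_cfg['a_group_size'] = model_cfg.pop('a_group_size', 128)
34 |     quant_cfg['a_bit'] = model_cfg.pop('a_bit', None)
35 |     quant_cfg['kv_group_size'] = model_cfg.pop('kv_group_size', 64)
36 |     quant_cfg['kv_bit'] = model_cfg.pop('kv_bit', None)
37 |     use_flash_attn = model_cfg.pop('use_flash_attn', False)
38 |     rep_file = model_cfg.pop('rep_file', None)
39 | 
40 |     # # if kv cache quantization is specified, we should add the config to the model.
41 |     # if quant_cfg['kv_bit'] is not None:
42 |     #     model_cfg['model_kwargs'].update({
43 |     #         'kv_bit': quant_cfg['kv_bit'],
44 |     #         'kv_group_size': quant_cfg['kv_group_size'],
45 |     #         'use_flash_attn': use_flash_attn,
46 |     #     })
47 | 
48 |     # build the original llm
49 |     lm_model = MODELS.build(model_cfg)
50 |     '''
51 |     Implement Your Quantization Code Here.
52 |     Below is a demo quantizing weights only.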
53 | ''' 54 | raw_model = lm_model.model # shallow copy 55 | 56 | if rep_file is not None: 57 | rep_results = torch.load(rep_file, map_location="cpu") 58 | apply_awq(raw_model, rep_results) 59 | 60 | lm_model.model = quantize_model(raw_model, argparse.Namespace(**quant_cfg)) 61 | return lm_model 62 | -------------------------------------------------------------------------------- /qllm_eval/methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/methods/__init__.py -------------------------------------------------------------------------------- /qllm_eval/methods/rep/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/methods/rep/__init__.py -------------------------------------------------------------------------------- /qllm_eval/methods/rep/apply_rep.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers.models.bloom.modeling_bloom import BloomBlock, BloomGelu 4 | from transformers.models.opt.modeling_opt import OPTDecoderLayer 5 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm 6 | from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralRMSNorm 7 | from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer, MixtralRMSNorm 8 | from transformers.activations import GELUActivation 9 | 10 | from .utils import get_op_by_name, get_op_name, set_op_by_name 11 | 12 | class ScaledActivation(nn.Module): 13 | def __init__(self, module, scales): 14 | super().__init__() 15 | self.act = module 16 | self.scales = nn.Parameter(scales.data) 17 | 18 | def forward(self, x): 19 | return self.act(x) / self.scales.view(1, 1, -1).to(x.device) 20 | 21 | @torch.no_grad() 22 | def get_weight_scale(weight, q_group_size=-1): 23 | org_shape = weight.shape 24 | if q_group_size > 0: 25 | weight = weight.view(-1, q_group_size) 26 | scale = weight.abs() / weight.abs().amax(dim=1, keepdim=True) 27 | scale = scale.view(org_shape) 28 | scale = scale.mean(0) 29 | return scale 30 | 31 | 32 | @torch.no_grad() 33 | def get_act_scale(x): 34 | return x.abs().view(-1, x.shape[-1]).mean(0) 35 | 36 | 37 | @torch.no_grad() 38 | def scale_ln_fcs(ln, fcs, scales): 39 | if not isinstance(fcs, list): 40 | fcs = [fcs] 41 | 42 | scales = scales.to(ln.weight.device) 43 | 44 | ln.weight.div_(scales) 45 | if hasattr(ln, 'bias') and ln.bias is not None: 46 | ln.bias.div_(scales) 47 | 48 | for fc in fcs: 49 | fc.weight.mul_(scales.view(1, -1)) 50 | 51 | for p in ln.parameters(): 52 | assert torch.isnan(p).sum() == 0 53 | for fc in fcs: 54 | for p in fc.parameters(): 55 | assert torch.isnan(p).sum() == 0 56 | 57 | 58 | @torch.no_grad() 59 | def scale_fc_fc(fc1, fc2, scales): 60 | assert isinstance(fc1, nn.Linear) 61 | assert isinstance(fc2, nn.Linear) 62 | # assert fc1.out_features == fc2.in_features 63 | 64 | scales = scales.to(fc1.weight.device) 65 | 66 | # fc1.weight.div_(scales.view(-1, 1)) 67 | fc1.weight[-scales.size(0):].div_(scales.view(-1, 1)) 68 | if fc1.bias is not None: 69 | fc1.bias.div_(scales.view(-1)) 70 | 71 | fc2.weight.mul_(scales.view(1, -1)) 72 | 73 | for p in fc1.parameters(): 74 | assert torch.isnan(p).sum() == 0 75 | for p in fc2.parameters(): 76 | assert 
torch.isnan(p).sum() == 0 77 | 78 | 79 | @torch.no_grad() 80 | def scale_gelu_fc(gelu, fc, scales): 81 | assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation)) 82 | assert isinstance(fc, nn.Linear) 83 | 84 | fc.weight.mul_(scales.view(1, -1).to(fc.weight.device)) 85 | 86 | for p in fc.parameters(): 87 | assert torch.isnan(p).sum() == 0 88 | 89 | def apply_scale(module, scales_list, input_feat_dict=None): 90 | for prev_op_name, layer_names, scales in scales_list: 91 | prev_op = get_op_by_name(module, prev_op_name) 92 | layers = [get_op_by_name(module, name) for name in layer_names] 93 | 94 | if isinstance(prev_op, nn.Linear): 95 | assert len(layers) == 1 96 | scale_fc_fc(prev_op, layers[0], scales) 97 | elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm, MixtralRMSNorm, MistralRMSNorm)) or 'rmsnorm' in str(prev_op.__class__).lower(): 98 | scale_ln_fcs(prev_op, layers, scales) 99 | elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)): 100 | new_module = ScaledActivation(prev_op, scales) 101 | set_op_by_name(module, prev_op_name, new_module) 102 | scale_gelu_fc(prev_op, layers[0], scales) 103 | else: 104 | raise NotImplementedError( 105 | f"prev_op {type(prev_op)} not supported yet!") 106 | 107 | # apply the scaling to input feat if given; prepare it for clipping 108 | if input_feat_dict is not None: 109 | for layer_name in layer_names: 110 | inp = input_feat_dict[layer_name] 111 | inp.div_(scales.view(1, -1).to(inp.device)) 112 | 113 | @torch.no_grad() 114 | def apply_clip(module, clip_list): 115 | from .utils import get_op_by_name 116 | for name, max_val in clip_list: 117 | layer = get_op_by_name(module, name) 118 | max_val = max_val.to(layer.weight.device) 119 | org_shape = layer.weight.shape 120 | layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1) 121 | layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val) 122 | layer.weight.data = layer.weight.data.reshape(org_shape) 123 | 124 | def apply_awq(model, awq_results): 125 | apply_scale(model, awq_results["scale"]) 126 | apply_clip(model, awq_results["clip"]) -------------------------------------------------------------------------------- /qllm_eval/methods/rep/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def get_op_by_name(module, op_name): 4 | # get the op by its name relative to the module 5 | for name, m in module.named_modules(): 6 | if name == op_name: 7 | return m 8 | raise ValueError(f"Cannot find op {op_name} in module {module}") 9 | 10 | 11 | def set_op_by_name(layer, name, new_module): 12 | levels = name.split('.') 13 | if len(levels) > 1: 14 | mod_ = layer 15 | for l_idx in range(len(levels)-1): 16 | if levels[l_idx].isdigit(): 17 | mod_ = mod_[int(levels[l_idx])] 18 | else: 19 | mod_ = getattr(mod_, levels[l_idx]) 20 | setattr(mod_, levels[-1], new_module) 21 | else: 22 | setattr(layer, name, new_module) 23 | 24 | 25 | def get_op_name(module, op): 26 | # get the name of the op relative to the module 27 | for name, m in module.named_modules(): 28 | if m is op: 29 | return name 30 | raise ValueError(f"Cannot find op {op} in module {module}") 31 | 32 | 33 | def append_str_prefix(x, prefix): 34 | if isinstance(x, str): 35 | return prefix + x 36 | elif isinstance(x, tuple): 37 | return tuple([append_str_prefix(y, prefix) for y in x]) 38 | elif isinstance(x, list): 39 | return [append_str_prefix(y, prefix) for y in x] 40 | else: 41 | return x -------------------------------------------------------------------------------- 
/qllm_eval/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /qllm_eval/quantization/qattn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/quantization/qattn/__init__.py -------------------------------------------------------------------------------- /qllm_eval/quantization/qattn/sw/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/quantization/qattn/sw/__init__.py -------------------------------------------------------------------------------- /qllm_eval/quantization/qlinear/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/quantization/qlinear/__init__.py -------------------------------------------------------------------------------- /qllm_eval/quantization/qlinear/sqwa.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from functools import partial 4 | from ..quant_funcs import * 5 | 6 | class WALinear(nn.Module): 7 | def __init__(self, in_features, out_features, bias=True, act_quant='per_token', a_bit=8, w_bit=8, quantize_output=False, dev='cuda'): 8 | super().__init__() 9 | self.in_features = in_features 10 | self.out_features = out_features 11 | self.a_bit = a_bit 12 | self.w_bit = w_bit 13 | 14 | self.register_buffer('weight', torch.zeros(self.out_features, 15 | self.in_features, dtype=torch.float16, requires_grad=False, device=dev)) 16 | if bias: 17 | self.register_buffer('bias', torch.zeros( 18 | (1, self.out_features), dtype=torch.float16, requires_grad=False, device=dev)) 19 | else: 20 | self.register_buffer('bias', None) 21 | 22 | if act_quant == 'per_token': 23 | self.act_quant_name = 'per_token' 24 | self.act_quant = partial( 25 | quantize_activation_per_token_absmax, n_bits=self.a_bit) 26 | elif act_quant == 'per_tensor': 27 | self.act_quant_name = 'per_tensor' 28 | self.act_quant = partial( 29 | quantize_activation_per_tensor_absmax, n_bits=self.a_bit) 30 | else: 31 | raise ValueError(f'Invalid act_quant: {act_quant}') 32 | 33 | if quantize_output: 34 | self.output_quant_name = self.act_quant_name 35 | self.output_quant = self.act_quant 36 | else: 37 | self.output_quant_name = 'None' 38 | self.output_quant = lambda x: x 39 | 40 | def to(self, *args, **kwargs): 41 | super(WALinear, self).to(*args, **kwargs) 42 | self.weight = self.weight.to(*args, **kwargs) 43 | if self.bias is not None: 44 | self.bias = self.bias.to(*args, **kwargs) 45 | return self 46 | 47 | @torch.no_grad() 48 | def forward(self, x): 49 | q_x = self.act_quant(x) 50 | y = torch.functional.F.linear(q_x, self.weight, self.bias) 51 | q_y = self.output_quant(y) 52 | return q_y 53 | 54 | @staticmethod 55 | def from_float(module, weight_quant='per_channel', act_quant='per_token', w_bit=4, a_bit=8, weight_group=128, quantize_output=False): 56 | assert isinstance(module, torch.nn.Linear) 57 | new_module = WALinear( 58 | module.in_features, module.out_features, module.bias is not None, act_quant=act_quant, a_bit=a_bit, w_bit=w_bit, 
quantize_output=quantize_output, dev=module.weight.device)
59 | 
60 |         # Quantize the weight matrices
61 |         if weight_quant == 'per_channel':
62 |             new_module.weight = quantize_weight_per_channel_absmax(module.weight, n_bits=w_bit)
63 |         elif weight_quant == 'per_tensor':
64 |             new_module.weight = quantize_weight_per_tensor_absmax(module.weight, n_bits=w_bit)
65 |         elif weight_quant == 'per_group':
66 |             new_module.weight = pseudo_quantize_tensor(module.weight, n_bits=w_bit, q_group_size=weight_group, inplace=True)
67 |         else:
68 |             raise ValueError(f'Invalid weight_quant: {weight_quant}')
69 | 
70 |         new_module.weight_quant_name = weight_quant
71 |         if module.bias is not None:
72 |             new_module.bias = module.bias
73 |         del module
74 |         return new_module
75 | 
76 |     def __repr__(self):
77 |         return 'W{}A{}Linear'.format(self.w_bit, self.a_bit)
78 | 
79 | 
-------------------------------------------------------------------------------- /qllm_eval/quantization/quant_funcs.py: -------------------------------------------------------------------------------- 1 | import torch
2 | 
3 | @torch.no_grad()
4 | def pseudo_quantize_tensor(tensor, n_bits=8, zero_point=True, q_group_size=-1, per_tensor=False, inplace=False):
5 |     """
6 |     The basic quantization function for weight, activation and KV cache.
7 |     """
8 |     org_tensor_shape = tensor.shape
9 |     if q_group_size > 0:
10 |         assert org_tensor_shape[-1] % q_group_size == 0
11 |         tensor = tensor.reshape(-1, q_group_size)
12 |     if per_tensor:
13 |         tensor = tensor.reshape(1, -1)
14 |     assert tensor.dim() == 2
15 |     if zero_point:
16 |         max_val = tensor.amax(dim=1, keepdim=True)
17 |         min_val = tensor.amin(dim=1, keepdim=True)
18 |         max_int = 2**n_bits - 1
19 |         min_int = 0
20 |         scales = (max_val - min_val).clamp(min=1e-5) / max_int
21 |         zeros = (-torch.round(min_val / scales)).clamp_(min_int, max_int)
22 |     else:
23 |         max_val = tensor.abs().amax(dim=1, keepdim=True)
24 |         max_val = max_val.clamp(min=1e-5)
25 |         max_int = 2 ** (n_bits - 1) - 1
26 |         min_int = -(2 ** (n_bits - 1))
27 |         scales = max_val / max_int
28 |         zeros = 0
29 | 
30 |     if inplace:
31 |         (
32 |             (tensor.div_(scales).round_().add_(zeros)).clamp_(min_int, max_int).sub_(zeros)
33 |         ).mul_(scales)
34 |     else:
35 |         tensor = (
36 |             torch.clamp(torch.round(tensor / scales) + zeros, min_int, max_int) - zeros
37 |         ) * scales
38 | 
39 |     assert torch.isnan(tensor).sum() == 0
40 | 
41 |     tensor = tensor.reshape(org_tensor_shape)
42 | 
43 |     # return the quantized tensor, the scaling factor and the zero point value
44 |     # return tensor, scales.view(tensor.shape[0], -1), zeros.view(tensor.shape[0], -1)
45 |     return tensor
46 | 
47 | 
48 | @torch.no_grad()
49 | def quantize_weight_per_channel_absmax(w, n_bits=8):
50 |     """
51 |     Per-channel symmetric (absmax) weight quantization.
52 |     """
53 |     tensor = pseudo_quantize_tensor(w, n_bits=n_bits, zero_point=False, q_group_size=-1, per_tensor=False, inplace=False)
54 |     return tensor
55 | 
56 | @torch.no_grad()
57 | def quantize_activation_per_token_absmax(t, n_bits=8):
58 |     t_shape = t.shape
59 |     t = t.view(-1, t_shape[-1])
60 |     t = pseudo_quantize_tensor(t, n_bits=n_bits, zero_point=True, q_group_size=-1, per_tensor=False, inplace=False)
61 |     return t.reshape(t_shape)
62 | 
63 | @torch.no_grad()
64 | def quantize_weight_per_tensor_absmax(w, n_bits=8):
65 |     """
66 |     Per-tensor symmetric (absmax) weight quantization.
67 | """ 68 | tensor = pseudo_quantize_tensor(w, n_bits=n_bits, zero_point=False, q_group_size=-1, per_tensor=True, inplace=False) 69 | return tensor 70 | 71 | @torch.no_grad() 72 | def quantize_activation_per_tensor_absmax(t, n_bits=8): 73 | t_shape = t.shape 74 | t = t.view(-1, t_shape[-1]) 75 | t = pseudo_quantize_tensor(t, n_bits=n_bits, zero_point=True, q_group_size=-1, per_tensor=True, inplace=False) 76 | return t.reshape(t_shape) 77 | -------------------------------------------------------------------------------- /qllm_eval/visualization/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thu-nics/qllm-eval/6d047af23fed4299241a5bd2fc92f85022248949/qllm_eval/visualization/__init__.py -------------------------------------------------------------------------------- /qllm_eval/visualization/basic/assets/lambada_w_only.py: -------------------------------------------------------------------------------- 1 | # opt series 2 | # demo data for the plot script 3 | # the elements in each list is the result of precision [FP16, W8, W4, W3, W2] 4 | OPT_125M= [39.63, 39.69, 38.70, 18.61, 0.06] 5 | OPT_1B3 = [59.60, 59.60, 56.37, 26.57, 0.02] 6 | OPT_2B7 = [64.60, 64.76, 61.96, 3.09, 0.00] 7 | OPT_6B7 = [68.08, 67.98, 65.13, 45.60, 0.04] 8 | OPT_13B = [69.96, 69.78, 68.12, 20.30, 0.04] 9 | OPT_30B = [71.49, 71.36, 69.88, 51.66, 0.06] 10 | OPT_66B = [73.43, 73.45, 72.95, 8.11, 0.04] 11 | 12 | # falcon series 13 | Falcon_7B = [74.11, 74.09, 73.39, 70.60, 0.35] 14 | Falcon_40B = [76.23, 76.21, 75.70, 74.52, 0.14] 15 | Falcon_180B = [79.62, 79.60, 78.27, 75.53, 50.48] 16 | 17 | # llama2 series 18 | LlaMA2_7B = [73.32, 73.37, 71.73, 66.61, 0.00] 19 | LlaMA2_13B = [76.48, 76.58, 75.99, 71.73, 1.71] 20 | LlaMA2_70B = [78.96, 78.89, 77.88, 76.46, 16.96] 21 | 22 | # Bloom series 23 | Bloom_560M = [36.13, 36.21, 32.68, 21.21, 0.06] 24 | Bloom_1B1 = [44.15, 44.13, 42.29, 34.12, 1.14] 25 | Bloom_1B7 = [48.36, 48.22, 48.22, 38.15, 0.14] 26 | Bloom_3B = [54.16, 54.14, 51.87, 46.73, 4.39] 27 | Bloom_7B1 = [58.26, 58.06, 56.90, 52.11, 23.54] 28 | Bloom_175B = [67.53, 67.59, 67.01, 66.37, 38.31] 29 | 30 | # Bloomz series 31 | Bloomz_560M = [41.59, 41.12, 39.36, 26.59, 0.10] 32 | Bloomz_1B1 = [49.70, 49.60, 48.98, 40.27, 3.14] 33 | Bloomz_1B7 = [50.71, 50.77, 48.24, 43.82, 0.25] 34 | Bloomz_3B = [52.49, 52.65, 51.85, 47.33, 6.64] 35 | Bloomz_7B1 = [56.59, 56.39, 56.74, 53.11, 28.93] 36 | Bloomz_175B = [68.23, 68.12, 68.15, 67.15, 46.44] 37 | 38 | # ChatGLM3 series 39 | ChatGLM3_6B = [51.97, 52.01, 48.71, 32.72, 0.14] 40 | ChatGLM3_6B_32K = [63.79, 60.39, 63.30, 63.63, 0.02] 41 | 42 | # MOE models 43 | Mistral_7B = [75.04, 75.18, 73.90, 68.00, 0.21] 44 | Mixtral_8x7B = [76.79, 76.79, 74.85, 70.02, 0.02] 45 | 46 | -------------------------------------------------------------------------------- /qllm_eval/visualization/basic/heatmap_plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | import matplotlib.pyplot as plt 7 | 8 | from scipy import stats 9 | from importlib import import_module 10 | 11 | from utils import * 12 | 13 | save_name = None 14 | save_path = './results/heatmap_plot/' 15 | 16 | # whether use manually processed order or not 17 | normalize = False 18 | 19 | mode = 'w_only' 20 | modes = ['w_only', 'w_a', 'kv_cache'] 21 | selected_models = 'llama2+falcon+bloomz+chatglm3+moe+opt' 22 | # selected_models 
-------------------------------------------------------------------------------- /qllm_eval/visualization/basic/heatmap_plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import seaborn as sns 6 | import matplotlib.pyplot as plt 7 | 8 | from scipy import stats 9 | from importlib import import_module 10 | 11 | from utils import * 12 | 13 | save_name = None 14 | save_path = './results/heatmap_plot/' 15 | 16 | # whether to normalize the results or not 17 | normalize = False 18 | 19 | mode = 'w_only' 20 | modes = ['w_only', 'w_a', 'kv_cache'] 21 | selected_models = 'llama2+falcon+bloomz+chatglm3+moe+opt' 22 | # selected_models = 'llama2+bloomz+opt' 23 | selected_bitwidth = 'w3' 24 | 25 | label_size = 20 26 | 27 | 28 | ''' 29 | # full list of the tested models. 30 | models_to_plot = { 31 | 'opt': ['OPT_125M', 'OPT_1B3', 'OPT_2B7', 'OPT_6B7', 'OPT_13B', 'OPT_30B', 'OPT_66B'], 32 | 'falcon': ['Falcon_7B', 'Falcon_40B', 'Falcon_180B'], 33 | 'llama2': ['LlaMA2_7B', 'LlaMA2_13B', 'LlaMA2_70B'], 34 | 'bloom': ['Bloom_560M', 'Bloom_1B1', 'Bloom_1B7', 'Bloom_3B', 'Bloom_7B1', 'Bloom_175B'], 35 | 'bloomz': ['Bloomz_560M', 'Bloomz_1B1', 'Bloomz_1B7', 'Bloomz_3B', 'Bloomz_7B1', 'Bloomz_175B'], 36 | 'chatglm3': ['ChatGLM3_6B'], 37 | } 38 | ''' 39 | # selected models for easier display. 40 | models_to_plot = { 41 | 'opt': ['OPT_6B7', 'OPT_13B'], 42 | 'falcon': ['Falcon_7B', 'Falcon_180B'], 43 | 'llama2': ['LlaMA2_7B', 'LlaMA2_70B'], 44 | # 'bloom': ['Bloom_3B', 'Bloom_7B1'], 45 | 'bloomz': ['Bloomz_3B', 'Bloomz_175B'], 46 | 'chatglm3': ['ChatGLM3_6B'], 47 | 'moe': ['Mistral_7B', 'Mixtral_8x7B'], 48 | } 49 | 50 | # datasets = ['chid', 'winogrande', 'race', 'lambada', 'rte', 'piqa', 'siqa'] 51 | datasets = ['chid', 'winogrande', 'race', 'lambada', 'piqa', 'siqa'] 52 | datasets_ = ['CHID', 'Winogrande', 'Race', 'LAMBADA', 'PIQA', 'SIQA'] 53 | 54 | if __name__ == '__main__': 55 | if not os.path.exists(save_path): 56 | os.makedirs(save_path) 57 | 58 | save_name = save_name + '.pdf' if save_name is not None else \ 59 | 'heatmap_' + selected_bitwidth + '.pdf' 60 | save_name = save_path + save_name 61 | 62 | 63 | # first gather the test results. 64 | model_perfs = {} 65 | selected_models = selected_models.split('+') 66 | for dataset in datasets: 67 | fp_perfs = [] 68 | quant_perfs = [] 69 | results_module = import_module('assets.' + dataset + '_' + mode) 70 | for selected_model_family in selected_models: 71 | for model in models_to_plot[selected_model_family]: 72 | perfs = getattr(results_module, model) 73 | fp_perfs.append(perfs[0]) 74 | quant_idx = bitwidth_idx_mapping[mode][selected_bitwidth] 75 | quant_perfs.append(perfs[quant_idx]) 76 | model_perfs.update({dataset: {'fp_perfs': fp_perfs, 'quant_perfs': quant_perfs}}) 77 | 78 | spearman_corr = np.zeros((12, 12)) 79 | # The order of the datasets matches datasets above: the FP16 results occupy the first 6 rows & columns, 80 | # followed by the 6 quantized results.
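    # Illustrative aside: stats.spearmanr (imported above) measures rank agreement in [-1, 1], e.g.
    #   stats.spearmanr([1, 2, 3], [10, 20, 30]).correlation ==  1.0   (identical ranking)
    #   stats.spearmanr([1, 2, 3], [30, 20, 10]).correlation == -1.0   (reversed ranking)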
81 | for row in range(12): 82 | for col in range(12): 83 | 84 | row_dataset = datasets[row % 6] 85 | row_fp_flag = not(row // 6) 86 | row_perfs = np.array(model_perfs[row_dataset]['fp_perfs' if row_fp_flag else 'quant_perfs']) 87 | 88 | col_dataset = datasets[col % 6] 89 | col_fp_flag = not(col // 6) 90 | col_perfs = np.array(model_perfs[col_dataset]['fp_perfs' if col_fp_flag else 'quant_perfs']) 91 | 92 | sp_corr_value = stats.spearmanr(row_perfs, col_perfs).correlation 93 | spearman_corr[row][col] = sp_corr_value 94 | 95 | # plot the heatmap figure 96 | spearman_corr = pd.DataFrame(spearman_corr) 97 | 98 | x_labels = ["FP16 " + datasets_[i % 6] if i < 6 else selected_bitwidth.upper() + " " + datasets_[i % 6] for i in range(12)] 99 | y_labels = ["FP16 " + datasets_[i % 6] if i < 6 else selected_bitwidth.upper() + " " + datasets_[i % 6] for i in range(12)] 100 | 101 | plot = sns.heatmap(spearman_corr, cmap="YlGnBu", xticklabels=x_labels, yticklabels=y_labels, annot=True, 102 | annot_kws={'fontsize': 6}, fmt='.2g') 103 | 104 | # plot.set(xlabel=x_labels, ylabel=y_labels) 105 | plot.xaxis.tick_top() 106 | plt.xticks(rotation=45, ha='left', rotation_mode='anchor') 107 | # plt.yticks(rotation=45) 108 | plt.subplots_adjust(left=0.22, right=0.89, top=0.77, bottom=0.13) 109 | # plt.xlabel('Dataset - Quantization Bitwidth', fontsize=label_size) 110 | # plt.ylabel('Dataset - Quantization Bitwidth', fontsize=label_size) 111 | plt.savefig(save_name) 112 | plt.show() 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /qllm_eval/visualization/basic/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # dataset related properties 4 | dataset_minimum = { 5 | 'chid': 16.67, 6 | 'winogrande': 50., 7 | 'race': 25., 8 | 'lambada': 0., 9 | 'rte': 50., 10 | 'piqa': 50., 11 | 'siqa': 33.33, 12 | } 13 | 14 | bitwidth_idx_mapping = { 15 | 'w_only': {'fp16': 0, 'w8': 1, 'w4': 2, 'w3': 3, 'w2': 4}, 16 | 'w_a': {'fp16': 0, 'w8a8': 1, 'w4a8': 2, 'w4a4': 3}, 17 | 'kv_cache': {'fp16': 0, 'kv8': 1, 'kv4': 2, 'kv3': 3, 'kv2': 4} 18 | } 19 | 20 | 21 | # helper functions 22 | def normalize_results(raw_results, fp_idx=0, minimal=None, range=1, w_fp=True): 23 | # raw_results = result_pad(raw_results) 24 | has_nonzero_fp_result = raw_results[fp_idx] is not None and raw_results[fp_idx] != 0 25 | if has_nonzero_fp_result and w_fp: 26 | # do not consider the minimal value of the dataset 27 | fp_result = raw_results[fp_idx] 28 | if minimal is None: 29 | norm_results = [i / fp_result if i is not None else None for i in raw_results] 30 | print('Result Normalization Succeeded.') 31 | else: 32 | norm_results = [max((i - minimal) / (fp_result - minimal), 0) \ 33 | if i is not None else None for i in raw_results] 34 | else: 35 | norm_results = raw_results 36 | print('The input results have no FP precision, return original results.') 37 | assert range in [1, 100] 38 | if range == 100: 39 | norm_results = [i * 100 if i is not None else i for i in norm_results] 40 | print('Result Normalization Succeeded.') 41 | elif range == 1 and norm_results == raw_results: 42 | norm_results = [i / 100. 
if i is not None else i for i in norm_results] 43 | print('Result Normalization Succeeded.') 44 | return norm_results 45 | -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/assets/__init__.py: -------------------------------------------------------------------------------- 1 | from . import mt_bench_w_only as w_only 2 | from . import mt_bench_w_a as wa 3 | from . import mt_bench_kv_cache as kv -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/assets/mt_bench_kv_cache.py: -------------------------------------------------------------------------------- 1 | # results for mt-bench 2 | # the elements in each list are the results of round1 and round2 3 | # the elements in each sub-list are the results of precision [[FP16, KV8, KV4, KV3], [FP16, KV8, KV4, KV3]] 4 | 5 | results = { 6 | # llama2-chat series 7 | 'llama-2_7b_chat' : [[5.31, 5.25, 5.46, 4.38], [4.14, 4.21, 4.16, 2.76]], 8 | 'llama-2_13b_chat' : [[5.72, 5.84, 5.86, 5.53], [5.05, 5.04, 4.38, 3.85]], 9 | 'llama-2_70b_chat' : [[6.26, 6.41, 6.30, 6.25], [5.99, 5.91, 6.03, 5.66]], 10 | 11 | # falcon-instruct series 12 | 'falcon_7b_instruct' : [[3.79, 3.64, 3.60, 2.92], [2.30, 2.27, 2.24, 1.86]], 13 | 'falcon_40b_instruct' : [[4.92, 4.86, 4.76, 4.45], [3.36, 3.35, 3.40, 3.34]], 14 | 'falcon_180b_chat' : [[6.35, 6.68, 6.61, 6.58], [5.59, 5.63, 5.70, 5.46]], 15 | 16 | # mistral-instruct series 17 | 'mistral_7b_instruct' : [[6.70, 6.70, 6.55, 6.26], [6.00, 6.42, 6.22, 4.96]], 18 | 'mixtral_8x7b_instruct' : [[7.89, 7.62, 6.97, 6.26], [6.55, 6.94, 6.34, 4.96]], 19 | 20 | # chatglm3 series 21 | 'chatglm3_6b' : [[5.13, 5.08, 4.58, 4.81], [3.68, 3.54, 3.26, 3.18]], 22 | 23 | # stablelm series 24 | 'stablelm_zephyr_3b' : [[5.03, 5.08, 5.08, 4.38], [4.25, 4.14, 3.78, 3.19]], 25 | 26 | # gemma series 27 | 'gemma_2b_it' : [[4.06, 3.98, 3.41, 3.34], [2.81, 2.62, 2.80, 2.52]], 28 | 'gemma_7b_it' : [[5.25, 5.28, 4.98, 4.94], [3.61, 3.63, 3.49, 3.37]], 29 | } 30 | -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/assets/mt_bench_w_a.py: -------------------------------------------------------------------------------- 1 | # results for mt-bench 2 | # the elements in each list are the results of round1 and round2 3 | # the elements in each sub-list are the results of precision [[FP16, W8A8, W4A8, W4A4], [FP16, W8A8, W4A8, W4A4]] 4 | 5 | results = { 6 | # llama2-chat series 7 | 'llama-2_7b_chat' : [[5.31, 4.94, 5.12, 1.00], [4.14, 3.79, 4.22, 1.00]], 8 | 'llama-2_13b_chat' : [[5.72, 5.83, 5.88, 1.00], [5.05, 5.12, 4.97, 1.00]], 9 | 'llama-2_70b_chat' : [[6.26, 6.17, 6.11, 1.00], [5.99, 5.86, 5.86, 1.00]], 10 | 11 | # falcon-instruct series 12 | 'falcon_7b_instruct' : [[3.79, 3.59, 3.15, 1.00], [2.30, 2.14, 2.05, 1.00]], 13 | 'falcon_40b_instruct' : [[4.92, 4.71, 4.40, 1.00], [3.36, 3.55, 3.14, 1.00]], 14 | 'falcon_180b_chat' : [[6.35, 6.35, 6.56, 1.00], [5.59, 5.43, 5.88, 1.00]], 15 | 16 | # mistral-instruct series 17 | 'mistral_7b_instruct' : [[6.70, 6.74, 6.53, 1.00], [6.00, 6.08, 5.76, 1.00]], 18 | 'mixtral_8x7b_instruct' : [[7.89, 7.22, 7.42, 1.00], [6.55, 6.61, 6.49, 1.00]], 19 | 20 | # chatglm3 series 21 | 'chatglm3_6b' : [[5.13, 5.14, 4.85, 1.00], [3.68, 3.92, 3.50, 1.00]], 22 | 23 | # stablelm series 24 | 'stablelm_zephyr_3b' : [[5.03, 5.29, 5.48, 1.14], [4.25, 4.10, 3.94, 1.00]], 25 | 26 | # gemma series 27 | 'gemma_2b_it' : [[4.06, 3.91, 3.64, 1.00], [2.81, 
2.95, 2.73, 1.00]], 28 | 'gemma_7b_it' : [[5.25, 5.09, 5.14, 1.00], [3.62, 3.79, 3.28, 1.00]], 29 | 30 | # mamba series 31 | 'mamba_2b8_chat' : [[1.95, 2.08, 1.83, 1.41], [1.40, 1.45, 1.12, 1.00]], 32 | } 33 | 34 | -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/assets/mt_bench_w_only.py: -------------------------------------------------------------------------------- 1 | # results for mt-bench 2 | # the elements in each list are the results of round1 and round2 3 | # the elements in each sub-list are the results of precision [[FP16, W8, W4, W3], [FP16, W8, W4, W3]] 4 | 5 | results = { 6 | # llama2-chat series 7 | 'llama-2_7b_chat' : [[5.31, 5.16, 5.47, 4.94], [4.14, 4.04, 4.30, 3.52]], 8 | 'llama-2_13b_chat' : [[5.72, 5.95, 5.74, 5.38], [5.05, 5.31, 4.65, 4.26]], 9 | 'llama-2_70b_chat' : [[6.26, 6.49, 5.91, 5.86], [5.99, 5.83, 5.55, 5.12]], 10 | 11 | # falcon-instruct series 12 | 'falcon_7b_instruct' : [[3.79, 3.71, 3.39, 3.06], [2.30, 2.19, 2.27, 1.89]], 13 | 'falcon_40b_instruct' : [[4.92, 4.81, 4.66, 4.38], [3.36, 3.01, 3.69, 3.54]], 14 | 'falcon_180b_chat' : [[6.35, 6.62, 6.25, 5.79], [5.59, 5.70, 6.05, 5.17]], 15 | 16 | # mistral-instruct series 17 | 'mistral_7b_instruct' : [[6.70, 6.78, 6.44, 6.18], [6.00, 6.01, 5.88, 5.49]], 18 | 'mixtral_8x7b_instruct' : [[7.89, 7.53, 7.21, 6.72], [6.55, 6.64, 6.21, 5.53]], 19 | 20 | # chatglm3 series 21 | 'chatglm3_6b' : [[5.13, 4.91, 5.06, 4.35], [3.68, 3.88, 4.09, 3.12]], 22 | 23 | # stablelm series 24 | 'stablelm_zephyr_3b' : [[5.03, 5.09, 5.58, 3.15], [4.25, 3.86, 4.09, 2.11]], 25 | 26 | # gemma series 27 | 'gemma_2b_it' : [[4.06, 4.18, 3.67, 3.39], [2.81, 3.11, 3.06, 2.60]], 28 | 'gemma_7b_it' : [[5.25, 5.29, 4.94, 4.74], [3.62, 3.88, 3.51, 3.19]], 29 | 30 | # mamba series 31 | 'mamba_2b8_chat' : [[1.95, 2.06, 1.73, 1.09], [1.40, 1.29, 1.29, 1.00]], 32 | } 33 | -------------------------------------------------------------------------------- /qllm_eval/visualization/dialogue/gen_table.py: -------------------------------------------------------------------------------- 1 | import os 2 | from assets import w_only, wa, kv 3 | 4 | model_name_map = { 5 | 'llama-2_7b_chat': 'LLaMA2-7B-chat', 6 | 'llama-2_13b_chat': 'LLaMA2-13B-chat', 7 | 'llama-2_70b_chat': 'LLaMA2-70B-chat', 8 | 'falcon_7b_instruct': 'Falcon-7B-instruct', 9 | 'falcon_40b_instruct': 'Falcon-40B-instruct', 10 | 'falcon_180b_chat': 'Falcon-180B-chat', 11 | 'mistral_7b_instruct': 'Mistral-7B-instruct-v0.2', 12 | 'mixtral_8x7b_instruct': 'Mixtral-8x7B-instruct-v0.1', 13 | 'chatglm3_6b': 'ChatGLM3-6B', 14 | 'stablelm_zephyr_3b': 'StableLM-Zephyr-3B', 15 | 'gemma_2b_it': 'Gemma-2B-it', 16 | 'gemma_7b_it': 'Gemma-7B-it', 17 | 'mamba_2b8_chat': "Mamba-2.8B-chat" 18 | } 19 | 20 | def gen_tex_table_lines(save_path='./tables/table.tex'): 21 | """ 22 | \midrule 23 | \multirow{2}{*}{MODEL-NAME} & 1 & rd1_fp16 & rd1_w8 & rd1_w4 & rd1_w3 & rd1_w3_awq & rd1_w8a8 & rd1_w4a8 & rd1_w4a4 & rd1_w4a4_sq & rd1_kv8 & rd1_kv4 & rd1_kv3 \\ 24 | & 2 & rd2_fp16 & rd2_w8 & rd2_w4 & rd2_w3 & rd2_w3_awq & rd2_w8a8 & rd2_w4a8 & rd2_w4a4 & rd2_w4a4_sq & rd2_kv8 & rd2_kv4 & rd2_kv3 \\ 25 | """ 26 | # clear the previous table 27 | if os.path.exists(save_path): 28 | os.remove(save_path) 29 | os.makedirs(os.path.split(save_path)[0], exist_ok=True) 30 | 31 | with open(save_path, 'a') as f: 32 | for model, model_name in model_name_map.items(): 33 | f.write(r'\midrule' + '\n') 34 | if 'mamba' in model.lower(): # mamba doesn't have kv cache 35 | #
round 1 36 | f.write(r'\multirow{2}{*}{' + model_name + r'} & 1 ' + \ 37 | ''.join([f'& {w_only.results[model][0][i]:.2f} ' for i in range(len(w_only.results[model][0]))]) + '& - ' + \ 38 | ''.join([f'& {wa.results[model][0][i]:.2f} ' for i in range(1, len(wa.results[model][0]))]) + '& - ' + \ 39 | ''.join([f'& - ' for _ in range(3)]) + \ 40 | r'\\' + '\n') 41 | # round2 42 | f.write(r'& 2 ' + \ 43 | ''.join([f'& {w_only.results[model][1][i]:.2f} ' for i in range(len(w_only.results[model][1]))]) + '& - ' + \ 44 | ''.join([f'& {wa.results[model][1][i]:.2f} ' for i in range(1, len(wa.results[model][1]))]) + '& - ' + \ 45 | ''.join([f'& - ' for _ in range(3)]) + \ 46 | r'\\' + '\n') 47 | else: 48 | # round1 49 | f.write(r'\multirow{2}{*}{' + model_name + r'} & 1 ' + \ 50 | ''.join([f'& {w_only.results[model][0][i]:.2f} ' for i in range(len(w_only.results[model][0]))]) + '& - ' + \ 51 | ''.join([f'& {wa.results[model][0][i]:.2f} ' for i in range(1, len(wa.results[model][0]))]) + '& - ' + \ 52 | ''.join([f'& {kv.results[model][0][i]:.2f} ' for i in range(1, len(kv.results[model][0]))]) + \ 53 | r'\\' + '\n') 54 | # round2 55 | f.write(r'& 2 ' + \ 56 | ''.join([f'& {w_only.results[model][1][i]:.2f} ' for i in range(len(w_only.results[model][1]))]) + '& - ' + \ 57 | ''.join([f'& {wa.results[model][1][i]:.2f} ' for i in range(1, len(wa.results[model][1]))]) + '& - ' + \ 58 | ''.join([f'& {kv.results[model][1][i]:.2f} ' for i in range(1, len(kv.results[model][1]))]) + \ 59 | r'\\' + '\n') 60 | 61 | def gen_md_table_lines(save_path='./tables/table.md'): 62 | pass 63 | 64 | if __name__ == '__main__': 65 | gen_tex_table_lines() -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-c_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 36.61 36.61 36.27 33.56 22.37 2 | LLaMA2-13B-chat 41.69 42.37 40.34 37.97 31.19 3 | LLaMA2-70B-chat 40.34 39.32 38.64 38.31 38.31 4 | Falcon-7B-instruct 34.58 34.58 35.25 32.88 23.05 5 | Falcon-40B-instruct 45.76 45.76 45.08 41.36 32.2 6 | Falcon-180B-chat 36.95 36.27 35.93 34.24 34.92 7 | ChatGLM3-6B 30.85 31.19 30.17 28.14 26.44 8 | ChatGLM3-6B-32k 42.37 42.71 40.68 44.75 29.49 9 | Mistral-7B-instruct 48.14 48.14 48.14 51.53 37.63 10 | Mixtral-8x7B-instruct 55.25 53.56 51.86 50.85 28.81 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-c_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 36.61 36.61 34.92 31.19 24.75 2 | LLaMA2-13B-chat 41.69 42.37 41.02 40.34 20.68 3 | LLaMA2-70B-chat 40.34 40.34 36.61 37.97 26.44 4 | Falcon-7B-instruct 34.58 34.58 31.86 32.2 24.75 5 | Falcon-40B-instruct 45.76 45.42 45.08 45.76 22.37 6 | Falcon-180B-chat 36.95 36.27 34.92 33.22 24.75 7 | ChatGLM3-6B 30.85 31.19 28.47 30.17 21.02 8 | ChatGLM3-6B-32k 42.37 42.37 42.37 38.98 24.41 9 | Mistral-7B-instruct 48.14 48.81 49.49 44.41 21.02 10 | Mixtral-8x7B-instruct 55.25 56.61 52.2 46.78 24.41 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-c_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 36.61 37.63 34.92 25.42 2 | LLaMA2-13B-chat 41.69 43.05 40.34 22.37 3 | LLaMA2-70B-chat 40.34 41.36 37.97 21.69 4 | Falcon-7B-instruct 34.58 33.9 32.88 20.68 5 | Falcon-40B-instruct 45.76 45.76 45.42 
22.03 6 | Falcon-180B-chat 36.95 35.59 35.93 22.37 7 | ChatGLM3-6B 30.85 31.86 29.15 24.41 8 | ChatGLM3-6B-32k 42.37 42.71 41.36 22.03 9 | Mistral-7B-instruct 48.14 48.81 48.14 27.8 10 | Mixtral-8x7B-instruct 55.25 53.56 49.83 27.46 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-e_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 60.32 60.49 59.08 57.67 39.51 2 | LLaMA2-13B-chat 64.2 64.02 63.84 62.61 50.79 3 | LLaMA2-70B-chat 60.67 60.67 60.85 61.02 61.55 4 | Falcon-7B-instruct 58.91 58.73 58.73 54.67 37.39 5 | Falcon-40B-instruct 71.08 70.72 71.25 70.02 58.73 6 | Falcon-180b-chat 55.73 55.73 55.56 55.56 55.38 7 | ChatGLM3-6B 44.27 44.44 45.5 43.03 40.21 8 | ChatGLM3-6B-32k 66.67 66.31 67.37 63.14 48.5 9 | Mistral-7B-instruct 67.37 67.37 67.37 63.84 55.73 10 | Mixtral-8x7B-instruct 79.19 78.84 78.13 71.08 50.79 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-e_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 60.32 60.14 56.44 51.85 25.4 2 | LLaMA2-13B-chat 64.2 64.02 63.32 61.9 28.22 3 | LLaMA2-70B-chat 60.67 61.02 59.79 56.26 48.5 4 | Falcon-7B-instruct 58.91 58.91 58.2 55.73 38.1 5 | Falcon-40B-instruct 71.08 70.9 70.55 71.6 29.98 6 | Falcon-180B-chat 55.73 56.26 56.26 58.38 40.39 7 | ChatGLM3-6B 44.27 43.92 42.86 41.62 27.34 8 | ChatGLM3-6B-32k 66.67 66.84 65.96 64.37 29.63 9 | Mistral-7B-instruct 67.37 67.55 65.26 62.43 32.1 10 | Mixtral-8x7B-instruct 79.19 79.72 76.19 72.49 26.28 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/arc-e_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 60.32 60.14 54.85 25.04 2 | LLaMA2-13B-chat 64.2 64.73 63.14 29.28 3 | LLaMA2-70B-chat 60.67 62.26 59.96 29.63 4 | Falcon-7B-instruct 58.91 59.08 58.91 35.98 5 | Falcon-40B-instruct 71.08 71.08 70.19 23.46 6 | Falcon-180B-chat 55.73 54.5 56.97 24.87 7 | ChatGLM3-6B 44.27 42.5 42.86 29.98 8 | ChatGLM3-6B-32k 66.67 67.02 66.14 29.28 9 | Mistral-7B-instruct 67.37 66.49 65.61 37.04 10 | Mixtral-8x7B-instruct 79.19 76.54 74.96 29.98 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/ceval_kv.txt: -------------------------------------------------------------------------------- 1 | ChatGLM3-6B 53.84 53.92 52.36 51.84 34.53 2 | ChatGLM3-6B-32k 60.74 60.43 59.39 50.74 24.97 3 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/ceval_w.txt: -------------------------------------------------------------------------------- 1 | ChatGLM3-6B 53.84 53.85 53.19 49.87 24.82 2 | ChatGLM3-6B-32k 60.74 60.73 59.69 57.36 26.2 -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/ceval_wa.txt: -------------------------------------------------------------------------------- 1 | ChatGLM3-6B 53.84 53.78 53.51 22.89 2 | ChatGLM3-6B-32k 60.74 60.55 59.02 26.33 -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/gsm8k_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 26.76 
25.78 23.96 12.05 0.83 2 | LLaMA2-13B 42.84 42.68 39.35 31.61 0.91 3 | LLaMA2-70B 59.14 59.14 58 53.75 3.79 4 | Falcon-7B 3.64 5 | Falcon-40B 38.36 37.68 37.23 29.57 2.2 6 | Falcon-180B 67.63 67.7 66.87 59.29 2.35 7 | ChatGLM3-6B 55.8 56.79 56.79 47.31 3.49 8 | ChatGLM3-6B-32k 54.21 55.57 51.1 37.76 9 | Mistral-7B 48.52 47.61 43.52 29.87 1.52 10 | Mixtral-8x7B 71.11 72.4 66.79 49.2 0.53 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/gsm8k_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 26.76 25.47 25.17 15.62 1.9 2 | LLaMA2-13B 42.84 42.61 37.38 28.43 0.15 3 | LLaMA2-70B 59.14 59.82 58.07 50.87 0.99 4 | Falcon-7B 3.64 3.41 3.94 5 | Falcon-40B 38.36 38.29 36.24 32.37 0 6 | Falcon-180B 67.63 67.7 67.02 47.46 1.59 7 | ChatGLM3-6B 55.8 57.39 53.68 45.26 0.3 8 | ChatGLM3-6B-32k 54.21 54.28 52.54 42.3 1.52 9 | Mistral-7B 48.52 48.52 44.5 29.72 1.52 10 | Mixtral-8x7B 71.11 70.96 69.52 51.71 0.68 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/gsm8k_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 26.76 25.93 22.37 0.91 2 | LLaMA2-13B 42.84 42.46 38.36 0.23 3 | LLaMA2-70B 59.14 58.23 56.48 0.99 4 | Falcon-7B 3.64 5 | Falcon-40B 38.36 36.16 34.95 1.29 6 | Falcon-180B 67.63 67.32 68.61 0.08 7 | ChatGLM3-6B 55.8 55.04 53.68 1.21 8 | ChatGLM3-6B-32k 54.21 52.39 49.96 0.53 9 | Mistral-7B 48.52 47.08 43.44 1.36 10 | Mixtral-8x7B 71.11 71.95 70.36 0.15 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/hellaswag_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 70.28 70.24 69.84 68.66 42.7 2 | LLaMA2-13B-chat 74.32 74.37 74.09 73.29 55.37 3 | LLaMA2-70B-chat 77.09 77.11 76.96 76.28 71.87 4 | Falcon-7B-instruct 62.81 62.75 62.4 60.32 34.82 5 | Falcon-40B-instruct 76.6 76.6 76.57 75.51 67.11 6 | Falcon-180B-chat 82.22 82.21 82.4 81.59 76.98 7 | ChatGLM3-6B 59.83 59.79 59.41 58.05 51.38 8 | ChatGLM3-6B-32k 66.09 66.04 65.45 63.22 42.86 9 | Mistral-7B-instruct 74.16 74.14 73.34 73.46 61.45 10 | Mixtral-8x7B-instruct 80.43 80.29 80.17 79.24 64.26 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/hellaswag_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 70.28 70.3 68.9 65.47 24.82 2 | LLaMA2-13B-chat 74.32 74.31 73.78 70.94 32.61 3 | LLaMA2-70B-chat 77.09 77.04 76.38 74.37 52.72 4 | Falcon-7B-instruct 62.81 62.74 61.91 59.24 41.44 5 | Falcon-40B-instruct 76.6 76.59 76.65 76.25 27.37 6 | Falcon-180B-chat 82.22 82.18 82.04 79.89 60.75 7 | ChatGLM3-6B 59.83 59.74 58.98 57.58 33.7 8 | ChatGLM3-6B-32k 66.09 66.04 65.67 61.85 33.2 9 | Mistral-7B-instruct 74.16 74.19 73.45 70.75 32.65 10 | Mixtral-8x7B-instruct 80.43 80.4 80.21 77.67 27.72 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/hellaswag_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B-chat 70.28 69.66 68.81 30.12 2 | LLaMA2-13B-chat 74.32 74.47 73.58 28.28 3 | LLaMA2-70B-chat 77.09 77.2 76.11 27.39 4 | Falcon-7B-instruct 62.81 62.88 61.77 33.36 5 | Falcon-40B-instruct 76.6 76.5 76.31 25.89 
6 | Falcon-180B-chat 82.22 81.58 81.76 25.71 7 | ChatGLM3-6B 59.83 59.18 58.49 30.02 8 | ChatGLM3-6B-32k 66.09 65.94 65.46 27.39 9 | Mistral-7B-instruct 74.16 73.44 73.21 44.99 10 | Mixtral-8x7B-instruct 80.43 80.34 79.9 33.6 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu-calib_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 56.05/56.33 56.42/56.45 54.64/54.68 51.17/51.37 50.1/50.15 2 | LLaMA2-13B 59.88/59.48 59.82/59.29 61.02/60.66 58.08/57.71 49.96/50.09 3 | LLaMA2-70B 70.26/70.47 70.09/70.57 69.54/69.56 66.87/67.19 50.73/50.61 4 | Falcon-7B 51.59/51.43 51.62/51.54 51.82/51.68 50.24/50.16 49.54/49.59 5 | Falcon-40B 61.59/61.87 61.49/61.43 60.7/60.51 57.13/57.13 49.55/49.65 6 | Falcon-180B 72.87/73.62 73/73.82 72.01/72.55 70.02/70.34 51.67/51.46 7 | ChatGLM3-6B 55.66/55.38 55.32/55.18 55.68/55.43 53.62/53.4 50.13/49.91 8 | Mistral-7B 67.83/67.27 67.55/66.84 67.47/66.79 64.09/63.53 49.97/49.88 9 | Mixtral-8x7B 73.29/72.98 73.25/73.16 71/70.58 62.93/62.95 50.04/50.12 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu-calib_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 56.05/56.33 56.15/56.31 55.29/54.94 50.35/50.47 49.75/49.91 2 | LLaMA2-13B 59.88/59.48 59.9/59.58 60.13/59.6 57.46/57.11 50.06/50.16 3 | LLaMA2-70B 70.26/70.47 69.75/70.08 67.9/68.6 66.81/67.09 49.89/50 4 | Falcon-7B 51.59/51.43 51.89/51.98 52.14/52.12 50.1/50.08 5 | Falcon-40B 61.59/61.87 61.44/61.68 61.13/60.91 53.3/53.31 50/50.01 6 | Falcon-180B 72.87/73.62 72.3/73.57 72.81/73.79 69.59/69.48 53.04/53.13 7 | ChatGLM3-6B 55.66/55.38 55.58/55.21 55.13/54.84 52.08/52.01 50.6/50.57 8 | Mistral-7B 67.83/67.27 67.58/66.98 66.26/65.56 63.64/62.99 49.99/50.03 9 | Mixtral-8x7B 73.29/72.98 73.18/73.45 71.97/71.2 66.85/66.2 50.05/49.81 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu-calib_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 56.05/56.33 56.34/56.32 54.57/54.24 49.67/49.83 2 | LLaMA2-13B 59.88/59.48 59.21/59.06 60.28/59.55 49.94/49.94 3 | LLaMA2-70B 70.26/70.47 70.41/70.53 69/69.07 50.18/50.17 4 | Falcon-7B 51.59/51.43 52.04/51.97 51.91/51.95 49.54/49.5 5 | Falcon-40B 61.59/61.87 60.84/60.82 60.18/60.07 50/50 6 | Falcon-180B 72.87/73.62 72.89/73.54 72.15/73.18 49.66/49.93 7 | ChatGLM3-6B 55.66/55.38 54.91/54.76 55.04/54.71 49.68/49.8 8 | Mistral-7B 67.83/67.27 67.76/67.24 66.03/65.74 49.86/49.55 9 | Mixtral-8x7B 73.29/72.98 70.72/70.5 69.18/68.66 50.68/49.90 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 46.69/45.89 46.64/45.86 46.19/44.98 41.15/40.44 24.98/25.21 2 | LLaMA2-13B 55.81/55.68 55.75/55.6 54.51/54.35 49.96/49.27 25.54/25.49 3 | LLaMA2-70B 69.80/69.13 69.81/69.09 68.97/68.56 67.06/66.17 35.98/36.16 4 | Falcon-7B 27.55/27.11 5 | Falcon-40B 57.03/56 57.04/56.02 55.9/55.31 52.03/50.83 26.96/27.2 6 | Falcon-180B 70.45/70.41 70.5/70.45 69.78/69.74 67.51/66.95 42.83/41.79 7 | ChatGLM3-6B 51.93/50.58 51.95/50.6 51.52/50.11 49.04/47.71 34.64/34.02 8 | ChatGLM3-6B-32k 58.01/57.29 58.08/57.36 57.11/56.15 51.26/50.17
24.44/24.98 9 | Mistral-7B 64.06/62.57 64.1/62.65 62.81/61.33 59.08/57.76 27.36/27.08 10 | Mixtral-8x7B 71.91/70.49 71.63/70.21 68.07/66.98 58.77/57.77 25.02/24.91 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 46.69/45.89 46.92/45.98 45.45/44.91 38.52/37.87 24.23/24.39 2 | LLaMA2-13B 55.81/55.68 55.73/55.69 54.55/54.28 51.07/50.51 25.08/24.94 3 | LLaMA2-70B 69.80/69.13 69.74/69.1 68.49/68.02 65.21/64.69 27.13/26.63 4 | Falcon-7B 27.55/27.11 27.71/27.08 28.6/28.27 28.42/27.71 24.78/24.6 5 | Falcon-40B 57.03/56 57.09/56.02 56.1/55.26 53.7/53.3 23.75/23.84 6 | Falcon-180B 70.45/70.41 70.4/70.42 69.38/69.57 65.91/65.58 37.76/36.83 7 | ChatGLM3-6B 51.93/50.58 51.99/50.58 51.75/50.24 48.91/47.33 25.25/25.11 8 | ChatGLM3-6B-32k 58.01/57.29 58.17/57.35 57.55/56.84 54/52.9 24.41/24.13 9 | Mistral-7B 64.06/62.57 64.07/62.53 62.64/61.15 56.04/54.96 23.08/23.13 10 | Mixtral-8x7B 71.91/70.49 71.93/70.54 70.48/69.1 64.27/63.16 24.91/24.72 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/mmlu_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 46.69/45.89 47.05/46.14 45.7/44.7 23.82/23.82 2 | LLaMA2-13B 55.81/55.68 55.77/55.32 54.28/54.06 24.17/24.08 3 | LLaMA2-70B 69.80/69.13 69.55/68.66 67.8/67.12 23.79/23.62 4 | Falcon-7B 27.55/27.11 5 | Falcon-40B 57.03/56 56.3/55.38 55.24/54.42 27.23/27 6 | Falcon-180B 70.45/70.41 70.1/70.05 69.82/69.76 24/23.61 7 | ChatGLM3-6B 51.93/50.58 51.87/50.36 51.16/49.85 24.07/23.99 8 | ChatGLM3-6B-32k 58.01/57.29 57.96/57.09 57.12/56.45 23.88/23.61 9 | Mistral-7B 64.06/62.57 63.73/62.26 62.43/60.89 24.86/24.75 10 | Mixtral-8x7B 71.91/70.49 69.29/68.15 67.35/66.52 24.27/24.14 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/strategyqa_kv.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 64.37 64.59 62.4 60.83 25.76 2 | LLaMA2-13B 67.69 67.95 68.03 66.46 44.59 3 | LLaMA2-70B 77.16 77.38 76.9 76.77 50.48 4 | Falcon-7B 54.32 54.59 54.1 53.97 0.04 5 | Falcon-40B 69.43 69.52 68.56 67.69 50.9 6 | Falcon-180B 75.37 75.33 75.41 75.63 52.23 7 | ChatGLM3-6B 48.86 48.73 50.31 50.31 47.99 8 | ChatGLM3-6B-32k 81.14 81.27 79.87 72.88 28.56 9 | Mistral-7B 67.69 67.51 66.72 63.28 40.66 10 | Mixtral-8x7B 75.76 75.81 74.72 70.48 31.7 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/strategyqa_w.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 64.37 64.28 61.44 60.39 11.48 2 | LLaMA2-13B 67.69 68.08 66.51 63.76 25.24 3 | LLaMA2-70B 77.16 77.12 75.24 73.14 49.56 4 | Falcon-7B 54.32 54.37 53.23 54.63 9.96 5 | Falcon-40B 69.43 69.48 69.43 66.77 31.92 6 | Falcon-180B 75.37 75.02 75.85 72.1 23.06 7 | ChatGLM3-6B 48.86 51.35 49.34 47.95 1.66 8 | ChatGLM3-6B-32k 81.14 81.09 78.91 71.79 2.62 9 | Mistral-7B 67.69 67.86 66.16 61.05 18.25 10 | Mixtral-8x7B 75.76 75.9 76.07 68.56 1.22 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/emergent/strategyqa_wa.txt: -------------------------------------------------------------------------------- 1 | LLaMA2-7B 64.37 
64.59 61.4 2.23 2 | LLaMA2-13B 67.69 68.17 66.55 5.46 3 | LLaMA2-70B 77.16 76.77 75.37 4.41 4 | Falcon-7B 54.32 54.5 54.98 44.8 5 | Falcon-40B 69.43 68.73 69.61 1.18 6 | Falcon-180B 75.37 75.76 75.28 1.12 7 | ChatGLM3-6B 48.86 49.83 48.69 17.25 8 | ChatGLM3-6B-32k 81.14 80.52 79.43 9.91 9 | Mistral-7B 67.69 67.42 65.9 47.51 10 | Mixtral-8x7B 75.76 75.81 74.72 2.71 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/emergent/plot_ability.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | 4 | modes = ["w", "wa", "kv"] 5 | model_names = ["Falcon-180B"] 6 | datasets = ["mmlu", "arc-c", "arc-e", "hellaswag", "gsm8k", "strategyqa", "mmlu-calib"] 7 | for mode in modes: 8 | for model_name in model_names: 9 | acc = {} 10 | minimum = {"mmlu": 25, "arc-c": 25, "arc-e": 25, "hellaswag": 25, "gsm8k": 0, "strategyqa": 0, "mmlu-calib": 50} 11 | for dataset in datasets: 12 | data_file_name = dataset + "_" + mode + ".txt" 13 | with open("./emergent/" + data_file_name, "r") as f: 14 | while True: 15 | txt = f.readline() 16 | if txt == "": 17 | break 18 | fmodel_name = txt.split("\t")[0] 19 | if "32k" in fmodel_name: 20 | continue 21 | if "instruct" in fmodel_name or "chat" in fmodel_name: 22 | fmodel_name = fmodel_name.replace("-instruct", "") 23 | fmodel_name = fmodel_name.replace("-chat", "") 24 | if fmodel_name != model_name: 25 | continue 26 | acc0 = txt.split("\t")[1:] 27 | for i in range(len(acc0)): 28 | if acc0[i].split("/")[-1] == "" or acc0[i].split("/")[-1] == "\n": 29 | acc0[i] = None 30 | else: 31 | acc0[i] = float(acc0[i].split("/")[-1]) 32 | if i == 0: 33 | std = acc0[i] 34 | acc0[i] = max((acc0[i] - minimum[dataset]) / (std - minimum[dataset]), 0) 35 | acc[dataset] = acc0 36 | ability = {} 37 | # print(acc) 38 | if mode == "w": 39 | x_label = ["FP16", "W8", "W4", "W3", "W2"] 40 | elif mode == "wa": 41 | x_label = ["FP16", "W8A8", "W4A8", "W4A4"] 42 | else: 43 | x_label = ["FP16", "KV8", "KV4", "KV3", "KV2"] 44 | 45 | for i in range(len(x_label)): 46 | ability[x_label[i]] = [] 47 | ability[x_label[i]].append(acc["mmlu"][i]) 48 | ability[x_label[i]].append(acc["strategyqa"][i]) 49 | ability[x_label[i]].append(acc["gsm8k"][i]) 50 | ability[x_label[i]].append(acc["arc-c"][i]) 51 | ability[x_label[i]].append(acc["mmlu-calib"][i]) 52 | ability[x_label[i]].append(acc["mmlu"][i]) 53 | angles = np.linspace(0, 2 * np.pi, 5, endpoint=False) 54 | angles = np.concatenate((angles, [angles[0]])) 55 | angles += np.pi / 10 56 | feature = ["ICL", "C-MR", "M-MR", "IF", "SC", "ICL"] 57 | fig = plt.figure() 58 | ax = fig.add_subplot(111, polar=True) 59 | 60 | for k, v in ability.items(): 61 | ax.plot(angles, v, "o-", linewidth=2, label=k) 62 | ax.fill(angles, v, alpha=0.25) 63 | ax.set_thetagrids(angles * 180 / np.pi, feature, size=15) 64 | plt.legend(loc="upper right", fontsize=13) 65 | plt.savefig("./figure/radarpdf/" + model_name + "_" + mode + ".pdf", bbox_inches="tight") 66 | plt.close() 67 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/litm/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | from matplotlib.lines import Line2D 5 | 6 | # Considering we plot the figures with the GUI mode, the options below should be set mannually and locally in the 7 | # script instead of being 
passed as command args. 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--save_path', type=str, default='./figures/') 11 | parser.add_argument('--save_name', type=str, default=None) 12 | parser.add_argument('--plot_mode', type=str, default='kv', choices=['w', 'wa', 'kv']) 13 | parser.add_argument('--dataset_name', type=str, default='Multi-Doc_QA') 14 | parser.add_argument('--norm', action='store_true') 15 | parser.add_argument("--num_bins", type=int, default=30, help="bin number") 16 | parser.add_argument('--x_label', type=str, default='Position', choices=['Position']) 17 | parser.add_argument('--y_label', type=str, default='Accuracy') 18 | parser.add_argument('--legend_loc', type=str, default='lower left') 19 | args = parser.parse_args() 20 | 21 | 22 | ##################################### Helper Functions ###################################### 23 | def result_pad(raw_results, fp_idx=0): 24 | raw_fp_result = raw_results[fp_idx] 25 | new_results = [x if x is not None else 0 for x in raw_results] 26 | new_results[fp_idx] = raw_fp_result 27 | return new_results 28 | 29 | def normalize_results(raw_results, fp_idx=0): 30 | # raw_results = result_pad(raw_results) 31 | if raw_results[fp_idx] is not None and raw_results[fp_idx] != 0: 32 | norm_results = [i / raw_results[fp_idx] * 100 if i is not None else None for i in raw_results] 33 | print('Result Normalization Succeeded.') 34 | else: 35 | norm_results = raw_results 36 | print('The input results have no FP precision, return original results.') 37 | return norm_results 38 | 39 | MARKERS = ['o', '^', 's', 'd', '+'] 40 | COLORS = ['tab:blue', 'tab:red', 'tab:green', 'tab:orange', 'tab:purple', 'tab:brown'] 41 | MODELS = ["Mistral_7B", "Vicuna_7B"] 42 | 43 | 44 | ###################################### Plot Functions ###################################### 45 | if __name__ == '__main__': 46 | if args.plot_mode == 'kv': 47 | import pos.kv_length_result as data 48 | elif args.plot_mode == 'w': 49 | import pos.w_length_result as data 50 | elif args.plot_mode == 'wa': 51 | import pos.wa_length_result as data 52 | 53 | # create a folder to save the resulting plot. 
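# (an equivalent one-liner, if preferred: os.makedirs(args.save_path, exist_ok=True))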
54 | if not os.path.exists(args.save_path): 55 | os.makedirs(args.save_path) 56 | 57 | # plot name 58 | save_name = args.save_name + '.pdf' if args.save_name is not None else \ 59 | args.dataset_name + '_' + args.plot_mode + '_' + args.x_label + '.pdf' 60 | save_name = args.save_path + save_name 61 | 62 | # init canvas 63 | fig, ax = plt.subplots(figsize=[8., 6.]) 64 | 65 | # make the plot compact 66 | plt.subplots_adjust(left=0.11, right=0.99, top=0.99, bottom=0.11) 67 | 68 | # set figure labels 69 | plt.xlabel(args.x_label, fontsize=20) 70 | plt.ylabel(args.y_label if not args.norm else 'Normalized ' + args.y_label, fontsize=20) 71 | plt.ylim(0.2, 0.75) 72 | 73 | # set axes font size 74 | ax.tick_params(axis='x', labelsize=20) 75 | ax.tick_params(axis='y', labelsize=20) 76 | 77 | x_axis = { 78 | 'Position': ['0', '1', '2', '3', '4', '5', '6'], 79 | }[args.x_label] 80 | 81 | # mode = { 82 | # 'w': ['FP16', 'W8', 'W4', 'W3', 'W2'], 83 | # 'wa': ['FP16', 'W8A8', 'W4A8', 'W4A4'], 84 | # 'kv': ['FP16', 'KV8', 'KV4', 'KV3', 'KV2'], 85 | # }[args.plot_mode] 86 | 87 | mode = { 88 | 'w': ['FP16', 'W8', 'W4', 'W3'], 89 | 'wa': ['FP16', 'W8A8', 'W4A8'], 90 | 'kv': ['FP16', 'KV8', 'KV4', 'KV3'], 91 | }[args.plot_mode] 92 | 93 | for i, model_name in enumerate(MODELS): 94 | for j, _ in enumerate(mode): 95 | # ax.plot(x_axis, eval("data.{}".format(model_name))[j], marker=MARKERS[j], markersize=13, label=model_name + '-' + mode[j], linestyle="-", color=COLORS[i]) 96 | y_value = eval("data.{}".format(model_name))[j] 97 | ax.plot(x_axis[:len(y_value)], y_value, marker=MARKERS[j], markersize=13, linestyle="-", color=COLORS[i]) 98 | 99 | # set legend location 100 | # ax.legend(loc=args.legend_loc) 101 | legend_elements = [] 102 | for i, _ in enumerate(mode): 103 | legend_elements.append(Line2D([0], [0], marker=MARKERS[i], color='w', label=mode[i], markerfacecolor='tab:brown', markersize=16)) 104 | ax.legend(handles=legend_elements, fontsize=20) 105 | 106 | # for i, _ in enumerate(MODELS): 107 | # legend_elements.append(Line2D([0], [0], color=COLORS[i], label=MODELS[i], linewidth=4)) 108 | # ax.legend(handles=legend_elements, fontsize=20) 109 | 110 | # Warning: please do not change the figure you are previewing 111 | # preview the figure 112 | plt.savefig(save_name) 113 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/litm/pos/kv_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_7B = [[0.6467043314500942, 0.4952919020715631, 0.4768361581920904, 0.4583804143126177, 0.44670433145009414, 0.45649717514124294, 0.5028248587570622], [0.6429378531073446, 0.49642184557438795, 0.4726930320150659, 0.4553672316384181, 0.4542372881355932, 0.45800376647834273, 0.503578154425612], [0.6071563088512241, 0.487758945386064, 0.46440677966101696, 0.45084745762711864, 0.4369114877589454, 0.44143126177024483, 0.48851224105461394], [0.4512241054613936, 0.3630885122410546, 0.34199623352165726, 0.34764595103578155, 0.3694915254237288, 0.36271186440677966, 0.4056497175141243]] 2 | LongChat_13B = [[0.6700564971751413, 0.5480225988700564, 0.5276836158192091, 0.5291902071563088, 0.5205273069679849, 0.5133709981167608, 0.5506591337099812], [0.6625235404896422, 0.5532956685499059, 0.5318267419962335, 0.5250470809792843, 0.5212806026365349, 0.5133709981167608, 0.5427495291902071], [0.6557438794726931, 0.5318267419962335, 0.5205273069679849, 0.5129943502824859, 0.5096045197740113, 0.4919020715630885, 
0.5299435028248588], [0.5811676082862524, 0.439924670433145, 0.423728813559322, 0.4233521657250471, 0.4263653483992467, 0.4248587570621469, 0.43615819209039547]] 3 | Vicuna_7B = [[0.6749529190207156, 0.5585687382297552, 0.4726930320150659, 0.43954802259887005, 0.4421845574387947, 0.43578154425612053, 0.47871939736346514], [0.6817325800376648, 0.5555555555555556, 0.47532956685499056, 0.4384180790960452, 0.4425612052730697, 0.4354048964218456, 0.47796610169491527], [0.6429378531073446, 0.5065913370998116, 0.4568738229755179, 0.4256120527306968, 0.42259887005649716, 0.41883239171374764, 0.4689265536723164], [0.49227871939736345, 0.38267419962335214, 0.2998116760828625, 0.29303201506591336, 0.3013182674199623, 0.2568738229755179, 0.32354048964218457]] 4 | Vicuna_13B = [[0.6896421845574388, 0.5013182674199623, 0.4463276836158192, 0.42259887005649716, 0.4256120527306968, 0.4312617702448211, 0.4384180790960452], [0.688512241054614, 0.5020715630885122, 0.44745762711864406, 0.4256120527306968, 0.43163841807909603, 0.4312617702448211, 0.44143126177024483], [0.6700564971751413, 0.5114877589453861, 0.4651600753295669, 0.44030131826741997, 0.43088512241054616, 0.4350282485875706, 0.4497175141242938], [0.5329566854990584, 0.3728813559322034, 0.351789077212806, 0.34048964218455746, 0.3480225988700565, 0.33634651600753296, 0.3951035781544256]] 5 | ChatGLM2_6B = [[0.4847457627118644, 0.30922787193973633, 0.31186440677966104, 0.2941619585687382, 0.29152542372881357, 0.3065913370998117, 0.4011299435028249], [0.48436911487758944, 0.30772128060263654, 0.3047080979284369, 0.2903954802259887, 0.30282485875706217, 0.31148775894538605, 0.39623352165725045], [0.47005649717514125, 0.2885122410546139, 0.2952919020715631, 0.2847457627118644, 0.2768361581920904, 0.2760828625235405, 0.37589453860640304], [0.3871939736346516, 0.23126177024482109, 0.2519774011299435, 0.2384180790960452, 0.2192090395480226, 0.21242937853107344, 0.30998116760828626]] 6 | ChatGLM3_6B = [[0.5966101694915255, 0.4915254237288136, 0.4847457627118644, 0.5020715630885122, 0.45461393596986815, 0.5009416195856874, 0.5235404896421846], [0.5902071563088512, 0.49566854990583803, 0.4888888888888889, 0.49491525423728816, 0.46365348399246703, 0.5069679849340867, 0.5209039548022599], [0.5834274952919021, 0.4873822975517891, 0.4806026365348399, 0.47909604519774013, 0.4591337099811676, 0.5001883239171375, 0.5152542372881356], [0.4651600753295669, 0.35555555555555557, 0.3465160075329567, 0.35480225988700564, 0.32919020715630887, 0.36610169491525424, 0.37928436911487756]] 7 | Mistral_7B = [[0.6022598870056497, 0.5951035781544256, 0.6146892655367232, 0.639924670433145, 0.6365348399246704, 0.6414312617702448, 0.6602636534839925], [0.5969868173258004, 0.5898305084745763, 0.6143126177024482, 0.639924670433145, 0.64030131826742, 0.639924670433145, 0.6606403013182675], [0.5947269303201507, 0.5792843691148776, 0.5992467043314501, 0.6369114877589453, 0.6380414312617703, 0.6391713747645951, 0.6538606403013183], [0.5370998116760829, 0.46779661016949153, 0.4975517890772128, 0.5329566854990584, 0.5370998116760829, 0.5664783427495292, 0.5807909604519774]] 8 | Mixtral_8x7B = [[0.6489642184557439, 0.6519774011299435, 0.6704331450094162, 0.6753295668549906, 0.664783427495292, 0.6613935969868173, 0.7239171374764595], [0.6467043314500942, 0.655367231638418, 0.6775894538606403, 0.672316384180791, 0.6595103578154425, 0.663653483992467, 0.7224105461393597], [0.6595103578154425, 0.6451977401129944, 0.6538606403013183, 0.6621468926553672, 0.6632768361581921, 0.6583804143126177, 
0.7096045197740113], [0.5875706214689266, 0.5653483992467043, 0.567984934086629, 0.5770244821092279, 0.5642184557438795, 0.592467043314501, 0.6587570621468927]] 9 | LLaMA2_70B = [[0.5789077212806026, 0.5276836158192091, 0.5393596986817326, 0.5883239171374764, 0.695668549905838], [0.5819209039548022, 0.5205273069679849, 0.5389830508474577, 0.5883239171374764, 0.696045197740113], [0.5725047080979284, 0.5295668549905838, 0.5386064030131826, 0.5905838041431262, 0.6934086629001883], [0.5269303201506591, 0.4990583804143126, 0.5269303201506591, 0.5774011299435028, 0.6677966101694915]] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/litm/pos/w_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_7B = [[0.6467043314500942, 0.4952919020715631, 0.4768361581920904, 0.4583804143126177, 0.44670433145009414, 0.45649717514124294, 0.5028248587570622], [0.6440677966101694, 0.4967984934086629, 0.47871939736346514, 0.4610169491525424, 0.4512241054613936, 0.4632768361581921, 0.503578154425612], [0.6165725047080979, 0.46440677966101696, 0.45725047080979286, 0.4444444444444444, 0.44105461393596984, 0.432015065913371, 0.4463276836158192], [0.40941619585687383, 0.30772128060263654, 0.33973634651600754, 0.3344632768361582, 0.32693032015065915, 0.3167608286252354, 0.36271186440677966]] 2 | LongChat_13B = [[0.6700564971751413, 0.5480225988700564, 0.5276836158192091, 0.5291902071563088, 0.5205273069679849, 0.5133709981167608, 0.5506591337099812], [0.6741996233521658, 0.5435028248587571, 0.5310734463276836, 0.5242937853107345, 0.5193973634651601, 0.5163841807909605, 0.5510357815442561], [0.6666666666666666, 0.5276836158192091, 0.5054613935969868, 0.49227871939736345, 0.5005649717514125, 0.49416195856873824, 0.5291902071563088], [0.6060263653483993, 0.43427495291902074, 0.4214689265536723, 0.423728813559322, 0.4346516007532957, 0.4312617702448211, 0.40414312617702447]] 3 | Vicuna_7B = [[0.6749529190207156, 0.5585687382297552, 0.4726930320150659, 0.43954802259887005, 0.4421845574387947, 0.43578154425612053, 0.47871939736346514], [0.6813559322033899, 0.5574387947269304, 0.47306967984934084, 0.4429378531073446, 0.44105461393596984, 0.4327683615819209, 0.48361581920903957], [0.6387947269303201, 0.5322033898305085, 0.4674199623352166, 0.4463276836158192, 0.44708097928436913, 0.4534839924670433, 0.4937853107344633], [0.5570621468926553, 0.42297551789077215, 0.392090395480226, 0.37175141242937854, 0.36610169491525424, 0.3694915254237288, 0.3559322033898305]] 4 | Vicuna_13B = [[0.6896421845574388, 0.5013182674199623, 0.4463276836158192, 0.42259887005649716, 0.4256120527306968, 0.4312617702448211, 0.4384180790960452], [0.6836158192090396, 0.5005649717514125, 0.4485875706214689, 0.42674199623352166, 0.42674199623352166, 0.4331450094161959, 0.43804143126177025], [0.6937853107344633, 0.5382297551789077, 0.46817325800376647, 0.4335216572504708, 0.4274952919020716, 0.44143126177024483, 0.4527306967984934], [0.5638418079096045, 0.46854990583804146, 0.44595103578154427, 0.4290018832391714, 0.4105461393596987, 0.42109227871939736, 0.44745762711864406]] 5 | ChatGLM2_6B = [[0.4847457627118644, 0.30922787193973633, 0.31186440677966104, 0.2941619585687382, 0.29152542372881357, 0.3065913370998117, 0.4011299435028249], [0.4858757062146893, 0.3088512241054614, 0.30583804143126175, 0.29792843691148774, 0.29303201506591336, 0.30847457627118646, 0.3935969868173258], [0.4854990583804143, 0.303954802259887, 
0.2858757062146893, 0.28210922787193976, 0.3013182674199623, 0.3160075329566855, 0.41883239171374764], [0.41506591337099813, 0.25348399246704334, 0.24180790960451978, 0.232015065913371, 0.24858757062146894, 0.2749529190207156, 0.367984934086629]] 6 | ChatGLM3_6B = [[0.5966101694915255, 0.4915254237288136, 0.4847457627118644, 0.5020715630885122, 0.45461393596986815, 0.5009416195856874, 0.5235404896421846], [0.60075329566855, 0.49566854990583803, 0.48512241054613936, 0.4915254237288136, 0.47532956685499056, 0.5096045197740113, 0.5231638418079096], [0.5849340866290019, 0.4839924670433145, 0.456120527306968, 0.46779661016949153, 0.45310734463276836, 0.47721280602636534, 0.5009416195856874], [0.5261770244821092, 0.4233521657250471, 0.416195856873823, 0.42523540489642186, 0.42862523540489644, 0.455743879472693, 0.480225988700565]] 7 | Mistral_7B = [[0.6022598870056497, 0.5951035781544256, 0.6146892655367232, 0.639924670433145, 0.6365348399246704, 0.6414312617702448, 0.6602636534839925], [0.5977401129943503, 0.5951035781544256, 0.6131826741996234, 0.6444444444444445, 0.6414312617702448, 0.6380414312617703, 0.6598870056497175], [0.5728813559322034, 0.5630885122410546, 0.5894538606403014, 0.6203389830508474, 0.6109227871939736, 0.6120527306967984, 0.6621468926553672], [0.5005649717514125, 0.5137476459510358, 0.5397363465160075, 0.5777777777777777, 0.5917137476459511, 0.5894538606403014, 0.6188323917137476]] 8 | Mixtral_8x7B = [[0.6489642184557439, 0.6519774011299435, 0.6704331450094162, 0.6753295668549906, 0.664783427495292, 0.6613935969868173, 0.7239171374764595], [0.6516007532956686, 0.6508474576271186, 0.6700564971751413, 0.672316384180791, 0.6685499058380414, 0.6659133709981168, 0.7182674199623352], [0.6278719397363465, 0.6380414312617703, 0.6583804143126177, 0.6681732580037665, 0.6467043314500942, 0.6523540489642184, 0.7250470809792844], [0.6030131826741997, 0.5834274952919021, 0.6022598870056497, 0.6165725047080979, 0.6116760828625235, 0.6222222222222222, 0.672316384180791]] 9 | LLaMA2_70B = [[0.5789077212806026, 0.5276836158192091, 0.5393596986817326, 0.5883239171374764, 0.695668549905838], [0.5807909604519774, 0.528060263653484, 0.5419962335216573, 0.591337099811676, 0.6949152542372882], [0.551789077212806, 0.5190207156308851, 0.5340866290018832, 0.5856873822975518, 0.6937853107344633], [0.5043314500941619, 0.46817325800376647, 0.48851224105461394, 0.5013182674199623, 0.6444444444444445]] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/litm/pos/wa_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_7B = [[0.6467043314500942, 0.4952919020715631, 0.4768361581920904, 0.4583804143126177, 0.44670433145009414, 0.45649717514124294, 0.5028248587570622], [0.6459510357815442, 0.49227871939736345, 0.47721280602636534, 0.45084745762711864, 0.4346516007532957, 0.4485875706214689, 0.49830508474576274], [0.6146892655367232, 0.4711864406779661, 0.46365348399246703, 0.4519774011299435, 0.45160075329566857, 0.44030131826741997, 0.4598870056497175]] 2 | Vicuna_7B = [[0.6749529190207156, 0.5585687382297552, 0.4726930320150659, 0.43954802259887005, 0.4421845574387947, 0.43578154425612053, 0.47871939736346514], [0.6760828625235404, 0.5555555555555556, 0.4741996233521657, 0.4406779661016949, 0.4354048964218456, 0.43728813559322033, 0.4768361581920904], [0.6282485875706215, 0.5269303201506591, 0.4610169491525424, 0.43163841807909603, 0.44595103578154427, 0.448210922787194, 
0.4817325800376648]] 3 | Vicuna_13B = [[0.6896421845574388, 0.5013182674199623, 0.4463276836158192, 0.42259887005649716, 0.4256120527306968, 0.4312617702448211, 0.4384180790960452], [0.6952919020715631, 0.511864406779661, 0.46854990583804146, 0.4354048964218456, 0.43653483992467046, 0.44519774011299434, 0.4576271186440678], [0.6934086629001883, 0.5374764595103578, 0.4704331450094162, 0.4425612052730697, 0.4331450094161959, 0.45235404896421844, 0.4519774011299435]] 4 | ChatGLM2_6B = [[0.4847457627118644, 0.30922787193973633, 0.31186440677966104, 0.2941619585687382, 0.29152542372881357, 0.3065913370998117, 0.4011299435028249], [0.4711864406779661, 0.3129943502824859, 0.30583804143126175, 0.28775894538606406, 0.296045197740113, 0.3016949152542373, 0.4011299435028249], [0.4821092278719397, 0.2994350282485876, 0.2952919020715631, 0.28210922787193976, 0.2983050847457627, 0.319774011299435, 0.4143126177024482]] 5 | ChatGLM3_6B = [[0.5966101694915255, 0.4915254237288136, 0.4847457627118644, 0.5020715630885122, 0.45461393596986815, 0.5009416195856874, 0.5235404896421846], [0.5947269303201507, 0.5016949152542373, 0.4749529190207156, 0.4817325800376648, 0.45875706214689266, 0.5028248587570622, 0.5212806026365349], [0.5958568738229755, 0.4817325800376648, 0.46026365348399245, 0.4775894538606403, 0.4429378531073446, 0.4760828625235405, 0.4986817325800377]] 6 | Mistral_7B = [[0.6022598870056497, 0.5951035781544256, 0.6146892655367232, 0.639924670433145, 0.6365348399246704, 0.6414312617702448, 0.6602636534839925], [0.5981167608286252, 0.5856873822975518, 0.6210922787193973, 0.6474576271186441, 0.6425612052730697, 0.63954802259887, 0.663653483992467], [0.5709981167608287, 0.5634651600753295, 0.5935969868173258, 0.616195856873823, 0.6116760828625235, 0.6154425612052731, 0.6527306967984934]] 7 | Mixtral_8x7B = [[0.6489642184557439, 0.6519774011299435, 0.6704331450094162, 0.6753295668549906, 0.664783427495292, 0.6613935969868173, 0.7239171374764595], [0.6448210922787194, 0.6610169491525424, 0.6662900188323917, 0.6693032015065913, 0.6651600753295669, 0.6591337099811676, 0.7227871939736347], [0.6278719397363465, 0.6308851224105462, 0.6538606403013183, 0.6651600753295669, 0.6444444444444445, 0.647834274952919, 0.7182674199623352]] 8 | LLaMA2_70B = [[0.5789077212806026, 0.5276836158192091, 0.5393596986817326, 0.5883239171374764, 0.695668549905838], [0.5672316384180791, 0.5163841807909605, 0.5333333333333333, 0.5871939736346516, 0.6839924670433145], [0.544256120527307, 0.5020715630885122, 0.5261770244821092, 0.5702448210922787, 0.6903954802259887]] 9 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/length/kv_length_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM2_6B = [[0.952, 0.956, 0.896, 0.9, 0.882, 0.854], [0.954, 0.954, 0.896, 0.896, 0.882, 0.856], [0.936, 0.928, 0.868, 0.872, 0.814, 0.822], [0.478, 0.412, 0.410, 0.310, 0.240, 0.222], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 2 | ChatGLM3_6B = [[0.476, 0.284, 0.614, 0.854, 0.824, 0.85], [0.484, 0.288, 0.628, 0.864, 0.826, 0.864], [0.03, 0.024, 0.104, 0.05, 0.120, 0.206], [0.012, 0.024, 0.008, 0.022, 0.038, 0.014], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 3 | LongChat_13B = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.966, 0.920, 0.630, 0.556, 0.456, 0.342], [0.910, 0.866, 0.552, 0.458, 0.336, 0.256], [0.518, 0.398, 0.164, 0.062, 0.032, 0.028], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 4 | LongChat_7B = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], [0.946, 
0.866, 0.686, 0.518, 0.396, 0.184], [0.846, 0.738, 0.522, 0.348, 0.23, 0.106], [0.242, 0.074, 0.064, 0.020, 0.01, 0.002], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 5 | Vicuna_7B = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.988, 0.964, 0.902, 0.73, 0.536], [0.964, 0.964, 0.904, 0.754, 0.55, 0.36], [0.300, 0.242, 0.070, 0.04, 0.01, 0.004], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 6 | Vicuna_13B = [[0.99, 0.974, 0.956, 0.898, 0.672, 0.416], [0.992, 0.974, 0.96, 0.896, 0.668, 0.394], [0.962, 0.96, 0.910, 0.804, 0.564, 0.312], [0.500, 0.296, 0.164, 0.042, 0.064, 0.002], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 7 | Mixtral_8x7B = [[1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [1.0, 0.998, 1.0, 0.996, 0.998, 1.0], [0.998, 0.996, 0.994, 0.994, 0.982, 0.982], [0.692, 0.644, 0.602, 0.544, 0.46, 0.452], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 8 | Mistral_7B = [[0.998, 0.994, 0.994, 0.978, 0.97, 0.948], [0.998, 0.994, 0.994, 0.958, 0.954, 0.934], [0.996, 0.982, 0.96, 0.928, 0.914, 0.904], [0.784, 0.712, 0.628, 0.542, 0.5, 0.406], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 9 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/length/w_length_result.py: -------------------------------------------------------------------------------- 1 | # ChatGLM3_6B = [[0.476, 0.284, 0.614, 0.854, 0.824, 0.85], [0.45, 0.27, 0.602, 0.832, 0.816, 0.856], [0.874, 0.77, 0.908, 0.928, 0.93, 0.908], [0.816, 0.42, 0.794, 0.768, 0.868, 0.852], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 2 | ChatGLM2_6B = [[0.952, 0.956, 0.896, 0.9, 0.882, 0.854], [0.95, 0.95, 0.896, 0.89, 0.882, 0.854], [0.978, 0.968, 0.91, 0.906, 0.88, 0.86], [0.904, 0.88, 0.83, 0.774, 0.752, 0.676], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 3 | LongChat_13B = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.964, 0.924, 0.888, 0.88, 0.778, 0.722], [0.966, 0.898, 0.832, 0.83, 0.708, 0.608], [0.806, 0.786, 0.702, 0.602, 0.45, 0.392], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 4 | LongChat_7B = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], [0.946, 0.874, 0.798, 0.648, 0.588, 0.384], [0.924, 0.808, 0.724, 0.654, 0.44, 0.286], [0.676, 0.54, 0.422, 0.292, 0.16, 0.082], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 5 | Vicuna_7B = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.988, 0.968, 0.91, 0.74, 0.578], [0.996, 0.992, 0.98, 0.902, 0.754, 0.614], [0.928, 0.838, 0.66, 0.524, 0.392, 0.304], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 6 | Vicuna_13B = [[0.99, 0.974, 0.956, 0.898, 0.672, 0.416], [0.992, 0.974, 0.96, 0.904, 0.668, 0.428], [0.976, 0.95, 0.902, 0.834, 0.67, 0.348], [0.674, 0.864, 0.84, 0.642, 0.482, 0.282], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 7 | Mixtral_8x7B = [[1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [1.0, 0.998, 1.0, 0.996, 0.998, 1.0], [1.0, 1.0, 0.998, 0.998, 0.998, 0.996], [0.954, 0.93, 0.914, 0.846, 0.828, 0.82], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 8 | Mistral_7B = [[0.998, 0.994, 0.994, 0.978, 0.97, 0.948], [0.998, 0.994, 0.994, 0.974, 0.968, 0.944], [0.996, 0.998, 0.988, 0.978, 0.954, 0.922], [0.988, 0.944, 0.91, 0.852, 0.78, 0.76], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 9 | 10 | ChatGLM2_6B_AWQ = [[0.952, 0.956, 0.896, 0.9, 0.882, 0.854], [0.952, 0.954, 0.896, 0.896, 0.882, 0.856], [0.94, 0.932, 0.872, 0.86, 0.862, 0.822], [0.92, 0.91, 0.848, 0.798, 0.808, 0.758], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 11 | LongChat_13B_AWQ = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.966, 0.924, 0.884, 0.878, 0.772, 0.724], [0.976, 0.914, 0.864, 0.83, 0.714, 0.61], [0.928, 0.876, 0.778, 0.7, 0.556, 0.438], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 12 | LongChat_7B_AWQ = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], 
[0.946, 0.874, 0.794, 0.656, 0.594, 0.388], [0.918, 0.868, 0.786, 0.684, 0.562, 0.372], [0.77, 0.604, 0.5, 0.476, 0.326, 0.264], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 13 | Vicuna_7B_AWQ = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.99, 0.964, 0.912, 0.736, 0.582], [0.998, 0.992, 0.964, 0.892, 0.71, 0.578], [0.974, 0.860, 0.904, 0.772, 0.57, 0.412], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 14 | Vicuna_13B_AWQ = [[0.99, 0.974, 0.956, 0.898, 0.672, 0.416], [0.992, 0.974, 0.956, 0.896, 0.67, 0.408], [0.986, 0.966, 0.944, 0.852, 0.58, 0.33], [0.948, 0.918, 0.714, 0.528, 0.36, 0.164], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 15 | Mixtral_8x7B_AWQ = [[1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [0.998, 0.998, 1.0, 0.998, 0.994, 0.996], [0.968, 0.95, 0.882, 0.924, 0.886, 0.902], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 16 | Mistral_7B_AWQ = [[0.998, 0.994, 0.994, 0.978, 0.97, 0.948], [0.998, 0.994, 0.994, 0.98, 0.97, 0.948], [0.998, 0.998, 0.988, 0.964, 0.954, 0.926], [0.996, 0.996, 0.966, 0.942, 0.926, 0.87], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 17 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/length/wa_length_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM2_6B = [[0.952, 0.956, 0.896, 0.9, 0.882, 0.854], [0.972, 0.964, 0.916, 0.904, 0.888, 0.864], [0.972, 0.96, 0.912, 0.918, 0.894, 0.866], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 2 | ChatGLM3_6B = [[0.476, 0.284, 0.614, 0.854, 0.824, 0.85], [0.74, 0.64, 0.82, 0.908, 0.808, 0.86], [0.894, 0.896, 0.912, 0.922, 0.914, 0.902], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 3 | LongChat_13B = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.974, 0.92, 0.904, 0.902, 0.794, 0.708], [0.964, 0.862, 0.798, 0.802, 0.71, 0.618], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 4 | LongChat_7B = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], [0.956, 0.884, 0.824, 0.75, 0.618, 0.448], [0.93, 0.848, 0.776, 0.61, 0.42, 0.308], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 5 | Vicuna_7B = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.99, 0.972, 0.908, 0.72, 0.586], [0.996, 0.996, 0.966, 0.888, 0.696, 0.586], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 6 | Vicuna_13B = [[0.99, 0.974, 0.956, 0.898, 0.672, 0.416], [0.99, 0.974, 0.95, 0.906, 0.716, 0.462], [0.978, 0.948, 0.882, 0.862, 0.696, 0.446], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 7 | Mixtral_8x7B = [[1.0, 0.998, 1.0, 0.996, 0.996, 1.0], [1.0, 0.996, 1.0, 0.994, 0.994, 0.996], [1.0, 0.998, 0.998, 1.0, 0.998, 0.996], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 8 | Mistral_7B = [[0.998, 0.994, 0.994, 0.978, 0.97, 0.948], [0.998, 0.996, 0.996, 0.978, 0.954, 0.932], [0.998, 0.994, 0.99, 0.96, 0.944, 0.908], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 9 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/length/wkv_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_13B = [[0.964, 0.924, 0.888, 0.874, 0.776, 0.718], [0.968, 0.924, 0.618, 0.562, 0.456, 0.342], [0.962, 0.904, 0.63, 0.566, 0.494, 0.378], [0.924, 0.872, 0.55, 0.466, 0.374, 0.248], [0.91, 0.836, 0.552, 0.44, 0.36, 0.266]] 2 | LongChat_7B = [[0.944, 0.876, 0.794, 0.66, 0.598, 0.398], [0.946, 0.872, 0.682, 0.506, 0.394, 0.18], [0.912, 0.814, 0.7, 0.544, 0.338, 0.198], [0.83, 0.752, 0.52, 0.332, 0.19, 0.102], [0.81, 0.664, 0.534, 0.326, 0.198, 0.108]] 3 | Vicuna_7B = [[0.996, 0.988, 0.968, 0.914, 0.738, 0.578], [0.996, 0.988, 0.966, 0.898, 0.72, 0.544], [0.996, 0.994, 0.974, 0.904, 
0.73, 0.596], [0.964, 0.966, 0.888, 0.744, 0.554, 0.372], [0.964, 0.952, 0.872, 0.788, 0.586, 0.37]] 4 | Vicuna_13B = [[0.990, 0.974, 0.956, 0.898, 0.672, 0.416], [0.992, 0.976, 0.96, 0.894, 0.678, 0.408], [0.978, 0.946, 0.91, 0.83, 0.632, 0.364], [0.966, 0.958, 0.904, 0.798, 0.554, 0.336], [0.942, 0.91, 0.682, 0.732, 0.508, 0.29]] 5 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/pos/wa_length_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM2_6B = [[0.8125, 0.9661016949152542, 0.8055555555555556, 0.875, 0.6825396825396826, 0.6585365853658537, 0.8771929824561403, 0.9056603773584906, 0.9818181818181818, 0.9375], [0.8333333333333334, 0.9491525423728814, 0.8333333333333334, 0.875, 0.6984126984126984, 0.7560975609756098, 0.8421052631578947, 0.9245283018867925, 0.9818181818181818, 0.9375], [0.8958333333333334, 0.9661016949152542, 0.8888888888888888, 0.85, 0.7936507936507936, 0.5609756097560976, 0.8596491228070176, 0.8867924528301887, 0.9636363636363636, 0.9375], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 2 | ChatGLM3_6B = [[0.8125, 0.6949152542372882, 0.8055555555555556, 0.925, 0.7936507936507936, 0.8292682926829268, 0.8421052631578947, 0.8867924528301887, 1.0, 0.9375], [0.9166666666666666, 0.7627118644067796, 0.6944444444444444, 0.925, 0.8253968253968254, 0.8292682926829268, 0.8771929824561403, 0.8679245283018868, 0.9454545454545454, 0.9375], [0.9375, 0.8813559322033898, 0.8888888888888888, 0.95, 0.8571428571428571, 0.8780487804878049, 0.9298245614035088, 0.9245283018867925, 0.9090909090909091, 0.875], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 3 | LongChat_13B = [[0.3333333333333333, 0.423728813559322, 0.6111111111111112, 0.675, 0.7777777777777778, 0.8048780487804879, 0.7719298245614035, 0.8867924528301887, 0.9272727272727272, 0.9375], [0.3541666666666667, 0.3389830508474576, 0.5833333333333334, 0.725, 0.7619047619047619, 0.7804878048780488, 0.8070175438596491, 0.8867924528301887, 0.8909090909090909, 0.9375], [0.3333333333333333, 0.2033898305084746, 0.5, 0.625, 0.6349206349206349, 0.5853658536585366, 0.7368421052631579, 0.7735849056603774, 0.9090909090909091, 0.8541666666666666], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 4 | LongChat_7B = [[0.125, 0.22033898305084745, 0.4444444444444444, 0.5, 0.5396825396825397, 0.3170731707317073, 0.3157894736842105, 0.41509433962264153, 0.5636363636363636, 0.5416666666666666], [0.125, 0.1864406779661017, 0.4444444444444444, 0.65, 0.5396825396825397, 0.3902439024390244, 0.45614035087719296, 0.49056603773584906, 0.6181818181818182, 0.6041666666666666], [0.16666666666666666, 0.06779661016949153, 0.2777777777777778, 0.475, 0.31746031746031744, 0.2926829268292683, 0.3157894736842105, 0.3584905660377358, 0.45454545454545453, 0.3958333333333333], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 5 | Vicuna_7B = [[0.10416666666666667, 0.2711864406779661, 0.3055555555555556, 0.7, 0.49206349206349204, 0.3902439024390244, 0.6842105263157895, 0.8301886792452831, 0.9636363636363636, 0.9583333333333334], [0.14583333333333334, 0.2542372881355932, 0.3888888888888889, 0.7, 0.49206349206349204, 0.4146341463414634, 0.6842105263157895, 0.8679245283018868, 0.9090909090909091, 0.9583333333333334], [0.0625, 0.2711864406779661, 0.3333333333333333, 0.525, 0.5873015873015873, 0.4878048780487805, 0.7368421052631579, 0.8679245283018868, 0.9272727272727272, 0.9375], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 6 | 
Vicuna_13B = [[0.16666666666666666, 0.0847457627118644, 0.05555555555555555, 0.325, 0.3333333333333333, 0.1951219512195122, 0.543859649122807, 0.6415094339622641, 0.8181818181818182, 0.8541666666666666], [0.22916666666666666, 0.11864406779661017, 0.08333333333333333, 0.275, 0.2698412698412698, 0.4634146341463415, 0.6140350877192983, 0.7358490566037735, 0.8363636363636363, 0.8958333333333334], [0.25, 0.11864406779661017, 0.05555555555555555, 0.475, 0.3968253968253968, 0.21951219512195122, 0.5263157894736842, 0.6226415094339622, 0.7818181818181819, 0.8958333333333334], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 7 | Mixtral_8x7B = [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 0.9722222222222222, 1.0, 1.0, 1.0, 1.0, 0.9811320754716981, 1.0, 1.0], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9824561403508771, 1.0, 0.9818181818181818, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 8 | Mistral_7B = [[0.9791666666666666, 1.0, 1.0, 0.975, 0.9047619047619048, 0.8292682926829268, 0.8596491228070176, 0.9433962264150944, 1.0, 1.0], [1.0, 1.0, 1.0, 0.975, 0.8412698412698413, 0.7804878048780488, 0.8245614035087719, 0.9245283018867925, 1.0, 1.0], [0.9791666666666666, 0.9830508474576272, 0.9444444444444444, 0.925, 0.7936507936507936, 0.7073170731707317, 0.7894736842105263, 0.9811320754716981, 0.9818181818181818, 1.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 9 | -------------------------------------------------------------------------------- /qllm_eval/visualization/long/longeval/pos/wkv_length_result.py: -------------------------------------------------------------------------------- 1 | LongChat_13B = [[0.3333333333333333, 0.423728813559322, 0.6111111111111112, 0.675, 0.7777777777777778, 0.8048780487804879, 0.7719298245614035, 0.8867924528301887, 0.9272727272727272, 0.9375], [0.22916666666666666, 0.1694915254237288, 0.19444444444444445, 0.55, 0.4126984126984127, 0.2682926829268293, 0.3157894736842105, 0.2830188679245283, 0.45454545454545453, 0.5416666666666666], [0.25, 0.11864406779661017, 0.25, 0.45, 0.47619047619047616, 0.21951219512195122, 0.38596491228070173, 0.4339622641509434, 0.5636363636363636, 0.5833333333333334], [0.1875, 0.1016949152542373, 0.0, 0.3, 0.38095238095238093, 0.24390243902439024, 0.21052631578947367, 0.18867924528301888, 0.38181818181818183, 0.4166666666666667], [0.20833333333333334, 0.0847457627118644, 0.1111111111111111, 0.25, 0.3333333333333333, 0.1951219512195122, 0.24561403508771928, 0.22641509433962265, 0.34545454545454546, 0.625]] 2 | LongChat_7B = [[0.125, 0.22033898305084745, 0.4444444444444444, 0.5, 0.5396825396825397, 0.3170731707317073, 0.3157894736842105, 0.41509433962264153, 0.5636363636363636, 0.5416666666666666], [0.08333333333333333, 0.11864406779661017, 0.2222222222222222, 0.25, 0.23809523809523808, 0.1951219512195122, 0.12280701754385964, 0.1509433962264151, 0.2, 0.25], [0.1875, 0.05084745762711865, 0.1111111111111111, 0.325, 0.2698412698412698, 0.17073170731707318, 0.10526315789473684, 0.18867924528301888, 0.2727272727272727, 0.3125], [0.0625, 
0.01694915254237288, 0.1111111111111111, 0.15, 0.1111111111111111, 0.0975609756097561, 0.07017543859649122, 0.09433962264150944, 0.18181818181818182, 0.14583333333333334], [0.10416666666666667, 0.03389830508474576, 0.05555555555555555, 0.05, 0.14285714285714285, 0.07317073170731707, 0.08771929824561403, 0.018867924528301886, 0.2727272727272727, 0.20833333333333334]] 3 | Vicuna_7B = [[0.10416666666666667, 0.2711864406779661, 0.3055555555555556, 0.7, 0.49206349206349204, 0.3902439024390244, 0.6842105263157895, 0.8301886792452831, 0.9636363636363636, 0.9583333333333334], [0.10416666666666667, 0.2542372881355932, 0.3333333333333333, 0.6, 0.3968253968253968, 0.4146341463414634, 0.6140350877192983, 0.8679245283018868, 0.8909090909090909, 0.9166666666666666], [0.10416666666666667, 0.23728813559322035, 0.4444444444444444, 0.525, 0.6031746031746031, 0.5609756097560976, 0.6842105263157895, 0.9245283018867925, 0.8909090909090909, 0.9166666666666666], [0.020833333333333332, 0.13559322033898305, 0.2777777777777778, 0.425, 0.36507936507936506, 0.14634146341463414, 0.40350877192982454, 0.5094339622641509, 0.6909090909090909, 0.6875], [0.041666666666666664, 0.1016949152542373, 0.2222222222222222, 0.3, 0.2698412698412698, 0.24390243902439024, 0.3508771929824561, 0.5471698113207547, 0.7636363636363637, 0.8125]] 4 | Vicuna_13B = [[0.16666666666666666, 0.0847457627118644, 0.05555555555555555, 0.325, 0.3333333333333333, 0.1951219512195122, 0.543859649122807, 0.6415094339622641, 0.8181818181818182, 0.8541666666666666], [0.1875, 0.1016949152542373, 0.08333333333333333, 0.325, 0.19047619047619047, 0.1951219512195122, 0.5087719298245614, 0.7169811320754716, 0.7818181818181819, 0.8958333333333334], [0.125, 0.05084745762711865, 0.08333333333333333, 0.35, 0.31746031746031744, 0.1951219512195122, 0.40350877192982454, 0.41509433962264153, 0.8, 0.8125], [0.10416666666666667, 0.05084745762711865, 0.027777777777777776, 0.25, 0.20634920634920634, 0.17073170731707318, 0.3508771929824561, 0.660377358490566, 0.6, 0.8541666666666666], [0.125, 0.05084745762711865, 0.05555555555555555, 0.15, 0.30158730158730157, 0.17073170731707318, 0.19298245614035087, 0.41509433962264153, 0.6, 0.75]] 5 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.27 3 | adv_mnli,bd8ef0,acc_after,gen,39.67 4 | adv_mnli,bd8ef0,acc_before,gen,54.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.26 6 | adv_mnli_mm,bd8ef0,acc_after,gen,28.57 7 | adv_mnli_mm,bd8ef0,acc_before,gen,38.66 8 | adv_qnli,0b7326,acc_drop,gen,0.14 9 | adv_qnli,0b7326,acc_after,gen,54.89 10 | adv_qnli,0b7326,acc_before,gen,63.91 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.06 15 | adv_rte,8cc547,acc_after,gen,55.56 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,0.18 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,60.31 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.21 3 | 
adv_mnli,bd8ef0,acc_after,gen,39.67 4 | adv_mnli,bd8ef0,acc_before,gen,50.41 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.10 6 | adv_mnli_mm,bd8ef0,acc_after,gen,39.50 7 | adv_mnli_mm,bd8ef0,acc_before,gen,43.70 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.09 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,46.48 14 | adv_rte,8cc547,acc_drop,gen,0.04 15 | adv_rte,8cc547,acc_after,gen,55.56 16 | adv_rte,8cc547,acc_before,gen,58.02 17 | adv_sst2,ee8d3b,acc_drop,gen,0.03 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,51.15 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.17 3 | adv_mnli,bd8ef0,acc_after,gen,47.11 4 | adv_mnli,bd8ef0,acc_before,gen,57.02 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.31 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.09 7 | adv_mnli_mm,bd8ef0,acc_before,gen,45.38 8 | adv_qnli,0b7326,acc_drop,gen,0.28 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,69.92 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.10 3 | adv_mnli,bd8ef0,acc_after,gen,45.45 4 | adv_mnli,bd8ef0,acc_before,gen,50.41 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.37 6 | adv_mnli_mm,bd8ef0,acc_after,gen,27.73 7 | adv_mnli_mm,bd8ef0,acc_before,gen,43.70 8 | adv_qnli,0b7326,acc_drop,gen,0.25 9 | adv_qnli,0b7326,acc_after,gen,53.38 10 | adv_qnli,0b7326,acc_before,gen,71.43 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.16 3 | adv_mnli,bd8ef0,acc_after,gen,39.67 4 | adv_mnli,bd8ef0,acc_before,gen,47.11 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.29 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.09 7 | adv_mnli_mm,bd8ef0,acc_before,gen,43.70 8 | adv_qnli,0b7326,acc_drop,gen,0.15 9 | adv_qnli,0b7326,acc_after,gen,57.14 10 | adv_qnli,0b7326,acc_before,gen,66.92 11 | adv_qqp,cdc277,acc_drop,gen,-0.04 12 | adv_qqp,cdc277,acc_after,gen,39.44 13 | adv_qqp,cdc277,acc_before,gen,38.03 14 | adv_rte,8cc547,acc_drop,gen,-0.04 15 | 
adv_rte,8cc547,acc_after,gen,61.73 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,-0.09 18 | adv_sst2,ee8d3b,acc_after,gen,48.09 19 | adv_sst2,ee8d3b,acc_before,gen,44.27 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.32 3 | adv_mnli,bd8ef0,acc_after,gen,35.54 4 | adv_mnli,bd8ef0,acc_before,gen,52.07 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.14 6 | adv_mnli_mm,bd8ef0,acc_after,gen,30.25 7 | adv_mnli_mm,bd8ef0,acc_before,gen,35.29 8 | adv_qnli,0b7326,acc_drop,gen,0.14 9 | adv_qnli,0b7326,acc_after,gen,52.63 10 | adv_qnli,0b7326,acc_before,gen,60.90 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.08 15 | adv_rte,8cc547,acc_after,gen,60.49 16 | adv_rte,8cc547,acc_before,gen,65.43 17 | adv_sst2,ee8d3b,acc_drop,gen,0.18 18 | adv_sst2,ee8d3b,acc_after,gen,51.91 19 | adv_sst2,ee8d3b,acc_before,gen,63.36 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.25 3 | adv_mnli,bd8ef0,acc_after,gen,36.36 4 | adv_mnli,bd8ef0,acc_before,gen,48.76 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.31 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.05 7 | adv_mnli_mm,bd8ef0,acc_before,gen,37.82 8 | adv_qnli,0b7326,acc_drop,gen,0.19 9 | adv_qnli,0b7326,acc_after,gen,52.63 10 | adv_qnli,0b7326,acc_before,gen,64.66 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.08 15 | adv_rte,8cc547,acc_after,gen,59.26 16 | adv_rte,8cc547,acc_before,gen,64.20 17 | adv_sst2,ee8d3b,acc_drop,gen,0.12 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,56.49 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | adv_mnli,bd8ef0,acc_drop,gen,0.18 3 | adv_mnli,bd8ef0,acc_after,gen,45.45 4 | adv_mnli,bd8ef0,acc_before,gen,55.37 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.26 6 | adv_mnli_mm,bd8ef0,acc_after,gen,29.41 7 | adv_mnli_mm,bd8ef0,acc_before,gen,39.50 8 | adv_qnli,0b7326,acc_drop,gen,-0.01 9 | adv_qnli,0b7326,acc_after,gen,57.14 10 | adv_qnli,0b7326,acc_before,gen,56.39 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,45.07 14 | adv_rte,8cc547,acc_drop,gen,-0.04 15 | adv_rte,8cc547,acc_after,gen,59.26 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.13 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,58.02 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b 2 | 
adv_mnli,bd8ef0,acc_drop,gen,0.17 3 | adv_mnli,bd8ef0,acc_after,gen,45.45 4 | adv_mnli,bd8ef0,acc_before,gen,54.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.22 6 | adv_mnli_mm,bd8ef0,acc_after,gen,33.61 7 | adv_mnli_mm,bd8ef0,acc_before,gen,42.86 8 | adv_qnli,0b7326,acc_drop,gen,0.08 9 | adv_qnli,0b7326,acc_after,gen,55.64 10 | adv_qnli,0b7326,acc_before,gen,60.15 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.06 15 | adv_rte,8cc547,acc_after,gen,55.56 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,-0.06 18 | adv_sst2,ee8d3b,acc_after,gen,58.02 19 | adv_sst2,ee8d3b,acc_before,gen,54.96 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.35 3 | adv_mnli,bd8ef0,acc_after,gen,59.50 4 | adv_mnli,bd8ef0,acc_before,gen,91.74 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.17 6 | adv_mnli_mm,bd8ef0,acc_after,gen,73.11 7 | adv_mnli_mm,bd8ef0,acc_before,gen,88.24 8 | adv_qnli,0b7326,acc_drop,gen,0.19 9 | adv_qnli,0b7326,acc_after,gen,66.17 10 | adv_qnli,0b7326,acc_before,gen,81.20 11 | adv_qqp,cdc277,acc_drop,gen,0.18 12 | adv_qqp,cdc277,acc_after,gen,69.01 13 | adv_qqp,cdc277,acc_before,gen,84.51 14 | adv_rte,8cc547,acc_drop,gen,0.15 15 | adv_rte,8cc547,acc_after,gen,77.78 16 | adv_rte,8cc547,acc_before,gen,91.36 17 | adv_sst2,ee8d3b,acc_drop,gen,0.37 18 | adv_sst2,ee8d3b,acc_after,gen,55.73 19 | adv_sst2,ee8d3b,acc_before,gen,87.79 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,58.68 4 | adv_mnli,bd8ef0,acc_before,gen,77.69 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.18 6 | adv_mnli_mm,bd8ef0,acc_after,gen,70.59 7 | adv_mnli_mm,bd8ef0,acc_before,gen,85.71 8 | adv_qnli,0b7326,acc_drop,gen,0.18 9 | adv_qnli,0b7326,acc_after,gen,58.65 10 | adv_qnli,0b7326,acc_before,gen,71.43 11 | adv_qqp,cdc277,acc_drop,gen,0.23 12 | adv_qqp,cdc277,acc_after,gen,67.61 13 | adv_qqp,cdc277,acc_before,gen,87.32 14 | adv_rte,8cc547,acc_drop,gen,0.21 15 | adv_rte,8cc547,acc_after,gen,72.84 16 | adv_rte,8cc547,acc_before,gen,92.59 17 | adv_sst2,ee8d3b,acc_drop,gen,0.38 18 | adv_sst2,ee8d3b,acc_after,gen,53.44 19 | adv_sst2,ee8d3b,acc_before,gen,86.26 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.40 3 | adv_mnli,bd8ef0,acc_after,gen,52.89 4 | adv_mnli,bd8ef0,acc_before,gen,87.60 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,67.23 7 | adv_mnli_mm,bd8ef0,acc_before,gen,84.03 8 | adv_qnli,0b7326,acc_drop,gen,0.18 9 | adv_qnli,0b7326,acc_after,gen,69.17 10 | adv_qnli,0b7326,acc_before,gen,84.21 11 | adv_qqp,cdc277,acc_drop,gen,0.11 12 | adv_qqp,cdc277,acc_after,gen,76.06 13 | 
adv_qqp,cdc277,acc_before,gen,85.92 14 | adv_rte,8cc547,acc_drop,gen,0.21 15 | adv_rte,8cc547,acc_after,gen,74.07 16 | adv_rte,8cc547,acc_before,gen,93.83 17 | adv_sst2,ee8d3b,acc_drop,gen,0.32 18 | adv_sst2,ee8d3b,acc_after,gen,58.78 19 | adv_sst2,ee8d3b,acc_before,gen,87.02 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.31 3 | adv_mnli,bd8ef0,acc_after,gen,62.81 4 | adv_mnli,bd8ef0,acc_before,gen,90.91 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.19 6 | adv_mnli_mm,bd8ef0,acc_after,gen,71.43 7 | adv_mnli_mm,bd8ef0,acc_before,gen,88.24 8 | adv_qnli,0b7326,acc_drop,gen,0.19 9 | adv_qnli,0b7326,acc_after,gen,68.42 10 | adv_qnli,0b7326,acc_before,gen,84.21 11 | adv_qqp,cdc277,acc_drop,gen,0.21 12 | adv_qqp,cdc277,acc_after,gen,76.06 13 | adv_qqp,cdc277,acc_before,gen,95.77 14 | adv_rte,8cc547,acc_drop,gen,0.12 15 | adv_rte,8cc547,acc_after,gen,82.72 16 | adv_rte,8cc547,acc_before,gen,93.83 17 | adv_sst2,ee8d3b,acc_drop,gen,0.32 18 | adv_sst2,ee8d3b,acc_after,gen,60.31 19 | adv_sst2,ee8d3b,acc_before,gen,89.31 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.27 3 | adv_mnli,bd8ef0,acc_after,gen,61.98 4 | adv_mnli,bd8ef0,acc_before,gen,85.12 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,73.11 7 | adv_mnli_mm,bd8ef0,acc_before,gen,86.55 8 | adv_qnli,0b7326,acc_drop,gen,0.18 9 | adv_qnli,0b7326,acc_after,gen,63.91 10 | adv_qnli,0b7326,acc_before,gen,78.20 11 | adv_qqp,cdc277,acc_drop,gen,0.17 12 | adv_qqp,cdc277,acc_after,gen,70.42 13 | adv_qqp,cdc277,acc_before,gen,84.51 14 | adv_rte,8cc547,acc_drop,gen,0.09 15 | adv_rte,8cc547,acc_after,gen,79.01 16 | adv_rte,8cc547,acc_before,gen,86.42 17 | adv_sst2,ee8d3b,acc_drop,gen,0.29 18 | adv_sst2,ee8d3b,acc_after,gen,61.07 19 | adv_sst2,ee8d3b,acc_before,gen,85.50 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.33 3 | adv_mnli,bd8ef0,acc_after,gen,61.98 4 | adv_mnli,bd8ef0,acc_before,gen,92.56 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,71.43 7 | adv_mnli_mm,bd8ef0,acc_before,gen,89.08 8 | adv_qnli,0b7326,acc_drop,gen,0.13 9 | adv_qnli,0b7326,acc_after,gen,69.17 10 | adv_qnli,0b7326,acc_before,gen,79.70 11 | adv_qqp,cdc277,acc_drop,gen,0.12 12 | adv_qqp,cdc277,acc_after,gen,71.83 13 | adv_qqp,cdc277,acc_before,gen,81.69 14 | adv_rte,8cc547,acc_drop,gen,0.16 15 | adv_rte,8cc547,acc_after,gen,70.37 16 | adv_rte,8cc547,acc_before,gen,83.95 17 | adv_sst2,ee8d3b,acc_drop,gen,0.26 18 | adv_sst2,ee8d3b,acc_after,gen,62.60 19 | adv_sst2,ee8d3b,acc_before,gen,84.73 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w4a8.csv: 
-------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.32 3 | adv_mnli,bd8ef0,acc_after,gen,61.16 4 | adv_mnli,bd8ef0,acc_before,gen,90.08 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.19 6 | adv_mnli_mm,bd8ef0,acc_after,gen,73.95 7 | adv_mnli_mm,bd8ef0,acc_before,gen,90.76 8 | adv_qnli,0b7326,acc_drop,gen,0.18 9 | adv_qnli,0b7326,acc_after,gen,66.17 10 | adv_qnli,0b7326,acc_before,gen,80.45 11 | adv_qqp,cdc277,acc_drop,gen,0.21 12 | adv_qqp,cdc277,acc_after,gen,69.01 13 | adv_qqp,cdc277,acc_before,gen,87.32 14 | adv_rte,8cc547,acc_drop,gen,0.19 15 | adv_rte,8cc547,acc_after,gen,70.37 16 | adv_rte,8cc547,acc_before,gen,86.42 17 | adv_sst2,ee8d3b,acc_drop,gen,0.22 18 | adv_sst2,ee8d3b,acc_after,gen,64.12 19 | adv_sst2,ee8d3b,acc_before,gen,82.44 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.33 3 | adv_mnli,bd8ef0,acc_after,gen,61.16 4 | adv_mnli,bd8ef0,acc_before,gen,91.74 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,70.59 7 | adv_mnli_mm,bd8ef0,acc_before,gen,88.24 8 | adv_qnli,0b7326,acc_drop,gen,0.15 9 | adv_qnli,0b7326,acc_after,gen,69.92 10 | adv_qnli,0b7326,acc_before,gen,81.95 11 | adv_qqp,cdc277,acc_drop,gen,0.07 12 | adv_qqp,cdc277,acc_after,gen,74.65 13 | adv_qqp,cdc277,acc_before,gen,80.28 14 | adv_rte,8cc547,acc_drop,gen,0.14 15 | adv_rte,8cc547,acc_after,gen,77.78 16 | adv_rte,8cc547,acc_before,gen,90.12 17 | adv_sst2,ee8d3b,acc_drop,gen,0.36 18 | adv_sst2,ee8d3b,acc_after,gen,57.25 19 | adv_sst2,ee8d3b,acc_before,gen,89.31 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_chatglm3_6b_32k/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,chatglm3-6b-32k 2 | adv_mnli,bd8ef0,acc_drop,gen,0.37 3 | adv_mnli,bd8ef0,acc_after,gen,57.85 4 | adv_mnli,bd8ef0,acc_before,gen,91.74 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.18 6 | adv_mnli_mm,bd8ef0,acc_after,gen,72.27 7 | adv_mnli_mm,bd8ef0,acc_before,gen,88.24 8 | adv_qnli,0b7326,acc_drop,gen,0.17 9 | adv_qnli,0b7326,acc_after,gen,66.92 10 | adv_qnli,0b7326,acc_before,gen,80.45 11 | adv_qqp,cdc277,acc_drop,gen,0.21 12 | adv_qqp,cdc277,acc_after,gen,67.61 13 | adv_qqp,cdc277,acc_before,gen,85.92 14 | adv_rte,8cc547,acc_drop,gen,0.10 15 | adv_rte,8cc547,acc_after,gen,79.01 16 | adv_rte,8cc547,acc_before,gen,87.65 17 | adv_sst2,ee8d3b,acc_drop,gen,0.34 18 | adv_sst2,ee8d3b,acc_after,gen,57.25 19 | adv_sst2,ee8d3b,acc_before,gen,87.02 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,43.80 4 | adv_mnli,bd8ef0,acc_before,gen,54.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.17 6 | adv_mnli_mm,bd8ef0,acc_after,gen,42.02 7 | adv_mnli_mm,bd8ef0,acc_before,gen,50.42 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | 
adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.02 3 | adv_mnli,bd8ef0,acc_after,gen,44.63 4 | adv_mnli,bd8ef0,acc_before,gen,45.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.03 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,47.37 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.08 3 | adv_mnli,bd8ef0,acc_after,gen,48.76 4 | adv_mnli,bd8ef0,acc_before,gen,52.89 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.07 6 | adv_mnli_mm,bd8ef0,acc_after,gen,43.70 7 | adv_mnli_mm,bd8ef0,acc_before,gen,47.06 8 | adv_qnli,0b7326,acc_drop,gen,0.01 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,50.38 11 | adv_qqp,cdc277,acc_drop,gen,0.26 12 | adv_qqp,cdc277,acc_after,gen,49.30 13 | adv_qqp,cdc277,acc_before,gen,66.20 14 | adv_rte,8cc547,acc_drop,gen,0.06 15 | adv_rte,8cc547,acc_after,gen,60.49 16 | adv_rte,8cc547,acc_before,gen,64.20 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,43.80 4 | adv_mnli,bd8ef0,acc_before,gen,54.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,43.70 7 | adv_mnli_mm,bd8ef0,acc_before,gen,52.10 8 | adv_qnli,0b7326,acc_drop,gen,0.10 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,54.89 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | 
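Note on the adv_glue asset schema: each CSV stores results in long format, one row per (dataset, metric) pair, with the model's score in the last column. Throughout these assets, acc_drop is consistent (up to two-decimal rounding) with the relative degradation 1 - acc_after / acc_before, where acc_before and acc_after are presumably the accuracies (%) on the benign and adversarially perturbed GLUE inputs; a negative acc_drop means accuracy was higher on the adversarial set. A minimal sketch that recomputes the drop from one of these files; check_acc_drop is a hypothetical helper name, not a function in this repo:

import pandas as pd

def check_acc_drop(csv_path: str, model_col: str) -> pd.DataFrame:
    # Long format -> one row per dataset with acc_before/acc_after/acc_drop columns.
    df = pd.read_csv(csv_path)
    wide = df.pivot(index="dataset", columns="metric", values=model_col)
    # The stored acc_drop is the relative drop, rounded to two decimals.
    wide["acc_drop_recomputed"] = (1.0 - wide["acc_after"] / wide["acc_before"]).round(2)
    return wide[["acc_before", "acc_after", "acc_drop", "acc_drop_recomputed"]]

# e.g. check_acc_drop("hf_chatglm3_6b/fp16.csv", "chatglm3-6b") reproduces
# adv_mnli: acc_before=54.55, acc_after=39.67 -> acc_drop=0.27.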
-------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.06 3 | adv_mnli,bd8ef0,acc_after,gen,41.32 4 | adv_mnli,bd8ef0,acc_before,gen,38.84 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.09 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.93 7 | adv_mnli_mm,bd8ef0,acc_before,gen,29.41 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,47.37 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,57.75 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,59.26 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.11 3 | adv_mnli,bd8ef0,acc_after,gen,47.93 4 | adv_mnli,bd8ef0,acc_before,gen,53.72 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.06 6 | adv_mnli_mm,bd8ef0,acc_after,gen,55.46 7 | adv_mnli_mm,bd8ef0,acc_before,gen,58.82 8 | adv_qnli,0b7326,acc_drop,gen,0.17 9 | adv_qnli,0b7326,acc_after,gen,48.12 10 | adv_qnli,0b7326,acc_before,gen,57.89 11 | adv_qqp,cdc277,acc_drop,gen,-0.08 12 | adv_qqp,cdc277,acc_after,gen,59.15 13 | adv_qqp,cdc277,acc_before,gen,54.93 14 | adv_rte,8cc547,acc_drop,gen,-0.02 15 | adv_rte,8cc547,acc_after,gen,72.84 16 | adv_rte,8cc547,acc_before,gen,71.60 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.09 3 | adv_mnli,bd8ef0,acc_after,gen,48.76 4 | adv_mnli,bd8ef0,acc_before,gen,53.72 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.06 6 | adv_mnli_mm,bd8ef0,acc_after,gen,52.10 7 | adv_mnli_mm,bd8ef0,acc_before,gen,55.46 8 | adv_qnli,0b7326,acc_drop,gen,0.22 9 | adv_qnli,0b7326,acc_after,gen,48.12 10 | adv_qnli,0b7326,acc_before,gen,61.65 11 | adv_qqp,cdc277,acc_drop,gen,0.05 12 | adv_qqp,cdc277,acc_after,gen,52.11 13 | adv_qqp,cdc277,acc_before,gen,54.93 14 | adv_rte,8cc547,acc_drop,gen,-0.09 15 | adv_rte,8cc547,acc_after,gen,76.54 16 | adv_rte,8cc547,acc_before,gen,70.37 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,44.63 4 | adv_mnli,bd8ef0,acc_before,gen,57.02 5 | 
adv_mnli_mm,bd8ef0,acc_drop,gen,0.12 6 | adv_mnli_mm,bd8ef0,acc_after,gen,44.54 7 | adv_mnli_mm,bd8ef0,acc_before,gen,50.42 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,56.79 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_40b_instruct/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-40b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,43.80 4 | adv_mnli,bd8ef0,acc_before,gen,56.20 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.07 6 | adv_mnli_mm,bd8ef0,acc_after,gen,42.02 7 | adv_mnli_mm,bd8ef0,acc_before,gen,45.38 8 | adv_qnli,0b7326,acc_drop,gen,0.12 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,57.14 11 | adv_qqp,cdc277,acc_drop,gen,-0.11 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,-0.02 15 | adv_rte,8cc547,acc_after,gen,59.26 16 | adv_rte,8cc547,acc_before,gen,58.02 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,33.06 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.89 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,-0.07 9 | adv_qnli,0b7326,acc_after,gen,48.87 10 | adv_qnli,0b7326,acc_before,gen,45.86 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,-0.05 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,46.91 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,27.27 4 | adv_mnli,bd8ef0,acc_before,gen,33.88 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.30 6 | adv_mnli_mm,bd8ef0,acc_after,gen,29.41 7 | adv_mnli_mm,bd8ef0,acc_before,gen,22.69 8 | adv_qnli,0b7326,acc_drop,gen,-0.10 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,45.11 11 | adv_qqp,cdc277,acc_drop,gen,0.14 12 | adv_qqp,cdc277,acc_after,gen,43.66 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.09 15 | adv_rte,8cc547,acc_after,gen,39.51 16 
| adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.02 3 | adv_mnli,bd8ef0,acc_after,gen,33.88 4 | adv_mnli,bd8ef0,acc_before,gen,33.06 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.05 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.09 8 | adv_qnli,0b7326,acc_drop,gen,0.01 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,51.88 11 | adv_qqp,cdc277,acc_drop,gen,0.05 12 | adv_qqp,cdc277,acc_after,gen,54.93 13 | adv_qqp,cdc277,acc_before,gen,57.75 14 | adv_rte,8cc547,acc_drop,gen,-0.07 15 | adv_rte,8cc547,acc_after,gen,54.32 16 | adv_rte,8cc547,acc_before,gen,50.62 17 | adv_sst2,ee8d3b,acc_drop,gen,-0.03 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,48.85 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.05 3 | adv_mnli,bd8ef0,acc_after,gen,32.23 4 | adv_mnli,bd8ef0,acc_before,gen,30.58 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.89 7 | adv_mnli_mm,bd8ef0,acc_before,gen,30.25 8 | adv_qnli,0b7326,acc_drop,gen,-0.08 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,46.62 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,-0.05 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,46.91 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,32.23 4 | adv_mnli,bd8ef0,acc_before,gen,41.32 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.03 6 | adv_mnli_mm,bd8ef0,acc_after,gen,28.57 7 | adv_mnli_mm,bd8ef0,acc_before,gen,27.73 8 | adv_qnli,0b7326,acc_drop,gen,-0.06 9 | adv_qnli,0b7326,acc_after,gen,42.86 10 | adv_qnli,0b7326,acc_before,gen,40.60 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,46.91 16 | adv_rte,8cc547,acc_before,gen,46.91 17 | adv_sst2,ee8d3b,acc_drop,gen,0.04 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,52.67 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w4.csv: -------------------------------------------------------------------------------- 1 | 
dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.03 3 | adv_mnli,bd8ef0,acc_after,gen,31.40 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.04 6 | adv_mnli_mm,bd8ef0,acc_after,gen,22.69 7 | adv_mnli_mm,bd8ef0,acc_before,gen,23.53 8 | adv_qnli,0b7326,acc_drop,gen,-0.05 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,48.12 11 | adv_qqp,cdc277,acc_drop,gen,-0.03 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,46.48 14 | adv_rte,8cc547,acc_drop,gen,-0.10 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,51.85 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,0.03 3 | adv_mnli,bd8ef0,acc_after,gen,31.40 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.07 6 | adv_mnli_mm,bd8ef0,acc_after,gen,22.69 7 | adv_mnli_mm,bd8ef0,acc_before,gen,24.37 8 | adv_qnli,0b7326,acc_drop,gen,-0.05 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,48.12 11 | adv_qqp,cdc277,acc_drop,gen,0.09 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.05 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,51.85 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,33.06 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.16 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.89 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,-0.06 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,46.62 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,-0.05 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,46.91 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_falcon_7b_instruct/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,falcon-7b-instruct 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,31.40 4 | adv_mnli,bd8ef0,acc_before,gen,30.58 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,28.57 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,-0.08 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | 
adv_qqp,cdc277,acc_drop,gen,-0.06 12 | adv_qqp,cdc277,acc_after,gen,50.70 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,-0.11 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,44.44 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.16 3 | adv_mnli,bd8ef0,acc_after,gen,38.02 4 | adv_mnli,bd8ef0,acc_before,gen,45.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.09 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,36.13 8 | adv_qnli,0b7326,acc_drop,gen,0.09 9 | adv_qnli,0b7326,acc_after,gen,56.39 10 | adv_qnli,0b7326,acc_before,gen,61.65 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,62.96 16 | adv_rte,8cc547,acc_before,gen,62.96 17 | adv_sst2,ee8d3b,acc_drop,gen,0.26 18 | adv_sst2,ee8d3b,acc_after,gen,45.80 19 | adv_sst2,ee8d3b,acc_before,gen,61.83 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,28.93 4 | adv_mnli,bd8ef0,acc_before,gen,38.02 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.05 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,63.91 10 | adv_qnli,0b7326,acc_before,gen,68.42 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,45.07 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,58.02 16 | adv_rte,8cc547,acc_before,gen,58.02 17 | adv_sst2,ee8d3b,acc_drop,gen,0.06 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,53.44 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.33 3 | adv_mnli,bd8ef0,acc_after,gen,41.32 4 | adv_mnli,bd8ef0,acc_before,gen,61.98 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.25 6 | adv_mnli_mm,bd8ef0,acc_after,gen,39.50 7 | adv_mnli_mm,bd8ef0,acc_before,gen,52.94 8 | adv_qnli,0b7326,acc_drop,gen,0.09 9 | adv_qnli,0b7326,acc_after,gen,60.90 10 | adv_qnli,0b7326,acc_before,gen,66.92 11 | adv_qqp,cdc277,acc_drop,gen,-0.13 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.02 15 | adv_rte,8cc547,acc_after,gen,60.49 16 | adv_rte,8cc547,acc_before,gen,61.73 17 | adv_sst2,ee8d3b,acc_drop,gen,0.28 18 | adv_sst2,ee8d3b,acc_after,gen,45.04 19 | adv_sst2,ee8d3b,acc_before,gen,62.60 20 | -------------------------------------------------------------------------------- 
/qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.16 3 | adv_mnli,bd8ef0,acc_after,gen,38.02 4 | adv_mnli,bd8ef0,acc_before,gen,45.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.07 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.93 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.03 9 | adv_qnli,0b7326,acc_after,gen,57.89 10 | adv_qnli,0b7326,acc_before,gen,59.40 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,62.96 16 | adv_rte,8cc547,acc_before,gen,62.96 17 | adv_sst2,ee8d3b,acc_drop,gen,0.18 18 | adv_sst2,ee8d3b,acc_after,gen,45.04 19 | adv_sst2,ee8d3b,acc_before,gen,54.96 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,39.67 4 | adv_mnli,bd8ef0,acc_before,gen,49.59 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.08 6 | adv_mnli_mm,bd8ef0,acc_after,gen,50.42 7 | adv_mnli_mm,bd8ef0,acc_before,gen,54.62 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,60.15 10 | adv_qnli,0b7326,acc_before,gen,67.67 11 | adv_qqp,cdc277,acc_drop,gen,-0.26 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,38.03 14 | adv_rte,8cc547,acc_drop,gen,0.12 15 | adv_rte,8cc547,acc_after,gen,46.91 16 | adv_rte,8cc547,acc_before,gen,53.09 17 | adv_sst2,ee8d3b,acc_drop,gen,0.02 18 | adv_sst2,ee8d3b,acc_after,gen,47.33 19 | adv_sst2,ee8d3b,acc_before,gen,48.09 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.08 3 | adv_mnli,bd8ef0,acc_after,gen,29.75 4 | adv_mnli,bd8ef0,acc_before,gen,32.23 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.05 6 | adv_mnli_mm,bd8ef0,acc_after,gen,36.97 7 | adv_mnli_mm,bd8ef0,acc_before,gen,35.29 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,54.14 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,43.66 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,41.98 16 | adv_rte,8cc547,acc_before,gen,41.98 17 | adv_sst2,ee8d3b,acc_drop,gen,0.10 18 | adv_sst2,ee8d3b,acc_after,gen,48.85 19 | adv_sst2,ee8d3b,acc_before,gen,54.20 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,32.23 4 | adv_mnli,bd8ef0,acc_before,gen,31.40 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.00 6 | adv_mnli_mm,bd8ef0,acc_after,gen,40.34 7 | adv_mnli_mm,bd8ef0,acc_before,gen,40.34 8 | 
adv_qnli,0b7326,acc_drop,gen,0.05 9 | adv_qnli,0b7326,acc_after,gen,53.38 10 | adv_qnli,0b7326,acc_before,gen,56.39 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,47.89 14 | adv_rte,8cc547,acc_drop,gen,0.03 15 | adv_rte,8cc547,acc_after,gen,40.74 16 | adv_rte,8cc547,acc_before,gen,41.98 17 | adv_sst2,ee8d3b,acc_drop,gen,0.12 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,56.49 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.15 3 | adv_mnli,bd8ef0,acc_after,gen,38.84 4 | adv_mnli,bd8ef0,acc_before,gen,45.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.14 6 | adv_mnli_mm,bd8ef0,acc_after,gen,31.09 7 | adv_mnli_mm,bd8ef0,acc_before,gen,36.13 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,58.65 10 | adv_qnli,0b7326,acc_before,gen,58.65 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,47.89 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.06 15 | adv_rte,8cc547,acc_after,gen,62.96 16 | adv_rte,8cc547,acc_before,gen,66.67 17 | adv_sst2,ee8d3b,acc_drop,gen,0.11 18 | adv_sst2,ee8d3b,acc_after,gen,48.09 19 | adv_sst2,ee8d3b,acc_before,gen,54.20 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_13b_chat/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-13b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.29 3 | adv_mnli,bd8ef0,acc_after,gen,37.19 4 | adv_mnli,bd8ef0,acc_before,gen,52.07 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.35 6 | adv_mnli_mm,bd8ef0,acc_after,gen,26.05 7 | adv_mnli_mm,bd8ef0,acc_before,gen,40.34 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,57.14 10 | adv_qnli,0b7326,acc_before,gen,61.65 11 | adv_qqp,cdc277,acc_drop,gen,-0.23 12 | adv_qqp,cdc277,acc_after,gen,52.11 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.04 15 | adv_rte,8cc547,acc_after,gen,56.79 16 | adv_rte,8cc547,acc_before,gen,59.26 17 | adv_sst2,ee8d3b,acc_drop,gen,0.15 18 | adv_sst2,ee8d3b,acc_after,gen,52.67 19 | adv_sst2,ee8d3b,acc_before,gen,61.83 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.25 3 | adv_mnli,bd8ef0,acc_after,gen,48.76 4 | adv_mnli,bd8ef0,acc_before,gen,65.29 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.23 6 | adv_mnli_mm,bd8ef0,acc_after,gen,52.10 7 | adv_mnli_mm,bd8ef0,acc_before,gen,68.07 8 | adv_qnli,0b7326,acc_drop,gen,0.10 9 | adv_qnli,0b7326,acc_after,gen,63.91 10 | adv_qnli,0b7326,acc_before,gen,70.68 11 | adv_qqp,cdc277,acc_drop,gen,-0.32 12 | adv_qqp,cdc277,acc_after,gen,46.48 13 | adv_qqp,cdc277,acc_before,gen,35.21 14 | adv_rte,8cc547,acc_drop,gen,0.19 15 | adv_rte,8cc547,acc_after,gen,64.20 16 | adv_rte,8cc547,acc_before,gen,79.01 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | 
adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.77 3 | adv_mnli,bd8ef0,acc_after,gen,19.01 4 | adv_mnli,bd8ef0,acc_before,gen,10.74 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.17 6 | adv_mnli_mm,bd8ef0,acc_after,gen,15.97 7 | adv_mnli_mm,bd8ef0,acc_before,gen,19.33 8 | adv_qnli,0b7326,acc_drop,gen,-0.05 9 | adv_qnli,0b7326,acc_after,gen,33.08 10 | adv_qnli,0b7326,acc_before,gen,31.58 11 | adv_qqp,cdc277,acc_drop,gen,-1.77 12 | adv_qqp,cdc277,acc_after,gen,50.70 13 | adv_qqp,cdc277,acc_before,gen,18.31 14 | adv_rte,8cc547,acc_drop,gen,-0.75 15 | adv_rte,8cc547,acc_after,gen,17.28 16 | adv_rte,8cc547,acc_before,gen,9.88 17 | adv_sst2,ee8d3b,acc_drop,gen,0.18 18 | adv_sst2,ee8d3b,acc_after,gen,28.24 19 | adv_sst2,ee8d3b,acc_before,gen,34.35 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.00 3 | adv_mnli,bd8ef0,acc_after,gen,14.88 4 | adv_mnli,bd8ef0,acc_before,gen,14.88 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.47 6 | adv_mnli_mm,bd8ef0,acc_after,gen,14.29 7 | adv_mnli_mm,bd8ef0,acc_before,gen,26.89 8 | adv_qnli,0b7326,acc_drop,gen,0.20 9 | adv_qnli,0b7326,acc_after,gen,29.32 10 | adv_qnli,0b7326,acc_before,gen,36.84 11 | adv_qqp,cdc277,acc_drop,gen,0.33 12 | adv_qqp,cdc277,acc_after,gen,33.80 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.37 15 | adv_rte,8cc547,acc_after,gen,14.81 16 | adv_rte,8cc547,acc_before,gen,23.46 17 | adv_sst2,ee8d3b,acc_drop,gen,0.38 18 | adv_sst2,ee8d3b,acc_after,gen,19.08 19 | adv_sst2,ee8d3b,acc_before,gen,30.53 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.13 3 | adv_mnli,bd8ef0,acc_after,gen,14.05 4 | adv_mnli,bd8ef0,acc_before,gen,12.40 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.13 6 | adv_mnli_mm,bd8ef0,acc_after,gen,23.53 7 | adv_mnli_mm,bd8ef0,acc_before,gen,26.89 8 | adv_qnli,0b7326,acc_drop,gen,-0.14 9 | adv_qnli,0b7326,acc_after,gen,30.08 10 | adv_qnli,0b7326,acc_before,gen,26.32 11 | adv_qqp,cdc277,acc_drop,gen,0.42 12 | adv_qqp,cdc277,acc_after,gen,29.58 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,-0.25 15 | adv_rte,8cc547,acc_after,gen,12.35 16 | adv_rte,8cc547,acc_before,gen,9.88 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,25.95 19 | adv_sst2,ee8d3b,acc_before,gen,25.95 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,45.45 4 | adv_mnli,bd8ef0,acc_before,gen,59.50 5 | 
adv_mnli_mm,bd8ef0,acc_drop,gen,0.23 6 | adv_mnli_mm,bd8ef0,acc_after,gen,47.90 7 | adv_mnli_mm,bd8ef0,acc_before,gen,62.18 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,54.14 10 | adv_qnli,0b7326,acc_before,gen,60.90 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,50.70 13 | adv_qqp,cdc277,acc_before,gen,52.11 14 | adv_rte,8cc547,acc_drop,gen,0.13 15 | adv_rte,8cc547,acc_after,gen,72.84 16 | adv_rte,8cc547,acc_before,gen,83.95 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.15 3 | adv_mnli,bd8ef0,acc_after,gen,50.41 4 | adv_mnli,bd8ef0,acc_before,gen,59.50 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.24 6 | adv_mnli_mm,bd8ef0,acc_after,gen,51.26 7 | adv_mnli_mm,bd8ef0,acc_before,gen,67.23 8 | adv_qnli,0b7326,acc_drop,gen,0.10 9 | adv_qnli,0b7326,acc_after,gen,62.41 10 | adv_qnli,0b7326,acc_before,gen,69.17 11 | adv_qqp,cdc277,acc_drop,gen,-0.13 12 | adv_qqp,cdc277,acc_after,gen,49.30 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.14 15 | adv_rte,8cc547,acc_after,gen,67.90 16 | adv_rte,8cc547,acc_before,gen,79.01 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.16 3 | adv_mnli,bd8ef0,acc_after,gen,50.41 4 | adv_mnli,bd8ef0,acc_before,gen,60.33 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.27 6 | adv_mnli_mm,bd8ef0,acc_after,gen,46.22 7 | adv_mnli_mm,bd8ef0,acc_before,gen,63.03 8 | adv_qnli,0b7326,acc_drop,gen,0.02 9 | adv_qnli,0b7326,acc_after,gen,64.66 10 | adv_qnli,0b7326,acc_before,gen,66.17 11 | adv_qqp,cdc277,acc_drop,gen,-0.19 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,38.03 14 | adv_rte,8cc547,acc_drop,gen,0.18 15 | adv_rte,8cc547,acc_after,gen,67.90 16 | adv_rte,8cc547,acc_before,gen,82.72 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.21 3 | adv_mnli,bd8ef0,acc_after,gen,47.93 4 | adv_mnli,bd8ef0,acc_before,gen,60.33 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.23 6 | adv_mnli_mm,bd8ef0,acc_after,gen,52.10 7 | adv_mnli_mm,bd8ef0,acc_before,gen,68.07 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,63.16 10 | adv_qnli,0b7326,acc_before,gen,70.68 11 | adv_qqp,cdc277,acc_drop,gen,-0.15 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,36.62 14 | adv_rte,8cc547,acc_drop,gen,0.13 15 | adv_rte,8cc547,acc_after,gen,64.20 16 | 
adv_rte,8cc547,acc_before,gen,74.07 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_70b_chat/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-70b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,48.76 4 | adv_mnli,bd8ef0,acc_before,gen,62.81 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.22 6 | adv_mnli_mm,bd8ef0,acc_after,gen,51.26 7 | adv_mnli_mm,bd8ef0,acc_before,gen,65.55 8 | adv_qnli,0b7326,acc_drop,gen,0.09 9 | adv_qnli,0b7326,acc_after,gen,64.66 10 | adv_qnli,0b7326,acc_before,gen,70.68 11 | adv_qqp,cdc277,acc_drop,gen,-0.10 12 | adv_qqp,cdc277,acc_after,gen,46.48 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,64.20 16 | adv_rte,8cc547,acc_before,gen,69.14 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.06 3 | adv_mnli,bd8ef0,acc_after,gen,23.97 4 | adv_mnli,bd8ef0,acc_before,gen,25.62 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.05 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,52.63 10 | adv_qnli,0b7326,acc_before,gen,56.39 11 | adv_qqp,cdc277,acc_drop,gen,0.11 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,51.85 16 | adv_rte,8cc547,acc_before,gen,55.56 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,26.45 4 | adv_mnli,bd8ef0,acc_before,gen,34.71 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.04 6 | adv_mnli_mm,bd8ef0,acc_after,gen,36.97 7 | adv_mnli_mm,bd8ef0,acc_before,gen,38.66 8 | adv_qnli,0b7326,acc_drop,gen,-0.02 9 | adv_qnli,0b7326,acc_after,gen,48.12 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.13 12 | adv_qqp,cdc277,acc_after,gen,46.48 13 | adv_qqp,cdc277,acc_before,gen,53.52 14 | adv_rte,8cc547,acc_drop,gen,0.25 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,65.43 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | 
adv_mnli,bd8ef0,acc_drop,gen,0.06 3 | adv_mnli,bd8ef0,acc_after,gen,24.79 4 | adv_mnli,bd8ef0,acc_before,gen,26.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.00 6 | adv_mnli_mm,bd8ef0,acc_after,gen,33.61 7 | adv_mnli_mm,bd8ef0,acc_before,gen,33.61 8 | adv_qnli,0b7326,acc_drop,gen,0.04 9 | adv_qnli,0b7326,acc_after,gen,55.64 10 | adv_qnli,0b7326,acc_before,gen,57.89 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,42.25 13 | adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,-0.19 15 | adv_rte,8cc547,acc_after,gen,54.32 16 | adv_rte,8cc547,acc_before,gen,45.68 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.06 3 | adv_mnli,bd8ef0,acc_after,gen,23.97 4 | adv_mnli,bd8ef0,acc_before,gen,25.62 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.00 6 | adv_mnli_mm,bd8ef0,acc_after,gen,34.45 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,51.88 10 | adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,0.16 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,53.52 14 | adv_rte,8cc547,acc_drop,gen,0.02 15 | adv_rte,8cc547,acc_after,gen,51.85 16 | adv_rte,8cc547,acc_before,gen,53.09 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.10 3 | adv_mnli,bd8ef0,acc_after,gen,36.36 4 | adv_mnli,bd8ef0,acc_before,gen,40.50 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,40.34 7 | adv_mnli_mm,bd8ef0,acc_before,gen,45.38 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,46.62 10 | adv_qnli,0b7326,acc_before,gen,52.63 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,59.15 13 | adv_qqp,cdc277,acc_before,gen,57.75 14 | adv_rte,8cc547,acc_drop,gen,-0.04 15 | adv_rte,8cc547,acc_after,gen,60.49 16 | adv_rte,8cc547,acc_before,gen,58.02 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.00 3 | adv_mnli,bd8ef0,acc_after,gen,23.97 4 | adv_mnli,bd8ef0,acc_before,gen,23.97 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,35.29 7 | adv_mnli_mm,bd8ef0,acc_before,gen,31.93 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,47.37 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.16 12 | adv_qqp,cdc277,acc_after,gen,36.62 13 | 
adv_qqp,cdc277,acc_before,gen,43.66 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,50.62 16 | adv_rte,8cc547,acc_before,gen,54.32 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,-0.03 3 | adv_mnli,bd8ef0,acc_after,gen,27.27 4 | adv_mnli,bd8ef0,acc_before,gen,26.45 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,-0.08 6 | adv_mnli_mm,bd8ef0,acc_after,gen,35.29 7 | adv_mnli_mm,bd8ef0,acc_before,gen,32.77 8 | adv_qnli,0b7326,acc_drop,gen,0.00 9 | adv_qnli,0b7326,acc_after,gen,47.37 10 | adv_qnli,0b7326,acc_before,gen,47.37 11 | adv_qqp,cdc277,acc_drop,gen,0.03 12 | adv_qqp,cdc277,acc_after,gen,40.85 13 | adv_qqp,cdc277,acc_before,gen,42.25 14 | adv_rte,8cc547,acc_drop,gen,0.09 15 | adv_rte,8cc547,acc_after,gen,49.38 16 | adv_rte,8cc547,acc_before,gen,54.32 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.18 3 | adv_mnli,bd8ef0,acc_after,gen,23.14 4 | adv_mnli,bd8ef0,acc_before,gen,28.10 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.05 6 | adv_mnli_mm,bd8ef0,acc_after,gen,32.77 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,52.63 10 | adv_qnli,0b7326,acc_before,gen,56.39 11 | adv_qqp,cdc277,acc_drop,gen,0.11 12 | adv_qqp,cdc277,acc_after,gen,45.07 13 | adv_qqp,cdc277,acc_before,gen,50.70 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,51.85 16 | adv_rte,8cc547,acc_before,gen,55.56 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_llama2_7b_chat/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,llama2-7b-chat 2 | adv_mnli,bd8ef0,acc_drop,gen,0.12 3 | adv_mnli,bd8ef0,acc_after,gen,24.79 4 | adv_mnli,bd8ef0,acc_before,gen,28.10 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.02 6 | adv_mnli_mm,bd8ef0,acc_after,gen,33.61 7 | adv_mnli_mm,bd8ef0,acc_before,gen,34.45 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,51.88 10 | adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,-0.06 12 | adv_qqp,cdc277,acc_after,gen,52.11 13 | adv_qqp,cdc277,acc_before,gen,49.30 14 | adv_rte,8cc547,acc_drop,gen,0.05 15 | adv_rte,8cc547,acc_after,gen,64.20 16 | adv_rte,8cc547,acc_before,gen,67.90 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,50.38 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/fp16.csv: 
-------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,57.02 4 | adv_mnli,bd8ef0,acc_before,gen,72.73 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,64.71 7 | adv_mnli_mm,bd8ef0,acc_before,gen,80.67 8 | adv_qnli,0b7326,acc_drop,gen,0.06 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.06 15 | adv_rte,8cc547,acc_after,gen,45.68 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.29 3 | adv_mnli,bd8ef0,acc_after,gen,53.72 4 | adv_mnli,bd8ef0,acc_before,gen,76.03 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.15 6 | adv_mnli_mm,bd8ef0,acc_after,gen,63.87 7 | adv_mnli_mm,bd8ef0,acc_before,gen,74.79 8 | adv_qnli,0b7326,acc_drop,gen,0.05 9 | adv_qnli,0b7326,acc_after,gen,58.65 10 | adv_qnli,0b7326,acc_before,gen,61.65 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.03 15 | adv_rte,8cc547,acc_after,gen,41.98 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,54.55 4 | adv_mnli,bd8ef0,acc_before,gen,67.77 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,62.18 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.15 8 | adv_qnli,0b7326,acc_drop,gen,0.08 9 | adv_qnli,0b7326,acc_after,gen,49.62 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.03 15 | adv_rte,8cc547,acc_after,gen,41.98 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,57.02 4 | adv_mnli,bd8ef0,acc_before,gen,72.73 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,64.71 7 | adv_mnli_mm,bd8ef0,acc_before,gen,80.67 8 | adv_qnli,0b7326,acc_drop,gen,0.06 9 | 
adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.03 15 | adv_rte,8cc547,acc_after,gen,45.68 16 | adv_rte,8cc547,acc_before,gen,44.44 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,52.89 4 | adv_mnli,bd8ef0,acc_before,gen,67.77 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.19 6 | adv_mnli_mm,bd8ef0,acc_after,gen,56.30 7 | adv_mnli_mm,bd8ef0,acc_before,gen,69.75 8 | adv_qnli,0b7326,acc_drop,gen,0.12 9 | adv_qnli,0b7326,acc_after,gen,50.38 10 | adv_qnli,0b7326,acc_before,gen,57.14 11 | adv_qqp,cdc277,acc_drop,gen,-0.05 12 | adv_qqp,cdc277,acc_after,gen,60.56 13 | adv_qqp,cdc277,acc_before,gen,57.75 14 | adv_rte,8cc547,acc_drop,gen,-0.03 15 | adv_rte,8cc547,acc_after,gen,44.44 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.02 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,50.38 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.21 3 | adv_mnli,bd8ef0,acc_after,gen,57.85 4 | adv_mnli,bd8ef0,acc_before,gen,73.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.15 6 | adv_mnli_mm,bd8ef0,acc_after,gen,64.71 7 | adv_mnli_mm,bd8ef0,acc_before,gen,76.47 8 | adv_qnli,0b7326,acc_drop,gen,0.04 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,53.38 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.16 15 | adv_rte,8cc547,acc_after,gen,45.68 16 | adv_rte,8cc547,acc_before,gen,39.51 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.18 3 | adv_mnli,bd8ef0,acc_after,gen,60.33 4 | adv_mnli,bd8ef0,acc_before,gen,73.55 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.19 6 | adv_mnli_mm,bd8ef0,acc_after,gen,63.87 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.99 8 | adv_qnli,0b7326,acc_drop,gen,0.04 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,53.38 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.08 15 | adv_rte,8cc547,acc_after,gen,53.09 16 | adv_rte,8cc547,acc_before,gen,49.38 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | 
adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,57.85 4 | adv_mnli,bd8ef0,acc_before,gen,76.03 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.21 6 | adv_mnli_mm,bd8ef0,acc_after,gen,62.18 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.99 8 | adv_qnli,0b7326,acc_drop,gen,0.06 9 | adv_qnli,0b7326,acc_after,gen,51.13 10 | adv_qnli,0b7326,acc_before,gen,54.14 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,-0.06 15 | adv_rte,8cc547,acc_after,gen,45.68 16 | adv_rte,8cc547,acc_before,gen,43.21 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mistral_7b_instruct/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mistral-7b-instruct-v0.2 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | adv_mnli,bd8ef0,acc_after,gen,57.85 4 | adv_mnli,bd8ef0,acc_before,gen,72.73 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.20 6 | adv_mnli_mm,bd8ef0,acc_after,gen,63.03 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.99 8 | adv_qnli,0b7326,acc_drop,gen,0.03 9 | adv_qnli,0b7326,acc_after,gen,54.14 10 | adv_qnli,0b7326,acc_before,gen,55.64 11 | adv_qqp,cdc277,acc_drop,gen,-0.02 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.03 15 | adv_rte,8cc547,acc_after,gen,43.21 16 | adv_rte,8cc547,acc_before,gen,44.44 17 | adv_sst2,ee8d3b,acc_drop,gen,0.00 18 | adv_sst2,ee8d3b,acc_after,gen,49.62 19 | adv_sst2,ee8d3b,acc_before,gen,49.62 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/fp16.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,60.33 4 | adv_mnli,bd8ef0,acc_before,gen,79.34 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.11 6 | adv_mnli_mm,bd8ef0,acc_after,gen,74.79 7 | adv_mnli_mm,bd8ef0,acc_before,gen,84.03 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,75.19 10 | adv_qnli,0b7326,acc_before,gen,81.20 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,61.97 13 | adv_qqp,cdc277,acc_before,gen,66.20 14 | adv_rte,8cc547,acc_drop,gen,0.09 15 | adv_rte,8cc547,acc_after,gen,71.60 16 | adv_rte,8cc547,acc_before,gen,79.01 17 | adv_sst2,ee8d3b,acc_drop,gen,0.31 18 | adv_sst2,ee8d3b,acc_after,gen,58.78 19 | adv_sst2,ee8d3b,acc_before,gen,84.73 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/kv3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.20 3 | 
adv_mnli,bd8ef0,acc_after,gen,62.81 4 | adv_mnli,bd8ef0,acc_before,gen,78.51 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.10 6 | adv_mnli_mm,bd8ef0,acc_after,gen,68.91 7 | adv_mnli_mm,bd8ef0,acc_before,gen,76.47 8 | adv_qnli,0b7326,acc_drop,gen,0.02 9 | adv_qnli,0b7326,acc_after,gen,71.43 10 | adv_qnli,0b7326,acc_before,gen,72.93 11 | adv_qqp,cdc277,acc_drop,gen,0.00 12 | adv_qqp,cdc277,acc_after,gen,56.34 13 | adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.08 15 | adv_rte,8cc547,acc_after,gen,66.67 16 | adv_rte,8cc547,acc_before,gen,72.84 17 | adv_sst2,ee8d3b,acc_drop,gen,0.19 18 | adv_sst2,ee8d3b,acc_after,gen,51.15 19 | adv_sst2,ee8d3b,acc_before,gen,63.36 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/kv4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.14 3 | adv_mnli,bd8ef0,acc_after,gen,61.98 4 | adv_mnli,bd8ef0,acc_before,gen,71.90 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.12 6 | adv_mnli_mm,bd8ef0,acc_after,gen,71.43 7 | adv_mnli_mm,bd8ef0,acc_before,gen,81.51 8 | adv_qnli,0b7326,acc_drop,gen,0.06 9 | adv_qnli,0b7326,acc_after,gen,72.93 10 | adv_qnli,0b7326,acc_before,gen,77.44 11 | adv_qqp,cdc277,acc_drop,gen,0.09 12 | adv_qqp,cdc277,acc_after,gen,59.15 13 | adv_qqp,cdc277,acc_before,gen,64.79 14 | adv_rte,8cc547,acc_drop,gen,0.00 15 | adv_rte,8cc547,acc_after,gen,71.60 16 | adv_rte,8cc547,acc_before,gen,71.60 17 | adv_sst2,ee8d3b,acc_drop,gen,0.25 18 | adv_sst2,ee8d3b,acc_after,gen,58.78 19 | adv_sst2,ee8d3b,acc_before,gen,78.63 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/kv8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,61.16 4 | adv_mnli,bd8ef0,acc_before,gen,78.51 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.15 6 | adv_mnli_mm,bd8ef0,acc_after,gen,72.27 7 | adv_mnli_mm,bd8ef0,acc_before,gen,84.87 8 | adv_qnli,0b7326,acc_drop,gen,0.05 9 | adv_qnli,0b7326,acc_after,gen,74.44 10 | adv_qnli,0b7326,acc_before,gen,78.20 11 | adv_qqp,cdc277,acc_drop,gen,0.06 12 | adv_qqp,cdc277,acc_after,gen,64.79 13 | adv_qqp,cdc277,acc_before,gen,69.01 14 | adv_rte,8cc547,acc_drop,gen,0.07 15 | adv_rte,8cc547,acc_after,gen,70.37 16 | adv_rte,8cc547,acc_before,gen,75.31 17 | adv_sst2,ee8d3b,acc_drop,gen,0.31 18 | adv_sst2,ee8d3b,acc_after,gen,58.78 19 | adv_sst2,ee8d3b,acc_before,gen,84.73 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w3.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.22 3 | adv_mnli,bd8ef0,acc_after,gen,59.50 4 | adv_mnli,bd8ef0,acc_before,gen,76.03 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.17 6 | adv_mnli_mm,bd8ef0,acc_after,gen,53.78 7 | adv_mnli_mm,bd8ef0,acc_before,gen,64.71 8 | adv_qnli,0b7326,acc_drop,gen,0.08 9 | adv_qnli,0b7326,acc_after,gen,75.94 10 | adv_qnli,0b7326,acc_before,gen,82.71 11 | adv_qqp,cdc277,acc_drop,gen,-0.07 12 | adv_qqp,cdc277,acc_after,gen,60.56 13 | 
adv_qqp,cdc277,acc_before,gen,56.34 14 | adv_rte,8cc547,acc_drop,gen,0.13 15 | adv_rte,8cc547,acc_after,gen,76.54 16 | adv_rte,8cc547,acc_before,gen,87.65 17 | adv_sst2,ee8d3b,acc_drop,gen,0.26 18 | adv_sst2,ee8d3b,acc_after,gen,53.44 19 | adv_sst2,ee8d3b,acc_before,gen,71.76 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w4.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.21 3 | adv_mnli,bd8ef0,acc_after,gen,63.64 4 | adv_mnli,bd8ef0,acc_before,gen,80.99 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.13 6 | adv_mnli_mm,bd8ef0,acc_after,gen,70.59 7 | adv_mnli_mm,bd8ef0,acc_before,gen,81.51 8 | adv_qnli,0b7326,acc_drop,gen,0.14 9 | adv_qnli,0b7326,acc_after,gen,73.68 10 | adv_qnli,0b7326,acc_before,gen,85.71 11 | adv_qqp,cdc277,acc_drop,gen,0.07 12 | adv_qqp,cdc277,acc_after,gen,60.56 13 | adv_qqp,cdc277,acc_before,gen,64.79 14 | adv_rte,8cc547,acc_drop,gen,0.14 15 | adv_rte,8cc547,acc_after,gen,69.14 16 | adv_rte,8cc547,acc_before,gen,80.25 17 | adv_sst2,ee8d3b,acc_drop,gen,0.25 18 | adv_sst2,ee8d3b,acc_after,gen,59.54 19 | adv_sst2,ee8d3b,acc_before,gen,79.39 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w4a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.19 3 | adv_mnli,bd8ef0,acc_after,gen,65.29 4 | adv_mnli,bd8ef0,acc_before,gen,80.99 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.13 6 | adv_mnli_mm,bd8ef0,acc_after,gen,68.91 7 | adv_mnli_mm,bd8ef0,acc_before,gen,78.99 8 | adv_qnli,0b7326,acc_drop,gen,0.11 9 | adv_qnli,0b7326,acc_after,gen,76.69 10 | adv_qnli,0b7326,acc_before,gen,85.71 11 | adv_qqp,cdc277,acc_drop,gen,0.07 12 | adv_qqp,cdc277,acc_after,gen,57.75 13 | adv_qqp,cdc277,acc_before,gen,61.97 14 | adv_rte,8cc547,acc_drop,gen,0.02 15 | adv_rte,8cc547,acc_after,gen,72.84 16 | adv_rte,8cc547,acc_before,gen,74.07 17 | adv_sst2,ee8d3b,acc_drop,gen,0.23 18 | adv_sst2,ee8d3b,acc_after,gen,61.07 19 | adv_sst2,ee8d3b,acc_before,gen,79.39 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.24 3 | adv_mnli,bd8ef0,acc_after,gen,59.50 4 | adv_mnli,bd8ef0,acc_before,gen,78.51 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.12 6 | adv_mnli_mm,bd8ef0,acc_after,gen,72.27 7 | adv_mnli_mm,bd8ef0,acc_before,gen,82.35 8 | adv_qnli,0b7326,acc_drop,gen,0.07 9 | adv_qnli,0b7326,acc_after,gen,75.19 10 | adv_qnli,0b7326,acc_before,gen,81.20 11 | adv_qqp,cdc277,acc_drop,gen,0.04 12 | adv_qqp,cdc277,acc_after,gen,61.97 13 | adv_qqp,cdc277,acc_before,gen,64.79 14 | adv_rte,8cc547,acc_drop,gen,0.12 15 | adv_rte,8cc547,acc_after,gen,70.37 16 | adv_rte,8cc547,acc_before,gen,80.25 17 | adv_sst2,ee8d3b,acc_drop,gen,0.30 18 | adv_sst2,ee8d3b,acc_after,gen,56.49 19 | adv_sst2,ee8d3b,acc_before,gen,80.15 20 | -------------------------------------------------------------------------------- 
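Note: the adv_glue asset CSVs in this directory all share one five-column layout (dataset, version, metric, mode, and a score column whose header is the model name), with one acc_drop / acc_after / acc_before block per AdvGLUE subtask. Consistent with the stored values (e.g. llama2-13b-chat kv8, adv_mnli: 1 - 38.02/45.45 = 0.16), acc_drop appears to be the relative accuracy drop 1 - acc_after / acc_before rounded to two decimals, not the absolute difference. A minimal sketch that loads one of these files and re-derives acc_drop; load_adv_glue is a hypothetical helper, not part of the repo:

import csv

def load_adv_glue(path):
    # hypothetical helper: parse one asset CSV into {dataset: {metric: score}};
    # the last column's header varies by model, so rows are unpacked by position
    table = {}
    with open(path, newline='') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for dataset, _version, metric, _mode, score in reader:
            table.setdefault(dataset, {})[metric] = float(score)
    return table

scores = load_adv_glue('assets/hf_llama2_13b_chat/kv8.csv')
for dataset, m in scores.items():
    derived = round(1.0 - m['acc_after'] / m['acc_before'], 2)
    print(dataset, m['acc_drop'], derived)  # the two values should agree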
/qllm_eval/visualization/trustworthiness/adv_glue/assets/hf_mixtral_8x7b_instruct/w8a8.csv: -------------------------------------------------------------------------------- 1 | dataset,version,metric,mode,mixtral-8x7b-instruct-v0.1 2 | adv_mnli,bd8ef0,acc_drop,gen,0.18 3 | adv_mnli,bd8ef0,acc_after,gen,65.29 4 | adv_mnli,bd8ef0,acc_before,gen,79.34 5 | adv_mnli_mm,bd8ef0,acc_drop,gen,0.15 6 | adv_mnli_mm,bd8ef0,acc_after,gen,71.43 7 | adv_mnli_mm,bd8ef0,acc_before,gen,84.03 8 | adv_qnli,0b7326,acc_drop,gen,0.05 9 | adv_qnli,0b7326,acc_after,gen,75.19 10 | adv_qnli,0b7326,acc_before,gen,78.95 11 | adv_qqp,cdc277,acc_drop,gen,0.08 12 | adv_qqp,cdc277,acc_after,gen,63.38 13 | adv_qqp,cdc277,acc_before,gen,69.01 14 | adv_rte,8cc547,acc_drop,gen,0.02 15 | adv_rte,8cc547,acc_after,gen,74.07 16 | adv_rte,8cc547,acc_before,gen,75.31 17 | adv_sst2,ee8d3b,acc_drop,gen,0.27 18 | adv_sst2,ee8d3b,acc_after,gen,56.49 19 | adv_sst2,ee8d3b,acc_before,gen,77.10 20 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/adv_glue/proc_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | 4 | root_path = 'outputs' 5 | out_path = 'assets' 6 | 7 | # create the output folders; tolerate re-runs 8 | os.makedirs(out_path, exist_ok=True) 9 | 10 | for model in os.listdir(root_path): 11 | os.makedirs(os.path.join(out_path, model), exist_ok=True) 12 | for precision in os.listdir(os.path.join(root_path, model)): 13 | # use the first run (timestamp) that produced a 'summary' folder 14 | for timestamp in sorted(os.listdir(os.path.join(root_path, model, precision))): 15 | if os.path.exists(os.path.join(root_path, model, precision, timestamp, 'summary')): 16 | break 17 | # copy the first .csv file found in that summary folder 18 | for file_name in os.listdir(os.path.join(root_path, model, precision, timestamp, 'summary')): 19 | if file_name.endswith('.csv'): 20 | break 21 | src_file = os.path.join(root_path, model, precision, timestamp, 'summary', file_name) 22 | dst_file = os.path.join(out_path, model, f'{precision}.csv') 23 | shutil.copy(src_file, dst_file) 24 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/commonsense/kv_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.5927927927927928, 0.5976833976833977, 0.5812097812097812, 0.5554697554697555] 2 | Mistral_7B = [0.584041184041184, 0.583011583011583, 0.5976833976833977, 0.4800514800514801] 3 | Mixtral_8x7B = [0.7799227799227799, 0.7773487773487774, 0.759073359073359, 0.6800514800514801] 4 | LLaMA2_7B = [0.425997425997426, 0.4252252252252252, 0.43217503217503217, 0.29523809523809524] 5 | LLaMA2_13B = [0.6146718146718146, 0.6164736164736164, 0.6138996138996139, 0.49034749034749037] 6 | LLaMA2_70B = [0.7235521235521235, 0.7235521235521235, 0.7196911196911197, 0.7184041184041184] 7 | Falcon_7B = [0.41132561132561135, 0.41338481338481337, 0.33745173745173745, 0.2494208494208494] 8 | Falcon_40B = [0.007207207207207207, 0.007207207207207207, 0.008494208494208495, 0.01904761904761905] 9 | Falcon_180B = [0.6478764478764478, 0.6483912483912484, 0.631917631917632, 0.5680823680823681] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/commonsense/w_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.5927927927927928, 0.6, 0.5745173745173745, 0.5096525096525096] 2 
| Mistral_7B = [0.584041184041184, 0.5812097812097812, 0.5747747747747748, 0.564993564993565] 3 | Mixtral_8x7B = [0.7799227799227799, 0.7727155727155727, 0.7693693693693694, 0.6586872586872586] 4 | LLaMA2_7B = [0.425997425997426, 0.42368082368082366, 0.45302445302445304, 0.5181467181467182] 5 | LLaMA2_13B = [0.6146718146718146, 0.6128700128700129, 0.5783783783783784, 0.6015444015444016] 6 | LLaMA2_70B = [0.7235521235521235, 0.7240669240669241, 0.7132561132561133, 0.6563706563706564] 7 | Falcon_7B = [0.41132561132561135, 0.41235521235521233, 0.43346203346203344, 0.27953667953667954] 8 | Falcon_40B = [0.007207207207207207, 0.0066924066924066924, 0.011583011583011582, 0.23191763191763193] 9 | Falcon_180B = [0.6478764478764478, 0.6507078507078508, 0.6756756756756757, 0.668983268983269] 10 | LLaMA2_7B_AWQ = [0.425997425997426, 0.425997425997426, 0.39510939510939513, 0.4936936936936937] 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/commonsense/wa_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.5927927927927928, 0.5871299871299871, 0.563963963963964] 2 | Mistral_7B = [0.584041184041184, 0.5791505791505791, 0.5716859716859717] 3 | Mixtral_8x7B = [0.7799227799227799, 0.7711711711711712, 0.767052767052767] 4 | LLaMA2_7B = [0.425997425997426, 0.4095238095238095, 0.435006435006435] 5 | LLaMA2_13B = [0.6146718146718146, 0.6162162162162163, 0.5752895752895753] 6 | LLaMA2_70B = [0.7235521235521235, 0.7202059202059202, 0.722007722007722] 7 | Falcon_7B = [0.41132561132561135, 0.4200772200772201, 0.43346203346203344] 8 | Falcon_40B = [0.007207207207207207, 0.005920205920205921, 0.010553410553410553] 9 | Falcon_180B = [0.6478764478764478, 0.6316602316602317, 0.6398970398970399] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/deontology/kv_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.585928809788654, 0.5914905450500556, 0.5817575083426029, 0.5745272525027809] 2 | Mistral_7B = [0.5670189098998888, 0.5667408231368187, 0.5656284760845384, 0.5444938820912124] 3 | Mixtral_8x7B = [0.6270856507230256, 0.628476084538376, 0.6084538375973304, 0.5684093437152391] 4 | LLaMA2_7B = [0.5795328142380423, 0.5792547274749722, 0.5561735261401557, 0.49638487208008897] 5 | LLaMA2_13B = [0.3576195773081201, 0.3542825361512792, 0.46662958843159064, 0.4051724137931034] 6 | LLaMA2_70B = [0.5203003337041157, 0.5191879866518354, 0.482202447163515, 0.5439377085650723] 7 | Falcon_7B = [0.4635706340378198, 0.4624582869855395, 0.43325917686318133, 0.36012235817575083] 8 | Falcon_40B = [0.09983314794215796, 0.10066740823136819, 0.12903225806451613, 0.1735261401557286] 9 | Falcon_180B = [0.5525583982202447, 0.5517241379310345, 0.5439377085650723, 0.39265850945494996] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/deontology/w_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.585928809788654, 0.5862068965517241, 0.5976084538375973, 0.5781423804226918] 2 | Mistral_7B = [0.5670189098998888, 0.5672969966629589, 0.550055617352614, 0.5478309232480534] 3 | Mixtral_8x7B = [0.6270856507230256, 0.6307007786429366, 0.6031701890989989, 0.564238042269188] 4 | LLaMA2_7B = 
[0.5795328142380423, 0.5759176863181312, 0.5439377085650723, 0.3879310344827586] 5 | LLaMA2_13B = [0.3576195773081201, 0.3681868743047831, 0.13264738598442713, 0.27335928809788657] 6 | LLaMA2_70B = [0.5203003337041157, 0.5186318131256952, 0.514182424916574, 0.4972191323692992] 7 | Falcon_7B = [0.4635706340378198, 0.4582869855394883, 0.47302558398220246, 0.4435483870967742] 8 | Falcon_40B = [0.09983314794215796, 0.09955506117908787, 0.05589543937708565, 0.21885428253615127] 9 | Falcon_180B = [0.5525583982202447, 0.550055617352614, 0.5389321468298109, 0.16879866518353726] 10 | LLaMA2_7B_AWQ = [0.5795328142380423, 0.5803670745272526, 0.5689655172413793, 0.5150166852057843] 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/deontology/wa_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.585928809788654, 0.5862068965517241, 0.5906562847608454] 2 | Mistral_7B = [0.5670189098998888, 0.5709121245828699, 0.5556173526140156] 3 | Mixtral_8x7B = [0.6270856507230256, 0.6256952169076752, 0.6048387096774194] 4 | LLaMA2_7B = [0.5795328142380423, 0.5700778642936596, 0.5419911012235817] 5 | LLaMA2_13B = [0.3576195773081201, 0.3973859844271413, 0.15878754171301446] 6 | LLaMA2_70B = [0.5203003337041157, 0.546440489432703, 0.5414349276974416] 7 | Falcon_7B = [0.4635706340378198, 0.4363181312569522, 0.47163515016685204] 8 | Falcon_40B = [0.09983314794215796, 0.10372636262513904, 0.04922135706340378] 9 | Falcon_180B = [0.5525583982202447, 0.47914349276974416, 0.48526140155728587] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/justice/kv_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.7255917159763313, 0.7189349112426036, 0.7089497041420119, 0.6501479289940828] 2 | Mistral_7B = [0.6664201183431953, 0.6667899408284024, 0.6697485207100592, 0.6793639053254438] 3 | Mixtral_8x7B = [0.7340976331360947, 0.7344674556213018, 0.7181952662721893, 0.650887573964497] 4 | LLaMA2_7B = [0.6035502958579881, 0.6042899408284024, 0.6142751479289941, 0.5232988165680473] 5 | LLaMA2_13B = [0.33986686390532544, 0.33986686390532544, 0.4345414201183432, 0.3890532544378698] 6 | LLaMA2_70B = [0.6590236686390533, 0.6605029585798816, 0.6649408284023669, 0.5968934911242604] 7 | Falcon_7B = [0.46079881656804733, 0.45710059171597633, 0.45968934911242604, 0.3816568047337278] 8 | Falcon_40B = [0.04105029585798817, 0.04105029585798817, 0.0695266272189349, 0.13942307692307693] 9 | Falcon_180B = [0.6268491124260355, 0.6290680473372781, 0.621301775147929, 0.4478550295857988] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/justice/w_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.7255917159763313, 0.7200443786982249, 0.7218934911242604, 0.6527366863905325] 2 | Mistral_7B = [0.6664201183431953, 0.6671597633136095, 0.6797337278106509, 0.6338757396449705] 3 | Mixtral_8x7B = [0.7340976331360947, 0.7329881656804734, 0.7233727810650887, 0.5732248520710059] 4 | LLaMA2_7B = [0.6035502958579881, 0.6057692307692307, 0.5972633136094675, 0.46449704142011833] 5 | LLaMA2_13B = [0.33986686390532544, 0.34985207100591714, 0.14497041420118342, 0.24149408284023668] 6 | LLaMA2_70B = [0.6590236686390533, 
0.6593934911242604, 0.6105769230769231, 0.5277366863905325] 7 | Falcon_7B = [0.46079881656804733, 0.46116863905325445, 0.46486686390532544, 0.4992603550295858] 8 | Falcon_40B = [0.04105029585798817, 0.039571005917159764, 0.015162721893491124, 0.21930473372781065] 9 | Falcon_180B = [0.6268491124260355, 0.6316568047337278, 0.6183431952662722, 0.21375739644970415] 10 | LLaMA2_7B_AWQ = [0.6035502958579881, 0.6042899408284024, 0.6076183431952663, 0.4966715976331361] 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/justice/wa_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.7255917159763313, 0.7230029585798816, 0.7211538461538461] 2 | Mistral_7B = [0.6664201183431953, 0.6704881656804734, 0.6826923076923077] 3 | Mixtral_8x7B = [0.7340976331360947, 0.7326183431952663, 0.7270710059171598] 4 | LLaMA2_7B = [0.6035502958579881, 0.6150147928994083, 0.5832100591715976] 5 | LLaMA2_13B = [0.33986686390532544, 0.36797337278106507, 0.18010355029585798] 6 | LLaMA2_70B = [0.6590236686390533, 0.6886094674556213, 0.6264792899408284] 7 | Falcon_7B = [0.46079881656804733, 0.4430473372781065, 0.4467455621301775] 8 | Falcon_40B = [0.04105029585798817, 0.059911242603550297, 0.01257396449704142] 9 | Falcon_180B = [0.6268491124260355, 0.5329142011834319, 0.5417899408284024] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/virtue/kv_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.8578894472361809, 0.8655276381909548, 0.8625125628140704, 0.834572864321608] 2 | Mistral_7B = [0.6840201005025126, 0.6850251256281407, 0.6974874371859296, 0.6148743718592965] 3 | Mixtral_8x7B = [0.9252261306532663, 0.9246231155778895, 0.9214070351758794, 0.878391959798995] 4 | LLaMA2_7B = [0.6753768844221105, 0.6757788944723618, 0.700502512562814, 0.6858291457286432] 5 | LLaMA2_13B = [0.7995979899497487, 0.7989949748743719, 0.8142713567839196, 0.7155778894472362] 6 | LLaMA2_70B = [0.9242211055276381, 0.9246231155778895, 0.9232160804020101, 0.914572864321608] 7 | Falcon_7B = [0.16522613065326633, 0.16241206030150754, 0.22914572864321608, 0.3210050251256281] 8 | Falcon_40B = [0.08321608040201005, 0.08442211055276382, 0.06793969849246231, 0.04904522613065326] 9 | Falcon_180B = [0.9401005025125628, 0.939497487437186, 0.938894472361809, 0.9298492462311557] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/virtue/w_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.8578894472361809, 0.8613065326633166, 0.8574874371859297, 0.8315577889447237] 2 | Mistral_7B = [0.6840201005025126, 0.6846231155778895, 0.6671356783919598, 0.76] 3 | Mixtral_8x7B = [0.9252261306532663, 0.9256281407035176, 0.9238190954773869, 0.9053266331658292] 4 | LLaMA2_7B = [0.6753768844221105, 0.6719597989949748, 0.7141708542713567, 0.7382914572864322] 5 | LLaMA2_13B = [0.7995979899497487, 0.8008040201005026, 0.8168844221105528, 0.8247236180904522] 6 | LLaMA2_70B = [0.9242211055276381, 0.9248241206030151, 0.9230150753768844, 0.9139698492462311] 7 | Falcon_7B = [0.16522613065326633, 0.17427135678391958, 0.31316582914572866, 0.2064321608040201] 8 | Falcon_40B = [0.08321608040201005, 0.0850251256281407, 0.05849246231155779, 
0.05628140703517588] 9 | Falcon_180B = [0.9401005025125628, 0.938894472361809, 0.9350753768844221, 0.9306532663316583] 10 | LLaMA2_7B_AWQ = [0.6753768844221105, 0.675175879396985, 0.6605025125628141, 0.647035175879397] 11 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/ethics/virtue/wa_result.py: -------------------------------------------------------------------------------- 1 | ChatGLM3_6B = [0.8578894472361809, 0.8574874371859297, 0.8536683417085427] 2 | Mistral_7B = [0.6840201005025126, 0.6727638190954773, 0.650251256281407] 3 | Mixtral_8x7B = [0.9252261306532663, 0.925427135678392, 0.9228140703517588] 4 | LLaMA2_7B = [0.6753768844221105, 0.6655276381909547, 0.7035175879396985] 5 | LLaMA2_13B = [0.7995979899497487, 0.8044221105527638, 0.815678391959799] 6 | LLaMA2_70B = [0.9242211055276381, 0.9232160804020101, 0.9189949748743719] 7 | Falcon_7B = [0.16522613065326633, 0.1306532663316583, 0.3499497487437186] 8 | Falcon_40B = [0.08321608040201005, 0.05748743718592965, 0.027537688442211054] 9 | Falcon_180B = [0.9401005025125628, 0.938894472361809, 0.935678391959799] 10 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/ethics/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | from matplotlib.lines import Line2D 5 | from matplotlib.ticker import FormatStrFormatter 6 | import importlib 7 | import numpy as np 8 | 9 | # Since we plot the figures in GUI mode, the options below should be set manually and locally in the 10 | # script instead of being passed as command-line args. 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--save_path', type=str, default='./figures/') 14 | parser.add_argument('--save_name', type=str, default=None) 15 | parser.add_argument('--plot_mode', type=str, default='kv', choices=['w', 'wa', 'kv']) 16 | parser.add_argument('--dataset_name', type=str, default='Multi-Doc_QA') 17 | parser.add_argument('--norm', action='store_true') 18 | parser.add_argument("--num_bins", type=int, default=30, help="bin number") 19 | parser.add_argument('--x_label', type=str, default='Bit-width') 20 | parser.add_argument('--y_label', type=str, default='Accuracy') 21 | parser.add_argument('--legend_loc', type=str, default='lower left') 22 | args = parser.parse_args() 23 | 24 | 25 | ##################################### Helper Functions ###################################### 26 | def result_pad(raw_results, fp_idx=0): 27 | raw_fp_result = raw_results[fp_idx] 28 | new_results = [x if x is not None else 0 for x in raw_results] 29 | new_results[fp_idx] = raw_fp_result 30 | return new_results 31 | 32 | def normalize_results(raw_results, fp_idx=0): 33 | # raw_results = result_pad(raw_results) 34 | if raw_results[fp_idx] is not None and raw_results[fp_idx] != 0: 35 | norm_results = [i / raw_results[fp_idx] * 100 if i is not None else None for i in raw_results] 36 | print('Result Normalization Succeeded.') 37 | else: 38 | norm_results = raw_results 39 | print('The input results have no FP baseline; returning the original results.') 40 | return norm_results 41 | 42 | MARKERS = ['o', '^', 's', 'd', '+'] 43 | COLORS = ['tab:blue', 'tab:red', 'tab:green', 'tab:orange', 'tab:purple', 'tab:brown'] 44 | MODELS = ["Mistral_7B", "Mixtral_8x7B", "LLaMA2_7B", "LLaMA2_70B"] 45 | TASKS = ['commonsense', 'deontology',
'justice', 'virtue'] 46 | MIN = [0.2, 0.2, 0.5, 0.6] 47 | MAX = [0.81, 0.71, 0.8, 0.95] 48 | 49 | ###################################### Plot Functions ###################################### 50 | if __name__ == '__main__': 51 | # create a folder to save the resulting plot. 52 | if not os.path.exists(args.save_path): 53 | os.makedirs(args.save_path) 54 | 55 | # plot different tasks 56 | for t_i, task in enumerate(TASKS): 57 | # plot name 58 | save_name = args.save_name + '.pdf' if args.save_name is not None else \ 59 | task + '_' + args.plot_mode + '_' + args.x_label + '.pdf' 60 | save_name = args.save_path + save_name 61 | 62 | # import data 63 | data = importlib.import_module("ethics.{}.{}_result".format(task, args.plot_mode)) 64 | 65 | # init canvas 66 | fig, ax = plt.subplots(figsize=[8., 6.]) 67 | 68 | # make the plot compact 69 | plt.subplots_adjust(left=0.11, right=0.99, top=0.99, bottom=0.11) 70 | 71 | # set figure labels 72 | plt.xlabel(args.x_label, fontsize=20) 73 | plt.ylabel(args.y_label if not args.norm else 'Normalized ' + args.y_label, fontsize=20) 74 | plt.ylim(MIN[t_i], MAX[t_i]) 75 | plt.yticks(np.arange(MIN[t_i], MAX[t_i], 0.1)) 76 | 77 | # set axes font size 78 | ax.tick_params(axis='x', labelsize=20) 79 | ax.tick_params(axis='y', labelsize=20) 80 | 81 | x_axis = { 82 | 'w': ['FP16', 'W8', 'W4', 'W3'], 83 | 'wa': ['FP16', 'W8A8', 'W4A8'], 84 | 'kv': ['FP16', 'KV8', 'KV4', 'KV3'], 85 | }[args.plot_mode] 86 | 87 | for i, model_name in enumerate(MODELS): 88 | # fetch the result list by attribute name (safer than eval) 89 | y_value = getattr(data, model_name) 90 | ax.plot(x_axis, y_value, marker=MARKERS[0], markersize=16, label=model_name, linestyle="-", color=COLORS[i]) 91 | # ax.plot(x_axis[:len(y_value)], y_value, marker=MARKERS[j], markersize=13, linestyle="-", color=COLORS[i]) 92 | 93 | # set legend location 94 | # ax.legend(loc=args.legend_loc, fontsize=20) 95 | ax.legend(fontsize=20) 96 | 97 | # legend_elements = [] 98 | # for i, _ in enumerate(MODELS): 99 | # legend_elements.append(Line2D([0], [0], marker=MARKERS[i], color='w', label=mode[i], markerfacecolor='tab:brown', markersize=16)) 100 | # ax.legend(handles=legend_elements, fontsize=20) 101 | 102 | plt.savefig(save_name) 103 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/truthfulqa/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import matplotlib.pyplot as plt 4 | from matplotlib.lines import Line2D 5 | from matplotlib.ticker import FormatStrFormatter 6 | import importlib 7 | import numpy as np 8 | 9 | # Since we plot the figures in GUI mode, the options below should be set manually and locally in the 10 | # script instead of being passed as command-line args.
11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--save_path', type=str, default='./figures/') 14 | parser.add_argument('--save_name', type=str, default=None) 15 | parser.add_argument('--plot_mode', type=str, default='kv', choices=['w', 'wa', 'kv']) 16 | parser.add_argument("--num_bins", type=int, default=30, help="bin number") 17 | parser.add_argument("--metric", type=str, default="mc1", choices=["mc1", "mc2"]) 18 | parser.add_argument('--x_label', type=str, default='Bit-width') 19 | parser.add_argument('--y_label', type=str, default='Accuracy') 20 | parser.add_argument('--legend_loc', type=str, default='lower left') 21 | args = parser.parse_args() 22 | 23 | 24 | ##################################### Helper Functions ###################################### 25 | def result_pad(raw_results, fp_idx=0): 26 | raw_fp_result = raw_results[fp_idx] 27 | new_results = [x if x is not None else 0 for x in raw_results] 28 | new_results[fp_idx] = raw_fp_result 29 | return new_results 30 | 31 | MARKERS = ['o', '^', 's', 'd', '+'] 32 | COLORS = ['tab:blue', 'tab:red', 'tab:green', 'tab:orange', 'tab:purple', 'tab:brown', 'tab:pink'] 33 | # MODELS = ["Mistral_7B", "Mixtral_8x7B", "LLaMA2_7B", "LLaMA2_13B", "LLaMA2_70B", "Falcon_7B", "Falcon_40B"] 34 | MODELS = ["Mistral_7B", "Mixtral_8x7B", "LLaMA2_7B", "LLaMA2_13B", "LLaMA2_70B"] 35 | 36 | ###################################### Plot Functions ###################################### 37 | if __name__ == '__main__': 38 | # create a folder to save the resulting plot. 39 | if not os.path.exists(args.save_path): 40 | os.makedirs(args.save_path) 41 | 42 | # plot name 43 | save_name = args.save_name + '.pdf' if args.save_name is not None else \ 44 | args.plot_mode + '_' + args.x_label + '.pdf' 45 | save_name = args.save_path + save_name 46 | 47 | # import data 48 | data = importlib.import_module("results.{}_{}_result".format(args.plot_mode, args.metric)) 49 | 50 | # init canvas 51 | fig, ax = plt.subplots(figsize=[8., 6.]) 52 | 53 | # make the plot compact 54 | plt.subplots_adjust(left=0.11, right=0.99, top=0.99, bottom=0.11) 55 | 56 | # set figure labels 57 | plt.xlabel(args.x_label, fontsize=20) 58 | plt.ylabel(args.y_label, fontsize=20) 59 | plt.ylim(0.2, 0.6) 60 | plt.yticks(np.arange(0.2, 0.6, 0.1)) 61 | 62 | # set axes font size 63 | ax.tick_params(axis='x', labelsize=20) 64 | ax.tick_params(axis='y', labelsize=20) 65 | 66 | # x_axis = { 67 | # 'w': ['FP16', 'W8', 'W4', 'W3'], 68 | # 'wa': ['FP16', 'W8A8', 'W4A8'], 69 | # 'kv': ['FP16', 'KV8', 'KV4', 'KV3'], 70 | # }[args.plot_mode] 71 | x_axis = { 72 | 'w': ['FP16', 'W8', 'W4', 'W3', 'W2'], 73 | 'wa': ['FP16', 'W8A8', 'W4A8', 'W4A4'], 74 | 'kv': ['FP16', 'KV8', 'KV4', 'KV3', 'KV2'], 75 | }[args.plot_mode] 76 | 77 | for i, model_name in enumerate(MODELS): 78 | # fetch the result list by attribute name (safer than eval) 79 | y_value = getattr(data, model_name) 80 | ax.plot(x_axis, y_value, marker=MARKERS[0], markersize=16, label=model_name, linestyle="-", color=COLORS[i]) 81 | 82 | ax.legend(fontsize=20) 83 | 84 | plt.savefig(save_name) 85 | -------------------------------------------------------------------------------- /qllm_eval/visualization/trustworthiness/truthfulqa/results/kv_mc1_result.py: -------------------------------------------------------------------------------- 1 | Mistral_7B = [0.5177478580171359, 0.5177478580171359, 0.49938800489596086, 0.47368421052631576, 0.27539779681762544] 2 | Mixtral_8x7B = [0.4969400244798042, 0.49326805385556916, 0.4541003671970624, 0.4222766217870257, 0.24112607099143207] 3 | LLaMA2_7B = 
LLaMA2_7B = [0.29498164014687883, 0.2974296205630355, 0.30599755201958384, 0.24969400244798043, 0.2386780905752754]
LLaMA2_13B = [0.28518971848225216, 0.2827417380660955, 0.2741738066095471, 0.2913096695226438, 0.2252141982864137]
LLaMA2_70B = [0.3561811505507956, 0.3574051407588739, 0.3561811505507956, 0.34149326805385555, 0.23011015911872704]
Falcon_7B = [0.2876376988984088, 0.2876376988984088, 0.2876376988984088, 0.26560587515299877, 0.22643818849449204]
Falcon_40B = [0.3733170134638923, 0.37209302325581395, 0.3806609547123623, 0.37454100367197063, 0.26805385556915545]
Falcon_180B = [0.36964504283965727, 0.36964504283965727, 0.36964504283965727, 0.3708690330477356, 0.29498164014687883]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/kv_mc2_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.6688017220512057, 0.6691715738543765, 0.6564044217068896, 0.6427100548674619, 0.48027649255784544]
Mixtral_8x7B = [0.6484817260542265, 0.6495993649117421, 0.629318751867763, 0.5994892449410594, 0.4926746070653021]
LLaMA2_7B = [0.44926537410693956, 0.44936809200977723, 0.45751973212284464, 0.39193280615715015, 0.47475433863980704]
LLaMA2_13B = [0.4406564723437067, 0.43997647262514694, 0.43465132855093525, 0.4525000742515952, 0.4510122041900766]
LLaMA2_70B = [0.5275437344080325, 0.5279487941769914, 0.526220891244331, 0.5207520579654967, 0.4378068545954345]
Falcon_7B = [0.4407104445643683, 0.4410793932995058, 0.44445645688981794, 0.41868173734281455, 0.45280967780345155]
Falcon_40B = [0.5247662521731141, 0.5246133475850496, 0.5284034579971917, 0.522891477364859, 0.46078848831981034]
Falcon_180B = [0.5335740147660477, 0.5333871339139281, 0.5332405981470844, 0.5334614774880382, 0.46799068584054326]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/w_mc1_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.5177478580171359, 0.5165238678090576, 0.4883720930232558, 0.45532435740514077, 0.27050183598531213]
Mixtral_8x7B = [0.4969400244798042, 0.4944920440636475, 0.4724602203182375, 0.4724602203182375, 0.24112607099143207]
LLaMA2_7B = [0.29498164014687883, 0.2962056303549572, 0.29498164014687883, 0.2937576499388005, 0.26193390452876375]
LLaMA2_13B = [0.28518971848225216, 0.2802937576499388, 0.2864137086903305, 0.26805385556915545, 0.25458996328029376]
LLaMA2_70B = [0.3561811505507956, 0.3561811505507956, 0.34516523867809057, 0.32068543451652387, 0.24969400244798043]
Falcon_7B = [0.2876376988984088, 0.28886168910648713, 0.2778457772337821, 0.2668298653610771, 0.2350061199510404]
Falcon_40B = [0.3733170134638923, 0.3733170134638923, 0.36964504283965727, 0.36107711138310894, 0.23133414932680538]
Falcon_180B = [0.36964504283965727, 0.36964504283965727, 0.3574051407588739, 0.34761321909424725, 0.2876376988984088]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/w_mc2_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.6688017220512057, 0.6696166302279059, 0.6577095905502711, 0.6138682987896682, 0.5058903110407331]
Mixtral_8x7B = [0.6484817260542265, 0.6515374278400258, 0.6247955109142671, 0.6397627935078577, 0.4830758940244642]
LLaMA2_7B = [0.44926537410693956, 0.4496653069702225, 0.456922166094929, 0.45485949562068634, 0.511788507924486]
LLaMA2_13B = [0.4406564723437067, 0.43933660745531866, 0.43538230062712213, 0.411918030468475, 0.47564118986968823]
LLaMA2_70B = [0.5275437344080325, 0.5275547367866754, 0.5159843363616804, 0.49391937697797156, 0.39547118299925027]
Falcon_7B = [0.4407104445643683, 0.4409154529480796, 0.43489265554092676, 0.4297975191130771, 0.4701201417345436]
Falcon_40B = [0.5247662521731141, 0.5249681834735866, 0.515695665047965, 0.5164459705459207, 0.4500814796622027]
Falcon_180B = [0.5335740147660477, 0.5335126034400929, 0.5318512061393834, 0.5068406829747578, 0.477815457771522]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/wa_mc1_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.5177478580171359, 0.5201958384332925, 0.4969400244798042, 0.28886168910648713]
Mixtral_8x7B = [0.4969400244798042, 0.4834761321909425, 0.4834761321909425, 0.25703794369645044]
LLaMA2_7B = [0.29498164014687883, 0.2962056303549572, 0.2937576499388005, 0.25458996328029376]
LLaMA2_13B = [0.28518971848225216, 0.28151774785801714, 0.27906976744186046, 0.2558139534883721]
LLaMA2_70B = [0.3561811505507956, 0.34394124847001223, 0.3378212974296206, 0.23133414932680538]
Falcon_7B = [0.2876376988984088, 0.2937576499388005, 0.2802937576499388, 0.23255813953488372]
Falcon_40B = [0.3733170134638923, 0.3769889840881273, 0.3623011015911873, 0.22031823745410037]
Falcon_180B = [0.36964504283965727, 0.37209302325581395, 0.3623011015911873, 0.22766217870257038]
--------------------------------------------------------------------------------
/qllm_eval/visualization/trustworthiness/truthfulqa/results/wa_mc2_result.py:
--------------------------------------------------------------------------------
Mistral_7B = [0.6688017220512057, 0.6723008908877496, 0.6595733109987866, 0.4852569797437285]
Mixtral_8x7B = [0.6484817260542265, 0.6450170044402032, 0.6305351114450428, 0.4898087829899215]
LLaMA2_7B = [0.44926537410693956, 0.4491759979927774, 0.4563169228667865, 0.4912608412585181]
LLaMA2_13B = [0.4406564723437067, 0.4387751750922574, 0.42988544244530175, 0.5008732700965536]
LLaMA2_70B = [0.5275437344080325, 0.520717869815872, 0.5098099728598039, 0.48717163966923716]
Falcon_7B = [0.4407104445643683, 0.4450711520882632, 0.4350850113436507, 0.4324344947777]
Falcon_40B = [0.5247662521731141, 0.5216728790262783, 0.5123887987899755, 0.4816507929107791]
Falcon_180B = [0.5335740147660477, 0.5367397258662, 0.5329333474493646, 0.4821977821247397]
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
# basic requirements
torch>=2.0
xopen
anthropic

# huggingface requirements
accelerate>=0.21.0
transformers==4.44.2
datasets
evaluate

# for logging or special models, please try installing the following packages
wandb
sentencepiece

# for longbench
tqdm
rouge
jieba
fuzzywuzzy
einops

# lm_eval
lm_eval==0.3.0

# for visualization
seaborn

# for conversation tasks
anthropic
openai==0.28
fschat

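# note: flash-attn itself stays commented out below since building it needs a
# CUDA toolchain; a common route (an assumption, not part of this repo's docs)
# is to install it separately once packaging and ninja are in place, e.g.
#   pip install flash-attn --no-build-isolation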
# for flash attention
packaging
ninja
# flash-attn
triton>=2.1.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from setuptools import setup, find_packages

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="qllm_eval",
    version="0.1.0",
    description="QLLM_Evaluation",
    author="Shiyao Li",
    author_email="shiyao1620@gmail.com",
    # url="https://github.com/LSY-noya/QLLM-Evaluation.git",
    packages=find_packages(),
    license="MIT",
    long_description=long_description,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Operating System :: OS Independent",
    ],
)
--------------------------------------------------------------------------------