├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── calm_logo.png
│   ├── calm_logo_prev.png
│   ├── calm_suite.png
│   └── causal_task.png
├── calm
│   ├── data_processing
│   │   ├── generate_questions.py
│   │   ├── prompt
│   │   │   ├── AC-B_causal_judgement.py
│   │   │   ├── AR-B_CaLM-AR.py
│   │   │   ├── ATE.py
│   │   │   ├── BAS-B_backadj.py
│   │   │   ├── BAS-C_max-BAS.py
│   │   │   ├── BAS-C_min-BAS.py
│   │   │   ├── BAS-C_mix-BAS.py
│   │   │   ├── CA-B_FA.py
│   │   │   ├── CA-B_FP.py
│   │   │   ├── CB-B_collider-bias.py
│   │   │   ├── CDE.py
│   │   │   ├── CEG-O_E-CARE.py
│   │   │   ├── CEI-B.py
│   │   │   ├── CORR-B_correlation.py
│   │   │   ├── CR-B_det-counterfactual.py
│   │   │   ├── CR-C_CRASS.py
│   │   │   ├── EAE-B_exp-away.py
│   │   │   ├── ECI-B_CTB.py
│   │   │   ├── ECI-B_ESC.py
│   │   │   ├── ECI-B_MAVEN-ERE.py
│   │   │   ├── ETT.py
│   │   │   ├── FAS-C_FAS.py
│   │   │   ├── IV-C_CaLM-IV.py
│   │   │   ├── NDE.py
│   │   │   ├── NIE.py
│   │   │   ├── PCD-B_COPA.py
│   │   │   ├── PCD-B_E-CARE.py
│   │   │   ├── PCD-C_COPA.py
│   │   │   ├── PCD-C_E-CARE.py
│   │   │   ├── PN.py
│   │   │   └── PS.py
│   │   └── task_hiearchy.py
│   ├── evaluate.py
│   ├── evaluation
│   │   ├── accuracy
│   │   │   ├── choice.py
│   │   │   ├── open-ended.py
│   │   │   └── prob.py
│   │   ├── aggregate_metrics.py
│   │   ├── core_metrics.py
│   │   ├── element_properties
│   │   │   ├── model_info.py
│   │   │   └── random_guess.py
│   │   ├── error
│   │   │   ├── basic_adversarial
│   │   │   │   ├── AC-B_causal_judgement.py
│   │   │   │   ├── AR-B_CaLM-AR.py
│   │   │   │   ├── AS.py
│   │   │   │   ├── CA-B.py
│   │   │   │   ├── CEI-B.py
│   │   │   │   ├── CLADDER.py
│   │   │   │   ├── CR-C_CRASS.py
│   │   │   │   ├── ECI.py
│   │   │   │   ├── Natural.py
│   │   │   │   ├── PCD-B.py
│   │   │   │   ├── PCD-C.py
│   │   │   │   └── Probability.py
│   │   │   ├── cot
│   │   │   │   ├── AC-B_causal_judgement.py
│   │   │   │   ├── AR-B_CaLM-AR.py
│   │   │   │   ├── AS.py
│   │   │   │   ├── CA-B.py
│   │   │   │   ├── CEI-B.py
│   │   │   │   ├── CLADDER.py
│   │   │   │   ├── CR-C_CRASS.py
│   │   │   │   ├── ECI.py
│   │   │   │   ├── Natural.py
│   │   │   │   ├── PCD-B.py
│   │   │   │   ├── PCD-C.py
│   │   │   │   └── Probability.py
│   │   │   └── icl
│   │   │       ├── AC-B_causal_judgement.py
│   │   │       ├── AR-B_CaLM-AR.py
│   │   │       ├── AS.py
│   │   │       ├── CA-B.py
│   │   │       ├── CEI-B.py
│   │   │       ├── CLADDER.py
│   │   │       ├── CR-C_CRASS.py
│   │   │       ├── ECI.py
│   │   │       ├── Natural.py
│   │   │       ├── PCD-B.py
│   │   │       ├── PCD-C.py
│   │   │       └── Probability.py
│   │   ├── errors.py
│   │   └── labeling
│   │       ├── AC-B_causal_judgement.py
│   │       ├── AR-B_CaLM-AR.py
│   │       ├── AS.py
│   │       ├── CA-B_FA.py
│   │       ├── CA-B_FP.py
│   │       ├── CEG-O_E-CARE.py
│   │       ├── CEI-B.py
│   │       ├── CLADDER.py
│   │       ├── CR-C_CRASS.py
│   │       ├── ECI.py
│   │       ├── Natural.py
│   │       ├── PCD-B.py
│   │       ├── PCD-C.py
│   │       ├── Probability.py
│   │       └── common_answers.py
│   ├── models
│   │   ├── model_apis
│   │   │   ├── baichuan1_7b_api.py
│   │   │   ├── baichuan1_chat_13b_api.py
│   │   │   ├── baichuan2_chat_13b_api.py
│   │   │   ├── baichuan2_chat_7b_api.py
│   │   │   ├── chatgpt_api.py
│   │   │   ├── claude2_api.py
│   │   │   ├── gpt4_api.py
│   │   │   ├── internlm_chat_20b_api.py
│   │   │   ├── internlm_chat_7b_api.py
│   │   │   ├── koala_13b_api.py
│   │   │   ├── llama2_13b_api.py
│   │   │   ├── llama2_70b_api.py
│   │   │   ├── llama2_7b_api.py
│   │   │   ├── llama2_chat_70b_api.py
│   │   │   ├── qwen_14b_api.py
│   │   │   ├── qwen_7b_api.py
│   │   │   ├── vicuna_33b_api.py
│   │   │   └── wizardcoder_15b_api.py
│   │   └── model_loader.py
│   ├── run.py
│   └── utils
│       ├── load_items.py
│       └── logger_info.py
├── calm_dataset
│   ├── association
│   │   ├── correlation
│   │   │   ├── CORR-B_correlation_CN.json
│   │   │   └── CORR-B_correlation_EN.json
│   │   └── explaining_away_effect
│   │       ├── EAE-B_exp-away_CN.json
│   │       └── EAE-B_exp-away_EN.json
│   ├── causal_discovery
│   │   ├── abstract_reasoning
│   │   │   ├── AR-B_CaLM-AR_CN.json
│   │   │   └── AR-B_CaLM-AR_EN.json
│   │   ├── causal_attribution
│   │   │   ├── CA-B_FA_CN.json
│   │   │   ├── CA-B_FA_EN.json
│   │   │   ├── CA-B_FP_CN.json
│   │   │   └── CA-B_FP_EN.json
│   │   ├── event_causality_identification
│   │   │   ├── ECI-B_CTB_CN.json
│   │   │   ├── ECI-B_CTB_EN.json
│   │   │   ├── ECI-B_ESC_CN.json
│   │   │   ├── ECI-B_ESC_EN.json
│   │   │   ├── ECI-B_MAVEN-ERE_CN.json
│   │   │   └── ECI-B_MAVEN-ERE_EN.json
│   │   └── pairwise_causal_discovery
│   │       ├── PCD-B_COPA_CN.json
│   │       ├── PCD-B_COPA_EN.json
│   │       ├── PCD-B_E-CARE_CN.json
│   │       ├── PCD-B_E-CARE_EN.json
│   │       ├── PCD-C_COPA_CN.json
│   │       ├── PCD-C_COPA_EN.json
│   │       ├── PCD-C_E-CARE_CN.json
│   │       └── PCD-C_E-CARE_EN.json
│   ├── counterfactual
│   │   ├── actual_causality
│   │   │   ├── AC-B_causal_judgement_CN.json
│   │   │   └── AC-B_causal_judgement_EN.json
│   │   ├── causal_explanation_generation
│   │   │   ├── CEG-O_E-CARE_CN.json
│   │   │   └── CEG-O_E-CARE_EN.json
│   │   ├── counterfactual_reasoning
│   │   │   ├── CR-B_det-counterfactual_CN.json
│   │   │   ├── CR-B_det-counterfactual_EN.json
│   │   │   ├── CR-C_CRASS_CN.json
│   │   │   └── CR-C_CRASS_EN.json
│   │   ├── effect_of_the_treatment_on_the_treated
│   │   │   ├── ETT-B_ETT-natural_CN.json
│   │   │   ├── ETT-B_ETT-natural_EN.json
│   │   │   ├── ETT-P_ETT-basic_CN.json
│   │   │   ├── ETT-P_ETT-basic_EN.json
│   │   │   ├── ETT-P_ETT-hard_CN.json
│   │   │   └── ETT-P_ETT-hard_EN.json
│   │   ├── natural_direct_effect
│   │   │   ├── NDE-B_NDE-natural_CN.json
│   │   │   ├── NDE-B_NDE-natural_EN.json
│   │   │   ├── NDE-P_NDE-basic_CN.json
│   │   │   ├── NDE-P_NDE-basic_EN.json
│   │   │   ├── NDE-P_NDE-hard_CN.json
│   │   │   └── NDE-P_NDE-hard_EN.json
│   │   ├── natural_indirect_effect
│   │   │   ├── NIE-B_NIE-natural_CN.json
│   │   │   ├── NIE-B_NIE-natural_EN.json
│   │   │   ├── NIE-P_NIE-basic_CN.json
│   │   │   ├── NIE-P_NIE-basic_EN.json
│   │   │   ├── NIE-P_NIE-hard_CN.json
│   │   │   └── NIE-P_NIE-hard_EN.json
│   │   ├── probability_of_necessity
│   │   │   ├── PN-P_PN-basic_CN.json
│   │   │   ├── PN-P_PN-basic_EN.json
│   │   │   ├── PN-P_PN-hard_CN.json
│   │   │   └── PN-P_PN-hard_EN.json
│   │   └── probability_of_sufficiency
│   │       ├── PS-P_PS-basic_CN.json
│   │       ├── PS-P_PS-basic_EN.json
│   │       ├── PS-P_PS-hard_CN.json
│   │       └── PS-P_PS-hard_EN.json
│   └── intervention
│       ├── average_treatment_effect
│       │   ├── ATE-B_ATE-natural_CN.json
│       │   ├── ATE-B_ATE-natural_EN.json
│       │   ├── ATE-P_ATE-basic_CN.json
│       │   ├── ATE-P_ATE-basic_EN.json
│       │   ├── ATE-P_ATE-hard_CN.json
│       │   └── ATE-P_ATE-hard_EN.json
│       ├── backdoor_adjustment_set
│       │   ├── BAS-B_backadj_CN.json
│       │   ├── BAS-B_backadj_EN.json
│       │   ├── BAS-C_max-BAS_CN.json
│       │   ├── BAS-C_max-BAS_EN.json
│       │   ├── BAS-C_min-BAS_CN.json
│       │   ├── BAS-C_min-BAS_EN.json
│       │   ├── BAS-C_mix-BAS_CN.json
│       │   └── BAS-C_mix-BAS_EN.json
│       ├── causal_effect_identification
│       │   ├── CEI-B_0.2-UC_CN.json
│       │   ├── CEI-B_0.2-UC_EN.json
│       │   ├── CEI-B_0.4-UC_CN.json
│       │   ├── CEI-B_0.4-UC_EN.json
│       │   ├── CEI-B_0.6-UC_CN.json
│       │   ├── CEI-B_0.6-UC_EN.json
│       │   ├── CEI-B_0.8-UC_CN.json
│       │   └── CEI-B_0.8-UC_EN.json
│       ├── collider_bias
│       │   ├── CB-B_collider-bias_CN.json
│       │   └── CB-B_collider-bias_EN.json
│       ├── controlled_direct_effect
│       │   ├── CDE-B_CDE-natural_CN.json
│       │   ├── CDE-B_CDE-natural_EN.json
│       │   ├── CDE-P_CDE-basic_CN.json
│       │   ├── CDE-P_CDE-basic_EN.json
│       │   ├── CDE-P_CDE-hard_CN.json
│       │   └── CDE-P_CDE-hard_EN.json
│       ├── frontdoor_adjustment_set
│       │   ├── FAS-C_FAS_CN.json
│       │   └── FAS-C_FAS_EN.json
│       └── instrumental_variable
│           ├── IV-C_CaLM-IV_CN.json
│           └── IV-C_CaLM-IV_EN.json
├── calm_dataset_gt_label
│   ├── association
│   │   ├── correlation
│   │   │   ├── CORR-B_correlation_CN.json
│   │   │   └── CORR-B_correlation_EN.json
│   │   └── explaining_away_effect
│   │       ├── EAE-B_exp-away_CN.json
│   │       └── EAE-B_exp-away_EN.json
│   ├── causal_discovery
│   │   ├── event_causality_identification
│   │   │   ├── ECI-B_CTB_CN.json
│   │   │   ├── ECI-B_CTB_EN.json
│   │   │   ├── ECI-B_ESC_CN.json
│   │   │   ├── ECI-B_ESC_EN.json
│   │   │   ├── ECI-B_MAVEN-ERE_CN.json
│   │   │   └── ECI-B_MAVEN-ERE_EN.json
│   │   └── pairwise_causal_discovery
│   │       ├── PCD-B_COPA_CN.json
│   │       ├── PCD-B_COPA_EN.json
│   │       ├── PCD-B_E-CARE_CN.json
│   │       ├── PCD-B_E-CARE_EN.json
│   │       ├── PCD-C_COPA_CN.json
│   │       ├── PCD-C_COPA_EN.json
│   │       ├── PCD-C_E-CARE_CN.json
│   │       └── PCD-C_E-CARE_EN.json
│   ├── counterfactual
│   │   ├── actual_causality
│   │   │   ├── AC-B_causal_judgement_CN.json
│   │   │   └── AC-B_causal_judgement_EN.json
│   │   ├── causal_explanation_generation
│   │   │   ├── CEG-O_E-CARE_CN.json
│   │   │   └── CEG-O_E-CARE_EN.json
│   │   └── counterfactual_reasoning
│   │       ├── CR-B_det-counterfactual_CN.json
│   │       ├── CR-B_det-counterfactual_EN.json
│   │       ├── CR-C_CRASS_CN.json
│   │       └── CR-C_CRASS_EN.json
│   └── intervention
│       ├── backdoor_adjustment_set
│       │   ├── BAS-B_backadj_CN.json
│       │   └── BAS-B_backadj_EN.json
│       └── collider_bias
│           ├── CB-B_collider-bias_CN.json
│           └── CB-B_collider-bias_EN.json
├── calm_lite_dataset
│   ├── association
│   │   ├── correlation
│   │   │   ├── CORR-B_correlation_CN.json
│   │   │   └── CORR-B_correlation_EN.json
│   │   └── explaining_away_effect
│   │       ├── EAE-B_exp-away_CN.json
│   │       └── EAE-B_exp-away_EN.json
│   ├── causal_discovery
│   │   ├── abstract_reasoning
│   │   │   ├── AR-B_CaLM-AR_CN.json
│   │   │   └── AR-B_CaLM-AR_EN.json
│   │   ├── causal_attribution
│   │   │   ├── CA-B_FA_CN.json
│   │   │   ├── CA-B_FA_EN.json
│   │   │   ├── CA-B_FP_CN.json
│   │   │   └── CA-B_FP_EN.json
│   │   ├── event_causality_identification
│   │   │   ├── ECI-B_CTB_CN.json
│   │   │   ├── ECI-B_CTB_EN.json
│   │   │   ├── ECI-B_ESC_CN.json
│   │   │   ├── ECI-B_ESC_EN.json
│   │   │   ├── ECI-B_MAVEN-ERE_CN.json
│   │   │   └── ECI-B_MAVEN-ERE_EN.json
│   │   └── pairwise_causal_discovery
│   │       ├── PCD-B_COPA_CN.json
│   │       ├── PCD-B_COPA_EN.json
│   │       ├── PCD-B_E-CARE_CN.json
│   │       ├── PCD-B_E-CARE_EN.json
│   │       ├── PCD-C_COPA_CN.json
│   │       ├── PCD-C_COPA_EN.json
│   │       ├── PCD-C_E-CARE_CN.json
│   │       └── PCD-C_E-CARE_EN.json
│   ├── counterfactual
│   │   ├── actual_causality
│   │   │   ├── AC-B_causal_judgement_CN.json
│   │   │   └── AC-B_causal_judgement_EN.json
│   │   ├── causal_explanation_generation
│   │   │   ├── CEG-O_E-CARE_CN.json
│   │   │   └── CEG-O_E-CARE_EN.json
│   │   ├── counterfactual_reasoning
│   │   │   ├── CR-B_det-counterfactual_CN.json
│   │   │   ├── CR-B_det-counterfactual_EN.json
│   │   │   ├── CR-C_CRASS_CN.json
│   │   │   └── CR-C_CRASS_EN.json
│   │   ├── effect_of_the_treatment_on_the_treated
│   │   │   ├── ETT-B_ETT-natural_CN.json
│   │   │   ├── ETT-B_ETT-natural_EN.json
│   │   │   ├── ETT-P_ETT-basic_CN.json
│   │   │   ├── ETT-P_ETT-basic_EN.json
│   │   │   ├── ETT-P_ETT-hard_CN.json
│   │   │   └── ETT-P_ETT-hard_EN.json
│   │   ├── natural_direct_effect
│   │   │   ├── NDE-B_NDE-natural_CN.json
│   │   │   ├── NDE-B_NDE-natural_EN.json
│   │   │   ├── NDE-P_NDE-basic_CN.json
│   │   │   ├── NDE-P_NDE-basic_EN.json
│   │   │   ├── NDE-P_NDE-hard_CN.json
│   │   │   └── NDE-P_NDE-hard_EN.json
│   │   ├── natural_indirect_effect
│   │   │   ├── NIE-B_NIE-natural_CN.json
│   │   │   ├── NIE-B_NIE-natural_EN.json
│   │   │   ├── NIE-P_NIE-basic_CN.json
│   │   │   ├── NIE-P_NIE-basic_EN.json
│   │   │   ├── NIE-P_NIE-hard_CN.json
│   │   │   └── NIE-P_NIE-hard_EN.json
│   │   ├── probability_of_necessity
│   │   │   ├── PN-P_PN-basic_CN.json
│   │   │   ├── PN-P_PN-basic_EN.json
│   │   │   ├── PN-P_PN-hard_CN.json
│   │   │   └── PN-P_PN-hard_EN.json
│   │   └── probability_of_sufficiency
│   │       ├── PS-P_PS-basic_CN.json
│   │       ├── PS-P_PS-basic_EN.json
│   │       ├── PS-P_PS-hard_CN.json
│   │       └── PS-P_PS-hard_EN.json
│   └── intervention
│       ├── average_treatment_effect
│       │   ├── ATE-B_ATE-natural_CN.json
│       │   ├── ATE-B_ATE-natural_EN.json
│       │   ├── ATE-P_ATE-basic_CN.json
│       │   ├── ATE-P_ATE-basic_EN.json
│       │   ├── ATE-P_ATE-hard_CN.json
│       │   └── ATE-P_ATE-hard_EN.json
│       ├── backdoor_adjustment_set
│       │   ├── BAS-B_backadj_CN.json
│       │   ├── BAS-B_backadj_EN.json
│       │   ├── BAS-C_max-BAS_CN.json
│       │   ├── BAS-C_max-BAS_EN.json
│       │   ├── BAS-C_min-BAS_CN.json
│       │   ├── BAS-C_min-BAS_EN.json
│       │   ├── BAS-C_mix-BAS_CN.json
│       │   └── BAS-C_mix-BAS_EN.json
│       ├── causal_effect_identification
│       │   ├── CEI-B_0.2-UC_CN.json
│       │   ├── CEI-B_0.2-UC_EN.json
│       │   ├── CEI-B_0.4-UC_CN.json
│       │   ├── CEI-B_0.4-UC_EN.json
│       │   ├── CEI-B_0.6-UC_CN.json
│       │   ├── CEI-B_0.6-UC_EN.json
│       │   ├── CEI-B_0.8-UC_CN.json
│       │   └── CEI-B_0.8-UC_EN.json
│       ├── collider_bias
│       │   ├── CB-B_collider-bias_CN.json
│       │   └── CB-B_collider-bias_EN.json
│       ├── controlled_direct_effect
│       │   ├── CDE-B_CDE-natural_CN.json
│       │   ├── CDE-B_CDE-natural_EN.json
│       │   ├── CDE-P_CDE-basic_CN.json
│       │   ├── CDE-P_CDE-basic_EN.json
│       │   ├── CDE-P_CDE-hard_CN.json
│       │   └── CDE-P_CDE-hard_EN.json
│       ├── frontdoor_adjustment_set
│       │   ├── FAS-C_FAS_CN.json
│       │   └── FAS-C_FAS_EN.json
│       └── instrumental_variable
│           ├── IV-C_CaLM-IV_CN.json
│           └── IV-C_CaLM-IV_EN.json
├── documents
│   ├── calm-lite.md
│   ├── model_details.md
│   └── tasks.md
├── model_configs
│   ├── baichuan1_7b.json
│   ├── baichuan1_chat_13b.json
│   ├── chatgpt.json
│   ├── claude2.json
│   ├── default.json
│   └── gpt4.json
└── requirements.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
output/*
loggers/*
loggers
output
__pycache__/
*.pyc
*.pyo
*.pyd

--------------------------------------------------------------------------------
/assets/calm_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenCausaLab/CaLM/1c1e93a80c3f9b3250bd79faf4dee8f8bc3479dd/assets/calm_logo.png

--------------------------------------------------------------------------------
/assets/calm_logo_prev.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenCausaLab/CaLM/1c1e93a80c3f9b3250bd79faf4dee8f8bc3479dd/assets/calm_logo_prev.png

--------------------------------------------------------------------------------
/assets/calm_suite.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenCausaLab/CaLM/1c1e93a80c3f9b3250bd79faf4dee8f8bc3479dd/assets/calm_suite.png

--------------------------------------------------------------------------------
/assets/causal_task.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenCausaLab/CaLM/1c1e93a80c3f9b3250bd79faf4dee8f8bc3479dd/assets/causal_task.png
--------------------------------------------------------------------------------
/calm/data_processing/prompt/AR-B_CaLM-AR.py:
--------------------------------------------------------------------------------
base_prompt_dict = {"basic":"""Input Event: If %s.
Question: Does %s cause %s?
Answer (Yes or No ?): """,
    "basic-CN":"""输入信息:如果%s。
问题:%s是否导致%s?
答案(是或否?):""",
    "adversarial-ignore":"""Input Event: If %s.
Question: Does %s cause %s?
Answer (Yes or No ?): """,
    "adversarial-ignore-CN":"""输入信息:如果%s。
问题:%s是否导致%s?
答案(是或否?):""",
    "adversarial-doubt":"""Input Event: If %s.
Question: Does %s cause %s?
Answer (Yes or No ?): """,
    "adversarial-doubt-CN":"""输入信息:如果%s。
问题:%s是否导致%s?
答案(是或否?):""",
    "zero-shot-IcL":"""Answer questions based on causal relations in a given causal graph.
Input Event: If %s.
Question: Does %s cause %s?
Answer (Yes or No ?):""",
    "zero-shot-IcL-CN":"""根据给定因果图中的因果关系回答问题。
输入信息:如果%s。
问题:%s是否导致%s?
答案(是或否?):""",
    "one-shot-IcL":"""Answer questions based on causal relations in a given causal graph.
Input Event: If A causes D, A causes E, B causes E, C causes D, and D causes E.
Question: Does C cause A?
Answer (Yes or No ?): No

Input Event: If %s.
Question: Does %s cause %s?
Answer (Yes or No ?):""",
    "one-shot-IcL-CN":"""根据给定因果图中的因果关系回答问题。
输入信息:如果A导致D, A导致E, B导致E, C导致D, 以及D导致E。
问题:C是否导致A?
答案(是或否?):否

输入信息:如果%s。
问题:%s是否导致%s?
答案(是或否?):""",
    "three-shot-IcL":"""Answer questions based on causal relations in a given causal graph.
Input Event: If A causes D, A causes E, B causes E, C causes D, and D causes E.
Question: Does C cause A?
Answer (Yes or No ?): No

Input Event: If A causes B, A causes E, B causes E, B causes D, C causes E, and C causes D.
Question: Does C cause D?
Answer (Yes or No ?): Yes

Input Event: If A causes D, A causes C, B causes E, C causes D, and D causes E.
Question: Does E cause E?
Answer (Yes or No ?): No

Input Event: If %s.
Question: Does %s cause %s?
Answer (Yes or No ?):""",
    "three-shot-IcL-CN":"""根据给定因果图中的因果关系回答问题。
输入信息:如果A导致D, A导致E, B导致E, C导致D, 以及D导致E。
问题:C是否导致A?
答案(是或否?):否

输入信息:如果A导致B, A导致E, B导致E, B导致D, C导致E, 以及C导致D。
问题:C是否导致D?
答案(是或否?):是

输入信息:如果A导致D, A导致C, B导致E, C导致D, 以及D导致E。
问题:E是否导致E?
答案(是或否?):否

输入信息:如果%s。
问题:%s是否导致%s?
答案(是或否?):""",
    "zero-shot-CoT":"""Input Event: If %s.
Question: Does %s cause %s? Let's think step by step.
Answer (Yes or No ?):""",
    "zero-shot-CoT-CN":"""输入信息:如果%s。
问题:%s是否导致%s?请逐步思考。
答案(是或否?):""",
    "manual-CoT":"""Here are three examples of causal abstract reasoning using chain of thought, and a question to answer.

Input Event: If A causes D, A causes C, A causes B, B causes E, B causes D, and C causes D.
Question: Does A cause C?
Answer (Yes or No ?): The input states that A causes C. Therefore, the answer is Yes.

Input Event: If A causes B, A causes C, A causes D, B causes E, and C causes E.
Question: Does E cause A?
Answer (Yes or No ?): A is not caused by anything in the input, thus E does not cause A. Therefore, the answer is No.

Input Event: If A causes C, A causes H, A causes E, A causes B, B causes H, B causes G, B causes F, B causes C, C causes F, C causes H, C causes D, C causes E, D causes E, E causes G, E causes H, E causes F, F causes H, and F causes G.
Question: Does D cause F?
Answer (Yes or No ?): Given D causes E, and E causes F, such that D causes F. Therefore, the answer is Yes.

Input Event: If %s.
Question: Does %s cause %s?
Answer (Yes or No ?):
""",
    "manual-CoT-CN":"""如下为三个使用思维链进行推理的因果抽象推理的示例,和一个需要回答的问题。

输入信息:A导致D, A导致C, A导致B, B导致E, B导致D, 以及C导致D。
问题:A是否导致C?
答案(是或否?):输入信息中说明了A导致C。因此答案为“是”。

输入信息:A导致B, A导致C, A导致D, B导致E, 以及C导致E。
问题:E是否导致A?
答案(是或否?):输入信息中没有任何元素导致A,因此E没有导致A。因此答案为“否”。

输入信息:如果A导致C, A导致H, A导致E, A导致B, B导致H, B导致G, B导致F, B导致C, C导致F, C导致H, C导致D, C导致E, D导致E, E导致G, E导致H, E导致F, F导致H, 以及F导致G。
问题:D是否导致F?
答案(是或否?):D导致E,E导致F,所以D导致F。因此答案为“是”。

输入信息:如果%s。
问题:%s是否导致%s?
答案(是或否?):
""",
    "explicit-function":"""You are a helpful assistant for abstract reasoning.
Input Event: If %s.
Question: Does %s cause %s?
Answer (Yes or No ?):""",
    "explicit-function-CN":"""你是一个用于抽象推理的得力助手。
输入信息:如果%s。
问题:%s是否导致%s?
答案(是或否?):""",
}

def get_prompt(task_name, prompt_style, item, prompt_style_str=""):
    base = base_prompt_dict[prompt_style]

    prompt = prompt_style_str + base % (item["ar_edges"], item["former"], item["latter"])
    return prompt
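Each prompt module in calm/data_processing/prompt/ shown in this listing exposes the same get_prompt(task_name, prompt_style, item, prompt_style_str="") entry point; what varies per task is only the template dictionary and the item keys spliced into it (here "ar_edges", "former", and "latter"). A minimal usage sketch, not part of the repository, with a hypothetical item whose field values are made up (real items are loaded from the JSON files under calm_dataset/):

item = {
    # Hypothetical values; real ones come from AR-B_CaLM-AR_EN.json.
    "ar_edges": "A causes B, and B causes C",
    "former": "A",
    "latter": "C",
}
prompt = get_prompt("AR-B_CaLM-AR", "basic", item)
# Renders to:
#   Input Event: If A causes B, and B causes C.
#   Question: Does A cause C?
#   Answer (Yes or No ?):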
--------------------------------------------------------------------------------
/calm/data_processing/prompt/BAS-B_backadj.py:
--------------------------------------------------------------------------------
base_prompt_dict = {"basic":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "basic-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "adversarial-ignore":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "adversarial-ignore-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "adversarial-doubt":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "adversarial-doubt-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "zero-shot-IcL":"""Answer questions by considering what constitutes a valid adjustment set that can block all backdoor spurious correlations between two events.
Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "zero-shot-IcL-CN":"""通过考虑什么构成一个有效的调整集,以阻断两个事件之间所有后门伪相关,来回答问题。
输入信息:%s
问题:%s
答案(是或否?):""",
    "one-shot-IcL":"""Answer questions by considering what constitutes a valid adjustment set that can block all backdoor spurious correlations between two events.
Input Info: Method 1: We look at how husband correlates with alarm clock case by case according to wife. Method 2: We look directly at how husband correlates with alarm clock in general.
Question: To understand how husband affects alarm clock, is it more correct to use the Method 1 than Method 2?
Answer (Yes or No ?): no

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "one-shot-IcL-CN":"""通过考虑什么构成一个有效的调整集,以阻断两个事件之间所有后门伪相关,来回答问题。
输入信息:方法1:根据妻子的情况,我们逐个研究丈夫与闹钟之间的关联;方法2:我们直接研究一般情况下丈夫与闹钟之间的关联。
问题:要了解丈夫如何影响闹钟,使用方法1比方法2更准确吗?
答案(是或否?):否

输入信息:%s
问题:%s
答案(是或否?):""",
    "three-shot-IcL":"""Answer questions by considering what constitutes a valid adjustment set that can block all backdoor spurious correlations between two events.
Input Info: Method 1: We look at how husband correlates with alarm clock case by case according to wife. Method 2: We look directly at how husband correlates with alarm clock in general.
Question: To understand how husband affects alarm clock, is it more correct to use the Method 1 than Method 2?
Answer (Yes or No ?): no

Input Info: Method 1: We look directly at how husband correlates with alarm clock in general. Method 2: We look at this correlation case by case according to wife.
Question: To understand how husband affects alarm clock, is it more correct to use the Method 1 than Method 2?
Answer (Yes or No ?): yes

Input Info: Method 1: We look directly at how the man in the room correlates with room in general. Method 2: We look at this correlation case by case according to the candle.
Question: To understand how the man in the room affects room, is it more correct to use the Method 1 than Method 2?
Answer (Yes or No ?): yes

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "three-shot-IcL-CN":"""通过考虑什么构成一个有效的调整集,以阻断两个事件之间所有后门伪相关,来回答问题。
输入信息:方法1:根据妻子的情况,我们逐个研究丈夫与闹钟之间的关联;方法2:我们直接研究一般情况下丈夫与闹钟之间的关联。
问题:要了解丈夫如何影响闹钟,使用方法1比方法2更准确吗?
答案(是或否?):否

输入信息:方法1:我们直接研究一般情况下丈夫与闹钟之间的关联。方法2:根据妻子的情况,我们逐个研究这种关联。
问题:要了解丈夫如何影响闹钟,使用方法1比方法2更准确吗?
答案(是或否?):是

输入信息:方法1: 我们直接研究一般情况下房间里的男人与房间之间的关联;方法2:根据蜡烛,我们逐个研究这种关联。
问题:要了解房间里的男子如何影响房间,使用方法1比方法2更准确吗?
答案(是或否?):是

输入信息:%s
问题:%s
答案(是或否?):""",
    "zero-shot-CoT":"""Input Info: %s
Question: %s Let's think step by step.
Answer (Yes or No ?):""",
    "zero-shot-CoT-CN":"""输入信息:%s
问题:%s请逐步思考。
答案(是或否?):""",
    "manual-CoT":"""Here are three examples for problems about considering backdoor adjustment set with chain of thought.
Input Info: Method 1: We look directly at how jyka correlates with lirg in general. Method 2: We look at this correlation case by case according to gyzp.
Question: To understand how jyka affects lirg, is it more correct to use the Method 1 than Method 2?
Answer (Yes or No ?): Since gyzp is a confounder, both affects jyka and lirg, looking directly at the relation between jyka and lirg like Method 1 is not correct. Therefore, the answer is No.

Input Info: Method 1: We look directly at how encouragement level correlates with brown eyes in general. Method 2: We look at this correlation case by case according to studying habit.
Question: To understand how encouragement level affects brown eyes, is it more correct to use the Method 1 than Method 2?
Answer (Yes or No ?): Since studying habit is a result of encouragement level, there is no need to consider studying habit when studying the relation between encouragement level and brown eyes. Therefore, the answer is Yes.

Input Info: Method 1: We look directly at how zuph correlates with glimx in general. Method 2: We look at this correlation case by case according to zory.
Question: To understand how zuph affects glimx, is it more correct to use the Method 1 than Method 2?
Answer (Yes or No ?): Since zory is a confounder, both affects zuph and glimx, looking at the correlation without considering zory is not correct. Therefore, the answer is No.

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "manual-CoT-CN":"""如下为三个使用思维链进行推理的有关后门变量集合的问题:

输入信息:方法1: 我们直接研究一般情况下房间里的男人与房间之间的关联;方法2:根据蜡烛,我们逐个研究这种关联。
问题:要了解房间里的男子如何影响房间,使用方法1比方法2更准确吗?
答案(是或否?):因为房间里的男人和蜡烛对房间的影响是相互独立的,所以蜡烛不会影响房间里的男人和房间之间的关联。因此方法1更好。因此答案为“是”。

输入信息:方法1:我们直接研究一般情况下jyka与lirg之间的关联。方法2:根据gyzp,我们逐个研究这种关联。
问题:要了解gyzp如何影响lirg,使用方法1比方法2更准确吗?
答案(是或否?):因为gyzp作为混淆变量会同时影响jyka和lirg,使用方法1会导致对jyka和lirg之间的关联产生错误判断。因此答案为“否”。

输入信息:方法1:我们直接研究一般情况下鼓励程度与考试成绩之间的关联。方法2:根据学习习惯,我们逐个研究这种关联。
问题:要了解鼓励程度如何影响考试成绩,使用方法1比方法2更准确吗?
答案(是或否?):因为学习成绩是鼓励程度的结果,不会影响鼓励程度和考试成绩之间的关联。因此方法1更好。因此答案为“是”。

输入信息:%s
问题:%s
答案(是或否?)""",
    "explicit-function":"""You are a helpful assistant for backdoor adjustment set.
Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "explicit-function-CN":"""你是一个用于后门调节的得力助手。
输入信息:%s
问题:%s
答案(是或否?):""",
}

def get_prompt(task_name, prompt_style, item, prompt_style_str=""):
    base = base_prompt_dict[prompt_style]

    prompt = prompt_style_str + base % (item["given_info"], item["question"])
    return prompt
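Note that these prompt modules have hyphens in their filenames (e.g. BAS-B_backadj.py), so they cannot be imported with a plain import statement; they have to be resolved by file path. The repository's own loading logic lives elsewhere (calm/utils/load_items.py and calm/models/model_loader.py appear in the tree but are not shown in this excerpt), so the following is only an illustrative sketch of path-based loading with the standard library:

import importlib.util

def load_prompt_module(path):
    # Hyphenated filenames are not valid Python identifiers, so build a
    # module spec directly from the file path instead of using "import".
    spec = importlib.util.spec_from_file_location("prompt_module", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module

mod = load_prompt_module("calm/data_processing/prompt/BAS-B_backadj.py")
prompt = mod.get_prompt("BAS-B_backadj", "basic",
                        {"given_info": "Method 1: ...", "question": "..."})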
--------------------------------------------------------------------------------
/calm/data_processing/prompt/CB-B_collider-bias.py:
--------------------------------------------------------------------------------
base_prompt_dict = {"basic":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "basic-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "adversarial-ignore":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "adversarial-ignore-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "adversarial-doubt":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "adversarial-doubt-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "zero-shot-IcL":"""Answer questions about collider bias.
Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "zero-shot-IcL-CN":"""请回答有关碰撞偏见的问题。
输入信息:%s
问题:%s
答案(是或否?):""",
    "one-shot-IcL":"""Answer questions about collider bias.
Input Info: For people who are famous, the correlation between attractive appearance and talent is -0.08.
Question: If we look at people who are famous, does it mean that attractive appearance does not affect talent?
Answer (Yes or No ?):Yes.

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "one-shot-IcL-CN":"""请回答有关碰撞偏见的问题。
输入信息:对于那些出名的人来说,有吸引力的外表和才华之间的相关系数为-0.08。
问题:如果我们观察那些出名的人,这是否意味着有吸引力的外表不会影响才华?
答案(是或否?):是

输入信息:%s
问题:%s
答案(是或否?):""",
    "three-shot-IcL":"""Answer questions about collider bias.
Input Info: For people who are famous, the correlation between attractive appearance and talent is -0.08.
Question: If we look at people who are famous, does it mean that attractive appearance does not affect talent?
Answer (Yes or No ?):Yes.

Input Info: For people who are famous, the correlation between attractive appearance and talent is -0.16.
Question: If we look at people who are famous, does it mean that attractive appearance does not affect talent?
Answer (Yes or No ?): yes

Input Info: For people who are famous, the correlation between attractive appearance and talent is -0.23.
Question: If we look at people who are famous, does it mean that attractive appearance affects talent?
Answer (Yes or No ?): no

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "three-shot-IcL-CN":"""请回答有关碰撞偏见的问题。
输入信息:对于那些出名的人来说,有吸引力的外表和才华之间的相关系数为-0.08。
问题:如果我们观察那些出名的人,这是否意味着有吸引力的外表不会影响才华?
答案(是或否?):是

输入信息:对于那些出名的人来说,有吸引力的外表和才华之间的相关系数为-0.16。
问题:如果我们观察那些出名的人,这是否意味着有吸引力的外表不会影响才华?
答案(是或否?):是

输入信息:对于那些出名的人来说,有吸引力的外表和才华之间的相关系数为-0.23。
问题:如果我们观察那些出名的人,这是否意味着有吸引力的外表会影响才华?
答案(是或否?):否

输入信息:%s
问题:%s
答案(是或否?):""",
    "zero-shot-CoT":"""Input Info: %s
Question: %s Let's think step by step.
Answer (Yes or No ?):""",
    "zero-shot-CoT-CN":"""输入信息:%s
问题:%s 请逐步思考。
答案(是或否?):""",
    "manual-CoT":"""Here are eight examples of problems with collider bias answered with chain of thought.

Input Info: For people who are famous, the correlation between attractive appearance and talent is -0.08.
Question: If we look at people who are famous, does it mean that attractive appearance does not affect talent?
Answer (Yes or No ?): Both attractive appearance and talent have direct effects on fame. This collision creates a spurious association between attractive appearance and talent when analysis is limited to famous people. Therefore, the answer is Yes.

Input Info: For hospitalized individuals, the correlation between respiratory issues and broken bones is -0.25.
Question: If we look at hospitalized individuals, does it mean that respiratory issues affects broken bones?
Answer (Yes or No ?): Both respiratory issues and broken bones affect hospitalization status. This collision creates a spurious association between respiratory issues and broken bones when analysis is limited to hospitalized individuals. Therefore, the answer is No.

Input Info: For students accepted to elite institutions, the correlation between listening to jazz and being hard-working is -0.06.
Question: If we look at students accepted to elite institutions, does it mean that listening to jazz does not affect being hard-working?
Answer (Yes or No ?): Both listening to jazz and effort affect elite institution admission status. This collision creates a spurious association between listening to jazz and hard-working when analysis is limited to students accepted to elite institutions. Therefore, the answer is Yes.

Input Info: For those who are yupt, the correlation between jyka and kwox is 0.02.
Question: If we look at those who are yupt, does it mean that jyka does not affect kwox?
Answer (Yes or No ?): Both jyka and kwox affect yupt. This collision creates a spurious association between jyka and kwox when analysis is limited to those who are yupt. Therefore, the answer is Yes.

Input Info: For those who are zupj, the correlation between yupt and muvq is -0.15.
Question: If we look at those who are zupj, does it mean that yupt affects muvq?
Answer (Yes or No ?): Both yupt and muvq affect zupj. This collision creates a spurious association between yupt and muvq when analysis is limited to those who are zupj. Therefore, the answer is No.

Input Info: For those who are swoq, the correlation between kwox and kwoz is -0.25.
Question: If we look at those who are swoq, does it mean that kwox affects kwoz?
Answer (Yes or No ?): Both kwox and kwoz affect swoq. This collision creates a spurious association between kwox and kwoz when analysis is limited to those who are swoq. Therefore, the answer is No.

Input Info: For those who are wibl, the correlation between zuph and uvzi is -0.01.
Question: If we look at those who are wibl, does it mean that zuph affects uvzi?
Answer (Yes or No ?): Both zuph and uvzi affect wibl. This collision creates a spurious association between zuph and uvzi when analysis is limited to those who are wibl. Therefore, the answer is No.

Input Info: For those who are jyka, the correlation between zuph and glimx is -0.04.
Question: If we look at those who are jyka, does it mean that zuph does not affect glimx?
Answer (Yes or No ?): Both zuph and glimx affect jyka. This collision creates a spurious association between zuph and glimx when analysis is limited to those who are jyka. Therefore, the answer is Yes.

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "manual-CoT-CN":"""如下为三个使用思维链进行推理的对撞偏差问题:

输入信息:对于那些出名的人来说,有吸引力的外表和才华之间的相关系数为-0.08。
问题:如果我们观察那些出名的人,这是否意味着有吸引力的外表不会影响才华?
答案(是或否?):有吸引力的外表和才华都会影响名气。如果只分析出名的人,这些影响可能会造成有吸引力的外表和才华之间的虚假关系。因此答案为“是”。

输入信息:对于住院患者,呼吸问题与骨折之间的相关系数为-0.25。
问题:如果我们观察住院患者,这是否意味着呼吸问题会影响骨折?
答案(是或否?):呼吸问题和骨折都会导致患者住院。如果只分析住院患者,这些影响可能会造成呼吸问题和骨折之间的虚假关系。因此答案为“否”。

输入信息:对于那些swoq的人来说,kwox和kwoz之间的相关系数为-0.25。
问题:如果我们观察那些swoq的人,这是否意味着kwox会影响kwoz?
答案(是或否?):kwox和kwoz都会对swoq产生直接影响。如果只分析那些swoq的人,这些影响可能会造成kwox和kwoz之间的虚假关系。因此答案为“否”。

输入信息:%s
问题:%s
答案(是或否?)""",
    "explicit-function":"""You are a helpful assistant for collider bias analysis.
Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "explicit-function-CN":"""你是一个用于分析汇聚偏差的得力助手。
输入信息:%s
问题:%s
答案(是或否?):""",
}

def get_prompt(task_name, prompt_style, item, prompt_style_str=""):
    base = base_prompt_dict[prompt_style]

    prompt = prompt_style_str + base % (item["given_info"], item["question"])
    return prompt
--------------------------------------------------------------------------------
/calm/data_processing/prompt/CEG-O_E-CARE.py:
--------------------------------------------------------------------------------
base_prompt_dict = {"basic":"""Cause: %s
Effect: %s
Question: why the cause can lead to the effect ?
Answer:""",
    "basic-CN":"""原因:%s
结果:%s
问题:为什么原因会导致这样的结果?
答案:""",
    "adversarial-ignore":"""Cause: %s
Effect: %s
Question: why the cause can lead to the effect ?
Answer:""",
    "adversarial-ignore-CN":"""原因:%s
结果:%s
问题:为什么原因会导致这样的结果?
答案:""",
    "adversarial-doubt":"""Cause: %s
Effect: %s
Question: why the cause can lead to the effect ?
Answer:""",
    "adversarial-doubt-CN":"""原因:%s
结果:%s
问题:为什么原因会导致这样的结果?
答案:""",
    "zero-shot-IcL":"""generate explanations for causal relations between events.
Cause: %s
Effect: %s
Question: why the cause can lead to the effect ?
Answer:""",
    "zero-shot-IcL-CN":"""请生成事件之间因果关系的解释。
原因:%s
结果:%s
问题:为什么原因会导致这样的结果?
答案:""",
    "one-shot-IcL":"""generate explanations for causal relations between events.
Cause: The woman gave birth to a child.
Effect: The child brought psycho-physical phenomena on a new life.
Question: why the cause can lead to the effect ?
Answer: Birth is the arising of the psycho-physical phenomena.

Cause: %s
Effect: %s
Question: why the cause can lead to the effect ?
Answer:""",
    "one-shot-IcL-CN":"""请生成事件之间因果关系的解释。
原因:这位女士生下了一个孩子。
结果:这个孩子给新生活带来了心理-生理现象。
问题:为什么原因会导致这样的结果?
答案:出生是心理-生理现象的产生原因。

原因:%s
结果:%s
问题:为什么原因会导致这样的结果?
答案:""",
    "three-shot-IcL":"""generate explanations for causal relations between events.
Cause: The woman gave birth to a child.
Effect: The child brought psycho-physical phenomena on a new life.
Question: why the cause can lead to the effect ?
Answer: Birth is the arising of the psycho-physical phenomena.

Cause: Otters enter their new habitat.
Effect: Otters start looking for abalone for food.
Question: why the cause can lead to the effect ?
Answer: Abalone are one of the first food items taken by otters as they move into new habitat.

Cause: Lila loves classification of her things.
Effect: Lila can find what she wants quickly.
Question: why the cause can lead to the effect ?
Answer: Classifications yield accuracy.

Cause: %s
Effect: %s
Question: why the cause can lead to the effect ?
Answer:""",
    "three-shot-IcL-CN":"""请生成事件之间因果关系的解释。
原因:这位女士生下了一个孩子。
结果:这个孩子给生活带来了新的心理-生理现象。
问题:为什么原因会导致这样的结果?
答案:出生是心理-生理现象的起源。

原因:水獭进入它们的新栖息地。
结果:水獭开始寻找鲍鱼作为食物。
问题:为什么原因会导致这样的结果?
答案:鲍鱼是水獭搬进新栖息地时最先吃的食物之一。

原因:莉拉喜欢对她的东西进行分类。
结果:莉莉可以很快地找到她想要的东西。
问题:为什么原因会导致这样的结果?
答案:分类可以提高准确度。

原因:%s
结果:%s
问题:为什么原因会导致这样的结果?
答案:""",
    "zero-shot-CoT":"""Cause: %s
Effect: %s
Question: why the cause can lead to the effect ? Let's think step by step.
Answer:""",
    "zero-shot-CoT-CN":"""原因:%s
结果:%s
问题:为什么原因会导致这样的结果?请逐步思考。
答案:""",
    "manual-CoT":"""Here we will provide eight chain-of-thought exemplars, followed by a causal explanation generating question that needs to be answered with chain-of-thought.

Cause: His action led to the movement of the wheels.
Effect: The machine was set in motion.
Question: why the cause can lead to the effect?
Answer(with chain-of-thought): Movement results in motion. The initial movement caused by the action eventually builds up and transitions into the sustained motion of the machine.

Cause: All relatives entered the family room.
Effect: They sat on the chairs one by one.
Question: why the cause can lead to the effect?
Answer(with chain-of-thought): Chairs sit in family rooms. The presence of chairs in the family room sets the stage for the expected behavior of sitting down when relatives enter the room.

Cause: Seals are mammals.
Effect: They can live well in winter.
Question: why the cause can lead to the effect ? Let's think step by step.
Answer(with chain-of-thought): Seals are protected from the cold by a thick layer of blubber combined with a thick fur coat. Thus, they could withstand cold temperatures and maintain their body heat. This adaptation aligns with the effect of being able to live well in winter.

Cause: A stove is an enclosed space in which fuel is burned to provide heating.
Effect: Its surfaces protect people from hurting.
Question: why the cause can lead to the effect?
Answer(with chain-of-thought): Stoves have surfaces. Stove surfaces are a crucial safety feature that shields individuals from direct contact with the heat and flames generated during the burning of fuel inside the stove.

Cause: The student majored in medicine had to choose a research interest.
Effect: He chose Psychiatry.
Question: why the cause can lead to the effect?
Answer(with chain-of-thought): Psychiatry is a branch of medicine. The student's background in medicine makes Psychiatry a logical and suitable research interest.

Cause: The doctor told William that his eyesight was gradually losing.
Effect: The doctor used radiotherapy to treat William.
Question: why the cause can lead to the effect?
Answer(with chain-of-thought): Radiotherapy uses low dose radiation to stop the progression of vision loss on the retina. It is a medical intervention that can be utilized to address certain conditions causing vision loss on the retina.

Cause: The angel controls the Kingdom of Heaven.
Effect: Dominion is part of his responsibility.
Question: why the cause can lead to the effect?
Answer(with chain-of-thought): Dominion is a type of the Kingdom of Heaven. By controlling the Kingdom of Heaven, the angel's responsibilities include exercising authority and rule, which align with the concept of dominion.

Cause: The government published a new policy.
Effect: The public knew its meaning.
Question: why the cause can lead to the effect?
Answer(with chain-of-thought): Policy makes senses. Policies are constructed to convey information in a way that makes sense to the readers.

Cause: %s
Effect: %s
Question: why the cause can lead to the effect ?
Answer:""",
    "manual-CoT-CN":"""如下为三个使用思维链进行推理的问题:

原因:莱勒有眼科医生。
结果:莱尔的医生用激光治疗了他。
问题:为什么原因会导致这样的结果?
答案:眼科医生通常用激光治疗增生性视网膜病变。

原因:作者运用了拟人手法来描述无生命物体。
结果:读者觉得它好像有人类的能力。
问题:为什么原因会导致这样的结果?
答案:拟人手法是将无生命物体描述成具有人类特征的表达方式。

原因:约翰想种一棵半耐寒多年生植物。
结果:他种了蒲公英。
问题:为什么原因会导致这样的结果?
答案:蒲公英是半耐寒多年生植物。

原因:%s
结果:%s
问题:为什么原因会导致这样的结果?
答案:""",
    "explicit-function":"""You are a helpful assistant for causal explanation generation.
Cause: %s
Effect: %s
Question: why the cause can lead to the effect ?
Answer:""",
    "explicit-function-CN":"""你是一个用于因果解释生成的得力助手。
原因:%s
结果:%s
问题:为什么原因会导致这样的结果?
答案""",
}

def get_prompt(task_name, prompt_style, item, prompt_style_str=""):
    base = base_prompt_dict[prompt_style]

    prompt = prompt_style_str + base % (item["cause"], item["effect"])
    return prompt
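CEG-O is the one open-ended generation task in this excerpt: there is no single gold label to match exactly, and the open-ended scorer shown near the end of this listing (evaluation/accuracy/open-ended.py) averages ROUGE-L recall against the reference explanation. A standalone sketch of that computation, assuming the same rouge package the repository imports (the example strings are hypothetical):

from rouge import Rouge

pred = "Birth is the arising of the psycho-physical phenomena."
gold = "Birth gives rise to the psycho-physical phenomena."  # hypothetical reference
scores = Rouge().get_scores(pred, gold)
print(scores[0]["rouge-l"]["r"])  # ROUGE-L recall, the quantity compute_acc averages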
--------------------------------------------------------------------------------
/calm/data_processing/prompt/CORR-B_correlation.py:
--------------------------------------------------------------------------------
base_prompt_dict = {"basic":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "basic-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "adversarial-ignore":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "adversarial-ignore-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "adversarial-doubt":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "adversarial-doubt-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "zero-shot-IcL":"""Answer questions about correlation.
Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "zero-shot-IcL-CN":"""回答有关相关性的问题。
输入信息:%s
问题:%s
答案(是或否?):""",
    "one-shot-IcL":"""Answer questions about correlation.
Input Info: The overall probability of alarm set by husband is 0.74. The probability of alarm not set by husband and ringing alarm is 0.09. The probability of alarm set by husband and ringing alarm is 0.51.
Question: Is the chance of ringing alarm smaller when observing alarm set by husband?
Answer (Yes or No ?): No.

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "one-shot-IcL-CN":"""回答有关相关性的问题。
输入信息:丈夫设置闹钟的总体概率为74%%,丈夫未设置闹钟而闹钟响起的概率为9%%,丈夫设置闹钟且闹钟响起的概率为51%%。
问题:观察到丈夫设置闹钟是否会降低闹钟响铃的概率?
答案(是或否?):否

输入信息:%s
问题:%s
答案(是或否?):""",
    "three-shot-IcL":"""Answer questions about correlation.
Input Info: The overall probability of alarm set by husband is 0.74. The probability of alarm not set by husband and ringing alarm is 0.09. The probability of alarm set by husband and ringing alarm is 0.51.
Question: Is the chance of ringing alarm smaller when observing alarm set by husband?
Answer (Yes or No ?): No.

Input Info: The overall probability of alarm set by husband is 69%%. The probability of alarm not set by husband and ringing alarm is 15%%. The probability of alarm set by husband and ringing alarm is 38%%.
Question: Is the chance of ringing alarm larger when observing alarm set by husband?
Answer (Yes or No ?): yes

Input Info: The overall probability of alarm set by husband is 86%%. The probability of alarm not set by husband and ringing alarm is 7%%. The probability of alarm set by husband and ringing alarm is 71%%.
Question: Is the chance of ringing alarm larger when observing alarm set by husband?
Answer (Yes or No ?): yes

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "three-shot-IcL-CN":"""回答有关相关性的问题。
输入信息:丈夫设置闹钟的总体概率为74%%,丈夫未设置闹钟而闹钟响起的概率为9%%,丈夫设置闹钟且闹钟响起的概率为51%%。
问题:观察到丈夫设置闹钟是否会降低闹钟响铃的概率?
答案(是或否?):否

输入信息:丈夫设置闹钟的总体概率为69%%,丈夫未设置闹钟而闹钟响起的概率为15%%,丈夫设置闹钟且闹钟响起的概率为38%%。
问题:观察到丈夫设置闹钟是否会增加闹钟响铃的概率?
答案(是或否?):是

输入信息:丈夫设置闹钟的总体概率为86%%,丈夫未设置闹钟而闹钟响起的概率为7%%,丈夫设置闹钟且闹钟响起的概率为71%%。
问题:观察到丈夫设置闹钟是否会增加闹钟响铃的概率?
答案(是或否?):是

输入信息:%s
问题:%s
答案(是或否?):""",
    "zero-shot-CoT":"""Input Info: %s
Question: %s Let's think step by step.
Answer (Yes or No ?):""",
    "zero-shot-CoT-CN":"""输入信息:%s
问题:%s 请逐步思考。
答案(是或否?):""",
    "manual-CoT":"""Here are three examples of problems about considering correlation with chain of thought.

Input Info: The overall probability of encouragement is 13%%. The probability of discouragement and high exam score is 24%%. The probability of encouragement and high exam score is 9%%.
Question: Is the chance of high exam score larger when observing encouragement?
Answer (Yes or No ?): Let X = encouragement level; V2 = studying habit; Y = exam score. The causal relations are: X->V2,X->Y,V2->Y. P(X=1=1) = 0.51\nP(Y=1, X=0=1) = 0.16\nP(Y=1, X=1=1) = 0.33. P(X = 1, Y = 1)/P(X = 1) - P(X = 0, Y = 1)/P(X = 0)=0.33/0.51 - 0.16/0.49 = 0.32>0. Thus, the chance of high exam score is larger when observing encouragement. Therefore, the answer is Yes.

Input Info: The overall probability of high hospital bill is 53%%. The probability of low hospital bill and recovery is 34%%. The probability of high hospital bill and recovery is 16%%.
Question: Is the chance of recovery larger when observing high hospital bill?
Answer (Yes or No ?): Let V1 = age; X = hospital costs; Y = recovery. The causal relations are: V1->X,V1->Y,X->Y. P(X=1=1) = 0.53\nP(Y=1, X=0=1) = 0.34\nP(Y=1, X=1=1) = 0.16. P(X = 1, Y = 1)/P(X = 1) - P(X = 0, Y = 1)/P(X = 0)=0.16/0.53 - 0.34/0.47 = -0.43<0. Thus, the chance of recovery is not larger when observing high hospital bill. Therefore, the answer is No.

Input Info: The overall probability of male gender is 7%%. The probability of non-male gender and freckles is 34%%. The probability of male gender and freckles is 3%%.
Question: Is the chance of freckles smaller when observing male gender?
Answer (Yes or No ?): Let V2 = residency status; X = gender; V3 = department competitiveness; Y = freckles. The causal relations are: X->V3,V2->V3,X->Y,V2->Y,V3->Y. P(X=1=1) = 0.07\nP(Y=1, X=0=1) = 0.34\nP(Y=1, X=1=1) = 0.03. P(X = 1, Y = 1)/P(X = 1) - P(X = 0, Y = 1)/P(X = 0)=0.03/0.07 - 0.34/0.93 = 0.03>0. Thus, the chance of freckles is not smaller when observing male gender. Therefore, the answer is No.

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "manual-CoT-CN":"""如下为三个使用思维链进行推理的有关统计关联程度的问题:

输入信息:丈夫设置闹钟的总体概率为86%%,丈夫未设置闹钟而闹钟响起的概率为7%%,丈夫设置闹钟且闹钟响起的概率为71%%。
问题:观察到丈夫设置闹钟是否会增加闹钟响铃的概率?
答案(是或否?):令 X = 丈夫; V2 = 妻子; Y = 闹钟响。因果关系有:X->V2,X->Y,V2->Y。P(X=1=1) = 0.86\nP(Y=1, X=0=1) = 0.07\nP(Y=1, X=1=1) = 0.71。P(X = 1, Y = 1)/P(X = 1) - P(X = 0, Y = 1)/P(X = 0)=0.71/0.86 - 0.07/0.14 = 0.29>0。因此丈夫设置闹钟会增加闹钟响铃的概率。因此答案为“是”。

输入信息:进行美黑沙龙护理的总体概率为1%%,没有进行美黑沙龙护理但皮肤被晒黑的概率是22%%。进行美黑沙龙护理后皮肤被晒黑的概率为0%%。
问题:观察到进行美黑沙龙护理是否会增加皮肤被晒黑的概率?
答案(是或否?):令 V2 = 去海滩; X = 美黑沙龙护理; Y = 皮肤。因果关系有:X->Y,V2->Y。P(X=1=1) = 0.01\nP(Y=1, X=0=1) = 0.22\nP(Y=1, X=1=1) = 0.00。P(X = 1, Y = 1)/P(X = 1) - P(X = 0, Y = 1)/P(X = 0)=0.00/0.01 - 0.22/0.99 = 0.56>0。因此进行美黑沙龙护理会增加皮肤被晒黑的概率。因此答案为“是”。

输入信息:乘坐电梯的总体概率为34%%。走楼梯导致企鹅死亡的概率为30%%。乘坐电梯导致企鹅死亡的概率为16%%。
问题:观察到乘坐电梯是否会降低企鹅死亡的概率?
答案(是或否?):令 X = 我的决定; V2 = 企鹅的情绪; Y = 企鹅存活。因果关系有:X->V2,X->Y,V2->Y。P(X=1=1) = 0.34\nP(Y=1, X=0=1) = 0.30\nP(Y=1, X=1=1) = 0.16。P(X = 1, Y = 1)/P(X = 1) - P(X = 0, Y = 1)/P(X = 0)=0.35/0.60 - 0.23/0.40 = 0.01>0。因此乘坐电梯不会降低企鹅死亡的概率。因此答案为“否”。

输入信息:%s
问题:%s
答案(是或否?):""",
    "explicit-function":"""You are a helpful assistant for identifying correlation.
Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "explicit-function-CN":"""你是一个识别相关关系的得力助手。
输入信息:%s
问题:%s
答案(是或否?):""",
}

def get_prompt(task_name, prompt_style, item, prompt_style_str=""):
    base = base_prompt_dict[prompt_style]

    prompt = prompt_style_str + base % (item["given_info"], item["question"])
    return prompt
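Note the doubled percent signs throughout these templates (74%%, 9%%, and so on): because get_prompt fills the template with Python's % operator, a literal percent sign must be escaped as %%, and it renders back to a single % after substitution. A two-line demonstration (the sentence is just an example value):

base = "The overall probability of %s is 74%%."
print(base % ("alarm set by husband",))
# -> The overall probability of alarm set by husband is 74%.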
--------------------------------------------------------------------------------
/calm/data_processing/prompt/CR-B_det-counterfactual.py:
--------------------------------------------------------------------------------
base_prompt_dict = {"basic":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "basic-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "adversarial-ignore":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "adversarial-ignore-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "adversarial-doubt":"""Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "adversarial-doubt-CN":"""输入信息:%s
问题:%s
答案(是或否?):""",
    "zero-shot-IcL":"""Answer questions about deterministic counterfactual.
Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "zero-shot-IcL-CN":"""请回答有关确定性反事实的问题。
输入信息:%s
问题:%s
答案(是或否?):""",
    "one-shot-IcL":"""Answer questions about deterministic counterfactual.
Input Info: We know that alarm set by husband causes alarm not set by wife. alarm set by husband or alarm set by wife causes ringing alarm.
Question: Would the alarm rings the next morning if alarm not set by husband instead of alarm set by husband?
Answer (Yes or No ?): Yes

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "one-shot-IcL-CN":"""请回答有关确定性反事实的问题。
输入信息:我们知道丈夫设置闹钟会导致妻子没有设置闹钟,丈夫设置闹钟或妻子设置闹钟会导致闹钟响铃。
问题:如果丈夫没有设置闹钟,而不是丈夫设置闹钟,第二天早上闹钟会响吗?
答案(是或否?):是

输入信息:%s
问题:%s
答案(是或否?):""",
    "three-shot-IcL":"""Answer questions about deterministic counterfactual.
Input Info: We know that alarm set by husband causes alarm not set by wife. alarm set by husband or alarm set by wife causes ringing alarm.
Question: Would the alarm rings the next morning if alarm not set by husband instead of alarm set by husband?
Answer (Yes or No ?): Yes

Input Info: We know that alarm set by husband causes alarm set by wife. alarm set by husband or alarm set by wife causes ringing alarm.
Question: Would the alarm rings the next morning if alarm not set by husband instead of alarm set by husband?
Answer (Yes or No ?): no

Input Info: We know that alarm set by husband causes alarm set by wife. alarm set by husband or alarm set by wife causes ringing alarm.
Question: Would the alarm doesn't ring the next morning if alarm set by husband instead of alarm not set by husband?
Answer (Yes or No ?): no

Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "three-shot-IcL-CN":"""请回答有关确定性反事实的问题。
输入信息:我们知道丈夫设置闹钟会导致妻子没有设置闹钟,丈夫设置闹钟或妻子设置闹钟会导致闹钟响铃。
问题:如果丈夫没有设置闹钟,而不是丈夫设置闹钟,第二天早上闹钟会响吗?
答案(是或否?):是

输入信息:我们知道丈夫设置闹钟会导致妻子设置闹钟,丈夫设置闹钟或妻子设置闹钟会导致闹钟响铃。
问题:如果丈夫没有设置闹钟,而不是丈夫设置闹钟,第二天早上闹钟会响吗?
答案(是或否?):否

输入信息:我们知道丈夫设置闹钟会导致妻子设置闹钟,丈夫设置闹钟或妻子设置闹钟会导致闹钟响铃。
问题:如果是丈夫设置闹钟,而不是丈夫没有设置闹钟,第二天早上闹钟不会响吗?
答案(是或否?):否

输入信息:%s
问题:%s
答案(是或否?):""",
    "zero-shot-CoT":"""Input Info: %s
Question: %s Let's think step by step.
Answer (Yes or No ?):""",
    "zero-shot-CoT-CN":"""输入信息:%s
问题:%s请逐步思考。
答案(是或否?):""",
    "manual-CoT":"""Here are three examples of problems about deterministic counterfactual with chain of thought.

Input Info: We know that having a sister causes the corporal shooting and the private not shooting. the corporal shooting and the private shooting causes the prisoner's death.
Question: Would the prisoner is dead if not having a sister instead of having a sister?
Answer (Yes or No ?): Let X = having a sister; V3 = the private; V2 = the corporal; Y = prisoner. The causal relations are: X->V3,X->V2,V2->Y,V3->Y. Set Y_{X=0} = 1 | , then solve for Y, given the evidence and the action. V2 = X\nV3 = not V2\nY = V2 and V3. Then we get Y = [0] = 0 and 1. Thus, the prisoner would not be dead if not having a sister instead of having a sister. Therefore, the answer is No.

Input Info: We know that citrus intake causes vitamin C deficiency, and we know that sufficient vitamin C causes straight hair.
Question: Would the patient has curly hair if citrus intake instead of absence of citrus?
Answer (Yes or No ?): Let X = eating citrus; V2 = vitmain C; Y = curly hair. The causal relations are: X->V2,V2->Y. Set Y_{X=1} = 1 | , then solve for Y, given the evidence and the action. V2 = not X\nY = not V2. Then we get Y = [1] = not 0. Thus, the patient would have curly hair if citrus intake instead of absence of citrus. Therefore, the answer is Yes.

Input Info: We know that zuph causes not rixq. zuph and rixq causes xevu. We observed an individual is zuph.
Question: Would an individual is not xevu if not rixq instead of rixq?
Answer (Yes or No ?): Let V1 = zuph; X = rixq; Y = xevu. The causal relations are: V1->X,V1->Y,X->Y. Set Y_{X=0} = 0 | V1=1, then solve for Y, given the evidence and the action. V1 = 1\nX = not V1\nY = V1 and X. Then we get Y = 0 = 1 and 0. Thus, an individual would not be xevu if not rixq instead of rixq. Therefore, the answer is Yes.

Input Info: %s
Question: %s
Answer (Yes or No ?):
""",
    "manual-CoT-CN":"""如下为三个使用思维链进行推理的有关反事实的问题:

输入信息:我们知道丈夫设置闹钟会导致妻子设置闹钟,丈夫设置闹钟或妻子设置闹钟会导致闹钟响铃。
问题:如果丈夫没有设置闹钟,而不是丈夫设置闹钟,第二天早上闹钟会响吗?
答案(是或否?):令 X = 丈夫; V2 = 妻子; Y = 闹钟响铃; 该问题下因果关系有:X->V2,X->Y,V2->Y。令Y_{X=0} = 1 | , 在已知事实和动作下求解Y。V2 = X\nY = X or V2。解得Y = 0 = 0 or 0。因此如果丈夫没有设置闹钟,而不是丈夫设置闹钟,第二天早上闹钟不会响。因此答案为“否”。

输入信息:我们知道晚起床和交通拥堵会导致准时到校,我们观察到路上有严重的交通堵塞。
问题:如果爱丽丝晚起床而不是准时起床,她会上学迟到吗?
答案(是或否?):令 V2 = 交通; X = 爱丽丝起床; Y = 爱丽丝到学校; 该问题下因果关系有:X->Y,V2->Y。令Y_{X=1} = 0 | V2=1,在在已知事实和动作下求解Y。V2 = 1\nY = X and V2。解得Y = 1 = 1 and 1。因此如果爱丽丝晚起床而不是准时起床,她不会上学迟到。因此答案为“否”。

输入信息:我们知道摄入柑橘会导致维生素C缺乏,我们也知道摄入足够的维生素C会导致坏血病。
问题:如果患者摄入柑橘而不是不摄入柑橘,他会从坏血病中康复吗?
答案(是或否?):令 X = 摄入柑橘; V2 = 维生素C; Y = 坏血病; 该问题下因果关系有:X->V2,V2->Y. Set Y_{X=1} = 0 | ,在在已知事实和动作下求解Y。V2 = not X\nY = V2。解得Y = [0] = 0。因此如果患者摄入柑橘而不是不摄入柑橘,他会从坏血病中康复。因此答案为“是”。

输入信息:%s
问题:%s
答案(是或否?):""",
    "explicit-function":"""You are a helpful assistant for deterministic counterfactual.
Input Info: %s
Question: %s
Answer (Yes or No ?):""",
    "explicit-function-CN":"""你是用于决定论反事实的得力助手。
输入信息:%s
问题:%s
答案(是或否?)""",
}

def get_prompt(task_name, prompt_style, item, prompt_style_str=""):
    base = base_prompt_dict[prompt_style]

    prompt = prompt_style_str + base % (item["given_info"], item["question"])
    return prompt
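The manual-CoT exemplars above answer each counterfactual by solving the structural equations under the hypothetical assignment. As an illustration only (not repository code), the citrus exemplar's computation, V2 = not X followed by Y = not V2, can be written out directly:

def counterfactual_curly_hair(citrus_intake):
    # Structural equations from the exemplar:
    #   V2 = not X   (citrus intake causes vitamin C deficiency)
    #   Y  = not V2  (sufficient vitamin C causes straight hair)
    vitamin_c = not citrus_intake
    curly_hair = not vitamin_c
    return curly_hair

print(counterfactual_curly_hair(True))   # True, so the exemplar answers "Yes"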
48 | 答案(是或否?):是 49 | 事件一:%s 50 | 事件二:%s 51 | 问题:事件一和事件二之间是否存在因果关系? 52 | 答案(是或否?):""", 53 | "three-shot-IcL":"""determine whether there is a causal relationship between the two input events. 54 | Event A: My body cast a shadow over the grass. 55 | Event B: The sun was rising. 56 | Question: is there a causal relationship between Event A and Event B ? 57 | Answer (Yes or No ?): Yes 58 | 59 | Event A: The politician lost the election. 60 | Event B: He ran negative campaign ads. 61 | Question: is there a causal relationship between Event A and Event B ? 62 | Answer (Yes or No ?): No 63 | 64 | Event A: The physician misdiagnosed the patient. 65 | Event B: The patient filed a malpractice lawsuit against the physician. 66 | Question: is there a causal relationship between Event A and Event B ? 67 | Answer (Yes or No ?): Yes 68 | 69 | Event A: %s 70 | Event B: %s 71 | Question: is there a causal relationship between Event A and Event B ? 72 | Answer (Yes or No ?):""", 73 | "three-shot-IcL-CN":"""确定两个输入事件之间是否存在因果关系。 74 | 事件一:我的身体投下了阴影,落在草地上。 75 | 事件二:太阳正在升起。 76 | 问题:事件一和事件二之间是否存在因果关系? 77 | 答案(是或否?):是 78 | 79 | 事件一:政治家在选举中落败了。 80 | 事件二:他播放了负面竞选广告。 81 | 问题:事件一和事件二之间是否存在因果关系? 82 | 答案(是或否?):否 83 | 84 | 事件一:这位医生误诊了病人。 85 | 事件二:病人向医生提起了医疗事故诉讼。 86 | 问题:事件一和事件二之间是否存在因果关系? 87 | 答案(是或否?):是 88 | 89 | 事件一:%s 90 | 事件二:%s 91 | 问题:事件一和事件二之间是否存在因果关系? 92 | 答案(是或否?):""", 93 | "zero-shot-CoT":"""Event A: %s 94 | Event B: %s 95 | Question: is there a causal relationship between Event A and Event B ? Let's think step by step. 96 | Answer (Yes or No ?):""" 97 | , 98 | "zero-shot-CoT-CN":"""事件一:%s 99 | 事件二:%s 100 | 问题:事件一和事件二之间是否存在因果关系?请逐步思考。 101 | 答案(是或否?):""" 102 | , 103 | "manual-CoT":"""Here we will provide eight chain-of-thought exemplars, followed by a binary question that needs to be answered. 104 | 105 | Event A: My body cast a shadow over the grass. 106 | Event B: The sun was rising. 107 | Question: is there a causal relationship between Event A and Event B ? 108 | Answer(yes or no with chain of thought): The shadow is mostly being cast by the speaker’s body. There must be a light source in the correct position to form the shadow. And the sun is the most plausible cause of the shadow. Thus, Event B may be the cause of Event A. Therefore, the answer is yes. 109 | 110 | Event A: I hung up the phone. 111 | Event B: The caller identified himself to me. 112 | Question: is there a causal relationship between Event A and Event B ? 113 | Answer(yes or no with chain of thought): People always hung up the phone after the ending of their conversation, while they always identify themselves at the beginning of the call. Therefore, the answer is no. 114 | 115 | Event A: The cook stirred the ingredients in the bowl. 116 | Event B: The ingredients melted. 117 | Question: is there a causal relationship between Event A and Event B ? 118 | Answer(yes or no with chain of thought): Stirring is a common method used in cooking to blend and mix ingredients. But melting ingredients always need high temperature, which can not be brought by stirring. Therefore, the answer is no. 119 | 120 | Event A: The book became a huge bestseller. 121 | Event B: It was adapted into a movie. 122 | Question: is there a causal relationship between Event A and Event B ? 123 | Answer(yes or no with chain of thought): When a book becomes a huge bestseller, it often attracts the attention of filmmakers and can lead to movie adaptations, and authors generally gain more recognition and fame. Thus, Event B may be the effect of Event A. 
Therefore, the answer is yes. 124 | 125 | Event A: The man anticipated cold weather on his trip. 126 | Event B: He travelled with a big suitcase. 127 | Question: is there a causal relationship between Event A and Event B ? 128 | Answer(yes or no with chain of thought): When someone expects cold weather, they may pack warm clothes or other things to keep warm, but that by itself does not imply taking a big suitcase. Therefore, the answer is no. 129 | 130 | Event A: I turned on the fan. 131 | Event B: I felt cool air pass over me. 132 | Question: is there a causal relationship between Event A and Event B ? 133 | Answer(yes or no with chain of thought): A typical function of a fan is to circulate air and create a cooling effect. Thus, Event B may be the effect of Event A. Therefore, the answer is yes. 134 | 135 | Event A: The woman struggled to walk. 136 | Event B: She wore high heels. 137 | Question: is there a causal relationship between Event A and Event B ? 138 | Answer(yes or no with chain of thought): High heels can be uncomfortable and challenging to walk in for some individuals. Thus, Event B may be the cause of Event A. Therefore, the answer is yes. 139 | 140 | Event A: I vacuumed the carpet. 141 | Event B: My roommate spilled punch. 142 | Question: is there a causal relationship between Event A and Event B ? 143 | Answer(yes or no with chain of thought): Vacuum cleaners generally can't handle liquids like punch, so the vacuuming is unlikely to be a cause or an effect of the spill. Therefore, the answer is no. 144 | 145 | Event A: %s 146 | Event B: %s 147 | Question: is there a causal relationship between Event A and Event B ? 148 | Answer (Yes or No ?): """ 149 | , 150 | "manual-CoT-CN":"""如下为三个使用思维链进行推理的问题: 151 | 152 | 事件一:那个女孩许了一个愿望。 153 | 事件二:她看到了一只黑猫。 154 | 问题:事件一和事件二之间是否存在因果关系? 155 | 答案(是或否?):看到一只黑猫通常不会导致人们许愿,因此答案是“否”。 156 | 157 | 事件一:龙卷风袭击了这座城镇。 158 | 事件二:法院大楼的屋顶被吹掉了。 159 | 问题:事件一和事件二之间是否存在因果关系? 160 | 答案(是或否?):龙卷风通常会带来强风,破坏建筑物,因此答案是“是”。 161 | 162 | 事件一:商店收银员叫保安了。 163 | 事件二:客户使用了假钞。 164 | 问题:事件一和事件二之间是否存在因果关系? 165 | 答案(是或否?):商店收银员叫保安通常是因为有可疑和异常情况,包括客户用假钞,因此答案是“是”。 166 | 167 | 事件一:%s 168 | 事件二:%s 169 | 问题:事件一和事件二之间是否存在因果关系? 170 | 答案(是或否?):""" 171 | , 172 | "explicit-function":"""You are a helpful assistant for causal discovery. 173 | Event A: %s 174 | Event B: %s 175 | Question: is there a causal relationship between Event A and Event B ? 176 | Answer (Yes or No ?):""", 177 | "explicit-function-CN":"""你是一个用于因果发现的得力助手。 178 | 事件一:%s 179 | 事件二:%s 180 | 问题:事件一和事件二之间是否存在因果关系? 
181 | 答案(是或否?):""", 182 | } 183 | def get_prompt(task_name, prompt_style, item, prompt_style_str=""): 184 | base = base_prompt_dict[prompt_style] 185 | 186 | prompt = prompt_style_str + base % (item["premise"], item["hypothesis"]) 187 | return prompt -------------------------------------------------------------------------------- /calm/evaluation/accuracy/choice.py: -------------------------------------------------------------------------------- 1 | def compute_acc(gt_list, pred_list): 2 | correct_num = sum(pred == gt for gt, pred in zip(gt_list, pred_list)) 3 | acc = correct_num / len(gt_list) 4 | return acc -------------------------------------------------------------------------------- /calm/evaluation/accuracy/open-ended.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import jieba 3 | from rouge import Rouge 4 | 5 | def is_chinese(text): 6 | for char in text: 7 | if '\u4e00' <= char <= '\u9fff': 8 | return True 9 | return False 10 | 11 | def compute_acc(gt_list, pred_list): 12 | # Average ROUGE-L recall over the dataset; Chinese text is word-segmented with jieba first. 13 | rouge_l = 0 14 | rouge = Rouge() 15 | 16 | for pred, gold in zip(pred_list, gt_list): 17 | if is_chinese(pred): 18 | prediction = " ".join(jieba.cut(pred)) 19 | gold = " ".join(jieba.cut(gold)) 20 | else: 21 | prediction = pred 22 | 23 | try: 24 | scores = rouge.get_scores(prediction, gold) 25 | rouge_l += scores[0]['rouge-l']['r'] 26 | except Exception: # skip pairs that rouge cannot score, e.g. empty strings 27 | continue 28 | avg_rougel = rouge_l / len(gt_list) 29 | return avg_rougel 30 | -------------------------------------------------------------------------------- /calm/evaluation/accuracy/prob.py: -------------------------------------------------------------------------------- 1 | def compute_acc(gt_list, pred_list): 2 | # A prediction is correct if it matches the gold probability after rounding to four decimal places. 3 | correct_num = 0 4 | for pred, gold in zip(pred_list, gt_list): 5 | kept_pred = round(pred, 4) if pred is not None else None 6 | kept_gold = round(gold, 4) 7 | if kept_pred == kept_gold: 8 | correct_num += 1 9 | acc = correct_num / len(gt_list) 10 | return acc 11 | -------------------------------------------------------------------------------- /calm/evaluation/aggregate_metrics.py: -------------------------------------------------------------------------------- 1 | from evaluation.element_properties.model_info import limited_model_list 2 | def eval_understandability(third_quartile, median, random_guess): 3 | if third_quartile < random_guess: 4 | understanding = "very hard" 5 | elif median < random_guess: 6 | understanding = "hard" 7 | elif median >= random_guess: 8 | understanding = "easy" 9 | else: 10 | raise ValueError("The understanding is not defined") 11 | 12 | return understanding 13 | 14 | def eval_solvability(max_value, max_average_value, third_max_average_value, random_guess=0.0): 15 | if max_value < random_guess: 16 | solvability = "unsolvable" 17 | elif max_value >= random_guess and max_value < 80: 18 | solvability = "challenging" 19 | elif max_value >= random_guess and max_value >= 80 and max_average_value < 70: 20 | solvability = "potentially solvable" 21 | elif max_value >= random_guess and max_value >= 80 and max_average_value >= 70 and third_max_average_value < 70: 22 | solvability = "solvable" 23 | elif max_value >= random_guess and max_value >= 80 and max_average_value >= 70 and third_max_average_value >= 70: 24 | solvability = "well-solved" 25 | else: 26 | raise ValueError("The solvability is not defined") 27 | 28 | return solvability 29 | 30 | def
eval_open_limited_gap(top_5_average_model): 31 | count_limited = sum([1 for i in top_5_average_model if i in limited_model_list]) 32 | if count_limited <= 3: 33 | gap = "small" 34 | elif count_limited == 4: 35 | gap = "moderate" 36 | elif count_limited == 5: 37 | gap = "large" 38 | else: 39 | raise ValueError("The gap is not defined") 40 | return gap -------------------------------------------------------------------------------- /calm/evaluation/element_properties/model_info.py: -------------------------------------------------------------------------------- 1 | limited_model_list = ["text-ada-001","text-babbage-001","ada (0.35B)","babbage (1.3B)", "curie (6.7B)","davinci (175B)","text-davinci-003","GPT-3.5-Turbo","text-curie-001","Claude2","text-davinci-002","text-davinci-001","GPT-4"] # TODO: if your tested model is limited, add to the limited model list 2 | -------------------------------------------------------------------------------- /calm/evaluation/element_properties/random_guess.py: -------------------------------------------------------------------------------- 1 | task_random_guess_value = { 2 | # association/ 3 | # correlation/ 4 | "CORR-B_correlation_CN":50., 5 | "CORR-B_correlation_EN":50., 6 | # explaining_away_effect/ 7 | "EAE-B_exp-away_CN":50., 8 | "EAE-B_exp-away_EN":50., 9 | # causal_discovery/ 10 | # abstract_reasoning/ 11 | "AR-B_CaLM-AR_CN":50., 12 | "AR-B_CaLM-AR_EN":50., 13 | # causal_attribution/ 14 | "CA-B_FA_CN":50., 15 | "CA-B_FA_EN":50., 16 | "CA-B_FP_CN":50., 17 | "CA-B_FP_EN":50., 18 | # event_causality_identification/ 19 | "ECI-B_CTB_CN":50., 20 | "ECI-B_CTB_EN":50., 21 | "ECI-B_ESC_CN":50., 22 | "ECI-B_ESC_EN":50., 23 | "ECI-B_MAVEN-ERE_CN":50., 24 | "ECI-B_MAVEN-ERE_EN":50., 25 | # pairwise_causal_discovery/ 26 | "PCD-B_COPA_CN":50., 27 | "PCD-B_COPA_EN":50., 28 | "PCD-B_E-CARE_CN":50., 29 | "PCD-B_E-CARE_EN":50., 30 | "PCD-C_COPA_CN":50., 31 | "PCD-C_COPA_EN":50., 32 | "PCD-C_E-CARE_CN":50., 33 | "PCD-C_E-CARE_EN":50., 34 | # counterfactual/ 35 | # actual_causality/ 36 | "AC-B_causal_judgement_CN":50., 37 | "AC-B_causal_judgement_EN":50., 38 | # counterfactual_reasoning/ 39 | "CR-B_det-counterfactual_CN":50., 40 | "CR-B_det-counterfactual_EN":50., 41 | "CR-C_CRASS_CN":25., 42 | "CR-C_CRASS_EN":25., 43 | # effect_of_the_treatment_on_the_treated/ 44 | "ETT-B_ETT-natural_CN":50., 45 | "ETT-B_ETT-natural_EN":50., 46 | # natural_direct_effect/ 47 | "NDE-B_NDE-natural_CN":50., 48 | "NDE-B_NDE-natural_EN":50., 49 | # natural_indirect_effect/ 50 | "NIE-B_NIE-natural_CN":50., 51 | "NIE-B_NIE-natural_EN":50., 52 | # intervention/ 53 | # average_treatment_effect/ 54 | "ATE-B_ATE-natural_CN":50., 55 | "ATE-B_ATE-natural_EN":50., 56 | # backdoor_adjustment_set/ 57 | "BAS-B_backadj_CN":50., 58 | "BAS-B_backadj_EN":50., 59 | "BAS-C_max-BAS_CN":33.3, 60 | "BAS-C_max-BAS_EN":33.3, 61 | "BAS-C_min-BAS_CN":33.3, 62 | "BAS-C_min-BAS_EN":33.3, 63 | "BAS-C_mix-BAS_CN":33.3, 64 | "BAS-C_mix-BAS_EN":33.3, 65 | # causal_effect_identification/ 66 | "CEI-B_0.2-UC_CN":50., 67 | "CEI-B_0.2-UC_EN":50., 68 | "CEI-B_0.4-UC_CN":50., 69 | "CEI-B_0.4-UC_EN":50., 70 | "CEI-B_0.6-UC_CN":50., 71 | "CEI-B_0.6-UC_EN":50., 72 | "CEI-B_0.8-UC_CN":50., 73 | "CEI-B_0.8-UC_EN":50., 74 | # collider_bias/ 75 | "CB-B_collider-bias_CN":50., 76 | "CB-B_collider-bias_EN":50., 77 | # controlled_direct_effect/ 78 | "CDE-B_CDE-natural_CN":50., 79 | "CDE-B_CDE-natural_EN":50., 80 | # frontdoor_adjustment_set/ 81 | "FAS-C_FAS_CN":33.3, 82 | "FAS-C_FAS_EN":33.3, 83 | # instrumental_variable/ 84 | 
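# chance accuracy for the three-option choice tasks in this file is 100/3, i.e. roughly 33.3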
"IV-C_CaLM-IV_CN":33.3, 85 | "IV-C_CaLM-IV_EN":33.3, 86 | } -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/AC-B_causal_judgement.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是","- yes","- 是","- no","- 否")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","question: how would a typical person answer each of the following questions about causation?", "答案(是或否?)", "问题:对于以下关于因果关系的问题,一个普通人会怎么回答?"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 34 | "All No" if all(pred == 0 for pred in preds) else 0 35 | return abnormalities 36 | 37 | -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/AR-B_CaLM-AR.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","input event: if", "输入信息:如果", "答案(是或否?)"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 34 | "All No" if all(pred == 0 for pred in preds) else 0 35 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/AS.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("option 1","option 2","option 3","选项一","选项二","选项三")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | def check_repetition(model_response): 15 | if any(response in model_response for response in ["answer (option 1 or option 2 or option 3 ?)","you will be presented with a causal graph in the following form:", "答案(选项一或选项二或选项三?)", "给定如下因果图"]): 16 | return 1 17 | else: 18 | 
return 0 19 | 20 | def contains_chinese(text): 21 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 22 | result = 1 if chinese_pattern.search(text) is not None else 0 23 | 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All option1" if all(pred == 1 for pred in preds) else \ 34 | "All option2" if all(pred == 2 for pred in preds) else \ 35 | "All option3" if all(pred == 3 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/CA-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 34 | "All No" if all(pred == 0 for pred in preds) else 0 35 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/CEI-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/CLADDER.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, 
prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","input info", "输入信息:", "答案(是或否?)"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/CR-C_CRASS.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("1","2","3","4","option 1","option 2","option 3","option 4","选项一","选项二","选项三","选项四")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | def check_repetition(model_response): 15 | if any(response in model_response for response in ["answer (option 1 or 2 or 3 or 4?)","input event:", "答案(选项一或选项二或选项三或选项四?)", "输入事件:"]): 16 | return 1 17 | else: 18 | return 0 19 | 20 | def contains_chinese(text): 21 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 22 | result = 1 if chinese_pattern.search(text) is not None else 0 23 | 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All option1" if all(pred == 1 for pred in preds) else \ 34 | "All option2" if all(pred == 2 for pred in preds) else \ 35 | "All option3" if all(pred == 3 for pred in preds) else \ 36 | "All option4" if all(pred == 4 for pred in preds) else 0 37 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/ECI.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if 
english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/Natural.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("{\"answer\":")) and model_response.endswith(("}")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["input info: imagine a self-contained","provide the calculation result to four decimal places and a final \"yes\" or \"no\" answer in json format", "输入信息:设想一个", "请根据上述信息,给出计算结果(答案保留四位小数)"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{7,}') # Taking into account 'fake' and 'random' modes, and considering that the shortest occurrence of English characters in an 'answer' is of length 6, therefore detecting lengths of 7 or more. 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All Yes" if all(pred == "yes" or pred == "是" for pred in preds) else \ 34 | "All No" if all(pred == "no" or pred == "否" for pred in preds) else 0 35 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/PCD-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | def check_repetition(model_response): 15 | if any(response in model_response for response in ["answer (yes or no ?)","event a:", "答案(是或否?)", "事件一:"]): 16 | return 1 17 | else: 18 | return 0 19 | 20 | def contains_chinese(text): 21 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 22 | result = 1 if chinese_pattern.search(text) is not None else 0 23 | return result 24 | 25 | def contains_english(text): 26 | english_pattern = re.compile(r'[A-Za-z]{2,}') 27 | result = 1 if english_pattern.search(text) is not None else 0 28 | 29 | return result 30 | 31 | def check_abnormality(preds): 32 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 33 | "All No" if all(pred == 0 for pred in preds) else 0 34 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/PCD-C.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("option 1","option 2","选项一","选项二")): 4 | return 0 5 | else: 
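# the response does not begin with a recognized option string, so flag it as non-standardized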
6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | def check_repetition(model_response): 15 | if any(response in model_response for response in ["answer (option 1 or option 2 ?)","input event:", "答案(选项一或选项二?)", "输入事件:"]): 16 | return 1 17 | else: 18 | return 0 19 | 20 | def contains_chinese(text): 21 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 22 | result = 1 if chinese_pattern.search(text) is not None else 0 23 | return result 24 | 25 | def contains_english(text): 26 | english_pattern = re.compile(r'[A-Za-z]{2,}') 27 | result = 1 if english_pattern.search(text) is not None else 0 28 | 29 | return result 30 | 31 | def check_abnormality(preds): 32 | abnormalities = "All option1" if all(pred == 0 for pred in preds) else \ 33 | "All option2" if all(pred == 1 for pred in preds) else 0 34 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/basic_adversarial/Probability.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if any (match in type for match in ['NIE','NDE','ETT','CDE','ATE']): 4 | if model_response.startswith(("{\"answer\":")) and model_response.endswith(("}")): 5 | return 0 6 | else: 7 | return 1 8 | elif any (match in type for match in ['PN','PS']): 9 | if model_response.startswith(("{\"prob\":")) and model_response.endswith(("}")): 10 | return 0 11 | else: 12 | return 1 13 | 14 | def check_empty(model_response): 15 | if model_response == "": 16 | return 1 17 | else: 18 | return 0 19 | 20 | def check_repetition(model_response): 21 | if any(response in model_response for response in ["input info: imagine a self-contained","provide the calculation result to four decimal places", "输入信息:设想一个", "请根据上述信息,给出计算结果(答案保留四位小数)"]): 22 | return 1 23 | else: 24 | return 0 25 | 26 | def contains_chinese(text): 27 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 28 | result = 1 if chinese_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def contains_english(text): 33 | english_pattern = re.compile(r'[A-Za-z]{7,}') # Taking into account 'fake' and 'random' modes, and considering that the shortest occurrence of English characters in an 'answer' is of length 6, therefore detecting lengths of 7 or more. 
34 | result = 1 if english_pattern.search(text) is not None else 0 35 | 36 | return result 37 | 38 | def check_abnormality(preds): 39 | affect_num = sum(1 for pred in preds if pred == 0.1234) # 0.1234 is the example value in prompt for probability computation 40 | affected = affect_num / len(preds) 41 | abnormalities = "All Yes" if affected == 1 else \ 42 | "All No" if all(pred == 0 for pred in preds) else 0 43 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/AC-B_causal_judgement.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("no","否","yes","是","- yes","- 是","- no","- 否")) \ 5 | or any(response in model_response for response in ["answer is yes","answer is no","answer: yes","answer is: yes","answer: no","answer is: no","答案是:是","答案是是","答案是:否","答案是否"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is yes","the answer is no","the answer is yes.","the answer is no.","答案为“否”。","答案为“是”。","答案为“否”","答案为“是”","答案是“否”。","答案是“是”。","答案是“否”","答案是“是”")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | 16 | def check_empty(model_response): 17 | if model_response == "": 18 | return 1 19 | else: 20 | return 0 21 | 22 | 23 | def check_repetition(model_response): 24 | if any(response in model_response for response in ["answer (yes or no ?)","question: how would a typical person answer each of the following questions about causation?", "答案(是或否?)", "问题:对于以下关于因果关系的问题,一个普通人会怎么回答?"]): 25 | return 1 26 | else: 27 | return 0 28 | 29 | def contains_chinese(text): 30 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 31 | result = 1 if chinese_pattern.search(text) is not None else 0 32 | return result 33 | 34 | def contains_english(text): 35 | english_pattern = re.compile(r'[A-Za-z]{2,}') 36 | result = 1 if english_pattern.search(text) is not None else 0 37 | 38 | return result 39 | 40 | def check_abnormality(preds): 41 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 42 | "All No" if all(pred == 0 for pred in preds) else 0 43 | return abnormalities 44 | 45 | -------------------------------------------------------------------------------- /calm/evaluation/error/cot/AR-B_CaLM-AR.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","input event: if", "输入信息:如果", "答案(是或否?)"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if 
all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/AS.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("option 1","option 2","option 3","选项一","选项二","选项三")) \ 5 | or any(response in model_response for response in ["answer is option 1","answer is option 2","answer is option 3","答案为选项一","答案为选项二","答案为选项三"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is option 1","the answer is option 2","the answer is option 3","the answer is option 1.","the answer is option 2.","the answer is option 3.","答案为选项一","答案为选项二","答案为选项三","答案为选项一。","答案为选项二。","答案为选项三。")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | def check_empty(model_response): 16 | if model_response == "": 17 | return 1 18 | else: 19 | return 0 20 | 21 | def check_repetition(model_response): 22 | if any(response in model_response for response in ["answer (option 1 or option 2 or option 3 ?)","you will be presented with a causal graph in the following form:", "答案(选项一或选项二或选项三?)", "给定如下因果图"]): 23 | return 1 24 | else: 25 | return 0 26 | 27 | def contains_chinese(text): 28 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 29 | result = 1 if chinese_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def contains_english(text): 34 | english_pattern = re.compile(r'[A-Za-z]{2,}') 35 | result = 1 if english_pattern.search(text) is not None else 0 36 | 37 | return result 38 | 39 | def check_abnormality(preds): 40 | abnormalities = "All option1" if all(pred == 1 for pred in preds) else \ 41 | "All option2" if all(pred == 2 for pred in preds) else \ 42 | "All option3" if all(pred == 3 for pred in preds) else 0 43 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/CA-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("no","否","yes","是")) \ 5 | or any(response in model_response for response in ["answer is yes","answer is no","answer: yes","answer is: yes","answer: no","answer is: no","答案是:是","答案是是","答案是:否","答案是否"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is yes","the answer is no","the answer is yes.","the answer is no.","答案为“否”。","答案为“是”。","答案为“否”","答案为“是”")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | def check_empty(model_response): 16 | if model_response == "": 17 | return 1 18 | else: 19 | return 0 20 | 21 | 22 | def check_repetition(model_response): 23 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 24 | return 1 25 | else: 26 | return 0 27 | 28 | def contains_chinese(text): 29 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 30 | result = 1 if chinese_pattern.search(text) 
is not None else 0 31 | 32 | return result 33 | 34 | def contains_english(text): 35 | english_pattern = re.compile(r'[A-Za-z]{2,}') 36 | result = 1 if english_pattern.search(text) is not None else 0 37 | 38 | return result 39 | 40 | def check_abnormality(preds): 41 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 42 | "All No" if all(pred == 0 for pred in preds) else 0 43 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/CEI-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("no","否","yes","是")) \ 5 | or any(response in model_response for response in ["answer is yes","answer is no","answer: yes","answer is: yes","answer: no","answer is: no","答案是:是","答案是是","答案是:否","答案是否"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is yes","the answer is no","the answer is yes.","the answer is no.","答案为“否”。","答案为“是”。","答案为“否”","答案为“是”")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | def check_empty(model_response): 16 | if model_response == "": 17 | return 1 18 | else: 19 | return 0 20 | 21 | 22 | def check_repetition(model_response): 23 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 24 | return 1 25 | else: 26 | return 0 27 | 28 | def contains_chinese(text): 29 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 30 | result = 1 if chinese_pattern.search(text) is not None else 0 31 | 32 | return result 33 | 34 | def contains_english(text): 35 | english_pattern = re.compile(r'[A-Za-z]{2,}') 36 | result = 1 if english_pattern.search(text) is not None else 0 37 | 38 | return result 39 | 40 | def check_abnormality(preds): 41 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 42 | "All No" if all(pred == 0 for pred in preds) else 0 43 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/CLADDER.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("no","否","yes","是")) \ 5 | or any(response in model_response for response in ["answer is yes","answer is no","answer: yes","answer is: yes","answer: no","answer is: no","答案是:是","答案是是","答案是:否","答案是否"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is yes","the answer is no","the answer is yes.","the answer is no.","答案为“否”。","答案为“是”。","答案为“否”","答案为“是”")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | def check_empty(model_response): 16 | if model_response == "": 17 | return 1 18 | else: 19 | return 0 20 | 21 | 22 | def check_repetition(model_response): 23 | if any(response in model_response for response in ["answer (yes or no ?)","input info", "输入信息:", "答案(是或否?)"]): 24 | return 1 25 | else: 26 | return 0 27 | 28 | def contains_chinese(text): 29 | chinese_pattern = 
re.compile(r'[\u4e00-\u9fff]+') 30 | result = 1 if chinese_pattern.search(text) is not None else 0 31 | 32 | return result 33 | 34 | def contains_english(text): 35 | english_pattern = re.compile(r'[A-Za-z]{2,}') 36 | result = 1 if english_pattern.search(text) is not None else 0 37 | 38 | return result 39 | 40 | def check_abnormality(preds): 41 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 42 | "All No" if all(pred == 0 for pred in preds) else 0 43 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/CR-C_CRASS.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("1","2","3","4","option 1","option 2","option 3","option 4","选项一","选项二","选项三","选项四","一","二","三","四")) \ 5 | or any(response in model_response for response in ["answer is 1","answer is 2","answer is 3","answer is 4","answer is option 1","answer is option 2","answer is option 3","answer is option 4","答案是选项一","答案是选项二","答案是选项三","答案是选项四"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is 1","the answer is 2","the answer is 3","the answer is 4","the answer is 1.","the answer is 2.","the answer is 3.","the answer is 4.","the answer is option 1","the answer is option 2","the answer is option 3","the answer is option 4","the answer is option 1.","the answer is option 2.","the answer is option 3.","the answer is option 4.","答案是选项一","答案是选项二","答案是选项三","答案是选项四","答案是选项一。","答案是选项二。","答案是选项三。","答案是选项四。")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | def check_empty(model_response): 16 | if model_response == "": 17 | return 1 18 | else: 19 | return 0 20 | 21 | def check_repetition(model_response): 22 | if any(response in model_response for response in ["answer (option 1 or 2 or 3 or 4?)","input event:", "答案(选项一或选项二或选项三或选项四?)", "输入事件:"]): 23 | return 1 24 | else: 25 | return 0 26 | 27 | def contains_chinese(text): 28 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 29 | result = 1 if chinese_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def contains_english(text): 34 | english_pattern = re.compile(r'[A-Za-z]{2,}') 35 | result = 1 if english_pattern.search(text) is not None else 0 36 | 37 | return result 38 | 39 | def check_abnormality(preds): 40 | abnormalities = "All option1" if all(pred == 1 for pred in preds) else \ 41 | "All option2" if all(pred == 2 for pred in preds) else \ 42 | "All option3" if all(pred == 3 for pred in preds) else \ 43 | "All option4" if all(pred == 4 for pred in preds) else 0 44 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/ECI.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("no","否","yes","是")) \ 5 | or any(response in model_response for response in ["answer is yes","answer is no","answer: yes","answer is: yes","answer: no","answer is: no","答案是:是","答案是是","答案是:否","答案是否"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or 
prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is yes","the answer is no","the answer is yes.","the answer is no.","答案为“否”。","答案为“是”。","答案为“否”","答案为“是”")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | def check_empty(model_response): 16 | if model_response == "": 17 | return 1 18 | else: 19 | return 0 20 | 21 | 22 | def check_repetition(model_response): 23 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 24 | return 1 25 | else: 26 | return 0 27 | 28 | def contains_chinese(text): 29 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 30 | result = 1 if chinese_pattern.search(text) is not None else 0 31 | 32 | return result 33 | 34 | def contains_english(text): 35 | english_pattern = re.compile(r'[A-Za-z]{2,}') 36 | result = 1 if english_pattern.search(text) is not None else 0 37 | 38 | return result 39 | 40 | def check_abnormality(preds): 41 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 42 | "All No" if all(pred == 0 for pred in preds) else 0 43 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/Natural.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("{\"answer\":")) and model_response.endswith(("}")) \ 5 | or model_response.startswith(("{\"prob\":")) and model_response.endswith(("}")) \ 6 | or any(response in model_response for response in ["{\"answer\":","{\"prob\":"]): 7 | return 0 8 | else: 9 | return 1 10 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 11 | if "answer is" in model_response and "{\"answer\":" in model_response and "\"prob\":" in model_response and "}" in model_response \ 12 | or "answer is" in model_response and "{\"prob\":" in model_response and "}" in model_response: 13 | return 0 14 | else: 15 | return 1 16 | 17 | def check_empty(model_response): 18 | if model_response == "": 19 | return 1 20 | else: 21 | return 0 22 | 23 | 24 | def check_repetition(model_response): 25 | if any(response in model_response for response in ["input info: imagine a self-contained","provide the calculation result to four decimal places and a final \"yes\" or \"no\" answer in json format", "输入信息:设想一个", "请根据上述信息,给出计算结果(答案保留四位小数)"]): 26 | return 1 27 | else: 28 | return 0 29 | 30 | def contains_chinese(text): 31 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 32 | result = 1 if chinese_pattern.search(text) is not None else 0 33 | 34 | return result 35 | 36 | def contains_english(text): 37 | english_pattern = re.compile(r'[A-Za-z]{7,}') # Taking into account 'fake' and 'random' modes, and considering that the shortest occurrence of English characters in an 'answer' is of length 6, therefore detecting lengths of 7 or more. 
38 | result = 1 if english_pattern.search(text) is not None else 0 39 | 40 | return result 41 | 42 | def check_abnormality(preds): 43 | abnormalities = "All Yes" if all(pred == "yes" or pred == "是" for pred in preds) else \ 44 | "All No" if all(pred == "no" or pred == "否" for pred in preds) else 0 45 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/PCD-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("no","否","yes","是")) \ 5 | or any(response in model_response for response in ["answer is yes","answer is no","answer: yes","answer is: yes","answer: no","answer is: no","答案是:是","答案是是","答案是:否","答案是否"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is yes","the answer is no","the answer is yes.","the answer is no.","答案为“否”。","答案为“是”。","答案为“否”","答案为“是”","答案是“否”。","答案是“是”。","答案是“否”","答案是“是”")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | def check_empty(model_response): 16 | if model_response == "": 17 | return 1 18 | else: 19 | return 0 20 | 21 | def check_repetition(model_response): 22 | if any(response in model_response for response in ["answer (yes or no ?)","event a:", "答案(是或否?)", "事件一:"]): 23 | return 1 24 | else: 25 | return 0 26 | 27 | def contains_chinese(text): 28 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 29 | result = 1 if chinese_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def contains_english(text): 34 | english_pattern = re.compile(r'[A-Za-z]{2,}') 35 | result = 1 if english_pattern.search(text) is not None else 0 36 | 37 | return result 38 | 39 | def check_abnormality(preds): 40 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 41 | "All No" if all(pred == 0 for pred in preds) else 0 42 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/PCD-C.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("option 1","option 2","选项一","选项二")) \ 5 | or any(response in model_response for response in ["answer is option 1","answer is option 2","答案为选项一","答案为选项二"]): 6 | return 0 7 | else: 8 | return 1 9 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 10 | if model_response.endswith(("the answer is option 1","the answer is option 2","the answer is option 1.","the answer is option 2.","答案为选项一","答案为选项二","答案为选项一。","答案为选项二。","答案是选项一","答案是选项二","答案是选项一。","答案是选项二。")): 11 | return 0 12 | else: 13 | return 1 14 | 15 | def check_empty(model_response): 16 | if model_response == "": 17 | return 1 18 | else: 19 | return 0 20 | 21 | def check_repetition(model_response): 22 | if any(response in model_response for response in ["answer (option 1 or option 2 ?)","input event:", "答案(选项一或选项二?)", "输入事件:"]): 23 | return 1 24 | else: 25 | return 0 26 | 27 | def contains_chinese(text): 28 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 29 | result = 1 if chinese_pattern.search(text) is not None else 0 30 | 31 | return 
result 32 | 33 | def contains_english(text): 34 | english_pattern = re.compile(r'[A-Za-z]{2,}') 35 | result = 1 if english_pattern.search(text) is not None else 0 36 | 37 | return result 38 | 39 | def check_abnormality(preds): 40 | abnormalities = "All option1" if all(pred == 0 for pred in preds) else \ 41 | "All option2" if all(pred == 1 for pred in preds) else 0 42 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/cot/Probability.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if prompt_style == "zero-shot-CoT" or prompt_style == "zero-shot-CoT-CN": 4 | if model_response.startswith(("{\"answer\":")) and model_response.endswith(("}")) \ 5 | or model_response.startswith(("{\"prob\":")) and model_response.endswith(("}")) \ 6 | or any(response in model_response for response in ["{\"answer\":","{\"prob\":"]): 7 | return 0 8 | else: 9 | return 1 10 | elif prompt_style == "manual-CoT" or prompt_style == "manual-CoT-CN": 11 | if "answer is" in model_response and "{\"answer\":" in model_response and "\"prob\":" in model_response and "}" in model_response \ 12 | or "answer is" in model_response and "{\"prob\":" in model_response and "}" in model_response: 13 | return 0 14 | else: 15 | return 1 16 | 17 | def check_empty(model_response): 18 | if model_response == "": 19 | return 1 20 | else: 21 | return 0 22 | 23 | def check_repetition(model_response): 24 | if any(response in model_response for response in ["input info: imagine a self-contained","provide the calculation result to four decimal places", "输入信息:设想一个", "请根据上述信息,给出计算结果(答案保留四位小数)"]): 25 | return 1 26 | else: 27 | return 0 28 | 29 | def contains_chinese(text): 30 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 31 | result = 1 if chinese_pattern.search(text) is not None else 0 32 | 33 | return result 34 | 35 | def contains_english(text): 36 | english_pattern = re.compile(r'[A-Za-z]{7,}') # Taking into account 'fake' and 'random' modes, and considering that the shortest occurrence of English characters in an 'answer' is of length 6, therefore detecting lengths of 7 or more. 
37 | result = 1 if english_pattern.search(text) is not None else 0 38 | 39 | return result 40 | 41 | def check_abnormality(preds): 42 | affect_num = sum(1 for pred in preds if pred == 0.1234) # 0.1234 is the example value in prompt for probability computation 43 | affected = affect_num / len(preds) 44 | abnormalities = "All Yes" if affected == 1 else \ 45 | "All No" if all(pred == 0 for pred in preds) else 0 46 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/AC-B_causal_judgement.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是","- yes","- 是","- no","- 否")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","question: how would a typical person answer each of the following questions about causation?", "答案(是或否?)", "问题:对于以下关于因果关系的问题,一个普通人会怎么回答?"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities 37 | 38 | 39 | -------------------------------------------------------------------------------- /calm/evaluation/error/icl/AR-B_CaLM-AR.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","input event: if", "输入信息:如果", "答案(是或否?)"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/AS.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("option 1","option 2","option 3","选项一","选项二","选项三")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 
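# an empty response is recorded as its own error type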
9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | def check_repetition(model_response): 15 | if any(response in model_response for response in ["answer (option 1 or option 2 or option 3 ?)","you will be presented with a causal graph in the following form:", "答案(选项一或选项二或选项三?)", "给定如下因果图"]): 16 | return 1 17 | else: 18 | return 0 19 | 20 | def contains_chinese(text): 21 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 22 | result = 1 if chinese_pattern.search(text) is not None else 0 23 | 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All option1" if all(pred == 1 for pred in preds) else \ 34 | "All option2" if all(pred == 2 for pred in preds) else \ 35 | "All option3" if all(pred == 3 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/CA-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/CEI-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for 
pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/CLADDER.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","input info", "输入信息:", "答案(是或否?)"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/CR-C_CRASS.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("1","2","3","4","option 1","option 2","option 3","option 4","选项一","选项二","选项三","选项四","一","二","三","四")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | def check_repetition(model_response): 15 | if any(response in model_response for response in ["answer (option 1 or 2 or 3 or 4?)","input event:", "答案(选项一或选项二或选项三或选项四?)", "输入事件:"]): 16 | return 1 17 | else: 18 | return 0 19 | 20 | def contains_chinese(text): 21 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 22 | result = 1 if chinese_pattern.search(text) is not None else 0 23 | 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All option1" if all(pred == 1 for pred in preds) else \ 34 | "All option2" if all(pred == 2 for pred in preds) else \ 35 | "All option3" if all(pred == 3 for pred in preds) else \ 36 | "All option4" if all(pred == 4 for pred in preds) else 0 37 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/ECI.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["answer (yes or no ?)","you will be presented with a causal graph in the following form:", "答案(是或否?)", "给定如下因果图"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def 
contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{2,}') 29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 35 | "All No" if all(pred == 0 for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/Natural.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("{\"answer\":")) and model_response.endswith(("}")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | 15 | def check_repetition(model_response): 16 | if any(response in model_response for response in ["input info: imagine a self-contained","provide the calculation result to four decimal places and a final \"yes\" or \"no\" answer in json format", "输入信息:设想一个", "请根据上述信息,给出计算结果(答案保留四位小数)"]): 17 | return 1 18 | else: 19 | return 0 20 | 21 | def contains_chinese(text): 22 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 23 | result = 1 if chinese_pattern.search(text) is not None else 0 24 | 25 | return result 26 | 27 | def contains_english(text): 28 | english_pattern = re.compile(r'[A-Za-z]{7,}') # Taking into account 'fake' and 'random' modes, and considering that the shortest occurrence of English characters in an 'answer' is of length 6, therefore detecting lengths of 7 or more. 
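# Illustration (inputs assumed, not taken from the prompt files): "therefore"
# is a run of 7+ letters and would match, while a JSON reply such as
# {"answer": "yes"} would not, because its longest letter run, "answer",
# is only six characters long.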
29 | result = 1 if english_pattern.search(text) is not None else 0 30 | 31 | return result 32 | 33 | def check_abnormality(preds): 34 | abnormalities = "All Yes" if all(pred == "yes" or pred == "是" for pred in preds) else \ 35 | "All No" if all(pred == "no" or pred == "否" for pred in preds) else 0 36 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/PCD-B.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("no","否","yes","是")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | def check_repetition(model_response): 15 | if any(response in model_response for response in ["answer (yes or no ?)","event a:", "答案(是或否?)", "事件一:"]): 16 | return 1 17 | else: 18 | return 0 19 | 20 | def contains_chinese(text): 21 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 22 | result = 1 if chinese_pattern.search(text) is not None else 0 23 | 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All Yes" if all(pred == 1 for pred in preds) else \ 34 | "All No" if all(pred == 0 for pred in preds) else 0 35 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/PCD-C.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if model_response.startswith(("option 1","option 2","选项一","选项二")): 4 | return 0 5 | else: 6 | return 1 7 | 8 | def check_empty(model_response): 9 | if model_response == "": 10 | return 1 11 | else: 12 | return 0 13 | 14 | def check_repetition(model_response): 15 | if any(response in model_response for response in ["answer (option 1 or option 2 ?)","input event:", "答案(选项一或选项二?)", "输入事件:"]): 16 | return 1 17 | else: 18 | return 0 19 | 20 | def contains_chinese(text): 21 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 22 | result = 1 if chinese_pattern.search(text) is not None else 0 23 | 24 | return result 25 | 26 | def contains_english(text): 27 | english_pattern = re.compile(r'[A-Za-z]{2,}') 28 | result = 1 if english_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def check_abnormality(preds): 33 | abnormalities = "All option1" if all(pred == 0 for pred in preds) else \ 34 | "All option2" if all(pred == 1 for pred in preds) else 0 35 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/error/icl/Probability.py: -------------------------------------------------------------------------------- 1 | import re 2 | def check_standalization(model_response, prompt_style, type): 3 | if any (match in type for match in ['NIE','NDE','ETT','CDE','ATE']): 4 | if model_response.startswith(("{\"answer\":")) and model_response.endswith(("}")): 5 | return 0 6 | else: 7 | return 1 8 | elif any (match in type for match in ['PN','PS']): 9 | if model_response.startswith(("{\"prob\":")) and model_response.endswith(("}")): 10 | return 0 11 | else: 12 | return 1 13 | 14 | def check_empty(model_response): 
15 | if model_response == "": 16 | return 1 17 | else: 18 | return 0 19 | 20 | def check_repetition(model_response): 21 | if any(response in model_response for response in ["input info: imagine a self-contained","provide the calculation result to four decimal places", "输入信息:设想一个", "请根据上述信息,给出计算结果(答案保留四位小数)"]): 22 | return 1 23 | else: 24 | return 0 25 | 26 | def contains_chinese(text): 27 | chinese_pattern = re.compile(r'[\u4e00-\u9fff]+') 28 | result = 1 if chinese_pattern.search(text) is not None else 0 29 | 30 | return result 31 | 32 | def contains_english(text): 33 | english_pattern = re.compile(r'[A-Za-z]{7,}') # Taking into account 'fake' and 'random' modes, and considering that the shortest occurrence of English characters in an 'answer' is of length 6, therefore detecting lengths of 7 or more. 34 | result = 1 if english_pattern.search(text) is not None else 0 35 | 36 | return result 37 | 38 | def check_abnormality(preds): 39 | affect_num = sum(1 for pred in preds if pred == 0.1234) # 0.1234 is the example value in prompt for probability computation 40 | affected = affect_num / len(preds) 41 | abnormalities = "All Yes" if affected == 1 else \ 42 | "All No" if all(pred == 0 for pred in preds) else 0 43 | return abnormalities -------------------------------------------------------------------------------- /calm/evaluation/labeling/AC-B_causal_judgement.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | if item["gt_answer"] == "Yes": 5 | gt_label = 1 6 | elif item["gt_answer"] == "No": 7 | gt_label = 0 8 | return gt_label 9 | 10 | def get_pred_label(model_response, item, prompt_style, type): 11 | model_response = model_response.strip().lower() 12 | low_index = len(model_response) 13 | start_str1_dict = common_start_true_dict 14 | start_str2_dict = common_start_false_dict 15 | 16 | start_option1_list,start_option2_list = [],[] 17 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 
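# Sketch of the expansion below, with assumed dict contents (common_answers is
# defined elsewhere and not shown here): an entry like {3: ["answer: yes"]}
# appends every suffix of "answer: yes" of length >= 3 -- "yes", " yes",
# ": yes", ..., "answer: yes" -- so a response that echoes only the tail of
# the prompt text still matches.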
18 | for key in start_str1_dict.keys(): 19 | for str1 in start_str1_dict[key]: 20 | for i in range(key, len(str1)+1): 21 | start_option1_list.append(str1[-i:]) 22 | for key in start_str2_dict.keys(): 23 | for str2 in start_str2_dict[key]: 24 | for i in range(key, len(str2)+1): 25 | start_option2_list.append(str2[-i:]) 26 | 27 | inner_option1_list = common_true_list 28 | inner_option2_list = common_false_list 29 | if "- yes" in model_response and "- no" in model_response \ 30 | or "- 是" in model_response and "- 否" in model_response: 31 | label = -1 32 | elif model_response.startswith(tuple(start_option1_list)): 33 | label = 1 34 | elif model_response.startswith(tuple(start_option2_list)): 35 | label = 0 36 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list): 37 | label = 1 38 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 39 | label = 0 40 | elif any(response in model_response for response in inner_option2_list): 41 | label = 0 42 | else: 43 | return -1 44 | return label 45 | -------------------------------------------------------------------------------- /calm/evaluation/labeling/AR-B_CaLM-AR.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | return item["gt_answer"] 5 | 6 | def get_pred_label(model_response, item, prompt_style, type): 7 | model_response = model_response.strip().lower() 8 | low_index = len(model_response) 9 | start_str1_dict = common_start_true_dict 10 | start_str2_dict = common_start_false_dict 11 | start_option1_list,start_option2_list = [],[] 12 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 
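# Caution: zip() pairs the true/false dicts key-by-key and stops at the shorter
# one, so this is equivalent to AC-B's two independent loops above only when
# both dicts define the same number of keys with value lists of equal length.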
13 | for key1, key2 in zip(start_str1_dict.keys(), start_str2_dict.keys()): 14 | for str1,str2 in zip(start_str1_dict[key1], start_str2_dict[key2]): 15 | for i in range(key1, len(str1)+1): 16 | start_option1_list.append(str1[-i:]) 17 | for i in range(key2, len(str2)+1): 18 | start_option2_list.append(str2[-i:]) 19 | 20 | inner_option1_list = common_true_list 21 | inner_option2_list = common_false_list 22 | if model_response.startswith(tuple(start_option1_list)): 23 | label = 1 24 | elif model_response.startswith(tuple(start_option2_list)): 25 | label = 0 26 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list) \ 27 | or "yes" in model_response and ("causes" in model_response or "does cause" in model_response) \ 28 | or "是" in model_response and "会导致" in model_response: 29 | label = 1 30 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 31 | label = 0 32 | elif any(response in model_response for response in inner_option2_list) \ 33 | or "否" in model_response and "不会导致" in model_response: 34 | label = 0 35 | else: 36 | return -1 37 | return label -------------------------------------------------------------------------------- /calm/evaluation/labeling/AS.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | return int(item["gt_answer"]) 5 | 6 | def get_pred_label(model_response, item, prompt_style, type): 7 | model_response = model_response.strip().lower() 8 | low_index = len(model_response) 9 | Answer1 = item["option1"].strip().lower() 10 | Answer2 = item["option2"].strip().lower() 11 | Answer3 = item["option3"].strip().lower() 12 | start_str1_dict = {**common_start_op1_dict, 13 | len(Answer1)-1:[f"答案(选项一或选项二或选项三?):{Answer1[:-1]}", 14 | f"答案(选项一或选项二或选项三?): {Answer1[:-1]}", 15 | f"answer (option 1 or 2 or 3?):{Answer1[:-1]}", 16 | f"answer (option 1 or 2 or 3?): {Answer1[:-1]}"]} 17 | start_str2_dict = {**common_start_op2_dict, 18 | len(Answer2)-1:[f"答案(选项一或选项二或选项三?):{Answer2[:-1]}", 19 | f"答案(选项一或选项二或选项三?): {Answer2[:-1]}", 20 | f"answer (option 1 or 2 or 3?):{Answer2[:-1]}", 21 | f"answer (option 1 or 2 or 3?): {Answer2[:-1]}"]} 22 | start_str3_dict = {**common_start_op3_dict, 23 | len(Answer3)-1:[f"答案(选项一或选项二或选项三?):{Answer3[:-1]}", 24 | f"答案(选项一或选项二或选项三?): {Answer3[:-1]}", 25 | f"answer (option 1 or 2 or 3?):{Answer3[:-1]}", 26 | f"answer (option 1 or 2 or 3?): {Answer3[:-1]}"]} 27 | 28 | start_option1_list,start_option2_list,start_option3_list = [],[],[] 29 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 
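# Worked example with an assumed option text: if option1 is "smoking.", the
# extra dict entry above gets key len("smoking.") - 1 == 7 and strings ending
# in "smoking" (final period stripped), so echoed-prompt suffixes down to
# 7 characters are still recognized.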
30 | for key1, key2, key3 in zip(start_str1_dict.keys(), start_str2_dict.keys(), start_str3_dict.keys()): 31 | for str1, str2, str3 in zip(start_str1_dict[key1], start_str2_dict[key2], start_str3_dict[key3]): 32 | for i in range(key1, len(str1)+1): 33 | start_option1_list.append(str1[-i:]) 34 | for i in range(key2, len(str2)+1): 35 | start_option2_list.append(str2[-i:]) 36 | for i in range(key3, len(str3)+1): 37 | start_option3_list.append(str3[-i:]) 38 | 39 | inner_option1_list = ["answer (option 1 or 2 or 3 ?): {}".format(Answer1[:-1]),"(option 1 or 2 or 3?): {}".format({Answer1[:-1]})]+common_option_1_list 40 | inner_option2_list = ["answer (option 1 or 2 or 3 ?): {}".format(Answer2[:-1]),"(option 1 or 2 or 3?): {}".format({Answer2[:-1]})]+common_option_2_list 41 | inner_option3_list = ["answer (option 1 or 2 or 3 ?): {}".format(Answer3[:-1]),"(option 1 or 2 or 3?): {}".format({Answer3[:-1]})]+common_option_3_list 42 | 43 | if any(option in model_response for option in ["选项一或选项二","选项二或选项三","option 1 or option 2", "option2 or option 3"]) \ 44 | or "option 1" in model_response and "option 2" in model_response and "option 3" in model_response \ 45 | or "选项一" in model_response and "选项二" in model_response and "选项三" in model_response \ 46 | or len(model_response) == 0: 47 | return -1 48 | elif model_response.startswith(tuple(start_option1_list)) \ 49 | or any(Answer1 == option for option in [model_response]) \ 50 | or len(Answer1) > 1 and len(model_response) > 0 and (model_response in Answer1): 51 | label = 1 52 | elif model_response.startswith(tuple(start_option2_list)) \ 53 | or any(Answer2 == option for option in [model_response]) \ 54 | or len(Answer2) > 1 and len(model_response) > 0 and (model_response in Answer2): 55 | label = 2 56 | elif model_response.startswith(tuple(start_option3_list)) \ 57 | or any(Answer3 == option for option in [model_response]) \ 58 | or len(Answer3) > 1 and len(model_response) > 0 and (model_response in Answer3): 59 | label = 3 60 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list)\ 61 | or "正确答案" in model_response and ("选项一" in model_response): 62 | label = 1 63 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 64 | label = 2 65 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option3_list): 66 | label = 3 67 | elif any(model_response.find(option) > -1 for option in inner_option2_list)\ 68 | or "正确答案" in model_response and ("选项二" in model_response): 69 | label = 2 70 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option3_list): 71 | label = 3 72 | elif any(model_response.find(option) > -1 for option in inner_option3_list)\ 73 | or "正确答案" in model_response and ("选项三" in model_response): 74 | label = 3 75 | else: 76 | return -1 77 | return label -------------------------------------------------------------------------------- /calm/evaluation/labeling/CA-B_FA.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | return item["gt_answer"] 5 | 6 | def 
get_pred_label(model_response, item, prompt_style, type): 7 | model_response = model_response.strip().lower() 8 | low_index = len(model_response) 9 | start_str1_dict = common_start_true_dict 10 | start_str2_dict = common_start_false_dict 11 | 12 | start_option1_list,start_option2_list = [],[] 13 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 14 | for key1, key2 in zip(start_str1_dict.keys(), start_str2_dict.keys()): 15 | for str1,str2 in zip(start_str1_dict[key1], start_str2_dict[key2]): 16 | for i in range(key1, len(str1)+1): 17 | start_option1_list.append(str1[-i:]) 18 | for i in range(key2, len(str2)+1): 19 | start_option2_list.append(str2[-i:]) 20 | 21 | inner_option1_list = ["serves as the parent node of","serves as a parent node of"]+common_true_list 22 | inner_option2_list = common_false_list 23 | if model_response.startswith(tuple(start_option1_list)): 24 | label = 1 25 | elif model_response.startswith(tuple(start_option2_list)): 26 | label = 0 27 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list) \ 28 | or "yes" in model_response and ("is the ancestor of" in model_response or "is an ancestor of" in model_response or "serves as the ancestor node of" in model_response or "serves as an ancestor node of" in model_response) \ 29 | or "是" in model_response and "祖先节点" in model_response: 30 | label = 1 31 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 32 | label = 0 33 | elif any(response in model_response for response in inner_option2_list)\ 34 | or "不是" in model_response and "祖先节点" in model_response: 35 | label = 0 36 | else: 37 | return -1 38 | return label -------------------------------------------------------------------------------- /calm/evaluation/labeling/CA-B_FP.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | return item["gt_answer"] 5 | 6 | def get_pred_label(model_response, item, prompt_style, type): 7 | model_response = model_response.strip().lower() 8 | low_index = len(model_response) 9 | start_str1_dict = common_start_true_dict 10 | start_str2_dict = common_start_false_dict 11 | 12 | start_option1_list,start_option2_list = [],[] 13 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 
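# Reading aid for the fallback branches below: the walrus assignment records in
# low_index the position of the first matching "yes"-style phrase; a "no"-style
# phrase then overrides the label only when it occurs strictly earlier in the
# response, so the first answer the model states wins if both polarities appear.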
14 | for key1, key2 in zip(start_str1_dict.keys(), start_str2_dict.keys()): 15 | for str1,str2 in zip(start_str1_dict[key1], start_str2_dict[key2]): 16 | for i in range(key1, len(str1)+1): 17 | start_option1_list.append(str1[-i:]) 18 | for i in range(key2, len(str2)+1): 19 | start_option2_list.append(str2[-i:]) 20 | 21 | inner_option1_list = ["serves as the parent node of","serves as a parent node of"]+common_true_list 22 | inner_option2_list = common_false_list 23 | if model_response.startswith(tuple(start_option1_list)): 24 | label = 1 25 | elif model_response.startswith(tuple(start_option2_list)): 26 | label = 0 27 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list) \ 28 | or "yes" in model_response and "is the parent of" in model_response \ 29 | or "是" in model_response and "父节点" in model_response: 30 | label = 1 31 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 32 | label = 0 33 | elif any(response in model_response for response in inner_option2_list)\ 34 | or ("不是" in model_response and "父节点" in model_response): 35 | label = 0 36 | else: 37 | return -1 38 | return label -------------------------------------------------------------------------------- /calm/evaluation/labeling/CEG-O_E-CARE.py: -------------------------------------------------------------------------------- 1 | def get_gt_label(item): 2 | return item["gt_answer"] 3 | 4 | def get_pred_label(model_response, item, prompt_style, type): 5 | return model_response 6 | 7 | -------------------------------------------------------------------------------- /calm/evaluation/labeling/CEI-B.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | return item["gt_answer"] 5 | 6 | def get_pred_label(model_response, item, prompt_style, type): 7 | model_response = model_response.strip().lower() 8 | 9 | low_index = len(model_response) 10 | start_str1_dict = common_start_true_dict 11 | start_str2_dict = common_start_false_dict 12 | 13 | start_option1_list,start_option2_list = [],[] 14 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 
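# Gloss for the bilingual lists below: "可以被识别" / "能被识别" mean roughly
# "can be identified", and "不能被识别" / "无法被识别" mean "cannot be
# identified", so Chinese and English statements about identifiability map
# onto the same labels.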
15 | for key in start_str1_dict.keys(): 16 | for str1 in start_str1_dict[key]: 17 | for i in range(key, len(str1)+1): 18 | start_option1_list.append(str1[-i:]) 19 | for key in start_str2_dict.keys(): 20 | for str2 in start_str2_dict[key]: 21 | for i in range(key, len(str2)+1): 22 | start_option2_list.append(str2[-i:]) 23 | 24 | inner_option1_list = ["can be identified","可以被识别","能被识别","answer (yes or no?): yes","answer is yes","\"yes\"","answer: yes","answer is: yes","answer is:\n\nyes","answer is:\nyes","is identified.","can be identified","可以被识别","能被识别","答案是:是","答案是:\n\n是","答案是:\n是","答案:是","答案是是","\"是\"","是的","答案为“是”","答案是“是”","可以识别","答案:是","答案:可以","答案:“是”","thus answering yes","henceforth; answering yes","by answering yes","answeristheyes","answer would be yes","answer (yes)","hence answering yes","hence my answer yes","answer would definitely become yes","answer remains yes","my answer was 'yes'","thus concludes our answer yes","must answer yes","answer should be 'yes'","answer remains 'yes'","henceforth answering yes","answer should be marked yes","answer comes out yes","should answer 'yes","our answer should be yes","you should answer yes","concluding answer - yes","answer should indeed say yes","answer : yes","answer should also be yes","hence answering yes","the answer is trivially yes","answer: yes","the answer is (yes)","答案应为“是”"]+common_true_list 25 | inner_option2_list = ["not identified","不能被识别","无法被识别","answer (yes or no?): no","answer is no","\"no\"","answer: no","answer is: no","answer is:\n\nno","answer is:\nno","not identified","不能被识别","无法被识别","答案是:否","答案是:\n\n否","答案是:\n否","答案:否","答案是否","\"否\"","回答是:否","答案为“否”","答案是“否”","因果效应不可被识别","答案:否","答案:无法识别","不存在可识别的因果效应","doesn't have a causal relationship","the correct answer should be no","answer would be no","hence answering no","answering your query 'no'","therefore answering no","answer would be “no”","thus answering no","this answers no","thus, answering no","answer should also be no","answer would also turn out to be no","answer would have to be no","answer would be – no","thus answering “no”","answer = no","answer should be no","answer would definitely be no","answer would need to be no","answer would need to be marked no","hence why i answered “no","hence answering 'no'","answer must necessarily remain no","answer should marked no","answer would most likely be no","answer would also be no","answer for now might have to be `no`","henceforth - answer no","answer could only be no","answer would also be no","henceforth answering “no","answer would be no","hence answering no","cannot be identified","answer (yes or no ?): no","答案为“不”","henceforth answering no","答案为:否","答案应该是“否","因果效应不可被"]+common_false_list 26 | if model_response.startswith(tuple(start_option1_list)): 27 | label = 1 28 | elif model_response.startswith(tuple(start_option2_list)): 29 | label = 0 30 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list): 31 | label = 1 32 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 33 | label = 0 34 | elif any(response in model_response for response in inner_option2_list): 35 | label = 0 36 | else: 37 | return -1 38 | return label -------------------------------------------------------------------------------- /calm/evaluation/labeling/CLADDER.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, 
common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | def get_gt_label(item): 3 | if item["gt_answer"] == "yes": 4 | gt_label = 1 5 | elif item["gt_answer"] == "no": 6 | gt_label = 0 7 | return gt_label 8 | 9 | def get_pred_label(model_response, item, prompt_style, type): 10 | model_response = model_response.strip().lower() 11 | low_index = len(model_response) 12 | start_str1_dict = common_start_true_dict 13 | start_str2_dict = common_start_false_dict 14 | 15 | start_option1_list,start_option2_list = [],[] 16 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 17 | for key in start_str1_dict.keys(): 18 | for str1 in start_str1_dict[key]: 19 | for i in range(key, len(str1)+1): 20 | start_option1_list.append(str1[-i:]) 21 | for key in start_str2_dict.keys(): 22 | for str2 in start_str2_dict[key]: 23 | for i in range(key, len(str2)+1): 24 | start_option2_list.append(str2[-i:]) 25 | 26 | inner_option1_list = ["method 1 is more correct","使用方法1更准确"]+common_true_list 27 | inner_option2_list = ["method 2 is more correct","method 2 is correct","correct to use method 2", "方法2比方法1更准确","方法2"]+common_false_list 28 | if model_response.startswith(tuple(start_option1_list)): 29 | label = 1 30 | elif model_response.startswith(tuple(start_option2_list)): 31 | label = 0 32 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list): 33 | label = 1 34 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 35 | label = 0 36 | elif any(response in model_response for response in inner_option2_list): 37 | label = 0 38 | else: 39 | return -1 40 | return label -------------------------------------------------------------------------------- /calm/evaluation/labeling/CR-C_CRASS.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | return int(item["gt_answer"]) 5 | 6 | def get_pred_label(model_response, item, prompt_style, type): 7 | model_response = model_response.strip().lower() 8 | low_index = len(model_response) 9 | Answer1 = item["Answer1"].strip().lower() 10 | Answer2 = item["Answer2"].strip().lower() 11 | Answer3 = item["Answer3"].strip().lower() 12 | Answer4 = item["Answer4"].strip().lower() 13 | 14 | start_str1_dict = {**common_start_op1_dict,len(Answer1)-1:[f"答案(选项一或选项二或选项三或选项四?):{Answer1[:-1]}", 15 | f"答案(选项一或选项二或选项三或选项四?): {Answer1[:-1]}", 16 | f"answer (option 1 or 2 or 3 or 4?):{Answer1[:-1]}", 17 | f"answer (option 1 or 2 or 3 or 4?): {Answer1[:-1]}"]} 18 | start_str2_dict = {**common_start_op2_dict,len(Answer2)-1:[f"答案(选项一或选项二或选项三或选项四?):{Answer2[:-1]}", 19 | f"答案(选项一或选项二或选项三或选项四?): {Answer2[:-1]}", 20 | f"answer (option 1 or 2 or 3 or 4?):{Answer2[:-1]}", 21 | f"answer (option 1 or 
2 or 3 or 4?): {Answer2[:-1]}"]} 22 | start_str3_dict = {**common_start_op3_dict,len(Answer3)-1:[f"答案(选项一或选项二或选项三或选项四?):{Answer3[:-1]}", 23 | f"答案(选项一或选项二或选项三或选项四?): {Answer3[:-1]}", 24 | f"answer (option 1 or 2 or 3 or 4?):{Answer3[:-1]}", 25 | f"answer (option 1 or 2 or 3 or 4?): {Answer3[:-1]}"]} 26 | start_str4_dict = {**common_start_op4_dict, 27 | len(Answer4)-1:[f"答案(选项一或选项二或选项三或选项四?):{Answer4[:-1]}", 28 | f"答案(选项一或选项二或选项三或选项四?): {Answer4[:-1]}", 29 | f"answer (option 1 or 2 or 3 or 4?):{Answer4[:-1]}", 30 | f"answer (option 1 or 2 or 3 or 4?): {Answer4[:-1]}"]} 31 | 32 | start_option1_list,start_option2_list,start_option3_list,start_option4_list = [],[],[],[] 33 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 34 | for key1, key2, key3, key4 in zip(start_str1_dict.keys(), start_str2_dict.keys(), start_str3_dict.keys(), start_str4_dict.keys()): 35 | for str1, str2, str3, str4 in zip(start_str1_dict[key1], start_str2_dict[key2], start_str3_dict[key3], start_str4_dict[key4]): 36 | for i in range(key1, len(str1)+1): 37 | start_option1_list.append(str1[-i:]) 38 | for i in range(key2, len(str2)+1): 39 | start_option2_list.append(str2[-i:]) 40 | for i in range(key3, len(str3)+1): 41 | start_option3_list.append(str3[-i:]) 42 | for i in range(key4, len(str4)+1): 43 | start_option4_list.append(str4[-i:]) 44 | 45 | inner_option1_list = ["answer (option 1 or 2 or 3 or 4 ?): {}".format(Answer1[:-1]),"(option 1 or 2 or 3 or 4?): {}".format({Answer1[:-1]})]+common_option_1_list 46 | inner_option2_list = ["answer (option 1 or 2 or 3 or 4 ?): {}".format(Answer2[:-1]),"(option 1 or 2 or 3 or 4?): {}".format({Answer2[:-1]}), ]+common_option_2_list 47 | inner_option3_list = ["answer (option 1 or 2 or 3 or 4 ?): {}".format(Answer3[:-1]),"(option 1 or 2 or 3 or 4?): {}".format({Answer3[:-1]})]+common_option_3_list 48 | inner_option4_list = ["answer (option 1 or 2 or 3 or 4 ?): {}".format(Answer4[:-1]),"(option 1 or 2 or 3 or 4?): {}".format({Answer4[:-1]})]+common_option_4_list 49 | 50 | if any(option in model_response for option in ["选项一或选项二","选项三或选项四"]) \ 51 | or "选项一" in model_response and "选项二" in model_response and "选项三" in model_response and "选项四" in model_response: 52 | return -1 53 | elif model_response.startswith(tuple(start_option1_list)) \ 54 | or any(Answer1 == option for option in [model_response]) \ 55 | or len(Answer1) > 1 and len(model_response) > 0 and (model_response in Answer1 or Answer1 in model_response): 56 | label = 1 57 | elif model_response.startswith(tuple(start_option2_list)) \ 58 | or any(Answer2 == option for option in [model_response]) \ 59 | or len(Answer2) > 1 and len(model_response) > 0 and (model_response in Answer2 or Answer2 in model_response): 60 | label = 2 61 | elif model_response.startswith(tuple(start_option3_list)) \ 62 | or any(Answer3 == option for option in [model_response]) \ 63 | or len(Answer3) > 1 and len(model_response) > 0 and (model_response in Answer3 or Answer3 in model_response): 64 | label = 3 65 | elif model_response.startswith(tuple(start_option4_list)) \ 66 | or any(Answer4 == option for option in [model_response]) \ 67 | or len(Answer4) > 1 and len(model_response) > 0 and (model_response in Answer4 or Answer4 in model_response): 68 | label = 4 69 | elif any(model_response.find(option)>-1 and 
(low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list): 70 | label = 1 71 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 72 | label = 2 73 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option3_list): 74 | label = 3 75 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option4_list): 76 | label = 4 77 | elif any(model_response.find(option) > -1 for option in inner_option2_list): 78 | label = 2 79 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option3_list): 80 | label = 3 81 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option4_list): 82 | label = 4 83 | elif any(model_response.find(option) > -1 for option in inner_option3_list): 84 | label = 3 85 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option4_list): 86 | label = 4 87 | elif any(model_response.find(option) > -1 for option in inner_option4_list): 88 | label = 4 89 | else: 90 | return -1 91 | return label -------------------------------------------------------------------------------- /calm/evaluation/labeling/Natural.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from .common_answers import add_quotes_to_unquoted, is_numeric, change_quotation 4 | 5 | def get_gt_label(item): 6 | return item["gt_answer"].strip().lower() 7 | 8 | def extract_answer(model_response, item, prompt_style, type): 9 | model_response += "}" 10 | if "CoT" in prompt_style and any (match in type for match in ['NIE','NDE','ETT','CDE','ATE']): 11 | matches = re.findall(r'\{\"answer\":.*?\}', model_response, re.DOTALL) 12 | else: 13 | matches = re.findall(r"\{+.*?\}+", model_response, re.DOTALL | re.IGNORECASE) 14 | matched_str = None 15 | for match in matches: 16 | if match: 17 | matched_str = match.lower() 18 | if matched_str.startswith("{{") and matched_str.endswith("}}}"): 19 | matched_str = matched_str[1:-2] 20 | elif matched_str.startswith("{{") and matched_str.endswith("}}"): 21 | matched_str = matched_str[1:-1] 22 | elif matched_str.startswith("{{") and matched_str.endswith("}"): 23 | matched_str = matched_str[1:] 24 | elif matched_str.startswith("{") and matched_str.endswith("}}"): 25 | matched_str = matched_str[:-1] 26 | else: 27 | matched_str = None 28 | 29 | if matched_str: 30 | try: 31 | inner_json_obj = json.loads(matched_str) 32 | except json.JSONDecodeError: 33 | # If parsing fails, try adding quotes to unquoted words and parse again 34 | fixed_json_str = add_quotes_to_unquoted(matched_str) 35 | fixed_json_str = change_quotation(fixed_json_str) 36 | try: 37 | inner_json_obj = json.loads(fixed_json_str) 38 | except: 39 | inner_json_obj = {} 40 | 41 | prob_str_value = inner_json_obj.get("answer", None) 42 | if prob_str_value is not None: 43 | break 44 | if matched_str is None: 45 | prob_str_value = None 46 | 47 | return prob_str_value 48 | 49 | def get_pred_label(model_response, item, prompt_style, type): 50 | model_response = model_response.strip().lower() 51 | pred = extract_answer(model_response, item, prompt_style, type) 52 | return pred -------------------------------------------------------------------------------- /calm/evaluation/labeling/PCD-B.py: 
-------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | return item["gt_answer"] 5 | 6 | def get_pred_label(model_response, item, prompt_style, type): 7 | model_response = model_response.strip().lower() 8 | low_index = len(model_response) 9 | 10 | start_str1_dict = common_start_true_dict 11 | start_str2_dict = common_start_false_dict 12 | 13 | start_option1_list,start_option2_list = [],[] 14 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 15 | for key in start_str1_dict.keys(): 16 | for str1 in start_str1_dict[key]: 17 | for i in range(key, len(str1)+1): 18 | start_option1_list.append(str1[-i:]) 19 | for key in start_str2_dict.keys(): 20 | for str2 in start_str2_dict[key]: 21 | for i in range(key, len(str2)+1): 22 | start_option2_list.append(str2[-i:]) 23 | 24 | inner_option1_list = ["there is a causal relationship", "存在因果关系","有因果关系","answer (yes or no?): yes","answer is yes","\"yes\"","answer: yes","answer is: yes","answer is:\n\nyes","answer is:\nyes","there is a causal relationship","存在因果关系","存在","有因果关系","答案是:是","答案是:\n\n是","答案是:\n是","答案:是","答案是是","答案为是","\"是\"","是的","存在明确的因果关系"]+common_true_list 25 | inner_option2_list = ["there is no causal relationship", "不存在因果关系","没有因果关系","没有明显的因果关系","不存在","answer (yes or no?): no","answer is no","\"no\"","answer: no","answer is: no","answer is:\n\nno","answer is:\nno","there is no causal relationship","不存在因果关系","没有因果关系","没有明显的因果关系","不存在","答案是:否","答案是:\n\n否","答案是:\n否","答案:否","答案是否","答案为否","\"否\"","回答是:否","没有直接的因果关系"]+common_false_list 26 | 27 | if model_response.startswith(tuple(start_option1_list)): 28 | return 1 29 | elif model_response.startswith(tuple(start_option2_list)): 30 | return 0 31 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list): 32 | label = 1 33 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 34 | label = 0 35 | return label 36 | elif any(response in model_response for response in inner_option2_list): 37 | return 0 38 | else: 39 | return -1 40 | -------------------------------------------------------------------------------- /calm/evaluation/labeling/PCD-C.py: -------------------------------------------------------------------------------- 1 | from .common_answers import common_true_list, common_false_list, common_option_1_list, common_option_2_list, common_option_3_list, common_option_4_list, common_start_true_dict, common_start_false_dict, common_start_op1_dict, common_start_op2_dict, common_start_op3_dict, common_start_op4_dict 2 | 3 | def get_gt_label(item): 4 | return item["gt_answer"] 5 | 6 | def get_pred_label(model_response, item, prompt_style, type): 7 | model_response = model_response.strip().lower() 8 | hypothesis1 = item["hypothesis1"].strip().lower() 9 | hypothesis2 = item["hypothesis2"].strip().lower() 10 | len1 = len(hypothesis1) 11 | len2 = len(hypothesis2) 12 | low_index = 
len(model_response) 13 | ask_for = item["ask-for"] 14 | 15 | start_str1_dict = {**common_start_op1_dict, 16 | len(hypothesis1)-1:[f"答案(选项一或选项二?):{hypothesis1[:-1]}",f"answer (option 1 or option 2) : {hypothesis1[:-1]}"]} 17 | start_str2_dict = {**common_start_op2_dict, 18 | len(hypothesis2)-1: 19 | [f"答案(选项一或选项二?):{hypothesis2[:-1]}",f"answer (option 1 or option 2) : {hypothesis2[:-1]}"]} 20 | start_option1_list,start_option2_list = [],[] 21 | # some of the model will give response containing the question, we usually preprocess the response to remove the question part, but sometimes due to the model's response format, some of the question part is not removed, so here we are checking the response with the question part as well. 22 | for key in start_str1_dict.keys(): 23 | for str1 in start_str1_dict[key]: 24 | for i in range(key, len(str1)+1): 25 | start_option1_list.append(str1[-i:]) 26 | for key in start_str2_dict.keys(): 27 | for str2 in start_str2_dict[key]: 28 | for i in range(key, len(str2)+1): 29 | start_option2_list.append(str2[-i:]) 30 | 31 | inner_option1_list = ["answer (option 1 or option 2 ?): {}".format(hypothesis1[:len1-1]),"answer (option 1 or option 2?): {}".format({hypothesis1[:len1-1]}), "the {} of the input event is that {}".format(ask_for,hypothesis1[:len1-1]), 32 | "the {} of the input event is option 1".format(ask_for), 33 | "because {}".format(hypothesis1[:len1-1]), 34 | "answer is option 1","answer is: option 1","answer: option 1", hypothesis1, hypothesis1[:len1-1], "should be 1","i believe option 1", "is 1","select option 1","正确答案是选项一","答案为选项一","应该选择选项一","答案:选项一","答案是选项一" 35 | ]+common_option_1_list 36 | inner_option2_list = ["answer (option 1 or option 2 ?): {}".format(hypothesis2[:len2-1]),"answer (option 1 or option 2?): {}".format({hypothesis2[:len2-1]}), "the {} of the input event is that {}".format(ask_for,hypothesis2[:len1-1]), 37 | "the {} of the input event is option 2".format(ask_for), 38 | "because {}".format(hypothesis2[:len2-1]), 39 | "answer is option 2","answer is: option 2","answer: option 2", hypothesis2, hypothesis2[:len2-1], "should be 2","i believe option 2", "is 2","select option 2","正确答案是选项二","答案为选项二","应该选择选项二","答案是选项二"]+common_option_2_list 40 | 41 | if model_response.startswith(tuple(start_option1_list)) \ 42 | or any(hypothesis1 == option for option in [model_response, model_response[:len1], model_response + "."]) \ 43 | or model_response in hypothesis1 and len(model_response)>1: 44 | label = 0 45 | elif model_response.startswith(tuple(start_option2_list)) \ 46 | or any(hypothesis2 == option for option in [model_response, model_response[:len2], model_response + "."]) \ 47 | or model_response in hypothesis2 and len(model_response)>1: 48 | label = 1 49 | elif any(model_response.find(option)>-1 and (low_index:=min(low_index, model_response.find(option)))>-1 for option in inner_option1_list): 50 | label = 0 51 | if any(option in model_response and model_response.find(option) < low_index for option in inner_option2_list): 52 | label = 1 53 | elif any(model_response.find(option) > -1 for option in inner_option2_list): 54 | label = 1 55 | else: 56 | return -1 57 | return label 58 | -------------------------------------------------------------------------------- /calm/evaluation/labeling/Probability.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from .common_answers import add_quotes_to_unquoted, is_numeric, change_quotation 4 | 5 | def get_gt_label(item): 6 | return 
item["gt_answer"] 7 | 8 | # common function for maths 9 | def extract_prob(model_response, prompt_style, type): 10 | model_response += "}" 11 | if "CoT" in prompt_style and any (match in type for match in ['NIE','NDE','ETT','CDE','ATE']): 12 | matches = re.findall(r'\{\"answer\":.*?\}', model_response, re.DOTALL) 13 | else: 14 | matches = re.findall(r"\{+.*?\}+", model_response, re.DOTALL | re.IGNORECASE) 15 | matched_str = None 16 | for match in matches: 17 | if match: 18 | matched_str = match.lower() 19 | if matched_str.startswith("{{") and matched_str.endswith("}}}"): 20 | matched_str = matched_str[1:-2] 21 | elif matched_str.startswith("{{") and matched_str.endswith("}}"): 22 | matched_str = matched_str[1:-1] 23 | elif matched_str.startswith("{{") and matched_str.endswith("}"): 24 | matched_str = matched_str[1:] 25 | elif matched_str.startswith("{") and matched_str.endswith("}}"): 26 | matched_str = matched_str[:-1] 27 | else: 28 | matched_str = None 29 | 30 | if matched_str: 31 | try: 32 | inner_json_obj = json.loads(matched_str) 33 | except json.JSONDecodeError: 34 | # If parsing fails, try adding quotes to unquoted words and parse again 35 | fixed_json_str = add_quotes_to_unquoted(matched_str) 36 | fixed_json_str = change_quotation(fixed_json_str) 37 | try: 38 | inner_json_obj = json.loads(fixed_json_str) 39 | except: 40 | inner_json_obj = {} 41 | 42 | prob_str_value = inner_json_obj.get("prob", None) 43 | if prob_str_value is not None: 44 | break 45 | if matched_str is None: 46 | prob_str_value = None 47 | 48 | pred_value = float(prob_str_value) if prob_str_value and is_numeric(prob_str_value) else None 49 | 50 | return pred_value 51 | 52 | def get_pred_label(model_response, item, prompt_style, type): 53 | model_response = model_response.strip().lower() 54 | pred = extract_prob(model_response, prompt_style, type) 55 | return pred -------------------------------------------------------------------------------- /calm/models/model_apis/baichuan1_7b_api.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | from transformers.generation.utils import GenerationConfig 3 | import torch 4 | 5 | import os 6 | from rtpt import RTPT 7 | import os.path 8 | from os import path 9 | 10 | 11 | def startup(ROOT_PATH): 12 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 13 | rtpt.start() 14 | model_path = os.path.join(ROOT_PATH, "baichuan-7B" ) 15 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True) 16 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 17 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True) 18 | 19 | return model, tokenizer 20 | 21 | 22 | def query(context, query_text, dry_run=False, max_new_tokens=200): 23 | model, tokenizer = context 24 | if dry_run: 25 | return None 26 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 27 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 28 | 29 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 30 | return results[0][len(query_text):] 31 | -------------------------------------------------------------------------------- /calm/models/model_apis/baichuan1_chat_13b_api.py: -------------------------------------------------------------------------------- 1 | from 
transformers import AutoTokenizer, AutoModelForCausalLM 2 | from transformers.generation.utils import GenerationConfig 3 | import torch 4 | 5 | import os 6 | 7 | from rtpt import RTPT 8 | import os.path 9 | from os import path 10 | 11 | 12 | def startup(ROOT_PATH): 13 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 14 | rtpt.start() 15 | model_path = os.path.join(ROOT_PATH, "Baichuan-13B-Chat" ) 16 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True) 17 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 18 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True) 19 | 20 | return model, tokenizer 21 | 22 | 23 | def query(context, query_text, dry_run=False): 24 | model, tokenizer = context 25 | if dry_run: 26 | return None 27 | messages = [] 28 | messages.append({"role": "user", "content": query_text}) 29 | response = model.chat(tokenizer, messages) 30 | return response 31 | -------------------------------------------------------------------------------- /calm/models/model_apis/baichuan2_chat_13b_api.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | from transformers.generation.utils import GenerationConfig 3 | import torch 4 | 5 | import os 6 | 7 | # import torch 8 | from rtpt import RTPT 9 | import os.path 10 | from os import path 11 | 12 | 13 | 14 | def startup(ROOT_PATH): 15 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 16 | rtpt.start() 17 | model_path = os.path.join(ROOT_PATH, "Baichuan2-13B-Chat" ) 18 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True) 19 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 20 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True) 21 | 22 | return model, tokenizer 23 | 24 | 25 | def query(context, query_text, dry_run=False): 26 | model, tokenizer = context 27 | if dry_run: 28 | return None 29 | messages = [] 30 | messages.append({"role": "user", "content": query_text}) 31 | response = model.chat(tokenizer, messages) 32 | return response 33 | -------------------------------------------------------------------------------- /calm/models/model_apis/baichuan2_chat_7b_api.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | from transformers.generation.utils import GenerationConfig 3 | import torch 4 | 5 | import os 6 | 7 | # import torch 8 | from rtpt import RTPT 9 | import os.path 10 | from os import path 11 | 12 | 13 | def startup(ROOT_PATH): 14 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 15 | rtpt.start() 16 | model_path = os.path.join(ROOT_PATH, "Baichuan2-7B-Chat" ) 17 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True) 18 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 19 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True) 20 | 21 | return model, tokenizer 22 | 23 | 24 | def query(context, query_text, dry_run=False): 25 | model, tokenizer = context 26 | if dry_run: 27 | return None 
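# Single-turn exchange: one user message with no system prompt or prior history
# is passed to the checkpoint's trust_remote_code chat() helper, matching the
# Baichuan chat wrappers above.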
28 | messages = [] 29 | messages.append({"role": "user", "content": query_text}) 30 | response = model.chat(tokenizer, messages) 31 | return response 32 |
-------------------------------------------------------------------------------- /calm/models/model_apis/chatgpt_api.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | MODEL = "gpt-3.5-turbo" 4 | 5 | def startup(api_key): 6 | return api_key 7 | 8 | 9 | def query(context, query_text, dry_run=False): 10 | if dry_run: 11 | return None 12 | client = OpenAI(api_key=context) 13 | response = client.chat.completions.create( 14 | model=MODEL, 15 | messages=[ 16 | {"role": "user", "content": query_text} 17 | ], 18 | temperature=0, 19 | ) 20 | return response.choices[0].message.content 21 |
-------------------------------------------------------------------------------- /calm/models/model_apis/claude2_api.py: -------------------------------------------------------------------------------- 1 | from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT 2 | 3 | 4 | def startup(context): 5 | return context 6 | 7 | def query(context, query_text, dry_run=False): 8 | anthropic = Anthropic( 9 | api_key=context, 10 | max_retries=5, 11 | ) 12 | if dry_run: 13 | return None 14 | prompt = f"{HUMAN_PROMPT}{query_text}{AI_PROMPT}" 15 | response = anthropic.completions.create( 16 | model="claude-2", 17 | prompt=prompt, 18 | max_tokens_to_sample=200, 19 | temperature=0, 20 | ) 21 | return response.completion 22 |
-------------------------------------------------------------------------------- /calm/models/model_apis/gpt4_api.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI 2 | 3 | MODEL = "gpt-4" 4 | 5 | def startup(api_key): 6 | return api_key 7 | 8 | 9 | def query(context, query_text, dry_run=False): 10 | if dry_run: 11 | return None 12 | client = OpenAI(api_key=context) 13 | response = client.chat.completions.create( 14 | model=MODEL, 15 | messages=[ 16 | {"role": "user", "content": query_text} 17 | ], 18 | temperature=0, 19 | ) 20 | return response.choices[0].message.content 21 |
-------------------------------------------------------------------------------- /calm/models/model_apis/internlm_chat_20b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from modelscope import AutoModelForCausalLM, AutoTokenizer 3 | from rtpt import RTPT 4 | 5 | 6 | def startup(ROOT_PATH): 7 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 8 | rtpt.start() 9 | model_path = os.path.join(ROOT_PATH, "internlm-chat-20b") 10 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True).eval() 11 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 12 | 13 | return model, tokenizer 14 | 15 | 16 | def query(context, query_text, dry_run=False): 17 | model, tokenizer = context 18 | if dry_run: 19 | return None 20 | 21 | # model.chat returns (response, history); the history is not reused between queries 22 | output, _ = model.chat(tokenizer, query_text) 23 | return output 24 |
-------------------------------------------------------------------------------- /calm/models/model_apis/internlm_chat_7b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from modelscope import AutoModelForCausalLM, AutoTokenizer 3 | from rtpt import RTPT 4 | 5 | 6 | def startup(ROOT_PATH): 7 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 8 | rtpt.start() 9 | model_path = os.path.join(ROOT_PATH, "internlm-chat-7b-v1_1") 10 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True).eval() 11 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 12 | 13 | return model, tokenizer 14 | 15 | 16 | def query(context, query_text, dry_run=False): 17 | model, tokenizer = context 18 | if dry_run: 19 | return None 20 | 21 | output, _ = model.chat(tokenizer, query_text) 22 | return output 23 |
-------------------------------------------------------------------------------- /calm/models/model_apis/koala_13b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 9 | rtpt.start() 10 | 11 | model_path = os.path.join(ROOT_PATH, "koala-13B-HF") 12 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") 13 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 14 | 15 | return model, tokenizer 16 | 17 | 18 | def query(context, query_text, dry_run=False, max_new_tokens=400): 19 | model, tokenizer = context 20 | if dry_run: 21 | return None 22 | 23 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 24 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 25 | 26 | # batch_decode returns prompt + completion, so the prompt prefix is sliced off below 27 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 28 | return results[0][len(query_text):] 29 |
-------------------------------------------------------------------------------- /calm/models/model_apis/llama2_13b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=200) 9 | rtpt.start() 10 | 11 | model_path = os.path.join(ROOT_PATH, "Llama-2-13b-hf") 12 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") 13 | 14 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 15 | 16 | return model, tokenizer 17 | 18 | 19 | def query(context, query_text, dry_run=False, max_new_tokens=120): 20 | model, tokenizer = context 21 | if dry_run: 22 | return None 23 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 24 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 25 | 26 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | return results[0][len(query_text):] 28 |
-------------------------------------------------------------------------------- /calm/models/model_apis/llama2_70b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=200) 9 | rtpt.start() 10 | 11 | model_path = os.path.join(ROOT_PATH, "Llama-2-70b-hf") 12 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") 13 | 14 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 15 | 16 | return model, tokenizer 17 | 18 | 19 | def query(context, query_text, dry_run=False, max_new_tokens=120): 20 | model, tokenizer = context 21 | if dry_run: 22 | return None 23 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 24 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 25 | 26 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | return results[0][len(query_text):]
-------------------------------------------------------------------------------- /calm/models/model_apis/llama2_7b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 9 | rtpt.start() 10 | 11 | model_path = os.path.join(ROOT_PATH, "Llama-2-7b-hf") 12 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") 13 | 14 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 15 | 16 | return model, tokenizer 17 | 18 | 19 | def query(context, query_text, dry_run=False, max_new_tokens=200): 20 | model, tokenizer = context 21 | if dry_run: 22 | return None 23 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 24 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 25 | 26 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | return results[0][len(query_text):] 28 |
-------------------------------------------------------------------------------- /calm/models/model_apis/llama2_chat_70b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=200) 9 | rtpt.start() 10 | 11 | model_path = os.path.join(ROOT_PATH, "Llama-2-70b-chat-hf") 12 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") 13 | 14 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 15 | 16 | return model, tokenizer 17 | 18 | 19 | def query(context, query_text, dry_run=False, max_new_tokens=120): 20 | model, tokenizer = context 21 | if dry_run: 22 | return None 23 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 24 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 25 | 26 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | return results[0][len(query_text):] 28 |
-------------------------------------------------------------------------------- /calm/models/model_apis/qwen_14b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from modelscope import AutoModelForCausalLM, AutoTokenizer 3 | from modelscope import GenerationConfig 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 9 | rtpt.start() 10 | model_path = os.path.join(ROOT_PATH, "Qwen-14B") 11 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True).eval() 12 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 13 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 14 | 15 | return model, tokenizer 16 | 17 | 18 | def query(context, query_text, dry_run=False, max_new_tokens=200): 19 | model, tokenizer = context 20 | if dry_run: 21 | return None 22 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 23 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 24 | 25 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 26 | return results[0][len(query_text):] 27 |
-------------------------------------------------------------------------------- /calm/models/model_apis/qwen_7b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | from transformers.generation import GenerationConfig 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 9 | rtpt.start() 10 | model_path = os.path.join(ROOT_PATH, "Qwen-7B") 11 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True).eval() 12 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 13 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 14 | 15 | return model, tokenizer 16 | 17 | 18 | def query(context, query_text, dry_run=False, max_new_tokens=400): 19 | model, tokenizer = context 20 | if dry_run: 21 | return None 22 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 23 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 24 | 25 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 26 | return results[0][len(query_text):] 27 |
-------------------------------------------------------------------------------- /calm/models/model_apis/vicuna_33b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 9 | rtpt.start() 10 | 11 | model_path = os.path.join(ROOT_PATH, "vicuna-33b-v1.3") 12 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") 13 | 14 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 15 | 16 | return model, tokenizer 17 | 18 | 19 | def query(context, query_text, dry_run=False, max_new_tokens=400): 20 | model, tokenizer = context 21 | if dry_run: 22 | return None 23 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 24 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 25 | 26 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 27 | return results[0][len(query_text):] 28 |
-------------------------------------------------------------------------------- /calm/models/model_apis/wizardcoder_15b_api.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | from rtpt import RTPT 5 | 6 | 7 | def startup(ROOT_PATH): 8 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 9 | rtpt.start() 10 | 11 | model_path = os.path.join(ROOT_PATH, "WizardCoder-15B-V1.0") 12 | model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, device_map="auto") 13 | 14 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) 15 | 16 | return model, tokenizer 17 | 18 | 19 | def query(context, query_text, dry_run=False, max_new_tokens=400): 20 | model, tokenizer = context 21 | if dry_run: 22 | return None 23 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 24 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 25 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 26 | # note: this keeps the last 30 characters of the prompt in the returned text 27 | return results[0][len(query_text)-30:] 28 |
-------------------------------------------------------------------------------- /calm/models/model_loader.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | 3 | def load_model(model_name, model_args): 4 | """ 5 | Loads the specified model and returns its context and query function. 6 | 7 | Args: 8 | model_name (str): The name of the model to load. 9 | model_args (dict): Arguments required to initialize the model. 10 | 11 | Returns: 12 | tuple: A tuple containing the model context and query function. 13 | """ 14 | model_api = importlib.import_module(f"models.model_apis.{model_name}_api") 15 | model_context = model_api.startup(**model_args) 16 | model_query_func = model_api.query 17 | return model_context, model_query_func 18 |
-------------------------------------------------------------------------------- /calm/utils/load_items.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | def load_query_instances(path): 5 | """ 6 | Loads query instances from a JSON file. 7 | 8 | Args: 9 | path (str or Path): The path to the JSON file. 10 | 11 | Returns: 12 | list: A list of query instances loaded from the JSON file.
13 | """ 14 | if isinstance(path, str): 15 | path = Path(path) 16 | with path.open("r", encoding="utf-8") as f: 17 | item_list = [json.loads(line) for line in f.readlines()] 18 | return item_list -------------------------------------------------------------------------------- /calm/utils/logger_info.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | def create_adversarial_info(prompt_style): 5 | """ 6 | Creates an informational message about the adversarial prompting style. 7 | 8 | Args: 9 | prompt_style (str): The style of adversarial prompting. 10 | 11 | Returns: 12 | str: The informational message corresponding to the specified prompt style. 13 | """ 14 | info_map = { 15 | "adversarial-ignore": "\nAdversarial Prompting - Ignore & response:\n", 16 | "adversarial-doubt": "\nAdversarial Prompting - Doubt & response:\n", 17 | "adversarial-ignore-CN": "\n对抗性提示 - 忽略 & 模型答复:\n", 18 | "adversarial-doubt-CN": "\n对抗性提示 - 质疑 & 模型答复:\n" 19 | } 20 | 21 | info = info_map.get(prompt_style,"") 22 | 23 | return f"{info}" 24 | 25 | def get_logger(target_file, is_file=True, is_console=True,level=logging.DEBUG, mode="a+"): 26 | """ 27 | Creates a logger and configures it to log messages to a file and/or console. 28 | 29 | Args: 30 | target_file (str): The file path where logs will be written. 31 | is_file (bool): Whether to log messages to a file. Default is True. 32 | is_console (bool): Whether to log messages to the console. Default is True. 33 | level (int): The logging level. Default is logging.DEBUG. 34 | mode (str): The file mode for opening the log file. Default is "a+". 35 | 36 | Returns: 37 | logging.Logger: The configured logger instance. 38 | """ 39 | logger = logging.getLogger(target_file) 40 | logger.setLevel(level=level) 41 | formatter = logging.Formatter('[%(asctime)s - %(name)s - %(levelname)s] %(message)s') 42 | if is_file: 43 | os.makedirs(os.path.dirname(target_file), exist_ok=True) 44 | handler = logging.FileHandler(target_file, mode=mode, encoding="UTF-8") 45 | handler.setLevel(level) 46 | handler.setFormatter(formatter) 47 | logger.addHandler(handler) 48 | if is_console: 49 | console = logging.StreamHandler() 50 | console.setLevel(level) 51 | console.setFormatter(formatter) 52 | logger.addHandler(console) 53 | return logger -------------------------------------------------------------------------------- /calm_dataset_gt_label/association/explaining_away_effect/EAE-B_exp-away_CN.json: -------------------------------------------------------------------------------- 1 | {"index": 1, "gt_answer": "no"} 2 | {"index": 2, "gt_answer": "no"} 3 | {"index": 3, "gt_answer": "yes"} 4 | {"index": 4, "gt_answer": "yes"} 5 | {"index": 5, "gt_answer": "no"} 6 | {"index": 6, "gt_answer": "no"} 7 | {"index": 7, "gt_answer": "yes"} 8 | {"index": 8, "gt_answer": "no"} 9 | {"index": 9, "gt_answer": "yes"} 10 | {"index": 10, "gt_answer": "yes"} 11 | {"index": 11, "gt_answer": "no"} 12 | {"index": 12, "gt_answer": "yes"} 13 | {"index": 13, "gt_answer": "no"} 14 | {"index": 14, "gt_answer": "no"} 15 | {"index": 15, "gt_answer": "no"} 16 | {"index": 16, "gt_answer": "no"} 17 | {"index": 17, "gt_answer": "yes"} 18 | {"index": 18, "gt_answer": "no"} 19 | {"index": 19, "gt_answer": "no"} 20 | {"index": 20, "gt_answer": "no"} 21 | {"index": 21, "gt_answer": "yes"} 22 | {"index": 22, "gt_answer": "yes"} 23 | {"index": 23, "gt_answer": "yes"} 24 | {"index": 24, "gt_answer": "no"} 25 | {"index": 25, "gt_answer": "yes"} 26 | {"index": 26, 
"gt_answer": "yes"} 27 | {"index": 27, "gt_answer": "no"} 28 | {"index": 28, "gt_answer": "yes"} 29 | {"index": 29, "gt_answer": "no"} 30 | {"index": 30, "gt_answer": "yes"} 31 | {"index": 31, "gt_answer": "no"} 32 | {"index": 32, "gt_answer": "yes"} 33 | {"index": 33, "gt_answer": "yes"} 34 | {"index": 34, "gt_answer": "yes"} 35 | {"index": 35, "gt_answer": "yes"} 36 | {"index": 36, "gt_answer": "no"} 37 | {"index": 37, "gt_answer": "yes"} 38 | {"index": 38, "gt_answer": "no"} 39 | {"index": 39, "gt_answer": "yes"} 40 | {"index": 40, "gt_answer": "no"} 41 | {"index": 41, "gt_answer": "yes"} 42 | {"index": 42, "gt_answer": "no"} 43 | {"index": 43, "gt_answer": "no"} 44 | {"index": 44, "gt_answer": "yes"} 45 | {"index": 45, "gt_answer": "no"} 46 | {"index": 46, "gt_answer": "yes"} 47 | {"index": 47, "gt_answer": "yes"} 48 | {"index": 48, "gt_answer": "no"} 49 | {"index": 49, "gt_answer": "no"} 50 | {"index": 50, "gt_answer": "yes"} 51 | {"index": 51, "gt_answer": "no"} 52 | {"index": 52, "gt_answer": "no"} 53 | {"index": 53, "gt_answer": "yes"} 54 | {"index": 54, "gt_answer": "yes"} 55 | {"index": 55, "gt_answer": "no"} 56 | {"index": 56, "gt_answer": "yes"} 57 | {"index": 57, "gt_answer": "no"} 58 | {"index": 58, "gt_answer": "yes"} 59 | {"index": 59, "gt_answer": "no"} 60 | {"index": 60, "gt_answer": "yes"} 61 | {"index": 61, "gt_answer": "no"} 62 | {"index": 62, "gt_answer": "yes"} 63 | {"index": 63, "gt_answer": "yes"} 64 | {"index": 64, "gt_answer": "no"} 65 | {"index": 65, "gt_answer": "yes"} 66 | {"index": 66, "gt_answer": "yes"} 67 | {"index": 67, "gt_answer": "no"} 68 | {"index": 68, "gt_answer": "no"} 69 | {"index": 69, "gt_answer": "yes"} 70 | {"index": 70, "gt_answer": "no"} 71 | {"index": 71, "gt_answer": "yes"} 72 | {"index": 72, "gt_answer": "no"} 73 | {"index": 73, "gt_answer": "no"} 74 | {"index": 74, "gt_answer": "no"} 75 | {"index": 75, "gt_answer": "yes"} 76 | {"index": 76, "gt_answer": "no"} 77 | {"index": 77, "gt_answer": "no"} 78 | {"index": 78, "gt_answer": "no"} 79 | {"index": 79, "gt_answer": "yes"} 80 | {"index": 80, "gt_answer": "yes"} 81 | {"index": 81, "gt_answer": "no"} 82 | {"index": 82, "gt_answer": "yes"} 83 | {"index": 83, "gt_answer": "no"} 84 | {"index": 84, "gt_answer": "yes"} 85 | {"index": 85, "gt_answer": "yes"} 86 | {"index": 86, "gt_answer": "yes"} 87 | {"index": 87, "gt_answer": "no"} 88 | {"index": 88, "gt_answer": "yes"} 89 | {"index": 89, "gt_answer": "yes"} 90 | {"index": 90, "gt_answer": "yes"} 91 | {"index": 91, "gt_answer": "no"} 92 | {"index": 92, "gt_answer": "no"} 93 | {"index": 93, "gt_answer": "no"} 94 | {"index": 94, "gt_answer": "yes"} 95 | {"index": 95, "gt_answer": "no"} 96 | {"index": 96, "gt_answer": "no"} 97 | {"index": 97, "gt_answer": "no"} 98 | {"index": 98, "gt_answer": "yes"} 99 | {"index": 99, "gt_answer": "yes"} 100 | {"index": 100, "gt_answer": "no"} 101 | {"index": 101, "gt_answer": "yes"} 102 | {"index": 102, "gt_answer": "yes"} 103 | {"index": 103, "gt_answer": "no"} 104 | {"index": 104, "gt_answer": "no"} 105 | {"index": 105, "gt_answer": "no"} 106 | {"index": 106, "gt_answer": "no"} 107 | {"index": 107, "gt_answer": "yes"} 108 | {"index": 108, "gt_answer": "yes"} 109 | {"index": 109, "gt_answer": "yes"} 110 | {"index": 110, "gt_answer": "yes"} 111 | {"index": 111, "gt_answer": "yes"} 112 | {"index": 112, "gt_answer": "no"} 113 | {"index": 113, "gt_answer": "yes"} 114 | {"index": 114, "gt_answer": "no"} 115 | {"index": 115, "gt_answer": "yes"} 116 | {"index": 116, "gt_answer": "yes"} 117 | {"index": 117, "gt_answer": 
"no"} 118 | {"index": 118, "gt_answer": "no"} 119 | {"index": 119, "gt_answer": "yes"} 120 | {"index": 120, "gt_answer": "no"} 121 | {"index": 121, "gt_answer": "yes"} 122 | {"index": 122, "gt_answer": "no"} 123 | {"index": 123, "gt_answer": "yes"} 124 | {"index": 124, "gt_answer": "no"} 125 | {"index": 125, "gt_answer": "yes"} 126 | {"index": 126, "gt_answer": "no"} 127 | {"index": 127, "gt_answer": "yes"} 128 | {"index": 128, "gt_answer": "no"} 129 | {"index": 129, "gt_answer": "no"} 130 | {"index": 130, "gt_answer": "yes"} 131 | {"index": 131, "gt_answer": "yes"} 132 | {"index": 132, "gt_answer": "yes"} 133 | {"index": 133, "gt_answer": "no"} 134 | {"index": 134, "gt_answer": "no"} 135 | {"index": 135, "gt_answer": "yes"} 136 | {"index": 136, "gt_answer": "no"} 137 | {"index": 137, "gt_answer": "yes"} 138 | {"index": 138, "gt_answer": "yes"} 139 | {"index": 139, "gt_answer": "no"} 140 | {"index": 140, "gt_answer": "yes"} 141 | {"index": 141, "gt_answer": "yes"} 142 | {"index": 142, "gt_answer": "yes"} 143 | {"index": 143, "gt_answer": "no"} 144 | {"index": 144, "gt_answer": "no"} 145 | {"index": 145, "gt_answer": "no"} 146 | {"index": 146, "gt_answer": "no"} 147 | {"index": 147, "gt_answer": "no"} 148 | {"index": 148, "gt_answer": "no"} 149 | {"index": 149, "gt_answer": "yes"} 150 | {"index": 150, "gt_answer": "yes"} 151 | {"index": 151, "gt_answer": "yes"} 152 | {"index": 152, "gt_answer": "no"} 153 | {"index": 153, "gt_answer": "yes"} 154 | {"index": 154, "gt_answer": "no"} 155 | {"index": 155, "gt_answer": "yes"} 156 | {"index": 156, "gt_answer": "no"} 157 | {"index": 157, "gt_answer": "yes"} 158 | {"index": 158, "gt_answer": "yes"} 159 | {"index": 159, "gt_answer": "no"} 160 | {"index": 160, "gt_answer": "no"} 161 | {"index": 161, "gt_answer": "yes"} 162 | {"index": 162, "gt_answer": "yes"} 163 | {"index": 163, "gt_answer": "no"} 164 | {"index": 164, "gt_answer": "no"} 165 | {"index": 165, "gt_answer": "yes"} 166 | {"index": 166, "gt_answer": "no"} 167 | {"index": 167, "gt_answer": "yes"} 168 | {"index": 168, "gt_answer": "no"} 169 | -------------------------------------------------------------------------------- /calm_dataset_gt_label/association/explaining_away_effect/EAE-B_exp-away_EN.json: -------------------------------------------------------------------------------- 1 | {"index": 1, "gt_answer": "no"} 2 | {"index": 2, "gt_answer": "no"} 3 | {"index": 3, "gt_answer": "yes"} 4 | {"index": 4, "gt_answer": "yes"} 5 | {"index": 5, "gt_answer": "no"} 6 | {"index": 6, "gt_answer": "no"} 7 | {"index": 7, "gt_answer": "yes"} 8 | {"index": 8, "gt_answer": "no"} 9 | {"index": 9, "gt_answer": "yes"} 10 | {"index": 10, "gt_answer": "yes"} 11 | {"index": 11, "gt_answer": "no"} 12 | {"index": 12, "gt_answer": "yes"} 13 | {"index": 13, "gt_answer": "no"} 14 | {"index": 14, "gt_answer": "no"} 15 | {"index": 15, "gt_answer": "no"} 16 | {"index": 16, "gt_answer": "no"} 17 | {"index": 17, "gt_answer": "yes"} 18 | {"index": 18, "gt_answer": "no"} 19 | {"index": 19, "gt_answer": "no"} 20 | {"index": 20, "gt_answer": "no"} 21 | {"index": 21, "gt_answer": "yes"} 22 | {"index": 22, "gt_answer": "yes"} 23 | {"index": 23, "gt_answer": "yes"} 24 | {"index": 24, "gt_answer": "no"} 25 | {"index": 25, "gt_answer": "yes"} 26 | {"index": 26, "gt_answer": "yes"} 27 | {"index": 27, "gt_answer": "no"} 28 | {"index": 28, "gt_answer": "yes"} 29 | {"index": 29, "gt_answer": "no"} 30 | {"index": 30, "gt_answer": "yes"} 31 | {"index": 31, "gt_answer": "no"} 32 | {"index": 32, "gt_answer": "yes"} 33 | {"index": 33, 
"gt_answer": "yes"} 34 | {"index": 34, "gt_answer": "yes"} 35 | {"index": 35, "gt_answer": "yes"} 36 | {"index": 36, "gt_answer": "no"} 37 | {"index": 37, "gt_answer": "yes"} 38 | {"index": 38, "gt_answer": "no"} 39 | {"index": 39, "gt_answer": "yes"} 40 | {"index": 40, "gt_answer": "no"} 41 | {"index": 41, "gt_answer": "yes"} 42 | {"index": 42, "gt_answer": "no"} 43 | {"index": 43, "gt_answer": "no"} 44 | {"index": 44, "gt_answer": "yes"} 45 | {"index": 45, "gt_answer": "no"} 46 | {"index": 46, "gt_answer": "yes"} 47 | {"index": 47, "gt_answer": "yes"} 48 | {"index": 48, "gt_answer": "no"} 49 | {"index": 49, "gt_answer": "no"} 50 | {"index": 50, "gt_answer": "yes"} 51 | {"index": 51, "gt_answer": "no"} 52 | {"index": 52, "gt_answer": "no"} 53 | {"index": 53, "gt_answer": "yes"} 54 | {"index": 54, "gt_answer": "yes"} 55 | {"index": 55, "gt_answer": "no"} 56 | {"index": 56, "gt_answer": "yes"} 57 | {"index": 57, "gt_answer": "no"} 58 | {"index": 58, "gt_answer": "yes"} 59 | {"index": 59, "gt_answer": "no"} 60 | {"index": 60, "gt_answer": "yes"} 61 | {"index": 61, "gt_answer": "no"} 62 | {"index": 62, "gt_answer": "yes"} 63 | {"index": 63, "gt_answer": "yes"} 64 | {"index": 64, "gt_answer": "no"} 65 | {"index": 65, "gt_answer": "yes"} 66 | {"index": 66, "gt_answer": "yes"} 67 | {"index": 67, "gt_answer": "no"} 68 | {"index": 68, "gt_answer": "no"} 69 | {"index": 69, "gt_answer": "yes"} 70 | {"index": 70, "gt_answer": "no"} 71 | {"index": 71, "gt_answer": "yes"} 72 | {"index": 72, "gt_answer": "no"} 73 | {"index": 73, "gt_answer": "no"} 74 | {"index": 74, "gt_answer": "no"} 75 | {"index": 75, "gt_answer": "yes"} 76 | {"index": 76, "gt_answer": "no"} 77 | {"index": 77, "gt_answer": "no"} 78 | {"index": 78, "gt_answer": "no"} 79 | {"index": 79, "gt_answer": "yes"} 80 | {"index": 80, "gt_answer": "yes"} 81 | {"index": 81, "gt_answer": "no"} 82 | {"index": 82, "gt_answer": "yes"} 83 | {"index": 83, "gt_answer": "no"} 84 | {"index": 84, "gt_answer": "yes"} 85 | {"index": 85, "gt_answer": "yes"} 86 | {"index": 86, "gt_answer": "yes"} 87 | {"index": 87, "gt_answer": "no"} 88 | {"index": 88, "gt_answer": "yes"} 89 | {"index": 89, "gt_answer": "yes"} 90 | {"index": 90, "gt_answer": "yes"} 91 | {"index": 91, "gt_answer": "no"} 92 | {"index": 92, "gt_answer": "no"} 93 | {"index": 93, "gt_answer": "no"} 94 | {"index": 94, "gt_answer": "yes"} 95 | {"index": 95, "gt_answer": "no"} 96 | {"index": 96, "gt_answer": "no"} 97 | {"index": 97, "gt_answer": "no"} 98 | {"index": 98, "gt_answer": "yes"} 99 | {"index": 99, "gt_answer": "yes"} 100 | {"index": 100, "gt_answer": "no"} 101 | {"index": 101, "gt_answer": "yes"} 102 | {"index": 102, "gt_answer": "yes"} 103 | {"index": 103, "gt_answer": "no"} 104 | {"index": 104, "gt_answer": "no"} 105 | {"index": 105, "gt_answer": "no"} 106 | {"index": 106, "gt_answer": "no"} 107 | {"index": 107, "gt_answer": "yes"} 108 | {"index": 108, "gt_answer": "yes"} 109 | {"index": 109, "gt_answer": "yes"} 110 | {"index": 110, "gt_answer": "yes"} 111 | {"index": 111, "gt_answer": "yes"} 112 | {"index": 112, "gt_answer": "no"} 113 | {"index": 113, "gt_answer": "yes"} 114 | {"index": 114, "gt_answer": "no"} 115 | {"index": 115, "gt_answer": "yes"} 116 | {"index": 116, "gt_answer": "yes"} 117 | {"index": 117, "gt_answer": "no"} 118 | {"index": 118, "gt_answer": "no"} 119 | {"index": 119, "gt_answer": "yes"} 120 | {"index": 120, "gt_answer": "no"} 121 | {"index": 121, "gt_answer": "yes"} 122 | {"index": 122, "gt_answer": "no"} 123 | {"index": 123, "gt_answer": "yes"} 124 | {"index": 124, 
"gt_answer": "no"} 125 | {"index": 125, "gt_answer": "yes"} 126 | {"index": 126, "gt_answer": "no"} 127 | {"index": 127, "gt_answer": "yes"} 128 | {"index": 128, "gt_answer": "no"} 129 | {"index": 129, "gt_answer": "no"} 130 | {"index": 130, "gt_answer": "yes"} 131 | {"index": 131, "gt_answer": "yes"} 132 | {"index": 132, "gt_answer": "yes"} 133 | {"index": 133, "gt_answer": "no"} 134 | {"index": 134, "gt_answer": "no"} 135 | {"index": 135, "gt_answer": "yes"} 136 | {"index": 136, "gt_answer": "no"} 137 | {"index": 137, "gt_answer": "yes"} 138 | {"index": 138, "gt_answer": "yes"} 139 | {"index": 139, "gt_answer": "no"} 140 | {"index": 140, "gt_answer": "yes"} 141 | {"index": 141, "gt_answer": "yes"} 142 | {"index": 142, "gt_answer": "yes"} 143 | {"index": 143, "gt_answer": "no"} 144 | {"index": 144, "gt_answer": "no"} 145 | {"index": 145, "gt_answer": "no"} 146 | {"index": 146, "gt_answer": "no"} 147 | {"index": 147, "gt_answer": "no"} 148 | {"index": 148, "gt_answer": "no"} 149 | {"index": 149, "gt_answer": "yes"} 150 | {"index": 150, "gt_answer": "yes"} 151 | {"index": 151, "gt_answer": "yes"} 152 | {"index": 152, "gt_answer": "no"} 153 | {"index": 153, "gt_answer": "yes"} 154 | {"index": 154, "gt_answer": "no"} 155 | {"index": 155, "gt_answer": "yes"} 156 | {"index": 156, "gt_answer": "no"} 157 | {"index": 157, "gt_answer": "yes"} 158 | {"index": 158, "gt_answer": "yes"} 159 | {"index": 159, "gt_answer": "no"} 160 | {"index": 160, "gt_answer": "no"} 161 | {"index": 161, "gt_answer": "yes"} 162 | {"index": 162, "gt_answer": "yes"} 163 | {"index": 163, "gt_answer": "no"} 164 | {"index": 164, "gt_answer": "no"} 165 | {"index": 165, "gt_answer": "yes"} 166 | {"index": 166, "gt_answer": "no"} 167 | {"index": 167, "gt_answer": "yes"} 168 | {"index": 168, "gt_answer": "no"} 169 | -------------------------------------------------------------------------------- /calm_dataset_gt_label/intervention/collider_bias/CB-B_collider-bias_CN.json: -------------------------------------------------------------------------------- 1 | {"index": 1, "gt_answer": "yes"} 2 | {"index": 2, "gt_answer": "yes"} 3 | {"index": 3, "gt_answer": "no"} 4 | {"index": 4, "gt_answer": "yes"} 5 | {"index": 5, "gt_answer": "yes"} 6 | {"index": 6, "gt_answer": "yes"} 7 | {"index": 7, "gt_answer": "yes"} 8 | {"index": 8, "gt_answer": "no"} 9 | {"index": 9, "gt_answer": "no"} 10 | {"index": 10, "gt_answer": "yes"} 11 | {"index": 11, "gt_answer": "no"} 12 | {"index": 12, "gt_answer": "yes"} 13 | {"index": 13, "gt_answer": "no"} 14 | {"index": 14, "gt_answer": "yes"} 15 | {"index": 15, "gt_answer": "yes"} 16 | {"index": 16, "gt_answer": "yes"} 17 | {"index": 17, "gt_answer": "yes"} 18 | {"index": 18, "gt_answer": "no"} 19 | {"index": 19, "gt_answer": "yes"} 20 | {"index": 20, "gt_answer": "no"} 21 | {"index": 21, "gt_answer": "yes"} 22 | {"index": 22, "gt_answer": "no"} 23 | {"index": 23, "gt_answer": "yes"} 24 | {"index": 24, "gt_answer": "no"} 25 | {"index": 25, "gt_answer": "yes"} 26 | {"index": 26, "gt_answer": "no"} 27 | {"index": 27, "gt_answer": "no"} 28 | {"index": 28, "gt_answer": "yes"} 29 | {"index": 29, "gt_answer": "yes"} 30 | {"index": 30, "gt_answer": "no"} 31 | {"index": 31, "gt_answer": "yes"} 32 | {"index": 32, "gt_answer": "yes"} 33 | {"index": 33, "gt_answer": "yes"} 34 | {"index": 34, "gt_answer": "no"} 35 | {"index": 35, "gt_answer": "no"} 36 | {"index": 36, "gt_answer": "no"} 37 | {"index": 37, "gt_answer": "yes"} 38 | {"index": 38, "gt_answer": "no"} 39 | {"index": 39, "gt_answer": "no"} 40 | {"index": 40, 
"gt_answer": "yes"} 41 | {"index": 41, "gt_answer": "no"} 42 | {"index": 42, "gt_answer": "no"} 43 | {"index": 43, "gt_answer": "no"} 44 | {"index": 44, "gt_answer": "no"} 45 | {"index": 45, "gt_answer": "yes"} 46 | {"index": 46, "gt_answer": "no"} 47 | {"index": 47, "gt_answer": "yes"} 48 | {"index": 48, "gt_answer": "no"} 49 | {"index": 49, "gt_answer": "yes"} 50 | {"index": 50, "gt_answer": "yes"} 51 | {"index": 51, "gt_answer": "no"} 52 | {"index": 52, "gt_answer": "no"} 53 | {"index": 53, "gt_answer": "no"} 54 | {"index": 54, "gt_answer": "no"} 55 | {"index": 55, "gt_answer": "no"} 56 | {"index": 56, "gt_answer": "yes"} 57 | {"index": 57, "gt_answer": "no"} 58 | {"index": 58, "gt_answer": "yes"} 59 | {"index": 59, "gt_answer": "yes"} 60 | {"index": 60, "gt_answer": "no"} 61 | {"index": 61, "gt_answer": "yes"} 62 | {"index": 62, "gt_answer": "no"} 63 | {"index": 63, "gt_answer": "yes"} 64 | {"index": 64, "gt_answer": "no"} 65 | {"index": 65, "gt_answer": "yes"} 66 | {"index": 66, "gt_answer": "no"} 67 | {"index": 67, "gt_answer": "no"} 68 | {"index": 68, "gt_answer": "yes"} 69 | {"index": 69, "gt_answer": "no"} 70 | {"index": 70, "gt_answer": "yes"} 71 | {"index": 71, "gt_answer": "yes"} 72 | {"index": 72, "gt_answer": "no"} 73 | {"index": 73, "gt_answer": "yes"} 74 | {"index": 74, "gt_answer": "yes"} 75 | {"index": 75, "gt_answer": "no"} 76 | {"index": 76, "gt_answer": "yes"} 77 | {"index": 77, "gt_answer": "no"} 78 | {"index": 78, "gt_answer": "yes"} 79 | {"index": 79, "gt_answer": "no"} 80 | {"index": 80, "gt_answer": "yes"} 81 | {"index": 81, "gt_answer": "no"} 82 | {"index": 82, "gt_answer": "yes"} 83 | {"index": 83, "gt_answer": "no"} 84 | {"index": 84, "gt_answer": "yes"} 85 | {"index": 85, "gt_answer": "no"} 86 | {"index": 86, "gt_answer": "yes"} 87 | {"index": 87, "gt_answer": "yes"} 88 | {"index": 88, "gt_answer": "no"} 89 | {"index": 89, "gt_answer": "no"} 90 | {"index": 90, "gt_answer": "yes"} 91 | {"index": 91, "gt_answer": "yes"} 92 | {"index": 92, "gt_answer": "no"} 93 | {"index": 93, "gt_answer": "yes"} 94 | {"index": 94, "gt_answer": "yes"} 95 | {"index": 95, "gt_answer": "no"} 96 | {"index": 96, "gt_answer": "yes"} 97 | {"index": 97, "gt_answer": "no"} 98 | {"index": 98, "gt_answer": "no"} 99 | {"index": 99, "gt_answer": "no"} 100 | {"index": 100, "gt_answer": "yes"} 101 | {"index": 101, "gt_answer": "no"} 102 | {"index": 102, "gt_answer": "yes"} 103 | {"index": 103, "gt_answer": "no"} 104 | {"index": 104, "gt_answer": "yes"} 105 | {"index": 105, "gt_answer": "yes"} 106 | {"index": 106, "gt_answer": "no"} 107 | {"index": 107, "gt_answer": "yes"} 108 | {"index": 108, "gt_answer": "yes"} 109 | {"index": 109, "gt_answer": "no"} 110 | {"index": 110, "gt_answer": "no"} 111 | {"index": 111, "gt_answer": "yes"} 112 | {"index": 112, "gt_answer": "no"} 113 | {"index": 113, "gt_answer": "yes"} 114 | {"index": 114, "gt_answer": "no"} 115 | {"index": 115, "gt_answer": "yes"} 116 | {"index": 116, "gt_answer": "no"} 117 | {"index": 117, "gt_answer": "yes"} 118 | {"index": 118, "gt_answer": "yes"} 119 | {"index": 119, "gt_answer": "yes"} 120 | {"index": 120, "gt_answer": "yes"} 121 | {"index": 121, "gt_answer": "no"} 122 | {"index": 122, "gt_answer": "yes"} 123 | {"index": 123, "gt_answer": "no"} 124 | {"index": 124, "gt_answer": "yes"} 125 | {"index": 125, "gt_answer": "no"} 126 | {"index": 126, "gt_answer": "no"} 127 | {"index": 127, "gt_answer": "yes"} 128 | {"index": 128, "gt_answer": "yes"} 129 | {"index": 129, "gt_answer": "no"} 130 | {"index": 130, "gt_answer": "yes"} 131 | 
{"index": 131, "gt_answer": "yes"} 132 | {"index": 132, "gt_answer": "no"} 133 | {"index": 133, "gt_answer": "no"} 134 | {"index": 134, "gt_answer": "yes"} 135 | {"index": 135, "gt_answer": "yes"} 136 | {"index": 136, "gt_answer": "yes"} 137 | {"index": 137, "gt_answer": "no"} 138 | {"index": 138, "gt_answer": "no"} 139 | {"index": 139, "gt_answer": "no"} 140 | {"index": 140, "gt_answer": "yes"} 141 | {"index": 141, "gt_answer": "yes"} 142 | {"index": 142, "gt_answer": "no"} 143 | {"index": 143, "gt_answer": "no"} 144 | {"index": 144, "gt_answer": "no"} 145 | {"index": 145, "gt_answer": "no"} 146 | {"index": 146, "gt_answer": "yes"} 147 | {"index": 147, "gt_answer": "no"} 148 | {"index": 148, "gt_answer": "yes"} 149 | {"index": 149, "gt_answer": "yes"} 150 | {"index": 150, "gt_answer": "no"} 151 | {"index": 151, "gt_answer": "yes"} 152 | {"index": 152, "gt_answer": "yes"} 153 | {"index": 153, "gt_answer": "no"} 154 | {"index": 154, "gt_answer": "no"} 155 | {"index": 155, "gt_answer": "yes"} 156 | {"index": 156, "gt_answer": "yes"} 157 | {"index": 157, "gt_answer": "no"} 158 | {"index": 158, "gt_answer": "yes"} 159 | {"index": 159, "gt_answer": "no"} 160 | {"index": 160, "gt_answer": "no"} 161 | {"index": 161, "gt_answer": "no"} 162 | {"index": 162, "gt_answer": "yes"} 163 | {"index": 163, "gt_answer": "no"} 164 | -------------------------------------------------------------------------------- /calm_dataset_gt_label/intervention/collider_bias/CB-B_collider-bias_EN.json: -------------------------------------------------------------------------------- 1 | {"index": 1, "gt_answer": "yes"} 2 | {"index": 2, "gt_answer": "yes"} 3 | {"index": 3, "gt_answer": "no"} 4 | {"index": 4, "gt_answer": "yes"} 5 | {"index": 5, "gt_answer": "yes"} 6 | {"index": 6, "gt_answer": "yes"} 7 | {"index": 7, "gt_answer": "yes"} 8 | {"index": 8, "gt_answer": "no"} 9 | {"index": 9, "gt_answer": "no"} 10 | {"index": 10, "gt_answer": "yes"} 11 | {"index": 11, "gt_answer": "no"} 12 | {"index": 12, "gt_answer": "yes"} 13 | {"index": 13, "gt_answer": "no"} 14 | {"index": 14, "gt_answer": "yes"} 15 | {"index": 15, "gt_answer": "yes"} 16 | {"index": 16, "gt_answer": "yes"} 17 | {"index": 17, "gt_answer": "yes"} 18 | {"index": 18, "gt_answer": "no"} 19 | {"index": 19, "gt_answer": "yes"} 20 | {"index": 20, "gt_answer": "no"} 21 | {"index": 21, "gt_answer": "yes"} 22 | {"index": 22, "gt_answer": "no"} 23 | {"index": 23, "gt_answer": "yes"} 24 | {"index": 24, "gt_answer": "no"} 25 | {"index": 25, "gt_answer": "yes"} 26 | {"index": 26, "gt_answer": "no"} 27 | {"index": 27, "gt_answer": "no"} 28 | {"index": 28, "gt_answer": "yes"} 29 | {"index": 29, "gt_answer": "yes"} 30 | {"index": 30, "gt_answer": "no"} 31 | {"index": 31, "gt_answer": "yes"} 32 | {"index": 32, "gt_answer": "yes"} 33 | {"index": 33, "gt_answer": "yes"} 34 | {"index": 34, "gt_answer": "no"} 35 | {"index": 35, "gt_answer": "no"} 36 | {"index": 36, "gt_answer": "no"} 37 | {"index": 37, "gt_answer": "yes"} 38 | {"index": 38, "gt_answer": "no"} 39 | {"index": 39, "gt_answer": "no"} 40 | {"index": 40, "gt_answer": "yes"} 41 | {"index": 41, "gt_answer": "no"} 42 | {"index": 42, "gt_answer": "no"} 43 | {"index": 43, "gt_answer": "no"} 44 | {"index": 44, "gt_answer": "no"} 45 | {"index": 45, "gt_answer": "yes"} 46 | {"index": 46, "gt_answer": "no"} 47 | {"index": 47, "gt_answer": "yes"} 48 | {"index": 48, "gt_answer": "no"} 49 | {"index": 49, "gt_answer": "yes"} 50 | {"index": 50, "gt_answer": "yes"} 51 | {"index": 51, "gt_answer": "no"} 52 | {"index": 52, "gt_answer": 
"no"} 53 | {"index": 53, "gt_answer": "no"} 54 | {"index": 54, "gt_answer": "no"} 55 | {"index": 55, "gt_answer": "no"} 56 | {"index": 56, "gt_answer": "yes"} 57 | {"index": 57, "gt_answer": "no"} 58 | {"index": 58, "gt_answer": "yes"} 59 | {"index": 59, "gt_answer": "yes"} 60 | {"index": 60, "gt_answer": "no"} 61 | {"index": 61, "gt_answer": "yes"} 62 | {"index": 62, "gt_answer": "no"} 63 | {"index": 63, "gt_answer": "yes"} 64 | {"index": 64, "gt_answer": "no"} 65 | {"index": 65, "gt_answer": "yes"} 66 | {"index": 66, "gt_answer": "no"} 67 | {"index": 67, "gt_answer": "no"} 68 | {"index": 68, "gt_answer": "yes"} 69 | {"index": 69, "gt_answer": "no"} 70 | {"index": 70, "gt_answer": "yes"} 71 | {"index": 71, "gt_answer": "yes"} 72 | {"index": 72, "gt_answer": "no"} 73 | {"index": 73, "gt_answer": "yes"} 74 | {"index": 74, "gt_answer": "yes"} 75 | {"index": 75, "gt_answer": "no"} 76 | {"index": 76, "gt_answer": "yes"} 77 | {"index": 77, "gt_answer": "no"} 78 | {"index": 78, "gt_answer": "yes"} 79 | {"index": 79, "gt_answer": "no"} 80 | {"index": 80, "gt_answer": "yes"} 81 | {"index": 81, "gt_answer": "no"} 82 | {"index": 82, "gt_answer": "yes"} 83 | {"index": 83, "gt_answer": "no"} 84 | {"index": 84, "gt_answer": "yes"} 85 | {"index": 85, "gt_answer": "no"} 86 | {"index": 86, "gt_answer": "yes"} 87 | {"index": 87, "gt_answer": "yes"} 88 | {"index": 88, "gt_answer": "no"} 89 | {"index": 89, "gt_answer": "no"} 90 | {"index": 90, "gt_answer": "yes"} 91 | {"index": 91, "gt_answer": "yes"} 92 | {"index": 92, "gt_answer": "no"} 93 | {"index": 93, "gt_answer": "yes"} 94 | {"index": 94, "gt_answer": "yes"} 95 | {"index": 95, "gt_answer": "no"} 96 | {"index": 96, "gt_answer": "yes"} 97 | {"index": 97, "gt_answer": "no"} 98 | {"index": 98, "gt_answer": "no"} 99 | {"index": 99, "gt_answer": "no"} 100 | {"index": 100, "gt_answer": "yes"} 101 | {"index": 101, "gt_answer": "no"} 102 | {"index": 102, "gt_answer": "yes"} 103 | {"index": 103, "gt_answer": "no"} 104 | {"index": 104, "gt_answer": "yes"} 105 | {"index": 105, "gt_answer": "yes"} 106 | {"index": 106, "gt_answer": "no"} 107 | {"index": 107, "gt_answer": "yes"} 108 | {"index": 108, "gt_answer": "yes"} 109 | {"index": 109, "gt_answer": "no"} 110 | {"index": 110, "gt_answer": "no"} 111 | {"index": 111, "gt_answer": "yes"} 112 | {"index": 112, "gt_answer": "no"} 113 | {"index": 113, "gt_answer": "yes"} 114 | {"index": 114, "gt_answer": "no"} 115 | {"index": 115, "gt_answer": "yes"} 116 | {"index": 116, "gt_answer": "no"} 117 | {"index": 117, "gt_answer": "yes"} 118 | {"index": 118, "gt_answer": "yes"} 119 | {"index": 119, "gt_answer": "yes"} 120 | {"index": 120, "gt_answer": "yes"} 121 | {"index": 121, "gt_answer": "no"} 122 | {"index": 122, "gt_answer": "yes"} 123 | {"index": 123, "gt_answer": "no"} 124 | {"index": 124, "gt_answer": "yes"} 125 | {"index": 125, "gt_answer": "no"} 126 | {"index": 126, "gt_answer": "no"} 127 | {"index": 127, "gt_answer": "yes"} 128 | {"index": 128, "gt_answer": "yes"} 129 | {"index": 129, "gt_answer": "no"} 130 | {"index": 130, "gt_answer": "yes"} 131 | {"index": 131, "gt_answer": "yes"} 132 | {"index": 132, "gt_answer": "no"} 133 | {"index": 133, "gt_answer": "no"} 134 | {"index": 134, "gt_answer": "yes"} 135 | {"index": 135, "gt_answer": "yes"} 136 | {"index": 136, "gt_answer": "yes"} 137 | {"index": 137, "gt_answer": "no"} 138 | {"index": 138, "gt_answer": "no"} 139 | {"index": 139, "gt_answer": "no"} 140 | {"index": 140, "gt_answer": "yes"} 141 | {"index": 141, "gt_answer": "yes"} 142 | {"index": 142, "gt_answer": 
"no"} 143 | {"index": 143, "gt_answer": "no"} 144 | {"index": 144, "gt_answer": "no"} 145 | {"index": 145, "gt_answer": "no"} 146 | {"index": 146, "gt_answer": "yes"} 147 | {"index": 147, "gt_answer": "no"} 148 | {"index": 148, "gt_answer": "yes"} 149 | {"index": 149, "gt_answer": "yes"} 150 | {"index": 150, "gt_answer": "no"} 151 | {"index": 151, "gt_answer": "yes"} 152 | {"index": 152, "gt_answer": "yes"} 153 | {"index": 153, "gt_answer": "no"} 154 | {"index": 154, "gt_answer": "no"} 155 | {"index": 155, "gt_answer": "yes"} 156 | {"index": 156, "gt_answer": "yes"} 157 | {"index": 157, "gt_answer": "no"} 158 | {"index": 158, "gt_answer": "yes"} 159 | {"index": 159, "gt_answer": "no"} 160 | {"index": 160, "gt_answer": "no"} 161 | {"index": 161, "gt_answer": "no"} 162 | {"index": 162, "gt_answer": "yes"} 163 | {"index": 163, "gt_answer": "no"} 164 | -------------------------------------------------------------------------------- /documents/calm-lite.md: -------------------------------------------------------------------------------- 1 | # CaLM Lite 2 | **CaLM Lite** is a lightweight version of CaLM. The [OpenCompass](https://github.com/open-compass/opencompass/tree/main/opencompass/configs/datasets/calm) version of CaLM Lite offers a convenient way to test basic prompts. For more prompt styles and comprehensive aggregated metrics, you can conduct tests in our repository. 3 | 4 | ## Quick Start 5 | First, get model responses, for example: 6 | ``` 7 | python calm/run.py --models vicuna_33b -p zero-shot-IcL -t PCD-B_E-CARE_EN -mcfg ./model_configs -o ./output -l 8 | ``` 9 | Then run evaluation script to get evaluation metrics and errors: 10 | ``` 11 | python calm/evaluate.py --models vicuna_33b -p zero-shot-IcL -t PCD-B_E-CARE_EN -cm -ea -am -o ./output -l 12 | ``` 13 | 14 | ## Available Causal Tasks 15 | We provide 92 tasks for causal evaluation, stored in the `calm_lite_dataset` folder. For more information about our causal tasks, refer to [tasks](tasks.md). 16 | The directory structure is: 17 | 18 | ``` 19 | ├── calm 20 | | ├── association 21 | | ├── causal_discovery # Rung of the causal ladder 22 | | │ ├── abstract_reasoning # Causal scenario 23 | | │ │ ├── AR-B_CaLM-AR_CN.json # Causal task 24 | | │ | └── AR-B_CaLM-AR_EN.json # Causal task 25 | | │ └── ... 26 | | └── ... 27 | └── ... 28 | ``` 29 | 30 | ## Dataset 31 | - **Dataset size**: CaLM Lite leverages a light dataset of **9200**, while CaLM uses a significantly larger dataset of 126,334. The table below details the English dataset composition, with the Chinese version structured identically. 32 | - **Dataset configuration**: We prioritize balance in our dataset for **binary classification** and **choice selection** questions. By ensuring an equal number of each GT label, we minimize the risk of introducing bias into the model's testing. For **probability calculation**, CaLM-Lite takes extra attention to balance the number of problems across different causal reasoning processes. (For more details on how causal reasoning process is defined, please refer to Section 9.1.6 of the [paper](https://arxiv.org/abs/2405.00622).) 
33 | 34 | | Causal ladder | Causal scenario | Subset | Question type | Mode | CaLM Lite | CaLM | 35 | |---------------|-----------------|--------|---------------|------|-----------|------| 36 | | Causal discovery | PCD | E-CARE | Binary classification | Natural | 100 | 2000 | 37 | | Causal discovery | PCD | E-CARE | Choice selection | Natural | 100 | 1000 | 38 | | Causal discovery | PCD | COPA | Binary classification | Natural | 100 | 2000 | 39 | | Causal discovery | PCD | COPA | Choice selection | Natural | 100 | 1000 | 40 | | Causal discovery | ECI | CTB | Binary classification | Natural | 100 | 596 | 41 | | Causal discovery | ECI | ESC | Binary classification | Natural | 100 | 1000 | 42 | | Causal discovery | ECI | MAVEN-ERE | Binary classification | Natural | 100 | 1000 | 43 | | Causal discovery | AR | CaLM-AR | Binary classification | Symbolic | 100 | 1600 | 44 | | Causal discovery | CA | FP | Binary classification | Symbolic | 100 | 1600 | 45 | | Causal discovery | CA | FA | Binary classification | Symbolic | 100 | 1600 | 46 | | Association | CORR | correlation | Binary classification | Natural | 100 | 1476 | 47 | | Association | EAE | exp-away | Binary classification | Natural | 100 | 168 | 48 | | Intervention | CB | collider-bias | Binary classification | Natural | 100 | 163 | 49 | | Intervention | ATE | ATE-natural | Binary classification | Natural | 100 | 1600 | 50 | | Intervention | ATE | ATE-basic | Probability calculation | Mathematical | 100 | 1600 | 51 | | Intervention | ATE | ATE-hard | Probability calculation | Mathematical | 100 | 1600 | 52 | | Intervention | CDE | CDE-natural | Binary classification | Natural | 100 | 1600 | 53 | | Intervention | CDE | CDE-basic | Probability calculation | Mathematical | 100 | 1600 | 54 | | Intervention | CDE | CDE-hard | Probability calculation | Mathematical | 100 | 1600 | 55 | | Intervention | BAS | backadj | Binary classification | Natural | 100 | 227 | 56 | | Intervention | BAS | max-BAS | Choice selection | Symbolic | 100 | 1600 | 57 | | Intervention | BAS | min-BAS | Choice selection | Symbolic | 100 | 1600 | 58 | | Intervention | BAS | mix-BAS | Choice selection | Symbolic | 100 | 1600 | 59 | | Intervention | FAS | FAS | Choice selection | Symbolic | 100 | 1600 | 60 | | Intervention | IV | CaLM-IV | Choice selection | Symbolic | 100 | 1600 | 61 | | Intervention | CEI | 0.2-UC | Binary classification | Symbolic | 100 | 1600 | 62 | | Intervention | CEI | 0.4-UC | Binary classification | Symbolic | 100 | 1600 | 63 | | Intervention | CEI | 0.6-UC | Binary classification | Symbolic | 100 | 1600 | 64 | | Intervention | CEI | 0.8-UC | Binary classification | Symbolic | 100 | 1600 | 65 | | Counterfactuals | ETT | ETT-natural | Binary classification | Natural | 100 | 1600 | 66 | | Counterfactuals | ETT | ETT-basic | Probability calculation | Mathematical | 100 | 1600 | 67 | | Counterfactuals | ETT | ETT-hard | Probability calculation | Mathematical | 100 | 1600 | 68 | | Counterfactuals | NDE | NDE-natural | Binary classification | Natural | 100 | 1600 | 69 | | Counterfactuals | NDE | NDE-basic | Probability calculation | Mathematical | 100 | 1600 | 70 | | Counterfactuals | NDE | NDE-hard | Probability calculation | Mathematical | 100 | 1600 | 71 | | Counterfactuals | NIE | NIE-natural | Binary classification | Natural | 100 | 1600 | 72 | | Counterfactuals | NIE | NIE-basic | Probability calculation | Mathematical | 100 | 1600 | 73 | | Counterfactuals | NIE | NIE-hard | Probability calculation | Mathematical | 100 | 1600 | 74 | | 
Counterfactuals | PN | PN-basic | Probability calculation | Mathematical | 100 | 1600 | 75 | | Counterfactuals | PN | PN-hard | Probability calculation | Mathematical | 100 | 1600 | 76 | | Counterfactuals | PS | PS-basic | Probability calculation | Mathematical | 100 | 1600 | 77 | | Counterfactuals | PS | PS-hard | Probability calculation | Mathematical | 100 | 1600 | 78 | | Counterfactuals | AC | causal judgement | Binary classification | Natural | 100 | 187 | 79 | | Counterfactuals | CR | CRASS | Choice selection | Natural | 100 | 274 | 80 | | Counterfactuals | CR | det-counterfactual | Binary classification | Natural | 100 | 1476 | 81 | | Counterfactuals | CEG | E-CARE | Open-ended generation | Natural | 100 | 1000 | 82 | | **Total** | | | | | 4600 | 63167 | 83 | 84 | -------------------------------------------------------------------------------- /documents/model_details.md: -------------------------------------------------------------------------------- 1 | # Model Details 2 | ## OpenAI Series 3 | OpenAI has retired some of its model APIs, so only GPT-4 and GPT-3.5 are currently usable with our code. Before use, copy your API key into the model config `{model}.json`, e.g., `gpt4.json`. 4 | ``` 5 | { 6 | "startup": 7 | { 8 | "api_key": "TODO: put your api key here" 9 | }, 10 | "query": 11 | { 12 | "default":{ 13 | } 14 | } 15 | } 16 | ``` 17 | The following table shows the API version used for each model in our benchmark; the evaluation period is June 2023 to December 2023. 18 |
19 | <table> 20 | <tr> 21 | <td>Model</td> 22 | <td>API Version</td> 23 | </tr> 24 | <tr> 25 | <td>ada</td> 26 | <td>ada</td> 27 | </tr> 28 | <tr> 29 | <td>babbage</td> 30 | <td>babbage</td> 31 | </tr> 32 | <tr> 33 | <td>curie</td> 34 | <td>curie</td> 35 | </tr> 36 | <tr> 37 | <td>davinci</td> 38 | <td>davinci</td> 39 | </tr> 40 | <tr> 41 | <td>text-ada-001</td> 42 | <td>text-ada-001</td> 43 | </tr> 44 | <tr> 45 | <td>text-babbage-001</td> 46 | <td>text-babbage-001</td> 47 | </tr> 48 | <tr> 49 | <td>text-curie-001</td> 50 | <td>text-curie-001</td> 51 | </tr> 52 | <tr> 53 | <td>text-davinci-001</td> 54 | <td>text-davinci-001</td> 55 | </tr> 56 | <tr> 57 | <td>text-davinci-002</td> 58 | <td>text-davinci-002</td> 59 | </tr> 60 | <tr> 61 | <td>text-davinci-003</td> 62 | <td>text-davinci-003</td> 63 | </tr> 64 | <tr> 65 | <td>GPT-3.5</td> 66 | <td>gpt-3.5-turbo</td> 67 | </tr> 68 | <tr> 69 | <td>GPT-4</td> 70 | <td>gpt-4</td> 71 | </tr> 72 | </table>
73 | 74 | 75 | ## Claude 2 76 | We use the claude-2 API for evaluation. As with the OpenAI series, copy your API key into the model config before use. 77 | 78 | ## InternLM-chat (7B) 79 | Download link: https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-7b/summary 80 | 81 | ## InternLM-chat (20B) 82 | Download link: https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary 83 | 84 | ## Qwen (7B) 85 | Download link: https://huggingface.co/Qwen/Qwen-7B 86 | 87 | ## Qwen (14B) 88 | Download link: https://huggingface.co/Qwen/Qwen-14B 89 | 90 | ## Baichuan1 (7B) 91 | Download link: https://huggingface.co/baichuan-inc/Baichuan-7B 92 | 93 | ## Baichuan1-chat (13B) 94 | Download link: https://huggingface.co/baichuan-inc/Baichuan-13B-Chat 95 | 96 | ## Baichuan2-chat (7B) 97 | Download link: https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat 98 | 99 | ## Baichuan2-chat (13B) 100 | Download link: https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat 101 | 102 | ## Llama2 (7B) 103 | Download link: https://huggingface.co/meta-llama/Llama-2-7b-hf 104 | 105 | ## Llama2 (13B) 106 | Download link: https://huggingface.co/meta-llama/Llama-2-13b-hf 107 | 108 | ## Llama2 (70B) 109 | Download link: https://huggingface.co/meta-llama/Llama-2-70b-hf 110 | 111 | ## Llama2-chat (70B) 112 | Download link: https://huggingface.co/meta-llama/Llama-2-70b-chat-hf 113 | 114 | ## Wizardcoder (15B) 115 | Download link: https://github.com/nlpxucan/WizardLM 116 | 117 | ## Koala (13B) 118 | Download link: https://huggingface.co/TheBloke/koala-13B-HF 119 | 120 | ## Vicuna-v1.3 (33B) 121 | Download link: https://huggingface.co/lmsys/vicuna-33b-v1.3 122 | 123 | # Adding your own models 124 | Write your own model API and add it to `calm/models/model_apis`; name the file after the name you pass to the `--models` argument, plus an `_api` suffix, so that our code can load it automatically. Your API should contain two parts: a `startup` function, which returns the context shared across all model queries, and a `query` function, which queries the model with the query text (str) and returns the model response (str). 125 | 126 | Take `baichuan1_7b_api` as an example: 127 | ``` 128 | def startup(ROOT_PATH): 129 | rtpt = RTPT(name_initials='MW', experiment_name='', max_iterations=300) 130 | rtpt.start() 131 | model_path = os.path.join(ROOT_PATH, "baichuan-7B") 132 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True) 133 | model.generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True) 134 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, trust_remote_code=True) 135 | 136 | return model, tokenizer 137 | 138 | 139 | def query(context, query_text, dry_run=False, max_new_tokens=200): 140 | model, tokenizer = context 141 | if dry_run: 142 | return None 143 | input_ids = tokenizer(query_text, return_tensors="pt").input_ids.cuda() 144 | generated_ids = model.generate(input_ids, num_return_sequences=1, max_new_tokens=max_new_tokens) 145 | 146 | results = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) 147 | return results[0][len(query_text):] 148 | ``` 149 | 150 | After successfully loading your own model, you can proceed to run and evaluate it.
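A minimal sketch of how such an API file is consumed at runtime (assumptions: the file is saved as `calm/models/model_apis/my_model_api.py`, the snippet runs from inside the `calm` directory, and `my_model` plus the checkpoint path are placeholders):
```
from models.model_loader import load_model

# "my_model" resolves to models/model_apis/my_model_api.py;
# the dict below is forwarded to that module's startup() as keyword arguments.
context, query_func = load_model("my_model", {"ROOT_PATH": "/path/to/checkpoints"})
response = query_func(context, "Does smoking cause lung cancer? Answer yes or no.")
```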
During your evaluation, please note that if your model is not open-sourced, you need to add it to the limited model list in `evaluation.element_properties.model_info` so that the open-limited gap can be computed: 151 | ``` 152 | limited_model_list = ["text-ada-001","text-babbage-001",..., "your new limited model"] 153 | ``` -------------------------------------------------------------------------------- /documents/tasks.md: -------------------------------------------------------------------------------- 1 | # Tasks 2 | Our supported tasks are: 3 | - association 4 | - correlation 5 | - CORR-B_correlation_CN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 6 | - CORR-B_correlation_EN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 7 | - explaining_away_effect 8 | - EAE-B_exp-away_CN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 9 | - EAE-B_exp-away_EN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 10 | - causal_discovery 11 | - abstract_reasoning 12 | - AR-B_CaLM-AR_CN 13 | - AR-B_CaLM-AR_EN 14 | - causal_attribution 15 | - CA-B_FA_CN 16 | - CA-B_FA_EN 17 | - CA-B_FP_CN 18 | - CA-B_FP_EN 19 | - event_causality_identification 20 | - ECI-B_CTB_CN ([Pustejovsky et al., 2006](https://catalog.ldc.upenn.edu/docs/LDC2006T08/timebank.html)) 21 | - ECI-B_CTB_EN ([Pustejovsky et al., 2006](https://catalog.ldc.upenn.edu/docs/LDC2006T08/timebank.html)) 22 | - ECI-B_ESC_CN ([Mirza et al., 2014](https://aclanthology.org/W14-0702/)) 23 | - ECI-B_ESC_EN ([Mirza et al., 2014](https://aclanthology.org/W14-0702/)) 24 | - ECI-B_MAVEN-ERE_CN ([Wang et al., 2022](https://aclanthology.org/2022.emnlp-main.60/)) 25 | - ECI-B_MAVEN-ERE_EN ([Wang et al., 2022](https://aclanthology.org/2022.emnlp-main.60/)) 26 | - pairwise_causal_discovery 27 | - PCD-B_COPA_CN ([Roemmele et al., 2011](https://aaai.org/papers/02418-2418-choice-of-plausible-alternatives-an-evaluation-of-commonsense-causal-reasoning/)) 28 | - PCD-B_COPA_EN ([Roemmele et al., 2011](https://aaai.org/papers/02418-2418-choice-of-plausible-alternatives-an-evaluation-of-commonsense-causal-reasoning/)) 29 | - PCD-B_E-CARE_CN ([Du et al., 2022](https://aclanthology.org/2022.acl-long.33/)) 30 | - PCD-B_E-CARE_EN ([Du et al., 2022](https://aclanthology.org/2022.acl-long.33/)) 31 | - PCD-C_COPA_CN ([Roemmele et al., 2011](https://aaai.org/papers/02418-2418-choice-of-plausible-alternatives-an-evaluation-of-commonsense-causal-reasoning/)) 32 | - PCD-C_COPA_EN ([Roemmele et al., 2011](https://aaai.org/papers/02418-2418-choice-of-plausible-alternatives-an-evaluation-of-commonsense-causal-reasoning/)) 33 | - PCD-C_E-CARE_CN ([Du et al., 2022](https://aclanthology.org/2022.acl-long.33/)) 34 | - PCD-C_E-CARE_EN ([Du et al., 2022](https://aclanthology.org/2022.acl-long.33/)) 35 | - counterfactual 36 | - actual_causality 37 | - AC-B_causal_judgement_CN ([Suzgun et al., 2022](https://aclanthology.org/2023.findings-acl.824.pdf)) 38 | - AC-B_causal_judgement_EN ([Suzgun et al., 2022](https://aclanthology.org/2023.findings-acl.824.pdf)) 39 | - causal_explanation_generation 40 | - CEG-O_E-CARE_CN ([Du et al., 2022](https://aclanthology.org/2022.acl-long.33/)) 41 | - CEG-O_E-CARE_EN ([Du et al., 2022](https://aclanthology.org/2022.acl-long.33/)) 42 | - counterfactual_reasoning 43 | - CR-B_det-counterfactual_CN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 44 | - CR-B_det-counterfactual_EN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 45 | - CR-C_CRASS_CN ([Frohberg & Binder, 2022](https://aclanthology.org/2022.lrec-1.229/)) 46
| - CR-C_CRASS_EN ([Frohberg & Binder, 2022](https://aclanthology.org/2022.lrec-1.229/)) 47 | - effect_of_the_treatment_on_the_treated 48 | - ETT-B_ETT-natural_CN 49 | - ETT-B_ETT-natural_EN 50 | - ETT-P_ETT-basic_CN 51 | - ETT-P_ETT-basic_EN 52 | - ETT-P_ETT-hard_CN 53 | - ETT-P_ETT-hard_EN 54 | - natural_direct_effect 55 | - NDE-B_NDE-natural_CN 56 | - NDE-B_NDE-natural_EN 57 | - NDE-P_NDE-basic_CN 58 | - NDE-P_NDE-basic_EN 59 | - NDE-P_NDE-hard_CN 60 | - NDE-P_NDE-hard_EN 61 | - natural_indirect_effect 62 | - NIE-B_NIE-natural_CN 63 | - NIE-B_NIE-natural_EN 64 | - NIE-P_NIE-basic_CN 65 | - NIE-P_NIE-basic_EN 66 | - NIE-P_NIE-hard_CN 67 | - NIE-P_NIE-hard_EN 68 | - probability_of_necessity 69 | - PN-P_PN-basic_CN 70 | - PN-P_PN-basic_EN 71 | - PN-P_PN-hard_CN 72 | - PN-P_PN-hard_EN 73 | - probability_of_sufficiency 74 | - PS-P_PS-basic_CN 75 | - PS-P_PS-basic_EN 76 | - PS-P_PS-hard_CN 77 | - PS-P_PS-hard_EN 78 | - intervention 79 | - average_treatment_effect 80 | - ATE-B_ATE-natural_CN 81 | - ATE-B_ATE-natural_EN 82 | - ATE-P_ATE-basic_CN 83 | - ATE-P_ATE-basic_EN 84 | - ATE-P_ATE-hard_CN 85 | - ATE-P_ATE-hard_EN 86 | - backdoor_adjustment_set 87 | - BAS-B_backadj_CN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 88 | - BAS-B_backadj_EN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 89 | - BAS-C_max-BAS_CN 90 | - BAS-C_max-BAS_EN 91 | - BAS-C_min-BAS_CN 92 | - BAS-C_min-BAS_EN 93 | - BAS-C_mix-BAS_CN 94 | - BAS-C_mix-BAS_EN 95 | - causal_effect_identification 96 | - CEI-B_0.2-UC_CN 97 | - CEI-B_0.2-UC_EN 98 | - CEI-B_0.4-UC_CN 99 | - CEI-B_0.4-UC_EN 100 | - CEI-B_0.6-UC_CN 101 | - CEI-B_0.6-UC_EN 102 | - CEI-B_0.8-UC_CN 103 | - CEI-B_0.8-UC_EN 104 | - collider_bias 105 | - CB-B_collider-bias_CN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 106 | - CB-B_collider-bias_EN ([Jin et al., 2023a](https://arxiv.org/abs/2312.04350)) 107 | - controlled_direct_effect 108 | - CDE-B_CDE-natural_CN 109 | - CDE-B_CDE-natural_EN 110 | - CDE-P_CDE-basic_CN 111 | - CDE-P_CDE-basic_EN 112 | - CDE-P_CDE-hard_CN 113 | - CDE-P_CDE-hard_EN 114 | - frontdoor_adjustment_set 115 | - FAS-C_FAS_CN 116 | - FAS-C_FAS_EN 117 | - instrumental_variable 118 | - IV-C_CaLM-IV_CN 119 | - IV-C_CaLM-IV_EN 120 | -------------------------------------------------------------------------------- /model_configs/baichuan1_7b.json: -------------------------------------------------------------------------------- 1 | { 2 | "startup": 3 | { 4 | "ROOT_PATH": "TODO: put your model path here" 5 | }, 6 | "query": 7 | { 8 | "default":{ 9 | "max_new_tokens":200 10 | }, 11 | "ATE-B_(ATE-natural)_CN":{ 12 | "basic":{ 13 | "max_new_tokens":200 14 | } 15 | }, 16 | "CORR-B_correlation_CN":{ 17 | "zero-shot-IcL":{ 18 | "max_new_tokens":200 19 | } 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /model_configs/baichuan1_chat_13b.json: -------------------------------------------------------------------------------- 1 | { 2 | "startup": 3 | { 4 | "ROOT_PATH": "TODO: put your model path here" 5 | }, 6 | "query": 7 | { 8 | "default":{ 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /model_configs/chatgpt.json: -------------------------------------------------------------------------------- 1 | { 2 | "startup": 3 | { 4 | "api_key": "TODO: put your api key here" 5 | }, 6 | "query": 7 | { 8 | "default":{ 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- 
/model_configs/claude2.json: -------------------------------------------------------------------------------- 1 | { 2 | "startup": 3 | { 4 | "api_key": "TODO: put your api key here" 5 | }, 6 | "query": 7 | { 8 | "default":{ 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /model_configs/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "startup": 3 | { 4 | "ROOT_PATH": "TODO: put your model path here" 5 | }, 6 | "query": 7 | { 8 | "default":{ 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /model_configs/gpt4.json: -------------------------------------------------------------------------------- 1 | { 2 | "startup": 3 | { 4 | "api_key": "TODO: put your api key here" 5 | }, 6 | "query": 7 | { 8 | "default":{ 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anthropic==0.25.3 2 | jieba==0.42.1 3 | modelscope==1.9.2 4 | nltk==3.7 5 | openai==1.20.0 6 | pandas==1.2.3 7 | Requests==2.31.0 8 | rouge==1.0.1 9 | rtpt==0.0.4 10 | torch==2.0.1 11 | transformers==4.33.1 12 | accelerate==0.20.3 13 | sentencepiece==0.1.99 14 | transformers-stream-generator==0.0.4 15 | tiktoken==0.5.1 16 | protobuf==3.20.0 17 | --------------------------------------------------------------------------------