├── data
    ├── test.xlsx
    ├── train.xlsx
    ├── test_asr.xlsx
    ├── test_eng.xlsx
    └── train_eng.xlsx
├── SentimentReasoning_ACL2025.png
├── sentiment_reasoning_datasample.png
├── sentiment_reasoning_pipeline.png
├── encoder_eng.py
├── encoder.py
├── README.md
├── seq2seq.py
├── seq2seq_eng.py
├── llm-lora_eng.py
├── llm-lora.py
└── inference.ipynb


/data/test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test.xlsx


--------------------------------------------------------------------------------
/data/train.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/train.xlsx


--------------------------------------------------------------------------------
/data/test_asr.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test_asr.xlsx


--------------------------------------------------------------------------------
/data/test_eng.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test_eng.xlsx


--------------------------------------------------------------------------------
/data/train_eng.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/train_eng.xlsx


--------------------------------------------------------------------------------
/SentimentReasoning_ACL2025.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/SentimentReasoning_ACL2025.png


--------------------------------------------------------------------------------
/sentiment_reasoning_datasample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/sentiment_reasoning_datasample.png


--------------------------------------------------------------------------------
/sentiment_reasoning_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/sentiment_reasoning_pipeline.png


--------------------------------------------------------------------------------
/encoder_eng.py:
--------------------------------------------------------------------------------
  1 | from datasets import load_dataset, Dataset
  2 | import pandas as pd
  3 | import argparse
  4 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
  5 | from transformers import AutoTokenizer
  6 | import evaluate
  7 | import numpy as np
  8 | import transformers
  9 | 
 10 | # Initialize argparse
 11 | parser = argparse.ArgumentParser(description='Configure training parameters.')
 12 | 
 13 | # Add arguments for training configuration
 14 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training')
 15 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training')
 16 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer')
 17 | parser.add_argument('--model_checkpoint', type=str, default="emilyalsentzer/Bio_ClinicalBERT", help='Model checkpoint to use')
 18 | 
 19 | # Parse arguments
 20 | args = parser.parse_args()
 21 | 
 22 | id2label = {0: "negative", 1: "neutral", 2: "positive"}
 23 | label2id = {"negative": 0, "neutral": 1, 'positive': 2}
 24 | 
 25 | 
 26 | # Assign variables from args
 27 | batch_size = args.batch_size
 28 | num_train_epochs = args.num_train_epochs
 29 | learning_rate = args.learning_rate
 30 | model_checkpoint = args.model_checkpoint
 31 | 
 32 | model = AutoModelForSequenceClassification.from_pretrained(
 33 |     model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id
 34 | )
 35 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 36 | 
 37 | model_name = model_checkpoint.split("/")[-1]
 38 | 
 39 | 
 40 | train_df = pd.read_excel('train_eng.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)
 41 | train_dataset = Dataset.from_pandas(train_df)
 42 | 
 43 | testset =  pd.read_excel('test_eng.xlsx')
 44 | print(train_df['label'].unique())
 45 | print(testset['label'].unique())
 46 | test_dataset = Dataset.from_pandas(testset[['text', 'label']])
 47 | 
 48 | 
 49 | 
 50 | 
 51 | def preprocess_function(examples):
 52 |     return tokenizer(examples['text'], truncation=True, padding=True)
 53 | 
 54 | tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
 55 | tokenized_dataset_test = test_dataset.map(preprocess_function, batched=True)
 56 | 
 57 | 
 58 | from transformers import DataCollatorWithPadding
 59 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 60 | 
 61 | 
 62 | 
 63 | # Load the individual metrics
 64 | import evaluate
 65 | 
 66 | accuracy = evaluate.load("accuracy")
 67 | f1 = evaluate.load("f1")
 68 | precision = evaluate.load("precision")
 69 | recall = evaluate.load("recall")
 70 | 
 71 | def compute_metrics(eval_pred):
 72 |     predictions, labels = eval_pred
 73 |     predictions = np.argmax(predictions, axis=1)
 74 |     neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1']
 75 | 
 76 |     # Compute each metric as needed
 77 |     metrics_result = {
 78 |         "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'],
 79 |         "macro_f1": f1.compute(predictions=predictions, references=labels, average='macro')['f1'],
 80 |         "f1_neg": neg,
 81 |         "f1_neu": neu,
 82 |         "f1_pos": pos
 83 | 
 84 |     }
 85 |     
 86 |     return metrics_result
 87 | 
 88 | # This modified function should now work without the TypeError
 89 | 
 90 | 
 91 | ## Train
 92 | 
 93 | 
 94 | training_args = TrainingArguments(
 95 |     output_dir=f"results/{model_name}",
 96 |     lr_scheduler_type='cosine',
 97 |     learning_rate=learning_rate,
 98 |     per_device_train_batch_size=batch_size,
 99 |     per_device_eval_batch_size=batch_size,
100 |     num_train_epochs=num_train_epochs,
101 |     weight_decay=0.01,
102 |     evaluation_strategy="epoch",
103 |     save_strategy="epoch",
104 |     logging_strategy='epoch',
105 |     load_best_model_at_end=True,
106 |     save_total_limit=2,
107 |     bf16=True,
108 |     warmup_ratio=0.05,
109 |     metric_for_best_model='eval_macro_f1',
110 | #     push_to_hub=True,
111 | )
112 | 
113 | trainer = Trainer(
114 |     model=model,
115 |     args=training_args,
116 |     train_dataset=tokenized_dataset_train,
117 |     eval_dataset=tokenized_dataset_test,
118 |     tokenizer=tokenizer,
119 |     data_collator=data_collator,
120 |     compute_metrics=compute_metrics,
121 |     callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)]
122 | 
123 | )
124 | print('model_checkpoint', model_checkpoint)
125 | trainer.train()
126 | trainer.save_model()
127 | trainer.save_state()
128 | trainer.evaluate()


--------------------------------------------------------------------------------
/encoder.py:
--------------------------------------------------------------------------------
  1 | from datasets import load_dataset, Dataset
  2 | import pandas as pd
  3 | import argparse
  4 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
  5 | from transformers import AutoTokenizer
  6 | import evaluate
  7 | import numpy as np
  8 | import transformers
  9 | 
 10 | # Initialize argparse
 11 | parser = argparse.ArgumentParser(description='Configure training parameters.')
 12 | 
 13 | # Add arguments for training configuration
 14 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training')
 15 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training')
 16 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer')
 17 | parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-base", help='Model checkpoint to use')
 18 | 
 19 | # Parse arguments
 20 | args = parser.parse_args()
 21 | 
 22 | id2label = {0: "negative", 1: "neutral", 2: "positive"}
 23 | label2id = {"negative": 0, "neutral": 1, 'positive': 2}
 24 | 
 25 | 
 26 | # Assign variables from args
 27 | batch_size = args.batch_size
 28 | num_train_epochs = args.num_train_epochs
 29 | learning_rate = args.learning_rate
 30 | model_checkpoint = args.model_checkpoint
 31 | 
 32 | model = AutoModelForSequenceClassification.from_pretrained(
 33 |     model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id
 34 | )
 35 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 36 | 
 37 | model_name = model_checkpoint.split("/")[-1]
 38 | 
 39 | 
 40 | train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)
 41 | train_dataset = Dataset.from_pandas(train_df)
 42 | 
 43 | testset =  pd.read_excel('test.xlsx')
 44 | test_with_asr = pd.read_excel('test_asr.xlsx')
 45 | testset['text'] = test_with_asr['asr']
 46 | 
 47 | print(train_df['label'].unique())
 48 | print(testset['label'].unique())
 49 | test_dataset = Dataset.from_pandas(testset[['text', 'label']])
 50 | 
 51 | 
 52 | 
 53 | 
 54 | def preprocess_function(examples):
 55 |     return tokenizer(examples['text'], truncation=True, padding=True)
 56 | 
 57 | tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True)
 58 | tokenized_dataset_test = test_dataset.map(preprocess_function, batched=True)
 59 | 
 60 | 
 61 | from transformers import DataCollatorWithPadding
 62 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 63 | 
 64 | 
 65 | 
 66 | # Load the individual metrics
 67 | import evaluate
 68 | 
 69 | accuracy = evaluate.load("accuracy")
 70 | f1 = evaluate.load("f1")
 71 | precision = evaluate.load("precision")
 72 | recall = evaluate.load("recall")
 73 | 
 74 | def compute_metrics(eval_pred):
 75 |     predictions, labels = eval_pred
 76 |     predictions = np.argmax(predictions, axis=1)
 77 |     neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1']
 78 | 
 79 |     # Compute each metric as needed
 80 |     metrics_result = {
 81 |         "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'],
 82 |         "macro_f1": f1.compute(predictions=predictions, references=labels, average='macro')['f1'],
 83 |         "f1_neg": neg,
 84 |         "f1_neu": neu,
 85 |         "f1_pos": pos
 86 | 
 87 |     }
 88 |     
 89 |     return metrics_result
 90 | 
 91 | # This modified function should now work without the TypeError
 92 | 
 93 | 
 94 | ## Train
 95 | 
 96 | 
 97 | training_args = TrainingArguments(
 98 |     output_dir=f"results/{model_name}",
 99 |     lr_scheduler_type='cosine',
100 |     learning_rate=learning_rate,
101 |     per_device_train_batch_size=batch_size,
102 |     per_device_eval_batch_size=batch_size,
103 |     num_train_epochs=num_train_epochs,
104 |     weight_decay=0.01,
105 |     evaluation_strategy="epoch",
106 |     save_strategy="epoch",
107 |     logging_strategy='epoch',
108 |     load_best_model_at_end=True,
109 |     save_total_limit=2,
110 |     bf16=True,
111 |     warmup_ratio=0.05,
112 |     metric_for_best_model='eval_macro_f1',
113 | #     push_to_hub=True,
114 | )
115 | 
116 | trainer = Trainer(
117 |     model=model,
118 |     args=training_args,
119 |     train_dataset=tokenized_dataset_train,
120 |     eval_dataset=tokenized_dataset_test,
121 |     tokenizer=tokenizer,
122 |     data_collator=data_collator,
123 |     compute_metrics=compute_metrics,
124 |     callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)]
125 | 
126 | )
127 | print('model_checkpoint', model_checkpoint)
128 | trainer.train()
129 | trainer.save_model()
130 | trainer.evaluate()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Sentiment Reasoning for Healthcare
 2 | 
 3 | **<div align="center">ACL 2025 Industry Track (Oral)</div>**
 4 | 
 5 | <div align="center"><b>Khai-Nguyen Nguyen*</b>, <b>Khai Le-Duc*</b>, Bach Phan Tat, Duy Le, Long Vo-Dang, Truong-Son Hy</div>
 6 | 
 7 | <div align="center">*Equal contribution</div>
 8 | 
 9 | > Please press the ⭐ button and/or cite the paper if you find this work helpful.
10 | 
11 | <p align="center">
12 |   <img src="https://github.com/leduckhai/Sentiment-Reasoning/blob/master/sentiment_reasoning_pipeline.png" width="700"/>
13 | </p>
14 | <p align="center"><em>Sentiment Reasoning pipeline</em></p>
15 | 
16 | <p align="center">
17 |   <img src="https://github.com/leduckhai/Sentiment-Reasoning/blob/master/SentimentReasoning_ACL2025.png" width="700"/>
18 | </p>
19 | 
20 | * **Abstract:**
21 | Transparency in AI healthcare decision-making is crucial. By incorporating rationales to explain reason for each predicted label, users could understand Large Language Models (LLMs)’s reasoning to make better decision. In this work, we introduce a new task - **Sentiment Reasoning** - for both speech and text modalities, and our proposed multimodal multitask framework and **the world's largest multimodal sentiment analysis dataset**. Sentiment Reasoning is an auxiliary task in sentiment analysis where the model predicts both the sentiment label and generates the rationale behind it based on the input transcript. Our study conducted on both human transcripts and Automatic Speech Recognition (ASR) transcripts shows that Sentiment Reasoning helps improve model transparency by providing rationale for model prediction with quality semantically comparable to humans while also improving model's classification performance (**+2% increase in both accuracy and macro-F1**)  via rationale-augmented fine-tuning. Also, no significant difference in the semantic quality of generated rationales between human and ASR transcripts. All code, data (**five languages - Vietnamese, English, Chinese, German, and French**) and models are published online.
22 | 
23 | * **Citation:**
24 | Please cite this paper: [https://arxiv.org/abs/2407.21054](https://arxiv.org/abs/2407.21054)
25 | 
26 | ``` bibtex
27 | @misc{Sentiment_Reasoning,
28 |       title={Sentiment Reasoning for Healthcare}, 
29 |       author={Khai-Nguyen Nguyen and Khai Le-Duc and Bach Phan Tat and Duy Le and Long Vo-Dang and Truong-Son Hy},
30 |       year={2024},
31 |       eprint={2407.21054},
32 |       url={https://arxiv.org/abs/2407.21054}, 
33 | }
34 | ```
35 | 
36 | This repository contains scripts for automatic speech recognition (ASR) and sentiment reasoning using cascaded sequence-to-sequence (seq2seq) audio-language models. The provided scripts cover model preparation, training, inference, and evaluation processes, based on the dataset in the paper.
37 | 
38 | ## Dataset and Pre-trained Models:
39 | 🤗 **HuggingFace Dataset**: [https://huggingface.co/datasets/leduckhai/Sentiment-Reasoning](https://huggingface.co/datasets/leduckhai/Sentiment-Reasoning)
40 | 
41 | 🤗 **HuggingFace Models**: to be released soon!
42 | 
43 | | Model Name       | Description                                | Link                                                                 |
44 | |------------------|--------------------------------------------|----------------------------------------------------------------------|
45 | | `Vietnamese_Vistral-7B`     | LLM fine-tuned on Vietnamese set        | [Hugging Face models](https://huggingface.co/leduckhai/Sentiment-Reasoning/tree/main/Vietnamese_Vistral-7B) |
46 | | `English-LLM`    | LLM fine-tuned on English set         | to be released soon! |
47 | | `French-LLM`  | LLM fine-tuned on French set          | to be released soon!    |
48 | | `German-LLM`  | LLM fine-tuned on German set          | to be released soon! |
49 | | `Chinese-LLM`  | LLM fine-tuned on Chinese set          | to be released soon! |
50 | | `Multilingual-LLM`  | LLM fine-tuned on Multilingual set (5 languages)        | to be released soon! |
51 | 
52 | **Papers with Code**: to be released soon!
53 | 
54 | <p align="center">
55 |   <img src="https://github.com/leduckhai/Sentiment-Reasoning/blob/master/sentiment_reasoning_datasample.png" width="1000"/>
56 | </p>
57 | <p align="center"><em>Sample data format used in Sentiment Reasoning dataset</em></p>
58 | 
59 | 
60 | ## Contact
61 | 
62 | Core developers:
63 | 
64 | **Khai Le-Duc**
65 | ```
66 | University of Toronto, Canada
67 | Email: duckhai.le@mail.utoronto.ca
68 | GitHub: https://github.com/leduckhai
69 | ```
70 | 
71 | **Khai-Nguyen Nguyen**
72 | ```
73 | College of William and Mary, USA
74 | GitHub: https://github.com/nkn002
75 | Hugging Face: https://huggingface.co/knguyennguyen
76 | ```
77 | 
78 | 


--------------------------------------------------------------------------------
/seq2seq.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import random
  4 | from datasets import Dataset, load_metric
  5 | import transformers
  6 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
  7 | from datasets import load_dataset, Dataset
  8 | import pandas as pd
  9 | import evaluate
 10 | import torch
 11 | import nltk
 12 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
 13 | import nltk
 14 | import argparse
 15 | import numpy as np
 16 | 
 17 | 
 18 | # Initialize argparse
 19 | parser = argparse.ArgumentParser(description='Configure training parameters.')
 20 | 
 21 | # Add arguments for training configuration
 22 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training')
 23 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training')
 24 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer')
 25 | parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-large", help='Model checkpoint to use')
 26 | 
 27 | # Parse arguments
 28 | args = parser.parse_args()
 29 | 
 30 | # Assign variables from args
 31 | batch_size = args.batch_size
 32 | num_train_epochs = args.num_train_epochs
 33 | learning_rate = args.learning_rate
 34 | model_checkpoint = args.model_checkpoint
 35 | 
 36 | # Now you can use these variables in your training setup
 37 | print(f"Training setup:")
 38 | print(f"Batch size: {batch_size}")
 39 | print(f"Number of training epochs: {num_train_epochs}")
 40 | print(f"Learning rate: {learning_rate}")
 41 | print(f"Model checkpoint: {model_checkpoint}")
 42 | 
 43 | id2label = {'0': "negative", '1': "neutral", '2': "positive"}
 44 | label2id = {"negative": '0', "neutral": '1', 'positive': '2'}
 45 | 
 46 | train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)
 47 | train_df['label'] = train_df['label'].astype(str)
 48 | 
 49 | train_dataset = Dataset.from_pandas(train_df)
 50 | 
 51 | testset =  pd.read_excel('test.xlsx')
 52 | 
 53 | # Then convert the modified DataFrame to a Hugging Face dataset
 54 | testset['label'] = testset['label'].astype(str)
 55 | print(train_df['label'].unique())
 56 | print(testset['label'].unique())
 57 | test_dataset = Dataset.from_pandas(testset[['text', 'label']])
 58 | 
 59 | # Output unique values to verify
 60 | 
 61 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 62 | id2label = {'0': "Negative", '1': "Neutral", '2': "Positive"}
 63 | label2id = {"Negative": '0', "Neutral": '1', 'Positive': '2'}
 64 | model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
 65 | 
 66 | 
 67 | def preprocess_function(examples):
 68 |     inputs = [doc for doc in examples["text"]]
 69 |     model_inputs = tokenizer(inputs, max_length=128, truncation=True)
 70 |     labels = tokenizer(text_target=examples["label"], max_length=8, truncation=True)
 71 | 
 72 |     model_inputs["labels"] = labels["input_ids"]
 73 |     return model_inputs
 74 | 
 75 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
 76 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
 77 | 
 78 | print('tokenized_train_dataset', tokenized_train_dataset)
 79 | print('tokenized_test_dataset', tokenized_test_dataset)
 80 | 
 81 | 
 82 | # Load the individual metrics
 83 | accuracy = evaluate.load("accuracy")
 84 | f1 = evaluate.load("f1")
 85 | precision = evaluate.load("precision")
 86 | recall = evaluate.load("recall")
 87 | 
 88 | def compute_metrics(eval_pred):
 89 |     logits, labels = eval_pred
 90 |     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
 91 |     decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True)
 92 |     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 93 |     decoded_preds = [pred if pred.isdigit() else -1 for pred in decoded_preds]  # Replace non-digit predictions with '-1'
 94 |     decoded_labels = [label if label.isdigit() else -1 for label in decoded_labels]  # Replace non-digit labels with '-1'
 95 |     predictions = decoded_preds
 96 |     labels = decoded_labels
 97 |     metrics_result = {
 98 |         "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'],
 99 | 
100 |     }
101 |     
102 |     return metrics_result
103 | 
104 | # This modified function should now work without the TypeError
105 | 
106 | 
107 | data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
108 | 
109 | tokenized_train_dataset=tokenized_train_dataset.remove_columns(['text', 'label'])
110 | tokenized_test_dataset=tokenized_test_dataset.remove_columns(['text', 'label'])
111 | model_name = model_checkpoint.split("/")[-1]
112 | 
113 | transformers.logging.set_verbosity_info()
114 | training_args = Seq2SeqTrainingArguments(
115 |     output_dir=f"results/{model_name}",
116 |     eval_strategy="epoch",
117 |     save_strategy="epoch",
118 |     logging_strategy='epoch',
119 |     learning_rate=learning_rate,
120 |     per_device_train_batch_size=batch_size,
121 |     per_device_eval_batch_size=batch_size,
122 |     weight_decay=0.01,
123 |     save_total_limit=2,
124 |     num_train_epochs=num_train_epochs,
125 |     predict_with_generate=True,
126 |     load_best_model_at_end=True,
127 |     metric_for_best_model='eval_accuracy',
128 |     bf16=True,
129 |     lr_scheduler_type='cosine',
130 |     warmup_ratio=0.05,
131 | )
132 | 
133 | # Setting up the trainer
134 | trainer = Seq2SeqTrainer(
135 |     model=model,
136 |     args=training_args,
137 |     train_dataset=tokenized_train_dataset,
138 |     eval_dataset=tokenized_test_dataset,
139 |     tokenizer=tokenizer,
140 |     data_collator=data_collator,
141 |     compute_metrics=compute_metrics,
142 |     callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)]
143 | 
144 | )
145 | 
146 | 
147 | trainer.train()
148 | trainer.save_model()
149 | 


--------------------------------------------------------------------------------
/seq2seq_eng.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import random
  4 | from datasets import Dataset, load_metric
  5 | import transformers
  6 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
  7 | from datasets import load_dataset, Dataset
  8 | import pandas as pd
  9 | import evaluate
 10 | import torch
 11 | import nltk
 12 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
 13 | import nltk
 14 | import argparse
 15 | import numpy as np
 16 | 
 17 | 
 18 | # Initialize argparse
 19 | parser = argparse.ArgumentParser(description='Configure training parameters.')
 20 | 
 21 | # Add arguments for training configuration
 22 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training')
 23 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training')
 24 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer')
 25 | parser.add_argument('--model_checkpoint', type=str, default="luqh/ClinicalT5-base", help='Model checkpoint to use')
 26 | 
 27 | # Parse arguments
 28 | args = parser.parse_args()
 29 | 
 30 | # Assign variables from args
 31 | batch_size = args.batch_size
 32 | num_train_epochs = args.num_train_epochs
 33 | learning_rate = args.learning_rate
 34 | model_checkpoint = args.model_checkpoint
 35 | 
 36 | # Now you can use these variables in your training setup
 37 | print(f"Training setup:")
 38 | print(f"Batch size: {batch_size}")
 39 | print(f"Number of training epochs: {num_train_epochs}")
 40 | print(f"Learning rate: {learning_rate}")
 41 | print(f"Model checkpoint: {model_checkpoint}")
 42 | 
 43 | id2label = {'0': "negative", '1': "neutral", '2': "positive"}
 44 | label2id = {"negative": '0', "neutral": '1', 'positive': '2'}
 45 | 
 46 | train_df = pd.read_excel('train_eng.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)
 47 | train_df['label'] = train_df['label'].astype(str)
 48 | 
 49 | train_dataset = Dataset.from_pandas(train_df)
 50 | 
 51 | testset =  pd.read_excel('test_eng.xlsx')
 52 | 
 53 | # Then convert the modified DataFrame to a Hugging Face dataset
 54 | testset['label'] = testset['label'].astype(str)
 55 | print(train_df['label'].unique())
 56 | print(testset['label'].unique())
 57 | test_dataset = Dataset.from_pandas(testset[['text', 'label']])
 58 | 
 59 | # Output unique values to verify
 60 | 
 61 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
 62 | id2label = {'0': "Negative", '1': "Neutral", '2': "Positive"}
 63 | label2id = {"Negative": '0', "Neutral": '1', 'Positive': '2'}
 64 | model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
 65 | 
 66 | 
 67 | def preprocess_function(examples):
 68 |     inputs = [doc for doc in examples["text"]]
 69 |     model_inputs = tokenizer(inputs, max_length=128, truncation=True)
 70 |     labels = tokenizer(text_target=examples["label"], max_length=8, truncation=True)
 71 | 
 72 |     model_inputs["labels"] = labels["input_ids"]
 73 |     return model_inputs
 74 | 
 75 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
 76 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
 77 | 
 78 | print('tokenized_train_dataset', tokenized_train_dataset)
 79 | print('tokenized_test_dataset', tokenized_test_dataset)
 80 | 
 81 | print(train_dataset['text'])
 82 | # Load the individual metrics
 83 | accuracy = evaluate.load("accuracy")
 84 | f1 = evaluate.load("f1")
 85 | precision = evaluate.load("precision")
 86 | recall = evaluate.load("recall")
 87 | 
 88 | def compute_metrics(eval_pred):
 89 |     logits, labels = eval_pred
 90 |     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
 91 |     decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True)
 92 |     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
 93 |     decoded_preds = [pred if pred.isdigit() else -1 for pred in decoded_preds]  # Replace non-digit predictions with '-1'
 94 |     decoded_labels = [label if label.isdigit() else -1 for label in decoded_labels]  # Replace non-digit labels with '-1'
 95 |     predictions = decoded_preds
 96 |     labels = decoded_labels
 97 |     metrics_result = {
 98 |         "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'],
 99 | 
100 |     }
101 |     
102 |     return metrics_result
103 | 
104 | # This modified function should now work without the TypeError
105 | 
106 | 
107 | data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
108 | 
109 | tokenized_train_dataset=tokenized_train_dataset.remove_columns(['text', 'label'])
110 | tokenized_test_dataset=tokenized_test_dataset.remove_columns(['text', 'label'])
111 | model_name = model_checkpoint.split("/")[-1]
112 | 
113 | transformers.logging.set_verbosity_info()
114 | training_args = Seq2SeqTrainingArguments(
115 |     output_dir=f"results/{model_name}",
116 |     eval_strategy="epoch",
117 |     save_strategy="epoch",
118 |     logging_strategy='epoch',
119 |     learning_rate=learning_rate,
120 |     per_device_train_batch_size=batch_size,
121 |     per_device_eval_batch_size=batch_size,
122 |     weight_decay=0.01,
123 |     save_total_limit=2,
124 |     num_train_epochs=num_train_epochs,
125 |     predict_with_generate=True,
126 |     load_best_model_at_end=True,
127 |     metric_for_best_model='eval_accuracy',
128 |     bf16=True,
129 |     lr_scheduler_type='cosine',
130 |     warmup_ratio=0.05,
131 | )
132 | 
133 | # Setting up the trainer
134 | trainer = Seq2SeqTrainer(
135 |     model=model,
136 |     args=training_args,
137 |     train_dataset=tokenized_train_dataset,
138 |     eval_dataset=tokenized_test_dataset,
139 |     tokenizer=tokenizer,
140 |     data_collator=data_collator,
141 |     compute_metrics=compute_metrics,
142 |     callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)]
143 | 
144 | )
145 | 
146 | 
147 | trainer.train()
148 | trainer.save_state()
149 | trainer.save_model()
150 | 


--------------------------------------------------------------------------------
/llm-lora_eng.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from transformers import AutoModelForCausalLM, AutoTokenizer
  3 | import pandas as pd
  4 | import numpy as np
  5 | import random
  6 | from datasets import Dataset, load_metric
  7 | import transformers
  8 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
  9 | from datasets import load_dataset, Dataset
 10 | import pandas as pd
 11 | import evaluate
 12 | import torch
 13 | import nltk
 14 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
 15 | import nltk
 16 | import argparse
 17 | import numpy as np
 18 | from transformers import AutoModelForSeq2SeqLM
 19 | from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
 20 | from trl import AutoModelForCausalLMWithValueHead
 21 | from transformers import TrainingArguments, Trainer
 22 | from trl import SFTTrainer
 23 | import evaluate
 24 | 
 25 | # model_name_or_path = "vtrungnhan9/vmlu-llm"
 26 | # rationale_col = 'cot_rationale'
 27 | 
 28 | # Initialize argparse
 29 | parser = argparse.ArgumentParser(description='Configure training parameters.')
 30 | 
 31 | # Add arguments for training configuration
 32 | parser.add_argument('--model_name_or_path', type=str, default='vtrungnhan9/vmlu-llm', help='vtrungnhan9/vmlu-llm')
 33 | parser.add_argument('--rationale_col', type=str, default='human_justification', help='cot_rationale')
 34 | # parser.add_argument('--learning_rate', type=float, default=1e-5, help='Learning rate for the optimizer')
 35 | # parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-base", help='Model checkpoint to use')
 36 | 
 37 | # Parse arguments
 38 | args = parser.parse_args()
 39 | model_name_or_path = args.model_name_or_path
 40 | rationale_col = args.rationale_col
 41 | 
 42 | peft_config = LoraConfig(
 43 |     task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
 44 | )
 45 | 
 46 | 
 47 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token='hf_GxsYTZDZhHcQEzYEvWrus')
 48 | model = AutoModelForCausalLM.from_pretrained(
 49 |     model_name_or_path,
 50 | #     load_in_8bit=True,
 51 |     torch_dtype=torch.bfloat16, 
 52 |     device_map="auto",
 53 | #     use_cache=True,
 54 |     cache_dir='./models',
 55 |     load_in_8bit=True,
 56 |     token='hf_GxsYTZDZhHcQEzYEvWrus'
 57 | 
 58 | )
 59 | 
 60 | model.enable_input_require_grads()
 61 | model = get_peft_model(model, peft_config)
 62 | model.print_trainable_parameters()
 63 | 
 64 | 
 65 | train_df = pd.read_excel('train_eng.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)
 66 | train_df['label'] = train_df['label'].astype(str)
 67 | 
 68 | train_dataset = Dataset.from_pandas(train_df)
 69 | 
 70 | testset =  pd.read_excel('test_eng.xlsx')
 71 | 
 72 | testset['label'] = testset['label'].astype(str)
 73 | print(train_df['label'].unique())
 74 | print(testset['label'].unique())
 75 | test_dataset = Dataset.from_pandas(testset[['text', 'label']])
 76 | 
 77 | 
 78 | def template(inp, out, rationale=''):
 79 | #     if rationale 
 80 |     conversation = [
 81 |         {"role": "user", "content": f"""sentiment analysis: '{inp.strip()}'"""},
 82 |     ]
 83 | #     print(out)
 84 |     prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
 85 |     prompt = (prompt +str(out).strip()+'\n'+rationale.strip()).strip()
 86 |     print(prompt)
 87 |     return prompt
 88 | # , train_dataset[rationale_col]
 89 | # reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]
 90 | if rationale_col == '':
 91 |     new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
 92 | else:
 93 |     new_column_train = [template(inp, out, rationale) for inp, out, rationale in zip(train_dataset['text'], train_dataset['label'], train_dataset[rationale_col])]
 94 | train_dataset= train_dataset.add_column("train_text", new_column_train)
 95 | # new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
 96 | # test_dataset= test_dataset.add_column("train_text", new_column_train)
 97 | 
 98 | 
 99 | # Load the individual metrics
100 | accuracy = evaluate.load("accuracy")
101 | f1 = evaluate.load("f1")
102 | precision = evaluate.load("precision")
103 | recall = evaluate.load("recall")
104 | 
def _parse_label(text):
    """Turn decoded text into an integer class id; non-numeric text becomes -1."""
    text = text.strip()
    return int(text) if text.isdecimal() else -1


def compute_metrics(eval_pred):
    """Compute accuracy and F1 scores from an evaluation prediction pair.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The logits are arg-maxed over
            their last axis and both arrays are decoded with the module-level
            ``tokenizer``; ``-100`` entries in ``labels`` are replaced by the
            pad token id before decoding.

    Returns:
        dict with ``accuracy``, ``macro_f1`` and the per-class scores
        ``f1_neg``, ``f1_neu``, ``f1_pos`` (classes 0, 1, 2; assumes
        0/1/2 = negative/neutral/positive, as the original unpacking implied).
    """
    logits, labels = eval_pred
    pred_ids = np.argmax(logits, axis=-1)
    print(pred_ids, labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    print('decoded_preds', decoded_preds)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Use one integer type for every label; the original mixed str digits
    # with the int -1, which the metrics cannot compare consistently.
    predictions = [_parse_label(pred) for pred in decoded_preds]
    references = [_parse_label(label) for label in decoded_labels]

    # Pin the class list so exactly three scores come back even when a class
    # is missing or an invalid (-1) prediction is present; otherwise the
    # three-way unpacking below raises ValueError.
    per_class_f1 = f1.compute(
        predictions=predictions, references=references, average=None, labels=[0, 1, 2]
    )['f1']
    print(per_class_f1)
    print(set(predictions))
    neg, neu, pos = per_class_f1

    metrics_result = {
        "accuracy": accuracy.compute(predictions=predictions, references=references)['accuracy'],
        "macro_f1": f1.compute(predictions=predictions, references=references, average='macro')['f1'],
        "f1_neg": neg,
        "f1_neu": neu,
        "f1_pos": pos,
    }

    return metrics_result
132 | 
# Checkpoint directory: results/<last path component of the model name>_<rationale column>v2
output_dir = f"results/{model_name_or_path.split('/')[-1]}_{rationale_col}v2"

training_args = TrainingArguments(
    output_dir=output_dir,
    # schedule
    num_train_epochs=5,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    # batching: 16 samples x 2 accumulation steps per optimizer update, per device
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    dataloader_num_workers=4,
    # numerics / optimizer
    optim="adamw_bnb_8bit",
    bf16=True,
    # logging and checkpointing, once per epoch, keeping at most 3 checkpoints
    logging_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=3,
    report_to=[],
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.shuffle(),
    dataset_text_field='train_text',
    max_seq_length=180,  # maximum packed length
    packing=True,  # pack samples together for efficient training
    peft_config=peft_config,
    compute_metrics=compute_metrics,
)

# Fine-tune, then write the final model to the output directory.
trainer.train()
trainer.save_model()
168 | 
169 | # trainer.evaluate()
170 | 
171 | 
172 | 


--------------------------------------------------------------------------------
/llm-lora.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | from transformers import AutoModelForCausalLM, AutoTokenizer
  3 | import pandas as pd
  4 | import numpy as np
  5 | import random
  6 | from datasets import Dataset, load_metric
  7 | import transformers
  8 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
  9 | from datasets import load_dataset, Dataset
 10 | import pandas as pd
 11 | import evaluate
 12 | import torch
 13 | import nltk
 14 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
 15 | import nltk
 16 | import argparse
 17 | import numpy as np
 18 | from transformers import AutoModelForSeq2SeqLM
 19 | from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
 20 | from trl import AutoModelForCausalLMWithValueHead
 21 | from transformers import TrainingArguments, Trainer
 22 | from trl import SFTTrainer
 23 | import evaluate
 24 | 
 25 | # model_name_or_path = "vtrungnhan9/vmlu-llm"
 26 | # rationale_col = 'cot_rationale'
 27 | 
 28 | # Initialize argparse
 29 | parser = argparse.ArgumentParser(description='Configure training parameters.')
 30 | 
 31 | # Add arguments for training configuration
 32 | parser.add_argument('--model_name_or_path', type=str, default='vtrungnhan9/vmlu-llm', help='vtrungnhan9/vmlu-llm')
 33 | parser.add_argument('--rationale_col', type=str, default='human_justification', help='cot_rationale')
 34 | # parser.add_argument('--learning_rate', type=float, default=1e-5, help='Learning rate for the optimizer')
 35 | # parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-base", help='Model checkpoint to use')
 36 | 
 37 | # Parse arguments
 38 | args = parser.parse_args()
 39 | model_name_or_path = args.model_name_or_path
 40 | rationale_col = args.rationale_col
 41 | 
 42 | peft_config = LoraConfig(
 43 |     task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
 44 | )
 45 | 
 46 | 
 47 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
 48 | model = AutoModelForCausalLM.from_pretrained(
 49 |     model_name_or_path,
 50 | #     load_in_8bit=True,
 51 |     torch_dtype=torch.bfloat16, 
 52 |     device_map="auto",
 53 | #     use_cache=True,
 54 |     cache_dir='./models',
 55 |     load_in_8bit=True,
 56 | 
 57 | )
 58 | 
 59 | model.enable_input_require_grads()
 60 | model = get_peft_model(model, peft_config)
 61 | model.print_trainable_parameters()
 62 | 
 63 | 
 64 | train_df = pd.read_excel('multitask/distilling-step-by-step/train_rationale.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)
 65 | train_df['label'] = train_df['label'].astype(str)
 66 | 
 67 | train_dataset = Dataset.from_pandas(train_df)
 68 | 
 69 | testset =  pd.read_excel('test.xlsx')
 70 | 
 71 | testset['label'] = testset['label'].astype(str)
 72 | print(train_df['label'].unique())
 73 | print(testset['label'].unique())
 74 | test_dataset = Dataset.from_pandas(testset[['text', 'label']])
 75 | 
 76 | 
 77 | def template(inp, out, rationale=''):
 78 | #     if rationale 
 79 |     conversation = [
 80 |         {"role": "system", "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực." },
 81 |         {"role": "user", "content": f"""nhận diện cảm xúc: '{inp.strip()}'"""},
 82 |     ]
 83 | #     print(out)
 84 |     prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
 85 |     prompt = (prompt +str(out).strip()+'\n'+rationale.strip()).strip()
 86 |     print(prompt)
 87 |     return prompt
 88 | # , train_dataset[rationale_col]
 89 | # reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]
 90 | if rationale_col == '':
 91 |     new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
 92 | else:
 93 |     new_column_train = [template(inp, out, rationale) for inp, out, rationale in zip(train_dataset['text'], train_dataset['label'], train_dataset[rationale_col])]
 94 | train_dataset= train_dataset.add_column("train_text", new_column_train)
 95 | # new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
 96 | # test_dataset= test_dataset.add_column("train_text", new_column_train)
 97 | 
 98 | 
 99 | # Load the individual metrics
100 | accuracy = evaluate.load("accuracy")
101 | f1 = evaluate.load("f1")
102 | precision = evaluate.load("precision")
103 | recall = evaluate.load("recall")
104 | 
def _parse_label(text):
    """Turn decoded text into an integer class id; non-numeric text becomes -1."""
    text = text.strip()
    return int(text) if text.isdecimal() else -1


def compute_metrics(eval_pred):
    """Compute accuracy and F1 scores from an evaluation prediction pair.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The logits are arg-maxed over
            their last axis and both arrays are decoded with the module-level
            ``tokenizer``; ``-100`` entries in ``labels`` are replaced by the
            pad token id before decoding.

    Returns:
        dict with ``accuracy``, ``macro_f1`` and the per-class scores
        ``f1_neg``, ``f1_neu``, ``f1_pos`` for classes 0 (negative),
        1 (no sentiment) and 2 (positive), the coding given in the system
        prompt used by ``template``.
    """
    logits, labels = eval_pred
    pred_ids = np.argmax(logits, axis=-1)
    print(pred_ids, labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    print('decoded_preds', decoded_preds)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Use one integer type for every label; the original mixed str digits
    # with the int -1, which the metrics cannot compare consistently.
    predictions = [_parse_label(pred) for pred in decoded_preds]
    references = [_parse_label(label) for label in decoded_labels]

    # Pin the class list so exactly three scores come back even when a class
    # is missing or an invalid (-1) prediction is present; otherwise the
    # three-way unpacking below raises ValueError.
    per_class_f1 = f1.compute(
        predictions=predictions, references=references, average=None, labels=[0, 1, 2]
    )['f1']
    print(per_class_f1)
    print(set(predictions))
    neg, neu, pos = per_class_f1

    metrics_result = {
        "accuracy": accuracy.compute(predictions=predictions, references=references)['accuracy'],
        "macro_f1": f1.compute(predictions=predictions, references=references, average='macro')['f1'],
        "f1_neg": neg,
        "f1_neu": neu,
        "f1_pos": pos,
    }

    return metrics_result
132 | 
# Hyper-parameters for the run, collected in one mapping before being
# handed to TrainingArguments.
training_settings = {
    "output_dir": f"results/{model_name_or_path.split('/')[-1]}_{rationale_col}v2",
    "num_train_epochs": 5,
    "per_device_train_batch_size": 16,
    "gradient_accumulation_steps": 2,  # 16 x 2 samples per optimizer update, per device
    "learning_rate": 2e-4,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 100,
    "optim": "adamw_bnb_8bit",
    "bf16": True,
    "dataloader_num_workers": 4,
    "logging_strategy": "epoch",
    "save_strategy": 'epoch',
    "save_total_limit": 3,
    "report_to": [],
}
training_args = TrainingArguments(**training_settings)

# Supervised fine-tuning on the shuffled "train_text" column; samples are
# packed together into sequences of at most 180 tokens.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset.shuffle(),
    dataset_text_field='train_text',
    packing=True,
    max_seq_length=180,
    compute_metrics=compute_metrics,
    peft_config=peft_config,
)

# Fine-tune, then write the final model to the output directory.
trainer.train()
trainer.save_model()
168 | 
169 | # trainer.evaluate()
170 | 
171 | 
172 | 


--------------------------------------------------------------------------------
/inference.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from datasets import load_metric, load_dataset\n",
 10 |     "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n",
 11 |     "import torch \n",
 12 |     "import numpy as np\n",
 13 |     "from tqdm import tqdm\n",
 14 |     "metrics = load_metric('accuracy')\n",
 15 |     "import gc\n",
 16 |     "import os\n",
 17 |     "\n",
 18 |     "def inference(path):\n",
 19 |     "  prefix = 'summarize: ' if 'mt5' in path else ''\n",
 20 |     "  tokenizer = AutoTokenizer.from_pretrained(path)\n",
 21 |     "  model = AutoModelForSeq2SeqLM.from_pretrained(path)\n",
 22 |     "  max_length = 1024 if 'bert' not in path else 256\n",
 23 |     "  def preprocess_function(examples):\n",
 24 |     "    inputs = [prefix + doc for doc in examples[\"text\"]]\n",
 25 |     "    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)\n",
 26 |     "    labels = tokenizer(text_target=examples[\"label\"], max_length=5, truncation=True, padding=True)\n",
 27 |     "    model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
 28 |     "    return model_inputs\n",
 29 |     "\n",
 30 |     "  testset =  pd.read_excel('test.xlsx')\n",
 31 |     "  testset['label'] = testset['label'].astype(str)\n",
 32 |     "  dataset = Dataset.from_pandas(testset[['text', 'label']])\n",
 33 |     "\n",
 34 |     "#   dataset = load_dataset(\"json\", data_files=\"datasets/faq/test/faq_test.json\", split='train')\n",
 35 |     "  test_tokenized_datasets = dataset.map(preprocess_function, batched=True)\n",
 36 |     "  data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors=\"pt\")\n",
 37 |     "  model.to('cuda')\n",
 38 |     "\n",
 39 |     "\n",
 40 |     "  max_target_length = 5\n",
 41 |     "  test_tokenized_datasets = test_tokenized_datasets.remove_columns(['text', 'label'])\n",
 42 |     "  dataloader = torch.utils.data.DataLoader(test_tokenized_datasets, collate_fn=data_collator, batch_size=32)\n",
 43 |     "\n",
 44 |     "  predictions = []\n",
 45 |     "  references = []\n",
 46 |     "  for i, batch in enumerate(tqdm(dataloader)):\n",
 47 |     "  outputs = model.generate(\n",
 48 |     "    input_ids=batch['input_ids'].to('cuda'),\n",
 49 |     "    max_length=max_target_length,\n",
 50 |     "    attention_mask=batch['attention_mask'].to('cuda'),\n",
 51 |     "  )\n",
 52 |     "  with tokenizer.as_target_tokenizer():\n",
 53 |     "    outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]\n",
 54 |     "\n",
 55 |     "    labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)\n",
 56 |     "    actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]\n",
 57 |     "  predictions.extend(outputs)\n",
 58 |     "  references.extend(actuals)\n",
 59 |     "  metrics.add_batch(predictions=outputs, references=actuals)\n",
 60 |     "\n",
 61 |     "  metrics.compute()\n",
 62 |     "\n",
 63 |     "  rouges = [{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]\n",
 64 |     "#   new_file_path = './r_scores_faq'\n",
 65 |     "#   # Write to the file\n",
 66 |     "#   try:\n",
 67 |     "#   # Attempt to append to the file\n",
 68 |     "#   with open(new_file_path, 'a') as file:\n",
 69 |     "#     file.write(path.split('/')[-2] + '\\n')\n",
 70 |     "#     for new_content_str in rouges:\n",
 71 |     "#       result = next(iter(new_content_str))\n",
 72 |     "#       file.write(f\"{result}: {new_content_str[result]}\\n\")\n",
 73 |     "#     file.write('\\n')\n",
 74 |     "#   action_result = \"Content appended to the existing file.\"\n",
 75 |     "#   except FileNotFoundError:\n",
 76 |     "#   # File doesn't exist, create it and write the content\n",
 77 |     "#   with open(new_file_path, 'w') as file:\n",
 78 |     "#     file.write(path)\n",
 79 |     "#     file.write(new_content_str)\n",
 80 |     "#   action_result = \"File did not exist, so it was created with the new content.\"\n",
 81 |     "  \n",
 82 |     "#   del model\n",
 83 |     "#   gc.collect()\n"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": null,
 89 |    "metadata": {},
 90 |    "outputs": [],
 91 |    "source": [
 92 |     "import pandas as pd\n",
 93 |     "from datasets import Dataset\n",
 94 |     "import evaluate\n",
 95 |     "from datasets import load_metric, load_dataset\n",
 96 |     "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n",
 97 |     "import torch \n",
 98 |     "import numpy as np\n",
 99 |     "from tqdm import tqdm\n",
100 |     "metrics = load_metric('accuracy')\n",
101 |     "import gc\n",
102 |     "import os\n",
103 |     "\n",
104 |     "accuracy = evaluate.load(\"accuracy\")\n",
105 |     "f1 = evaluate.load(\"f1\")\n",
106 |     "precision = evaluate.load(\"precision\")\n",
107 |     "recall = evaluate.load(\"recall\")\n",
108 |     "\n",
109 |     "path = './multitask/distilling-step-by-step/ckpts/VietAI/vit5-base_human_justification/'\n",
110 |     "# path = 'multitask/distilling-step-by-step/ckpts//'\n",
111 |     "# path = 'results/flan-t5-base/'\n",
112 |     "prefix = 'gt: ' if 'distilling' in path else ''\n",
113 |     "tokenizer = AutoTokenizer.from_pretrained(path)\n",
114 |     "model = AutoModelForSeq2SeqLM.from_pretrained(path)\n",
115 |     "max_length = 1024 if 'bert' not in path else 256\n",
116 |     "def preprocess_function(examples):\n",
117 |     "  inputs = [prefix + doc for doc in examples[\"text\"]]\n",
118 |     "  model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)\n",
119 |     "  labels = tokenizer(text_target=examples[\"label\"], max_length=5, truncation=True, padding=True)\n",
120 |     "  model_inputs[\"labels\"] = labels[\"input_ids\"]\n",
121 |     "  return model_inputs\n",
122 |     "\n",
123 |     "testset =  pd.read_excel('test.xlsx')\n",
124 |     "test_with_asr = pd.read_excel('test_asr.xlsx')\n",
125 |     "testset['text'] = test_with_asr['asr']\n",
126 |     "testset['label'] = testset['label'].astype(str)\n",
127 |     "dataset = Dataset.from_pandas(testset[['text', 'label']])\n",
128 |     "\n",
129 |     "#   dataset = load_dataset(\"json\", data_files=\"datasets/faq/test/faq_test.json\", split='train')\n",
130 |     "test_tokenized_datasets = dataset.map(preprocess_function, batched=True)\n",
131 |     "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors=\"pt\")\n",
132 |     "model.to('cuda:2')\n",
133 |     "\n",
134 |     "\n",
135 |     "max_target_length = 25\n",
136 |     "test_tokenized_datasets = test_tokenized_datasets.remove_columns(['text', 'label'])\n",
137 |     "dataloader = torch.utils.data.DataLoader(test_tokenized_datasets.select(idx), collate_fn=data_collator, batch_size=32)\n",
138 |     "\n",
139 |     "predictions = []\n",
140 |     "references = []\n",
141 |     "for i, batch in enumerate(tqdm(dataloader)):\n",
142 |     "  outputs = model.generate(\n",
143 |     "  input_ids=batch['input_ids'].to('cuda:2'),\n",
144 |     "  max_length=max_target_length,\n",
145 |     "  attention_mask=batch['attention_mask'].to('cuda:2'),\n",
146 |     "  )\n",
147 |     "  with tokenizer.as_target_tokenizer():\n",
148 |     "      outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]\n",
149 |     "\n",
150 |     "      labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)\n",
151 |     "      actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]\n",
152 |     "      predictions.extend(outputs)\n",
153 |     "      references.extend(actuals)\n",
154 |     "# metrics.add_batch(predictions=outputs, references=actuals)\n",
155 |     "\n",
156 |     "# metrics.compute()\n",
157 |     "\n",
158 |     "def compute_metrics(predictions, references):\n",
159 |     "  decoded_preds, decoded_labels = predictions, references\n",
160 |     "#   logits = np.argmax(logits, axis=1)\n",
161 |     "  decoded_preds = [pred if pred.isdigit() else '-1' for pred in decoded_preds]  # Replace non-digit predictions with '-1'\n",
162 |     "  decoded_labels = [label if label.isdigit() else '-1' for label in decoded_labels]  # Replace non-digit labels with '-1'\n",
163 |     "  predictions = decoded_preds\n",
164 |     "  labels = decoded_labels\n",
165 |     "  neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1']\n",
166 |     "  metrics_result = {\n",
167 |     "    \"accuracy\": accuracy.compute(predictions=predictions, references=labels)['accuracy'],\n",
168 |     "    \"macro_f1\": f1.compute(predictions=predictions, references=labels, average='macro')['f1'],\n",
169 |     "#   \"macro_precision\": precision.compute(predictions=predictions, references=labels, average='macro')['precision'],\n",
170 |     "#   \"macro_recall\": recall.compute(predictions=predictions, references=labels, average='macro')['recall'],\n",
171 |     "    \"f1_neg\": neg,\n",
172 |     "    \"f1_neu\": neu,\n",
173 |     "    \"f1_pos\": pos\n",
174 |     "\n",
175 |     "  }\n",
176 |     "  return metrics_result\n",
177 |     "# rouges = [{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]\n",
178 |     "del model\n",
179 |     "gc.collect()\n",
180 |     "\n",
181 |     "print(compute_metrics(predictions, references))"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "from tqdm import tqdm\n",
191 |     "import random\n",
192 |     "from datasets import Dataset, load_metric\n",
193 |     "import transformers\n",
194 |     "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n",
195 |     "from datasets import load_dataset, Dataset\n",
196 |     "import pandas as pd\n",
197 |     "import evaluate\n",
198 |     "import torch\n",
199 |     "import nltk\n",
200 |     "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n",
201 |     "import nltk\n",
202 |     "import argparse\n",
203 |     "import numpy as np\n",
204 |     "from transformers import AutoModelForSeq2SeqLM\n",
205 |     "from peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n",
206 |     "from trl import AutoModelForCausalLMWithValueHead\n",
207 |     "from transformers import TrainingArguments, Trainer\n",
208 |     "from trl import SFTTrainer\n",
209 |     "import evaluate\n",
210 |     "from transformers import AutoModelForCausalLM\n",
211 |     "from peft import PeftModel\n",
212 |     "\n",
213 |     "\n",
214 |     "base_model_name = 'vtrungnhan9/vmlu-llm'\n",
215 |     "print(\"loading\")\n",
216 |     "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
217 |     "model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')\n",
218 |     "model = PeftModel.from_pretrained(model, './results/vmlu-llm_human_justificationv2/')\n",
219 |     "print('finished loadding')\n",
220 |     "model = model.merge_and_unload()\n",
221 |     "model = model.cuda()\n",
222 |     "\n",
223 |     "\n",
224 |     "train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)\n",
225 |     "train_df['label'] = train_df['label'].astype(str)\n",
226 |     "\n",
227 |     "train_dataset = Dataset.from_pandas(train_df)\n",
228 |     "\n",
229 |     "testset =  pd.read_excel('test.xlsx')\n",
230 |     "\n",
231 |     "testset['label'] = testset['label'].astype(str)\n",
232 |     "print(train_df['label'].unique())\n",
233 |     "print(testset['label'].unique())\n",
234 |     "test_dataset = Dataset.from_pandas(testset[['text', 'label']])\n",
235 |     "\n",
236 |     "def template(inp, out):\n",
237 |     "    conversation = [{\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n",
238 |     "                    {\"role\": \"user\", \"content\": f\"\"\"nhận diện cảm xúc: '{inp.strip()}'\"\"\"},\n",
    "                    {'role': 'assistant', 'content': str(out)}\n",
240 |     "                   ]\n",
241 |     "#     print(out)\n",
242 |     "    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n",
243 |     "#     prompt = prompt + ' '\n",
244 |     "    return prompt\n",
245 |     "\n",
246 |     "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n",
247 |     "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n",
248 |     "train_dataset= train_dataset.add_column(\"train_text\", new_column_train)\n",
249 |     "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n",
250 |     "test_dataset= test_dataset.add_column(\"train_text\", new_column_train)\n",
251 |     "\n",
252 |     "outs = []\n",
253 |     "i = 0\n",
254 |     "# print(\"Start inference\")\n",
255 |     "# for tt in (test_dataset['train_text']):\n",
256 |     "#     if i % 100 == 0:\n",
257 |     "#         print(i)\n",
258 |     "#     input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n",
259 |     "#     out_ids = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id, output_scores=True)\n",
260 |     "\n",
261 |     "#     assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n",
262 |     "# #     print(assistant)\n",
263 |     "#     outs.append(assistant)\n",
264 |     "#     i += 1\n",
265 |     "# #     break\n",
266 |     "# del model\n",
267 |     "# gc.collect()\n",
268 |     "outs = []\n",
269 |     "batch_size=32\n",
270 |     "print(\"Start inference\")\n",
271 |     "for i in tqdm(range(0, len(test_dataset), batch_size)):\n",
272 |     "    batch = test_dataset[i:i + batch_size]\n",
273 |     "    inputs = tokenizer(batch['train_text'], return_tensors='pt', padding=True, truncation=True).input_ids.cuda()\n",
274 |     "    outputs = model.generate(inputs, max_new_tokens=2, pad_token_id=tokenizer.eos_token_id)\n",
275 |     "    decoded_outputs = tokenizer.batch_decode(outputs[:, inputs.size(1):], skip_special_tokens=True)\n",
276 |     "    outs.extend([output.strip() for output in decoded_outputs])\n",
277 |     "#     break\n",
278 |     "\n",
279 |     "# Cleanup\n",
280 |     "del model\n",
281 |     "import gc\n",
282 |     "gc.collect()\n",
283 |     "torch.cuda.empty_cache()\n",
284 |     "\n",
285 |     "\n"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "import pandas as pd\n",
295 |     "from datasets import Dataset\n",
296 |     "import evaluate\n",
297 |     "from datasets import load_metric, load_dataset\n",
298 |     "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n",
299 |     "import torch \n",
300 |     "import numpy as np\n",
301 |     "from tqdm import tqdm\n",
302 |     "metrics = load_metric('accuracy')\n",
303 |     "import gc\n",
304 |     "import os\n",
305 |     "\n",
306 |     "accuracy = evaluate.load(\"accuracy\")\n",
307 |     "f1 = evaluate.load(\"f1\")\n",
308 |     "precision = evaluate.load(\"precision\")\n",
309 |     "recall = evaluate.load(\"recall\")\n",
310 |     "\n",
311 |     "def compute_metrics(predictions, references):\n",
312 |     "  decoded_preds, decoded_labels = predictions, references\n",
313 |     "#   logits = np.argmax(logits, axis=1)\n",
314 |     "  decoded_preds = [pred if pred.isdigit() else '-1' for pred in decoded_preds]  # Replace non-digit predictions with '-1'\n",
315 |     "  decoded_labels = [label if label.isdigit() else '-1' for label in decoded_labels]  # Replace non-digit labels with '-1'\n",
316 |     "  predictions = decoded_preds\n",
317 |     "  labels = decoded_labels\n",
318 |     "  neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1']\n",
319 |     "  metrics_result = {\n",
320 |     "    \"accuracy\": accuracy.compute(predictions=predictions, references=labels)['accuracy'],\n",
321 |     "    \"macro_f1\": f1.compute(predictions=predictions, references=labels, average='macro')['f1'],\n",
322 |     "#   \"macro_precision\": precision.compute(predictions=predictions, references=labels, average='macro')['precision'],\n",
323 |     "#   \"macro_recall\": recall.compute(predictions=predictions, references=labels, average='macro')['recall'],\n",
324 |     "    \"f1_neg\": neg,\n",
325 |     "    \"f1_neu\": neu,\n",
326 |     "    \"f1_pos\": pos\n",
327 |     "\n",
328 |     "  }\n",
329 |     "  return metrics_result\n",
330 |     "\n",
331 |     "references = (testset['label'])\n",
332 |     "compute_metrics(outs, references)"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": null,
338 |    "metadata": {},
339 |    "outputs": [],
340 |    "source": [
341 |     "from tqdm import tqdm\n",
342 |     "import random\n",
343 |     "from datasets import Dataset, load_metric\n",
344 |     "import transformers\n",
345 |     "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n",
346 |     "from datasets import load_dataset, Dataset\n",
347 |     "import pandas as pd\n",
348 |     "import evaluate\n",
349 |     "import torch\n",
350 |     "import nltk\n",
351 |     "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n",
352 |     "import nltk\n",
353 |     "import argparse\n",
354 |     "import numpy as np\n",
355 |     "from transformers import AutoModelForSeq2SeqLM\n",
356 |     "from peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n",
357 |     "from trl import AutoModelForCausalLMWithValueHead\n",
358 |     "from transformers import TrainingArguments, Trainer\n",
359 |     "from trl import SFTTrainer\n",
360 |     "import evaluate\n",
361 |     "from transformers import AutoModelForCausalLM\n",
362 |     "from peft import PeftModel\n",
363 |     "\n",
364 |     "\n",
365 |     "base_model_name = 'vtrungnhan9/vmlu-llm'\n",
366 |     "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
367 |     "model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')\n",
368 |     "# Adapter loading is left disabled in this cell; the base checkpoint is used as-is.\n",
369 |     "# model = PeftModel.from_pretrained(model, './Vistral-7B-Chat_no')\n",
370 |     "# model = model.merge_and_unload()\n",
371 |     "model = model.cuda()\n",
372 |     "\n",
373 |     "# Training split; labels are cast to str.\n",
374 |     "train_df = pd.read_excel('train.xlsx')\n",
375 |     "train_df['label'] = train_df['label'].astype(str)\n",
376 |     "train_dataset = Dataset.from_pandas(train_df)\n",
377 |     "\n",
378 |     "# Load the test split BEFORE swapping in the ASR transcripts. Assigning into\n",
379 |     "# `testset` first fails on a fresh kernel, and reloading it afterwards discards the\n",
380 |     "# ASR text. Assumes both files share the same row order - TODO confirm.\n",
381 |     "testset = pd.read_excel('test.xlsx')\n",
382 |     "test_with_asr = pd.read_excel('test_asr.xlsx')\n",
383 |     "testset['text'] = test_with_asr['asr']\n",
384 |     "testset['label'] = testset['label'].astype(str)\n",
385 |     "print(train_df['label'].unique())\n",
386 |     "print(testset['label'].unique())\n",
387 |     "test_dataset = Dataset.from_pandas(testset[['text', 'label']])\n",
388 |     "\n",
389 |     "def template(inp, out):\n",
390 |     "    \"\"\"Render the sentiment prompt for `inp` with the tokenizer's chat template; `out` is unused.\"\"\"\n",
391 |     "    system_turn = {\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\"}\n",
392 |     "    user_turn = {\"role\": \"user\", \"content\": f\"\"\"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực. Nhận diện cảm xúc: '{inp.strip()}'\"\"\"}\n",
393 |     "    # tokenize=False: the templated conversation is kept as text.\n",
394 |     "    rendered = tokenizer.apply_chat_template([system_turn, user_turn], tokenize=False)\n",
395 |     "    # Every rendered prompt is echoed to the cell output.\n",
396 |     "    print(rendered)\n",
397 |     "    return rendered\n",
398 |     "\n",
399 |     "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n",
400 |     "# Render one prompt per example and store it in a `train_text` column of each split.\n",
401 |     "new_column_train = list(map(template, train_dataset['text'], train_dataset['label']))\n",
402 |     "train_dataset = train_dataset.add_column(\"train_text\", new_column_train)\n",
403 |     "new_column_train = list(map(template, test_dataset['text'], test_dataset['label']))\n",
404 |     "test_dataset = test_dataset.add_column(\"train_text\", new_column_train)\n",
405 |     "# Fresh result list and progress counter for the inference loop.\n",
406 |     "outs, i = [], 0\n",
407 |     "# for tt in (test_dataset['train_text']):\n",
408 |     "#     if i % 500 == 0:\n",
409 |     "#         print(i)\n",
410 |     "#         print(outs)\n",
411 |     "#     input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n",
412 |     "#     out_ids = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)\n",
413 |     "\n",
414 |     "#     assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n",
415 |     "#     outs.append(assistant)\n",
416 |     "# #     print(outs)\n",
417 |     "    \n",
418 |     "#     i += 1"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": null,
424 |    "metadata": {},
425 |    "outputs": [],
426 |    "source": [
427 |     "outs = []\n",
428 |     "# enumerate() restarts the progress counter on every run; a global `i` set in\n",
429 |     "# another cell would keep growing each time this cell is executed again.\n",
430 |     "for i, tt in enumerate(test_dataset['train_text']):\n",
431 |     "    if i % 500 == 0:\n",
432 |     "        print(i)\n",
433 |     "        print(outs)\n",
434 |     "    input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n",
435 |     "    out_ids = model.generate(input_ids, max_new_tokens=2, pad_token_id=tokenizer.eos_token_id)\n",
436 |     "\n",
437 |     "    # Decode only the tokens produced after the prompt.\n",
438 |     "    assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n",
439 |     "    outs.append(assistant)"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": null,
445 |    "metadata": {},
446 |    "outputs": [],
447 |    "source": [
448 |     "def template(inp, out):\n",
449 |     "    \"\"\"Render the chat prompt asking for the sentiment of `inp`; `out` is unused.\"\"\"\n",
450 |     "    conversation = [{\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n",
451 |     "                    {\"role\": \"user\", \"content\": f\"\"\"Nhận diện cảm xúc: '{inp.strip()}'\"\"\"}]\n",
452 |     "    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n",
453 |     "    print(prompt)\n",
454 |     "    return prompt\n",
455 |     "\n",
456 |     "# `Dataset.add_column` raises on an existing column name, so any `train_text`\n",
457 |     "# left by a previous prompt cell is dropped before the prompts are re-rendered.\n",
458 |     "if 'train_text' in train_dataset.column_names:\n",
459 |     "    train_dataset = train_dataset.remove_columns('train_text')\n",
460 |     "if 'train_text' in test_dataset.column_names:\n",
461 |     "    test_dataset = test_dataset.remove_columns('train_text')\n",
462 |     "train_dataset = train_dataset.add_column(\"train_text\", [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])])\n",
463 |     "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n",
464 |     "test_dataset = test_dataset.add_column(\"train_text\", new_column_train)\n",
465 |     "outs, i = [], 0\n"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": null,
471 |    "metadata": {},
472 |    "outputs": [],
473 |    "source": [
474 |     "from tqdm import tqdm\n",
475 |     "import random\n",
476 |     "from datasets import Dataset, load_metric\n",
477 |     "import transformers\n",
478 |     "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n",
479 |     "from datasets import load_dataset, Dataset\n",
480 |     "import pandas as pd\n",
481 |     "import evaluate\n",
482 |     "import torch\n",
483 |     "import nltk\n",
484 |     "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n",
485 |     "import nltk\n",
486 |     "import argparse\n",
487 |     "import numpy as np\n",
488 |     "from transformers import AutoModelForSeq2SeqLM\n",
489 |     "from peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n",
490 |     "from trl import AutoModelForCausalLMWithValueHead\n",
491 |     "from transformers import TrainingArguments, Trainer\n",
492 |     "from trl import SFTTrainer\n",
493 |     "import evaluate\n",
494 |     "from transformers import AutoModelForCausalLM\n",
495 |     "from peft import PeftModel\n",
496 |     "\n",
497 |     "\n",
498 |     "base_model_name = 'Viet-Mistral/Vistral-7B-Chat'\n",
499 |     "print(\"loading\")\n",
500 |     "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
501 |     "# Base checkpoint (cached under ./models) wrapped with the fine-tuned adapter.\n",
502 |     "model = PeftModel.from_pretrained(\n",
503 |     "    AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models'),\n",
504 |     "    './results/Vistral-7B-Chat_human_justification/',\n",
505 |     ")\n",
506 |     "print('finished laoding')\n",
507 |     "# Merge the adapter into the base weights, then move the result to GPU 7.\n",
508 |     "model = model.merge_and_unload().to('cuda:7')\n",
509 |     "\n",
510 |     "# Labels are handled as strings in both splits.\n",
511 |     "train_df = pd.read_excel('train.xlsx').astype({'label': str})\n",
512 |     "train_dataset = Dataset.from_pandas(train_df)\n",
513 |     "\n",
514 |     "# The test text is replaced by the `asr` column of test_asr.xlsx.\n",
515 |     "test_with_asr = pd.read_excel('test_asr.xlsx')\n",
516 |     "testset = pd.read_excel('test.xlsx')\n",
517 |     "testset['text'] = test_with_asr['asr']\n",
518 |     "testset['label'] = testset['label'].astype(str)\n",
519 |     "print(train_df['label'].unique()); print(testset['label'].unique())\n",
520 |     "test_dataset = Dataset.from_pandas(testset[['text', 'label', 'human_justification']])\n",
521 |     "\n",
522 |     "def template(inp, out):\n",
523 |     "    \"\"\"Build the inference prompt for one utterance.\n",
524 |     "\n",
525 |     "    `out` is accepted but not used. A single trailing space is appended to the\n",
526 |     "    text produced by the tokenizer's chat template.\n",
527 |     "    \"\"\"\n",
528 |     "    system_msg = \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\"\n",
529 |     "    user_msg = f\"\"\"nhận diện cảm xúc: '{inp.strip()}'\"\"\"\n",
530 |     "    messages = [{\"role\": \"system\", \"content\": system_msg}, {\"role\": \"user\", \"content\": user_msg}]\n",
531 |     "    rendered = tokenizer.apply_chat_template(messages, tokenize=False)\n",
532 |     "    return rendered + ' '\n",
533 |     "\n",
534 |     "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n",
535 |     "train_dataset = train_dataset.add_column(\"train_text\", new_column_train)\n",
536 |     "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n",
537 |     "test_dataset = test_dataset.add_column(\"train_text\", new_column_train)\n",
538 |     "\n",
539 |     "outs = []\n",
540 |     "print(\"Start inference\")\n",
541 |     "# NOTE(review): `idx` (the subset of test rows to evaluate) is not defined in this\n",
542 |     "# cell; it has to be provided by an earlier cell - confirm before running.\n",
543 |     "for i, tt in enumerate(test_dataset.select(idx)['train_text']):\n",
544 |     "    if i % 100 == 0:\n",
545 |     "        print(i, set(outs))\n",
546 |     "    # Send the inputs to wherever the model lives instead of repeating the device string.\n",
547 |     "    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)\n",
548 |     "    out_ids = model.generate(input_ids, max_new_tokens=25, pad_token_id=tokenizer.eos_token_id)\n",
549 |     "    # Keep only the continuation generated after the prompt.\n",
550 |     "    assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n",
551 |     "    outs.append(assistant)\n",
552 |     "\n",
553 |     "# Drop the model reference and collect garbage BEFORE emptying the CUDA cache;\n",
554 |     "# memory still owned by uncollected objects cannot be handed back to the driver.\n",
555 |     "del model\n",
556 |     "import gc\n",
557 |     "gc.collect()\n",
558 |     "torch.cuda.empty_cache()"
559 |    ]
560 |   },
561 |   {
562 |    "cell_type": "code",
563 |    "execution_count": null,
564 |    "metadata": {},
565 |    "outputs": [],
566 |    "source": [
567 |     "from evaluate import load\n",
568 |     "bertscore = load(\"bertscore\")\n",
569 |     "predictions = [o[2:] for o in outs]  # skip the first two characters (presumably label digit + separator - confirm)\n",
570 |     "references = test_dataset.select(idx)['human_justification']\n",
571 |     "results = bertscore.compute(predictions=predictions, references=references, lang=\"vi\")\n",
572 |     "sum(results['f1']) / len(results['f1'])  # mean F1 over the samples actually scored, not a fixed 100"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "code",
577 |    "execution_count": null,
578 |    "metadata": {},
579 |    "outputs": [],
580 |    "source": [
581 |     "from evaluate import load\n",
582 |     "rouge = load(\"rouge\")  # its default tokenizer keeps only [a-z0-9], which breaks Vietnamese words apart\n",
583 |     "predictions = [o[2:] for o in outs]  # skip the first two characters (presumably label digit + separator - confirm)\n",
584 |     "references = test_dataset.select(idx)['human_justification']\n",
585 |     "results = rouge.compute(predictions=predictions, references=references, tokenizer=lambda text: text.lower().split())\n",
586 |     "results"
587 |    ]
588 |   },
589 |   {
590 |    "cell_type": "code",
591 |    "execution_count": null,
592 |    "metadata": {},
593 |    "outputs": [],
594 |    "source": [
595 |     "test_samples = \"\"\"trả lại cho họ chất lượng cuộc sống bình thường như bao người khác là được nghe được nói thế nhưng điều kỳ diệu đã xảy\n",
596 |     "những chia sẻ vô cùng hữu ích và thiết thực vừa rồi ạ có thể thấy là hầu hết người bệnh nằm điều trị trong\n",
597 |     "khám suốt tiểu đường nó vẫn mệt mỏi vô khám tai biến bộ não vô khám nhưng mà xương thì nó loãng xương rất là nhiều\n",
598 |     "\"\"\".splitlines()\n",
599 |     "# splitlines() yields exactly the three utterances; splitting on the newline character also left an empty string that would match rows with empty text.\n",
600 |     "testdf = test_dataset.to_pandas()\n",
601 |     "testdf[testdf.text.isin(test_samples)]"
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "code",
606 |    "execution_count": null,
607 |    "metadata": {},
608 |    "outputs": [],
609 |    "source": [
610 |     "confidence = []\n",
611 |     "outs = []\n",
612 |     "# NOTE(review): the cell that runs inference over `test_dataset.select(idx)` ends with\n",
613 |     "# `del model`, so a model has to be loaded again before this cell can run.\n",
614 |     "# `outs[k]` and `confidence[k]` describe the k-th selected sample.\n",
615 |     "for i, tt in enumerate(testdf[testdf.text.isin(test_samples)]['train_text']):\n",
616 |     "    if i % 100 == 0:\n",
617 |     "        print(i, set(outs))\n",
618 |     "    # Place the inputs on the model's own device instead of assuming the default GPU.\n",
619 |     "    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)\n",
620 |     "    # Attention maps are not requested: nothing below reads them.\n",
621 |     "    output = model.generate(input_ids, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id, output_scores=True, return_dict_in_generate=True)\n",
622 |     "\n",
623 |     "    # Probability of the first generated token under the softmax of the first step's scores.\n",
624 |     "    first_token_id = output.sequences[0, input_ids.size(1)].item()\n",
625 |     "    confidence.append(output.scores[0].softmax(dim=1)[0, first_token_id].item())\n",
626 |     "\n",
627 |     "    # Decode only the tokens produced after the prompt.\n",
628 |     "    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n",
629 |     "    outs.append(assistant)"
630 |    ]
631 |   },
632 |   {
633 |    "cell_type": "code",
634 |    "execution_count": null,
635 |    "metadata": {},
636 |    "outputs": [],
637 |    "source": [
638 |     "import pandas as pd\n",
639 |     "from datasets import Dataset\n",
640 |     "import evaluate\n",
641 |     "from datasets import load_metric, load_dataset\n",
642 |     "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n",
643 |     "import torch \n",
644 |     "import numpy as np\n",
645 |     "from tqdm import tqdm\n",
646 |     "import gc\n",
647 |     "import os\n",
648 |     "\n",
649 |     "# Metric objects; `metrics` comes from the datasets loader, the rest from `evaluate`.\n",
650 |     "# precision and recall are loaded here but only referenced in commented-out code below.\n",
651 |     "metrics = load_metric('accuracy')\n",
652 |     "accuracy, f1 = evaluate.load(\"accuracy\"), evaluate.load(\"f1\")\n",
653 |     "precision, recall = evaluate.load(\"precision\"), evaluate.load(\"recall\")\n",
654 |     "\n",
655 |     "def compute_metrics(predictions, references):\n",
656 |     "  \"\"\"Return accuracy, macro F1 and per-class F1 for string sentiment labels.\n",
657 |     "\n",
658 |     "  Values that are not digit strings are mapped to '-1' so they count as errors.\n",
659 |     "  F1 is restricted to classes 0/1/2: without `labels`, a '-1' (or any other\n",
660 |     "  digit) adds a class, which breaks the three-way unpacking and skews macro F1.\n",
661 |     "  \"\"\"\n",
662 |     "  sentiment_classes = [0, 1, 2]\n",
663 |     "  predictions = [pred if pred.isdigit() else '-1' for pred in predictions]\n",
664 |     "  labels = [label if label.isdigit() else '-1' for label in references]\n",
665 |     "  neg, neu, pos = f1.compute(predictions=predictions, references=labels, average=None, labels=sentiment_classes)['f1']\n",
666 |     "  return {\n",
667 |     "    \"accuracy\": accuracy.compute(predictions=predictions, references=labels)['accuracy'],\n",
668 |     "    \"macro_f1\": f1.compute(predictions=predictions, references=labels, average='macro', labels=sentiment_classes)['f1'],\n",
669 |     "    \"f1_neg\": neg,\n",
670 |     "    \"f1_neu\": neu,\n",
671 |     "    \"f1_pos\": pos,\n",
672 |     "  }\n",
673 |     "\n",
674 |     "# Score the collected outputs against the `label` column of `testset`.\n",
675 |     "references = (testset['label'])\n",
676 |     "compute_metrics(outs, references)"
677 |    ]
678 |   },
679 |   {
680 |    "cell_type": "code",
681 |    "execution_count": null,
682 |    "metadata": {},
683 |    "outputs": [],
684 |    "source": [
685 |     "from tqdm import tqdm\n",
686 |     "import random\n",
687 |     "from datasets import Dataset, load_metric\n",
688 |     "import transformers\n",
689 |     "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n",
690 |     "from datasets import load_dataset, Dataset\n",
691 |     "import pandas as pd\n",
692 |     "import evaluate\n",
693 |     "import torch\n",
694 |     "import nltk\n",
695 |     "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n",
696 |     "import nltk\n",
697 |     "import argparse\n",
698 |     "import numpy as np\n",
699 |     "from transformers import AutoModelForSeq2SeqLM\n",
700 |     "from peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n",
701 |     "from trl import AutoModelForCausalLMWithValueHead\n",
702 |     "from transformers import TrainingArguments, Trainer\n",
703 |     "from trl import SFTTrainer\n",
704 |     "import evaluate\n",
705 |     "from transformers import AutoModelForCausalLM\n",
706 |     "from peft import PeftModel\n",
707 |     "\n",
708 |     "\n",
709 |     "base_model_name = 'Viet-Mistral/Vistral-7B-Chat'\n",
710 |     "print(\"loading\")\n",
711 |     "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n",
712 |     "# Base checkpoint (cached under ./models) wrapped with the fine-tuned adapter.\n",
713 |     "model = PeftModel.from_pretrained(\n",
714 |     "    AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models'),\n",
715 |     "    './results/Vistral-7B-Chat_human_justification/',\n",
716 |     ")\n",
717 |     "print('finished laoding')\n",
718 |     "# Merge the adapter into the base weights, then move the result to the default CUDA device.\n",
719 |     "model = model.merge_and_unload().to('cuda')\n",
720 |     "\n",
721 |     "# Labels are handled as strings in both splits.\n",
722 |     "train_df = pd.read_excel('train.xlsx').astype({'label': str})\n",
723 |     "train_dataset = Dataset.from_pandas(train_df)\n",
724 |     "\n",
725 |     "# The text from test.xlsx is used as-is; the ASR override below stays disabled.\n",
726 |     "# test_with_asr = pd.read_excel('test_asr.xlsx')\n",
727 |     "# testset['text'] = test_with_asr['asr']\n",
728 |     "testset = pd.read_excel('test.xlsx').astype({'label': str})\n",
729 |     "print(train_df['label'].unique())\n",
730 |     "print(testset['label'].unique())\n",
731 |     "test_dataset = Dataset.from_pandas(testset[['text', 'label']])\n",
732 |     "\n",
733 |     "def template(inp, out):\n",
734 |     "    \"\"\"Return the chat-formatted prompt for `inp` plus one trailing space; `out` is unused.\"\"\"\n",
735 |     "    system_msg = \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\"\n",
736 |     "    user_msg = f\"\"\"nhận diện cảm xúc: '{inp.strip()}'\"\"\"\n",
737 |     "    messages = [\n",
738 |     "        {\"role\": \"system\", \"content\": system_msg},\n",
739 |     "        {\"role\": \"user\", \"content\": user_msg},\n",
740 |     "    ]\n",
741 |     "    return tokenizer.apply_chat_template(messages, tokenize=False) + ' '\n",
742 |     "\n",
743 |     "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n",
744 |     "train_dataset = train_dataset.add_column(\"train_text\", new_column_train)\n",
745 |     "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n",
746 |     "test_dataset = test_dataset.add_column(\"train_text\", new_column_train)\n",
747 |     "\n",
748 |     "# Predictions and their first-token probabilities are collected in lockstep.\n",
749 |     "outs = []\n",
750 |     "confidence = []\n",
751 |     "print(\"Start inference\")\n",
752 |     "\n",
753 |     "for i, tt in enumerate(test_dataset['train_text']):\n",
754 |     "    if i % 100 == 0:\n",
755 |     "        print(i, set(outs))\n",
756 |     "    # Place the inputs on the model's own device.\n",
757 |     "    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)\n",
758 |     "    # Attention maps are not requested: nothing below reads them.\n",
759 |     "    output = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id, output_scores=True, return_dict_in_generate=True)\n",
760 |     "\n",
761 |     "    # Probability of the generated token under the softmax of the first step's scores.\n",
762 |     "    first_token_id = output.sequences[0, input_ids.size(1)].item()\n",
763 |     "    confidence.append(output.scores[0].softmax(dim=1)[0, first_token_id].item())\n",
764 |     "\n",
765 |     "    # Decode only the token produced after the prompt.\n",
766 |     "    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n",
767 |     "    outs.append(assistant)"
768 |    ]
769 |   },
770 |   {
771 |    "cell_type": "code",
772 |    "execution_count": null,
773 |    "metadata": {},
774 |    "outputs": [],
775 |    "source": [
776 |     "print(\"Start inference\")\n",
777 |     "\n",
778 |     "# `outs` and `confidence` are filled in lockstep, so both are reset together;\n",
779 |     "# resetting only `confidence` would leave `outs` carrying results from earlier runs.\n",
780 |     "outs, confidence = [], []\n",
781 |     "for i, tt in enumerate(test_dataset['train_text']):\n",
782 |     "    if i % 100 == 0:\n",
783 |     "        print(i, set(outs))\n",
784 |     "    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)\n",
785 |     "    output = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id, output_scores=True, return_dict_in_generate=True)\n",
786 |     "\n",
787 |     "    # Probability of the generated token under the softmax of the first step's scores.\n",
788 |     "    first_token_id = output.sequences[0, input_ids.size(1)].item()\n",
789 |     "    confidence.append(output.scores[0].softmax(dim=1)[0, first_token_id].item())\n",
790 |     "\n",
791 |     "    # Decode only the token produced after the prompt.\n",
792 |     "    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n",
793 |     "    outs.append(assistant)"
794 |    ]
795 |   },
796 |   {
797 |    "cell_type": "code",
798 |    "execution_count": null,
799 |    "metadata": {},
800 |    "outputs": [],
801 |    "source": []
802 |   },
803 |   {
804 |    "cell_type": "code",
805 |    "execution_count": null,
806 |    "metadata": {},
807 |    "outputs": [],
808 |    "source": []
809 |   }
810 |  ],
811 |  "metadata": {
812 |   "kernelspec": {
813 |    "display_name": "distill",
814 |    "language": "python",
815 |    "name": "distill"
816 |   },
817 |   "language_info": {
818 |    "codemirror_mode": {
819 |     "name": "ipython",
820 |     "version": 3
821 |    },
822 |    "file_extension": ".py",
823 |    "mimetype": "text/x-python",
824 |    "name": "python",
825 |    "nbconvert_exporter": "python",
826 |    "pygments_lexer": "ipython3",
827 |    "version": "3.10.4"
828 |   }
829 |  },
830 |  "nbformat": 4,
831 |  "nbformat_minor": 4
832 | }
833 | 


--------------------------------------------------------------------------------