├── data ├── test.xlsx ├── train.xlsx ├── test_asr.xlsx ├── test_eng.xlsx └── train_eng.xlsx ├── SentimentReasoning_ACL2025.png ├── sentiment_reasoning_datasample.png ├── sentiment_reasoning_pipeline.png ├── encoder_eng.py ├── encoder.py ├── README.md ├── seq2seq.py ├── seq2seq_eng.py ├── llm-lora_eng.py ├── llm-lora.py └── inference.ipynb /data/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test.xlsx -------------------------------------------------------------------------------- /data/train.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/train.xlsx -------------------------------------------------------------------------------- /data/test_asr.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test_asr.xlsx -------------------------------------------------------------------------------- /data/test_eng.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test_eng.xlsx -------------------------------------------------------------------------------- /data/train_eng.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/train_eng.xlsx -------------------------------------------------------------------------------- /SentimentReasoning_ACL2025.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/SentimentReasoning_ACL2025.png -------------------------------------------------------------------------------- 
/sentiment_reasoning_datasample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/sentiment_reasoning_datasample.png -------------------------------------------------------------------------------- /sentiment_reasoning_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/sentiment_reasoning_pipeline.png -------------------------------------------------------------------------------- /encoder_eng.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset 2 | import pandas as pd 3 | import argparse 4 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 5 | from transformers import AutoTokenizer 6 | import evaluate 7 | import numpy as np 8 | import transformers 9 | 10 | # Initialize argparse 11 | parser = argparse.ArgumentParser(description='Configure training parameters.') 12 | 13 | # Add arguments for training configuration 14 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') 15 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training') 16 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer') 17 | parser.add_argument('--model_checkpoint', type=str, default="emilyalsentzer/Bio_ClinicalBERT", help='Model checkpoint to use') 18 | 19 | # Parse arguments 20 | args = parser.parse_args() 21 | 22 | id2label = {0: "negative", 1: "neutral", 2: "positive"} 23 | label2id = {"negative": 0, "neutral": 1, 'positive': 2} 24 | 25 | 26 | # Assign variables from args 27 | batch_size = args.batch_size 28 | num_train_epochs = args.num_train_epochs 29 | learning_rate = args.learning_rate 30 | model_checkpoint = 
args.model_checkpoint 31 | 32 | model = AutoModelForSequenceClassification.from_pretrained( 33 | model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id 34 | ) 35 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 36 | 37 | model_name = model_checkpoint.split("/")[-1] 38 | 39 | 40 | train_df = pd.read_excel('train_eng.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 41 | train_dataset = Dataset.from_pandas(train_df) 42 | 43 | testset = pd.read_excel('test_eng.xlsx') 44 | print(train_df['label'].unique()) 45 | print(testset['label'].unique()) 46 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 47 | 48 | 49 | 50 | 51 | def preprocess_function(examples): 52 | return tokenizer(examples['text'], truncation=True, padding=True) 53 | 54 | tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True) 55 | tokenized_dataset_test = test_dataset.map(preprocess_function, batched=True) 56 | 57 | 58 | from transformers import DataCollatorWithPadding 59 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 60 | 61 | 62 | 63 | # Load the individual metrics 64 | import evaluate 65 | 66 | accuracy = evaluate.load("accuracy") 67 | f1 = evaluate.load("f1") 68 | precision = evaluate.load("precision") 69 | recall = evaluate.load("recall") 70 | 71 | def compute_metrics(eval_pred): 72 | predictions, labels = eval_pred 73 | predictions = np.argmax(predictions, axis=1) 74 | neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1'] 75 | 76 | # Compute each metric as needed 77 | metrics_result = { 78 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 79 | "macro_f1": f1.compute(predictions=predictions, references=labels, average='macro')['f1'], 80 | "f1_neg": neg, 81 | "f1_neu": neu, 82 | "f1_pos": pos 83 | 84 | } 85 | 86 | return metrics_result 87 | 88 | # This modified function should now work without the TypeError 89 | 90 | 91 | ## Train 92 | 93 | 94 | 
training_args = TrainingArguments( 95 | output_dir=f"results/{model_name}", 96 | lr_scheduler_type='cosine', 97 | learning_rate=learning_rate, 98 | per_device_train_batch_size=batch_size, 99 | per_device_eval_batch_size=batch_size, 100 | num_train_epochs=num_train_epochs, 101 | weight_decay=0.01, 102 | evaluation_strategy="epoch", 103 | save_strategy="epoch", 104 | logging_strategy='epoch', 105 | load_best_model_at_end=True, 106 | save_total_limit=2, 107 | bf16=True, 108 | warmup_ratio=0.05, 109 | metric_for_best_model='eval_macro_f1', 110 | # push_to_hub=True, 111 | ) 112 | 113 | trainer = Trainer( 114 | model=model, 115 | args=training_args, 116 | train_dataset=tokenized_dataset_train, 117 | eval_dataset=tokenized_dataset_test, 118 | tokenizer=tokenizer, 119 | data_collator=data_collator, 120 | compute_metrics=compute_metrics, 121 | callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 122 | 123 | ) 124 | print('model_checkpoint', model_checkpoint) 125 | trainer.train() 126 | trainer.save_model() 127 | trainer.save_state() 128 | trainer.evaluate() -------------------------------------------------------------------------------- /encoder.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset 2 | import pandas as pd 3 | import argparse 4 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 5 | from transformers import AutoTokenizer 6 | import evaluate 7 | import numpy as np 8 | import transformers 9 | 10 | # Initialize argparse 11 | parser = argparse.ArgumentParser(description='Configure training parameters.') 12 | 13 | # Add arguments for training configuration 14 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') 15 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training') 16 | parser.add_argument('--learning_rate', type=float, default=2e-5, 
help='Learning rate for the optimizer') 17 | parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-base", help='Model checkpoint to use') 18 | 19 | # Parse arguments 20 | args = parser.parse_args() 21 | 22 | id2label = {0: "negative", 1: "neutral", 2: "positive"} 23 | label2id = {"negative": 0, "neutral": 1, 'positive': 2} 24 | 25 | 26 | # Assign variables from args 27 | batch_size = args.batch_size 28 | num_train_epochs = args.num_train_epochs 29 | learning_rate = args.learning_rate 30 | model_checkpoint = args.model_checkpoint 31 | 32 | model = AutoModelForSequenceClassification.from_pretrained( 33 | model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id 34 | ) 35 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 36 | 37 | model_name = model_checkpoint.split("/")[-1] 38 | 39 | 40 | train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 41 | train_dataset = Dataset.from_pandas(train_df) 42 | 43 | testset = pd.read_excel('test.xlsx') 44 | test_with_asr = pd.read_excel('test_asr.xlsx') 45 | testset['text'] = test_with_asr['asr'] 46 | 47 | print(train_df['label'].unique()) 48 | print(testset['label'].unique()) 49 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 50 | 51 | 52 | 53 | 54 | def preprocess_function(examples): 55 | return tokenizer(examples['text'], truncation=True, padding=True) 56 | 57 | tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True) 58 | tokenized_dataset_test = test_dataset.map(preprocess_function, batched=True) 59 | 60 | 61 | from transformers import DataCollatorWithPadding 62 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 63 | 64 | 65 | 66 | # Load the individual metrics 67 | import evaluate 68 | 69 | accuracy = evaluate.load("accuracy") 70 | f1 = evaluate.load("f1") 71 | precision = evaluate.load("precision") 72 | recall = evaluate.load("recall") 73 | 74 | def compute_metrics(eval_pred): 75 | predictions, labels 
= eval_pred 76 | predictions = np.argmax(predictions, axis=1) 77 | neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1'] 78 | 79 | # Compute each metric as needed 80 | metrics_result = { 81 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 82 | "macro_f1": f1.compute(predictions=predictions, references=labels, average='macro')['f1'], 83 | "f1_neg": neg, 84 | "f1_neu": neu, 85 | "f1_pos": pos 86 | 87 | } 88 | 89 | return metrics_result 90 | 91 | # This modified function should now work without the TypeError 92 | 93 | 94 | ## Train 95 | 96 | 97 | training_args = TrainingArguments( 98 | output_dir=f"results/{model_name}", 99 | lr_scheduler_type='cosine', 100 | learning_rate=learning_rate, 101 | per_device_train_batch_size=batch_size, 102 | per_device_eval_batch_size=batch_size, 103 | num_train_epochs=num_train_epochs, 104 | weight_decay=0.01, 105 | evaluation_strategy="epoch", 106 | save_strategy="epoch", 107 | logging_strategy='epoch', 108 | load_best_model_at_end=True, 109 | save_total_limit=2, 110 | bf16=True, 111 | warmup_ratio=0.05, 112 | metric_for_best_model='eval_macro_f1', 113 | # push_to_hub=True, 114 | ) 115 | 116 | trainer = Trainer( 117 | model=model, 118 | args=training_args, 119 | train_dataset=tokenized_dataset_train, 120 | eval_dataset=tokenized_dataset_test, 121 | tokenizer=tokenizer, 122 | data_collator=data_collator, 123 | compute_metrics=compute_metrics, 124 | callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 125 | 126 | ) 127 | print('model_checkpoint', model_checkpoint) 128 | trainer.train() 129 | trainer.save_model() 130 | trainer.evaluate() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentiment Reasoning for Healthcare 2 | 3 | **
ACL 2025 Industry Track (Oral)
** 4 | 5 |
Khai-Nguyen Nguyen*, Khai Le-Duc*, Bach Phan Tat, Duy Le, Long Vo-Dang, Truong-Son Hy
6 | 7 |
*Equal contribution
8 | 9 | > Please press the ⭐ button and/or cite the paper if you find this work helpful. 10 | 11 |

12 | 13 |

14 |

Sentiment Reasoning pipeline

15 | 16 |

17 | 18 |

19 | 20 | * **Abstract:** 21 | Transparency in AI healthcare decision-making is crucial. By incorporating rationales that explain the reason for each predicted label, users can understand the reasoning of Large Language Models (LLMs) and make better decisions. In this work, we introduce a new task - **Sentiment Reasoning** - for both speech and text modalities, along with our proposed multimodal multitask framework and **the world's largest multimodal sentiment analysis dataset**. Sentiment Reasoning is an auxiliary task in sentiment analysis where the model both predicts the sentiment label and generates the rationale behind it based on the input transcript. Our study, conducted on both human transcripts and Automatic Speech Recognition (ASR) transcripts, shows that Sentiment Reasoning helps improve model transparency by providing rationales for model predictions with quality semantically comparable to humans, while also improving the model's classification performance (**+2% increase in both accuracy and macro-F1**) via rationale-augmented fine-tuning. We also find no significant difference in the semantic quality of generated rationales between human and ASR transcripts. All code, data (**five languages - Vietnamese, English, Chinese, German, and French**) and models are published online. 22 | 23 | * **Citation:** 24 | Please cite this paper: [https://arxiv.org/abs/2407.21054](https://arxiv.org/abs/2407.21054) 25 | 26 | ``` bibtex 27 | @misc{Sentiment_Reasoning, 28 | title={Sentiment Reasoning for Healthcare}, 29 | author={Khai-Nguyen Nguyen and Khai Le-Duc and Bach Phan Tat and Duy Le and Long Vo-Dang and Truong-Son Hy}, 30 | year={2024}, 31 | eprint={2407.21054}, 32 | url={https://arxiv.org/abs/2407.21054}, 33 | } 34 | ``` 35 | 36 | This repository contains scripts for automatic speech recognition (ASR) and sentiment reasoning using cascaded sequence-to-sequence (seq2seq) audio-language models. 
The provided scripts cover model preparation, training, inference, and evaluation processes, based on the dataset in the paper. 37 | 38 | ## Dataset and Pre-trained Models: 39 | 🤗 **HuggingFace Dataset**: [https://huggingface.co/datasets/leduckhai/Sentiment-Reasoning](https://huggingface.co/datasets/leduckhai/Sentiment-Reasoning) 40 | 41 | 🤗 **HuggingFace Models**: to be released soon! 42 | 43 | | Model Name | Description | Link | 44 | |------------------|--------------------------------------------|----------------------------------------------------------------------| 45 | | `Vietnamese_Vistral-7B` | LLM fine-tuned on Vietnamese set | [Hugging Face models](https://huggingface.co/leduckhai/Sentiment-Reasoning/tree/main/Vietnamese_Vistral-7B) | 46 | | `English-LLM` | LLM fine-tuned on English set | to be released soon! | 47 | | `French-LLM` | LLM fine-tuned on French set | to be released soon! | 48 | | `German-LLM` | LLM fine-tuned on German set | to be released soon! | 49 | | `Chinese-LLM` | LLM fine-tuned on Chinese set | to be released soon! | 50 | | `Multilingual-LLM` | LLM fine-tuned on Multilingual set (5 languages) | to be released soon! | 51 | 52 | **Papers with Code**: to be released soon! 53 | 54 |

55 | 56 |

57 |

Sample data format used in Sentiment Reasoning dataset

58 | 59 | 60 | ## Contact 61 | 62 | Core developers: 63 | 64 | **Khai Le-Duc** 65 | ``` 66 | University of Toronto, Canada 67 | Email: duckhai.le@mail.utoronto.ca 68 | GitHub: https://github.com/leduckhai 69 | ``` 70 | 71 | **Khai-Nguyen Nguyen** 72 | ``` 73 | College of William and Mary, USA 74 | GitHub: https://github.com/nkn002 75 | Hugging Face: https://huggingface.co/knguyennguyen 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /seq2seq.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import random 4 | from datasets import Dataset, load_metric 5 | import transformers 6 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq 7 | from datasets import load_dataset, Dataset 8 | import pandas as pd 9 | import evaluate 10 | import torch 11 | import nltk 12 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM 13 | import nltk 14 | import argparse 15 | import numpy as np 16 | 17 | 18 | # Initialize argparse 19 | parser = argparse.ArgumentParser(description='Configure training parameters.') 20 | 21 | # Add arguments for training configuration 22 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') 23 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training') 24 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer') 25 | parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-large", help='Model checkpoint to use') 26 | 27 | # Parse arguments 28 | args = parser.parse_args() 29 | 30 | # Assign variables from args 31 | batch_size = args.batch_size 32 | num_train_epochs = args.num_train_epochs 33 | learning_rate = args.learning_rate 34 | model_checkpoint = 
args.model_checkpoint 35 | 36 | # Now you can use these variables in your training setup 37 | print(f"Training setup:") 38 | print(f"Batch size: {batch_size}") 39 | print(f"Number of training epochs: {num_train_epochs}") 40 | print(f"Learning rate: {learning_rate}") 41 | print(f"Model checkpoint: {model_checkpoint}") 42 | 43 | id2label = {'0': "negative", '1': "neutral", '2': "positive"} 44 | label2id = {"negative": '0', "neutral": '1', 'positive': '2'} 45 | 46 | train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 47 | train_df['label'] = train_df['label'].astype(str) 48 | 49 | train_dataset = Dataset.from_pandas(train_df) 50 | 51 | testset = pd.read_excel('test.xlsx') 52 | 53 | # Then convert the modified DataFrame to a Hugging Face dataset 54 | testset['label'] = testset['label'].astype(str) 55 | print(train_df['label'].unique()) 56 | print(testset['label'].unique()) 57 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 58 | 59 | # Output unique values to verify 60 | 61 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 62 | id2label = {'0': "Negative", '1': "Neutral", '2': "Positive"} 63 | label2id = {"Negative": '0', "Neutral": '1', 'Positive': '2'} 64 | model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) 65 | 66 | 67 | def preprocess_function(examples): 68 | inputs = [doc for doc in examples["text"]] 69 | model_inputs = tokenizer(inputs, max_length=128, truncation=True) 70 | labels = tokenizer(text_target=examples["label"], max_length=8, truncation=True) 71 | 72 | model_inputs["labels"] = labels["input_ids"] 73 | return model_inputs 74 | 75 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True) 76 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True) 77 | 78 | print('tokenized_train_dataset', tokenized_train_dataset) 79 | print('tokenized_test_dataset', tokenized_test_dataset) 80 | 81 | 82 | # Load the individual metrics 83 | accuracy = 
evaluate.load("accuracy") 84 | f1 = evaluate.load("f1") 85 | precision = evaluate.load("precision") 86 | recall = evaluate.load("recall") 87 | 88 | def compute_metrics(eval_pred): 89 | logits, labels = eval_pred 90 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 91 | decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True) 92 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 93 | decoded_preds = [pred if pred.isdigit() else -1 for pred in decoded_preds] # Replace non-digit predictions with '-1' 94 | decoded_labels = [label if label.isdigit() else -1 for label in decoded_labels] # Replace non-digit labels with '-1' 95 | predictions = decoded_preds 96 | labels = decoded_labels 97 | metrics_result = { 98 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 99 | 100 | } 101 | 102 | return metrics_result 103 | 104 | # This modified function should now work without the TypeError 105 | 106 | 107 | data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) 108 | 109 | tokenized_train_dataset=tokenized_train_dataset.remove_columns(['text', 'label']) 110 | tokenized_test_dataset=tokenized_test_dataset.remove_columns(['text', 'label']) 111 | model_name = model_checkpoint.split("/")[-1] 112 | 113 | transformers.logging.set_verbosity_info() 114 | training_args = Seq2SeqTrainingArguments( 115 | output_dir=f"results/{model_name}", 116 | eval_strategy="epoch", 117 | save_strategy="epoch", 118 | logging_strategy='epoch', 119 | learning_rate=learning_rate, 120 | per_device_train_batch_size=batch_size, 121 | per_device_eval_batch_size=batch_size, 122 | weight_decay=0.01, 123 | save_total_limit=2, 124 | num_train_epochs=num_train_epochs, 125 | predict_with_generate=True, 126 | load_best_model_at_end=True, 127 | metric_for_best_model='eval_accuracy', 128 | bf16=True, 129 | lr_scheduler_type='cosine', 130 | warmup_ratio=0.05, 131 | ) 132 | 133 | # Setting up the trainer 134 | 
trainer = Seq2SeqTrainer( 135 | model=model, 136 | args=training_args, 137 | train_dataset=tokenized_train_dataset, 138 | eval_dataset=tokenized_test_dataset, 139 | tokenizer=tokenizer, 140 | data_collator=data_collator, 141 | compute_metrics=compute_metrics, 142 | callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 143 | 144 | ) 145 | 146 | 147 | trainer.train() 148 | trainer.save_model() 149 | -------------------------------------------------------------------------------- /seq2seq_eng.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import random 4 | from datasets import Dataset, load_metric 5 | import transformers 6 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq 7 | from datasets import load_dataset, Dataset 8 | import pandas as pd 9 | import evaluate 10 | import torch 11 | import nltk 12 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM 13 | import nltk 14 | import argparse 15 | import numpy as np 16 | 17 | 18 | # Initialize argparse 19 | parser = argparse.ArgumentParser(description='Configure training parameters.') 20 | 21 | # Add arguments for training configuration 22 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') 23 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training') 24 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer') 25 | parser.add_argument('--model_checkpoint', type=str, default="luqh/ClinicalT5-base", help='Model checkpoint to use') 26 | 27 | # Parse arguments 28 | args = parser.parse_args() 29 | 30 | # Assign variables from args 31 | batch_size = args.batch_size 32 | num_train_epochs = args.num_train_epochs 33 | learning_rate = args.learning_rate 34 | 
# Checkpoint identifier taken from the --model_checkpoint CLI argument.
model_checkpoint = args.model_checkpoint

# Echo the effective configuration so it is captured in the run log.
print("Training setup:")
print(f"Batch size: {batch_size}")
print(f"Number of training epochs: {num_train_epochs}")
print(f"Learning rate: {learning_rate}")
print(f"Model checkpoint: {model_checkpoint}")

# Labels are cast to str because they are tokenized as the generation target
# in preprocess_function below.
train_df = pd.read_excel('train_eng.xlsx')
train_df['label'] = train_df['label'].astype(str)

train_dataset = Dataset.from_pandas(train_df)

testset = pd.read_excel('test_eng.xlsx')
testset['label'] = testset['label'].astype(str)

# Print the distinct label values of both splits as a sanity check.
print(train_df['label'].unique())
print(testset['label'].unique())

# Only the 'text' and 'label' columns of the test split are kept.
test_dataset = Dataset.from_pandas(testset[['text', 'label']])

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# String label id <-> display name. The original script defined a lowercase
# pair earlier and immediately overwrote it with this one; only the effective
# (capitalized) definitions are kept.
id2label = {'0': "Negative", '1': "Neutral", '2': "Positive"}
label2id = {"Negative": '0', "Neutral": '1', 'Positive': '2'}

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: A batch with a "text" column (model inputs) and a "label"
            column (string labels used as generation targets).

    Returns:
        The tokenizer output for the inputs (truncated to 128 tokens) with an
        added "labels" entry holding the target token ids (truncated to 8
        tokens).
    """
    inputs = list(examples["text"])
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    labels = tokenizer(text_target=examples["label"], max_length=8, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# batched=True hands preprocess_function whole batches rather than single rows.
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

print('tokenized_train_dataset', tokenized_train_dataset)
print('tokenized_test_dataset', tokenized_test_dataset)

print(train_dataset['text']) 82 | # Load the individual metrics 83 | accuracy = evaluate.load("accuracy") 84 | f1 = evaluate.load("f1") 85 | precision = evaluate.load("precision") 86 | recall = evaluate.load("recall") 87 | 88 | def compute_metrics(eval_pred): 89 | logits, labels = eval_pred 90 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 91 | decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True) 92 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 93 | decoded_preds = [pred if pred.isdigit() else -1 for pred in decoded_preds] # Replace non-digit predictions with '-1' 94 | decoded_labels = [label if label.isdigit() else -1 for label in decoded_labels] # Replace non-digit labels with '-1' 95 | predictions = decoded_preds 96 | labels = decoded_labels 97 | metrics_result = { 98 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 99 | 100 | } 101 | 102 | return metrics_result 103 | 104 | # This modified function should now work without the TypeError 105 | 106 | 107 | data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) 108 | 109 | tokenized_train_dataset=tokenized_train_dataset.remove_columns(['text', 'label']) 110 | tokenized_test_dataset=tokenized_test_dataset.remove_columns(['text', 'label']) 111 | model_name = model_checkpoint.split("/")[-1] 112 | 113 | transformers.logging.set_verbosity_info() 114 | training_args = Seq2SeqTrainingArguments( 115 | output_dir=f"results/{model_name}", 116 | eval_strategy="epoch", 117 | save_strategy="epoch", 118 | logging_strategy='epoch', 119 | learning_rate=learning_rate, 120 | per_device_train_batch_size=batch_size, 121 | per_device_eval_batch_size=batch_size, 122 | weight_decay=0.01, 123 | save_total_limit=2, 124 | num_train_epochs=num_train_epochs, 125 | predict_with_generate=True, 126 | load_best_model_at_end=True, 127 | metric_for_best_model='eval_accuracy', 128 | bf16=True, 129 | 
lr_scheduler_type='cosine', 130 | warmup_ratio=0.05, 131 | ) 132 | 133 | # Setting up the trainer 134 | trainer = Seq2SeqTrainer( 135 | model=model, 136 | args=training_args, 137 | train_dataset=tokenized_train_dataset, 138 | eval_dataset=tokenized_test_dataset, 139 | tokenizer=tokenizer, 140 | data_collator=data_collator, 141 | compute_metrics=compute_metrics, 142 | callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 143 | 144 | ) 145 | 146 | 147 | trainer.train() 148 | trainer.save_state() 149 | trainer.save_model() 150 | -------------------------------------------------------------------------------- /llm-lora_eng.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import pandas as pd 4 | import numpy as np 5 | import random 6 | from datasets import Dataset, load_metric 7 | import transformers 8 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq 9 | from datasets import load_dataset, Dataset 10 | import pandas as pd 11 | import evaluate 12 | import torch 13 | import nltk 14 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM 15 | import nltk 16 | import argparse 17 | import numpy as np 18 | from transformers import AutoModelForSeq2SeqLM 19 | from peft import get_peft_config, get_peft_model, LoraConfig, TaskType 20 | from trl import AutoModelForCausalLMWithValueHead 21 | from transformers import TrainingArguments, Trainer 22 | from trl import SFTTrainer 23 | import evaluate 24 | 25 | # model_name_or_path = "vtrungnhan9/vmlu-llm" 26 | # rationale_col = 'cot_rationale' 27 | 28 | # Initialize argparse 29 | parser = argparse.ArgumentParser(description='Configure training parameters.') 30 | 31 | # Add arguments for training configuration 32 | parser.add_argument('--model_name_or_path', type=str, 
default='vtrungnhan9/vmlu-llm', help='vtrungnhan9/vmlu-llm') 33 | parser.add_argument('--rationale_col', type=str, default='human_justification', help='cot_rationale') 34 | # parser.add_argument('--learning_rate', type=float, default=1e-5, help='Learning rate for the optimizer') 35 | # parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-base", help='Model checkpoint to use') 36 | 37 | # Parse arguments 38 | args = parser.parse_args() 39 | model_name_or_path = args.model_name_or_path 40 | rationale_col = args.rationale_col 41 | 42 | peft_config = LoraConfig( 43 | task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1 44 | ) 45 | 46 | 47 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token='hf_GxsYTZDZhHcQEzYEvWrus') 48 | model = AutoModelForCausalLM.from_pretrained( 49 | model_name_or_path, 50 | # load_in_8bit=True, 51 | torch_dtype=torch.bfloat16, 52 | device_map="auto", 53 | # use_cache=True, 54 | cache_dir='./models', 55 | load_in_8bit=True, 56 | token='hf_GxsYTZDZhHcQEzYEvWrus' 57 | 58 | ) 59 | 60 | model.enable_input_require_grads() 61 | model = get_peft_model(model, peft_config) 62 | model.print_trainable_parameters() 63 | 64 | 65 | train_df = pd.read_excel('train_eng.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 66 | train_df['label'] = train_df['label'].astype(str) 67 | 68 | train_dataset = Dataset.from_pandas(train_df) 69 | 70 | testset = pd.read_excel('test_eng.xlsx') 71 | 72 | testset['label'] = testset['label'].astype(str) 73 | print(train_df['label'].unique()) 74 | print(testset['label'].unique()) 75 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 76 | 77 | 78 | def template(inp, out, rationale=''): 79 | # if rationale 80 | conversation = [ 81 | {"role": "user", "content": f"""sentiment analysis: '{inp.strip()}'"""}, 82 | ] 83 | # print(out) 84 | prompt = tokenizer.apply_chat_template(conversation, tokenize=False) 85 | prompt = (prompt 
+str(out).strip()+'\n'+rationale.strip()).strip() 86 | print(prompt) 87 | return prompt 88 | # , train_dataset[rationale_col] 89 | # reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])] 90 | if rationale_col == '': 91 | new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])] 92 | else: 93 | new_column_train = [template(inp, out, rationale) for inp, out, rationale in zip(train_dataset['text'], train_dataset['label'], train_dataset[rationale_col])] 94 | train_dataset= train_dataset.add_column("train_text", new_column_train) 95 | # new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])] 96 | # test_dataset= test_dataset.add_column("train_text", new_column_train) 97 | 98 | 99 | # Load the individual metrics 100 | accuracy = evaluate.load("accuracy") 101 | f1 = evaluate.load("f1") 102 | precision = evaluate.load("precision") 103 | recall = evaluate.load("recall") 104 | 105 | def compute_metrics(eval_pred): 106 | logits, labels = eval_pred 107 | logits = np.argmax(logits, axis=-1) 108 | print(logits, labels) 109 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 110 | decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True) 111 | print('decoded_preds', decoded_preds) 112 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 113 | decoded_preds = [pred if pred.isdigit() else -1 for pred in decoded_preds] # Replace non-digit predictions with '-1' 114 | decoded_labels = [label if label.isdigit() else -1 for label in decoded_labels] # Replace non-digit labels with '-1' 115 | predictions = decoded_preds 116 | labels = decoded_labels 117 | print( f1.compute(predictions=predictions, references=labels, average=None)['f1']) 118 | print(set(decoded_preds)) 119 | neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1'] 120 | 
metrics_result = { 121 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 122 | "macro_f1": f1.compute(predictions=predictions, references=labels, average='macro')['f1'], 123 | # "macro_precision": precision.compute(predictions=predictions, references=labels, average='macro')['precision'], 124 | # "macro_recall": recall.compute(predictions=predictions, references=labels, average='macro')['recall'], 125 | "f1_neg": neg, 126 | "f1_neu": neu, 127 | "f1_pos": pos 128 | 129 | } 130 | 131 | return metrics_result 132 | 133 | training_args = TrainingArguments( 134 | per_device_train_batch_size=16, 135 | gradient_accumulation_steps=2, 136 | # gradient_checkpointing=True, 137 | warmup_steps=100, 138 | report_to=[], 139 | learning_rate=2e-4, 140 | lr_scheduler_type="cosine", 141 | num_train_epochs=5, 142 | optim="adamw_bnb_8bit", 143 | bf16=True, 144 | # gradient_accumulation_steps=2, # simulate larger batch sizes 145 | output_dir=f"results/{model_name_or_path.split('/')[-1]}_{rationale_col}v2", 146 | logging_strategy="epoch", 147 | dataloader_num_workers=4, 148 | save_total_limit=3, 149 | save_strategy='epoch', 150 | # eval_strategy='no', 151 | ) 152 | 153 | 154 | trainer = SFTTrainer( 155 | model, 156 | packing=True, # pack samples together for efficient training 157 | max_seq_length=180, # maximum packed length 158 | args=training_args, 159 | train_dataset=train_dataset.shuffle(), 160 | compute_metrics=compute_metrics, 161 | peft_config=peft_config, 162 | dataset_text_field='train_text', 163 | # callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 164 | 165 | ) 166 | trainer.train() 167 | trainer.save_model() 168 | 169 | # trainer.evaluate() 170 | 171 | 172 | -------------------------------------------------------------------------------- /llm-lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 
import pandas as pd
import numpy as np
import random
# NOTE(review): `load_metric` is not used anywhere below and, as far as I know, was
# removed from `datasets` in 3.0 -- confirm against the pinned version.
from datasets import Dataset, load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import pandas as pd
import evaluate
import torch
import nltk
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import argparse
import numpy as np
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import AutoModelForCausalLMWithValueHead
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer
import evaluate

# LoRA supervised fine-tuning of a causal LM for 3-way sentiment classification
# (0 = negative, 1 = neutral, 2 = positive), optionally followed by a rationale.
# `AutoModelForCausalLM` is imported at the very top of this file.

# Integer ids of the sentiment classes, in the order reported as
# f1_neg / f1_neu / f1_pos.
SENTIMENT_LABEL_IDS = [0, 1, 2]

# Initialize argparse
parser = argparse.ArgumentParser(description='Configure training parameters.')

# Add arguments for training configuration
parser.add_argument('--model_name_or_path', type=str, default='vtrungnhan9/vmlu-llm', help='vtrungnhan9/vmlu-llm')
parser.add_argument('--rationale_col', type=str, default='human_justification', help='cot_rationale')

# Parse arguments
args = parser.parse_args()
model_name_or_path = args.model_name_or_path
rationale_col = args.rationale_col

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)


tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir='./models',
    load_in_8bit=True,
)

# Required so gradients can flow into the adapters when the base weights are frozen.
model.enable_input_require_grads()
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


train_df = pd.read_excel('multitask/distilling-step-by-step/train_rationale.xlsx')
train_df['label'] = train_df['label'].astype(str)

train_dataset = Dataset.from_pandas(train_df)

testset = pd.read_excel('test.xlsx')

testset['label'] = testset['label'].astype(str)
print(train_df['label'].unique())
print(testset['label'].unique())
test_dataset = Dataset.from_pandas(testset[['text', 'label']])


def template(inp, out, rationale=''):
    """Render one training example as chat prompt + label + optional rationale.

    Args:
        inp: Input text to classify.
        out: Gold label; converted with ``str`` and stripped.
        rationale: Free-text justification appended on its own line after the
            label. ``None`` (a missing spreadsheet cell) is treated as empty.

    Returns:
        The full training string, stripped of surrounding whitespace.
    """
    # System prompt (Vietnamese): "You are a sentiment recognition assistant.
    # Given a text, answer 0 if the sentiment is negative, 1 if there is no
    # sentiment, 2 if the sentiment is positive."
    conversation = [
        {"role": "system", "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực." },
        {"role": "user", "content": f"""nhận diện cảm xúc: '{inp.strip()}'"""},
    ]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    # Missing rationale cells arrive as None; the original crashed on `.strip()`.
    rationale_text = '' if rationale is None else str(rationale)
    prompt = (prompt + str(out).strip() + '\n' + rationale_text.strip()).strip()
    print(prompt)
    return prompt


if rationale_col == '':
    new_column_train = [
        template(inp, out)
        for inp, out in zip(train_dataset['text'], train_dataset['label'])
    ]
else:
    new_column_train = [
        template(inp, out, rationale)
        for inp, out, rationale in zip(
            train_dataset['text'], train_dataset['label'], train_dataset[rationale_col]
        )
    ]
train_dataset = train_dataset.add_column("train_text", new_column_train)


# Load the individual metrics
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")


def _to_label_id(text):
    """Return the integer class id for a decoded string, or -1 if it is not a number."""
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    return int(text) if text.isdecimal() else -1


def compute_metrics(eval_pred):
    """Compute accuracy, macro F1 and per-class F1 from decoded token ids.

    Args:
        eval_pred: ``(logits, labels)`` pair; logits are arg-maxed over the last
            axis and both sides are decoded with the tokenizer.

    Returns:
        Dict with ``accuracy``, ``macro_f1``, ``f1_neg``, ``f1_neu``, ``f1_pos``.
    """
    logits, labels = eval_pred
    pred_ids = np.argmax(logits, axis=-1)
    print(pred_ids, labels)
    # -100 marks ignored positions; swap in the pad id so decoding works.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    print('decoded_preds', decoded_preds)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Uniform int ids on both sides (the original mixed digit strings with int -1).
    predictions = [_to_label_id(pred) for pred in decoded_preds]
    references = [_to_label_id(label) for label in decoded_labels]

    # Pinning `labels` guarantees exactly three per-class scores even when -1
    # shows up or a class is absent; the original unpacking raised ValueError then.
    per_class_f1 = f1.compute(
        predictions=predictions,
        references=references,
        labels=SENTIMENT_LABEL_IDS,
        average=None,
    )['f1']
    print(per_class_f1)
    print(set(predictions))
    neg, neu, pos = (float(score) for score in per_class_f1)

    return {
        "accuracy": accuracy.compute(predictions=predictions, references=references)['accuracy'],
        "macro_f1": f1.compute(predictions=predictions, references=references, average='macro')['f1'],
        "f1_neg": neg,
        "f1_neu": neu,
        "f1_pos": pos,
    }


training_args = TrainingArguments(
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_steps=100,
    report_to=[],
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=5,
    optim="adamw_bnb_8bit",
    bf16=True,
    output_dir=f"results/{model_name_or_path.split('/')[-1]}_{rationale_col}v2",
    logging_strategy="epoch",
    dataloader_num_workers=4,
    save_total_limit=3,
    save_strategy='epoch',
)


# NOTE(review): no eval_dataset is passed, so compute_metrics is presumably never
# invoked by trainer.train() here -- confirm whether evaluation is intended.
trainer = SFTTrainer(
    model,
    packing=True,  # pack samples together for efficient training
    max_seq_length=180,  # maximum packed length
    args=training_args,
    train_dataset=train_dataset.shuffle(),
    compute_metrics=compute_metrics,
    peft_config=peft_config,
    dataset_text_field='train_text',
)
trainer.train()
trainer.save_model()
-------------------------------------------------------------------------------- /inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from datasets import load_metric, load_dataset\n", 10 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n", 11 | "import torch \n", 12 | "import numpy as np\n", 13 | "from tqdm import tqdm\n", 14 | "metrics = load_metric('accuracy')\n", 15 | "import gc\n", 16 | "import os\n", 17 | "\n", 18 | "def inference(path):\n", 19 | " prefix = 'summarize: ' if 'mt5' in path else ''\n", 20 | " tokenizer = AutoTokenizer.from_pretrained(path)\n", 21 | " model = AutoModelForSeq2SeqLM.from_pretrained(path)\n", 22 | " max_length = 1024 if 'bert' not in path else 256\n", 23 | " def preprocess_function(examples):\n", 24 | " inputs = [prefix + doc for doc in examples[\"text\"]]\n", 25 | " model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)\n", 26 | " labels = tokenizer(text_target=examples[\"label\"], max_length=5, truncation=True, padding=True)\n", 27 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", 28 | " return model_inputs\n", 29 | "\n", 30 | " testset = pd.read_excel('test.xlsx')\n", 31 | " testset['label'] = testset['label'].astype(str)\n", 32 | " dataset = Dataset.from_pandas(testset[['text', 'label']])\n", 33 | "\n", 34 | "# dataset = load_dataset(\"json\", data_files=\"datasets/faq/test/faq_test.json\", split='train')\n", 35 | " test_tokenized_datasets = dataset.map(preprocess_function, batched=True)\n", 36 | " data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors=\"pt\")\n", 37 | " model.to('cuda')\n", 38 | "\n", 39 | "\n", 40 | " max_target_length = 5\n", 41 | " test_tokenized_datasets = 
test_tokenized_datasets.remove_columns(['text', 'label'])\n", 42 | " dataloader = torch.utils.data.DataLoader(test_tokenized_datasets, collate_fn=data_collator, batch_size=32)\n", 43 | "\n", 44 | " predictions = []\n", 45 | " references = []\n", 46 | " for i, batch in enumerate(tqdm(dataloader)):\n", 47 | " outputs = model.generate(\n", 48 | " input_ids=batch['input_ids'].to('cuda'),\n", 49 | " max_length=max_target_length,\n", 50 | " attention_mask=batch['attention_mask'].to('cuda'),\n", 51 | " )\n", 52 | " with tokenizer.as_target_tokenizer():\n", 53 | " outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]\n", 54 | "\n", 55 | " labels = np.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)\n", 56 | " actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]\n", 57 | " predictions.extend(outputs)\n", 58 | " references.extend(actuals)\n", 59 | " metrics.add_batch(predictions=outputs, references=actuals)\n", 60 | "\n", 61 | " metrics.compute()\n", 62 | "\n", 63 | " rouges = [{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]\n", 64 | "# new_file_path = './r_scores_faq'\n", 65 | "# # Write to the file\n", 66 | "# try:\n", 67 | "# # Attempt to append to the file\n", 68 | "# with open(new_file_path, 'a') as file:\n", 69 | "# file.write(path.split('/')[-2] + '\\n')\n", 70 | "# for new_content_str in rouges:\n", 71 | "# result = next(iter(new_content_str))\n", 72 | "# file.write(f\"{result}: {new_content_str[result]}\\n\")\n", 73 | "# file.write('\\n')\n", 74 | "# action_result = \"Content appended to the existing file.\"\n", 75 | "# except FileNotFoundError:\n", 76 | "# # File doesn't exist, create it and write the content\n", 77 | "# with open(new_file_path, 'w') as file:\n", 78 | "# file.write(path)\n", 79 | "# file.write(new_content_str)\n", 80 | "# action_result = \"File did 
not exist, so it was created with the new content.\"\n", 81 | " \n", 82 | "# del model\n", 83 | "# gc.collect()\n" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "import pandas as pd\n", 93 | "from datasets import Dataset\n", 94 | "import evaluate\n", 95 | "from datasets import load_metric, load_dataset\n", 96 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n", 97 | "import torch \n", 98 | "import numpy as np\n", 99 | "from tqdm import tqdm\n", 100 | "metrics = load_metric('accuracy')\n", 101 | "import gc\n", 102 | "import os\n", 103 | "\n", 104 | "accuracy = evaluate.load(\"accuracy\")\n", 105 | "f1 = evaluate.load(\"f1\")\n", 106 | "precision = evaluate.load(\"precision\")\n", 107 | "recall = evaluate.load(\"recall\")\n", 108 | "\n", 109 | "path = './multitask/distilling-step-by-step/ckpts/VietAI/vit5-base_human_justification/'\n", 110 | "# path = 'multitask/distilling-step-by-step/ckpts//'\n", 111 | "# path = 'results/flan-t5-base/'\n", 112 | "prefix = 'gt: ' if 'distilling' in path else ''\n", 113 | "tokenizer = AutoTokenizer.from_pretrained(path)\n", 114 | "model = AutoModelForSeq2SeqLM.from_pretrained(path)\n", 115 | "max_length = 1024 if 'bert' not in path else 256\n", 116 | "def preprocess_function(examples):\n", 117 | " inputs = [prefix + doc for doc in examples[\"text\"]]\n", 118 | " model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)\n", 119 | " labels = tokenizer(text_target=examples[\"label\"], max_length=5, truncation=True, padding=True)\n", 120 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", 121 | " return model_inputs\n", 122 | "\n", 123 | "testset = pd.read_excel('test.xlsx')\n", 124 | "test_with_asr = pd.read_excel('test_asr.xlsx')\n", 125 | "testset['text'] = test_with_asr['asr']\n", 126 | "testset['label'] = 
testset['label'].astype(str)\n", 127 | "dataset = Dataset.from_pandas(testset[['text', 'label']])\n", 128 | "\n", 129 | "# dataset = load_dataset(\"json\", data_files=\"datasets/faq/test/faq_test.json\", split='train')\n", 130 | "test_tokenized_datasets = dataset.map(preprocess_function, batched=True)\n", 131 | "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors=\"pt\")\n", 132 | "model.to('cuda:2')\n", 133 | "\n", 134 | "\n", 135 | "max_target_length = 25\n", 136 | "test_tokenized_datasets = test_tokenized_datasets.remove_columns(['text', 'label'])\n", 137 | "dataloader = torch.utils.data.DataLoader(test_tokenized_datasets.select(idx), collate_fn=data_collator, batch_size=32)\n", 138 | "\n", 139 | "predictions = []\n", 140 | "references = []\n", 141 | "for i, batch in enumerate(tqdm(dataloader)):\n", 142 | " outputs = model.generate(\n", 143 | " input_ids=batch['input_ids'].to('cuda:2'),\n", 144 | " max_length=max_target_length,\n", 145 | " attention_mask=batch['attention_mask'].to('cuda:2'),\n", 146 | " )\n", 147 | " with tokenizer.as_target_tokenizer():\n", 148 | " outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]\n", 149 | "\n", 150 | " labels = np.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)\n", 151 | " actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]\n", 152 | " predictions.extend(outputs)\n", 153 | " references.extend(actuals)\n", 154 | "# metrics.add_batch(predictions=outputs, references=actuals)\n", 155 | "\n", 156 | "# metrics.compute()\n", 157 | "\n", 158 | "def compute_metrics(predictions, references):\n", 159 | " decoded_preds, decoded_labels = predictions, references\n", 160 | "# logits = np.argmax(logits, axis=1)\n", 161 | " decoded_preds = [pred if pred.isdigit() else '-1' for pred in decoded_preds] # Replace non-digit predictions with '-1'\n", 162 | " decoded_labels = 
[label if label.isdigit() else '-1' for label in decoded_labels] # Replace non-digit labels with '-1'\n", 163 | " predictions = decoded_preds\n", 164 | " labels = decoded_labels\n", 165 | " neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1']\n", 166 | " metrics_result = {\n", 167 | " \"accuracy\": accuracy.compute(predictions=predictions, references=labels)['accuracy'],\n", 168 | " \"macro_f1\": f1.compute(predictions=predictions, references=labels, average='macro')['f1'],\n", 169 | "# \"macro_precision\": precision.compute(predictions=predictions, references=labels, average='macro')['precision'],\n", 170 | "# \"macro_recall\": recall.compute(predictions=predictions, references=labels, average='macro')['recall'],\n", 171 | " \"f1_neg\": neg,\n", 172 | " \"f1_neu\": neu,\n", 173 | " \"f1_pos\": pos\n", 174 | "\n", 175 | " }\n", 176 | " return metrics_result\n", 177 | "# rouges = [{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]\n", 178 | "del model\n", 179 | "gc.collect()\n", 180 | "\n", 181 | "print(compute_metrics(predictions, references))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "from tqdm import tqdm\n", 191 | "import random\n", 192 | "from datasets import Dataset, load_metric\n", 193 | "import transformers\n", 194 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n", 195 | "from datasets import load_dataset, Dataset\n", 196 | "import pandas as pd\n", 197 | "import evaluate\n", 198 | "import torch\n", 199 | "import nltk\n", 200 | "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n", 201 | "import nltk\n", 202 | "import argparse\n", 203 | "import numpy as np\n", 204 | "from transformers import AutoModelForSeq2SeqLM\n", 205 | "from 
peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n", 206 | "from trl import AutoModelForCausalLMWithValueHead\n", 207 | "from transformers import TrainingArguments, Trainer\n", 208 | "from trl import SFTTrainer\n", 209 | "import evaluate\n", 210 | "from transformers import AutoModelForCausalLM\n", 211 | "from peft import PeftModel\n", 212 | "\n", 213 | "\n", 214 | "base_model_name = 'vtrungnhan9/vmlu-llm'\n", 215 | "print(\"loading\")\n", 216 | "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n", 217 | "model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')\n", 218 | "model = PeftModel.from_pretrained(model, './results/vmlu-llm_human_justificationv2/')\n", 219 | "print('finished loadding')\n", 220 | "model = model.merge_and_unload()\n", 221 | "model = model.cuda()\n", 222 | "\n", 223 | "\n", 224 | "train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)\n", 225 | "train_df['label'] = train_df['label'].astype(str)\n", 226 | "\n", 227 | "train_dataset = Dataset.from_pandas(train_df)\n", 228 | "\n", 229 | "testset = pd.read_excel('test.xlsx')\n", 230 | "\n", 231 | "testset['label'] = testset['label'].astype(str)\n", 232 | "print(train_df['label'].unique())\n", 233 | "print(testset['label'].unique())\n", 234 | "test_dataset = Dataset.from_pandas(testset[['text', 'label']])\n", 235 | "\n", 236 | "def template(inp, out):\n", 237 | " conversation = [{\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. 
Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n", 238 | " {\"role\": \"user\", \"content\": f\"\"\"nhận diện cảm xúc: '{inp.strip()}'\"\"\"},\n", 239 | " {'role': 'asssistant', 'content': str(out)}\n", 240 | " ]\n", 241 | "# print(out)\n", 242 | " prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n", 243 | "# prompt = prompt + ' '\n", 244 | " return prompt\n", 245 | "\n", 246 | "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n", 247 | "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n", 248 | "train_dataset= train_dataset.add_column(\"train_text\", new_column_train)\n", 249 | "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n", 250 | "test_dataset= test_dataset.add_column(\"train_text\", new_column_train)\n", 251 | "\n", 252 | "outs = []\n", 253 | "i = 0\n", 254 | "# print(\"Start inference\")\n", 255 | "# for tt in (test_dataset['train_text']):\n", 256 | "# if i % 100 == 0:\n", 257 | "# print(i)\n", 258 | "# input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n", 259 | "# out_ids = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id, output_scores=True)\n", 260 | "\n", 261 | "# assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n", 262 | "# # print(assistant)\n", 263 | "# outs.append(assistant)\n", 264 | "# i += 1\n", 265 | "# # break\n", 266 | "# del model\n", 267 | "# gc.collect()\n", 268 | "outs = []\n", 269 | "batch_size=32\n", 270 | "print(\"Start inference\")\n", 271 | "for i in tqdm(range(0, len(test_dataset), batch_size)):\n", 272 | " batch = test_dataset[i:i + batch_size]\n", 273 | " inputs = tokenizer(batch['train_text'], return_tensors='pt', padding=True, truncation=True).input_ids.cuda()\n", 274 | " 
outputs = model.generate(inputs, max_new_tokens=2, pad_token_id=tokenizer.eos_token_id)\n", 275 | " decoded_outputs = tokenizer.batch_decode(outputs[:, inputs.size(1):], skip_special_tokens=True)\n", 276 | " outs.extend([output.strip() for output in decoded_outputs])\n", 277 | "# break\n", 278 | "\n", 279 | "# Cleanup\n", 280 | "del model\n", 281 | "import gc\n", 282 | "gc.collect()\n", 283 | "torch.cuda.empty_cache()\n", 284 | "\n", 285 | "\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "import pandas as pd\n", 295 | "from datasets import Dataset\n", 296 | "import evaluate\n", 297 | "from datasets import load_metric, load_dataset\n", 298 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n", 299 | "import torch \n", 300 | "import numpy as np\n", 301 | "from tqdm import tqdm\n", 302 | "metrics = load_metric('accuracy')\n", 303 | "import gc\n", 304 | "import os\n", 305 | "\n", 306 | "accuracy = evaluate.load(\"accuracy\")\n", 307 | "f1 = evaluate.load(\"f1\")\n", 308 | "precision = evaluate.load(\"precision\")\n", 309 | "recall = evaluate.load(\"recall\")\n", 310 | "\n", 311 | "def compute_metrics(predictions, references):\n", 312 | " decoded_preds, decoded_labels = predictions, references\n", 313 | "# logits = np.argmax(logits, axis=1)\n", 314 | " decoded_preds = [pred if pred.isdigit() else '-1' for pred in decoded_preds] # Replace non-digit predictions with '-1'\n", 315 | " decoded_labels = [label if label.isdigit() else '-1' for label in decoded_labels] # Replace non-digit labels with '-1'\n", 316 | " predictions = decoded_preds\n", 317 | " labels = decoded_labels\n", 318 | " neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1']\n", 319 | " metrics_result = {\n", 320 | " \"accuracy\": accuracy.compute(predictions=predictions, 
references=labels)['accuracy'],\n", 321 | " \"macro_f1\": f1.compute(predictions=predictions, references=labels, average='macro')['f1'],\n", 322 | "# \"macro_precision\": precision.compute(predictions=predictions, references=labels, average='macro')['precision'],\n", 323 | "# \"macro_recall\": recall.compute(predictions=predictions, references=labels, average='macro')['recall'],\n", 324 | " \"f1_neg\": neg,\n", 325 | " \"f1_neu\": neu,\n", 326 | " \"f1_pos\": pos\n", 327 | "\n", 328 | " }\n", 329 | " return metrics_result\n", 330 | "\n", 331 | "references = (testset['label'])\n", 332 | "compute_metrics(outs, references)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "from tqdm import tqdm\n", 342 | "import random\n", 343 | "from datasets import Dataset, load_metric\n", 344 | "import transformers\n", 345 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n", 346 | "from datasets import load_dataset, Dataset\n", 347 | "import pandas as pd\n", 348 | "import evaluate\n", 349 | "import torch\n", 350 | "import nltk\n", 351 | "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n", 352 | "import nltk\n", 353 | "import argparse\n", 354 | "import numpy as np\n", 355 | "from transformers import AutoModelForSeq2SeqLM\n", 356 | "from peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n", 357 | "from trl import AutoModelForCausalLMWithValueHead\n", 358 | "from transformers import TrainingArguments, Trainer\n", 359 | "from trl import SFTTrainer\n", 360 | "import evaluate\n", 361 | "from transformers import AutoModelForCausalLM\n", 362 | "from peft import PeftModel\n", 363 | "\n", 364 | "\n", 365 | "base_model_name = 'vtrungnhan9/vmlu-llm'\n", 366 | "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n", 367 | "model = 
AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')\n", 368 | "# model = PeftModel.from_pretrained(model, './Vistral-7B-Chat_no')\n", 369 | "\n", 370 | "# model = model.merge_and_unload()\n", 371 | "model = model.cuda()\n", 372 | "\n", 373 | "\n", 374 | "train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)\n", 375 | "test_with_asr = pd.read_excel('test_asr.xlsx')\n", 376 | "testset['text'] = test_with_asr['asr']\n", 377 | "\n", 378 | "train_df['label'] = train_df['label'].astype(str)\n", 379 | "\n", 380 | "train_dataset = Dataset.from_pandas(train_df)\n", 381 | "\n", 382 | "testset = pd.read_excel('test.xlsx')\n", 383 | "\n", 384 | "testset['label'] = testset['label'].astype(str)\n", 385 | "print(train_df['label'].unique())\n", 386 | "print(testset['label'].unique())\n", 387 | "test_dataset = Dataset.from_pandas(testset[['text', 'label']])\n", 388 | "\n", 389 | "def template(inp, out):\n", 390 | " conversation = [{\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n", 391 | " {\"role\": \"user\", \"content\": f\"\"\"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực. 
Nhận diện cảm xúc: '{inp.strip()}'\"\"\"},\n", 392 | "# {'role': 'asssistant', 'content': str(out) + 'herqwwewf'}\n", 393 | " ]\n", 394 | " prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n", 395 | " print(prompt)\n", 396 | "# prompt = prompt + f' {out}'\n", 397 | " return prompt\n", 398 | "\n", 399 | "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n", 400 | "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n", 401 | "train_dataset= train_dataset.add_column(\"train_text\", new_column_train)\n", 402 | "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n", 403 | "test_dataset= test_dataset.add_column(\"train_text\", new_column_train)\n", 404 | "\n", 405 | "outs = []\n", 406 | "i = 0\n", 407 | "# for tt in (test_dataset['train_text']):\n", 408 | "# if i % 500 == 0:\n", 409 | "# print(i)\n", 410 | "# print(outs)\n", 411 | "# input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n", 412 | "# out_ids = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)\n", 413 | "\n", 414 | "# assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n", 415 | "# outs.append(assistant)\n", 416 | "# # print(outs)\n", 417 | " \n", 418 | "# i += 1" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "outs = []\n", 428 | "for tt in (test_dataset['train_text']):\n", 429 | " if i % 500 == 0:\n", 430 | " print(i)\n", 431 | " print(outs)\n", 432 | " input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n", 433 | " out_ids = model.generate(input_ids, max_new_tokens=2, pad_token_id=tokenizer.eos_token_id)\n", 434 | "\n", 435 | " assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], 
skip_special_tokens=True)[0].strip()\n", 436 | " outs.append(assistant)\n", 437 | "# print(outs)\n", 438 | " \n", 439 | " i += 1" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "def template(inp, out):\n", 449 | " conversation = [{\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n", 450 | " {\"role\": \"user\", \"content\": f\"\"\"Nhận diện cảm xúc: '{inp.strip()}'\"\"\"},\n", 451 | "# {'role': 'asssistant', 'content': str(out) + 'herqwwewf'}\n", 452 | " ]\n", 453 | " prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n", 454 | " print(prompt)\n", 455 | "# prompt = prompt + f' {out}'\n", 456 | " return prompt\n", 457 | "\n", 458 | "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n", 459 | "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n", 460 | "train_dataset= train_dataset.add_column(\"train_text\", new_column_train)\n", 461 | "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n", 462 | "test_dataset= test_dataset.add_column(\"train_text\", new_column_train)\n", 463 | "\n", 464 | "outs = []\n", 465 | "i = 0\n" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "from tqdm import tqdm\n", 475 | "import random\n", 476 | "from datasets import Dataset, load_metric\n", 477 | "import transformers\n", 478 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n", 479 | "from datasets import load_dataset, Dataset\n", 480 | "import pandas as pd\n", 481 | "import evaluate\n", 482 | 
"import torch\n", 483 | "import nltk\n", 484 | "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n", 485 | "import nltk\n", 486 | "import argparse\n", 487 | "import numpy as np\n", 488 | "from transformers import AutoModelForSeq2SeqLM\n", 489 | "from peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n", 490 | "from trl import AutoModelForCausalLMWithValueHead\n", 491 | "from transformers import TrainingArguments, Trainer\n", 492 | "from trl import SFTTrainer\n", 493 | "import evaluate\n", 494 | "from transformers import AutoModelForCausalLM\n", 495 | "from peft import PeftModel\n", 496 | "\n", 497 | "\n", 498 | "base_model_name = 'Viet-Mistral/Vistral-7B-Chat'\n", 499 | "print(\"loading\")\n", 500 | "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n", 501 | "model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')\n", 502 | "model = PeftModel.from_pretrained(model, './results/Vistral-7B-Chat_human_justification/')\n", 503 | "print('finished laoding')\n", 504 | "model = model.merge_and_unload()\n", 505 | "model = model.to('cuda:7')\n", 506 | "\n", 507 | "\n", 508 | "train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)\n", 509 | "train_df['label'] = train_df['label'].astype(str)\n", 510 | "\n", 511 | "train_dataset = Dataset.from_pandas(train_df)\n", 512 | "\n", 513 | "testset = pd.read_excel('test.xlsx')\n", 514 | "test_with_asr = pd.read_excel('test_asr.xlsx')\n", 515 | "testset['text'] = test_with_asr['asr']\n", 516 | "\n", 517 | "testset['label'] = testset['label'].astype(str)\n", 518 | "print(train_df['label'].unique())\n", 519 | "print(testset['label'].unique())\n", 520 | "test_dataset = Dataset.from_pandas(testset[['text', 'label', 'human_justification']])\n", 521 | "\n", 522 | "def template(inp, out):\n", 523 | " conversation = [\n", 524 | " {\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. 
# %% ASR evaluation: prompt construction and generation
def template(inp, out):
    """Render one transcript as a chat prompt string ending in a single space.

    Args:
        inp: Transcript text. ``None`` (a missing cell, e.g. an empty ASR
            hypothesis) is treated as an empty string instead of raising
            ``AttributeError`` on ``.strip()``.
        out: Gold label. Unused; kept for call-site compatibility.

    Returns:
        ``tokenizer.apply_chat_template(...)`` output (not tokenized) with
        one trailing space appended.
    """
    text = "" if inp is None else str(inp).strip()
    conversation = [
        {"role": "system", "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực."},
        {"role": "user", "content": f"""nhận diện cảm xúc: '{text}'"""},
    ]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    # NOTE(review): the trailing space presumably mirrors the fine-tuning
    # format -- confirm against the training script before changing it.
    return prompt + ' '


new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
train_dataset = train_dataset.add_column("train_text", new_column_train)
new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
test_dataset = test_dataset.add_column("train_text", new_column_train)

outs = []
print("Start inference")
# `idx` (row indices of the evaluated subset) is not defined in this cell;
# it is expected to come from an earlier cell.
for i, tt in enumerate(test_dataset.select(idx)['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    # Follow the model's own device rather than repeating a GPU literal.
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)
    # Only the token ids are used here, so `output_scores` (which has no
    # effect without `return_dict_in_generate=True`) is not requested.
    out_ids = model.generate(input_ids, max_new_tokens=25, pad_token_id=tokenizer.eos_token_id)
    # Decode only the newly generated tokens (everything after the prompt).
    assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1):], skip_special_tokens=True)[0].strip()
    outs.append(assistant)

# Release the model: collect unreachable objects first so that their CUDA
# blocks are actually free when the allocator cache is emptied.
import gc
del model
gc.collect()
torch.cuda.empty_cache()

# %% BERTScore between generated and human justifications
from evaluate import load
bertscore = load("bertscore")
# Drops the first two characters of each output.
# NOTE(review): presumably outputs look like "<label> <justification>" -- confirm.
predictions = [o[2:] for o in outs]
references = test_dataset.select(idx)['human_justification']
results = bertscore.compute(predictions=predictions, references=references, lang="vi")
# Mean F1 over however many samples were scored (was a hard-coded 100).
sum(results['f1']) / len(results['f1'])

# %% ROUGE between generated and human justifications
from evaluate import load
rouge = load("rouge")
predictions = [o[2:] for o in outs]
references = test_dataset.select(idx)['human_justification']
results = rouge.compute(predictions=predictions, references=references)
results

# %% Look up hand-picked transcripts in the test set
# Explicit list: splitting a newline-terminated block on '\n' also yielded a
# trailing '' entry, which would match rows whose text is empty.
test_samples = [
    "trả lại cho họ chất lượng cuộc sống bình thường như bao người khác là được nghe được nói thế nhưng điều kỳ diệu đã xảy",
    "những chia sẻ vô cùng hữu ích và thiết thực vừa rồi ạ có thể thấy là hầu hết người bệnh nằm điều trị trong",
    "khám suốt tiểu đường nó vẫn mệt mỏi vô khám tai biến bộ não vô khám nhưng mà xương thì nó loãng xương rất là nhiều",
]

testdf = test_dataset.to_pandas()
testdf[testdf.text.isin(test_samples)]
# %% First-token confidence for the hand-picked samples
confidence = []
outs = []
for i, tt in enumerate(testdf[testdf.text.isin(test_samples)]['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    # Put the prompt on the model's device; a bare `.cuda()` targets the
    # default GPU, which may differ from the GPU the model was moved to.
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)
    # Attentions are kept for this handful of samples so the last `output`
    # can be inspected after the loop.
    output = model.generate(
        input_ids,
        max_new_tokens=20,
        pad_token_id=tokenizer.eos_token_id,
        output_scores=True,
        output_attentions=True,
        return_dict_in_generate=True,
    )

    # Probability the model assigned to the first token it generated.
    first_token_id = output.sequences[:, input_ids.size(1)].item()
    first_token_prob = output.scores[0].softmax(dim=1)[:, first_token_id].item()
    confidence.append(first_token_prob)

    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1):], skip_special_tokens=True)[0].strip()
    outs.append(assistant)

# %% Classification metrics
import gc
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset, load_metric
from tqdm import tqdm
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainingArguments,
)

# NOTE(review): `load_metric` is the legacy datasets API and `metrics` is not
# used by the code below; kept unchanged in case another cell relies on it.
metrics = load_metric('accuracy')

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")


def compute_metrics(predictions, references):
    """Score decoded label strings against gold label strings.

    Any value that is not a digit string is mapped to the sentinel class
    ``-1``, so it always counts as an error.

    Args:
        predictions: Iterable of decoded model outputs (strings).
        references: Iterable of gold labels (strings), same length.

    Returns:
        dict with ``accuracy``, ``macro_f1`` and the per-class scores
        ``f1_neg`` / ``f1_neu`` / ``f1_pos`` for classes 0 / 1 / 2.
    """
    class_ids = [0, 1, 2]  # negative, neutral, positive

    def to_class_id(value):
        # Same rule as before (digit string -> that label, else -1), with
        # the int conversion made explicit.
        return int(value) if value.isdigit() else -1

    preds = [to_class_id(p) for p in predictions]
    labels = [to_class_id(r) for r in references]

    # Pin the label set: without `labels`, `average=None` returns one score
    # per class that happens to occur, so the three-way unpacking raised
    # ValueError whenever a prediction was unparseable (-1 becomes a fourth
    # class) or one class was absent from the evaluated subset.
    neg, neu, pos = f1.compute(
        predictions=preds, references=labels, average=None, labels=class_ids
    )['f1']

    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)['accuracy'],
        # Macro average over the same three classes as the per-class scores.
        "macro_f1": f1.compute(
            predictions=preds, references=labels, average='macro', labels=class_ids
        )['f1'],
        "f1_neg": float(neg),
        "f1_neu": float(neu),
        "f1_pos": float(pos),
    }


references = testset['label']
compute_metrics(outs, references)
# %% Clean-transcript evaluation: load model, build prompts, predict labels
import argparse
import random

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset, load_dataset, load_metric
from peft import LoraConfig, PeftModel, TaskType, get_peft_config, get_peft_model
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)
from trl import AutoModelForCausalLMWithValueHead, SFTTrainer

base_model_name = 'Viet-Mistral/Vistral-7B-Chat'
print("loading")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')
model = PeftModel.from_pretrained(model, './results/Vistral-7B-Chat_human_justification/')
print('finished laoding')
# Fold the LoRA weights into the base model before generation.
model = model.merge_and_unload()
model = model.to('cuda')

train_df = pd.read_excel('train.xlsx')
train_df['label'] = train_df['label'].astype(str)
train_dataset = Dataset.from_pandas(train_df)

# This cell evaluates on the reference transcripts in test.xlsx; the ASR
# variant instead overwrites `text` with the `asr` column of test_asr.xlsx.
testset = pd.read_excel('test.xlsx')
testset['label'] = testset['label'].astype(str)
print(train_df['label'].unique())
print(testset['label'].unique())
test_dataset = Dataset.from_pandas(testset[['text', 'label']])


def template(inp, out):
    """Render one transcript as a chat prompt string ending in a single space.

    Args:
        inp: Transcript text. ``None`` (a missing cell) is treated as an
            empty string instead of raising ``AttributeError`` on ``.strip()``.
        out: Gold label. Unused; kept for call-site compatibility.

    Returns:
        ``tokenizer.apply_chat_template(...)`` output (not tokenized) with
        one trailing space appended.
    """
    text = "" if inp is None else str(inp).strip()
    conversation = [
        {"role": "system", "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực."},
        {"role": "user", "content": f"""nhận diện cảm xúc: '{text}'"""},
    ]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    return prompt + ' '


new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
train_dataset = train_dataset.add_column("train_text", new_column_train)
new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
test_dataset = test_dataset.add_column("train_text", new_column_train)

outs = []
confidence = []
print("Start inference")
for i, tt in enumerate(test_dataset['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)
    # One new token is generated per prompt. Attention maps are not requested:
    # nothing in the loop reads them, and returning them for every layer on
    # every sample only costs memory.
    output = model.generate(
        input_ids,
        max_new_tokens=1,
        pad_token_id=tokenizer.eos_token_id,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Probability the model assigned to the token it generated.
    first_token_id = output.sequences[:, input_ids.size(1)].item()
    first_token_prob = output.scores[0].softmax(dim=1)[:, first_token_id].item()
    confidence.append(first_token_prob)

    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1):], skip_special_tokens=True)[0].strip()
    outs.append(assistant)
# %% Re-run label prediction with first-token confidence over the test set
print("Start inference")

# Reset BOTH accumulators. Previously only `confidence` was cleared, so
# re-running this cell appended a second copy of the predictions to `outs`,
# leaving it twice as long as (and misaligned with) the gold labels.
outs = []
confidence = []
for i, tt in enumerate(test_dataset['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    # Keep the prompt on the same device as the model.
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.to(model.device)
    # One new token is generated per prompt. Attention maps are not requested
    # because nothing in the loop reads them.
    output = model.generate(
        input_ids,
        max_new_tokens=1,
        pad_token_id=tokenizer.eos_token_id,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Probability the model assigned to the token it generated.
    first_token_id = output.sequences[:, input_ids.size(1)].item()
    first_token_prob = output.scores[0].softmax(dim=1)[:, first_token_id].item()
    confidence.append(first_token_prob)

    assistant = tokenizer.batch_decode(output.sequences[:, input_ids.size(1):], skip_special_tokens=True)[0].strip()
    outs.append(assistant)