├── data ├── test.xlsx ├── train.xlsx ├── test_asr.xlsx ├── test_eng.xlsx └── train_eng.xlsx ├── SentimentReasoning_ACL2025.png ├── sentiment_reasoning_datasample.png ├── sentiment_reasoning_pipeline.png ├── encoder_eng.py ├── encoder.py ├── README.md ├── seq2seq.py ├── seq2seq_eng.py ├── llm-lora_eng.py ├── llm-lora.py └── inference.ipynb /data/test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test.xlsx -------------------------------------------------------------------------------- /data/train.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/train.xlsx -------------------------------------------------------------------------------- /data/test_asr.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test_asr.xlsx -------------------------------------------------------------------------------- /data/test_eng.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/test_eng.xlsx -------------------------------------------------------------------------------- /data/train_eng.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/data/train_eng.xlsx -------------------------------------------------------------------------------- /SentimentReasoning_ACL2025.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/SentimentReasoning_ACL2025.png -------------------------------------------------------------------------------- 
/sentiment_reasoning_datasample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/sentiment_reasoning_datasample.png -------------------------------------------------------------------------------- /sentiment_reasoning_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/leduckhai/Sentiment-Reasoning/HEAD/sentiment_reasoning_pipeline.png -------------------------------------------------------------------------------- /encoder_eng.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset 2 | import pandas as pd 3 | import argparse 4 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 5 | from transformers import AutoTokenizer 6 | import evaluate 7 | import numpy as np 8 | import transformers 9 | 10 | # Initialize argparse 11 | parser = argparse.ArgumentParser(description='Configure training parameters.') 12 | 13 | # Add arguments for training configuration 14 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') 15 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training') 16 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer') 17 | parser.add_argument('--model_checkpoint', type=str, default="emilyalsentzer/Bio_ClinicalBERT", help='Model checkpoint to use') 18 | 19 | # Parse arguments 20 | args = parser.parse_args() 21 | 22 | id2label = {0: "negative", 1: "neutral", 2: "positive"} 23 | label2id = {"negative": 0, "neutral": 1, 'positive': 2} 24 | 25 | 26 | # Assign variables from args 27 | batch_size = args.batch_size 28 | num_train_epochs = args.num_train_epochs 29 | learning_rate = args.learning_rate 30 | model_checkpoint = 
args.model_checkpoint 31 | 32 | model = AutoModelForSequenceClassification.from_pretrained( 33 | model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id 34 | ) 35 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 36 | 37 | model_name = model_checkpoint.split("/")[-1] 38 | 39 | 40 | train_df = pd.read_excel('train_eng.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 41 | train_dataset = Dataset.from_pandas(train_df) 42 | 43 | testset = pd.read_excel('test_eng.xlsx') 44 | print(train_df['label'].unique()) 45 | print(testset['label'].unique()) 46 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 47 | 48 | 49 | 50 | 51 | def preprocess_function(examples): 52 | return tokenizer(examples['text'], truncation=True, padding=True) 53 | 54 | tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True) 55 | tokenized_dataset_test = test_dataset.map(preprocess_function, batched=True) 56 | 57 | 58 | from transformers import DataCollatorWithPadding 59 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 60 | 61 | 62 | 63 | # Load the individual metrics 64 | import evaluate 65 | 66 | accuracy = evaluate.load("accuracy") 67 | f1 = evaluate.load("f1") 68 | precision = evaluate.load("precision") 69 | recall = evaluate.load("recall") 70 | 71 | def compute_metrics(eval_pred): 72 | predictions, labels = eval_pred 73 | predictions = np.argmax(predictions, axis=1) 74 | neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1'] 75 | 76 | # Compute each metric as needed 77 | metrics_result = { 78 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 79 | "macro_f1": f1.compute(predictions=predictions, references=labels, average='macro')['f1'], 80 | "f1_neg": neg, 81 | "f1_neu": neu, 82 | "f1_pos": pos 83 | 84 | } 85 | 86 | return metrics_result 87 | 88 | # This modified function should now work without the TypeError 89 | 90 | 91 | ## Train 92 | 93 | 94 | 
training_args = TrainingArguments( 95 | output_dir=f"results/{model_name}", 96 | lr_scheduler_type='cosine', 97 | learning_rate=learning_rate, 98 | per_device_train_batch_size=batch_size, 99 | per_device_eval_batch_size=batch_size, 100 | num_train_epochs=num_train_epochs, 101 | weight_decay=0.01, 102 | evaluation_strategy="epoch", 103 | save_strategy="epoch", 104 | logging_strategy='epoch', 105 | load_best_model_at_end=True, 106 | save_total_limit=2, 107 | bf16=True, 108 | warmup_ratio=0.05, 109 | metric_for_best_model='eval_macro_f1', 110 | # push_to_hub=True, 111 | ) 112 | 113 | trainer = Trainer( 114 | model=model, 115 | args=training_args, 116 | train_dataset=tokenized_dataset_train, 117 | eval_dataset=tokenized_dataset_test, 118 | tokenizer=tokenizer, 119 | data_collator=data_collator, 120 | compute_metrics=compute_metrics, 121 | callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 122 | 123 | ) 124 | print('model_checkpoint', model_checkpoint) 125 | trainer.train() 126 | trainer.save_model() 127 | trainer.save_state() 128 | trainer.evaluate() -------------------------------------------------------------------------------- /encoder.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset 2 | import pandas as pd 3 | import argparse 4 | from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer 5 | from transformers import AutoTokenizer 6 | import evaluate 7 | import numpy as np 8 | import transformers 9 | 10 | # Initialize argparse 11 | parser = argparse.ArgumentParser(description='Configure training parameters.') 12 | 13 | # Add arguments for training configuration 14 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') 15 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training') 16 | parser.add_argument('--learning_rate', type=float, default=2e-5, 
help='Learning rate for the optimizer') 17 | parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-base", help='Model checkpoint to use') 18 | 19 | # Parse arguments 20 | args = parser.parse_args() 21 | 22 | id2label = {0: "negative", 1: "neutral", 2: "positive"} 23 | label2id = {"negative": 0, "neutral": 1, 'positive': 2} 24 | 25 | 26 | # Assign variables from args 27 | batch_size = args.batch_size 28 | num_train_epochs = args.num_train_epochs 29 | learning_rate = args.learning_rate 30 | model_checkpoint = args.model_checkpoint 31 | 32 | model = AutoModelForSequenceClassification.from_pretrained( 33 | model_checkpoint, num_labels=3, id2label=id2label, label2id=label2id 34 | ) 35 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 36 | 37 | model_name = model_checkpoint.split("/")[-1] 38 | 39 | 40 | train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 41 | train_dataset = Dataset.from_pandas(train_df) 42 | 43 | testset = pd.read_excel('test.xlsx') 44 | test_with_asr = pd.read_excel('test_asr.xlsx') 45 | testset['text'] = test_with_asr['asr'] 46 | 47 | print(train_df['label'].unique()) 48 | print(testset['label'].unique()) 49 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 50 | 51 | 52 | 53 | 54 | def preprocess_function(examples): 55 | return tokenizer(examples['text'], truncation=True, padding=True) 56 | 57 | tokenized_dataset_train = train_dataset.map(preprocess_function, batched=True) 58 | tokenized_dataset_test = test_dataset.map(preprocess_function, batched=True) 59 | 60 | 61 | from transformers import DataCollatorWithPadding 62 | data_collator = DataCollatorWithPadding(tokenizer=tokenizer) 63 | 64 | 65 | 66 | # Load the individual metrics 67 | import evaluate 68 | 69 | accuracy = evaluate.load("accuracy") 70 | f1 = evaluate.load("f1") 71 | precision = evaluate.load("precision") 72 | recall = evaluate.load("recall") 73 | 74 | def compute_metrics(eval_pred): 75 | predictions, labels 
= eval_pred 76 | predictions = np.argmax(predictions, axis=1) 77 | neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1'] 78 | 79 | # Compute each metric as needed 80 | metrics_result = { 81 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 82 | "macro_f1": f1.compute(predictions=predictions, references=labels, average='macro')['f1'], 83 | "f1_neg": neg, 84 | "f1_neu": neu, 85 | "f1_pos": pos 86 | 87 | } 88 | 89 | return metrics_result 90 | 91 | # This modified function should now work without the TypeError 92 | 93 | 94 | ## Train 95 | 96 | 97 | training_args = TrainingArguments( 98 | output_dir=f"results/{model_name}", 99 | lr_scheduler_type='cosine', 100 | learning_rate=learning_rate, 101 | per_device_train_batch_size=batch_size, 102 | per_device_eval_batch_size=batch_size, 103 | num_train_epochs=num_train_epochs, 104 | weight_decay=0.01, 105 | evaluation_strategy="epoch", 106 | save_strategy="epoch", 107 | logging_strategy='epoch', 108 | load_best_model_at_end=True, 109 | save_total_limit=2, 110 | bf16=True, 111 | warmup_ratio=0.05, 112 | metric_for_best_model='eval_macro_f1', 113 | # push_to_hub=True, 114 | ) 115 | 116 | trainer = Trainer( 117 | model=model, 118 | args=training_args, 119 | train_dataset=tokenized_dataset_train, 120 | eval_dataset=tokenized_dataset_test, 121 | tokenizer=tokenizer, 122 | data_collator=data_collator, 123 | compute_metrics=compute_metrics, 124 | callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 125 | 126 | ) 127 | print('model_checkpoint', model_checkpoint) 128 | trainer.train() 129 | trainer.save_model() 130 | trainer.evaluate() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sentiment Reasoning for Healthcare 2 | 3 | **
12 |
13 |
Sentiment Reasoning pipeline
15 | 16 |
17 |
18 |
55 |
56 |
Sample data format used in Sentiment Reasoning dataset
58 | 59 | 60 | ## Contact 61 | 62 | Core developers: 63 | 64 | **Khai Le-Duc** 65 | ``` 66 | University of Toronto, Canada 67 | Email: duckhai.le@mail.utoronto.ca 68 | GitHub: https://github.com/leduckhai 69 | ``` 70 | 71 | **Khai-Nguyen Nguyen** 72 | ``` 73 | College of William and Mary, USA 74 | GitHub: https://github.com/nkn002 75 | Hugging Face: https://huggingface.co/knguyennguyen 76 | ``` 77 | 78 | -------------------------------------------------------------------------------- /seq2seq.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import random 4 | from datasets import Dataset, load_metric 5 | import transformers 6 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq 7 | from datasets import load_dataset, Dataset 8 | import pandas as pd 9 | import evaluate 10 | import torch 11 | import nltk 12 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM 13 | import nltk 14 | import argparse 15 | import numpy as np 16 | 17 | 18 | # Initialize argparse 19 | parser = argparse.ArgumentParser(description='Configure training parameters.') 20 | 21 | # Add arguments for training configuration 22 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') 23 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training') 24 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer') 25 | parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-large", help='Model checkpoint to use') 26 | 27 | # Parse arguments 28 | args = parser.parse_args() 29 | 30 | # Assign variables from args 31 | batch_size = args.batch_size 32 | num_train_epochs = args.num_train_epochs 33 | learning_rate = args.learning_rate 34 | model_checkpoint = 
args.model_checkpoint 35 | 36 | # Now you can use these variables in your training setup 37 | print(f"Training setup:") 38 | print(f"Batch size: {batch_size}") 39 | print(f"Number of training epochs: {num_train_epochs}") 40 | print(f"Learning rate: {learning_rate}") 41 | print(f"Model checkpoint: {model_checkpoint}") 42 | 43 | id2label = {'0': "negative", '1': "neutral", '2': "positive"} 44 | label2id = {"negative": '0', "neutral": '1', 'positive': '2'} 45 | 46 | train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 47 | train_df['label'] = train_df['label'].astype(str) 48 | 49 | train_dataset = Dataset.from_pandas(train_df) 50 | 51 | testset = pd.read_excel('test.xlsx') 52 | 53 | # Then convert the modified DataFrame to a Hugging Face dataset 54 | testset['label'] = testset['label'].astype(str) 55 | print(train_df['label'].unique()) 56 | print(testset['label'].unique()) 57 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 58 | 59 | # Output unique values to verify 60 | 61 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 62 | id2label = {'0': "Negative", '1': "Neutral", '2': "Positive"} 63 | label2id = {"Negative": '0', "Neutral": '1', 'Positive': '2'} 64 | model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) 65 | 66 | 67 | def preprocess_function(examples): 68 | inputs = [doc for doc in examples["text"]] 69 | model_inputs = tokenizer(inputs, max_length=128, truncation=True) 70 | labels = tokenizer(text_target=examples["label"], max_length=8, truncation=True) 71 | 72 | model_inputs["labels"] = labels["input_ids"] 73 | return model_inputs 74 | 75 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True) 76 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True) 77 | 78 | print('tokenized_train_dataset', tokenized_train_dataset) 79 | print('tokenized_test_dataset', tokenized_test_dataset) 80 | 81 | 82 | # Load the individual metrics 83 | accuracy = 
evaluate.load("accuracy") 84 | f1 = evaluate.load("f1") 85 | precision = evaluate.load("precision") 86 | recall = evaluate.load("recall") 87 | 88 | def compute_metrics(eval_pred): 89 | logits, labels = eval_pred 90 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 91 | decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True) 92 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 93 | decoded_preds = [pred if pred.isdigit() else -1 for pred in decoded_preds] # Replace non-digit predictions with '-1' 94 | decoded_labels = [label if label.isdigit() else -1 for label in decoded_labels] # Replace non-digit labels with '-1' 95 | predictions = decoded_preds 96 | labels = decoded_labels 97 | metrics_result = { 98 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 99 | 100 | } 101 | 102 | return metrics_result 103 | 104 | # This modified function should now work without the TypeError 105 | 106 | 107 | data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) 108 | 109 | tokenized_train_dataset=tokenized_train_dataset.remove_columns(['text', 'label']) 110 | tokenized_test_dataset=tokenized_test_dataset.remove_columns(['text', 'label']) 111 | model_name = model_checkpoint.split("/")[-1] 112 | 113 | transformers.logging.set_verbosity_info() 114 | training_args = Seq2SeqTrainingArguments( 115 | output_dir=f"results/{model_name}", 116 | eval_strategy="epoch", 117 | save_strategy="epoch", 118 | logging_strategy='epoch', 119 | learning_rate=learning_rate, 120 | per_device_train_batch_size=batch_size, 121 | per_device_eval_batch_size=batch_size, 122 | weight_decay=0.01, 123 | save_total_limit=2, 124 | num_train_epochs=num_train_epochs, 125 | predict_with_generate=True, 126 | load_best_model_at_end=True, 127 | metric_for_best_model='eval_accuracy', 128 | bf16=True, 129 | lr_scheduler_type='cosine', 130 | warmup_ratio=0.05, 131 | ) 132 | 133 | # Setting up the trainer 134 | 
trainer = Seq2SeqTrainer( 135 | model=model, 136 | args=training_args, 137 | train_dataset=tokenized_train_dataset, 138 | eval_dataset=tokenized_test_dataset, 139 | tokenizer=tokenizer, 140 | data_collator=data_collator, 141 | compute_metrics=compute_metrics, 142 | callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 143 | 144 | ) 145 | 146 | 147 | trainer.train() 148 | trainer.save_model() 149 | -------------------------------------------------------------------------------- /seq2seq_eng.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import random 4 | from datasets import Dataset, load_metric 5 | import transformers 6 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq 7 | from datasets import load_dataset, Dataset 8 | import pandas as pd 9 | import evaluate 10 | import torch 11 | import nltk 12 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM 13 | import nltk 14 | import argparse 15 | import numpy as np 16 | 17 | 18 | # Initialize argparse 19 | parser = argparse.ArgumentParser(description='Configure training parameters.') 20 | 21 | # Add arguments for training configuration 22 | parser.add_argument('--batch_size', type=int, default=64, help='Batch size for training') 23 | parser.add_argument('--num_train_epochs', type=int, default=30, help='Number of epochs for training') 24 | parser.add_argument('--learning_rate', type=float, default=2e-5, help='Learning rate for the optimizer') 25 | parser.add_argument('--model_checkpoint', type=str, default="luqh/ClinicalT5-base", help='Model checkpoint to use') 26 | 27 | # Parse arguments 28 | args = parser.parse_args() 29 | 30 | # Assign variables from args 31 | batch_size = args.batch_size 32 | num_train_epochs = args.num_train_epochs 33 | learning_rate = args.learning_rate 34 | 
model_checkpoint = args.model_checkpoint 35 | 36 | # Now you can use these variables in your training setup 37 | print(f"Training setup:") 38 | print(f"Batch size: {batch_size}") 39 | print(f"Number of training epochs: {num_train_epochs}") 40 | print(f"Learning rate: {learning_rate}") 41 | print(f"Model checkpoint: {model_checkpoint}") 42 | 43 | id2label = {'0': "negative", '1': "neutral", '2': "positive"} 44 | label2id = {"negative": '0', "neutral": '1', 'positive': '2'} 45 | 46 | train_df = pd.read_excel('train_eng.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 47 | train_df['label'] = train_df['label'].astype(str) 48 | 49 | train_dataset = Dataset.from_pandas(train_df) 50 | 51 | testset = pd.read_excel('test_eng.xlsx') 52 | 53 | # Then convert the modified DataFrame to a Hugging Face dataset 54 | testset['label'] = testset['label'].astype(str) 55 | print(train_df['label'].unique()) 56 | print(testset['label'].unique()) 57 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 58 | 59 | # Output unique values to verify 60 | 61 | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) 62 | id2label = {'0': "Negative", '1': "Neutral", '2': "Positive"} 63 | label2id = {"Negative": '0', "Neutral": '1', 'Positive': '2'} 64 | model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint) 65 | 66 | 67 | def preprocess_function(examples): 68 | inputs = [doc for doc in examples["text"]] 69 | model_inputs = tokenizer(inputs, max_length=128, truncation=True) 70 | labels = tokenizer(text_target=examples["label"], max_length=8, truncation=True) 71 | 72 | model_inputs["labels"] = labels["input_ids"] 73 | return model_inputs 74 | 75 | tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True) 76 | tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True) 77 | 78 | print('tokenized_train_dataset', tokenized_train_dataset) 79 | print('tokenized_test_dataset', tokenized_test_dataset) 80 | 81 | 
print(train_dataset['text']) 82 | # Load the individual metrics 83 | accuracy = evaluate.load("accuracy") 84 | f1 = evaluate.load("f1") 85 | precision = evaluate.load("precision") 86 | recall = evaluate.load("recall") 87 | 88 | def compute_metrics(eval_pred): 89 | logits, labels = eval_pred 90 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 91 | decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True) 92 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 93 | decoded_preds = [pred if pred.isdigit() else -1 for pred in decoded_preds] # Replace non-digit predictions with '-1' 94 | decoded_labels = [label if label.isdigit() else -1 for label in decoded_labels] # Replace non-digit labels with '-1' 95 | predictions = decoded_preds 96 | labels = decoded_labels 97 | metrics_result = { 98 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 99 | 100 | } 101 | 102 | return metrics_result 103 | 104 | # This modified function should now work without the TypeError 105 | 106 | 107 | data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model) 108 | 109 | tokenized_train_dataset=tokenized_train_dataset.remove_columns(['text', 'label']) 110 | tokenized_test_dataset=tokenized_test_dataset.remove_columns(['text', 'label']) 111 | model_name = model_checkpoint.split("/")[-1] 112 | 113 | transformers.logging.set_verbosity_info() 114 | training_args = Seq2SeqTrainingArguments( 115 | output_dir=f"results/{model_name}", 116 | eval_strategy="epoch", 117 | save_strategy="epoch", 118 | logging_strategy='epoch', 119 | learning_rate=learning_rate, 120 | per_device_train_batch_size=batch_size, 121 | per_device_eval_batch_size=batch_size, 122 | weight_decay=0.01, 123 | save_total_limit=2, 124 | num_train_epochs=num_train_epochs, 125 | predict_with_generate=True, 126 | load_best_model_at_end=True, 127 | metric_for_best_model='eval_accuracy', 128 | bf16=True, 129 | 
lr_scheduler_type='cosine', 130 | warmup_ratio=0.05, 131 | ) 132 | 133 | # Setting up the trainer 134 | trainer = Seq2SeqTrainer( 135 | model=model, 136 | args=training_args, 137 | train_dataset=tokenized_train_dataset, 138 | eval_dataset=tokenized_test_dataset, 139 | tokenizer=tokenizer, 140 | data_collator=data_collator, 141 | compute_metrics=compute_metrics, 142 | callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 143 | 144 | ) 145 | 146 | 147 | trainer.train() 148 | trainer.save_state() 149 | trainer.save_model() 150 | -------------------------------------------------------------------------------- /llm-lora_eng.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 | import pandas as pd 4 | import numpy as np 5 | import random 6 | from datasets import Dataset, load_metric 7 | import transformers 8 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq 9 | from datasets import load_dataset, Dataset 10 | import pandas as pd 11 | import evaluate 12 | import torch 13 | import nltk 14 | from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM 15 | import nltk 16 | import argparse 17 | import numpy as np 18 | from transformers import AutoModelForSeq2SeqLM 19 | from peft import get_peft_config, get_peft_model, LoraConfig, TaskType 20 | from trl import AutoModelForCausalLMWithValueHead 21 | from transformers import TrainingArguments, Trainer 22 | from trl import SFTTrainer 23 | import evaluate 24 | 25 | # model_name_or_path = "vtrungnhan9/vmlu-llm" 26 | # rationale_col = 'cot_rationale' 27 | 28 | # Initialize argparse 29 | parser = argparse.ArgumentParser(description='Configure training parameters.') 30 | 31 | # Add arguments for training configuration 32 | parser.add_argument('--model_name_or_path', type=str, 
default='vtrungnhan9/vmlu-llm', help='vtrungnhan9/vmlu-llm') 33 | parser.add_argument('--rationale_col', type=str, default='human_justification', help='cot_rationale') 34 | # parser.add_argument('--learning_rate', type=float, default=1e-5, help='Learning rate for the optimizer') 35 | # parser.add_argument('--model_checkpoint', type=str, default="VietAI/vit5-base", help='Model checkpoint to use') 36 | 37 | # Parse arguments 38 | args = parser.parse_args() 39 | model_name_or_path = args.model_name_or_path 40 | rationale_col = args.rationale_col 41 | 42 | peft_config = LoraConfig( 43 | task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1 44 | ) 45 | 46 | 47 | tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token='hf_GxsYTZDZhHcQEzYEvWrus') 48 | model = AutoModelForCausalLM.from_pretrained( 49 | model_name_or_path, 50 | # load_in_8bit=True, 51 | torch_dtype=torch.bfloat16, 52 | device_map="auto", 53 | # use_cache=True, 54 | cache_dir='./models', 55 | load_in_8bit=True, 56 | token='hf_GxsYTZDZhHcQEzYEvWrus' 57 | 58 | ) 59 | 60 | model.enable_input_require_grads() 61 | model = get_peft_model(model, peft_config) 62 | model.print_trainable_parameters() 63 | 64 | 65 | train_df = pd.read_excel('train_eng.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True) 66 | train_df['label'] = train_df['label'].astype(str) 67 | 68 | train_dataset = Dataset.from_pandas(train_df) 69 | 70 | testset = pd.read_excel('test_eng.xlsx') 71 | 72 | testset['label'] = testset['label'].astype(str) 73 | print(train_df['label'].unique()) 74 | print(testset['label'].unique()) 75 | test_dataset = Dataset.from_pandas(testset[['text', 'label']]) 76 | 77 | 78 | def template(inp, out, rationale=''): 79 | # if rationale 80 | conversation = [ 81 | {"role": "user", "content": f"""sentiment analysis: '{inp.strip()}'"""}, 82 | ] 83 | # print(out) 84 | prompt = tokenizer.apply_chat_template(conversation, tokenize=False) 85 | prompt = (prompt 
+str(out).strip()+'\n'+rationale.strip()).strip() 86 | print(prompt) 87 | return prompt 88 | # , train_dataset[rationale_col] 89 | # reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])] 90 | if rationale_col == '': 91 | new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])] 92 | else: 93 | new_column_train = [template(inp, out, rationale) for inp, out, rationale in zip(train_dataset['text'], train_dataset['label'], train_dataset[rationale_col])] 94 | train_dataset= train_dataset.add_column("train_text", new_column_train) 95 | # new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])] 96 | # test_dataset= test_dataset.add_column("train_text", new_column_train) 97 | 98 | 99 | # Load the individual metrics 100 | accuracy = evaluate.load("accuracy") 101 | f1 = evaluate.load("f1") 102 | precision = evaluate.load("precision") 103 | recall = evaluate.load("recall") 104 | 105 | def compute_metrics(eval_pred): 106 | logits, labels = eval_pred 107 | logits = np.argmax(logits, axis=-1) 108 | print(logits, labels) 109 | labels = np.where(labels != -100, labels, tokenizer.pad_token_id) 110 | decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True) 111 | print('decoded_preds', decoded_preds) 112 | decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 113 | decoded_preds = [pred if pred.isdigit() else -1 for pred in decoded_preds] # Replace non-digit predictions with '-1' 114 | decoded_labels = [label if label.isdigit() else -1 for label in decoded_labels] # Replace non-digit labels with '-1' 115 | predictions = decoded_preds 116 | labels = decoded_labels 117 | print( f1.compute(predictions=predictions, references=labels, average=None)['f1']) 118 | print(set(decoded_preds)) 119 | neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1'] 120 | 
metrics_result = { 121 | "accuracy": accuracy.compute(predictions=predictions, references=labels)['accuracy'], 122 | "macro_f1": f1.compute(predictions=predictions, references=labels, average='macro')['f1'], 123 | # "macro_precision": precision.compute(predictions=predictions, references=labels, average='macro')['precision'], 124 | # "macro_recall": recall.compute(predictions=predictions, references=labels, average='macro')['recall'], 125 | "f1_neg": neg, 126 | "f1_neu": neu, 127 | "f1_pos": pos 128 | 129 | } 130 | 131 | return metrics_result 132 | 133 | training_args = TrainingArguments( 134 | per_device_train_batch_size=16, 135 | gradient_accumulation_steps=2, 136 | # gradient_checkpointing=True, 137 | warmup_steps=100, 138 | report_to=[], 139 | learning_rate=2e-4, 140 | lr_scheduler_type="cosine", 141 | num_train_epochs=5, 142 | optim="adamw_bnb_8bit", 143 | bf16=True, 144 | # gradient_accumulation_steps=2, # simulate larger batch sizes 145 | output_dir=f"results/{model_name_or_path.split('/')[-1]}_{rationale_col}v2", 146 | logging_strategy="epoch", 147 | dataloader_num_workers=4, 148 | save_total_limit=3, 149 | save_strategy='epoch', 150 | # eval_strategy='no', 151 | ) 152 | 153 | 154 | trainer = SFTTrainer( 155 | model, 156 | packing=True, # pack samples together for efficient training 157 | max_seq_length=180, # maximum packed length 158 | args=training_args, 159 | train_dataset=train_dataset.shuffle(), 160 | compute_metrics=compute_metrics, 161 | peft_config=peft_config, 162 | dataset_text_field='train_text', 163 | # callbacks = [transformers.EarlyStoppingCallback(early_stopping_patience=3)] 164 | 165 | ) 166 | trainer.train() 167 | trainer.save_model() 168 | 169 | # trainer.evaluate() 170 | 171 | 172 | -------------------------------------------------------------------------------- /llm-lora.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModelForCausalLM, AutoTokenizer 3 
import pandas as pd
import numpy as np
import random
from datasets import Dataset, load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset, Dataset
import pandas as pd
import evaluate
import torch
import nltk
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM
import nltk
import argparse
import numpy as np
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from trl import AutoModelForCausalLMWithValueHead
from transformers import TrainingArguments, Trainer
from trl import SFTTrainer
import evaluate
# AutoModelForCausalLM is instantiated below; import it explicitly so the
# script does not depend on it being pulled in elsewhere.
from transformers import AutoModelForCausalLM

# LoRA fine-tuning of a causal chat LLM for 3-way sentiment classification,
# optionally followed by a rationale taken from --rationale_col.

# ---------------------------------------------------------------------------
# Command-line configuration
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description='Configure training parameters.')
parser.add_argument('--model_name_or_path', type=str, default='vtrungnhan9/vmlu-llm', help='vtrungnhan9/vmlu-llm')
parser.add_argument('--rationale_col', type=str, default='human_justification', help='cot_rationale')

args = parser.parse_args()
model_name_or_path = args.model_name_or_path
rationale_col = args.rationale_col

# ---------------------------------------------------------------------------
# Model / tokenizer
# ---------------------------------------------------------------------------
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    cache_dir='./models',
    load_in_8bit=True,
)

model.enable_input_require_grads()
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
train_df = pd.read_excel('multitask/distilling-step-by-step/train_rationale.xlsx')
train_df['label'] = train_df['label'].astype(str)

train_dataset = Dataset.from_pandas(train_df)

testset = pd.read_excel('test.xlsx')

testset['label'] = testset['label'].astype(str)
print(train_df['label'].unique())
print(testset['label'].unique())
test_dataset = Dataset.from_pandas(testset[['text', 'label']])


def template(inp, out, rationale=''):
    """Build one training string: chat prompt, then the label, then the rationale.

    Args:
        inp: Input utterance; it is stripped and embedded in the user turn.
        out: Gold label; it is converted with ``str`` and stripped.
        rationale: Optional free-text justification placed on the line after
            the label. ``None`` is treated as an empty rationale.

    Returns:
        The prompt rendered by the tokenizer's chat template with the label
        and rationale appended, stripped of surrounding whitespace.
    """
    conversation = [
        {"role": "system", "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực."},
        {"role": "user", "content": f"""nhận diện cảm xúc: '{inp.strip()}'"""},
    ]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    # NOTE(review): an empty spreadsheet cell presumably reaches here as None
    # after Dataset.from_pandas; guard so .strip() does not raise on it.
    rationale_text = (rationale or '').strip()
    prompt = (prompt + str(out).strip() + '\n' + rationale_text).strip()
    print(prompt)
    return prompt


# An empty --rationale_col trains on "prompt + label" only.
if rationale_col == '':
    new_column_train = [
        template(inp, out)
        for inp, out in zip(train_dataset['text'], train_dataset['label'])
    ]
else:
    new_column_train = [
        template(inp, out, rationale)
        for inp, out, rationale in zip(
            train_dataset['text'], train_dataset['label'], train_dataset[rationale_col]
        )
    ]
train_dataset = train_dataset.add_column("train_text", new_column_train)

# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

# Label ids named in the system prompt: 0 = negative, 1 = neutral, 2 = positive.
CLASS_IDS = [0, 1, 2]


def _to_class_id(text):
    """Return ``int(text)`` for a purely decimal string, otherwise ``-1``."""
    # isdecimal() (not isdigit()) so every accepted string is valid for int().
    return int(text) if text.isdecimal() else -1


def compute_metrics(eval_pred):
    """Compute accuracy, macro F1 and per-class F1 from decoded token ids.

    Args:
        eval_pred: ``(logits, labels)`` pair. ``logits`` is arg-maxed over its
            last axis; ``-100`` entries in ``labels`` are replaced before
            decoding.

    Returns:
        Dict with ``accuracy``, ``macro_f1``, ``f1_neg``, ``f1_neu`` and
        ``f1_pos``.
    """
    logits, labels = eval_pred
    token_ids = np.argmax(logits, axis=-1)

    # Tokenizers without a pad token would otherwise inject None into the array.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Use integers throughout; anything that is not a plain number becomes -1.
    predictions = [_to_class_id(pred) for pred in decoded_preds]
    references = [_to_class_id(label) for label in decoded_labels]

    # Pin the label set so exactly three scores come back even when a class is
    # absent from the batch or an invalid (-1) prediction is present.
    neg, neu, pos = f1.compute(
        predictions=predictions, references=references, average=None, labels=CLASS_IDS
    )['f1']

    return {
        "accuracy": accuracy.compute(predictions=predictions, references=references)['accuracy'],
        "macro_f1": f1.compute(predictions=predictions, references=references, average='macro')['f1'],
        "f1_neg": neg,
        "f1_neu": neu,
        "f1_pos": pos,
    }


# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
training_args = TrainingArguments(
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_steps=100,
    report_to=[],
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    num_train_epochs=5,
    optim="adamw_bnb_8bit",
    bf16=True,
    output_dir=f"results/{model_name_or_path.split('/')[-1]}_{rationale_col}v2",
    logging_strategy="epoch",
    dataloader_num_workers=4,
    save_total_limit=3,
    save_strategy='epoch',
)

# No eval_dataset is passed, so compute_metrics is only used if an evaluation
# is triggered explicitly.
trainer = SFTTrainer(
    model,
    packing=True,  # pack samples together for efficient training
    max_seq_length=180,  # maximum packed length
    args=training_args,
    train_dataset=train_dataset.shuffle(),
    compute_metrics=compute_metrics,
    peft_config=peft_config,
    dataset_text_field='train_text',
)
trainer.train()
trainer.save_model()
-------------------------------------------------------------------------------- /inference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from datasets import load_metric, load_dataset\n", 10 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n", 11 | "import torch \n", 12 | "import numpy as np\n", 13 | "from tqdm import tqdm\n", 14 | "metrics = load_metric('accuracy')\n", 15 | "import gc\n", 16 | "import os\n", 17 | "\n", 18 | "def inference(path):\n", 19 | " prefix = 'summarize: ' if 'mt5' in path else ''\n", 20 | " tokenizer = AutoTokenizer.from_pretrained(path)\n", 21 | " model = AutoModelForSeq2SeqLM.from_pretrained(path)\n", 22 | " max_length = 1024 if 'bert' not in path else 256\n", 23 | " def preprocess_function(examples):\n", 24 | " inputs = [prefix + doc for doc in examples[\"text\"]]\n", 25 | " model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)\n", 26 | " labels = tokenizer(text_target=examples[\"label\"], max_length=5, truncation=True, padding=True)\n", 27 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", 28 | " return model_inputs\n", 29 | "\n", 30 | " testset = pd.read_excel('test.xlsx')\n", 31 | " testset['label'] = testset['label'].astype(str)\n", 32 | " dataset = Dataset.from_pandas(testset[['text', 'label']])\n", 33 | "\n", 34 | "# dataset = load_dataset(\"json\", data_files=\"datasets/faq/test/faq_test.json\", split='train')\n", 35 | " test_tokenized_datasets = dataset.map(preprocess_function, batched=True)\n", 36 | " data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors=\"pt\")\n", 37 | " model.to('cuda')\n", 38 | "\n", 39 | "\n", 40 | " max_target_length = 5\n", 41 | " test_tokenized_datasets = 
test_tokenized_datasets.remove_columns(['text', 'label'])\n", 42 | " dataloader = torch.utils.data.DataLoader(test_tokenized_datasets, collate_fn=data_collator, batch_size=32)\n", 43 | "\n", 44 | " predictions = []\n", 45 | " references = []\n", 46 | " for i, batch in enumerate(tqdm(dataloader)):\n", 47 | " outputs = model.generate(\n", 48 | " input_ids=batch['input_ids'].to('cuda'),\n", 49 | " max_length=max_target_length,\n", 50 | " attention_mask=batch['attention_mask'].to('cuda'),\n", 51 | " )\n", 52 | " with tokenizer.as_target_tokenizer():\n", 53 | " outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]\n", 54 | "\n", 55 | " labels = np.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)\n", 56 | " actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]\n", 57 | " predictions.extend(outputs)\n", 58 | " references.extend(actuals)\n", 59 | " metrics.add_batch(predictions=outputs, references=actuals)\n", 60 | "\n", 61 | " metrics.compute()\n", 62 | "\n", 63 | " rouges = [{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]\n", 64 | "# new_file_path = './r_scores_faq'\n", 65 | "# # Write to the file\n", 66 | "# try:\n", 67 | "# # Attempt to append to the file\n", 68 | "# with open(new_file_path, 'a') as file:\n", 69 | "# file.write(path.split('/')[-2] + '\\n')\n", 70 | "# for new_content_str in rouges:\n", 71 | "# result = next(iter(new_content_str))\n", 72 | "# file.write(f\"{result}: {new_content_str[result]}\\n\")\n", 73 | "# file.write('\\n')\n", 74 | "# action_result = \"Content appended to the existing file.\"\n", 75 | "# except FileNotFoundError:\n", 76 | "# # File doesn't exist, create it and write the content\n", 77 | "# with open(new_file_path, 'w') as file:\n", 78 | "# file.write(path)\n", 79 | "# file.write(new_content_str)\n", 80 | "# action_result = \"File did 
not exist, so it was created with the new content.\"\n", 81 | " \n", 82 | "# del model\n", 83 | "# gc.collect()\n" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "import pandas as pd\n", 93 | "from datasets import Dataset\n", 94 | "import evaluate\n", 95 | "from datasets import load_metric, load_dataset\n", 96 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n", 97 | "import torch \n", 98 | "import numpy as np\n", 99 | "from tqdm import tqdm\n", 100 | "metrics = load_metric('accuracy')\n", 101 | "import gc\n", 102 | "import os\n", 103 | "\n", 104 | "accuracy = evaluate.load(\"accuracy\")\n", 105 | "f1 = evaluate.load(\"f1\")\n", 106 | "precision = evaluate.load(\"precision\")\n", 107 | "recall = evaluate.load(\"recall\")\n", 108 | "\n", 109 | "path = './multitask/distilling-step-by-step/ckpts/VietAI/vit5-base_human_justification/'\n", 110 | "# path = 'multitask/distilling-step-by-step/ckpts//'\n", 111 | "# path = 'results/flan-t5-base/'\n", 112 | "prefix = 'gt: ' if 'distilling' in path else ''\n", 113 | "tokenizer = AutoTokenizer.from_pretrained(path)\n", 114 | "model = AutoModelForSeq2SeqLM.from_pretrained(path)\n", 115 | "max_length = 1024 if 'bert' not in path else 256\n", 116 | "def preprocess_function(examples):\n", 117 | " inputs = [prefix + doc for doc in examples[\"text\"]]\n", 118 | " model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding=True)\n", 119 | " labels = tokenizer(text_target=examples[\"label\"], max_length=5, truncation=True, padding=True)\n", 120 | " model_inputs[\"labels\"] = labels[\"input_ids\"]\n", 121 | " return model_inputs\n", 122 | "\n", 123 | "testset = pd.read_excel('test.xlsx')\n", 124 | "test_with_asr = pd.read_excel('test_asr.xlsx')\n", 125 | "testset['text'] = test_with_asr['asr']\n", 126 | "testset['label'] = 
testset['label'].astype(str)\n", 127 | "dataset = Dataset.from_pandas(testset[['text', 'label']])\n", 128 | "\n", 129 | "# dataset = load_dataset(\"json\", data_files=\"datasets/faq/test/faq_test.json\", split='train')\n", 130 | "test_tokenized_datasets = dataset.map(preprocess_function, batched=True)\n", 131 | "data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors=\"pt\")\n", 132 | "model.to('cuda:2')\n", 133 | "\n", 134 | "\n", 135 | "max_target_length = 25\n", 136 | "test_tokenized_datasets = test_tokenized_datasets.remove_columns(['text', 'label'])\n", 137 | "dataloader = torch.utils.data.DataLoader(test_tokenized_datasets.select(idx), collate_fn=data_collator, batch_size=32)\n", 138 | "\n", 139 | "predictions = []\n", 140 | "references = []\n", 141 | "for i, batch in enumerate(tqdm(dataloader)):\n", 142 | " outputs = model.generate(\n", 143 | " input_ids=batch['input_ids'].to('cuda:2'),\n", 144 | " max_length=max_target_length,\n", 145 | " attention_mask=batch['attention_mask'].to('cuda:2'),\n", 146 | " )\n", 147 | " with tokenizer.as_target_tokenizer():\n", 148 | " outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]\n", 149 | "\n", 150 | " labels = np.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)\n", 151 | " actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]\n", 152 | " predictions.extend(outputs)\n", 153 | " references.extend(actuals)\n", 154 | "# metrics.add_batch(predictions=outputs, references=actuals)\n", 155 | "\n", 156 | "# metrics.compute()\n", 157 | "\n", 158 | "def compute_metrics(predictions, references):\n", 159 | " decoded_preds, decoded_labels = predictions, references\n", 160 | "# logits = np.argmax(logits, axis=1)\n", 161 | " decoded_preds = [pred if pred.isdigit() else '-1' for pred in decoded_preds] # Replace non-digit predictions with '-1'\n", 162 | " decoded_labels = 
[label if label.isdigit() else '-1' for label in decoded_labels] # Replace non-digit labels with '-1'\n", 163 | " predictions = decoded_preds\n", 164 | " labels = decoded_labels\n", 165 | " neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1']\n", 166 | " metrics_result = {\n", 167 | " \"accuracy\": accuracy.compute(predictions=predictions, references=labels)['accuracy'],\n", 168 | " \"macro_f1\": f1.compute(predictions=predictions, references=labels, average='macro')['f1'],\n", 169 | "# \"macro_precision\": precision.compute(predictions=predictions, references=labels, average='macro')['precision'],\n", 170 | "# \"macro_recall\": recall.compute(predictions=predictions, references=labels, average='macro')['recall'],\n", 171 | " \"f1_neg\": neg,\n", 172 | " \"f1_neu\": neu,\n", 173 | " \"f1_pos\": pos\n", 174 | "\n", 175 | " }\n", 176 | " return metrics_result\n", 177 | "# rouges = [{k: v.mid.fmeasure} for k,v in metrics.compute(predictions=predictions, references=references).items()]\n", 178 | "del model\n", 179 | "gc.collect()\n", 180 | "\n", 181 | "print(compute_metrics(predictions, references))" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "from tqdm import tqdm\n", 191 | "import random\n", 192 | "from datasets import Dataset, load_metric\n", 193 | "import transformers\n", 194 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n", 195 | "from datasets import load_dataset, Dataset\n", 196 | "import pandas as pd\n", 197 | "import evaluate\n", 198 | "import torch\n", 199 | "import nltk\n", 200 | "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n", 201 | "import nltk\n", 202 | "import argparse\n", 203 | "import numpy as np\n", 204 | "from transformers import AutoModelForSeq2SeqLM\n", 205 | "from 
peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n", 206 | "from trl import AutoModelForCausalLMWithValueHead\n", 207 | "from transformers import TrainingArguments, Trainer\n", 208 | "from trl import SFTTrainer\n", 209 | "import evaluate\n", 210 | "from transformers import AutoModelForCausalLM\n", 211 | "from peft import PeftModel\n", 212 | "\n", 213 | "\n", 214 | "base_model_name = 'vtrungnhan9/vmlu-llm'\n", 215 | "print(\"loading\")\n", 216 | "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n", 217 | "model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')\n", 218 | "model = PeftModel.from_pretrained(model, './results/vmlu-llm_human_justificationv2/')\n", 219 | "print('finished loadding')\n", 220 | "model = model.merge_and_unload()\n", 221 | "model = model.cuda()\n", 222 | "\n", 223 | "\n", 224 | "train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)\n", 225 | "train_df['label'] = train_df['label'].astype(str)\n", 226 | "\n", 227 | "train_dataset = Dataset.from_pandas(train_df)\n", 228 | "\n", 229 | "testset = pd.read_excel('test.xlsx')\n", 230 | "\n", 231 | "testset['label'] = testset['label'].astype(str)\n", 232 | "print(train_df['label'].unique())\n", 233 | "print(testset['label'].unique())\n", 234 | "test_dataset = Dataset.from_pandas(testset[['text', 'label']])\n", 235 | "\n", 236 | "def template(inp, out):\n", 237 | " conversation = [{\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. 
Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n", 238 | " {\"role\": \"user\", \"content\": f\"\"\"nhận diện cảm xúc: '{inp.strip()}'\"\"\"},\n", 239 | " {'role': 'asssistant', 'content': str(out)}\n", 240 | " ]\n", 241 | "# print(out)\n", 242 | " prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n", 243 | "# prompt = prompt + ' '\n", 244 | " return prompt\n", 245 | "\n", 246 | "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n", 247 | "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n", 248 | "train_dataset= train_dataset.add_column(\"train_text\", new_column_train)\n", 249 | "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n", 250 | "test_dataset= test_dataset.add_column(\"train_text\", new_column_train)\n", 251 | "\n", 252 | "outs = []\n", 253 | "i = 0\n", 254 | "# print(\"Start inference\")\n", 255 | "# for tt in (test_dataset['train_text']):\n", 256 | "# if i % 100 == 0:\n", 257 | "# print(i)\n", 258 | "# input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n", 259 | "# out_ids = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id, output_scores=True)\n", 260 | "\n", 261 | "# assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n", 262 | "# # print(assistant)\n", 263 | "# outs.append(assistant)\n", 264 | "# i += 1\n", 265 | "# # break\n", 266 | "# del model\n", 267 | "# gc.collect()\n", 268 | "outs = []\n", 269 | "batch_size=32\n", 270 | "print(\"Start inference\")\n", 271 | "for i in tqdm(range(0, len(test_dataset), batch_size)):\n", 272 | " batch = test_dataset[i:i + batch_size]\n", 273 | " inputs = tokenizer(batch['train_text'], return_tensors='pt', padding=True, truncation=True).input_ids.cuda()\n", 274 | " 
outputs = model.generate(inputs, max_new_tokens=2, pad_token_id=tokenizer.eos_token_id)\n", 275 | " decoded_outputs = tokenizer.batch_decode(outputs[:, inputs.size(1):], skip_special_tokens=True)\n", 276 | " outs.extend([output.strip() for output in decoded_outputs])\n", 277 | "# break\n", 278 | "\n", 279 | "# Cleanup\n", 280 | "del model\n", 281 | "import gc\n", 282 | "gc.collect()\n", 283 | "torch.cuda.empty_cache()\n", 284 | "\n", 285 | "\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "import pandas as pd\n", 295 | "from datasets import Dataset\n", 296 | "import evaluate\n", 297 | "from datasets import load_metric, load_dataset\n", 298 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments\n", 299 | "import torch \n", 300 | "import numpy as np\n", 301 | "from tqdm import tqdm\n", 302 | "metrics = load_metric('accuracy')\n", 303 | "import gc\n", 304 | "import os\n", 305 | "\n", 306 | "accuracy = evaluate.load(\"accuracy\")\n", 307 | "f1 = evaluate.load(\"f1\")\n", 308 | "precision = evaluate.load(\"precision\")\n", 309 | "recall = evaluate.load(\"recall\")\n", 310 | "\n", 311 | "def compute_metrics(predictions, references):\n", 312 | " decoded_preds, decoded_labels = predictions, references\n", 313 | "# logits = np.argmax(logits, axis=1)\n", 314 | " decoded_preds = [pred if pred.isdigit() else '-1' for pred in decoded_preds] # Replace non-digit predictions with '-1'\n", 315 | " decoded_labels = [label if label.isdigit() else '-1' for label in decoded_labels] # Replace non-digit labels with '-1'\n", 316 | " predictions = decoded_preds\n", 317 | " labels = decoded_labels\n", 318 | " neg,neu,pos = f1.compute(predictions=predictions, references=labels, average=None)['f1']\n", 319 | " metrics_result = {\n", 320 | " \"accuracy\": accuracy.compute(predictions=predictions, 
references=labels)['accuracy'],\n", 321 | " \"macro_f1\": f1.compute(predictions=predictions, references=labels, average='macro')['f1'],\n", 322 | "# \"macro_precision\": precision.compute(predictions=predictions, references=labels, average='macro')['precision'],\n", 323 | "# \"macro_recall\": recall.compute(predictions=predictions, references=labels, average='macro')['recall'],\n", 324 | " \"f1_neg\": neg,\n", 325 | " \"f1_neu\": neu,\n", 326 | " \"f1_pos\": pos\n", 327 | "\n", 328 | " }\n", 329 | " return metrics_result\n", 330 | "\n", 331 | "references = (testset['label'])\n", 332 | "compute_metrics(outs, references)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "from tqdm import tqdm\n", 342 | "import random\n", 343 | "from datasets import Dataset, load_metric\n", 344 | "import transformers\n", 345 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n", 346 | "from datasets import load_dataset, Dataset\n", 347 | "import pandas as pd\n", 348 | "import evaluate\n", 349 | "import torch\n", 350 | "import nltk\n", 351 | "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n", 352 | "import nltk\n", 353 | "import argparse\n", 354 | "import numpy as np\n", 355 | "from transformers import AutoModelForSeq2SeqLM\n", 356 | "from peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n", 357 | "from trl import AutoModelForCausalLMWithValueHead\n", 358 | "from transformers import TrainingArguments, Trainer\n", 359 | "from trl import SFTTrainer\n", 360 | "import evaluate\n", 361 | "from transformers import AutoModelForCausalLM\n", 362 | "from peft import PeftModel\n", 363 | "\n", 364 | "\n", 365 | "base_model_name = 'vtrungnhan9/vmlu-llm'\n", 366 | "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n", 367 | "model = 
AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')\n", 368 | "# model = PeftModel.from_pretrained(model, './Vistral-7B-Chat_no')\n", 369 | "\n", 370 | "# model = model.merge_and_unload()\n", 371 | "model = model.cuda()\n", 372 | "\n", 373 | "\n", 374 | "train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)\n", 375 | "test_with_asr = pd.read_excel('test_asr.xlsx')\n", 376 | "testset['text'] = test_with_asr['asr']\n", 377 | "\n", 378 | "train_df['label'] = train_df['label'].astype(str)\n", 379 | "\n", 380 | "train_dataset = Dataset.from_pandas(train_df)\n", 381 | "\n", 382 | "testset = pd.read_excel('test.xlsx')\n", 383 | "\n", 384 | "testset['label'] = testset['label'].astype(str)\n", 385 | "print(train_df['label'].unique())\n", 386 | "print(testset['label'].unique())\n", 387 | "test_dataset = Dataset.from_pandas(testset[['text', 'label']])\n", 388 | "\n", 389 | "def template(inp, out):\n", 390 | " conversation = [{\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n", 391 | " {\"role\": \"user\", \"content\": f\"\"\"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực. 
Nhận diện cảm xúc: '{inp.strip()}'\"\"\"},\n", 392 | "# {'role': 'asssistant', 'content': str(out) + 'herqwwewf'}\n", 393 | " ]\n", 394 | " prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n", 395 | " print(prompt)\n", 396 | "# prompt = prompt + f' {out}'\n", 397 | " return prompt\n", 398 | "\n", 399 | "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n", 400 | "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n", 401 | "train_dataset= train_dataset.add_column(\"train_text\", new_column_train)\n", 402 | "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n", 403 | "test_dataset= test_dataset.add_column(\"train_text\", new_column_train)\n", 404 | "\n", 405 | "outs = []\n", 406 | "i = 0\n", 407 | "# for tt in (test_dataset['train_text']):\n", 408 | "# if i % 500 == 0:\n", 409 | "# print(i)\n", 410 | "# print(outs)\n", 411 | "# input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n", 412 | "# out_ids = model.generate(input_ids, max_new_tokens=1, pad_token_id=tokenizer.eos_token_id)\n", 413 | "\n", 414 | "# assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n", 415 | "# outs.append(assistant)\n", 416 | "# # print(outs)\n", 417 | " \n", 418 | "# i += 1" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "outs = []\n", 428 | "for tt in (test_dataset['train_text']):\n", 429 | " if i % 500 == 0:\n", 430 | " print(i)\n", 431 | " print(outs)\n", 432 | " input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()\n", 433 | " out_ids = model.generate(input_ids, max_new_tokens=2, pad_token_id=tokenizer.eos_token_id)\n", 434 | "\n", 435 | " assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], 
skip_special_tokens=True)[0].strip()\n", 436 | " outs.append(assistant)\n", 437 | "# print(outs)\n", 438 | " \n", 439 | " i += 1" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "def template(inp, out):\n", 449 | " conversation = [{\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n", 450 | " {\"role\": \"user\", \"content\": f\"\"\"Nhận diện cảm xúc: '{inp.strip()}'\"\"\"},\n", 451 | "# {'role': 'asssistant', 'content': str(out) + 'herqwwewf'}\n", 452 | " ]\n", 453 | " prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n", 454 | " print(prompt)\n", 455 | "# prompt = prompt + f' {out}'\n", 456 | " return prompt\n", 457 | "\n", 458 | "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n", 459 | "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n", 460 | "train_dataset= train_dataset.add_column(\"train_text\", new_column_train)\n", 461 | "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n", 462 | "test_dataset= test_dataset.add_column(\"train_text\", new_column_train)\n", 463 | "\n", 464 | "outs = []\n", 465 | "i = 0\n" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "from tqdm import tqdm\n", 475 | "import random\n", 476 | "from datasets import Dataset, load_metric\n", 477 | "import transformers\n", 478 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq\n", 479 | "from datasets import load_dataset, Dataset\n", 480 | "import pandas as pd\n", 481 | "import evaluate\n", 482 | 
"import torch\n", 483 | "import nltk\n", 484 | "from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM\n", 485 | "import nltk\n", 486 | "import argparse\n", 487 | "import numpy as np\n", 488 | "from transformers import AutoModelForSeq2SeqLM\n", 489 | "from peft import get_peft_config, get_peft_model, LoraConfig, TaskType\n", 490 | "from trl import AutoModelForCausalLMWithValueHead\n", 491 | "from transformers import TrainingArguments, Trainer\n", 492 | "from trl import SFTTrainer\n", 493 | "import evaluate\n", 494 | "from transformers import AutoModelForCausalLM\n", 495 | "from peft import PeftModel\n", 496 | "\n", 497 | "\n", 498 | "base_model_name = 'Viet-Mistral/Vistral-7B-Chat'\n", 499 | "print(\"loading\")\n", 500 | "tokenizer = AutoTokenizer.from_pretrained(base_model_name)\n", 501 | "model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')\n", 502 | "model = PeftModel.from_pretrained(model, './results/Vistral-7B-Chat_human_justification/')\n", 503 | "print('finished laoding')\n", 504 | "model = model.merge_and_unload()\n", 505 | "model = model.to('cuda:7')\n", 506 | "\n", 507 | "\n", 508 | "train_df = pd.read_excel('train.xlsx')#pd.concat([df, df_dev]).reset_index(drop=True)\n", 509 | "train_df['label'] = train_df['label'].astype(str)\n", 510 | "\n", 511 | "train_dataset = Dataset.from_pandas(train_df)\n", 512 | "\n", 513 | "testset = pd.read_excel('test.xlsx')\n", 514 | "test_with_asr = pd.read_excel('test_asr.xlsx')\n", 515 | "testset['text'] = test_with_asr['asr']\n", 516 | "\n", 517 | "testset['label'] = testset['label'].astype(str)\n", 518 | "print(train_df['label'].unique())\n", 519 | "print(testset['label'].unique())\n", 520 | "test_dataset = Dataset.from_pandas(testset[['text', 'label', 'human_justification']])\n", 521 | "\n", 522 | "def template(inp, out):\n", 523 | " conversation = [\n", 524 | " {\"role\": \"system\", \"content\": \"Bạn là trợ lý nhận diện cảm xúc. 
Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực.\" },\n", 525 | "# {\"role\": \"user\", \"content\": f\"\"\"sentiment analysis: '{inp.strip()}'\"\"\"},\n", 526 | " {\"role\": \"user\", \"content\": f\"\"\"nhận diện cảm xúc: '{inp.strip()}'\"\"\"},\n", 527 | " # {'role': 'asssistant', 'content': str(out) + 'herqwwewf'}\n", 528 | " ]\n", 529 | "# print(out)\n", 530 | " prompt = tokenizer.apply_chat_template(conversation, tokenize=False)\n", 531 | " prompt = prompt + f' '\n", 532 | " return prompt\n", 533 | "\n", 534 | "# reformatted_output = [reformat(inp, out) for inp, out in zip(dataset['train']['words'], dataset['train']['tags'])]\n", 535 | "new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]\n", 536 | "train_dataset= train_dataset.add_column(\"train_text\", new_column_train)\n", 537 | "new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]\n", 538 | "test_dataset= test_dataset.add_column(\"train_text\", new_column_train)\n", 539 | "\n", 540 | "outs = []\n", 541 | "i = 0\n", 542 | "print(\"Start inference\")\n", 543 | "for tt in (test_dataset.select(idx)['train_text']):\n", 544 | " if i % 100 == 0:\n", 545 | " print(i, set(outs))\n", 546 | " input_ids = tokenizer(tt, return_tensors='pt').input_ids.to('cuda:7')#[:,:-1]\n", 547 | " out_ids = model.generate(input_ids, max_new_tokens=25, pad_token_id=tokenizer.eos_token_id, output_scores=True)\n", 548 | "\n", 549 | " assistant = tokenizer.batch_decode(out_ids[:, input_ids.size(1): ], skip_special_tokens=True)[0].strip()\n", 550 | "# print(assistant)\n", 551 | " outs.append(assistant)\n", 552 | " i += 1\n", 553 | "# break\n", 554 | "# print(assistant)\n", 555 | "del model\n", 556 | "import gc\n", 557 | "torch.cuda.empty_cache()\n", 558 | "gc.collect()" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": {}, 565 | 
# %% BERTScore of the generated rationales against the human justifications
# NOTE(review): `outs`, `test_dataset` and `idx` are not defined in this cell;
# they must be left in the kernel by an earlier inference cell.
from evaluate import load

bertscore = load("bertscore")
# The first two characters of every generation are dropped before scoring
# (presumably the predicted label digit and its separator -- TODO confirm).
predictions = [o[2:] for o in outs]
references = test_dataset.select(idx)['human_justification']
results = bertscore.compute(predictions=predictions, references=references, lang="vi")
# Average over the number of samples actually scored instead of a fixed 100,
# so the value stays a mean when `idx` selects a different number of rows.
sum(results['f1']) / max(len(results['f1']), 1)


# %% ROUGE of the generated rationales against the human justifications
from evaluate import load

rouge = load("rouge")
predictions = [o[2:] for o in outs]
references = test_dataset.select(idx)['human_justification']
results = rouge.compute(predictions=predictions, references=references)
results


# %% Look up three hand-picked test utterances
# Listed explicitly: splitting a newline-terminated block on '\n' also yields
# an empty string, which would match rows whose text is empty.
test_samples = [
    "trả lại cho họ chất lượng cuộc sống bình thường như bao người khác là được nghe được nói thế nhưng điều kỳ diệu đã xảy",
    "những chia sẻ vô cùng hữu ích và thiết thực vừa rồi ạ có thể thấy là hầu hết người bệnh nằm điều trị trong",
    "khám suốt tiểu đường nó vẫn mệt mỏi vô khám tai biến bộ não vô khám nhưng mà xương thì nó loãng xương rất là nhiều",
]

testdf = test_dataset.to_pandas()
testdf[testdf.text.isin(test_samples)]


# %% Generate for the selected samples and record first-token confidence
# NOTE(review): an earlier cell runs `del model`; this cell needs `model` and
# `tokenizer` to be loaded again before it is executed.
confidence = []
outs = []
for i, tt in enumerate(testdf[testdf.text.isin(test_samples)]['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()
    # Attentions are not requested: nothing below reads them.
    output = model.generate(
        input_ids,
        max_new_tokens=20,
        pad_token_id=tokenizer.eos_token_id,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Softmax probability that the first generation step assigned to the token
    # that was actually emitted (one prompt per call, hence the `.item()`s).
    first_token_id = output.sequences[:, input_ids.size(1)].item()
    first_token_prob = output.scores[0].softmax(dim=1)[:, first_token_id].item()
    confidence.append(first_token_prob)

    # Decode only the newly generated tokens, not the prompt.
    generated = tokenizer.batch_decode(
        output.sequences[:, input_ids.size(1):], skip_special_tokens=True
    )[0].strip()
    outs.append(generated)


# %% Classification metrics
import pandas as pd
from datasets import Dataset
import evaluate
from datasets import load_metric, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, TrainingArguments, Seq2SeqTrainingArguments
import torch
import numpy as np
from tqdm import tqdm
# NOTE(review): `metrics` is not read by any cell visible here, and
# `load_metric` is deprecated in recent `datasets` releases -- confirm it is
# still needed before keeping it.
metrics = load_metric('accuracy')
import gc
import os

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")


def _as_label_id(value):
    """Return `value` as an int class id, or -1 when it is not a plain number."""
    text = str(value)
    # `isdecimal` (rather than `isdigit`) only accepts what `int()` can parse.
    return int(text) if text.isdecimal() else -1


def compute_metrics(predictions, references):
    """Score string label predictions against string reference labels.

    Args:
        predictions: iterable of predicted labels as text ('0', '1', '2').
            Anything that is not a plain number is counted as class -1.
        references: iterable of gold labels in the same text form.

    Returns:
        dict with "accuracy", "macro_f1" and the per-class scores "f1_neg",
        "f1_neu", "f1_pos" for classes 0, 1 and 2 respectively.
    """
    pred_ids = [_as_label_id(pred) for pred in predictions]
    label_ids = [_as_label_id(label) for label in references]

    # Pin the class list so exactly three scores come back. Without it, a
    # single unparseable prediction (-1) adds a fourth class and a small
    # sample can contain fewer than three, either of which breaks the unpack.
    neg, neu, pos = f1.compute(
        predictions=pred_ids, references=label_ids, average=None, labels=[0, 1, 2]
    )['f1']

    return {
        "accuracy": accuracy.compute(predictions=pred_ids, references=label_ids)['accuracy'],
        # Left unrestricted as before: averaged over every label value that
        # occurs in the predictions or references.
        "macro_f1": f1.compute(predictions=pred_ids, references=label_ids, average='macro')['f1'],
        "f1_neg": neg,
        "f1_neu": neu,
        "f1_pos": pos,
    }


# NOTE(review): `testset` is created by the model-loading cell below and `outs`
# by an inference loop; run those first, and make sure `outs` covers the whole
# test set rather than the three hand-picked samples.
references = (testset['label'])
compute_metrics(outs, references)


# %% Load the LoRA-adapted model and classify the whole test set
import argparse
import random

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset, load_dataset, load_metric
from peft import LoraConfig, PeftModel, TaskType, get_peft_config, get_peft_model
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
)
from trl import AutoModelForCausalLMWithValueHead, SFTTrainer


base_model_name = 'Viet-Mistral/Vistral-7B-Chat'
print("loading")
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir='./models')
model = PeftModel.from_pretrained(model, './results/Vistral-7B-Chat_human_justification/')
print('finished laoding')
# Fold the adapter weights into the base model, then move it to the GPU.
model = model.merge_and_unload()
model = model.to('cuda')


train_df = pd.read_excel('train.xlsx')
train_df['label'] = train_df['label'].astype(str)

train_dataset = Dataset.from_pandas(train_df)

testset = pd.read_excel('test.xlsx')
# To evaluate on ASR transcripts instead, overwrite the text column:
# test_with_asr = pd.read_excel('test_asr.xlsx')
# testset['text'] = test_with_asr['asr']

# Labels are kept as strings so they compare directly with decoded generations.
testset['label'] = testset['label'].astype(str)
print(train_df['label'].unique())
print(testset['label'].unique())
test_dataset = Dataset.from_pandas(testset[['text', 'label']])


def template(inp, out):
    """Build the chat prompt asking the model to classify one utterance.

    Args:
        inp: the utterance text; surrounding whitespace is stripped.
        out: the gold label. Unused here; kept so existing callers that pass
            (text, label) pairs keep working.

    Returns:
        The rendered chat-template string followed by a single space.
    """
    conversation = [
        {"role": "system", "content": "Bạn là trợ lý nhận diện cảm xúc. Với một văn bản, trả lời 0 nếu cảm xúc tiêu cực, 1 nếu không có cảm xúc, 2 nếu cảm xúc tích cực."},
        {"role": "user", "content": f"""nhận diện cảm xúc: '{inp.strip()}'"""},
    ]
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    prompt = prompt + f' '
    return prompt


new_column_train = [template(inp, out) for inp, out in zip(train_dataset['text'], train_dataset['label'])]
train_dataset = train_dataset.add_column("train_text", new_column_train)
new_column_train = [template(inp, out) for inp, out in zip(test_dataset['text'], test_dataset['label'])]
test_dataset = test_dataset.add_column("train_text", new_column_train)

print("Start inference")

outs = []
confidence = []
for i, tt in enumerate(test_dataset['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()
    # One new token is enough: the answer is a single label digit.
    # Attentions are not requested: nothing below reads them.
    output = model.generate(
        input_ids,
        max_new_tokens=1,
        pad_token_id=tokenizer.eos_token_id,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Softmax probability of the emitted token at the first generation step.
    first_token_id = output.sequences[:, input_ids.size(1)].item()
    first_token_prob = output.scores[0].softmax(dim=1)[:, first_token_id].item()
    confidence.append(first_token_prob)

    generated = tokenizer.batch_decode(
        output.sequences[:, input_ids.size(1):], skip_special_tokens=True
    )[0].strip()
    outs.append(generated)
# %% Re-run single-token classification over the whole test set
# NOTE(review): needs `model`, `tokenizer` and `test_dataset` (with its
# "train_text" prompt column) from the model-loading cell.
print("Start inference")

# Reset both result lists together. Previously only `confidence` was cleared,
# so a re-run appended to the predictions already in `outs` and the two lists
# (and the reference labels) no longer lined up.
outs = []
confidence = []
for i, tt in enumerate(test_dataset['train_text']):
    if i % 100 == 0:
        print(i, set(outs))
    input_ids = tokenizer(tt, return_tensors='pt').input_ids.cuda()
    # Attentions are not requested: nothing below reads them.
    output = model.generate(
        input_ids,
        max_new_tokens=1,
        pad_token_id=tokenizer.eos_token_id,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Softmax probability that the first generation step assigned to the token
    # that was actually emitted (one prompt per call, hence the `.item()`s).
    first_token_id = output.sequences[:, input_ids.size(1)].item()
    first_token_prob = output.scores[0].softmax(dim=1)[:, first_token_id].item()
    confidence.append(first_token_prob)

    # Decode only the newly generated token, not the prompt.
    generated = tokenizer.batch_decode(
        output.sequences[:, input_ids.size(1):], skip_special_tokens=True
    )[0].strip()
    outs.append(generated)

# (Two empty code cells followed here in the notebook.)
# Notebook metadata: kernelspec "distill" (language python), Python 3.10.4,
# nbformat 4, nbformat_minor 4.