├── data
│   └── data
├── model_save
│   └── skt-kogpt2-base-v2.pt
├── .gitattributes
├── gpt.png
├── requirements.txt
├── Data_preprocessing.py
├── LICENSE
├── ds_config.json
├── interactive.py
├── new_data_interactive.py
├── README.md
└── GPT-2_fine_tune.py
--------------------------------------------------------------------------------
/data/data:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/model_save/skt-kogpt2-base-v2.pt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.pt filter=lfs diff=lfs merge=lfs -text
--------------------------------------------------------------------------------
/gpt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/momozzing/kiosk_bot/HEAD/gpt.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
deepspeed==0.5.3
pandas==1.3.2
torch==1.13.1
tqdm==4.62.1
transformers==4.30.0
wandb==0.12.5
--------------------------------------------------------------------------------
/Data_preprocessing.py:
--------------------------------------------------------------------------------
import pandas as pd
from tqdm import tqdm

# Raw AIHub cafe-dialogue export: one utterance per row, with its speaker ID
# and its position-in-dialogue ID.
train_data = pd.read_csv("data/cafe_data.txt", delimiter="\t", encoding="utf-8")

tmp = []
for idx in tqdm(range(len(train_data))):
    sentence, speakerid, sentenceid = (
        train_data["SENTENCE"][idx],
        train_data["SPEAKERID"][idx],
        train_data["SENTENCEID"][idx],
    )
    tmp.append([sentence, speakerid, sentenceid])

new_df = pd.DataFrame(tmp, columns=["SENTENCE", "SPEAKERID", "SENTENCEID"])
new_df = new_df[:7180]  # keep only the first 7180 rows (dataset-specific cutoff)

session_num = 0
new_df["dialog_session"] = 0

# A SENTENCEID of 1 marks the first utterance of a dialogue, so bump the
# session counter there and tag every row with its session number.
for i in range(len(new_df)):
    if str(new_df["SENTENCEID"][i]) == "1":  # str() in case the column was read as int
        session_num += 1
    new_df.loc[i, "dialog_session"] = session_num  # .loc avoids chained-assignment pitfalls

print(new_df)
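
# GPT-2_fine_tune.py expects a two-column "text"/"label" TSV, while the code
# above only prints the session-tagged frame. A minimal sketch of the missing
# export step, under the assumption (not stated in the original) that user and
# system utterances simply alternate inside each dialogue session:
pairs = []
for _, session in new_df.groupby("dialog_session"):
    utterances = session["SENTENCE"].tolist()
    # pair every user turn with the system turn that follows it
    for user_turn, system_turn in zip(utterances[::2], utterances[1::2]):
        pairs.append([user_turn, system_turn])

pd.DataFrame(pairs, columns=["text", "label"]).to_csv(
    "data/cafe_clear_data_test.tsv", sep="\t", index=False
)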
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Yunho Mo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/ds_config.json:
--------------------------------------------------------------------------------
{
    "train_batch_size": 64,
    "fp16": {
        "enabled": true,
        "initial_scale_power": 8
    },
    "gradient_clipping": 1.0,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 3e-5,
            "betas": [
                0.9,
                0.999
            ],
            "eps": 1e-8,
            "weight_decay": 3e-7
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 3e-5,
            "warmup_num_steps": 100
        }
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 1e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 1e8,
        "contiguous_gradients": true,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true,
            "fast_init": false
        }
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "cpu_checkpointing": true,
        "contiguous_memory_optimization": true,
        "number_checkpoints": 1,
        "synchronize_checkpoint_boundary": false,
        "profile": false
    },
    "zero_allow_untested_optimizer": true,
    "wall_clock_breakdown": false,
    "steps_per_print": 100
}
--------------------------------------------------------------------------------
/interactive.py:
--------------------------------------------------------------------------------
"""
python interactive.py
"""
import torch
# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is the modern equivalent.
from transformers import AutoModelWithLMHead, AutoTokenizer

model_name = "momo/gpt2-kiosk"
model = AutoModelWithLMHead.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token strings follow the key names; adjust them if your checkpoint was
# trained with different markers. The hosted checkpoint is assumed to already
# include these tokens, so no embedding resize is needed here.
SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "sep_token": "<sep>",
}
SPECIAL_TOKENS_VALUES = ["<bos>", "<eos>", "<pad>", "<sep>"]
tokenizer.add_special_tokens(SPECIAL_TOKENS)

model.cuda()

with torch.no_grad():
    while True:
        t = input("\nUser: ")
        tokens = tokenizer(
            t,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=50,
        )

        input_ids = tokens.input_ids.cuda()
        attention_mask = tokens.attention_mask.cuda()
        sample_output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # pass the mask so padding is ignored
            do_sample=True,
            max_length=50,
            # max_new_tokens=50,
            # top_k=50,
            # return_dict_in_generate=True
        )
        gen = sample_output[0]
        # Drop the echoed prompt tokens and the final end-of-sequence token before decoding.
        print("System: " + tokenizer.decode(gen[len(input_ids[0]):-1], skip_special_tokens=True))
--------------------------------------------------------------------------------
/new_data_interactive.py:
--------------------------------------------------------------------------------
"""
python new_data_interactive.py
"""
import torch
from transformers import AutoModelWithLMHead, AutoTokenizer

model_name = "skt/kogpt2-base-v2"
ckpt_name = "model_save/skt-kogpt2-base-v2.pt"
model = AutoModelWithLMHead.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token strings follow the key names; adjust them if your fine-tuned
# checkpoint was trained with different markers.
SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "sep_token": "<sep>",
}
SPECIAL_TOKENS_VALUES = ["<bos>", "<eos>", "<pad>", "<sep>"]
tokenizer.add_special_tokens(SPECIAL_TOKENS)
# The embedding table must match the enlarged vocabulary before the
# fine-tuned weights are loaded.
model.resize_token_embeddings(len(tokenizer))

model.load_state_dict(torch.load(ckpt_name, map_location="cpu"))
model.cuda()

with torch.no_grad():
    while True:
        t = input("\nUser: ")
        tokens = tokenizer(
            t,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=50,
        )

        input_ids = tokens.input_ids.cuda()
        attention_mask = tokens.attention_mask.cuda()
        sample_output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # pass the mask so padding is ignored
            do_sample=True,
            max_length=50,
            # max_new_tokens=50,
            # top_k=50,
            # return_dict_in_generate=True
        )
        gen = sample_output[0]
        # Drop the echoed prompt tokens and the final end-of-sequence token before decoding.
        print("System: " + tokenizer.decode(gen[len(input_ids[0]):-1], skip_special_tokens=True))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# kiosk_bot

- A simple kiosk chatbot built with KoGPT2.

- Training used the [Korean dialogue data](https://aihub.or.kr/aidata/85) from AIHub.

- The data was used for training only and is not publicly released.

## Architecture
Implemented by following the Hugging Face examples.

![gpt](gpt.png)

The ```input``` is built as ```User + <bos> + System + <eos>```, following the concatenation in `GPT-2_fine_tune.py` (a runnable sketch of this format is at the end of this README).

The model is trained so that the ```output``` maximizes ```P(system|user)```.

## Install
```sh
git clone https://github.com/momozzing/kiosk_bot.git
pip install -r requirements.txt
```

## How to train?
If you only want to chat with the already fine-tuned model, you can skip ahead to ```How to chat``` after ```Install```.
```
cd kiosk_bot/data  ## put the data you want to train on in this directory

kiosk_bot/GPT-2_fine_tune.py  ## update the data-loader file names inside this script

deepspeed --num_gpus=1 GPT-2_fine_tune.py  ## then launch training with this command

Afterwards, use the generated skt-kogpt2-base-v2.pt.
```

## How to chat?
```
python interactive.py             ## chat with the model trained on the original data

python new_data_interactive.py    ## chat by loading the .pt file fine-tuned on new data

-- chatting with the original-data model --

User: 아메리카노 주세요
System: 따뜻한 거 드릴까요?

User: 아이스로 주세요
System: 사이즈는 톨 괜찮으세요?

User: 그란데 사이즈로 주세요
System: 죄송한데 저희 가게에는 레귤러랑 라지 사이즈 밖에 없어요

User: 그럼 라지사이즈로 주세요
System: 네 고객님 테이크아웃 하시나요?

User: 먹고갈게요
System: 네 그럼 머그잔으로 준비해드리겠습니다

User: 감사합니다
System: 좋은 하루 보내세요
```

## Reference
[HuggingFace](https://huggingface.co/transformers/index.html)

[KoGPT2](https://github.com/SKT-AI/KoGPT2)

[AIHUB](https://aihub.or.kr/)
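
## Input format sketch

A minimal illustration of how one training sample is assembled and tokenized, matching the `Architecture` section above. The concatenation order comes from `GPT-2_fine_tune.py`; the special-token strings (`<bos>`, `<eos>`, `<pad>`, `<sep>`) are assumptions, since the actual markers are whatever gets registered via `add_special_tokens` at training time.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")
tokenizer.add_special_tokens(
    {"bos_token": "<bos>", "eos_token": "<eos>", "pad_token": "<pad>", "sep_token": "<sep>"}
)

user, system = "아메리카노 주세요", "따뜻한 거 드릴까요?"
sample = user + tokenizer.bos_token + system + tokenizer.eos_token
print(sample)  # 아메리카노 주세요<bos>따뜻한 거 드릴까요?<eos>
print(tokenizer(sample).input_ids)  # the ids fed to the model (the labels are these same ids)
```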
--------------------------------------------------------------------------------
/GPT-2_fine_tune.py:
--------------------------------------------------------------------------------
'''
deepspeed --num_gpus=1 GPT-2_fine_tune.py
'''

from argparse import ArgumentParser
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelWithLMHead

import deepspeed
from deepspeed.ops.adam import DeepSpeedCPUAdam
import wandb

os.environ["TOKENIZERS_PARALLELISM"] = "true"

model_name = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Token strings follow the key names; adjust them if your data was built with
# different markers.
SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "sep_token": "<sep>",
}
tokenizer.add_special_tokens(SPECIAL_TOKENS)

model = AutoModelWithLMHead.from_pretrained(model_name).cuda()

# Grow the embedding table to cover the newly added special tokens.
model.resize_token_embeddings(len(tokenizer))

parser = ArgumentParser()
parser.add_argument("--deepspeed_config", type=str, default="ds_config.json")
parser.add_argument("--local_rank", type=int)
parser.add_argument("--epoch", default=50, type=int)
# NB: ds_config.json declares train_batch_size 64; keep the two settings consistent.
parser.add_argument("--batch_size", default=128, type=int)
parser.add_argument("--sep_token", default=tokenizer.sep_token, type=str)
parser.add_argument("--bos_token", default=tokenizer.bos_token, type=str)
parser.add_argument("--eos_token", default=tokenizer.eos_token, type=str)
args = parser.parse_args()

wandb.init(project="mobot", name=f"mobot-{model_name}")

# First 3,000 rows are used for training, the remainder for evaluation.
train_data = pd.read_csv("data/cafe_clear_data_test.tsv", delimiter="\t")
train_data = train_data[:3000]
train_text, train_labels = (
    train_data["text"].values,
    train_data["label"].values,
)

# Each sample is "user utterance + <bos> + system reply + <eos>".
train_dataset = [
    {"data": t + str(args.bos_token) + l + str(args.eos_token), "label": l}
    for t, l in zip(train_text, train_labels)
]
train_loader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    num_workers=8,
    drop_last=True,
    pin_memory=True,
)

eval_data = pd.read_csv("data/cafe_clear_data_test.tsv", delimiter="\t")
eval_data = eval_data[3000:]

eval_text, eval_labels = (
    eval_data["text"].values,
    eval_data["label"].values,
)

eval_dataset = [
    {"data": t + str(args.bos_token) + l + str(args.eos_token), "label": l}
    for t, l in zip(eval_text, eval_labels)
]
eval_loader = DataLoader(
    eval_dataset,
    batch_size=args.batch_size,
    num_workers=8,
    drop_last=True,
    pin_memory=True,
)

optimizer = DeepSpeedCPUAdam(
    lr=3e-5, weight_decay=3e-7, model_params=model.parameters()
)

engine, optimizer, _, _ = deepspeed.initialize(
    args=args, model=model, optimizer=optimizer
)

for epoch in range(args.epoch):
    model.train()
    for train in tqdm(train_loader):
        text, label = train["data"], train["label"]
        text_tokens = tokenizer(
            text,
            return_tensors="pt",
            max_length=50,
            truncation=True,
            padding=True,
        )

        input_ids = text_tokens.input_ids.cuda()
        attention_mask = text_tokens.attention_mask.cuda()

        # Causal-LM objective: the labels are the input ids themselves.
        # (Padding positions are scored too; masking them to -100 would
        # exclude them from the loss.)
        output = engine.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=input_ids,
        )

        loss = output.loss
        wandb.log({"loss": loss})
        engine.backward(loss)
        # DeepSpeed's engine.step() runs the optimizer and zeroes gradients,
        # so no separate optimizer.zero_grad()/optimizer.step() is needed.
        engine.step()

    with torch.no_grad():
        model.eval()
        for batch in tqdm(eval_loader):
            eval_text, eval_label = batch["data"], batch["label"]
            eval_text_tokens = tokenizer(
                eval_text,
                return_tensors="pt",
                max_length=50,
                truncation=True,
                padding=True,
            )

            input_ids = eval_text_tokens.input_ids.cuda()
            attention_mask = eval_text_tokens.attention_mask.cuda()

            eval_out = engine.forward(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids,
            )
            wandb.log({"eval_loss": eval_out.loss})
    wandb.log({"epoch": epoch + 1})

torch.save(model.state_dict(), f"model_save/{model_name.replace('/', '-')}.pt")
--------------------------------------------------------------------------------