├── data
│   └── data
├── model_save
│   └── skt-kogpt2-base-v2.pt
├── .gitattributes
├── gpt.png
├── requirements.txt
├── Data_preprocessing.py
├── LICENSE
├── ds_config.json
├── interactive.py
├── new_data_interactive.py
├── README.md
└── GPT-2_fine_tune.py
--------------------------------------------------------------------------------
/data/data:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/model_save/skt-kogpt2-base-v2.pt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
*.pt filter=lfs diff=lfs merge=lfs -text
--------------------------------------------------------------------------------
/gpt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/momozzing/kiosk_bot/HEAD/gpt.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
deepspeed==0.5.3
pandas==1.3.2
torch==1.13.1
tqdm==4.62.1
transformers==4.30.0
wandb==0.12.5
--------------------------------------------------------------------------------
/Data_preprocessing.py:
--------------------------------------------------------------------------------
import pandas as pd
from tqdm import tqdm

# Raw AIHub cafe-dialogue export: one utterance per row, with its speaker ID
# and its position-in-dialogue ID.
train_data = pd.read_csv("data/cafe_data.txt", delimiter="\t", encoding="utf-8")

tmp = []
for idx in tqdm(range(len(train_data))):
    sentence, speakerid, sentenceid = (
        train_data["SENTENCE"][idx],
        train_data["SPEAKERID"][idx],
        train_data["SENTENCEID"][idx],
    )
    tmp.append([sentence, speakerid, sentenceid])

new_df = pd.DataFrame(tmp, columns=["SENTENCE", "SPEAKERID", "SENTENCEID"])
new_df = new_df[:7180]  # keep only the first 7180 rows (dataset-specific cutoff)

session_num = 0
new_df["dialog_session"] = 0

# A SENTENCEID of 1 marks the first utterance of a dialogue, so bump the
# session counter there and tag every row with its session number.
for i in range(len(new_df)):
    if str(new_df["SENTENCEID"][i]) == "1":  # str() in case the column was read as int
        session_num += 1
    new_df.loc[i, "dialog_session"] = session_num  # .loc avoids chained-assignment pitfalls

print(new_df)
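
# GPT-2_fine_tune.py expects a two-column "text"/"label" TSV, while the code
# above only prints the session-tagged frame. A minimal sketch of the missing
# export step, under the assumption (not stated in the original) that user and
# system utterances simply alternate inside each dialogue session:
pairs = []
for _, session in new_df.groupby("dialog_session"):
    utterances = session["SENTENCE"].tolist()
    # pair every user turn with the system turn that follows it
    for user_turn, system_turn in zip(utterances[::2], utterances[1::2]):
        pairs.append([user_turn, system_turn])

pd.DataFrame(pairs, columns=["text", "label"]).to_csv(
    "data/cafe_clear_data_test.tsv", sep="\t", index=False
)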
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2021 Yunho Mo

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/ds_config.json:
--------------------------------------------------------------------------------
{
    "train_batch_size": 64,
    "fp16": {
        "enabled": true,
        "initial_scale_power": 8
    },
    "gradient_clipping": 1.0,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 3e-5,
            "betas": [
                0.9,
                0.999
            ],
            "eps": 1e-8,
            "weight_decay": 3e-7
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 3e-5,
            "warmup_num_steps": 100
        }
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 1e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 1e8,
        "contiguous_gradients": true,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true,
            "fast_init": false
        }
    },
    "activation_checkpointing": {
        "partition_activations": true,
        "cpu_checkpointing": true,
        "contiguous_memory_optimization": true,
        "number_checkpoints": 1,
        "synchronize_checkpoint_boundary": false,
        "profile": false
    },
    "zero_allow_untested_optimizer": true,
    "wall_clock_breakdown": false,
    "steps_per_print": 100
}
--------------------------------------------------------------------------------
/interactive.py:
--------------------------------------------------------------------------------
"""
python interactive.py
"""
import torch
# AutoModelWithLMHead is deprecated; AutoModelForCausalLM is the modern equivalent.
from transformers import AutoModelWithLMHead, AutoTokenizer

model_name = "momo/gpt2-kiosk"
model = AutoModelWithLMHead.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token strings follow the key names; adjust them if your checkpoint was
# trained with different markers. The hosted checkpoint is assumed to already
# include these tokens, so no embedding resize is needed here.
SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "sep_token": "<sep>",
}
SPECIAL_TOKENS_VALUES = ["<bos>", "<eos>", "<pad>", "<sep>"]
tokenizer.add_special_tokens(SPECIAL_TOKENS)

model.cuda()

with torch.no_grad():
    while True:
        t = input("\nUser: ")
        tokens = tokenizer(
            t,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=50,
        )

        input_ids = tokens.input_ids.cuda()
        attention_mask = tokens.attention_mask.cuda()
        sample_output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # pass the mask so padding is ignored
            do_sample=True,
            max_length=50,
            # max_new_tokens=50,
            # top_k=50,
            # return_dict_in_generate=True
        )
        gen = sample_output[0]
        # Drop the echoed prompt tokens and the final end-of-sequence token before decoding.
        print("System: " + tokenizer.decode(gen[len(input_ids[0]):-1], skip_special_tokens=True))
--------------------------------------------------------------------------------
/new_data_interactive.py:
--------------------------------------------------------------------------------
"""
python new_data_interactive.py
"""
import torch
from transformers import AutoModelWithLMHead, AutoTokenizer

model_name = "skt/kogpt2-base-v2"
ckpt_name = "model_save/skt-kogpt2-base-v2.pt"
model = AutoModelWithLMHead.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Token strings follow the key names; adjust them if your fine-tuned
# checkpoint was trained with different markers.
SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "sep_token": "<sep>",
}
SPECIAL_TOKENS_VALUES = ["<bos>", "<eos>", "<pad>", "<sep>"]
tokenizer.add_special_tokens(SPECIAL_TOKENS)
# The embedding table must match the enlarged vocabulary before the
# fine-tuned weights are loaded.
model.resize_token_embeddings(len(tokenizer))

model.load_state_dict(torch.load(ckpt_name, map_location="cpu"))
model.cuda()

with torch.no_grad():
    while True:
        t = input("\nUser: ")
        tokens = tokenizer(
            t,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=50,
        )

        input_ids = tokens.input_ids.cuda()
        attention_mask = tokens.attention_mask.cuda()
        sample_output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # pass the mask so padding is ignored
            do_sample=True,
            max_length=50,
            # max_new_tokens=50,
            # top_k=50,
            # return_dict_in_generate=True
        )
        gen = sample_output[0]
        # Drop the echoed prompt tokens and the final end-of-sequence token before decoding.
        print("System: " + tokenizer.decode(gen[len(input_ids[0]):-1], skip_special_tokens=True))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# kiosk_bot

- A simple kiosk chatbot built with KoGPT2.

- Training used the [Korean dialogue data](https://aihub.or.kr/aidata/85) from AIHub.

- The data was used for training only and is not publicly released.

## Architecture
Implemented by following the Hugging Face examples.

![gpt](gpt.png)

The ```input``` is built as ```User + <bos> + System + <eos>```, following the concatenation in `GPT-2_fine_tune.py` (a runnable sketch of this format is at the end of this README).

The model is trained so that the ```output``` maximizes ```P(system|user)```.

## Install
```sh
git clone https://github.com/momozzing/kiosk_bot.git
pip install -r requirements.txt
```

## How to train?
If you only want to chat with the already fine-tuned model, you can skip ahead to ```How to chat``` after ```Install```.
```
cd kiosk_bot/data  ## put the data you want to train on in this directory

kiosk_bot/GPT-2_fine_tune.py  ## update the data-loader file names inside this script

deepspeed --num_gpus=1 GPT-2_fine_tune.py  ## then launch training with this command

Afterwards, use the generated skt-kogpt2-base-v2.pt.
```

## How to chat?
```
python interactive.py             ## chat with the model trained on the original data

python new_data_interactive.py    ## chat by loading the .pt file fine-tuned on new data

-- chatting with the original-data model --

User: 아메리카노 주세요
System: 따뜻한 거 드릴까요?

User: 아이스로 주세요
System: 사이즈는 톨 괜찮으세요?

User: 그란데 사이즈로 주세요
System: 죄송한데 저희 가게에는 레귤러랑 라지 사이즈 밖에 없어요

User: 그럼 라지사이즈로 주세요
System: 네 고객님 테이크아웃 하시나요?

User: 먹고갈게요
System: 네 그럼 머그잔으로 준비해드리겠습니다

User: 감사합니다
System: 좋은 하루 보내세요
```

## Reference
[HuggingFace](https://huggingface.co/transformers/index.html)

[KoGPT2](https://github.com/SKT-AI/KoGPT2)

[AIHUB](https://aihub.or.kr/)
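
## Input format sketch

A minimal illustration of how one training sample is assembled and tokenized, matching the `Architecture` section above. The concatenation order comes from `GPT-2_fine_tune.py`; the special-token strings (`<bos>`, `<eos>`, `<pad>`, `<sep>`) are assumptions, since the actual markers are whatever gets registered via `add_special_tokens` at training time.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2")
tokenizer.add_special_tokens(
    {"bos_token": "<bos>", "eos_token": "<eos>", "pad_token": "<pad>", "sep_token": "<sep>"}
)

user, system = "아메리카노 주세요", "따뜻한 거 드릴까요?"
sample = user + tokenizer.bos_token + system + tokenizer.eos_token
print(sample)  # 아메리카노 주세요<bos>따뜻한 거 드릴까요?<eos>
print(tokenizer(sample).input_ids)  # the ids fed to the model (the labels are these same ids)
```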
--------------------------------------------------------------------------------
/GPT-2_fine_tune.py:
--------------------------------------------------------------------------------
'''
deepspeed --num_gpus=1 GPT-2_fine_tune.py
'''

from argparse import ArgumentParser
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelWithLMHead

import deepspeed
from deepspeed.ops.adam import DeepSpeedCPUAdam
import wandb

os.environ["TOKENIZERS_PARALLELISM"] = "true"

model_name = "skt/kogpt2-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Token strings follow the key names; adjust them if your data was built with
# different markers.
SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "sep_token": "<sep>",
}
tokenizer.add_special_tokens(SPECIAL_TOKENS)

model = AutoModelWithLMHead.from_pretrained(model_name).cuda()

# Grow the embedding table to cover the newly added special tokens.
model.resize_token_embeddings(len(tokenizer))

parser = ArgumentParser()
parser.add_argument("--deepspeed_config", type=str, default="ds_config.json")
parser.add_argument("--local_rank", type=int)
parser.add_argument("--epoch", default=50, type=int)
# NB: ds_config.json declares train_batch_size 64; keep the two settings consistent.
parser.add_argument("--batch_size", default=128, type=int)
parser.add_argument("--sep_token", default=tokenizer.sep_token, type=str)
parser.add_argument("--bos_token", default=tokenizer.bos_token, type=str)
parser.add_argument("--eos_token", default=tokenizer.eos_token, type=str)
args = parser.parse_args()

wandb.init(project="mobot", name=f"mobot-{model_name}")

# First 3,000 rows are used for training, the remainder for evaluation.
train_data = pd.read_csv("data/cafe_clear_data_test.tsv", delimiter="\t")
train_data = train_data[:3000]
train_text, train_labels = (
    train_data["text"].values,
    train_data["label"].values,
)

# Each sample is "user utterance + <bos> + system reply + <eos>".
train_dataset = [
    {"data": t + str(args.bos_token) + l + str(args.eos_token), "label": l}
    for t, l in zip(train_text, train_labels)
]
train_loader = DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    num_workers=8,
    drop_last=True,
    pin_memory=True,
)

eval_data = pd.read_csv("data/cafe_clear_data_test.tsv", delimiter="\t")
eval_data = eval_data[3000:]

eval_text, eval_labels = (
    eval_data["text"].values,
    eval_data["label"].values,
)

eval_dataset = [
    {"data": t + str(args.bos_token) + l + str(args.eos_token), "label": l}
    for t, l in zip(eval_text, eval_labels)
]
eval_loader = DataLoader(
    eval_dataset,
    batch_size=args.batch_size,
    num_workers=8,
    drop_last=True,
    pin_memory=True,
)

optimizer = DeepSpeedCPUAdam(
    lr=3e-5, weight_decay=3e-7, model_params=model.parameters()
)

engine, optimizer, _, _ = deepspeed.initialize(
    args=args, model=model, optimizer=optimizer
)

for epoch in range(args.epoch):
    model.train()
    for train in tqdm(train_loader):
        text, label = train["data"], train["label"]
        text_tokens = tokenizer(
            text,
            return_tensors="pt",
            max_length=50,
            truncation=True,
            padding=True,
        )

        input_ids = text_tokens.input_ids.cuda()
        attention_mask = text_tokens.attention_mask.cuda()

        # Causal-LM objective: the labels are the input ids themselves.
        # (Padding positions are scored too; masking them to -100 would
        # exclude them from the loss.)
        output = engine.forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=input_ids,
        )

        loss = output.loss
        wandb.log({"loss": loss})
        engine.backward(loss)
        # DeepSpeed's engine.step() runs the optimizer and zeroes gradients,
        # so no separate optimizer.zero_grad()/optimizer.step() is needed.
        engine.step()

    with torch.no_grad():
        model.eval()
        for batch in tqdm(eval_loader):
            eval_text, eval_label = batch["data"], batch["label"]
            eval_text_tokens = tokenizer(
                eval_text,
                return_tensors="pt",
                max_length=50,
                truncation=True,
                padding=True,
            )

            input_ids = eval_text_tokens.input_ids.cuda()
            attention_mask = eval_text_tokens.attention_mask.cuda()

            eval_out = engine.forward(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=input_ids,
            )
            wandb.log({"eval_loss": eval_out.loss})
    wandb.log({"epoch": epoch + 1})

torch.save(model.state_dict(), f"model_save/{model_name.replace('/', '-')}.pt")
--------------------------------------------------------------------------------