├── modeling
│   ├── run_train.sh
│   ├── test_evaluate.py
│   └── train.py
├── preprocessing
│   ├── get_data.sh
│   └── get_train_data.py
├── requirments.txt
├── utils.py
├── inference.py
├── .gitignore
├── README.md
└── LICENSE


/modeling/run_train.sh:
--------------------------------------------------------------------------------
1 | CUDA_VISIBLE_DEVICES=0 python3 train.py
--------------------------------------------------------------------------------
/preprocessing/get_data.sh:
--------------------------------------------------------------------------------
1 | wget https://raw.githubusercontent.com/smilegate-ai/korean_smile_style_dataset/main/smilestyle_dataset.tsv
--------------------------------------------------------------------------------
/requirments.txt:
--------------------------------------------------------------------------------
1 | torch==1.13.1
2 | transformers==4.26.0
3 | pandas==1.5.3
4 | emoji==2.2.0
5 | soynlp==0.0.493
6 | datasets==2.10.1
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import emoji
3 | from soynlp.normalizer import repeat_normalize
4 | 
5 | # Keep spaces, the listed punctuation, ASCII, and Hangul; drop everything else.
6 | pattern = re.compile(r'[^ .,?!/@$%~·∼()\x00-\x7Fㄱ-ㅣ가-힣]+')
7 | url_pattern = re.compile(
8 |     r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')
9 | # Standalone jamo runs (ㅋㅋㅋ, ㅠㅠ, ...) that are noise rather than words.
10 | jamo_pattern = re.compile('[ㄱ-ㅎㅏ-ㅣ]+')
11 | 
12 | 
13 | def clean(x):
14 |     x = pattern.sub(' ', x)
15 |     x = emoji.replace_emoji(x, replace='')  # remove emojis
16 |     x = url_pattern.sub('', x)
17 |     x = jamo_pattern.sub('', x)
18 |     x = x.strip()
19 |     x = repeat_normalize(x, num_repeats=2)
20 |     return x
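21 | 
22 | 
23 | # Illustrative usage (an added example, not part of the original file):
24 | # clean() strips the stray jamo, the emoji, and the URL, then caps
25 | # character repetitions at two.
26 | if __name__ == '__main__':
27 |     print(clean('이거 진짜 좋아요요요요 ㅋㅋㅋ 👍 https://example.com'))  # -> '이거 진짜 좋아요요'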
--------------------------------------------------------------------------------
/modeling/test_evaluate.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from datasets import load_metric
3 | from pathlib import Path
4 | 
5 | from transformers import AutoTokenizer, AutoModelForSequenceClassification
6 | import pandas as pd
7 | 
8 | # device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
9 | device = 'cpu'
10 | 
11 | BASE_DIR = Path(__file__).resolve().parent.parent
12 | latest_model_path = BASE_DIR.joinpath(
13 |     'modeling', 'saved_model', 'formal_classifier_latest')
14 | 
15 | 
16 | class FormalClassifier(object):
17 |     def __init__(self):
18 |         self.model = AutoModelForSequenceClassification.from_pretrained(
19 |             latest_model_path).to(device)
20 |         self.tokenizer = AutoTokenizer.from_pretrained('beomi/kcbert-base')
21 | 
22 |     def predict(self, text: str):
23 |         inputs = self.tokenizer(
24 |             text, return_tensors="pt", max_length=64, truncation=True, padding="max_length")
25 |         input_ids = inputs["input_ids"].to(device)
26 |         token_type_ids = inputs["token_type_ids"].to(device)
27 |         attention_mask = inputs["attention_mask"].to(device)
28 | 
29 |         model_inputs = {
30 |             "input_ids": input_ids,
31 |             "token_type_ids": token_type_ids,
32 |             "attention_mask": attention_mask,
33 |         }
34 |         # Predicted class index: 1 = formal (존댓말), 0 = informal (반말)
35 |         return torch.argmax(self.model(**model_inputs).logits, dim=-1)
36 | 
37 | 
38 | if __name__ == '__main__':
39 | 
40 |     test = pd.read_csv(BASE_DIR.joinpath(
41 |         'modeling', 'data', 'test.tsv'), sep='\t', index_col=0)
42 | 
43 |     test = test.dropna()
44 | 
45 |     metric = load_metric("accuracy")
46 |     classifier = FormalClassifier()
47 | 
48 |     # .item() unwraps each one-element tensor so the metric gets plain ints
49 |     predictions = [classifier.predict(text).item()
50 |                    for text in test['sentence'].tolist()]
51 |     print(metric.compute(predictions=predictions,
52 |                          references=test['label'].tolist()))
53 | 
--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from pathlib import Path
3 | 
4 | from transformers import AutoTokenizer, AutoModelForSequenceClassification
5 | from utils import clean
6 | 
7 | BASE_DIR = str(Path(__file__).resolve().parent)
8 | 
9 | latest_model_path = BASE_DIR + '/modeling/saved_model/formal_classifier_latest'
10 | device = 'cpu'
11 | 
12 | 
13 | class FormalClassifier(object):
14 |     def __init__(self):
15 |         self.model = AutoModelForSequenceClassification.from_pretrained(
16 |             latest_model_path).to(device)
17 |         self.tokenizer = AutoTokenizer.from_pretrained('beomi/kcbert-base')
18 | 
19 |     def predict(self, text: str):
20 |         text = clean(text)
21 |         inputs = self.tokenizer(
22 |             text, return_tensors="pt", max_length=64, truncation=True, padding="max_length")
23 |         input_ids = inputs["input_ids"].to(device)
24 |         token_type_ids = inputs["token_type_ids"].to(device)
25 |         attention_mask = inputs["attention_mask"].to(device)
26 | 
27 |         model_inputs = {
28 |             "input_ids": input_ids,
29 |             "token_type_ids": token_type_ids,
30 |             "attention_mask": attention_mask,
31 |         }
32 |         # Softmax over the two logits: index 1 = formal (존댓말), index 0 = informal (반말)
33 |         return torch.softmax(self.model(**model_inputs).logits, dim=-1)
34 | 
35 |     def is_formal(self, text):
36 |         probs = self.predict(text)[0]  # predict once, compare both classes
37 |         return bool(probs[1] > probs[0])
38 | 
39 |     def formal_percentage(self, text):
40 |         return round(float(self.predict(text)[0][1]), 2)
41 | 
42 |     def print_message(self, text):
43 |         result = self.formal_percentage(text)
44 |         if result >= 0.5:
45 |             print(f'{text} : 존댓말입니다. ( 확률 {result*100}% )')  # formal
46 |         else:
47 |             print(f'{text} : 반말입니다. ( 확률 {((1 - result)*100)}% )')  # informal
48 | 
49 | 
50 | if __name__ == "__main__":
51 |     classifier = FormalClassifier()
52 |     classifier.print_message("저번에 교수님께서 자료 가져오라고 하셨는데 기억나세요?")
53 |     classifier.print_message("저번에 교수님이 자료 가져오라고 하셨는데 기억나?")
54 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.tsv
2 | *.ckpt
3 | modeling/saved_model/
4 | 
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | pip-wheel-metadata/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 | 
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 | 
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 | 
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .nox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | *.py,cover
55 | .hypothesis/
56 | .pytest_cache/
57 | 
58 | # Translations
59 | *.mo
60 | *.pot
61 | 
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 | 
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 | 
72 | # Scrapy stuff:
73 | .scrapy
74 | 
75 | # Sphinx documentation
76 | docs/_build/
77 | 
78 | # PyBuilder
79 | target/
80 | 
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 | 
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 | 
88 | # pyenv
89 | .python-version
90 | 
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 | 
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 | 
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 | 
105 | # SageMath parsed files
106 | *.sage.py
107 | 
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 | 
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 | 
121 | # Rope project settings
122 | .ropeproject
123 | 
124 | # mkdocs documentation
125 | /site
126 | 
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 | 
132 | # Pyre type checker
133 | .pyre/
134 | 
--------------------------------------------------------------------------------
/preprocessing/get_train_data.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import itertools
3 | import os
4 | from typing import Final, List
5 | from pathlib import Path
6 | 
7 | smg_df = pd.read_csv("./meta/smilestyle_dataset.tsv", sep="\t")
8 | chat_df = pd.read_csv('./meta/aihub_sentiment_dataset.tsv', sep='\t')
9 | 
10 | BASE_DIR = Path(__file__).resolve().parent.parent
11 | EXPORT_DIR = BASE_DIR.joinpath("modeling", "data")
12 | 
13 | 
14 | def df2sentence(df: pd.DataFrame, cols: List[str]) -> List[str]:
15 |     # Flatten the style columns into one list, split on '.', and keep
16 |     # non-empty sentences longer than five characters.
17 |     sentence = [df[col].tolist() for col in cols]
18 |     sentence = list(itertools.chain(*sentence))
19 |     sentence = [s for s in sentence if isinstance(s, str)]
20 |     sentence = [s.split('.') for s in sentence]
21 |     sentence = list(itertools.chain(*sentence))
22 |     sentence = [s.strip() for s in sentence if s.strip()]
23 |     sentence = [s for s in sentence if len(s) > 5]
24 |     return sentence
25 | 
26 | 
27 | formal_cols = ['formal', 'gentle']
28 | informal_cols = ['informal', 'chat', 'enfp', 'sosim', 'choding', 'joongding']
29 | 
30 | smg_formal = df2sentence(smg_df, formal_cols)
31 | smg_informal = df2sentence(smg_df, informal_cols)
32 | 
33 | chat_formal = df2sentence(chat_df, ['시스템응답1', '시스템응답2', '시스템응답3', '시스템응답4'])
34 | chat_informal = df2sentence(chat_df, ['사람문장1', '사람문장2', '사람문장3', '사람문장4'])
35 | 
36 | formal_data = smg_formal + chat_formal
37 | informal_data = smg_informal + chat_informal
38 | 
39 | # Labels: formal (존댓말) = 1, informal (반말) = 0
40 | data = pd.concat([pd.DataFrame({'sentence': informal_data, "label": 0}), pd.DataFrame(
41 |     {'sentence': formal_data, "label": 1})])
42 | 
43 | # Tokenization (left commented out; the BERT tokenizer is applied at training time)
44 | # tokenizer = PeCab()
45 | # data['sentence'] = data['sentence'].apply(lambda x: tokenizer.tokenize(x))
46 | 
47 | # Shuffle
48 | data = data.sample(frac=1)
49 | data.reset_index(drop=True, inplace=True)
50 | 
51 | split_rate: Final[float] = 0.1
52 | 
53 | # Size of one held-out split: 10% dev, 10% test, 80% train
54 | range_ = int(len(data) * split_rate)
55 | 
56 | # Split the data
57 | dev = data[:range_]
58 | test = data[range_:range_ * 2]
59 | train = data[range_ * 2:]
60 | 
61 | 
62 | # Drop duplicate sentences (reassignment instead of inplace=True, which
63 | # would raise SettingWithCopyWarning on these slices)
64 | train = train.drop_duplicates(subset=['sentence'], ignore_index=True)
65 | test = test.drop_duplicates(subset=['sentence'], ignore_index=True)
66 | dev = dev.drop_duplicates(subset=['sentence'], ignore_index=True)
67 | 
68 | 
69 | if not os.path.exists(EXPORT_DIR):
70 |     os.makedirs(EXPORT_DIR)
71 | 
72 | # print("train label rate: ", train['label'].value_counts())
73 | # print("dev label rate: ", dev['label'].value_counts())
74 | # print("test label rate: ", test['label'].value_counts())
75 | 
76 | # Export the splits
77 | train.to_csv(EXPORT_DIR.joinpath("train.tsv"), sep="\t")
78 | dev.to_csv(EXPORT_DIR.joinpath("dev.tsv"), sep="\t")
79 | test.to_csv(EXPORT_DIR.joinpath("test.tsv"), sep="\t")
--------------------------------------------------------------------------------
/modeling/train.py:
--------------------------------------------------------------------------------
1 | from transformers import TrainingArguments
2 | from transformers import Trainer
3 | from datasets import load_metric
4 | import numpy as np
5 | from transformers import AutoTokenizer, AutoModelForSequenceClassification
6 | from datasets.dataset_dict import DatasetDict
7 | from datasets import Dataset
8 | 
9 | import torch
10 | import pandas as pd
11 | 
12 | from typing import Final
13 | from pathlib import Path
14 | 
15 | # Base model: beomi/kcbert-base (108M parameters)
16 | 
17 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
18 | print(device)
19 | 
20 | BASE_DIR = Path(__file__).resolve().parent.parent
21 | 
22 | 
23 | class FormalClassifier:
24 |     def __init__(self):
25 |         self.model_name = "beomi/kcbert-base"
26 |         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
27 |         self.model = AutoModelForSequenceClassification.from_pretrained(
28 |             self.model_name).to(device)
29 | 
30 |         self.batch_size: Final[int] = 32
31 |         self.max_len: Final[int] = 64
32 |         self.dataLoader()
33 | 
34 |     def tokenize_function(self, examples):
35 |         return self.tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=self.max_len)
36 | 
37 |     def dataLoader(self):
38 |         train = pd.read_csv(BASE_DIR.joinpath(
39 |             'modeling', 'data', 'train.tsv'), sep='\t', index_col=0)
40 |         dev = pd.read_csv(BASE_DIR.joinpath(
41 |             'modeling', 'data', 'dev.tsv'), sep='\t', index_col=0)
42 | 
43 |         train = train.dropna()
44 |         dev = dev.dropna()
45 | 
46 |         dataset = DatasetDict({
47 |             'train': Dataset.from_dict({'sentence': train['sentence'].tolist(), 'label': train['label'].tolist()}),
48 |             'dev': Dataset.from_dict({'sentence': dev['sentence'].tolist(), 'label': dev['label'].tolist()}),
49 |         })
50 | 
51 |         tokenized_datasets = dataset.map(self.tokenize_function, batched=True)
52 | 
53 |         self.train_dataset = tokenized_datasets["train"]
54 |         self.dev_dataset = tokenized_datasets["dev"]
55 | 
56 |     def compute_metrics(self, eval_pred):
57 |         metric = load_metric("accuracy")
58 |         logits, labels = eval_pred
59 |         predictions = np.argmax(logits, axis=-1)
60 |         return metric.compute(predictions=predictions, references=labels)
61 | 
62 |     def train(self):
63 |         training_args = TrainingArguments("./saved_model",
64 |                                           per_device_train_batch_size=self.batch_size,
65 |                                           num_train_epochs=2,
66 |                                           learning_rate=3e-05,
67 |                                           save_strategy="epoch",
68 |                                           evaluation_strategy="epoch",
69 |                                           # fp16 needs a CUDA GPU; remove this line to train on CPU
70 |                                           fp16=True,
71 |                                           )
72 | 
73 |         trainer = Trainer(
74 |             model=self.model,
75 |             args=training_args,
76 |             train_dataset=self.train_dataset,
77 |             eval_dataset=self.dev_dataset,
78 |             compute_metrics=self.compute_metrics,
79 |         )
80 | 
81 |         trainer.train()
82 |         trainer.evaluate()
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     model = FormalClassifier()
87 |     model.train()
88 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # formal_classifier
2 | formal classifier or honorific classifier
3 | 
4 | ## Korean formal/informal (존댓말/반말) speech classifier
5 | 
6 | A while back, I introduced a simple way to classify formal and informal Korean speech with a morphological analyzer.
7 | When I actually tried to apply that approach, though, it failed in many cases.
8 | 
9 | For example:
10 | ```bash
11 | 저번에 교수님께서 자료 가져오라했는데 기억나?
12 | ```
13 | This sentence is informal, yet because of the honorific particle "께서" it was frequently judged formal as a whole.
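14 | 
15 | To make the failure concrete, here is a minimal sketch of that kind of rule-based check (written for illustration here, not the original analyzer code): any honorific cue marks the whole sentence as formal, so "께서" alone is enough to fool it.
16 | 
17 | ```python
18 | # Naive rule: call the sentence "formal" if any honorific marker appears.
19 | HONORIFIC_MARKERS = ("께서", "습니다", "세요")
20 | 
21 | def is_formal_naive(sentence: str) -> bool:
22 |     return any(marker in sentence for marker in HONORIFIC_MARKERS)
23 | 
24 | print(is_formal_naive("저번에 교수님께서 자료 가져오라했는데 기억나?"))  # True, which is wrong
25 | ```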
26 | 
27 | So this time I built a deep learning model instead, and I'd like to share the process.
28 | 
29 | #### If you just want to use the model right away, the code below is all you need.
30 | ```python
31 | from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
32 | 
33 | model = AutoModelForSequenceClassification.from_pretrained("j5ng/kcbert-formal-classifier")
34 | tokenizer = AutoTokenizer.from_pretrained('j5ng/kcbert-formal-classifier')
35 | 
36 | formal_classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)
37 | # LABEL_0 = informal (반말), LABEL_1 = formal (존댓말)
38 | print(formal_classifier("저번에 교수님께서 자료 가져오라했는데 기억나?"))
39 | # [{'label': 'LABEL_0', 'score': 0.9999139308929443}]
40 | ```
41 | 
42 | #### Batch Inference Using CUDA
43 | ```python
44 | import torch
45 | from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
46 | from tqdm import tqdm
47 | 
48 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
49 | 
50 | model_name = "j5ng/kcbert-formal-classifier"
51 | tokenizer = AutoTokenizer.from_pretrained(model_name)
52 | model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
53 | 
54 | formal_classifier = pipeline(
55 |     task="text-classification",
56 |     model=model,
57 |     tokenizer=tokenizer,
58 |     device=0 if torch.cuda.is_available() else -1,
59 |     batch_size=128,
60 | )
61 | 
62 | sentence = ["점심 드셨어요?", "밥 먹었어?"]  # placeholder: put your own list of sentences here
63 | 
64 | chunk_size = 1000  # process the list 1,000 sentences at a time
65 | chunks = [sentence[i:i+chunk_size] for i in range(0, len(sentence), chunk_size)]
66 | 
67 | # Convert every result to the probability of being formal (LABEL_1).
68 | scores = []
69 | for chunk in tqdm(chunks):
70 |     batch_scores = formal_classifier(chunk)
71 |     batch_scores = [round(1 - i['score'], 2) if i['label'] == 'LABEL_0' else round(i['score'], 2) for i in batch_scores]
72 |     scores.extend(batch_scores)
73 | 
74 | # print(scores)
75 | ```
76 | 
77 | ***
78 | 
79 | ### Dataset sources
80 | 
81 | #### Smilegate AI speech-style dataset (Korean SmileStyle Dataset)
82 | : https://github.com/smilegate-ai/korean_smile_style_dataset
83 | 
84 | #### AI Hub sentiment dialogue corpus (감성 대화 말뭉치)
85 | : https://www.aihub.or.kr/
86 | 
87 | #### Downloading the datasets (the AI Hub corpus supports manual download only)
88 | ```bash
89 | wget https://raw.githubusercontent.com/smilegate-ai/korean_smile_style_dataset/main/smilestyle_dataset.tsv
90 | ```
91 | 
92 | ### Environment
93 | ```bash
94 | Python 3.9
95 | ```
96 | 
97 | ```bash
98 | torch==1.13.1
99 | transformers==4.26.0
100 | pandas==1.5.3
101 | emoji==2.2.0
102 | soynlp==0.0.493
103 | datasets==2.10.1
104 | ```
105 | 
106 | 
107 | #### Model
108 | beomi/kcbert-base
109 | - GitHub : https://github.com/Beomi/KcBERT
110 | - HuggingFace : https://huggingface.co/beomi/kcbert-base
111 | ***
112 | 
113 | ## Data
114 | ```bash
115 | python3 get_train_data.py
116 | ```
117 | 
118 | ### Examples
119 | |sentence|label|
120 | |------|---|
121 | |공부를 열심히 해도 열심히 한 만큼 성적이 잘 나오지 않아|0|
122 | |아들에게 보내는 문자를 통해 관계가 회복되길 바랄게요|1|
123 | |참 열심히 사신 보람이 있으시네요|1|
124 | |나도 스시 좋아함 이번 달부터 영국 갈 듯|0|
125 | |본부장님이 내가 할 수 없는 업무를 계속 주셔서 힘들어|0|
126 | 
127 | 
128 | ### Label distribution
129 | |label|train|test|
130 | |------|---|---|
131 | |0|133,430|34,908|
132 | |1|112,828|29,839|
133 | 
134 | ***
135 | 
136 | ## Training
137 | ```bash
138 | python3 modeling/train.py
139 | ```
140 | 
141 | ***
142 | 
143 | ## Inference
144 | ```bash
145 | python3 inference.py
146 | ```
147 | 
148 | ```python
149 | def formal_percentage(self, text):
150 |     return round(float(self.predict(text)[0][1]), 2)
151 | 
152 | def print_message(self, text):
153 |     result = self.formal_percentage(text)
154 |     if result >= 0.5:
155 |         print(f'{text} : 존댓말입니다. ( 확률 {result*100}% )')
156 |     else:
157 |         print(f'{text} : 반말입니다. ( 확률 {((1 - result)*100)}% )')
158 | ```
159 | 
160 | Result:
161 | ```
162 | 저번에 교수님께서 자료 가져오라하셨는데 기억나세요? : 존댓말입니다. ( 확률 99.19% )
163 | 저번에 교수님께서 자료 가져오라했는데 기억나? : 반말입니다. ( 확률 92.86% )
164 | ```
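165 | 
166 | ***
167 | 
168 | ## Evaluation
169 | 
170 | To measure accuracy on the held-out test split, `modeling/test_evaluate.py` can be run from the repository root (a suggested invocation; it assumes training has produced `modeling/saved_model/formal_classifier_latest` and that `modeling/data/test.tsv` exists):
171 | ```bash
172 | python3 modeling/test_evaluate.py
173 | ```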
174 | 
175 | ***
176 | 
177 | ## Citation
178 | ```bibtex
179 | @misc{SmilegateAI2022KoreanSmileStyleDataset,
180 |   title = {SmileStyle: Parallel Style-variant Corpus for Korean Multi-turn Chat Text Dataset},
181 |   author = {Seonghyun Kim},
182 |   year = {2022},
183 |   howpublished = {\url{https://github.com/smilegate-ai/korean_smile_style_dataset}},
184 | }
185 | ```
186 | 
187 | ```bibtex
188 | @inproceedings{lee2020kcbert,
189 |   title={KcBERT: Korean Comments BERT},
190 |   author={Lee, Junbum},
191 |   booktitle={Proceedings of the 32nd Annual Conference on Human and Cognitive Language Technology},
192 |   pages={437--440},
193 |   year={2020}
194 | }
195 | ```
196 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below).
39 | 
40 |       "Derivative Works" shall mean any work, whether in Source or Object
41 |       form, that is based on (or derived from) the Work and for which the
42 |       editorial revisions, annotations, elaborations, or other modifications
43 |       represent, as a whole, an original work of authorship. For the purposes
44 |       of this License, Derivative Works shall not include works that remain
45 |       separable from, or merely link (or bind by name) to the interfaces of,
46 |       the Work and Derivative Works thereof.
47 | 
48 |       "Contribution" shall mean any work of authorship, including
49 |       the original version of the Work and any modifications or additions
50 |       to that Work or Derivative Works thereof, that is intentionally
51 |       submitted to Licensor for inclusion in the Work by the copyright owner
52 |       or by an individual or Legal Entity authorized to submit on behalf of
53 |       the copyright owner. For the purposes of this definition, "submitted"
54 |       means any form of electronic, verbal, or written communication sent
55 |       to the Licensor or its representatives, including but not limited to
56 |       communication on electronic mailing lists, source code control systems,
57 |       and issue tracking systems that are managed by, or on behalf of, the
58 |       Licensor for the purpose of discussing and improving the Work, but
59 |       excluding communication that is conspicuously marked or otherwise
60 |       designated in writing by the copyright owner as "Not a Contribution."
61 | 
62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
63 |       on behalf of whom a Contribution has been received by Licensor and
64 |       subsequently incorporated within the Work.
65 | 
66 |    2. Grant of Copyright License. Subject to the terms and conditions of
67 |       this License, each Contributor hereby grants to You a perpetual,
68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 |       copyright license to reproduce, prepare Derivative Works of,
70 |       publicly display, publicly perform, sublicense, and distribute the
71 |       Work and such Derivative Works in Source or Object form.
72 | 
73 |    3. Grant of Patent License. Subject to the terms and conditions of
74 |       this License, each Contributor hereby grants to You a perpetual,
75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 |       (except as stated in this section) patent license to make, have made,
77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
78 |       where such license applies only to those patent claims licensable
79 |       by such Contributor that are necessarily infringed by their
80 |       Contribution(s) alone or by combination of their Contribution(s)
81 |       with the Work to which such Contribution(s) was submitted. If You
82 |       institute patent litigation against any entity (including a
83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
84 |       or a Contribution incorporated within the Work constitutes direct
85 |       or contributory patent infringement, then any patent licenses
86 |       granted to You under this License for that Work shall terminate
87 |       as of the date such litigation is filed.
88 | 
89 |    4. Redistribution. You may reproduce and distribute copies of the
90 |       Work or Derivative Works thereof in any medium, with or without
91 |       modifications, and in Source or Object form, provided that You
92 |       meet the following conditions:
93 | 
94 |       (a) You must give any other recipients of the Work or
95 |           Derivative Works a copy of this License; and
96 | 
97 |       (b) You must cause any modified files to carry prominent notices
98 |           stating that You changed the files; and
99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!) The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
--------------------------------------------------------------------------------